Merge branch 'main' into update-license

Nanostring-Biostats · Apr 5, 2023 · 0973e6b · 0973e6b
2 parents 2973a51 + c06abd0
commit 0973e6b
Show file tree

Hide file tree

Showing 52 changed files with 685 additions and 284 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .Rproj.user
+*.Rproj
 .Rhistory
 .RData
 .Ruserdata

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -4,7 +4,7 @@ Title: An R package for performing cell typing in SMI and other single cell data
 Version: 1.0.0
 Authors@R: c(person("Patrick", "Danaher", email = "[email protected]", role = c("aut", "cre")),
              person("Zhi", "Yang", email = "[email protected]", role = c("aut")),
-             person("David", "Ross", email = "[email protected]", role = c("aut")))
+             person("David", "Ross", email = "[email protected]", role = c("aut", "cre")))
 Description: Insitutype is an algorithm for performing cell typing in single cell 
              spatial transcriptomics data, such as is generated by the CosMx platform. 
              It can perform supervised cell typing from reference profiles, unsupervised clustering,
@@ -15,7 +15,6 @@ Imports:
   scales,
   umap,
   ggplot2,
-  lsa,
   irlba,
   mclust,
   sparseMatrixStats,
@@ -38,5 +37,5 @@ Suggests:
 VignetteBuilder: knitr
 Depends:
   R (>= 3.5.0)
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
 LinkingTo: Rcpp, RcppArmadillo
diff --git a/InSituType.Rproj b/InSituType.Rproj
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,14 +6,13 @@ export(chooseClusterNumber)
 export(choose_anchors_from_stats)
 export(colorCellTypes)
 export(fastCohorting)
+export(fast_lldist)
 export(find_anchor_cells)
 export(flightpath_layout)
 export(flightpath_plot)
 export(get_anchor_stats)
 export(insitutype)
 export(insitutypeML)
-export(lls)
-export(numCores)
 export(refineClusters)
 export(updateProfilesFromAnchors)
 export(updateReferenceProfiles)

diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,10 @@
 * License updated
 * lldist parallelized with OpenMP
 
+# InSituType 0.99.4
+
+* Re-submission to Bioconductor
+
 # InSituType 0.99.3
 
 * Merge subclustering fix

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -6,8 +6,8 @@
 #' Probability density function of the negative binomial distribution (written in C++)
 #'
 #' @param mat dgCMatrix expression counts
-#' @param s numeric scaling factor
-#' @param x numeric expression for reference profile
+#' @param bgsub vector of background expression per cell
+#' @param x numeric expression for reference profiles
 #' @param bg numeric background level
 #' @param size_dnb int Dispersion parameter
 #'
@@ -16,7 +16,7 @@
 #' @importFrom Rcpp evalCpp
 #' @exportPattern "^[[:alpha:]]+" 
 #' @export
-lls <- function(mat, s, x, bg, size_dnb) {
-    .Call(`_InSituType_lls`, mat, s, x, bg, size_dnb)
+fast_lldist <- function(mat, bgsub, x, bg, size_dnb) {
+    .Call(`_InSituType_fast_lldist`, mat, bgsub, x, bg, size_dnb)
 }
 
diff --git a/R/chooseClusterNumber.R b/R/chooseClusterNumber.R
@@ -33,6 +33,10 @@
 #' \itemize{
 #'  \item
 #' }
+#' @examples
+#' data("mini_nsclc")
+#' chooseClusterNumber(mini_nsclc$counts, Matrix::rowMeans(mini_nsclc$neg),
+#'  n_clust = 2:5)
 chooseClusterNumber <-
   function(counts,
            neg,
@@ -101,13 +105,10 @@ chooseClusterNumber <-
       max_iters = max_iters)  
 
     # get the loglik of the clustering result:
-    loglik_thisclust <- parallel::mclapply(asplit(tempclust$profiles, 2),
-                      lldist,
-                      mat = counts,
-                      bg = bg,
-                      size = nb_size,
-                      mc.cores = numCores())
-    loglik_thisclust <- do.call(cbind, loglik_thisclust)
+    loglik_thisclust <- lldist(x = tempclust$profiles,
+                               mat = counts,
+                               bg = bg,
+                               size = nb_size)
     total_loglik_this_clust <- sum(apply(loglik_thisclust, 1, max))
     return(total_loglik_this_clust)
   })

diff --git a/R/colorCellTypes.R b/R/colorCellTypes.R
@@ -16,6 +16,19 @@
 #' @return A named color vector
 #' @importFrom grDevices col2rgb colors
 #' @export
+#' @examples
+#' data("mini_nsclc")
+#' unsup <- insitutype(
+#'  x = mini_nsclc$counts,
+#'  neg = Matrix::rowMeans(mini_nsclc$neg),
+#'  n_clusts = 8,
+#'  n_phase1 = 200,
+#'  n_phase2 = 500,
+#'  n_phase3 = 2000,
+#'  n_starts = 1,
+#'  max_iters = 5
+#' ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended.
+#' colorCellTypes(freqs = table(unsup$clust), palette = "brewers")
 #' 
 colorCellTypes <- function(names = NULL, freqs = NULL, init_colors = NULL, max_sum_rgb = 600, 
                            palette = "earthplus") {

diff --git a/R/fastCohorting.R b/R/fastCohorting.R
@@ -16,6 +16,13 @@
 #' @importFrom mclust predict.Mclust
 #' @importFrom mclust mclustBIC
 #' @importFrom stats qnorm
+#' @examples 
+#' data("mini_nsclc")
+#' ## simulate immunofluorescence data: 
+#' immunofluordata <- matrix(rpois(n = nrow(mini_nsclc$counts) * 4, lambda = 100), 
+#'                           nrow(mini_nsclc$counts))
+#' cohort <- fastCohorting(immunofluordata, gaussian_transform = TRUE)
+#' table(cohort)
 fastCohorting <- function(mat, n_cohorts = NULL, gaussian_transform = TRUE) {
 
   if (any(is.na(mat))) {

diff --git a/R/find_anchor_cells.R b/R/find_anchor_cells.R
@@ -15,8 +15,13 @@
 #' @param min_cosine Cells must have at least this much cosine similarity to a fixed profile to be used as an anchor.
 #' @return A list with two elements: cos, the matrix of cosine distances;
 #'  and llr, the matrix of log likelihood ratios of each cell under each cell type vs. the 2nd best cell type.
-#' @importFrom lsa cosine
 #' @export
+#' @examples
+#' data("ioprofiles")
+#' data("mini_nsclc")
+#' get_anchor_stats(counts = mini_nsclc$counts,
+#'                  neg = Matrix::rowMeans(mini_nsclc$neg),
+#'                  profiles = ioprofiles)
 get_anchor_stats <- function(counts, neg = NULL, bg = NULL, align_genes = TRUE,
                              profiles, size = 10, 
                              min_cosine = 0.3) {
@@ -64,11 +69,13 @@ get_anchor_stats <- function(counts, neg = NULL, bg = NULL, align_genes = TRUE,
   }
 
   # get cosine distances:
-  cos <- matrix(NA, nrow(counts), ncol(profiles),
-                dimnames = list(rownames(counts), colnames(profiles)))
-  cos <- sapply(colnames(profiles), function(cell) {
-    cos[, cell] <- apply(counts, 1, cosine, profiles[, cell])
-  })
+  cos_numerator <- counts %*% profiles
+  counts2 <- as(counts, "dgCMatrix")
+  counts2@x <- counts2@x^2
+  rs <- sqrt(Matrix::rowSums(counts2))
+  ps <- sqrt(Matrix::colSums(profiles^2))
+  cos_denominator <- (t(t(rs))) %*% ps
+  cos <- as.matrix(cos_numerator / cos_denominator)
 
   # stats for which cells to get loglik on: 
   # get 3rd hightest cosines of each cell:
@@ -128,6 +135,30 @@ get_anchor_stats <- function(counts, neg = NULL, bg = NULL, align_genes = TRUE,
 #' @return A vector holding anchor cell assignments (or NA) for each cell in the
 #'   counts matrix
 #' @export
+#' @examples 
+#' data("ioprofiles")
+#' data("mini_nsclc")
+#' counts <- mini_nsclc$counts
+#' astats <- get_anchor_stats(counts = counts,
+#'                          neg = Matrix::rowMeans(mini_nsclc$neg),
+#'                          profiles = ioprofiles)
+#'
+#' ## estimate per-cell bg as a fraction of total counts:
+#' negmean.per.totcount <- mean(rowMeans(mini_nsclc$neg)) / mean(rowSums(counts))
+#' per.cell.bg <- rowSums(counts) * negmean.per.totcount
+#'
+#' # now choose anchors:
+#' choose_anchors_from_stats(counts = counts, 
+#'                           neg = Matrix::rowMeans(mini_nsclc$neg),
+#'                           bg = per.cell.bg,
+#'                           anchorstats = astats, 
+#'                           # a very low value chosen for the mini
+#'                           # dataset. Typically hundreds of cells
+#'                           # would be better.
+#'                           n_cells = 50, 
+#'                           min_cosine = 0.4, 
+#'                           min_scaled_llr = 0.03, 
+#'                           insufficient_anchors_thresh = 5)
 choose_anchors_from_stats <-
   function(counts,
            neg = NULL,
@@ -230,6 +261,13 @@ choose_anchors_from_stats <-
 #'   counts matrix
 #' @importFrom lsa cosine
 #' @export
+#' @examples 
+#' data("ioprofiles")
+#' data("mini_nsclc")
+#' sharedgenes <- intersect(colnames(mini_nsclc$counts), rownames(ioprofiles))
+#' find_anchor_cells(counts = mini_nsclc$counts[, sharedgenes], 
+#'                   neg = Matrix::rowMeans(mini_nsclc$neg),
+#'                   profiles = ioprofiles)
 find_anchor_cells <- function(counts, neg = NULL, bg = NULL, align_genes = TRUE,
                               profiles, size = 10, n_cells = 500, 
                               min_cosine = 0.3, min_scaled_llr = 0.01, 

diff --git a/R/flightpath_layout.R b/R/flightpath_layout.R
@@ -19,6 +19,19 @@
 #' @importFrom umap umap
 #' @importFrom stats rnorm
 #' @export
+#' @examples
+#' data("mini_nsclc")
+#' unsup <- insitutype(
+#'  x = mini_nsclc$counts,
+#'  neg = Matrix::rowMeans(mini_nsclc$neg),
+#'  n_clusts = 8,
+#'  n_phase1 = 200,
+#'  n_phase2 = 500,
+#'  n_phase3 = 2000,
+#'  n_starts = 1,
+#'  max_iters = 5
+#' ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended.
+#' flightpath_layout(logliks = unsup$logliks, profiles = unsup$profiles)
 flightpath_layout <- function(logliks = NULL, probs = NULL, profiles = NULL, cluster_xpos = NULL, cluster_ypos = NULL) {
 
   if (is.null(probs) && is.null(logliks)) {
@@ -90,7 +103,19 @@ flightpath_layout <- function(logliks = NULL, probs = NULL, profiles = NULL, clu
 #'@return a ggplot object
 #'
 #'@export
-#'
+#'@examples 
+#' data("ioprofiles")
+#' unsup <- insitutype(
+#'  x = mini_nsclc$counts,
+#'  neg = Matrix::rowMeans(mini_nsclc$neg),
+#'  n_clusts = 8,
+#'  n_phase1 = 200,
+#'  n_phase2 = 500,
+#'  n_phase3 = 2000,
+#'  n_starts = 1,
+#'  max_iters = 5
+#' ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended.
+#' flightpath_plot(insitutype_result = unsup)
 flightpath_plot <- function(flightpath_result = NULL, insitutype_result = NULL, col = NULL, showclusterconfidence = TRUE){
 
   # get the flightpath results to use 
@@ -163,6 +188,12 @@ flightpath_plot <- function(flightpath_result = NULL, insitutype_result = NULL,
 #' Calculate the mean confidence of the cell calls from each cluster
 #' @param probs Matrix of probabilities
 #' @return a vector of mean confidences, with values of 1 corresponding to clusters with only prob == 1
+#' @examples
+#' data("mini_nsclc")
+#' probs <- sapply(rownames(mini_nsclc$counts), function(x) {a = runif(10); a/sum(a)})
+#' dimnames(probs)[[1]] <- letters[1:10]
+#' probs <- t(probs)
+#' getMeanClusterConfidence(probs)
 getMeanClusterConfidence <- function(probs) {
 
   maxprobs <- apply(probs, 1, max, na.rm = TRUE)

diff --git a/R/geoSketch.R b/R/geoSketch.R
@@ -12,6 +12,9 @@
 #' @param counts Counts matrix
 #' @return A matrix of data for geoSketch, with cells in rows and features in columns
 #' @importFrom irlba prcomp_irlba
+#' @examples
+#' data("mini_nsclc")
+#' prepDataForSketching(mini_nsclc$counts)
 prepDataForSketching <- function(counts) {
   # get PCs:
   scaling_factors <- pmax(sparseMatrixStats::colQuantiles(counts, probs = 0.99), 5)
@@ -37,20 +40,16 @@ prepDataForSketching <- function(counts) {
 #'   cell
 #' @param minCellsPerBin the minimum number of cells required for a bin to be
 #'   considered for sampling
-#' @param seed set seed for random sampling
 #'
 #' @return Plaid, a named vector of binIDs where names correspond to cellIDs
+#' @examples
+#' data("mini_nsclc")
+#' geoSketch_get_plaid(mini_nsclc$counts, 100)
 geoSketch_get_plaid <- function(X, N,
                                 alpha=0.1,
                                 max_iter=200,
                                 returnBins=FALSE,
-                                minCellsPerBin = 1,
-                                seed=NULL) {
-  # Define seed for sampling if given
-  if (!is.null(seed)) {
-    set.seed(seed)
-  }
-
+                                minCellsPerBin = 1) {
   # Determine the total number of cells and compare it to the desired sample size 
   nCells <- nrow(X)
 
@@ -120,17 +119,14 @@ geoSketch_get_plaid <- function(X, N,
 #' Sample cells, trying to give each plaid equal representation
 #' @param Plaid Vector of cells' plaid IDs
 #' @param N desired sample size
-#' @param seed set seed for random sampling
 #' 
 #' @return Plaid, a named vector of binIDs where names correspond to cellIDs
 #' @return sampledCells, a vector of cellIDs sampled using the geometric sketching method
-geoSketch_sample_from_plaids <- function(Plaid, N, seed=NULL) {
-
-  # Define seed for sampling if given
-  if (!is.null(seed)){
-    set.seed(seed)
-  }
-
+#' @examples
+#' data("mini_nsclc")
+#' plaids <- geoSketch_get_plaid(mini_nsclc$counts, 100)
+#' geoSketch_sample_from_plaids(plaids, 5)
+geoSketch_sample_from_plaids <- function(Plaid, N) {
   # define cells' sampling probabilities as the inverse of their plaid size:
   PlaidCounts <- table(Plaid) # Count the number of cells per bin
   prob <- 1 / PlaidCounts[Plaid]
@@ -152,22 +148,17 @@ geoSketch_sample_from_plaids <- function(Plaid, N, seed=NULL) {
 #' @param max_iter maximum number of iterations used to achieve an acceptable minimum number of bins
 #' @param returnBins determines whether or not to pass back bin labels for each cell
 #' @param minCellsPerBin the minimum number of cells required for a bin to be considered for sampling
-#' @param seed set seed for random sampling
 #' 
 #' @return sampledCells, a vector of cellIDs sampled using the geometric sketching method
 #' @return Plaid, a named vector of binIDs where names correspond to cellIDs
+#' @examples
+#' data("mini_nsclc")
+#' geoSketch(mini_nsclc$counts, 200)
 geoSketch <- function(X, N,
                       alpha=0.1,
                       max_iter=200,
                       returnBins=FALSE,
-                      minCellsPerBin = 1,
-                      seed=NULL) {
-
-  # Define seed for sampling if given
-  if (!is.null(seed)) {
-    set.seed(seed)
-  }
-
+                      minCellsPerBin = 1) {
   # Determine the total number of cells and compare it to the desired sample size 
   nCells <- nrow(X)