Merge pull request #201 from cvanderaa/issue199

Issue199
rformassspectrometry · Mar 8, 2024 · e054fc3 · e054fc3
2 parents e614750 + 0b16164
commit e054fc3
Showing 1 changed file with 121 additions and 65 deletions.
diff --git a/R/QFeatures-constructors.R b/R/QFeatures-constructors.R
@@ -393,15 +393,16 @@ QFeatures <- function(..., assayLinks = NULL) {
 ##' @param extractedData A data.frame or any object that can be coerced
 ##'     to a data.frame that contains the data from the `*_ms1_extracted.tsv`
 ##'     file generated by DIA-NN. This argument is optional and is
-##'     only applicable for mulitplixed experiments
+##'     currently only applicable for mTRAQ multiplexed experiments
+##'     where DIA-NN was run using the `plexdia` module.
 ##'
 ##' @param ecol A `character(1)` indicating which column in
 ##'     `reportData` contains the quantitative information. Default is
 ##'     `"MS1.Area"`.
 ##'
 ##' @param multiplexing A `character(1)` indicating the type of
 ##'     multiplexing used in the experiment. Provide `"none"` if the
-##'     experiment is label-free (default). Available options are:
+##'     experiment is label-free (default). Alternative options are:
 ##'     `"mTRAQ"`.
 ##'
 ##' @param ... Further arguments passed to [readQFeatures()].
@@ -431,9 +432,32 @@ QFeatures <- function(..., assayLinks = NULL) {
 ##' cd <- data.frame(File.Name = unique(x[[1]]))
 ##' readQFeaturesFromDIANN(colData = cd, reportData = x, ecol = "Ms1.Area")
 readQFeaturesFromDIANN <- function(colData, reportData, extractedData = NULL,
-                                   ecol = "MS1.Area",
-                                   multiplexing = "none", # "none" or "mTRAQ"
+                                   ecol = "Ms1.Area", multiplexing = "none",
                                    ...) {
+    suppArgs <- .checkDiannArguments(
+        colData, reportData, extractedData, ecol, multiplexing, ...
+    )
+    if (multiplexing == "mTRAQ") {
+        reportData <- .formatMtraqReportData(reportData, colData, ecol)
+    } else if (multiplexing == "none") {
+        colData$Label <- ecol
+    }
+    allArgs <- c(suppArgs, list(
+        assayData = reportData, colAnnotation = colData,
+        batchCol = "File.Name", channelCol = "Label"
+    ))
+    out <- do.call(readQFeatures, allArgs)
+    if (!is.null(extractedData)) {
+        out <- .addDiannExtractedData(out, extractedData)
+    }
+    out
+}
+
+## Internal function that checks whether the provided arguments match
+## the expected input that is generated by DIA-NN.
+## Parameter description is the same as for `readQFeaturesFromDIANN()`
+.checkDiannArguments <- function(colData, reportData, extractedData,
+                                 ecol, multiplexing, ...) {
     diannReportCols <- c("File.Name", "Precursor.Id", "Modified.Sequence")
     if (!all(diannReportCols %in% colnames(reportData)))
         stop("'reportData' is not an expected DIA-NN report table ",
@@ -449,76 +473,108 @@ readQFeaturesFromDIANN <- function(colData, reportData, extractedData = NULL,
              "('multiplexed == \"none\"') is not expected. Raise an ",
              "issue if you need this feature: ",
              "https://github.com/UCLouvain-CBIO/scp/issues/new/choose")
+    .checkDiannArgumentsDots(multiplexing, ...)
+}
 
-    args <- list(...)
-    ## Get the label used for the reportData
+## Internal function that adapts the dots arguments (that will be used
+## by `readQFeatures()`) depending on the multiplexing approach used.
+.checkDiannArgumentsDots <- function(multiplexing, ...) {
+    suppArgs <- list(...)
     if (multiplexing == "mTRAQ") {
-        ## Extracted the mTRAQ label from the modified sequence
-        reportData$Label <- sub("^.*[Q-](\\d).*$", "\\1", reportData$Modified.Sequence)
-        reportData$Precursor.Id <- gsub("\\(mTRAQ.*?\\)", "(mTRAQ)", reportData$Precursor.Id)
-        args$sep <- "."
-        ## Make sure the colData has the Label column
-        if (!"Label" %in% colnames(colData))
-            stop("'colData' must contain a column named 'Label' that ",
-                 "provides the mTRAQ reagent used to label the ",
-                 "samples and/or single cells.")
-        if (any(mis <- !colData$Label %in% reportData$Label)) {
-            stop("Some labels from 'colData$Label' were not found as",
-                 "part of the mTRAQ labels found in ",
-                 "'reportData$Modified.Sequence': ",
-                 paste0(unique(colData$Label[mis]), collapse = ", "))
-        }
-        ## Identify which variables are correlated with the run-specific
-        ## precursor IDs
-        nIds <- length(unique(paste0(reportData$Precursor.Id, reportData$File.Name)))
-        nLevels <- sapply(colnames(reportData), function(x) {
-            nrow(unique(reportData[, c("Precursor.Id", "File.Name", x)]))
-        })
-        idCols <- names(nLevels)[nLevels == nIds]
-        ## Transform the reportData to a wide format with respect to label
-        reportData <- pivot_wider(reportData, id_cols = all_of(idCols),
-                                  names_from = "Label",
-                                  values_from = ecol)
+        suppArgs$sep <- "."
     } else if (multiplexing == "none") {
-        colData$Label <- ecol
-        args$sep <- ""
-        args$suffix <- ""
+        suppArgs$sep <- ""
+        suppArgs$suffix <- ""
     } else {
         stop("The '", multiplexing, "' multiplexing strategy is not ",
              "implemented. Raise an issue if you need this feature: ",
              "https://github.com/UCLouvain-CBIO/scp/issues/new/choose")
     }
+    suppArgs
+}
+
+## (Only for mTRAQ multiplexing!) Internal function that extracts the
+## mTRAQ labels from the modified sequence, strips the label from the
+## mTRAQ annotation in the precursor ID, identifies the columns that are
+## constant within each precursor and puts the quantification data for
+## different mTRAQ labels in separate columns (wide format).
+.formatMtraqReportData <- function(reportData, colData, ecol) {
+    reportData$Label <-
+        sub("^.*[Q-](\\d).*$", "\\1", reportData$Modified.Sequence)
+    reportData$Precursor.Id <-
+        gsub("\\(mTRAQ.*?\\)", "(mTRAQ)", reportData$Precursor.Id)
+    .checkLabelsInColData(colData, reportData)
+    idCols <- .findPrecursorVariables(reportData)
+    pivot_wider(
+        reportData, id_cols = all_of(idCols),
+        names_from = "Label", values_from = all_of(ecol)
+    )
+}
 
-    ## Read using readSCP
-    out <- do.call(readQFeatures, c(args, list(assayData = reportData,
-                                               colAnnotation = colData,
-                                               batchCol = "File.Name",
-                                               channelCol = "Label")))
+## Internal function that identifies which variables in the report
+## data are constant within each precursor (within each run).
+.findPrecursorVariables <- function(reportData) {
+    precIds <- paste0(reportData$Precursor.Id, reportData$File.Name)
+    nUniqueIds <- length(unique(precIds))
+    nLevels <- sapply(colnames(reportData), function(x) {
+        nrow(unique(reportData[, c("Precursor.Id", "File.Name", x)]))
+    })
+    names(nLevels)[nLevels == nUniqueIds]
+}
 
-    ## Optionally, add the extractedData
-    if (!is.null(extractedData)) {
-        labs <- unique(colData$Label)
-        ## DIA-NN appends the label to the run name
-        quantCols <- grep(paste0("[", paste0(labs, collapse = ""), "]$"),
-                          colnames(extractedData))
-        extractedData <- readSummarizedExperiment(extractedData,
-                                                  ecol = quantCols,
-                                                  fnames = "Precursor.Id")
-        ## Make sure extractedData has the sames samples as reportData
-        cnames <- unique(unlist(colnames(out)))
-        if (any(mis <- !cnames %in% colnames(extractedData)))
-            stop("Some columns present in reportData are not found in ",
-                 "extracted data", paste0(cnames[mis], collapse = ", "),
-                 "\nAre you sure the two tables were generated from ",
-                 "the same experiment?")
-        extractedData <- extractedData[, cnames]
-        ## Add the assay to the QFeatures object
-        anames <- names(out)
-        out <- addAssay(out, extractedData, name = "Ms1Extracted")
-        out <- addAssayLink(out,
-                            from = anames, to = "Ms1Extracted",
-                            varFrom = rep("Precursor.Id", length(anames)),
-                            varTo = "Precursor.Id")
+## Internal function that ensures that the reportData and the colData
+## are correctly linked.
+.checkLabelsInColData <- function(colData, reportData) {
+    if (!"Label" %in% colnames(colData))
+        stop("'colData' must contain a column named 'Label' that ",
+             "provides the mTRAQ reagent used to label the ",
+             "samples and/or single cells.")
+    if (any(mis <- !colData$Label %in% reportData$Label)) {
+        stop("Some labels from 'colData$Label' were not found as",
+             "part of the mTRAQ labels found in ",
+             "'reportData$Modified.Sequence': ",
+             paste0(unique(colData$Label[mis]), collapse = ", "))
     }
-    out
+    NULL
+}
+
+## Internal function that adds the extractedData to a QFeatures
+## object. The function first converts the extractedData to a
+## SingleCellExperiment object and subsets the SCE for the set of
+## shared samples. The added assay is automatically linked (using
+## AssayLinks) to the reportData.
+## Developer's note: the function assumes that DIA-NN creates sample
+## names in the extracted data by appending the labels to the run
+## names
+## @param object A QFeatures object containing DIA-NN report data, as
+##     generated by readQFeatures
+## @param extractedData A data.frame or any object that can be coerced
+##     to a data.frame that contains the data from the `*_ms1_extracted.tsv`
+##     file generated by DIA-NN.
+.addDiannExtractedData <- function(object, extractedData) {
+    quantColPattern <- paste0(unique(object$Label), "$", collapse = "|")
+    quantCols <- grep(quantColPattern, colnames(extractedData))
+    extractedData <- readSingleCellExperiment(
+        extractedData, ecol = quantCols, fnames = "Precursor.Id"
+    )
+    extractedData <- .keepSharedSamples(extractedData, object)
+    object <- addAssay(object, extractedData, name = "Ms1Extracted")
+    addAssayLink(
+        object,
+        from = grep("Ms1Extracted", names(object), invert = TRUE),
+        to = "Ms1Extracted",
+        varFrom = rep("Precursor.Id", length(names(object)) - 1),
+        varTo = "Precursor.Id"
+    )
+}
+
+## Internal function that subsets extractedData to the samples in object.
+.keepSharedSamples <- function(extractedData, object) {
+    cnames <- unique(unlist(colnames(object)))
+    if (any(mis <- !cnames %in% colnames(extractedData)))
+        stop("Some columns present in reportData are not found in ",
+             "extracted data", paste0(cnames[mis], collapse = ", "),
+             "\nAre you sure the two tables were generated from ",
+             "the same experiment?")
+    extractedData[, cnames]
 }