Skip to content

Commit

Permalink
Merge pull request #201 from cvanderaa/issue199
Browse files Browse the repository at this point in the history
Issue199
  • Loading branch information
lgatto authored Mar 8, 2024
2 parents e614750 + 0b16164 commit e054fc3
Showing 1 changed file with 121 additions and 65 deletions.
186 changes: 121 additions & 65 deletions R/QFeatures-constructors.R
Original file line number Diff line number Diff line change
Expand Up @@ -393,15 +393,16 @@ QFeatures <- function(..., assayLinks = NULL) {
##' @param extractedData A data.frame or any object that can be coerced
##' to a data.frame that contains the data from the `*_ms1_extracted.tsv`
##' file generated by DIA-NN. This argument is optional and is
##' only applicable for multiplexed experiments
##' currently only applicable for mTRAQ multiplexed experiments
##' where DIA-NN was run using the `plexdia` module.
##'
##' @param ecol A `character(1)` indicating which column in
##' `reportData` contains the quantitative information. Default is
##' `"Ms1.Area"`.
##'
##' @param multiplexing A `character(1)` indicating the type of
##' multiplexing used in the experiment. Provide `"none"` if the
##' experiment is label-free (default). Available options are:
##' experiment is label-free (default). Alternative options are:
##' `"mTRAQ"`.
##'
##' @param ... Further arguments passed to [readQFeatures()].
Expand Down Expand Up @@ -431,9 +432,32 @@ QFeatures <- function(..., assayLinks = NULL) {
##' cd <- data.frame(File.Name = unique(x[[1]]))
##' readQFeaturesFromDIANN(colData = cd, reportData = x, ecol = "Ms1.Area")
readQFeaturesFromDIANN <- function(colData, reportData, extractedData = NULL,
                                   ecol = "Ms1.Area", multiplexing = "none",
                                   ...) {
    ## Each formal argument is declared exactly once: repeating `ecol`
    ## and `multiplexing` in the signature is a syntax error in R.
    ## Validate the inputs and collect the supplementary arguments
    ## that are forwarded to readQFeatures().
    suppArgs <- .checkDiannArguments(
        colData, reportData, extractedData, ecol, multiplexing, ...
    )
    if (multiplexing == "mTRAQ") {
        ## One quantification column per mTRAQ label (wide format).
        reportData <- .formatMtraqReportData(reportData, colData, ecol)
    } else if (multiplexing == "none") {
        ## Label-free: the quantification column itself acts as the
        ## single "channel".
        colData$Label <- ecol
    }
    allArgs <- c(suppArgs, list(
        assayData = reportData, colAnnotation = colData,
        batchCol = "File.Name", channelCol = "Label"
    ))
    out <- do.call(readQFeatures, allArgs)
    ## Optionally add the extracted MS1 data as an additional assay.
    if (!is.null(extractedData)) {
        out <- .addDiannExtractedData(out, extractedData)
    }
    out
}

## Internal function that checks whether the provided arguments match
## the expected input that is generated by DIA-NN.
## Parameter description is the same as for `readQFeaturesFromDIANN()`
.checkDiannArguments <- function(colData, reportData, extractedData,
ecol, multiplexing, ...) {
diannReportCols <- c("File.Name", "Precursor.Id", "Modified.Sequence")
if (!all(diannReportCols %in% colnames(reportData)))
stop("'reportData' is not an expected DIA-NN report table ",
Expand All @@ -449,76 +473,108 @@ readQFeaturesFromDIANN <- function(colData, reportData, extractedData = NULL,
"('multiplexed == \"none\"') is not expected. Raise an ",
"issue if you need this feature: ",
"https://github.com/UCLouvain-CBIO/scp/issues/new/choose")
.checkDiannArgumentsDots(multiplexing, ...)
}

args <- list(...)
## Get the label used for the reportData
## Internal function that adapts the dots arguments (that will be
## forwarded to `readQFeatures()`) depending on the multiplexing
## approach used.
## @param multiplexing A `character(1)`, either "none" or "mTRAQ".
## @param ... Supplementary arguments provided by the user.
## @return A list containing the arguments in `...` with `sep` (and
##     `suffix` for label-free data) set for the multiplexing
##     approach. Any other `multiplexing` value throws an error.
.checkDiannArgumentsDots <- function(multiplexing, ...) {
    ## Only the dots are manipulated here: the report data and the
    ## sample annotations are not available in this function.
    suppArgs <- list(...)
    if (multiplexing == "mTRAQ") {
        suppArgs$sep <- "."
    } else if (multiplexing == "none") {
        suppArgs$sep <- ""
        suppArgs$suffix <- ""
    } else {
        stop("The '", multiplexing, "' multiplexing strategy is not ",
             "implemented. Raise an issue if you need this feature: ",
             "https://github.com/UCLouvain-CBIO/scp/issues/new/choose")
    }
    suppArgs
}

## (Only for mTRAQ multiplexing!) Internal function that reshapes the
## report data to a wide format with respect to the mTRAQ label:
## 1. the mTRAQ label is extracted from the modified sequence;
## 2. the label-specific mTRAQ annotation in the precursor ID is
##    replaced by a generic "(mTRAQ)" tag;
## 3. the labels in `colData` are checked against the extracted ones;
## 4. the columns that are constant within precursor are used as
##    identifiers and the values in `ecol` are spread in one column
##    per label.
.formatMtraqReportData <- function(reportData, colData, ecol) {
    modifiedSequences <- reportData$Modified.Sequence
    reportData$Label <- sub("^.*[Q-](\\d).*$", "\\1", modifiedSequences)
    precursorIds <- reportData$Precursor.Id
    reportData$Precursor.Id <- gsub(
        "\\(mTRAQ.*?\\)", "(mTRAQ)", precursorIds
    )
    .checkLabelsInColData(colData, reportData)
    constantCols <- .findPrecursorVariables(reportData)
    pivot_wider(
        reportData,
        id_cols = all_of(constantCols),
        names_from = "Label",
        values_from = all_of(ecol)
    )
}

## Read using readSCP
out <- do.call(readQFeatures, c(args, list(assayData = reportData,
colAnnotation = colData,
batchCol = "File.Name",
channelCol = "Label")))
## Internal function that identifies which variables in the report
## data are constant within each precursor (within each run).
## @param reportData A table with at least the columns `Precursor.Id`
##     and `File.Name`.
## @return A `character()` with the names of the columns that take a
##     single value for every (Precursor.Id, File.Name) combination.
.findPrecursorVariables <- function(reportData) {
    keyCols <- c("Precursor.Id", "File.Name")
    ## Count the distinct key combinations on the columns themselves.
    ## Pasting the two keys without a separator can merge distinct
    ## pairs (e.g. "ab" + "c" and "a" + "bc").
    nUniqueIds <- nrow(unique(reportData[, keyCols]))
    ## A column is constant within precursor when adding it to the
    ## keys does not create additional distinct combinations.
    nLevels <- vapply(colnames(reportData), function(x) {
        nrow(unique(reportData[, c(keyCols, x)]))
    }, integer(1))
    names(nLevels)[nLevels == nUniqueIds]
}

## Optionally, add the extractedData
if (!is.null(extractedData)) {
labs <- unique(colData$Label)
## DIA-NN appends the label to the run name
quantCols <- grep(paste0("[", paste0(labs, collapse = ""), "]$"),
colnames(extractedData))
extractedData <- readSummarizedExperiment(extractedData,
ecol = quantCols,
fnames = "Precursor.Id")
## Make sure extractedData has the sames samples as reportData
cnames <- unique(unlist(colnames(out)))
if (any(mis <- !cnames %in% colnames(extractedData)))
stop("Some columns present in reportData are not found in ",
"extracted data", paste0(cnames[mis], collapse = ", "),
"\nAre you sure the two tables were generated from ",
"the same experiment?")
extractedData <- extractedData[, cnames]
## Add the assay to the QFeatures object
anames <- names(out)
out <- addAssay(out, extractedData, name = "Ms1Extracted")
out <- addAssayLink(out,
from = anames, to = "Ms1Extracted",
varFrom = rep("Precursor.Id", length(anames)),
varTo = "Precursor.Id")
## Internal function that ensures that the reportData and the colData
## are correctly linked through the mTRAQ label.
## @param colData A table that must contain a `Label` column.
## @param reportData A table with a `Label` column holding the mTRAQ
##     labels extracted from the modified sequences.
## @return `NULL` when all checks pass. Throws an error if `colData`
##     has no `Label` column or if some of its labels are absent from
##     `reportData$Label`.
.checkLabelsInColData <- function(colData, reportData) {
    if (!"Label" %in% colnames(colData))
        stop("'colData' must contain a column named 'Label' that ",
             "provides the mTRAQ reagent used to label the ",
             "samples and/or single cells.")
    if (any(mis <- !colData$Label %in% reportData$Label)) {
        stop("Some labels from 'colData$Label' were not found as ",
             "part of the mTRAQ labels found in ",
             "'reportData$Modified.Sequence': ",
             paste0(unique(colData$Label[mis]), collapse = ", "))
    }
    NULL
}

## Internal function that adds the extractedData to a QFeatures
## object. The function first converts the extractedData with
## `readSingleCellExperiment()` and subsets the result to the samples
## present in `object`. The added assay ("Ms1Extracted") is linked
## (using AssayLinks) to every assay already present in `object`
## through the `Precursor.Id` variable.
## Developer's note: the function assumes that DIA-NN creates sample
## names in the extracted data by appending the labels to the run
## names
## NOTE(review): the earlier inline implementation called
## `readSummarizedExperiment()`; confirm `readSingleCellExperiment()`
## is available in this package's namespace.
## @param object A QFeatures object containing DIA-NN report data, as
##     generated by `readQFeatures()`. Its colData must contain a
##     `Label` column.
## @param extractedData A data.frame or any object that can be coerced
##     to a data.frame that contains the data from the
##     `*_ms1_extracted.tsv` file generated by DIA-NN.
.addDiannExtractedData <- function(object, extractedData) {
    ## Quantification columns are those whose name ends with a label
    quantColPattern <- paste0(unique(object$Label), "$", collapse = "|")
    quantCols <- grep(quantColPattern, colnames(extractedData))
    extractedData <- readSingleCellExperiment(
        extractedData, ecol = quantCols, fnames = "Precursor.Id"
    )
    extractedData <- .keepSharedSamples(extractedData, object)
    ## Record the existing assays before adding the new one, so that
    ## `from` and `varFrom` always have the same length.
    reportAssays <- names(object)
    object <- addAssay(object, extractedData, name = "Ms1Extracted")
    addAssayLink(
        object,
        from = reportAssays,
        to = "Ms1Extracted",
        varFrom = rep("Precursor.Id", length(reportAssays)),
        varTo = "Precursor.Id"
    )
}

## Internal function that subsets the extractedData to keep only the
## samples (columns) that are present in `object`.
## @param extractedData An object with column names that supports
##     `[, j]` subsetting.
## @param object An object whose `colnames()`, once unlisted, provide
##     the sample names to keep.
## @return `extractedData` restricted to (and ordered by) the sample
##     names found in `object`. Throws an error if some of these
##     samples are missing from `extractedData`.
.keepSharedSamples <- function(extractedData, object) {
    cnames <- unique(unlist(colnames(object)))
    if (any(mis <- !cnames %in% colnames(extractedData)))
        stop("Some columns present in reportData are not found in ",
             "extracted data: ", paste0(cnames[mis], collapse = ", "),
             "\nAre you sure the two tables were generated from ",
             "the same experiment?")
    extractedData[, cnames]
}

0 comments on commit e054fc3

Please sign in to comment.