From ad0650ddb959d623f29b50219b579da136ac88e9 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Mon, 25 Nov 2024 10:27:35 +0100 Subject: [PATCH] feat: add featureArea for XcmsExperimentHdf5 --- NAMESPACE | 4 +- R/AllGenerics.R | 3 +- R/XcmsExperiment-functions.R | 14 +-- R/XcmsExperiment.R | 16 +++- R/XcmsExperimentHdf5-functions.R | 91 ++++++++++++++----- R/XcmsExperimentHdf5.R | 28 +++++- .../test_XcmsExperimentHdf5-functions.R | 49 +++++++++- tests/testthat/test_XcmsExperimentHdf5.R | 13 +++ 8 files changed, 173 insertions(+), 45 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 9916fd0f..89c1dd0f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -252,7 +252,6 @@ export( "hasFilledChromPeaks", "plotAdjustedRtime", "groupOverlaps", - "featureArea", "loadXcmsData", "matchLamasChromPeaks", "summarizeLamaMatch", @@ -481,6 +480,7 @@ exportMethods("hasChromPeaks", "adjustedRtime", "adjustedRtime<-", "adjustRtimePeakGroups", + "featureArea", "featureDefinitions", "featureDefinitions<-", "featureValues", @@ -607,4 +607,4 @@ export("PercentMissingFilter") export("BlankFlag") ## HDF5 storage mode -exportClasses("XcmsExperimentHdf5") \ No newline at end of file +exportClasses("XcmsExperimentHdf5") diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 0eb9f42a..36a56f21 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -647,6 +647,7 @@ setGeneric("factorGap", function(object) standardGeneric("factorGap")) setGeneric("factorGap<-", function(object, value) standardGeneric("factorGap<-")) setGeneric("family", function(object, ...) standardGeneric("family")) setGeneric("family<-", function(object, value) standardGeneric("family<-")) +setGeneric("featureArea", function(object, ...) standardGeneric("featureArea")) #' @title Extract ion chromatograms for each feature #' @@ -2144,4 +2145,4 @@ setGeneric("write.mzdata", function(object, ...) standardGeneric("write.mzdata") setGeneric("write.mzQuantML", function(object, ...) standardGeneric("write.mzQuantML")) ## X -setGeneric("xcmsSource", function(object, ...) standardGeneric("xcmsSource")) \ No newline at end of file +setGeneric("xcmsSource", function(object, ...) standardGeneric("xcmsSource")) diff --git a/R/XcmsExperiment-functions.R b/R/XcmsExperiment-functions.R index 1ff36086..ce8aa10a 100644 --- a/R/XcmsExperiment-functions.R +++ b/R/XcmsExperiment-functions.R @@ -982,18 +982,6 @@ chrs } -#' @rdname XcmsExperiment -featureArea <- function(object, mzmin = min, mzmax = max, rtmin = min, - rtmax = max, features = character()) { - if (!hasFeatures(object)) - stop("No correspondence results available. Please run ", - "'groupChromPeaks' first.") - if (!length(features)) - features <- rownames(featureDefinitions(object)) - .features_ms_region(object, mzmin = mzmin, mzmax = mzmax, rtmin = rtmin, - rtmax = rtmax, features = features) -} - #' @title Define MS regions for features #' #' @param x `XcmsExperiment` or `XCMSnExp`. @@ -1266,4 +1254,4 @@ XcmsExperiment <- function() { n@.processHistory <- from@processHistory validObject(n) n -} \ No newline at end of file +} diff --git a/R/XcmsExperiment.R b/R/XcmsExperiment.R index 30d5c51f..6c009723 100644 --- a/R/XcmsExperiment.R +++ b/R/XcmsExperiment.R @@ -1548,6 +1548,20 @@ setMethod( else as.logical(nrow(object@featureDefinitions)) }) +#' @rdname XcmsExperiment +setMethod( + "featureArea", "XcmsResult", + function(object, mzmin = min, mzmax = max, rtmin = min, + rtmax = max, features = character()) { + if (!hasFeatures(object)) + stop("No correspondence results available. Please run ", + "'groupChromPeaks' first.") + if (!length(features)) + features <- rownames(featureDefinitions(object)) + .features_ms_region(object, mzmin = mzmin, mzmax = mzmax, rtmin = rtmin, + rtmax = rtmax, features = features) + }) + #' @rdname XcmsExperiment setReplaceMethod("featureDefinitions", "XcmsExperiment", function(object, value) { @@ -2072,4 +2086,4 @@ setMethod( res <- do.call(rbind, res) pb$tick() res - }) \ No newline at end of file + }) diff --git a/R/XcmsExperimentHdf5-functions.R b/R/XcmsExperimentHdf5-functions.R index 865672da..c7cbb707 100644 --- a/R/XcmsExperimentHdf5-functions.R +++ b/R/XcmsExperimentHdf5-functions.R @@ -114,6 +114,7 @@ NULL } if (!keepChromPeaks && hasChromPeaks(x)) { x@chrom_peaks_ms_level <- integer() + x@gap_peaks_ms_level <- integer() drop <- c(drop, .PROCSTEP.PEAK.DETECTION, .PROCSTEP.PEAK.FILLING, .PROCSTEP.CALIBRATION, .PROCSTEP.PEAK.REFINEMENT) } @@ -566,7 +567,15 @@ NULL } #' Returns the rtmin, rtmax, mzmin and mzmax for each feature depending on the -#' associated chrom peaks. +#' associated chrom peaks. For XcmsExperimentHdf5 we have to: +#' - loop over each sample +#' - load the mapping of features to chrom peaks +#' - load the chrom peak data +#' - extract the required information per feature +#' - after the loop: aggregate the information. +#' +#' We could also use `featureValues()`, but would need to call that 4 times, +#' i.e. load the data 4 times. #' #' For the final calculation of the region boundaries using the functions #' defined with parameters `mzmin`, `mzmax`, `rtmin` and `rtmax` we consider @@ -575,39 +584,73 @@ NULL #' in each sample (in contrast to the `.features_ms_region()` function that #' considered all values for all chrom peaks of a feature). #' +#' @param features `character` with the feature IDs. +#' #' @noRd -.h5_features_ms_region <- function(x, mzmin, mzmax, rtmin, rtmax, feature_idx, +.h5_features_ms_region <- function(x, mzmin, mzmax, rtmin, rtmax, features, ms_level = 1L) { - ## - Get for each feature_idx, each sample the min rtmin, mzmin and max - ## rtmax, mzmax. - ## - add the values to a `matrix` or a `list`? what is more efficient? - ## - calculate the boundaries using these. - ## need to get for each feature all values to select min/max/median or - ## whatever. + pb <- progress_bar$new(format = paste0("[:bar] :current/:", + "total (:percent) in ", + ":elapsed"), + total = length(x@sample_id) + 1L, clear = FALSE) + pb$tick(0) + fids <- rhdf5::h5read(x@hdf5_file, + paste0("/features/ms_", ms_level, + "/feature_definitions_rownames"), drop = TRUE) + feature_idx <- unique(match(features, fids)) + if (anyNA(feature_idx)) + stop("Some of the provided feature IDs were not found in the", + " data set.", call. = FALSE) + feature_idx <- sort(feature_idx) + cn <- .h5_chrom_peaks_colnames(x, ms_level) + cn_idx <- match(c("mzmin", "mzmax", "rtmin", "rtmax"), cn) + template <- rep(NA_real_, length(feature_idx)) + rtmin_mat <- rtmax_mat <- mzmin_mat <- mzmax_mat <- + vector("list", length(x@sample_id)) for (i in seq_along(x@sample_id)) { sample_id <- x@sample_id[i] sid <- paste0("/", sample_id, "/ms_", ms_level) - fidx <- xcms:::.h5_read_data(x@hdf5_file, sample_id, + fidx <- .h5_read_data(x@hdf5_file, sample_id, name = "feature_to_chrom_peaks", ms_level = ms_level)[[1L]] - fidx <- fidx[fidx[, 1L] %in% feature_ids, ] - - vals <- .h5_read_data(hdf5_file, sample_id, name = "chrom_peaks", - ms_level = ms_level, j = col_idx)[[1L]] -LLLLLL - } -} - -a <- function() { - m <- matrix(NA_real_, ncol = 1000, nrow = 2000) - for (i in 1:1000) { + fidx <- fidx[fidx[, 1L] %in% feature_idx, , drop = FALSE] + if (nrow(fidx)) { + vals <- .h5_read_data( + x@hdf5_file, sample_id, name = "chrom_peaks", + ms_level = ms_level, i = fidx[, 2L], j = cn_idx, + read_colnames = FALSE, read_rownames = FALSE)[[1L]] + idx <- as.factor(match(fidx[, 1L], feature_idx)) + mzmin_mat[[i]] <- .h5_features_ms_region_values( + template, vals[, 1L], idx, dups = base::min) + mzmax_mat[[i]] <- .h5_features_ms_region_values( + template, vals[, 2L], idx, dups = base::max) + rtmin_mat[[i]] <- .h5_features_ms_region_values( + template, vals[, 3L], idx, dups = base::min) + rtmax_mat[[i]] <- .h5_features_ms_region_values( + template, vals[, 4L], idx, dups = base::max) + } else + mzmin_mat[[i]] <- mzmax_mat[[i]] <- rtmin_mat[[i]] <- + rtmax_mat[[i]] <- template + pb$tick() } + mzmin_mat <- do.call(base::cbind, mzmin_mat) + mzmax_mat <- do.call(base::cbind, mzmax_mat) + rtmin_mat <- do.call(base::cbind, rtmin_mat) + rtmax_mat <- do.call(base::cbind, rtmax_mat) + res <- cbind(mzmin = apply(mzmin_mat, 1L, mzmin, na.rm = TRUE), + mzmax = apply(mzmax_mat, 1L, mzmax, na.rm = TRUE), + rtmin = apply(rtmin_mat, 1L, rtmin, na.rm = TRUE), + rtmax = apply(rtmax_mat, 1L, rtmax, na.rm = TRUE)) + rownames(res) <- fids[feature_idx] + res[features, , drop = FALSE] } -b <- function() { - l <- vector("list", 1000) - for (i in 1:1000) { - } +.h5_features_ms_region_values <- function(x, vals, map, dups = base::min) { + vl <- base::split(vals, map) + l <- lengths(vl) > 1L + x[as.integer(names(vl[!l]))] <- unlist(vl[!l], FALSE, FALSE) + x[as.integer(names(vl[l]))] <- vapply(vl[l], dups, x[1L], USE.NAMES = FALSE) + x } ################################################################################ diff --git a/R/XcmsExperimentHdf5.R b/R/XcmsExperimentHdf5.R index 24ea92ec..d8ead766 100644 --- a/R/XcmsExperimentHdf5.R +++ b/R/XcmsExperimentHdf5.R @@ -426,12 +426,16 @@ setMethod( ## Identify for each feature the samples in which there is a missing ## value fvals <- is.na(featureValues(object, msLevel = msLevel, method = "sum")) - keep <- rowSums(fvals) > 0 - fidx <- seq_len(nrow(fvals))[keep] + fidx <- which(rowSums(fvals) > 0) fvals <- fvals[keep, , drop = FALSE] ## Define the feature region to integrate signal from. Need to iterate ## over all samples/files. - + message("Defining MS area to integrate signal from") + fr <- .h5_features_ms_region( + object, mzmin = param@mzmin, mzmax = param@mzmax, + rtmin = param@rtmin, rtmax = param@rtmax, feature_idx = fidx, + ms_level = msLevel) + ## LLLLLL feature_ids <- rownames(featureDefinitions(object, msLevel = msLevel)) fr <- .features_ms_region(object, mzmin = param@mzmin, @@ -740,6 +744,24 @@ setMethod( object }) +#' @rdname hidden_aliases +setMethod( + "featureArea", "XcmsExperimentHdf5", + function(object, mzmin = min, mzmax = max, rtmin = min, + rtmax = max, features = character(), msLevel = 1L) { + if (!hasFeatures(object, msLevel)) + stop("No correspondence results available. Please run ", + "'groupChromPeaks' first.", call. = FALSE) + if (!length(features)) + features <- rhdf5::h5read(object@hdf5_file, + paste0("/features/ms_", msLevel, + "/feature_definitions_rownames"), + drop = TRUE) + .h5_features_ms_region( + object, mzmin = mzmin, mzmax = mzmax, rtmin = rtmin, + rtmax = rtmax, features, ms_level = msLevel) + }) + #' @rdname hidden_aliases setReplaceMethod("featureDefinitions", "XcmsExperimentHdf5", function(object, value) { diff --git a/tests/testthat/test_XcmsExperimentHdf5-functions.R b/tests/testthat/test_XcmsExperimentHdf5-functions.R index 2aaca46d..a7c81c3d 100644 --- a/tests/testthat/test_XcmsExperimentHdf5-functions.R +++ b/tests/testthat/test_XcmsExperimentHdf5-functions.R @@ -763,4 +763,51 @@ test_that(".h5_x_chromatogram works", { fts <- featureDefinitions(res) }) -rm(h5f_full_g) \ No newline at end of file +test_that(".h5_features_ms_region_values works", { + tmpl <- rep(NA_real_, 20) + v <- as.numeric(1:14) + m <- c(2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 9, 10, 11) + res <- .h5_features_ms_region_values(tmpl, v, m) + expect_true(is.na(res[1L])) + expect_true(all(is.na(res[12:20]))) + expect_equal(res[2:11], c(1, 3, 4, 5, 6, 7, 8, 10, 13, 14)) + res <- .h5_features_ms_region_values(tmpl, v, m, max) + expect_true(is.na(res[1L])) + expect_true(all(is.na(res[12:20]))) + expect_equal(res[2:11], c(2, 3, 4, 5, 6, 7, 9, 12, 13, 14)) +}) + +test_that(".h5_features_ms_region works", { + res <- xcms:::.h5_features_ms_region( + xmseg_full_h5, mzmin = min, mzmax = max, rtmin = min, rtmax = max, + features = rownames(featureDefinitions(xmseg_full_h5)), + ms_level = 1L) + expect_true(is.matrix(res)) + expect_equal(colnames(res), c("mzmin", "mzmax", "rtmin", "rtmax")) + expect_equal(nrow(res), nrow(featureDefinitions(xmseg_full_h5))) + expect_equal(rownames(res), rownames(featureDefinitions(xmseg_full_h5))) + + res_sub <- xcms:::.h5_features_ms_region( + xmseg_full_h5, mzmin = min, mzmax = max, rtmin = min, rtmax = max, + features = rownames(res)[c(4, 10, 12)], ms_level = 1L) + expect_true(is.matrix(res_sub)) + expect_equal(colnames(res), c("mzmin", "mzmax", "rtmin", "rtmax")) + expect_equal(nrow(res_sub), 3) + expect_equal(res[c(4, 10, 12), ], res_sub) + + res_sub <- xcms:::.h5_features_ms_region( + xmseg_full_h5, mzmin = min, mzmax = max, rtmin = min, rtmax = max, + features = rownames(res)[c(10, 12, 10, 4)], ms_level = 1L) + expect_true(is.matrix(res_sub)) + expect_equal(colnames(res), c("mzmin", "mzmax", "rtmin", "rtmax")) + expect_equal(nrow(res_sub), 4) + expect_equal(res[c(10, 12, 10, 4), ], res_sub) + + ## errors + expect_error(xcms:::.h5_features_ms_region( + xmseg_full_h5, mzmin = min, mzmax = max, rtmin = min, rtmax = max, + features = c(rownames(res)[c(10, 12, 10, 4)], "a"), ms_level = 1L), + "not found") +}) + +rm(h5f_full_g) diff --git a/tests/testthat/test_XcmsExperimentHdf5.R b/tests/testthat/test_XcmsExperimentHdf5.R index b4cfad72..03255002 100644 --- a/tests/testthat/test_XcmsExperimentHdf5.R +++ b/tests/testthat/test_XcmsExperimentHdf5.R @@ -513,6 +513,19 @@ test_that("fillChromPeaks,XcmsExperimentHdf5 works", { "No feature definitions") }) +test_that("featureArea,XcmsExperimentHdf5 works", { + expect_error(featureArea(xmse_h5), "No correspondence") + expect_error(featureArea(xmseg_full_h5, msLevel = 2L), "No correspondence") + expect_error(featureArea(xmseg_full_h5, features = c("a", "b")), + "Some of the provided") + res <- featureArea(xmseg_full_h5) + ref <- featureArea(xmseg_full_ref) + expect_equal(unname(res), unname(ref)) + res <- featureArea(xmseg_full_h5, features = rownames(res)[c(5, 12, 20)]) + ref <- featureArea(xmseg_full_ref, features = rownames(ref)[c(5, 12, 20)]) + expect_equal(unname(res), unname(ref)) +}) + ## test_that(".h5_feature_chrom_peaks_sample works", { ## cn <- .h5_chrom_peaks_colnames(xmseg_full_h5, 1L) ## res <- .h5_feature_chrom_peaks_sample("S3", xmseg_full_h5@hdf5_file,