diff --git a/DESCRIPTION b/DESCRIPTION index 808166a3..05339c8e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -66,7 +66,8 @@ Imports: Spectra (>= 1.15.7), progress, RColorBrewer, - MetaboCoreUtils (>= 1.11.2) + MetaboCoreUtils (>= 1.11.2), + data.table Suggests: BiocStyle, caTools, diff --git a/NAMESPACE b/NAMESPACE index 72443de8..9916fd0f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,6 +57,8 @@ importFrom("utils", "flush.console", "head", "object.size", importFrom("MassSpecWavelet", "peakDetectionCWT", "tuneInPeakInfo") +importFrom("data.table", "rbindlist") + ## MSnbase: importClassesFrom("MSnbase", "MSnExp", "pSet", "OnDiskMSnExp", "Chromatogram", "MChromatograms", "MSpectra") diff --git a/R/XcmsExperimentHdf5-functions.R b/R/XcmsExperimentHdf5-functions.R index 32ed89cf..fcd7ed8b 100644 --- a/R/XcmsExperimentHdf5-functions.R +++ b/R/XcmsExperimentHdf5-functions.R @@ -302,22 +302,31 @@ NULL } else idx_columns <- NULL ids <- rep(x@sample_id, length(msLevel)) msl <- rep(msLevel, each = length(x@sample_id)) + if (by_sample) { + sample_idx <- integer() + } else { + ## pass the sample index to the import function to add the "sample" col + sample_idx <- match(ids, x@sample_id) + names(sample_idx) <- paste0("/", ids, "/ms_", msl, "/chrom_peaks") + } res <- .h5_read_data(x@hdf5_file, id = ids, name = "chrom_peaks", ms_level = msl, read_colnames = TRUE, read_rownames = TRUE, j = idx_columns, - rt = rt, mz = mz, ppm = ppm, type = type) + rt = rt, mz = mz, ppm = ppm, type = type, + sample_index = sample_idx) if (by_sample) { names(res) <- ids res } else { - l <- vapply(res, nrow, 1L) - cbind(do.call(rbind, res), sample = rep(match(ids, x@sample_id), l)) + do.call(base::rbind, res) } } #' Extract the `chromPeakData` data.frame. Using `peaks` allows to reduce memory #' demand because only data from the specified chrom peaks is returned. This #' assumes that `chromPeaks()` was called before to get the IDs of the peaks. +#' We're using the data.table::rbindlist to combine the data.frames because its +#' much faster - but unfortunately drops also the rownames. #' #' @param x `XcmsExperimentHdf5` #' @@ -330,24 +339,25 @@ NULL #' @param by_sample `logical(1)` whether results should be `rbind` or returned #' as a `list` of `data.frame`. #' +#' @importFrom data.table rbindlist +#' #' @noRd .h5_chrom_peak_data <- function(x, msLevel = integer(), columns = character(), peaks = character(), by_sample = TRUE) { ids <- rep(x@sample_id, length(msLevel)) msl <- rep(msLevel, each = length(x@sample_id)) - ## Eventually pass chrom peak ids along to read only specicic data... + names(msl) <- paste0("/", ids, "/ms_", msl, "/chrom_peak_data") res <- .h5_read_data(x@hdf5_file, id = ids, name = "chrom_peak_data", - ms_level = msl, read_rownames = TRUE, peaks = peaks) + ms_level = msl, read_rownames = TRUE, peaks = peaks, + ms_levels = msl) if (by_sample) { names(res) <- ids - res <- mapply(FUN = function(a, b) { - a$ms_level <- b - a - }, res, msl, SIMPLIFY = FALSE) res } else { - l <- vapply(res, nrow, 1L) - cbind(do.call(rbind, res), ms_level = rep(msl, l)) + rn <- unlist(lapply(res, rownames), use.names= FALSE, recursive = FALSE) + res <- base::as.data.frame(data.table::rbindlist(res)) + attr(res, "row.names") <- rn + res } } @@ -358,6 +368,15 @@ NULL drop = TRUE) } +.h5_chrom_peaks_rownames <- function(x, msLevel = x@chrom_peaks_ms_level) { + ids <- rep(x@sample_id, length(msLevel)) + msl <- rep(msLevel, each = length(x@sample_id)) + h5 <- rhdf5::H5Fopen(x@hdf5_file) + on.exit(invisible(rhdf5::H5Fclose(h5))) + d <- paste0("/", ids, "/ms_", msl, "/chrom_peaks_rownames") + lapply(d, FUN = rhdf5::h5read, file = h5, drop = TRUE) +} + .h5_chrom_peak_data_colnames <- function(x, msLevel = 1L) { h5 <- rhdf5::H5Fopen(x@hdf5_file) on.exit(rhdf5::H5Fclose(h5)) @@ -594,7 +613,7 @@ NULL ms_level = ms_level, read_colnames = TRUE, i = idx, read_rownames = FALSE)[[1L]] b$ms_level <- rep(ms_level, length(idx)) - rownames(b) <- rownames(a) + attr(b, "row.names") <- rownames(a) tmp <- chr@.Data[j, i][[1L]] slot(tmp, "chromPeaks", check = FALSE) <- a slot(tmp, "chromPeakData", check = FALSE) <- as(b, "DataFrame") @@ -697,14 +716,19 @@ NULL read_rownames = FALSE, rownames = paste0(name, "_rownames"), rt = numeric(), mz = numeric(), - ppm = 0, type = "any") { + ppm = 0, type = "any", + sample_index = integer()) { read_colnames <- read_colnames || length(rt) > 0 || length(mz) > 0 - d <- .h5_read_matrix2(name, h5, index, read_colnames, read_rownames, - rownames) + d <- .h5_read_matrix2( + name, h5, index, read_colnames, read_rownames, rownames) if (length(rt) | length(mz)) - d[.is_chrom_peaks_within_mz_rt(d, rt = rt, mz = mz, - ppm = ppm, type = type), , drop = FALSE] - else d + d <- d[.is_chrom_peaks_within_mz_rt(d, rt = rt, mz = mz, + ppm = ppm, type = type), , + drop = FALSE] + ## If sample_index is provided add a column "sample" with the index. + if (length(sample_index)) + d <- cbind(d, sample = sample_index[name]) + d } #' Read a single `data.frame` from the HDF5 file. With @@ -737,15 +761,25 @@ NULL d <- lapply(d, as.vector) d <- as.data.frame(d) if (read_rownames) - rownames(d) <- rhdf5::h5read(h5, rownames, drop = TRUE) + attr(d, "row.names") <- rhdf5::h5read(h5, rownames, drop = TRUE) if (is.null(index[[1L]])) d else d[index[[1L]], , drop = FALSE] } +#' Function to read the chromPeakData `data.frame` for one sample. +#' +#' @param peaks optional `character` with the chrom peak IDs for which the +#' data should be extracted. +#' +#' @param ms_levels optional **named** `integer` with the MS levels of all +#' imported data sets. At least one of the `names(ms_levels)` should match +#' param `name`. If provided, a column `$ms_level` is added to the result. +#' +#' @noRd .h5_read_chrom_peak_data <- function(name, h5, index = list(NULL, NULL), read_rownames = FALSE, peaks = character(), - ...) { + ms_levels = integer(), ...) { cd <- .h5_read_data_frame( name, h5, read_rownames = read_rownames || length(peaks) > 0, index = index, rownames = sub("_data", "s_rownames", name)) @@ -755,6 +789,8 @@ NULL if (!read_rownames) rownames(cd) <- NULL } + if (length(ms_levels)) + cd$ms_level <- rep(unname(ms_levels[name]), nrow(cd)) cd } diff --git a/R/XcmsExperimentHdf5.R b/R/XcmsExperimentHdf5.R index 6a2b7767..fba822fc 100644 --- a/R/XcmsExperimentHdf5.R +++ b/R/XcmsExperimentHdf5.R @@ -42,14 +42,28 @@ #' chromatographic peak in the chrom peak matrix **of that sample** and #' MS level. #' +#' HDF5 does not support parallel processing, thus preprocessing results need +#' to be loaded sequentially. +#' +#' All functionality for `XcmsExperimentHdf5` objects is optimized to reduce +#' memory demand at the cost of eventually lower performance. +#' #' @section Functionality related to chromatographic peaks: #' -#' - `chromPeakData()` gains a new parameter `peaks` which allows to specify -#' from which chromatographic peaks data should be returned. For these -#' chromatographic peaks the ID (row name in `chromPeaks()`) should be -#' provided with the `peaks` parameter. This can reduce the memory +#' - `chromPeaks()` gains parameter `bySample = FALSE` that, if set to `TRUE` +#' returns a `list` of `chromPeaks` matrices, one for each sample. Due to +#' the way data is organized in `XcmsExperimentHdf5` objects this is more +#' efficient than `bySample = FALSE`. Thus, in cases where chrom peak data +#' is subsequently evaluated or processed by sample, it is suggested to +#' use `bySample = TRUE`. +#' +#' - `chromPeakData()` gains a new parameter `peaks = character()` which allows +#' to specify from which chromatographic peaks data should be returned. +#' For these chromatographic peaks the ID (row name in `chromPeaks()`) +#' should be provided with the `peaks` parameter. This can reduce the memory #' requirement for cases in which only data of some selected chromatographic -#' peaks needs to be extracted. +#' peaks needs to be extracted. Also, `chromPeakData()` supports the +#' `bySample` parameter described for `chromPeaks()` above. #' #' @section Retention time alignment: #' @@ -133,7 +147,7 @@ setMethod("show", "XcmsExperimentHdf5", function(object) { ## ################################################################################ -#' @rdname XcmsExperiment +#' @rdname hidden_aliases setMethod("[", "XcmsExperimentHdf5", function(x, i, j, ...) { if (!missing(j)) stop("subsetting by j not supported") @@ -241,7 +255,7 @@ setMethod( "chromPeaks", "XcmsExperimentHdf5", function(object, rt = numeric(), mz = numeric(), ppm = 0, msLevel = integer(), type = c("any", "within", "apex_within"), - isFilledColumn = FALSE, columns = character()) { + isFilledColumn = FALSE, columns = character(), bySample = FALSE) { if (isFilledColumn) stop("Parameter 'isFilledColumn = TRUE' is not supported") type <- match.arg(type) @@ -255,7 +269,7 @@ setMethod( if (length(msLevel)) msl <- msl[msl %in% msLevel] .h5_chrom_peaks(object, msLevel = msl, columns = columns, - by_sample = FALSE, mz = mz, rt = rt, ppm = ppm, + by_sample = bySample, mz = mz, rt = rt, ppm = ppm, type = type) }) @@ -270,8 +284,9 @@ setReplaceMethod( setMethod( "chromPeakData", "XcmsExperimentHdf5", function(object, msLevel = integer(), peaks = character(), - return.type = c("DataFrame", "data.frame")) { + return.type = c("DataFrame", "data.frame"), bySample = FALSE) { return.type <- match.arg(return.type) + .h5_require_rhdf5() if (!length(object)) return(as(object@chromPeakData, return.type)) .h5_check_mod_count(object@hdf5_file, object@hdf5_mod_count) @@ -283,7 +298,7 @@ setMethod( as(.h5_chrom_peak_data(object, msLevel, peaks = peaks, by_sample = FALSE), "DataFrame") else .h5_chrom_peak_data(object, msLevel, peaks = peaks, - by_sample = FALSE) + by_sample = bySample) }) ## #' @rdname refineChromPeaks diff --git a/tests/testthat/test_XcmsExperimentHdf5-functions.R b/tests/testthat/test_XcmsExperimentHdf5-functions.R index b6e1f19e..5bf6da74 100644 --- a/tests/testthat/test_XcmsExperimentHdf5-functions.R +++ b/tests/testthat/test_XcmsExperimentHdf5-functions.R @@ -361,6 +361,19 @@ test_that(".h5_read_chrom_peaks_matrix works", { read_colnames = TRUE, read_rownames = FALSE, mz = c(300, 350), type = "within") expect_true(all(res[, "mz"] > 300 & res[, "mz"] < 350)) + + ## with sample_index + sidx <- c(4L, 5L, 6L) + names(sidx) <- c("/S1/ms_1/chrom_peaks", "/S2/ms_1/chrom_peaks", + "/S3/ms_1/chrom_peaks") + res <- .h5_read_chrom_peaks_matrix( + "/S2/ms_1/chrom_peaks", xmse_h5@hdf5_file, read_colnames = TRUE, + read_rownames = FALSE, sample_index = sidx) + expect_true(is.matrix(res)) + expect_true(is.numeric(res)) + expect_true(nrow(res) > 0) + expect_true(any(colnames(res) %in% "sample")) + expect_true(all(res[, "sample"] == 5)) }) test_that(".h5_read_data_frame works", { @@ -664,6 +677,13 @@ test_that(".h5_chrom_peaks_colnames works", { expect_true(all(c("mz", "mzmin", "mzmax", "rt") %in% res)) }) +test_that(".h5_chrom_peaks_rownames works", { + res <- .h5_chrom_peaks_rownames(xmse_h5) + expect_true(is.list(res)) + expect_equal(length(res), 3L) + expect_true(is.character(res[[1L]])) +}) + test_that(".h5_filter works", { expect_equal(.h5_filter(), "NONE") }) diff --git a/tests/testthat/test_XcmsExperimentHdf5.R b/tests/testthat/test_XcmsExperimentHdf5.R index dc0144a3..fcd0ed4a 100644 --- a/tests/testthat/test_XcmsExperimentHdf5.R +++ b/tests/testthat/test_XcmsExperimentHdf5.R @@ -236,19 +236,19 @@ test_that("featureValues,XcmsExperimentHdf5 etc works", { nf <- nrow(b) rtmed <- b$rtmed ## .h5_feature_values_sample - a <- .h5_feature_values_sample( + a <- xcms:::.h5_feature_values_sample( xmseg_full_h5@hdf5_file, sample_id = "S1", ms_level = 1L, n_features = nf, method = "sum", filled = FALSE, col_idx = 9L) b <- unname(featureValues(ref, method = "sum", value = "maxo", filled = FALSE)[, 1L]) expect_equal(a, b) - a <- .h5_feature_values_sample( + a <- xcms:::.h5_feature_values_sample( xmseg_full_h5@hdf5_file, sample_id = "S4", ms_level = 1L, n_features = nf, filled = FALSE, method = "maxint", col_idx = c(7L, 9L)) b <- unname(featureValues(ref, method = "maxint", value = "into", filled = FALSE, intensity = "maxo")[, 4L]) expect_equal(a, b) - a <- .h5_feature_values_sample( + a <- xcms:::.h5_feature_values_sample( xmseg_full_h5@hdf5_file, sample_id = "S4", ms_level = 1L, n_features = nf, filled = FALSE, method = "medret", col_idx = c(8L, 4L), rtmed = rtmed) @@ -257,7 +257,7 @@ test_that("featureValues,XcmsExperimentHdf5 etc works", { expect_equal(a, b) ## .h5_feature_values_ms_level - a <- .h5_feature_values_ms_level(1L, xmseg_full_h5, method = "medret", + a <- xcms:::.h5_feature_values_ms_level(1L, xmseg_full_h5, method = "medret", value = "into", filled = FALSE) b <- featureValues(ref, method = "medret", value = "into", filled = FALSE) expect_equal(unname(a), unname(b))