diff --git a/.github/workflows/bump_dev_version.yaml b/.github/workflows/bump_dev_version.yaml index fa16e7e..5a0d832 100644 --- a/.github/workflows/bump_dev_version.yaml +++ b/.github/workflows/bump_dev_version.yaml @@ -15,8 +15,6 @@ jobs: uses: actions/checkout@v4 - name: Set up R uses: r-lib/actions/setup-r@v2 - with: - install-r: false - name: Bump dev version uses: DanChaltiel/actions/bump-dev-version@v1 with: diff --git a/DESCRIPTION b/DESCRIPTION index 29226da..e055557 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: EDCimport -Version: 0.5.2.9010 +Version: 0.5.2.9011 Title: Import Data from EDC Software Authors@R: c(person(given = "Dan", diff --git a/EDCimport.Rproj b/EDCimport.Rproj index 67e22ea..b154f60 100644 --- a/EDCimport.Rproj +++ b/EDCimport.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 3f8ab476-d410-42c9-9187-11e15b11b498 RestoreWorkspace: No SaveWorkspace: No diff --git a/R/swimmerplot.R b/R/swimmerplot.R index 701f993..ce65109 100644 --- a/R/swimmerplot.R +++ b/R/swimmerplot.R @@ -5,10 +5,11 @@ #' Swimmer plot of all dates columns #' -#' Join all tables from `.lookup$dataset` on `id` +#' Join all tables on `id` with only date columns to build a ggplot (or a +#' plotly if `plotly=TRUE`) showing all dates for each patients. This allows +#' outliers to be easily identified. #' -#' @param .lookup the lookup table, default to `edc_lookup()` -#' @param id the patient identifier. Will be coerced as numeric. +#' @param id the patient identifier. Will be coerced as numeric if possible. #' @param group a grouping variable, given as "dataset$column" #' @param origin a variable to consider as time 0, given as "dataset$column" #' @param id_lim a numeric vector of length 2 providing the minimum and maximum `id` to subset on. @@ -16,6 +17,7 @@ #' @param time_unit if `origin!=NULL`, the unit to measure time. One of `c("days", "weeks", "months", "years")`. #' @param aes_color either `variable` ("\{dataset\} - \{column\}") or `label` (the column label) #' @param plotly whether to use `{plotly}` to get an interactive plot +#' @param .lookup deprecated #' @param ... not used #' #' @return either a `plotly` or a `ggplot` @@ -25,10 +27,10 @@ #' #tm = read_trialmaster("filename.zip", pw="xx") #' tm = edc_example() #' load_list(tm) -#' p = edc_swimmerplot(.lookup, id_lim=c(5,45)) -#' p2 = edc_swimmerplot(.lookup, origin="enrol$date_naissance", time_unit="weeks", +#' p = edc_swimmerplot(id_lim=c(5,45)) +#' p2 = edc_swimmerplot(origin="db0$date_naissance", time_unit="weeks", #' exclude=c("DB1$DATE2", "db3$.*")) -#' p3 = edc_swimmerplot(.lookup, group="enrol$arm", aes_color="label") +#' p3 = edc_swimmerplot(group="db0$group", aes_color="label") #' \dontrun{ #' #save the plotly plot as HTML to share it #' save_plotly(p, "edc_swimmerplot.html") @@ -43,77 +45,30 @@ #' @importFrom stringr str_detect str_ends str_remove str_replace_all #' @importFrom tidyr pivot_longer #' @importFrom tidyselect matches -edc_swimmerplot = function(.lookup=edc_lookup(), ..., - id=get_subjid_cols(), +edc_swimmerplot = function(..., group=NULL, origin=NULL, id_lim=NULL, exclude=NULL, + id=get_subjid_cols(), time_unit=c("days", "weeks", "months", "years"), aes_color=c("variable", "label"), - plotly=getOption("edc_plotly", FALSE)){ + plotly=getOption("edc_plotly", FALSE), + .lookup="deprecated"){ check_dots_empty() + aes_color = match.arg(aes_color) + aes_label = if(aes_color=="variable") "label" else "variable" time_unit = match.arg(time_unit[1], c(time_unit, str_remove(time_unit, "s$"))) if(!str_ends(time_unit, "s")) time_unit = paste0(time_unit, "s") - aes_color = match.arg(aes_color) parent = parent.frame() - if(is.null(.lookup)){ - cli_abort("{.arg .lookup} should not be {.val NULL}") - } - - dbs = .lookup$dataset %>% - set_names() %>% - map(~get(.x, envir=parent)) - if(length(dbs)==0){ - cli_abort("Unexpected error, contact the developper") - } - - dbs = dbs %>% - discard(~!any(id %in% names(.x))) - if(length(dbs)==0){ - cli_abort(c("None of the datasets contains an identifier column", i="{.arg id}={.val {id}}")) - } - dbs = dbs %>% - map(~.x %>% select(matches(id), where(is.Date)) %>% rename(id=1)) %>% - discard(~ncol(.x)<2) - if(length(dbs)==0){ - cli_abort(c("None of the datasets contains a date column")) - } - - dbs = dbs %>% - imap(~{ - xid = suppressWarnings(as.numeric(.x$id)) - pb = is.na(xid)!=is.na(.x$id) - if(any(pb)) cli_warn(c("NAs introduced by coercion to numeric in {.val {.y}${id}}. Is {.arg id} set correctly?", - i="Problematic value{?s}: {.val { .x$id[pb]}}")) - .x$id = xid - .x - }) - dat = dbs %>% - imap(~{ - .x %>% - pivot_longer(-id) %>% - mutate( - label=unlist(get_label(.x)[name]) %||% name, - dataset=.y, - variable=paste0(toupper(dataset), " - ", toupper(name)) - ) - - }) %>% + dat = get_datasets() %>% + .discard_if_no_id(id=id) %>% + .select_dates(id=id) %>% + .pivot_dates(id=id) %>% list_rbind() %>% - mutate(date=value) - - if(!is.null(exclude)){ - excl = tolower(paste(exclude, collapse="|")) %>% - str_replace_all("\\$", "\\\\$") - dat = dat %>% - filter(!str_detect(tolower(paste0(dataset, "$", name)), excl)) - } - - if(!is.null(id_lim)){ - if(!is.numeric(id_lim) && length(id_lim)!=2) cli_abort("{.arg id_lim} should be a numeric vector of length 2") - dat = dat %>% filter(between(id, id_lim[1], id_lim[2])) - } + .exclude_columns(exclude) %>% + .parse_id_to_numeric(id=id, id_lim=id_lim)%>% + arrange(variable) if(!is.null(group)){ dat_group = parse_var(group, id, parent) @@ -121,40 +76,27 @@ edc_swimmerplot = function(.lookup=edc_lookup(), ..., cli_abort("{.arg group} ({group}) should identify subjects ({id}) uniquely.", class="edc_swimplot_group_dup") } - dat = dat %>% left_join(dat_group, by="id") } - if(can_be_numeric(dat$id)) dat$id=as.numeric(dat$id) - else if(all(str_detect(dat$id, "\\d+"))){ - if(is_installed("gtools")) dat=slice(dat, gtools::mixedorder(id)) - else cli_warn(c("{.arg id} contains numbers, you will need the - {.pkg gtools} package to sort it properly.", - i='Run {.run utils::install.package("gtools")}')) - } tooltip = c("x", "y", "color", "label") x_label = "Calendar date" + aes_x = "date" if(!is.null(origin)){ dat_origin = parse_var(origin, id, parent) values = c(days=1, weeks=7, months=365.24/12, years=365.24) dat = dat %>% left_join(dat_origin, by="id") %>% mutate( - date = value, - value = as.double(value-origin, units="days") / values[time_unit] + time = as.double(date-origin, units="days") / values[time_unit] ) x_label = glue("Date difference from `{origin}` (in {time_unit})") tooltip = c(tooltip, "date") - } - - aes_label = "variable" - if(aes_color=="variable"){ - aes_label = "label" + aes_x = "time" } p = dat %>% - mutate(id=as_factor(id)) %>% - ggplot(aes(x=value, y=id, group=id, date=date)) + + ggplot(aes(x=!!sym(aes_x), y=id, group=id, date=date)) + aes(color=!!sym(aes_color), label=!!sym(aes_label)) + geom_line(na.rm=TRUE) + geom_point(na.rm=TRUE) + @@ -173,37 +115,90 @@ edc_swimmerplot = function(.lookup=edc_lookup(), ..., } -# Helper ------------------------------------------------------------------ -#' Save a plotly to an HTML file -#' -#' @param p a plot object (`plotly` or `ggplot`) -#' @param file a file path to save the HTML file -#' @param ... passed on to [htmlwidgets::saveWidget] -#' -#' @export -#' @return nothing, used for side effect -#' -#' @examples -#' \dontrun{ -#' tm = edc_example_plot() -#' p = edc_swimmerplot(tm$.lookup, id_lim=c(5,45)) -#' save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot") -#' } -#' @importFrom fs dir_create path_dir -#' @importFrom rlang check_installed -save_plotly = function(p, file, ...){ - check_installed("plotly", reason="for `save_plotly()` to work.") - check_installed("htmlwidgets", reason="for `save_plotly()` to work.") - if(inherits(p, "ggplot")) p = plotly::ggplotly(p) - dir_create(path_dir(file), recurse=TRUE) - wd = setwd(path_dir(file)) - on.exit(setwd(wd)) - htmlwidgets::saveWidget(p, file=basename(file), ...) +# Utils ------------------------------------------------------------------- + + +#' @noRd +#' @keywords internal +.discard_if_no_id = function(datasets, id){ + has_id = datasets %>% + map(~tolower(names(.x))) %>% + map(~intersect(.x, tolower(id))) %>% + map_lgl(~length(.x)>0) + if(!any(has_id)){ + cli_abort(c("None of the datasets contains an identifier column", + i="{.arg id}={.val {id}}"), + call=parent.frame()) + } + datasets[has_id] } -# Utils ------------------------------------------------------------------- +#' @noRd +#' @keywords internal +.select_dates = function(datasets, id){ + data_dates = datasets %>% + map(~{ + .x %>% + select(id=any_of2(id), where(is.Date)) %>% + rename(id=1) + }) %>% + discard(~ncol(.x)<2) + if(length(data_dates)==0){ + cli_abort(c("None of the datasets contains a date column"), + call=parent.frame()) + } + data_dates +} + +.pivot_dates = function(datasets, id){ + datasets %>% + imap(~{ + .x %>% + pivot_longer(-id, values_to="date") %>% + mutate( + label=unlist(get_label(.x)[name]) %||% name, + dataset=.y, + variable=paste0(dataset, "$", name) + ) + }) +} + +.exclude_columns = function(data, exclude) { + if(!is.null(exclude)){ + excl = exclude %>% paste(collapse="|") %>% tolower() %>% + str_replace_all("\\$", "\\\\$") + data = data %>% + filter(!str_detect(tolower(variable), excl)) + } + data +} + +.parse_id_to_numeric = function (data, id, id_lim) { + if(is.null(id_lim)) id_lim =c(-Inf, Inf) + if(!is.numeric(id_lim) && length(id_lim)!=2) { + cli_abort("{.arg id_lim} should be a numeric vector of length 2.") + } + + if(can_be_numeric(data$id)) { + data = data %>% + mutate(id = as.numeric(id)) %>% + filter(id>=id_lim[1] & id<=id_lim[2]) + } else if(all(str_detect(data$id, "\\d+"))){ + if(is_installed("gtools")){ + data = data %>% + slice(gtools::mixedorder(id)) %>% + mutate(id=as_factor(id)) + } else { + cli_warn(c("{.arg id} contains numbers, you will need the + {.pkg gtools} package to sort it properly.", + i='Run {.run utils::install.package("gtools")}'), + call=parent.frame()) + } + } + data +} #' @importFrom cli cli_abort #' @importFrom dplyr rename select @@ -214,7 +209,6 @@ save_plotly = function(p, file, ...){ #' @keywords internal parse_var = function(input, id, env){ input_name = caller_arg(input) - if(!str_detect(input, "^.*\\$.*$")){ cli_abort(c(x="{.arg {input_name}} is not in the form `dataset$column`.", i="{.arg {input_name}} = {.val {input}}"), @@ -231,7 +225,6 @@ parse_var = function(input, id, env){ } dat_input = get(input2[1], envir=env) - if(!input2[2] %in% names(dat_input)){ cli_abort(c(x="{.arg {input_name}} is wrong: no column {.val {input2[2]}} in dataset {.val {input2[1]}} was found.", i="{.arg {input_name}} = {.val {input}}"), @@ -243,3 +236,35 @@ parse_var = function(input, id, env){ select(matches(id), !!input_name:=!!input2[2]) %>% rename(id=1) } + + + + +# Helper ------------------------------------------------------------------ + +#' Save a plotly to an HTML file +#' +#' @param p a plot object (`plotly` or `ggplot`) +#' @param file a file path to save the HTML file +#' @param ... passed on to [htmlwidgets::saveWidget] +#' +#' @export +#' @return nothing, used for side effect +#' +#' @examples +#' \dontrun{ +#' tm = edc_example_plot() +#' p = edc_swimmerplot(tm$.lookup, id_lim=c(5,45)) +#' save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot") +#' } +#' @importFrom fs dir_create path_dir +#' @importFrom rlang check_installed +save_plotly = function(p, file, ...){ + check_installed("plotly", reason="for `save_plotly()` to work.") + check_installed("htmlwidgets", reason="for `save_plotly()` to work.") + if(inherits(p, "ggplot")) p = plotly::ggplotly(p) + dir_create(path_dir(file), recurse=TRUE) + wd = setwd(path_dir(file)) + on.exit(setwd(wd)) + htmlwidgets::saveWidget(p, file=basename(file), ...) +} \ No newline at end of file diff --git a/man/edc_swimmerplot.Rd b/man/edc_swimmerplot.Rd index 37335a2..4bf6e64 100644 --- a/man/edc_swimmerplot.Rd +++ b/man/edc_swimmerplot.Rd @@ -5,25 +5,21 @@ \title{Swimmer plot of all dates columns} \usage{ edc_swimmerplot( - .lookup = edc_lookup(), ..., - id = get_subjid_cols(), group = NULL, origin = NULL, id_lim = NULL, exclude = NULL, + id = get_subjid_cols(), time_unit = c("days", "weeks", "months", "years"), aes_color = c("variable", "label"), - plotly = getOption("edc_plotly", FALSE) + plotly = getOption("edc_plotly", FALSE), + .lookup = "deprecated" ) } \arguments{ -\item{.lookup}{the lookup table, default to \code{edc_lookup()}} - \item{...}{not used} -\item{id}{the patient identifier. Will be coerced as numeric.} - \item{group}{a grouping variable, given as "dataset$column"} \item{origin}{a variable to consider as time 0, given as "dataset$column"} @@ -32,26 +28,32 @@ edc_swimmerplot( \item{exclude}{a character vector of variables to exclude, in the form \code{dataset$column}. Can be a regex, but \code{$} symbols don't count. Case-insensitive.} +\item{id}{the patient identifier. Will be coerced as numeric if possible.} + \item{time_unit}{if \code{origin!=NULL}, the unit to measure time. One of \code{c("days", "weeks", "months", "years")}.} \item{aes_color}{either \code{variable} ("\{dataset\} - \{column\}") or \code{label} (the column label)} \item{plotly}{whether to use \code{{plotly}} to get an interactive plot} + +\item{.lookup}{deprecated} } \value{ either a \code{plotly} or a \code{ggplot} } \description{ -Join all tables from \code{.lookup$dataset} on \code{id} +Join all tables on \code{id} with only date columns to build a ggplot (or a +plotly if \code{plotly=TRUE}) showing all dates for each patients. This allows +outliers to be easily identified. } \examples{ #tm = read_trialmaster("filename.zip", pw="xx") tm = edc_example() load_list(tm) -p = edc_swimmerplot(.lookup, id_lim=c(5,45)) -p2 = edc_swimmerplot(.lookup, origin="enrol$date_naissance", time_unit="weeks", +p = edc_swimmerplot(id_lim=c(5,45)) +p2 = edc_swimmerplot(origin="db0$date_naissance", time_unit="weeks", exclude=c("DB1$DATE2", "db3$.*")) -p3 = edc_swimmerplot(.lookup, group="enrol$arm", aes_color="label") +p3 = edc_swimmerplot(group="db0$group", aes_color="label") \dontrun{ #save the plotly plot as HTML to share it save_plotly(p, "edc_swimmerplot.html")