Merge branch 'main' into 53-improve-example-datasets

DanChaltiel · Jan 17, 2025 · 50156fa · 50156fa
2 parents e1bc25a + 89ec73c
commit 50156fa
Show file tree

Hide file tree

Showing 5 changed files with 152 additions and 126 deletions.
diff --git a/.github/workflows/bump_dev_version.yaml b/.github/workflows/bump_dev_version.yaml
@@ -15,8 +15,6 @@ jobs:
       uses: actions/checkout@v4
     - name: Set up R
       uses: r-lib/actions/setup-r@v2
-      with:
-        install-r: false
     - name: Bump dev version
       uses: DanChaltiel/actions/bump-dev-version@v1
       with:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: EDCimport
-Version: 0.5.2.9010
+Version: 0.5.2.9011
 Title: Import Data from EDC Software
 Authors@R: 
     c(person(given = "Dan",

diff --git a/EDCimport.Rproj b/EDCimport.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 3f8ab476-d410-42c9-9187-11e15b11b498
 
 RestoreWorkspace: No
 SaveWorkspace: No

diff --git a/R/swimmerplot.R b/R/swimmerplot.R
@@ -5,17 +5,19 @@
 
 #' Swimmer plot of all dates columns
 #' 
-#' Join all tables from `.lookup$dataset` on `id` 
+#' Join all tables on `id` with only date columns to build a ggplot (or a 
+#' plotly if `plotly=TRUE`) showing all dates for each patients. This allows 
+#' outliers to be easily identified.
 #'
-#' @param .lookup the lookup table, default to `edc_lookup()`
-#' @param id the patient identifier. Will be coerced as numeric.
+#' @param id the patient identifier. Will be coerced as numeric if possible.
 #' @param group a grouping variable, given as "dataset$column"
 #' @param origin a variable to consider as time 0, given as "dataset$column"
 #' @param id_lim a numeric vector of length 2 providing the minimum and maximum `id` to subset on. 
 #' @param exclude a character vector of variables to exclude, in the form `dataset$column`. Can be a regex, but `$` symbols don't count. Case-insensitive.
 #' @param time_unit if `origin!=NULL`, the unit to measure time. One of `c("days", "weeks", "months", "years")`.
 #' @param aes_color either `variable` ("\{dataset\} - \{column\}") or `label` (the column label)
 #' @param plotly whether to use `{plotly}` to get an interactive plot
+#' @param .lookup deprecated
 #' @param ... not used
 #'
 #' @return either a `plotly` or a `ggplot`
@@ -25,10 +27,10 @@
 #' #tm = read_trialmaster("filename.zip", pw="xx")
 #' tm = edc_example()
 #' load_list(tm)
-#' p = edc_swimmerplot(.lookup, id_lim=c(5,45))
-#' p2 = edc_swimmerplot(.lookup, origin="enrol$date_naissance", time_unit="weeks", 
+#' p = edc_swimmerplot(id_lim=c(5,45))
+#' p2 = edc_swimmerplot(origin="db0$date_naissance", time_unit="weeks", 
 #'                      exclude=c("DB1$DATE2", "db3$.*"))
-#' p3 = edc_swimmerplot(.lookup, group="enrol$arm", aes_color="label")
+#' p3 = edc_swimmerplot(group="db0$group", aes_color="label")
 #' \dontrun{
 #' #save the plotly plot as HTML to share it
 #' save_plotly(p, "edc_swimmerplot.html")
@@ -43,118 +45,58 @@
 #' @importFrom stringr str_detect str_ends str_remove str_replace_all
 #' @importFrom tidyr pivot_longer
 #' @importFrom tidyselect matches
-edc_swimmerplot = function(.lookup=edc_lookup(), ..., 
-                           id=get_subjid_cols(), 
+edc_swimmerplot = function(..., 
                            group=NULL, origin=NULL, 
                            id_lim=NULL,
                            exclude=NULL,
+                           id=get_subjid_cols(), 
                            time_unit=c("days", "weeks", "months", "years"),
                            aes_color=c("variable", "label"), 
-                           plotly=getOption("edc_plotly", FALSE)){
+                           plotly=getOption("edc_plotly", FALSE),
+                           .lookup="deprecated"){
   check_dots_empty()
+  aes_color = match.arg(aes_color)
+  aes_label = if(aes_color=="variable") "label" else "variable"
   time_unit = match.arg(time_unit[1], c(time_unit, str_remove(time_unit, "s$")))
   if(!str_ends(time_unit, "s")) time_unit = paste0(time_unit, "s")
-  aes_color = match.arg(aes_color)
   parent = parent.frame()
-  if(is.null(.lookup)){
-    cli_abort("{.arg .lookup} should not be {.val NULL}")
-  }
-
-  dbs = .lookup$dataset %>%
-    set_names() %>% 
-    map(~get(.x, envir=parent))
-  if(length(dbs)==0){
-    cli_abort("Unexpected error, contact the developper")
-  }
-
-  dbs = dbs %>% 
-    discard(~!any(id %in% names(.x)))
-  if(length(dbs)==0){
-    cli_abort(c("None of the datasets contains an identifier column", i="{.arg id}={.val {id}}"))
-  }
-  dbs = dbs %>% 
-    map(~.x %>% select(matches(id), where(is.Date)) %>% rename(id=1)) %>% 
-    discard(~ncol(.x)<2)
-  if(length(dbs)==0){
-    cli_abort(c("None of the datasets contains a date column"))
-  }
-
-  dbs = dbs %>% 
-    imap(~{
-      xid = suppressWarnings(as.numeric(.x$id))
-      pb = is.na(xid)!=is.na(.x$id)
-      if(any(pb)) cli_warn(c("NAs introduced by coercion to numeric in {.val {.y}${id}}. Is {.arg id} set correctly?", 
-                             i="Problematic value{?s}: {.val { .x$id[pb]}}"))
-      .x$id = xid
-      .x
-    })
 
-  dat = dbs %>% 
-    imap(~{
-      .x %>% 
-        pivot_longer(-id) %>% 
-        mutate(
-          label=unlist(get_label(.x)[name]) %||% name,
-          dataset=.y,
-          variable=paste0(toupper(dataset), " - ", toupper(name))
-        )
-
-    }) %>% 
+  dat = get_datasets() %>% 
+    .discard_if_no_id(id=id) %>% 
+    .select_dates(id=id) %>% 
+    .pivot_dates(id=id) %>% 
     list_rbind() %>% 
-    mutate(date=value)
-
-  if(!is.null(exclude)){
-    excl = tolower(paste(exclude, collapse="|")) %>% 
-      str_replace_all("\\$", "\\\\$")
-    dat = dat %>% 
-      filter(!str_detect(tolower(paste0(dataset, "$", name)), excl))
-  }
-
-  if(!is.null(id_lim)){
-    if(!is.numeric(id_lim) && length(id_lim)!=2) cli_abort("{.arg id_lim} should be a numeric vector of length 2")
-    dat = dat %>% filter(between(id, id_lim[1], id_lim[2]))
-  }
+    .exclude_columns(exclude) %>% 
+    .parse_id_to_numeric(id=id, id_lim=id_lim)%>% 
+    arrange(variable)
 
   if(!is.null(group)){
     dat_group = parse_var(group, id, parent)
     if(anyDuplicated(dat_group$id)!=0){
       cli_abort("{.arg group} ({group}) should identify subjects ({id}) uniquely.", 
                 class="edc_swimplot_group_dup")
     }
-
     dat = dat %>% left_join(dat_group, by="id")
   }
 
-  if(can_be_numeric(dat$id)) dat$id=as.numeric(dat$id)
-  else if(all(str_detect(dat$id, "\\d+"))){
-    if(is_installed("gtools")) dat=slice(dat, gtools::mixedorder(id))
-    else cli_warn(c("{.arg id} contains numbers, you will need the 
-                    {.pkg gtools} package to sort it properly.", 
-                    i='Run {.run utils::install.package("gtools")}'))
-  }
   tooltip = c("x", "y", "color", "label")
   x_label = "Calendar date"
+  aes_x = "date"
   if(!is.null(origin)){
     dat_origin = parse_var(origin, id, parent)
     values = c(days=1, weeks=7, months=365.24/12, years=365.24)
     dat = dat %>%
       left_join(dat_origin, by="id") %>% 
       mutate(
-        date = value,
-        value = as.double(value-origin, units="days") / values[time_unit]
+        time = as.double(date-origin, units="days") / values[time_unit]
       )
     x_label = glue("Date difference from `{origin}` (in {time_unit})")
     tooltip = c(tooltip, "date")
-  }
-
-  aes_label = "variable"
-  if(aes_color=="variable"){ 
-    aes_label = "label"
+    aes_x = "time"
   }
 
   p = dat %>% 
-    mutate(id=as_factor(id)) %>% 
-    ggplot(aes(x=value, y=id, group=id, date=date)) + 
+    ggplot(aes(x=!!sym(aes_x), y=id, group=id, date=date)) + 
     aes(color=!!sym(aes_color), label=!!sym(aes_label)) +
     geom_line(na.rm=TRUE) +
     geom_point(na.rm=TRUE) +
@@ -173,37 +115,90 @@ edc_swimmerplot = function(.lookup=edc_lookup(), ...,
 }
 
 
-# Helper ------------------------------------------------------------------
 
-#' Save a plotly to an HTML file
-#'
-#' @param p a plot object (`plotly` or `ggplot`)
-#' @param file a file path to save the HTML file
-#' @param ... passed on to [htmlwidgets::saveWidget]
-#'
-#' @export
-#' @return nothing, used for side effect
-#'
-#' @examples
-#' \dontrun{
-#' tm = edc_example_plot()
-#' p = edc_swimmerplot(tm$.lookup, id_lim=c(5,45))
-#' save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot")
-#' }
-#' @importFrom fs dir_create path_dir
-#' @importFrom rlang check_installed
-save_plotly = function(p, file, ...){
-  check_installed("plotly", reason="for `save_plotly()` to work.")
-  check_installed("htmlwidgets", reason="for `save_plotly()` to work.")
-  if(inherits(p, "ggplot")) p = plotly::ggplotly(p)
-  dir_create(path_dir(file), recurse=TRUE)
-  wd = setwd(path_dir(file))
-  on.exit(setwd(wd))
-  htmlwidgets::saveWidget(p, file=basename(file), ...)
+# Utils -------------------------------------------------------------------
+
+
+#' @noRd
+#' @keywords internal
+.discard_if_no_id = function(datasets, id){
+  has_id = datasets %>% 
+    map(~tolower(names(.x))) %>% 
+    map(~intersect(.x, tolower(id))) %>% 
+    map_lgl(~length(.x)>0)
+  if(!any(has_id)){
+    cli_abort(c("None of the datasets contains an identifier column", 
+                i="{.arg id}={.val {id}}"), 
+              call=parent.frame())
+  }
+  datasets[has_id]
 }
 
 
-# Utils -------------------------------------------------------------------
+#' @noRd
+#' @keywords internal
+.select_dates = function(datasets, id){
+  data_dates = datasets %>% 
+    map(~{
+      .x %>% 
+        select(id=any_of2(id), where(is.Date)) %>% 
+        rename(id=1)
+    }) %>% 
+    discard(~ncol(.x)<2)
+  if(length(data_dates)==0){
+    cli_abort(c("None of the datasets contains a date column"), 
+              call=parent.frame())
+  }
+  data_dates
+}
+
+.pivot_dates = function(datasets, id){
+  datasets %>% 
+    imap(~{
+      .x %>% 
+        pivot_longer(-id, values_to="date") %>% 
+        mutate(
+          label=unlist(get_label(.x)[name]) %||% name,
+          dataset=.y,
+          variable=paste0(dataset, "$", name)
+        )
+    })
+}
+
+.exclude_columns = function(data, exclude) {
+  if(!is.null(exclude)){
+    excl = exclude %>% paste(collapse="|") %>% tolower() %>% 
+      str_replace_all("\\$", "\\\\$")
+    data = data %>% 
+      filter(!str_detect(tolower(variable), excl))
+  }
+  data
+}
+
+.parse_id_to_numeric = function (data, id, id_lim) {
+  if(is.null(id_lim)) id_lim =c(-Inf, Inf)
+  if(!is.numeric(id_lim) && length(id_lim)!=2) {
+    cli_abort("{.arg id_lim} should be a numeric vector of length 2.")
+  }
+
+  if(can_be_numeric(data$id)) {
+    data = data %>% 
+      mutate(id = as.numeric(id)) %>% 
+      filter(id>=id_lim[1] & id<=id_lim[2])
+  } else if(all(str_detect(data$id, "\\d+"))){
+    if(is_installed("gtools")){
+      data = data %>% 
+        slice(gtools::mixedorder(id)) %>% 
+        mutate(id=as_factor(id))
+    } else {
+      cli_warn(c("{.arg id} contains numbers, you will need the 
+                    {.pkg gtools} package to sort it properly.", 
+                 i='Run {.run utils::install.package("gtools")}'),
+               call=parent.frame())
+    }
+  }
+  data
+}
 
 #' @importFrom cli cli_abort
 #' @importFrom dplyr rename select
@@ -214,7 +209,6 @@ save_plotly = function(p, file, ...){
 #' @keywords internal
 parse_var = function(input, id, env){
   input_name = caller_arg(input)
-
   if(!str_detect(input, "^.*\\$.*$")){
     cli_abort(c(x="{.arg {input_name}} is not in the form `dataset$column`.", 
                 i="{.arg {input_name}} = {.val {input}}"), 
@@ -231,7 +225,6 @@ parse_var = function(input, id, env){
   }
 
   dat_input = get(input2[1], envir=env)
-
   if(!input2[2] %in% names(dat_input)){
     cli_abort(c(x="{.arg {input_name}} is wrong: no column {.val {input2[2]}} in dataset {.val {input2[1]}} was found.", 
                 i="{.arg {input_name}} = {.val {input}}"), 
@@ -243,3 +236,35 @@ parse_var = function(input, id, env){
     select(matches(id), !!input_name:=!!input2[2]) %>%
     rename(id=1)
 }
+
+
+
+
+# Helper ------------------------------------------------------------------
+
+#' Save a plotly to an HTML file
+#'
+#' @param p a plot object (`plotly` or `ggplot`)
+#' @param file a file path to save the HTML file
+#' @param ... passed on to [htmlwidgets::saveWidget]
+#'
+#' @export
+#' @return nothing, used for side effect
+#'
+#' @examples
+#' \dontrun{
+#' tm = edc_example_plot()
+#' p = edc_swimmerplot(tm$.lookup, id_lim=c(5,45))
+#' save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot")
+#' }
+#' @importFrom fs dir_create path_dir
+#' @importFrom rlang check_installed
+save_plotly = function(p, file, ...){
+  check_installed("plotly", reason="for `save_plotly()` to work.")
+  check_installed("htmlwidgets", reason="for `save_plotly()` to work.")
+  if(inherits(p, "ggplot")) p = plotly::ggplotly(p)
+  dir_create(path_dir(file), recurse=TRUE)
+  wd = setwd(path_dir(file))
+  on.exit(setwd(wd))
+  htmlwidgets::saveWidget(p, file=basename(file), ...)
+}