Skip to content

Commit

Permalink
Merge branch 'main' into 53-improve-example-datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
DanChaltiel authored Jan 17, 2025
2 parents e1bc25a + 89ec73c commit 50156fa
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 126 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/bump_dev_version.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ jobs:
uses: actions/checkout@v4
- name: Set up R
uses: r-lib/actions/setup-r@v2
with:
install-r: false
- name: Bump dev version
uses: DanChaltiel/actions/bump-dev-version@v1
with:
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: EDCimport
Version: 0.5.2.9010
Version: 0.5.2.9011
Title: Import Data from EDC Software
Authors@R:
c(person(given = "Dan",
Expand Down
1 change: 1 addition & 0 deletions EDCimport.Rproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Version: 1.0
ProjectId: 3f8ab476-d410-42c9-9187-11e15b11b498

RestoreWorkspace: No
SaveWorkspace: No
Expand Down
249 changes: 137 additions & 112 deletions R/swimmerplot.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@

#' Swimmer plot of all dates columns
#'
#' Join all tables from `.lookup$dataset` on `id`
#' Join all tables on `id` with only date columns to build a ggplot (or a
#' plotly if `plotly=TRUE`) showing all dates for each patients. This allows
#' outliers to be easily identified.
#'
#' @param .lookup the lookup table, default to `edc_lookup()`
#' @param id the patient identifier. Will be coerced as numeric.
#' @param id the patient identifier. Will be coerced as numeric if possible.
#' @param group a grouping variable, given as "dataset$column"
#' @param origin a variable to consider as time 0, given as "dataset$column"
#' @param id_lim a numeric vector of length 2 providing the minimum and maximum `id` to subset on.
#' @param exclude a character vector of variables to exclude, in the form `dataset$column`. Can be a regex, but `$` symbols don't count. Case-insensitive.
#' @param time_unit if `origin!=NULL`, the unit to measure time. One of `c("days", "weeks", "months", "years")`.
#' @param aes_color either `variable` ("\{dataset\} - \{column\}") or `label` (the column label)
#' @param plotly whether to use `{plotly}` to get an interactive plot
#' @param .lookup deprecated
#' @param ... not used
#'
#' @return either a `plotly` or a `ggplot`
Expand All @@ -25,10 +27,10 @@
#' #tm = read_trialmaster("filename.zip", pw="xx")
#' tm = edc_example()
#' load_list(tm)
#' p = edc_swimmerplot(.lookup, id_lim=c(5,45))
#' p2 = edc_swimmerplot(.lookup, origin="enrol$date_naissance", time_unit="weeks",
#' p = edc_swimmerplot(id_lim=c(5,45))
#' p2 = edc_swimmerplot(origin="db0$date_naissance", time_unit="weeks",
#' exclude=c("DB1$DATE2", "db3$.*"))
#' p3 = edc_swimmerplot(.lookup, group="enrol$arm", aes_color="label")
#' p3 = edc_swimmerplot(group="db0$group", aes_color="label")
#' \dontrun{
#' #save the plotly plot as HTML to share it
#' save_plotly(p, "edc_swimmerplot.html")
Expand All @@ -43,118 +45,58 @@
#' @importFrom stringr str_detect str_ends str_remove str_replace_all
#' @importFrom tidyr pivot_longer
#' @importFrom tidyselect matches
edc_swimmerplot = function(.lookup=edc_lookup(), ...,
id=get_subjid_cols(),
edc_swimmerplot = function(...,
group=NULL, origin=NULL,
id_lim=NULL,
exclude=NULL,
id=get_subjid_cols(),
time_unit=c("days", "weeks", "months", "years"),
aes_color=c("variable", "label"),
plotly=getOption("edc_plotly", FALSE)){
plotly=getOption("edc_plotly", FALSE),
.lookup="deprecated"){
check_dots_empty()
aes_color = match.arg(aes_color)
aes_label = if(aes_color=="variable") "label" else "variable"
time_unit = match.arg(time_unit[1], c(time_unit, str_remove(time_unit, "s$")))
if(!str_ends(time_unit, "s")) time_unit = paste0(time_unit, "s")
aes_color = match.arg(aes_color)
parent = parent.frame()
if(is.null(.lookup)){
cli_abort("{.arg .lookup} should not be {.val NULL}")
}

dbs = .lookup$dataset %>%
set_names() %>%
map(~get(.x, envir=parent))
if(length(dbs)==0){
cli_abort("Unexpected error, contact the developper")
}

dbs = dbs %>%
discard(~!any(id %in% names(.x)))
if(length(dbs)==0){
cli_abort(c("None of the datasets contains an identifier column", i="{.arg id}={.val {id}}"))
}
dbs = dbs %>%
map(~.x %>% select(matches(id), where(is.Date)) %>% rename(id=1)) %>%
discard(~ncol(.x)<2)
if(length(dbs)==0){
cli_abort(c("None of the datasets contains a date column"))
}

dbs = dbs %>%
imap(~{
xid = suppressWarnings(as.numeric(.x$id))
pb = is.na(xid)!=is.na(.x$id)
if(any(pb)) cli_warn(c("NAs introduced by coercion to numeric in {.val {.y}${id}}. Is {.arg id} set correctly?",
i="Problematic value{?s}: {.val { .x$id[pb]}}"))
.x$id = xid
.x
})

dat = dbs %>%
imap(~{
.x %>%
pivot_longer(-id) %>%
mutate(
label=unlist(get_label(.x)[name]) %||% name,
dataset=.y,
variable=paste0(toupper(dataset), " - ", toupper(name))
)

}) %>%
dat = get_datasets() %>%
.discard_if_no_id(id=id) %>%
.select_dates(id=id) %>%
.pivot_dates(id=id) %>%
list_rbind() %>%
mutate(date=value)

if(!is.null(exclude)){
excl = tolower(paste(exclude, collapse="|")) %>%
str_replace_all("\\$", "\\\\$")
dat = dat %>%
filter(!str_detect(tolower(paste0(dataset, "$", name)), excl))
}

if(!is.null(id_lim)){
if(!is.numeric(id_lim) && length(id_lim)!=2) cli_abort("{.arg id_lim} should be a numeric vector of length 2")
dat = dat %>% filter(between(id, id_lim[1], id_lim[2]))
}
.exclude_columns(exclude) %>%
.parse_id_to_numeric(id=id, id_lim=id_lim)%>%
arrange(variable)

if(!is.null(group)){
dat_group = parse_var(group, id, parent)
if(anyDuplicated(dat_group$id)!=0){
cli_abort("{.arg group} ({group}) should identify subjects ({id}) uniquely.",
class="edc_swimplot_group_dup")
}

dat = dat %>% left_join(dat_group, by="id")
}

if(can_be_numeric(dat$id)) dat$id=as.numeric(dat$id)
else if(all(str_detect(dat$id, "\\d+"))){
if(is_installed("gtools")) dat=slice(dat, gtools::mixedorder(id))
else cli_warn(c("{.arg id} contains numbers, you will need the
{.pkg gtools} package to sort it properly.",
i='Run {.run utils::install.package("gtools")}'))
}
tooltip = c("x", "y", "color", "label")
x_label = "Calendar date"
aes_x = "date"
if(!is.null(origin)){
dat_origin = parse_var(origin, id, parent)
values = c(days=1, weeks=7, months=365.24/12, years=365.24)
dat = dat %>%
left_join(dat_origin, by="id") %>%
mutate(
date = value,
value = as.double(value-origin, units="days") / values[time_unit]
time = as.double(date-origin, units="days") / values[time_unit]
)
x_label = glue("Date difference from `{origin}` (in {time_unit})")
tooltip = c(tooltip, "date")
}

aes_label = "variable"
if(aes_color=="variable"){
aes_label = "label"
aes_x = "time"
}

p = dat %>%
mutate(id=as_factor(id)) %>%
ggplot(aes(x=value, y=id, group=id, date=date)) +
ggplot(aes(x=!!sym(aes_x), y=id, group=id, date=date)) +
aes(color=!!sym(aes_color), label=!!sym(aes_label)) +
geom_line(na.rm=TRUE) +
geom_point(na.rm=TRUE) +
Expand All @@ -173,37 +115,90 @@ edc_swimmerplot = function(.lookup=edc_lookup(), ...,
}


# Helper ------------------------------------------------------------------

#' Save a plotly to an HTML file
#'
#' @param p a plot object (`plotly` or `ggplot`)
#' @param file a file path to save the HTML file
#' @param ... passed on to [htmlwidgets::saveWidget]
#'
#' @export
#' @return nothing, used for side effect
#'
#' @examples
#' \dontrun{
#' tm = edc_example_plot()
#' p = edc_swimmerplot(tm$.lookup, id_lim=c(5,45))
#' save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot")
#' }
#' @importFrom fs dir_create path_dir
#' @importFrom rlang check_installed
save_plotly = function(p, file, ...){
check_installed("plotly", reason="for `save_plotly()` to work.")
check_installed("htmlwidgets", reason="for `save_plotly()` to work.")
if(inherits(p, "ggplot")) p = plotly::ggplotly(p)
dir_create(path_dir(file), recurse=TRUE)
wd = setwd(path_dir(file))
on.exit(setwd(wd))
htmlwidgets::saveWidget(p, file=basename(file), ...)
# Utils -------------------------------------------------------------------


#' @noRd
#' @keywords internal
.discard_if_no_id = function(datasets, id){
has_id = datasets %>%
map(~tolower(names(.x))) %>%
map(~intersect(.x, tolower(id))) %>%
map_lgl(~length(.x)>0)
if(!any(has_id)){
cli_abort(c("None of the datasets contains an identifier column",
i="{.arg id}={.val {id}}"),
call=parent.frame())
}
datasets[has_id]
}


# Utils -------------------------------------------------------------------
#' @noRd
#' @keywords internal
.select_dates = function(datasets, id){
data_dates = datasets %>%
map(~{
.x %>%
select(id=any_of2(id), where(is.Date)) %>%
rename(id=1)
}) %>%
discard(~ncol(.x)<2)
if(length(data_dates)==0){
cli_abort(c("None of the datasets contains a date column"),
call=parent.frame())
}
data_dates
}

.pivot_dates = function(datasets, id){
datasets %>%
imap(~{
.x %>%
pivot_longer(-id, values_to="date") %>%
mutate(
label=unlist(get_label(.x)[name]) %||% name,
dataset=.y,
variable=paste0(dataset, "$", name)
)
})
}

.exclude_columns = function(data, exclude) {
if(!is.null(exclude)){
excl = exclude %>% paste(collapse="|") %>% tolower() %>%
str_replace_all("\\$", "\\\\$")
data = data %>%
filter(!str_detect(tolower(variable), excl))
}
data
}

.parse_id_to_numeric = function (data, id, id_lim) {
if(is.null(id_lim)) id_lim =c(-Inf, Inf)
if(!is.numeric(id_lim) && length(id_lim)!=2) {
cli_abort("{.arg id_lim} should be a numeric vector of length 2.")
}

if(can_be_numeric(data$id)) {
data = data %>%
mutate(id = as.numeric(id)) %>%
filter(id>=id_lim[1] & id<=id_lim[2])
} else if(all(str_detect(data$id, "\\d+"))){
if(is_installed("gtools")){
data = data %>%
slice(gtools::mixedorder(id)) %>%
mutate(id=as_factor(id))
} else {
cli_warn(c("{.arg id} contains numbers, you will need the
{.pkg gtools} package to sort it properly.",
i='Run {.run utils::install.package("gtools")}'),
call=parent.frame())
}
}
data
}

#' @importFrom cli cli_abort
#' @importFrom dplyr rename select
Expand All @@ -214,7 +209,6 @@ save_plotly = function(p, file, ...){
#' @keywords internal
parse_var = function(input, id, env){
input_name = caller_arg(input)

if(!str_detect(input, "^.*\\$.*$")){
cli_abort(c(x="{.arg {input_name}} is not in the form `dataset$column`.",
i="{.arg {input_name}} = {.val {input}}"),
Expand All @@ -231,7 +225,6 @@ parse_var = function(input, id, env){
}

dat_input = get(input2[1], envir=env)

if(!input2[2] %in% names(dat_input)){
cli_abort(c(x="{.arg {input_name}} is wrong: no column {.val {input2[2]}} in dataset {.val {input2[1]}} was found.",
i="{.arg {input_name}} = {.val {input}}"),
Expand All @@ -243,3 +236,35 @@ parse_var = function(input, id, env){
select(matches(id), !!input_name:=!!input2[2]) %>%
rename(id=1)
}




# Helper ------------------------------------------------------------------

#' Save a plotly to an HTML file
#'
#' @param p a plot object (`plotly` or `ggplot`)
#' @param file a file path to save the HTML file
#' @param ... passed on to [htmlwidgets::saveWidget]
#'
#' @export
#' @return nothing, used for side effect
#'
#' @examples
#' \dontrun{
#' tm = edc_example_plot()
#' p = edc_swimmerplot(tm$.lookup, id_lim=c(5,45))
#' save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot")
#' }
#' @importFrom fs dir_create path_dir
#' @importFrom rlang check_installed
save_plotly = function(p, file, ...){
check_installed("plotly", reason="for `save_plotly()` to work.")
check_installed("htmlwidgets", reason="for `save_plotly()` to work.")
if(inherits(p, "ggplot")) p = plotly::ggplotly(p)
dir_create(path_dir(file), recurse=TRUE)
wd = setwd(path_dir(file))
on.exit(setwd(wd))
htmlwidgets::saveWidget(p, file=basename(file), ...)
}
Loading

0 comments on commit 50156fa

Please sign in to comment.