SAFEHR-data · milanmlft · Nov 21, 2024 · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024
diff --git a/.dockerignore b/.dockerignore
@@ -1,2 +1,3 @@
 **/renv
 **/.Rprofile
+**/.Renviron
diff --git a/.env.sample b/.env.sample
@@ -1,13 +1,16 @@
 ENV=test
-DATA_VOLUME_PATH=./data/test_data
+DATA_VOLUME_PATH=./data/test_data/internal
+
+SHINY_PORT: 3838
 
 # For preprocessing
-PREPROCESS_DB_NAME=         # name of the source database
-PREPROCESS_HOST=            # host address for the source database
-PREPROCESS_PORT=            # port on which to connect to the source database
-PREPROCESS_DB_USERNAME=     # username for the source database
-PREPROCESS_DB_PASSWORD=     # password for the source database
-PREPROCESS_DB_CDM_SCHEMA=   # Schema name in the database to connect the OMOP CDM to 
+PREPROCESS_DB_NAME=                 # name of the source database
+PREPROCESS_HOST=                    # host address for the source database
+PREPROCESS_PORT=                    # port on which to connect to the source database
+PREPROCESS_DB_USERNAME=             # username for the source database
+PREPROCESS_DB_PASSWORD=             # password for the source database
+PREPROCESS_DB_CDM_SCHEMA=           # Schema name in the database to connect the OMOP CDM to
+PREPROCESS_SUMMARISE_LEVEL=monthly  # Level to summarise record counts at (monthly or quarterly)
 
 # Low-frequency replacement
 LOW_FREQUENCY_THRESHOLD=5

diff --git a/.gitignore b/.gitignore
@@ -49,4 +49,4 @@ po/*~
 rsconnect/
 
 !dev
-.env
+*.env
diff --git a/README.md b/README.md
@@ -100,4 +100,18 @@ docker compose up -d omopcat
 
 By default, the app will be hosted at `http://localhost:3838`.
 
-See the [deployment docs](./deploy/README.md) for more details.
+### Public version
+
+Copy the `public.env.sample` to `public.env` and fill out the necessary environment variables.
+
+Then run the pre-processing pipeline with
+
+```sh
+docker compose --env-file public.env run preprocess
+```
+
+and deploy the app with
+
+```sh
+docker compose --env-file public.env -p omopcat-public up -d omopcat
+```
diff --git a/app/R/fct_monthly_count_plot.R b/app/R/fct_monthly_count_plot.R
@@ -16,7 +16,10 @@ monthly_count_plot <- function(monthly_counts) {
   stopifnot(is.data.frame(monthly_counts))
   stopifnot(all(c("date_year", "date_month", "person_count") %in% colnames(monthly_counts)))
 
-  monthly_counts$date <- .convert_to_date(monthly_counts$date_year, monthly_counts$date_month)
+  monthly_counts$date <- lubridate::make_date(
+    year = monthly_counts$date_year,
+    month = monthly_counts$date_month
+  )
 
   ggplot(monthly_counts, aes(x = .data$date, y = .data$record_count)) +
     geom_bar(aes(fill = .data$concept_name), stat = "identity") +
@@ -26,7 +29,3 @@ monthly_count_plot <- function(monthly_counts) {
     scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
     theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))
 }
-
-.convert_to_date <- function(date_year, date_month) {
-  as.Date(paste0(date_year, "-", date_month, "-01"))
-}
diff --git a/app/R/utils_get_data.R b/app/R/utils_get_data.R
@@ -44,9 +44,19 @@ get_monthly_counts <- function() {
   } else {
     data <- .read_parquet_table("omopcat_monthly_counts")
   }
+
+  if ("date_quarter" %in% colnames(data)) {
+    ## Set `date_month` to the first month of the quarter
+    data$date_month <- .quarter_to_month(data$date_quarter)
+  }
+
   return(data)
 }
 
+.quarter_to_month <- function(quarter) {
+  return((quarter - 1) * 3 + 1)
+}
+
 get_summary_stats <- function() {
   if (should_use_dev_data()) {
     out <- readr::read_csv(

diff --git a/data/test_data/internal/omopcat_concepts.parquet b/data/test_data/internal/omopcat_concepts.parquet
diff --git a/data/test_data/internal/omopcat_monthly_counts.parquet b/data/test_data/internal/omopcat_monthly_counts.parquet
diff --git a/data/test_data/internal/omopcat_summary_stats.parquet b/data/test_data/internal/omopcat_summary_stats.parquet
diff --git a/data/test_data/omopcat_concepts.parquet b/data/test_data/omopcat_concepts.parquet
diff --git a/data/test_data/omopcat_monthly_counts.parquet b/data/test_data/omopcat_monthly_counts.parquet
diff --git a/data/test_data/omopcat_summary_stats.parquet b/data/test_data/omopcat_summary_stats.parquet
diff --git a/data/test_data/public/omopcat_concepts.parquet b/data/test_data/public/omopcat_concepts.parquet
diff --git a/data/test_data/public/omopcat_monthly_counts.parquet b/data/test_data/public/omopcat_monthly_counts.parquet
diff --git a/data/test_data/public/omopcat_summary_stats.parquet b/data/test_data/public/omopcat_summary_stats.parquet
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -21,6 +21,7 @@ services:
       EUNOMIA_DATA_FOLDER: /mnt/preprocessing/data-raw/test_db
       LOW_FREQUENCY_THRESHOLD: ${LOW_FREQUENCY_THRESHOLD}
       LOW_FREQUENCY_REPLACEMENT: ${LOW_FREQUENCY_REPLACEMENT}
+      SUMMARISE_LEVEL: ${PREPROCESS_SUMMARISE_LEVEL}
     command: ["R", "-e", "omopcat.preprocessing::preprocess()"]
     volumes:
       - ${DATA_VOLUME_PATH}:/mnt/preprocessing/data
@@ -49,4 +50,4 @@ services:
     volumes:
       - ${DATA_VOLUME_PATH}:/etc/omopcat/data
     ports:
-      - 3838:3838
+      - ${SHINY_PORT}:3838
diff --git a/preprocessing/R/monthly_counts.R b/preprocessing/R/monthly_counts.R
@@ -3,29 +3,57 @@
 #' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`]
 #' @param threshold Threshold value below which values will be replaced by `replacement`
 #' @param replacement Value with which values below `threshold` will be replaced
+#' @param level At which resolution the counts should be summarised.
+#'    Currently supports `"monthly"` or `"quarterly"`.
 #'
 #' @return A `data.frame` with the monthly counts
 #' @keywords internal
-generate_monthly_counts <- function(cdm, threshold, replacement) {
+generate_monthly_counts <- function(cdm, threshold, replacement,
+                                    level = c("monthly", "quarterly")) {
+  level <- match.arg(level)
+  .summarise <- function(...) summarise_counts(..., level = level)
+
   # Combine results for all tables
-  out <- dplyr::bind_rows(
-    cdm$condition_occurrence |> calculate_monthly_counts(
-      .data$condition_concept_id, .data$condition_start_date
+  arg_list <- list(
+    list(
+      omop_table = cdm[["measurement"]],
+      concept_col = "measurement_concept_id",
+      date_col = "measurement_date"
+    ),
+    list(
+      omop_table = cdm[["observation"]],
+      concept_col = "observation_concept_id",
+      date_col = "observation_date"
+    ),
+    list(
+      omop_table = cdm[["condition_occurrence"]],
+      concept_col = "condition_concept_id",
+      date_col = "condition_start_date"
+    ),
+    list(
+      omop_table = cdm[["drug_exposure"]],
+      concept_col = "drug_concept_id",
+      date_col = "drug_exposure_start_date"
+    ),
+    list(
+      omop_table = cdm[["procedure_occurrence"]],
+      concept_col = "procedure_concept_id",
+      date_col = "procedure_date"
+    ),
+    list(
+      omop_table = cdm[["device_exposure"]],
+      concept_col = "device_concept_id",
+      date_col = "device_exposure_start_date"
     ),
-    cdm$drug_exposure |>
-      calculate_monthly_counts(.data$drug_concept_id, .data$drug_exposure_start_date),
-    cdm$procedure_occurrence |>
-      calculate_monthly_counts(.data$procedure_concept_id, .data$procedure_date),
-    cdm$device_exposure |>
-      calculate_monthly_counts(.data$device_concept_id, .data$device_exposure_start_date),
-    cdm$measurement |>
-      calculate_monthly_counts(.data$measurement_concept_id, .data$measurement_date),
-    cdm$observation |>
-      calculate_monthly_counts(.data$observation_concept_id, .data$observation_date),
-    cdm$specimen |>
-      calculate_monthly_counts(.data$specimen_concept_id, .data$specimen_date)
+    list(
+      omop_table = cdm[["specimen"]],
+      concept_col = "specimen_concept_id",
+      date_col = "specimen_date"
+    )
   )
 
+  out <- purrr::map_dfr(arg_list, function(args) do.call(.summarise, args))
+
   # Map concept names to the concept IDs
   concept_names <- dplyr::select(cdm$concept, "concept_id", "concept_name") |>
     dplyr::filter(.data$concept_id %in% out$concept_id) |>
@@ -39,31 +67,44 @@ generate_monthly_counts <- function(cdm, threshold, replacement) {
     )
 }
 
-
-#' Calculate monthly statistics for an OMOP concept
+#' Summarise record counts
 #'
 #' @param omop_table A table from the OMOP CDM
-#' @param concept The name of the concept column to calculate statistics for
-#' @param date The name of the date column to calculate statistics for
+#' @param concept_col The name of the concept column to calculate statistics for
+#' @param date_col The name of the date column to calculate statistics for
+#' @param level The resolution at which to summarise the record counts.
+#'    Currently supports `"monthly"` or `"quarterly"`
 #'
 #' @return A `data.frame` with the following columns:
 #'   - `concept_id`: The concept ID
 #'   - `concept_name`: The concept name
 #'   - `date_year`: The year of the date
-#'   - `date_month`: The month of the date
+#'   - `date_month` or `date_quarter`: The month or quarter of the date, depending on `level`
 #'   - `person_count`: The number of unique patients per concept for each month
 #'   - `records_per_person`: The average number of records per person per concept for each month
 #' @keywords internal
-calculate_monthly_counts <- function(omop_table, concept, date) {
-  # Extract year and month from date column
+summarise_counts <- function(omop_table, concept_col, date_col, level) {
+  group_by_var <- switch(level,
+    monthly = "date_month",
+    quarterly = "date_quarter",
+    stop(sprintf("Summary level `%s` not supported", level))
+  )
+
+  # Extract year, month and quarter from date column
   omop_table <- dplyr::mutate(omop_table,
-    concept_id = {{ concept }},
-    date_year = as.integer(lubridate::year({{ date }})),
-    date_month = as.integer(lubridate::month({{ date }}))
+    concept_id = .data[[concept_col]],
+    date_year = as.integer(lubridate::year(.data[[date_col]])),
+    date_month = as.integer(lubridate::month(.data[[date_col]]))
   )
 
+  if (level == "quarterly") {
+    # NOTE: lubridate::quarter is not supported for all SQL back-ends
+    omop_table <- omop_table |>
+      dplyr::mutate(date_quarter = as.integer(lubridate::quarter(.data[[date_col]])))
+  }
+
   omop_table |>
-    dplyr::group_by(.data$date_year, .data$date_month, .data$concept_id) |>
+    dplyr::group_by(.data$date_year, .data[[group_by_var]], .data$concept_id) |>
     dplyr::summarise(
       record_count = dplyr::n(),
       person_count = dplyr::n_distinct(.data$person_id),
@@ -79,7 +120,7 @@ calculate_monthly_counts <- function(omop_table, concept, date) {
     dplyr::select(
       "concept_id",
       "date_year",
-      "date_month",
+      dplyr::all_of(group_by_var),
       "record_count",
       "person_count",
       "records_per_person"

diff --git a/preprocessing/R/preprocess.R b/preprocessing/R/preprocess.R
@@ -48,9 +48,15 @@ preprocess <- function(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) {
 
   threshold <- Sys.getenv("LOW_FREQUENCY_THRESHOLD")
   replacement <- Sys.getenv("LOW_FREQUENCY_REPLACEMENT")
+  summarise_level <- Sys.getenv("SUMMARISE_LEVEL")
+
+  cli::cli_alert_info("Summarising record counts at the '{summarise_level}' level")
 
   cli::cli_progress_message("Generating monthly_counts table")
-  monthly_counts <- generate_monthly_counts(cdm, threshold = threshold, replacement = replacement)
+  monthly_counts <- generate_monthly_counts(cdm,
+    threshold = threshold, replacement = replacement,
+    level = summarise_level
+  )
 
   cli::cli_progress_message("Generating summary_stats table")
   summary_stats <- generate_summary_stats(cdm, threshold = threshold, replacement = replacement)
@@ -79,7 +85,8 @@ preprocess <- function(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) {
     "DB_USERNAME",
     "DB_PASSWORD",
     "LOW_FREQUENCY_THRESHOLD",
-    "LOW_FREQUENCY_REPLACEMENT"
+    "LOW_FREQUENCY_REPLACEMENT",
+    "SUMMARISE_LEVEL"
   )
 
   missing <- Sys.getenv(required_envvars) == ""

diff --git a/preprocessing/man/generate_monthly_counts.Rd b/preprocessing/man/generate_monthly_counts.Rd
diff --git a/...rocessing/man/calculate_monthly_counts.Rd → preprocessing/man/summarise_counts.Rd b/...rocessing/man/calculate_monthly_counts.Rd → preprocessing/man/summarise_counts.Rd
-Original file line number
+Diff line change
@@ Expand Up / @@ -49,4 +49,4 @@ po/*~ @@
     rsconnect/
     !dev
-    .env
+    *.env