Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

baptistebr/low-frequency-stats #51

Merged
merged 19 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ Imports:
lubridate,
dplyr,
cli,
nanoparquet
nanoparquet,
rlang
Suggests:
devtools,
usethis,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ importFrom(golem,add_resource_path)
importFrom(golem,bundle_resources)
importFrom(golem,favicon)
importFrom(golem,with_golem_options)
importFrom(rlang,.data)
importFrom(shiny,NS)
importFrom(shiny,shinyApp)
importFrom(shiny,tagList)
Expand Down
11 changes: 10 additions & 1 deletion R/app_ui.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ app_ui <- function(request) {
layout_columns(
card(mod_plots_ui("monthly_counts")),
card(mod_plots_ui("summary_stats"))
)
),
.low_frequency_disclaimer()
),
nav_panel(
title = "Export",
Expand Down Expand Up @@ -65,3 +66,11 @@ golem_add_external_resources <- function() {
}
title
}

.low_frequency_disclaimer <- function() {
tags$div(
class = "alert alert-warning",
"Note: to avoid identifiability of the data, we convert all `records_per_person` and `person_count`",
glue::glue("values below {Sys.getenv('LOW_FREQUENCY_THRESHOLD')} to {Sys.getenv('LOW_FREQUENCY_REPLACEMENT')}.")
)
}
32 changes: 27 additions & 5 deletions R/utils_get_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ get_concepts_table <- function() {
}

get_monthly_counts <- function() {
# If the app is run in development mode
if (golem::app_dev()) {
return(
readr::read_csv(app_sys("dev_data", "calypso_monthly_counts.csv"), show_col_types = FALSE)
)
data <- readr::read_csv(app_sys("dev_data", "calypso_monthly_counts.csv"), show_col_types = FALSE)
} else {
data <- .read_parquet_table("calypso_monthly_counts")
}
.read_parquet_table("calypso_monthly_counts")
.manage_low_frequency(data)
}

get_summary_stats <- function() {
Expand All @@ -30,7 +31,6 @@ get_summary_stats <- function() {
.read_parquet_table("calypso_summary_stats")
}


.read_parquet_table <- function(table_name) {
data_dir <- Sys.getenv("CALYPSO_DATA_PATH")
if (data_dir == "") {
Expand All @@ -42,3 +42,25 @@ get_summary_stats <- function() {

nanoparquet::read_parquet(glue::glue("{data_dir}/{table_name}.parquet"))
}

# Manage low frequency statistics
# by removing values equal to 0 and
# by replacing values below the threshold with the replacement value
# (both defined in environment variables)
#' @importFrom rlang .data
.manage_low_frequency <- function(df) {
threshold <- as.double(Sys.getenv("LOW_FREQUENCY_THRESHOLD"))
replacement <- as.double(Sys.getenv("LOW_FREQUENCY_REPLACEMENT"))
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved

stopifnot("LOW_FREQUENCY_THRESHOLD is not a valid number" = !is.na(threshold))
stopifnot("LOW_FREQUENCY_REPLACEMENT is not a valid number" = !is.na(replacement))
# Remove records with values equal to 0
df <- dplyr::filter(df, .data$records_per_person > 0)
df <- dplyr::filter(df, .data$person_count > 0)
# Replace values below the threshold with the replacement value
dplyr::mutate(
df,
records_per_person = ifelse(.data$records_per_person < threshold, replacement, .data$records_per_person),
person_count = ifelse(.data$person_count < threshold, replacement, .data$person_count)
)
}
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@ The `dev/02_dev.R` script contains a few helper functions to get you started.

The test data can be found in [`inst/dev_data`](https://github.com/SAFEHR-data/omop-data-catalogue/tree/main/inst/data). These data have been generated by using the synthetic dataset '[synthea-allergies-10k](https://darwin-eu.github.io/CDMConnector/reference/eunomiaDir.html)', and adding some [dummy data](https://github.com/SAFEHR-data/omop-data-catalogue/tree/main/dev/test_db/dummy) for the MEASUREMENT and OBSERVATION tables (to have some records in the 'calypso-summary-stats' table).

5. Configure the "low-frequency statistics masking" feature, by defining a threshold and a replacement value (values below the threshold will be replaced, in the summary statistics), as environment variables:

- `LOW_FREQUENCY_THRESHOLD`
- `LOW_FREQUENCY_REPLACEMENT`
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved

These variables should be defined in the [docker-compose file](https://github.com/SAFEHR-data/omop-data-catalogue/tree/main/deploy/docker-compose.yml) when running the app in production.

When running the app in development mode, these variables need to be defined in a `.Renviron` file.

### File structure

Expand Down
2 changes: 2 additions & 0 deletions deploy/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ services:
environment:
- GOLEM_CONFIG_ACTIVE=production
- CALYPSO_DATA_PATH=/etc/calypso/data
- LOW_FREQUENCY_THRESHOLD=5
- LOW_FREQUENCY_REPLACEMENT=2.5
volumes:
- ../data/prod_data:/etc/calypso/data
ports:
Expand Down
44 changes: 22 additions & 22 deletions tests/testthat/_snaps/utils_get_data/monthly_counts.csv
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
concept_id,concept_name,date_year,date_month,person_count,records_per_person
3001079,Blood group antibody screen [Presence] in Serum or Plasma,2020,5,1,1
4108450,Inspiration/expiration time ratio,2019,9,1,1
4128111,T - Tumor stage,2020,11,1,1
4128111,T - Tumor stage,2020,12,1,1
4248525,Lying systolic blood pressure,2019,5,1,1
4248525,Lying systolic blood pressure,2019,10,1,1
4248525,Lying systolic blood pressure,2020,6,1,1
4248525,Lying systolic blood pressure,2021,10,1,1
4353713,Positive end expiratory pressure,2019,2,1,1
4353713,Positive end expiratory pressure,2021,7,1,1
4353717,Ventilator delivered minute volume,2021,5,1,1
4353717,Ventilator delivered minute volume,2022,12,1,1
4353843,Invasive systolic arterial pressure,2021,9,1,1
4353843,Invasive systolic arterial pressure,2021,10,1,1
4353843,Invasive systolic arterial pressure,2021,12,1,1
4353843,Invasive systolic arterial pressure,2023,4,1,1
4354252,Non-invasive systolic arterial pressure,2019,8,1,1
4354252,Non-invasive systolic arterial pressure,2020,8,1,1
4354252,Non-invasive systolic arterial pressure,2021,2,1,1
4354252,Non-invasive systolic arterial pressure,2021,6,1,1
4354252,Non-invasive systolic arterial pressure,2021,11,1,1
45766147,Appearance,2022,6,1,1
3001079,Blood group antibody screen [Presence] in Serum or Plasma,2020,5,2.5,2.5
4108450,Inspiration/expiration time ratio,2019,9,2.5,2.5
4128111,T - Tumor stage,2020,11,2.5,2.5
4128111,T - Tumor stage,2020,12,2.5,2.5
4248525,Lying systolic blood pressure,2019,5,2.5,2.5
4248525,Lying systolic blood pressure,2019,10,2.5,2.5
4248525,Lying systolic blood pressure,2020,6,2.5,2.5
4248525,Lying systolic blood pressure,2021,10,2.5,2.5
4353713,Positive end expiratory pressure,2019,2,2.5,2.5
4353713,Positive end expiratory pressure,2021,7,2.5,2.5
4353717,Ventilator delivered minute volume,2021,5,2.5,2.5
4353717,Ventilator delivered minute volume,2022,12,2.5,2.5
4353843,Invasive systolic arterial pressure,2021,9,2.5,2.5
4353843,Invasive systolic arterial pressure,2021,10,2.5,2.5
4353843,Invasive systolic arterial pressure,2021,12,2.5,2.5
4353843,Invasive systolic arterial pressure,2023,4,2.5,2.5
4354252,Non-invasive systolic arterial pressure,2019,8,2.5,2.5
4354252,Non-invasive systolic arterial pressure,2020,8,2.5,2.5
4354252,Non-invasive systolic arterial pressure,2021,2,2.5,2.5
4354252,Non-invasive systolic arterial pressure,2021,6,2.5,2.5
4354252,Non-invasive systolic arterial pressure,2021,11,2.5,2.5
45766147,Appearance,2022,6,2.5,2.5
5 changes: 5 additions & 0 deletions tests/testthat/setup.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
withr::local_envvar(
"LOW_FREQUENCY_THRESHOLD" = 10,
"LOW_FREQUENCY_REPLACEMENT" = 2.5,
.local_envir = testthat::teardown_env()
)
24 changes: 23 additions & 1 deletion tests/testthat/test-utils_get_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ test_that("Dev data files are consistent", {
expect_snapshot_file(save_csv(get_summary_stats()), "summary_stats.csv")
})


## Check if we can access the data used in production
test_that("Test data parquet files exist and are accessible", {
withr::local_envvar(c(
Expand Down Expand Up @@ -56,3 +55,26 @@ test_that("Test data parquet files exist and are accessible", {
c("concept_id", "concept_name", "summary_attribute", "value_as_number", "value_as_string")
)
})

# Check that low-frequency monthly counts are well processed
# (by removing values equal to 0 and
# by replacing values if they are below the threshold)
test_that("Test low frequency stats replacement for monthly counts", {
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved
mock_monthly_counts <- data.frame(
concept_id = c(1, 2, 3, 4, 5, 6, 7),
person_count = c(0, 0, 1000, 1000, 1, 1000, 1),
records_per_person = c(0, 1000, 0, 1000, 1000, 1, 1)
)
replacement <- as.double(Sys.getenv("LOW_FREQUENCY_REPLACEMENT"))
results <- .manage_low_frequency(mock_monthly_counts)
expect_equal(nrow(results), 4)
expect_equal(sum(results$person_count == 0), 0)
expect_equal(sum(results$person_count == 1), 0)
expect_equal(sum(results$person_count == replacement), 2)
expect_equal(sum(results$person_count == 1000), 2)

expect_equal(sum(results$records_per_person == 0), 0)
expect_equal(sum(results$records_per_person == 1), 0)
expect_equal(sum(results$records_per_person == replacement), 2)
expect_equal(sum(results$records_per_person == 1000), 2)
})