From 5e08bb84f07feab09470be438e9d05ec40c1db03 Mon Sep 17 00:00:00 2001 From: BaptisteBR Date: Tue, 27 Aug 2024 16:45:55 +0100 Subject: [PATCH] Add realistic test data (#24) * Insert dummy data to MEASUREMENT and OBSERVATION for the test db. Create realistic test dataset. * Clean CSV files. * Regenerate realistic test data after cleaning dummy CSVs. * Add script to produce the realistic test data. * Update dev/test_db/produce_test_data.R Co-authored-by: Milan Malfait <38256462+milanmlft@users.noreply.github.com> * Update README to explain how the reproduce test data. * Add 'ORDER BY' clause to produce results. * Update README with test dataset creation process. * Update README with test dataset creation process. * Update README with test dataset creation process. * Make sure DB connections are always closed * Remove clean up steps * Remove more cleanup steps * Use test data in data getters * Test consistency of test_data files * Fix indentation * Improve logging * Update test data * Update test data * Overwrite existing data to ensure consistency * Add sanity checks for dummy data Make sure the data in the database matches the local dummy files * Explicitly specify column types for dummy data to avoid errors later on * Overwrite database tables for summary results as well * Update test data (should be consistent now) * Use `readr` to write data as we're dealing with tibbles * Update snapshot test data * Just do a simple `arrange()` across all columns to order data * Clean up code * Reduce test data size * Readd sorting of tables * Update test data (really the last time now) * Read test data as tibble * Add readr as dependency --------- Co-authored-by: Milan Malfait <38256462+milanmlft@users.noreply.github.com> --- .Rprofile | 2 +- DESCRIPTION | 1 + R/utils_get_data.R | 33 ++------ README.md | 11 +++ dev/omop_analyses/analyse_omop_cdm.R | 11 ++- dev/test_db/dummy/measurement.csv | 24 ++++++ dev/test_db/dummy/observation.csv | 13 +++ dev/test_db/eunomia/.gitignore | 4 
+ dev/test_db/insert_dummy_tables.R | 79 +++++++++++++++++++ dev/test_db/produce_test_data.R | 52 ++++++++++++ dev/test_db/setup_test_db.R | 3 + inst/test_data/calypso_concepts.csv | 10 +++ inst/test_data/calypso_monthly_counts.csv | 23 ++++++ inst/test_data/calypso_summary_stats.csv | 17 ++++ .../_snaps/utils_get_data/concepts_table.csv | 10 +++ .../_snaps/utils_get_data/monthly_counts.csv | 23 ++++++ .../_snaps/utils_get_data/summary_stats.csv | 17 ++++ tests/testthat/test-utils_get_data.R | 20 +++++ 18 files changed, 321 insertions(+), 32 deletions(-) create mode 100644 dev/test_db/dummy/measurement.csv create mode 100644 dev/test_db/dummy/observation.csv create mode 100644 dev/test_db/insert_dummy_tables.R create mode 100644 dev/test_db/produce_test_data.R create mode 100644 inst/test_data/calypso_concepts.csv create mode 100644 inst/test_data/calypso_monthly_counts.csv create mode 100644 inst/test_data/calypso_summary_stats.csv create mode 100644 tests/testthat/_snaps/utils_get_data/concepts_table.csv create mode 100644 tests/testthat/_snaps/utils_get_data/monthly_counts.csv create mode 100644 tests/testthat/_snaps/utils_get_data/summary_stats.csv create mode 100644 tests/testthat/test-utils_get_data.R diff --git a/.Rprofile b/.Rprofile index db760ca..a0a4f3f 100644 --- a/.Rprofile +++ b/.Rprofile @@ -16,7 +16,7 @@ source("renv/activate.R") # Path to download Eunomia datasets Sys.setenv(EUNOMIA_DATA_FOLDER = file.path("dev/test_db/eunomia")) # Name of the synthetic dataset to use -Sys.setenv(TEST_DB_NAME = "GiBleed") +Sys.setenv(TEST_DB_NAME = "synthea-allergies-10k") # OMOP CDM version Sys.setenv(TEST_DB_OMOP_VERSION = "5.3") # Schema name for data diff --git a/DESCRIPTION b/DESCRIPTION index fd306fe..919c6e8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,7 @@ Imports: glue, tidyr, withr, + readr, lubridate, dplyr Suggests: diff --git a/R/utils_get_data.R b/R/utils_get_data.R index b759dfe..e17e239 100644 --- a/R/utils_get_data.R +++ 
b/R/utils_get_data.R @@ -5,19 +5,9 @@ #' @noRd get_concepts_table <- function() { if (golem::app_dev()) { - return(data.frame( - concept_id = c(40213251, 133834, 4057420), - concept_name = c( - "varicella virus vaccine", - "Atopic dermatitis", - "Catheter ablation of tissue of heart" - ), - domain_id = c("Drug", "Condition", "Procedure"), - vocabulary_id = c("CVX", "SNOMED", "SNOMED"), - concept_class_id = c("CVX", "Clinical Finding", "Procedure"), - standard_concept = c("S", "S", "S"), - concept_code = c("21", "24079001", "18286008") - )) + return( + readr::read_csv(app_sys("test_data", "calypso_concepts.csv"), show_col_types = FALSE) + ) } con <- connect_to_test_db() @@ -28,15 +18,7 @@ get_concepts_table <- function() { get_monthly_counts <- function() { if (golem::app_dev()) { return( - data.frame( - concept_id = c( - rep(c(40213251, 133834, 4057420), each = 3) - ), - date_year = c(2019L, 2020L, 2020L, 2019L, 2020L, 2020L, 2020L, 2019L, 2019L), - date_month = c(4L, 3L, 5L, 5L, 8L, 4L, 11L, 6L, 3L), - person_count = c(1, 1, 3, 4, 2, 3, 2, 4, 1), - records_per_person = c(1, 1, 1, 1, 1, 1, 1, 1, 1) - ) + readr::read_csv(app_sys("test_data", "calypso_monthly_counts.csv"), show_col_types = FALSE) ) } @@ -48,12 +30,7 @@ get_monthly_counts <- function() { get_summary_stats <- function() { if (golem::app_dev()) { return( - data.frame( - concept_id = rep(c(40213251, 133834, 4057420), each = 2), - summary_attribute = rep(c("mean", "sd"), times = 3), - value_as_string = rep(NA, 6), - value_as_number = c(1.5, 0.5, 2.5, 0.7, 3.5, 0.8) - ) + readr::read_csv(app_sys("test_data", "calypso_summary_stats.csv"), show_col_types = FALSE) ) } diff --git a/README.md b/README.md index 39ebc93..b4021a4 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,17 @@ as it has good support for R package development and Shiny. The `dev/02_dev.R` script contains a few helper functions to get you started. 
+Calypso test data can be found in [`inst/test_data`](https://github.com/UCLH-Foundry/omop-data-catalogue/tree/main/inst/test_data). These data have been generated by using the synthetic dataset '[synthea-allergies-10k](https://darwin-eu.github.io/CDMConnector/reference/eunomiaDir.html)', and adding some [dummy data](https://github.com/UCLH-Foundry/omop-data-catalogue/tree/main/dev/test_db/dummy) for the MEASUREMENT and OBSERVATION tables (to have some records in the 'calypso_summary_stats' table). + +If you want to recreate a test dataset, you can run the following R scripts: + +```r +source(here::here("dev/test_db/setup_test_db.R")) +source(here::here("dev/test_db/insert_dummy_tables.R")) +source(here::here("dev/omop_analyses/analyse_omop_cdm.R")) +source(here::here("dev/test_db/produce_test_data.R")) +``` + ### Updating the `renv` lockfile Make sure to regularly run `renv::status(dev = TRUE)` to check if your local library and the lockfile diff --git a/dev/omop_analyses/analyse_omop_cdm.R b/dev/omop_analyses/analyse_omop_cdm.R index 4fc3d5b..b70dcdb 100644 --- a/dev/omop_analyses/analyse_omop_cdm.R +++ b/dev/omop_analyses/analyse_omop_cdm.R @@ -1,4 +1,8 @@ -library(tidyverse) +cli::cli_h1("Generating summarys statistics") + +suppressPackageStartupMessages( + library(tidyverse) +) dir <- Sys.getenv("EUNOMIA_DATA_FOLDER") name <- Sys.getenv("TEST_DB_NAME") @@ -180,8 +184,7 @@ write_results <- function(data, con, table) { table = table ), value = data, - append = TRUE, - overwrite = FALSE + overwrite = TRUE ) } @@ -220,3 +223,5 @@ ids <- unique(c(monthly_counts$concept_id, summary_stats$concept_id)) # Retrieve concept properties from the list of ids get_concepts_table(cdm, ids) |> write_results(con, "calypso_concepts") + +cli::cli_alert_success("Summary statistics generated successfully") diff --git a/dev/test_db/dummy/measurement.csv b/dev/test_db/dummy/measurement.csv new file mode 100644 index 0000000..ae4d97b --- /dev/null +++ b/dev/test_db/dummy/measurement.csv @@
-0,0 +1,24 @@ +measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,unit_concept_id,range_low,range_high,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,value_source_value +10000000,103,4354252,2021-06-01,2021-06-01T10:45:00Z,NA,32817,0,115,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000001,12,4354252,2020-08-25,2020-08-25T14:15:00Z,NA,32817,0,122,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000002,866,4354252,2021-11-14,2021-11-14T13:05:00Z,NA,32817,0,125,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000003,12,4354252,2019-08-23,2019-08-23T02:28:00Z,NA,32817,0,131,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000004,51,4354252,2021-02-11,2021-02-11T15:01:00Z,NA,32817,0,111,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000005,3028,4354252,2018-01-18,2018-01-18T12:14:00Z,NA,32817,0,138,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000006,7,4248525,2021-10-15,2021-10-15T10:20:00Z,NA,32817,0,169,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000007,553,4248525,2015-04-19,2015-04-19T16:47:00Z,NA,32817,0,131,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000008,1641,4248525,2019-10-11,2019-10-11T07:00:00Z,NA,32817,0,128,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000009,553,4248525,2020-06-26,2020-06-26T00:00:00Z,NA,32817,0,114,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000010,12,4248525,2019-05-01,2019-05-01T20:55:00Z,NA,32817,0,122,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000011,978,4353843,2002-03-03,2002-03-03T22:00:00Z,NA,32817,0,162,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000012,12,4353843,2021-09-18,2021-09-18T02:00:00Z,NA,32817,0,152,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000013,6459,4353843,2021-12-28,2021-12-28T02:00:00Z,NA,32817,0,118,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000014,995,4353843,2023-04-08,2023-04-08T08:00:00Z,NA,32817,0,99,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA 
+10000015,110,4353843,2015-09-20,2015-09-20T07:00:00Z,NA,32817,0,117,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000016,8746,4353843,2021-10-01,2021-10-01T09:00:00Z,NA,32817,0,125,0,8876,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000017,978,4108450,2001-06-15,2001-06-15T07:50:00Z,NA,32817,0,0.6666666666666666,0,8523,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000018,8916,4108450,2019-09-13,2019-09-13T08:29:00Z,NA,32817,0,0.6666666666666666,0,8523,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000019,51,3001079,2020-05-15,2020-05-15T22:44:00Z,NA,32817,0,NA,45878588,0,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000020,909,3001079,2018-03-11,2018-03-11T13:30:00Z,NA,32817,0,NA,45878588,0,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000021,553,4128111,2020-12-02,2020-12-02T00:00:00Z,NA,32817,0,NA,1635564,0,NA,NA,NA,NA,NA,NA,NA,NA,NA +10000022,7,4128111,2020-11-25,2020-11-25T00:00:00Z,NA,32817,0,NA,1633781,0,NA,NA,NA,NA,NA,NA,NA,NA,NA diff --git a/dev/test_db/dummy/observation.csv b/dev/test_db/dummy/observation.csv new file mode 100644 index 0000000..eed2d34 --- /dev/null +++ b/dev/test_db/dummy/observation.csv @@ -0,0 +1,13 @@ +observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value +10000000,11,45766147,2022-06-24,2022-06-24T09:00:00Z,32817,NA,NA,4086518,NA,0,NA,NA,NA,NA,NA,NA,NA +10000001,59,4257036,2018-09-02,2018-09-02T09:19:00Z,32817,NA,NA,37208662,NA,0,NA,NA,NA,NA,NA,NA,NA +10000002,237,4257036,2014-11-19,2014-11-19T17:10:00Z,32817,NA,NA,37208662,NA,0,NA,NA,NA,NA,NA,NA,NA +10000003,299,4257036,2017-02-17,2017-02-17T11:14:00Z,32817,NA,NA,37208662,NA,0,NA,NA,NA,NA,NA,NA,NA +10000004,673,4216746,2011-03-22,2011-03-22T16:00:00Z,32817,8,NA,0,NA,44777590,NA,NA,NA,NA,NA,NA,NA 
+10000005,11,4353717,2022-12-05,2022-12-05T17:00:00Z,32817,10.6,NA,0,NA,8698,NA,NA,NA,NA,NA,NA,NA +10000006,1502,4353717,2021-05-13,2021-05-13T11:00:00Z,32817,16.8,NA,0,NA,8698,NA,NA,NA,NA,NA,NA,NA +10000007,986,4216746,2008-09-28,2008-09-28T20:00:00Z,32817,8,NA,0,NA,44777590,NA,NA,NA,NA,NA,NA,NA +10000008,299,4353717,2016-10-09,2016-10-09T01:00:00Z,32817,11,NA,0,NA,8698,NA,NA,NA,NA,NA,NA,NA +10000009,299,4353713,2018-01-12,2018-01-12T15:58:00Z,32817,5,NA,0,NA,44777590,NA,NA,NA,NA,NA,NA,NA +10000010,6288,4353713,2021-07-19,2021-07-19T15:14:00Z,32817,7,NA,0,NA,44777590,NA,NA,NA,NA,NA,NA,NA +10000011,3362,4353713,2019-02-08,2019-02-08T12:25:00Z,32817,5,NA,0,NA,44777590,NA,NA,NA,NA,NA,NA,NA diff --git a/dev/test_db/eunomia/.gitignore b/dev/test_db/eunomia/.gitignore index 7499f89..9f0a7c3 100644 --- a/dev/test_db/eunomia/.gitignore +++ b/dev/test_db/eunomia/.gitignore @@ -3,3 +3,7 @@ # duckdb databases *.duckdb + +# duckdb temp files +# (in case of failure) +*.duckdb.wal diff --git a/dev/test_db/insert_dummy_tables.R b/dev/test_db/insert_dummy_tables.R new file mode 100644 index 0000000..ef51cb6 --- /dev/null +++ b/dev/test_db/insert_dummy_tables.R @@ -0,0 +1,79 @@ +# PRODUCED FOR A SPECIFIC DATASET: +# synthea-allergies-10k +# (but could work for others) + +cli::cli_h1("Inserting dummy tables") + +library(readr) + +dir <- Sys.getenv("EUNOMIA_DATA_FOLDER") +name <- Sys.getenv("TEST_DB_NAME") +version <- Sys.getenv("TEST_DB_OMOP_VERSION") + +# Connect to the duckdb test database +con <- DBI::dbConnect( + duckdb::duckdb(dbdir = glue::glue("{dir}/{name}_{version}_1.0.duckdb")) +) + +withr::defer(DBI::dbDisconnect(con)) + +# Function to write data to a table in the cdm schema +write_table <- function(data, con, table) { + # Insert data into the specified table + # (in the cdm schema) + DBI::dbWriteTable( + conn = con, + name = DBI::Id( + schema = Sys.getenv("TEST_DB_CDM_SCHEMA"), + table = table + ), + value = data, + overwrite = TRUE + ) +} + +## Load dummy data and 
write tables to database +## We explicitly set the column types for columns that are needed later down the pipeline +dummy_measurements <- read_csv( + here::here("dev/test_db/dummy/measurement.csv"), + col_types = cols( + measurement_id = col_double(), + person_id = col_double(), + measurement_concept_id = col_double(), + measurement_date = col_date(), + value_as_number = col_double(), + value_as_concept_id = col_double(), + ) +) +write_table(dummy_measurements, con, "measurement") + +dummy_observations <- read_csv(here::here( + "dev/test_db/dummy/observation.csv"), + col_types = cols( + observation_id = col_double(), + person_id = col_double(), + observation_concept_id = col_double(), + observation_date = col_date(), + value_as_number = col_double(), + value_as_string = col_logical(), + value_as_concept_id = col_double(), + ) +) +write_table(dummy_observations, con, "observation") + +# Sanity check: read the data back and make sure it's consistent +db_measurements <- DBI::dbReadTable(con, "measurement") +stopifnot(all.equal(db_measurements, as.data.frame(dummy_measurements))) + +db_observations <- DBI::dbReadTable(con, "observation") +stopifnot(all.equal(db_observations, as.data.frame(dummy_observations))) + +# Load the CDM object to verify integrity of the schema after insertions +cdm <- CDMConnector::cdm_from_con( + con = con, + cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"), + write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"), + cdm_name = name +) + +cli::cli_alert_success("Dummy tables inserted successfully") diff --git a/dev/test_db/produce_test_data.R b/dev/test_db/produce_test_data.R new file mode 100644 index 0000000..11ce3bf --- /dev/null +++ b/dev/test_db/produce_test_data.R @@ -0,0 +1,52 @@ +cli::cli_h1("Producing test data") + +suppressPackageStartupMessages({ + library(dplyr) +}) + +dir <- Sys.getenv("EUNOMIA_DATA_FOLDER") +name <- Sys.getenv("TEST_DB_NAME") +version <- Sys.getenv("TEST_DB_OMOP_VERSION") + +# Connect to the duckdb test database +con
<- DBI::dbConnect( + duckdb::duckdb(dbdir = glue::glue("{dir}/{name}_{version}_1.0.duckdb")) +) +withr::defer(DBI::dbDisconnect(con)) + +# Function to read all rows of a table from the results schema +read_table <- function(con, table) { + schema <- Sys.getenv("TEST_DB_RESULTS_SCHEMA") + # Get all rows from the table + query <- glue::glue("SELECT * FROM {schema}.{table}") + # Run the query and return the rows sorted across all columns + con |> + DBI::dbGetQuery(query) |> + arrange(across(everything())) +} + +# Get the relevant tables and filter +table_names <- c("calypso_concepts", "calypso_monthly_counts", "calypso_summary_stats") +tables <- purrr::map(table_names, read_table, con = con) +names(tables) <- table_names + +# Keep only concepts for which we have summary statistics +keep_concepts <- tables$calypso_summary_stats$concept_id +tables <- purrr::map(tables, ~ .x[.x$concept_id %in% keep_concepts, ]) + +# Keep only data from 2019 onwards +monthly_counts <- tables$calypso_monthly_counts +filtered_monthly <- monthly_counts[monthly_counts$date_year >= 2019, ] +tables$calypso_monthly_counts <- filtered_monthly + +# Filter the other tables to match the concepts left over after year filtering +tables <- purrr::map(tables, ~ .x[.x$concept_id %in% filtered_monthly$concept_id, ]) + +# Write all results to the test data folder +purrr::iwalk(tables, function(tbl, name) { + path <- here::here(glue::glue("inst/test_data/{name}.csv")) + cli::cli_alert_info("Writing {name} to {path}") + readr::write_csv(tbl, file = path) +}) + +cli::cli_alert_success("Test data produced") diff --git a/dev/test_db/setup_test_db.R b/dev/test_db/setup_test_db.R index ca9ea19..03e1d13 100644 --- a/dev/test_db/setup_test_db.R +++ b/dev/test_db/setup_test_db.R @@ -1,3 +1,4 @@ +cli::cli_h1("Setting up test database") # Create an duckdb database from Eunomia datasets con <- DBI::dbConnect( @@ -18,3 +19,5 @@ CDMConnector::cdm_from_con( write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"), cdm_name = Sys.getenv("TEST_DB_NAME")
) + +cli::cli_alert_success("Test database setup successfully") diff --git a/inst/test_data/calypso_concepts.csv b/inst/test_data/calypso_concepts.csv new file mode 100644 index 0000000..f6dcdb0 --- /dev/null +++ b/inst/test_data/calypso_concepts.csv @@ -0,0 +1,10 @@ +concept_id,concept_name,vocabulary_id,domain_id,concept_class_id,standard_concept,concept_code +3001079,Blood group antibody screen [Presence] in Serum or Plasma,LOINC,Measurement,Lab Test,S,890-4 +4108450,Inspiration/expiration time ratio,SNOMED,Measurement,Observable Entity,S,250822000 +4128111,T - Tumor stage,SNOMED,Observation,Attribute,S,260878002 +4248525,Lying systolic blood pressure,SNOMED,Measurement,Observable Entity,S,407556006 +4353713,Positive end expiratory pressure,SNOMED,Observation,Observable Entity,S,250854009 +4353717,Ventilator delivered minute volume,SNOMED,Observation,Observable Entity,S,250875001 +4353843,Invasive systolic arterial pressure,SNOMED,Measurement,Observable Entity,S,251071003 +4354252,Non-invasive systolic arterial pressure,SNOMED,Measurement,Observable Entity,S,251070002 +45766147,Appearance,SNOMED,Observation,Observable Entity,S,703248002 diff --git a/inst/test_data/calypso_monthly_counts.csv b/inst/test_data/calypso_monthly_counts.csv new file mode 100644 index 0000000..1d8d600 --- /dev/null +++ b/inst/test_data/calypso_monthly_counts.csv @@ -0,0 +1,23 @@ +concept_id,concept_name,date_year,date_month,person_count,records_per_person +3001079,Blood group antibody screen [Presence] in Serum or Plasma,2020,5,1,1 +4108450,Inspiration/expiration time ratio,2019,9,1,1 +4128111,T - Tumor stage,2020,11,1,1 +4128111,T - Tumor stage,2020,12,1,1 +4248525,Lying systolic blood pressure,2019,5,1,1 +4248525,Lying systolic blood pressure,2019,10,1,1 +4248525,Lying systolic blood pressure,2020,6,1,1 +4248525,Lying systolic blood pressure,2021,10,1,1 +4353713,Positive end expiratory pressure,2019,2,1,1 +4353713,Positive end expiratory pressure,2021,7,1,1 +4353717,Ventilator 
delivered minute volume,2021,5,1,1 +4353717,Ventilator delivered minute volume,2022,12,1,1 +4353843,Invasive systolic arterial pressure,2021,9,1,1 +4353843,Invasive systolic arterial pressure,2021,10,1,1 +4353843,Invasive systolic arterial pressure,2021,12,1,1 +4353843,Invasive systolic arterial pressure,2023,4,1,1 +4354252,Non-invasive systolic arterial pressure,2019,8,1,1 +4354252,Non-invasive systolic arterial pressure,2020,8,1,1 +4354252,Non-invasive systolic arterial pressure,2021,2,1,1 +4354252,Non-invasive systolic arterial pressure,2021,6,1,1 +4354252,Non-invasive systolic arterial pressure,2021,11,1,1 +45766147,Appearance,2022,6,1,1 diff --git a/inst/test_data/calypso_summary_stats.csv b/inst/test_data/calypso_summary_stats.csv new file mode 100644 index 0000000..6d41528 --- /dev/null +++ b/inst/test_data/calypso_summary_stats.csv @@ -0,0 +1,17 @@ +concept_id,concept_name,summary_attribute,value_as_number,value_as_string +3001079,Blood group antibody screen [Presence] in Serum or Plasma,frequency,2,Not present +4108450,Inspiration/expiration time ratio,mean,0.6666666666666666,NA +4108450,Inspiration/expiration time ratio,sd,0,NA +4128111,T - Tumor stage,frequency,1,NA +4128111,T - Tumor stage,frequency,1,NA +4248525,Lying systolic blood pressure,mean,132.8,NA +4248525,Lying systolic blood pressure,sd,21.25323504786977,NA +4353713,Positive end expiratory pressure,mean,5.666666666666667,NA +4353713,Positive end expiratory pressure,sd,1.1547005383792517,NA +4353717,Ventilator delivered minute volume,mean,12.799999999999999,NA +4353717,Ventilator delivered minute volume,sd,3.469870314579495,NA +4353843,Invasive systolic arterial pressure,mean,128.83333333333334,NA +4353843,Invasive systolic arterial pressure,sd,23.65938855225694,NA +4354252,Non-invasive systolic arterial pressure,mean,123.66666666666667,NA +4354252,Non-invasive systolic arterial pressure,sd,9.993331109628395,NA +45766147,Appearance,frequency,1,Well nourished diff --git 
a/tests/testthat/_snaps/utils_get_data/concepts_table.csv b/tests/testthat/_snaps/utils_get_data/concepts_table.csv new file mode 100644 index 0000000..f6dcdb0 --- /dev/null +++ b/tests/testthat/_snaps/utils_get_data/concepts_table.csv @@ -0,0 +1,10 @@ +concept_id,concept_name,vocabulary_id,domain_id,concept_class_id,standard_concept,concept_code +3001079,Blood group antibody screen [Presence] in Serum or Plasma,LOINC,Measurement,Lab Test,S,890-4 +4108450,Inspiration/expiration time ratio,SNOMED,Measurement,Observable Entity,S,250822000 +4128111,T - Tumor stage,SNOMED,Observation,Attribute,S,260878002 +4248525,Lying systolic blood pressure,SNOMED,Measurement,Observable Entity,S,407556006 +4353713,Positive end expiratory pressure,SNOMED,Observation,Observable Entity,S,250854009 +4353717,Ventilator delivered minute volume,SNOMED,Observation,Observable Entity,S,250875001 +4353843,Invasive systolic arterial pressure,SNOMED,Measurement,Observable Entity,S,251071003 +4354252,Non-invasive systolic arterial pressure,SNOMED,Measurement,Observable Entity,S,251070002 +45766147,Appearance,SNOMED,Observation,Observable Entity,S,703248002 diff --git a/tests/testthat/_snaps/utils_get_data/monthly_counts.csv b/tests/testthat/_snaps/utils_get_data/monthly_counts.csv new file mode 100644 index 0000000..1d8d600 --- /dev/null +++ b/tests/testthat/_snaps/utils_get_data/monthly_counts.csv @@ -0,0 +1,23 @@ +concept_id,concept_name,date_year,date_month,person_count,records_per_person +3001079,Blood group antibody screen [Presence] in Serum or Plasma,2020,5,1,1 +4108450,Inspiration/expiration time ratio,2019,9,1,1 +4128111,T - Tumor stage,2020,11,1,1 +4128111,T - Tumor stage,2020,12,1,1 +4248525,Lying systolic blood pressure,2019,5,1,1 +4248525,Lying systolic blood pressure,2019,10,1,1 +4248525,Lying systolic blood pressure,2020,6,1,1 +4248525,Lying systolic blood pressure,2021,10,1,1 +4353713,Positive end expiratory pressure,2019,2,1,1 +4353713,Positive end expiratory pressure,2021,7,1,1 
+4353717,Ventilator delivered minute volume,2021,5,1,1 +4353717,Ventilator delivered minute volume,2022,12,1,1 +4353843,Invasive systolic arterial pressure,2021,9,1,1 +4353843,Invasive systolic arterial pressure,2021,10,1,1 +4353843,Invasive systolic arterial pressure,2021,12,1,1 +4353843,Invasive systolic arterial pressure,2023,4,1,1 +4354252,Non-invasive systolic arterial pressure,2019,8,1,1 +4354252,Non-invasive systolic arterial pressure,2020,8,1,1 +4354252,Non-invasive systolic arterial pressure,2021,2,1,1 +4354252,Non-invasive systolic arterial pressure,2021,6,1,1 +4354252,Non-invasive systolic arterial pressure,2021,11,1,1 +45766147,Appearance,2022,6,1,1 diff --git a/tests/testthat/_snaps/utils_get_data/summary_stats.csv b/tests/testthat/_snaps/utils_get_data/summary_stats.csv new file mode 100644 index 0000000..c5017c1 --- /dev/null +++ b/tests/testthat/_snaps/utils_get_data/summary_stats.csv @@ -0,0 +1,17 @@ +concept_id,concept_name,summary_attribute,value_as_number,value_as_string +3001079,Blood group antibody screen [Presence] in Serum or Plasma,frequency,2,Not present +4108450,Inspiration/expiration time ratio,mean,0.6666666666666666,NA +4108450,Inspiration/expiration time ratio,sd,0,NA +4128111,T - Tumor stage,frequency,1,NA +4128111,T - Tumor stage,frequency,1,NA +4248525,Lying systolic blood pressure,mean,132.8,NA +4248525,Lying systolic blood pressure,sd,21.25323504786977,NA +4353713,Positive end expiratory pressure,mean,5.666666666666667,NA +4353713,Positive end expiratory pressure,sd,1.1547005383792517,NA +4353717,Ventilator delivered minute volume,mean,12.8,NA +4353717,Ventilator delivered minute volume,sd,3.469870314579495,NA +4353843,Invasive systolic arterial pressure,mean,128.83333333333334,NA +4353843,Invasive systolic arterial pressure,sd,23.65938855225694,NA +4354252,Non-invasive systolic arterial pressure,mean,123.66666666666669,NA +4354252,Non-invasive systolic arterial pressure,sd,9.993331109628397,NA 
+45766147,Appearance,frequency,1,Well nourished diff --git a/tests/testthat/test-utils_get_data.R b/tests/testthat/test-utils_get_data.R new file mode 100644 index 0000000..84b1800 --- /dev/null +++ b/tests/testthat/test-utils_get_data.R @@ -0,0 +1,20 @@ +# Sanity checks +test_that("Test data files exist", { + expect_true(file.exists(app_sys("test_data", "calypso_concepts.csv"))) + expect_true(file.exists(app_sys("test_data", "calypso_monthly_counts.csv"))) + expect_true(file.exists(app_sys("test_data", "calypso_summary_stats.csv"))) +}) + +# These tests act as proxy tests for the pre-processing scripts that generate the test data +# making sure the test data files are generated correctly and consistently +test_that("Test data files are consistent", { + # To use expect_snapshot_file(), need to save the output to a temporary file + save_csv <- function(x) { + path <- tempfile(fileext = ".csv") + readr::write_csv(x, file = path) + path + } + expect_snapshot_file(save_csv(get_concepts_table()), "concepts_table.csv") + expect_snapshot_file(save_csv(get_monthly_counts()), "monthly_counts.csv") + expect_snapshot_file(save_csv(get_summary_stats()), "summary_stats.csv") +})