Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle analysis of categorical concepts #32

Merged
merged 24 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
34b722f
add analyse_categorical_column() untested because afaik we haven't ye…
Aug 20, 2024
4a05037
fix logic bug with passing 'value' and simplify slightly. Don't need …
Aug 20, 2024
6d3a8e6
fix 2nd logic bug, need to retain columns (actually only value_as_con…
Aug 20, 2024
e0ef9f3
fix logic bug in argument order, move collect to end. Still seem to g…
Aug 20, 2024
67687cd
seems to be working now on independent test data :-) fixed another lo…
Aug 20, 2024
748da0c
Update dev/omop_analyses/analyse_omop_cdm.R
andysouth Aug 20, 2024
ffdfa7f
completing table rename started on Github fron Stef's suggestion
Aug 20, 2024
13566a5
Merge branch 'main' into add-analyse_categorical_column
milanmlft Aug 21, 2024
1c4c9e1
Format and simplify code
milanmlft Aug 21, 2024
75c391e
Update spellcheck wordlist
milanmlft Aug 21, 2024
610b8e6
Fix: set correct name for attribute value
milanmlft Aug 21, 2024
65f9c85
Remove clean up steps from scripts
milanmlft Aug 21, 2024
52d1e00
Ensure database connections are clossed on exit, even if script fails
milanmlft Aug 21, 2024
6e245ee
Pull `analyse_*_column` helpers out of main function
milanmlft Aug 21, 2024
20e2a67
Make `analyse_categorical_column` more consistent with `analyse_numer…
milanmlft Aug 21, 2024
40251e6
Rename helper functions
milanmlft Aug 21, 2024
b48a5c4
Refactor `analyse_*` functions
milanmlft Aug 21, 2024
cdbc9d7
Add concept names to summary table (#33)
milanmlft Aug 21, 2024
bb345ce
Add concept names to monthly counts table (#33)
milanmlft Aug 21, 2024
2bb42fe
Rename functions
milanmlft Aug 21, 2024
5c90c5b
Fix comments
milanmlft Aug 21, 2024
7e3ae67
Remove NA values when calculating mean and sd
milanmlft Aug 21, 2024
4527344
Fix column selection
milanmlft Aug 21, 2024
3ab3e6d
Add concept names to result tables (#33)
milanmlft Aug 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 89 additions & 64 deletions dev/omop_analyses/analyse_omop_cdm.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

library(tidyverse)

dir <- Sys.getenv("EUNOMIA_DATA_FOLDER")
Expand All @@ -7,7 +6,9 @@ version <- Sys.getenv("TEST_DB_OMOP_VERSION")

# Connect to the duckdb test database
con <- DBI::dbConnect(duckdb::duckdb(
dbdir = glue::glue("{dir}/{name}_{version}_1.0.duckdb")))
dbdir = glue::glue("{dir}/{name}_{version}_1.0.duckdb")
))
withr::defer(DBI::dbDisconnect(con))
milanmlft marked this conversation as resolved.
Show resolved Hide resolved

# Function to execute one or more SQL queries and clear results
create_results_tables <- function(con, sql) {
Expand All @@ -19,7 +20,7 @@ create_results_tables <- function(con, sql) {

# Function to produce the 'calypso_concepts' table
# from a list of concept ids
analyse_concepts <- function(cdm, concepts) {
get_concepts_table <- function(cdm, concepts) {
# Extract columns from concept table
cdm$concept |>
filter(concept_id %in% concepts) |>
Expand All @@ -36,7 +37,7 @@ analyse_concepts <- function(cdm, concepts) {
}

# Function to produce the 'calypso_monthly_counts' table
analyse_monthly_counts <- function(cdm) {
calculate_monthly_counts <- function(cdm) {
# Function to analyse a column from a specific table
# for each month
analyse_table <- function(table, concept, date) {
Expand Down Expand Up @@ -76,7 +77,7 @@ analyse_monthly_counts <- function(cdm) {
)
}
# Combine results for all tables
bind_rows(
out <- bind_rows(
cdm$condition_occurrence |> analyse_table(condition_concept_id, condition_start_date),
cdm$drug_exposure |> analyse_table(drug_concept_id, drug_exposure_start_date),
cdm$procedure_occurrence |> analyse_table(procedure_concept_id, procedure_date),
Expand All @@ -85,40 +86,87 @@ analyse_monthly_counts <- function(cdm) {
cdm$observation |> analyse_table(observation_concept_id, observation_date),
cdm$specimen |> analyse_table(specimen_concept_id, specimen_date)
)

# Map concept names to the concept IDs
concept_names <- select(cdm$concept, concept_id, concept_name) |>
filter(concept_id %in% out$concept_id) |>
collect()
out |>
left_join(concept_names, by = c("concept_id" = "concept_id")) |>
select(concept_id, concept_name, everything())
}

# Function to analyse a numeric column
# by calculating the mean and the standard deviation
summarise_numeric_concepts <- function(.data) {
# Calculate mean and sd
stats <- .data |>
group_by(concept_id) |>
summarise(mean = mean(value_as_number, na.rm = TRUE), sd = sd(value_as_number, na.rm = TRUE))

# Wrangle output to expected format
stats |>
pivot_longer(
cols = c(mean, sd),
names_to = "summary_attribute",
values_to = "value_as_number"
)
}

# Function to analyse a categorical column - present in observation and measurement
# by joining value_as_concept_id to cdm$concept by concept_id
summarise_categorical_concepts <- function(.data) {
# Calculate frequencies
frequencies <- .data |>
count(concept_id, value_as_concept_id)

# Wrangle output into the expected format
frequencies |>
mutate(summary_attribute = "frequency") |>
select(
concept_id,
summary_attribute,
value_as_number = n,
value_as_concept_id
)
}

summarise_concepts <- function(.data, concept_name) {
stopifnot(inherits(.data, "tbl"))
stopifnot(is.character(concept_name))

.data <- rename(.data, concept_id = all_of(concept_name))

numeric_concepts <- filter(.data, !is.na(value_as_number))
# beware CDM docs: NULL=no categorical result, 0=categorical result but no mapping
categorical_concepts <- filter(.data, !is.null(value_as_concept_id) & value_as_concept_id != 0)

numeric_stats <- summarise_numeric_concepts(numeric_concepts) |> collect()
categorical_stats <- summarise_categorical_concepts(categorical_concepts) |> collect()
bind_rows(numeric_stats, categorical_stats)
}

# Function to produce the 'calypso_summary_stats' table
analyse_summary_stats <- function(cdm) {
# Function to analyse a numeric column
# by calculation the mean and the standard deviation
analyse_numeric_column <- function(table, concept, value) {
# Rename columns and remove empty values
table <- table |>
select(concept_id = {{ concept }}, value = {{ value }}) |>
filter(!is.na(value)) |>
collect()
# Calculate mean
df_mean <- table |>
group_by(concept_id) |>
reframe(
summary_attribute = "mean",
value_as_number = mean(value)
)
# Calculate standard deviation
df_sd <- table |>
group_by(concept_id) |>
reframe(
summary_attribute = "sd",
value_as_number = sd(value)
)
# Combine mean and standard deviation
bind_rows(df_mean, df_sd)
}
# Combine results for all columns
bind_rows(
cdm$measurement |> analyse_numeric_column(measurement_concept_id, value_as_number),
cdm$observation |> analyse_numeric_column(observation_concept_id, value_as_number)
)
calculate_summary_stats <- function(cdm) {
table_names <- c("measurement", "observation")
concept_names <- c("measurement_concept_id", "observation_concept_id")

# Combine results for all tables
stats <- map2(table_names, concept_names, ~ summarise_concepts(cdm[[.x]], .y))
stats <- bind_rows(stats)

# Map concept names to the concept_ids
concept_names <- select(cdm$concept, concept_id, concept_name) |>
filter(concept_id %in% c(stats$concept_id, stats$value_as_concept_id)) |>
collect()
stats |>
# Order is important here, first we get the names for the value_as_concept_ids
# from the categorical data summaries and record it as `value_as_string`
left_join(concept_names, by = c("value_as_concept_id" = "concept_id")) |>
rename(value_as_string = concept_name) |>
# Then we get the names for the main concept_ids
left_join(concept_names, by = c("concept_id" = "concept_id")) |>
select(concept_id, concept_name, !value_as_concept_id)
}

# Function to write result to the results schema
Expand Down Expand Up @@ -157,41 +205,18 @@ cdm <- CDMConnector::cdm_from_con(
)

# Generate monthly counts and write it to the DB
monthly_counts <- analyse_monthly_counts(cdm)
monthly_counts <- calculate_monthly_counts(cdm)
monthly_counts |>
write_results(con, "calypso_monthly_counts")

# Generate summary stats and write it to the DB
summary_stats <- analyse_summary_stats(cdm)
summary_stats <- calculate_summary_stats(cdm)
summary_stats |>
write_results(con, "calypso_summary_stats")

# Get list of concept ids
ids <- bind_rows(
{ monthly_counts |> select(concept_id) },
{ summary_stats |> select(concept_id) }
) |> distinct()
ids <- ids$concept_id
# Get all distinct concept ids
ids <- unique(c(monthly_counts$concept_id, summary_stats$concept_id))

# Retrieve concept properties from the list of ids
analyse_concepts(cdm, ids) |>
get_concepts_table(cdm, ids) |>
write_results(con, "calypso_concepts")

# Clean up
DBI::dbDisconnect(con)
rm(create_results_tables)
rm(analyse_concepts)
rm(analyse_monthly_counts)
rm(analyse_summary_stats)
rm(write_results)
rm(monthly_counts)
rm(summary_stats)
rm(ids)
rm(cdm)
rm(con)
rm(sql)
rm(dir)
rm(name)
rm(version)

detach("package:tidyverse", unload = TRUE)
2 changes: 2 additions & 0 deletions dev/omop_analyses/calypso_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ CREATE TABLE @resultsDatabaseSchema.calypso_concepts (

CREATE TABLE @resultsDatabaseSchema.calypso_monthly_counts (
concept_id BIGINT,
concept_name VARCHAR,
date_year INTEGER,
date_month INTEGER,
person_count BIGINT,
Expand All @@ -22,6 +23,7 @@ CREATE TABLE @resultsDatabaseSchema.calypso_monthly_counts (

CREATE TABLE @resultsDatabaseSchema.calypso_summary_stats (
concept_id BIGINT,
concept_name VARCHAR,
summary_attribute VARCHAR,
value_as_string VARCHAR,
value_as_number DOUBLE
Expand Down
5 changes: 1 addition & 4 deletions dev/test_db/setup_test_db.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ con <- DBI::dbConnect(
)
)
)
withr::defer(DBI::dbDisconnect(con))

# Use 'cdm_from_con' to load the dataset and verify integrity
CDMConnector::cdm_from_con(
Expand All @@ -17,7 +18,3 @@ CDMConnector::cdm_from_con(
write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
cdm_name = Sys.getenv("TEST_DB_NAME")
)

# Clean up
DBI::dbDisconnect(con)
rm(con)
6 changes: 6 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
CMD
Catalogue
Lifecycle
OMOP
RStudio
UI
catalogue
duckdb
golem
lockfile
tidyverse