Skip to content

Commit

Permalink
Add summary stat plot for categorical concepts (#36)
Browse files Browse the repository at this point in the history
* Add dummy categorical data for testing

* Add plotting function for categorical data

* Update `NAMESPACE`

* Hide legend for the barplot

* Add roxygen comments for `stat_categorical_plot`

* Add wrapper function for summary stat plots

* Rename module for summary stat plotting

* Rename `stat_numeric` -> `summary_stat`

* Rename tests for `summary_stat`

* Typo fix

* More renaming in tests

* Add test for categorical plots

* Add forcats as dependency

* Handle edge case when no data available for requested concept
  • Loading branch information
milanmlft authored Aug 27, 2024
1 parent 5e08bb8 commit ff8d57f
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 86 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Imports:
glue,
tidyr,
withr,
forcats,
readr,
lubridate,
dplyr
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ importFrom(duckdb,duckdb)
importFrom(ggplot2,aes)
importFrom(ggplot2,geom_bar)
importFrom(ggplot2,geom_boxplot)
importFrom(ggplot2,geom_col)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggtitle)
importFrom(ggplot2,labs)
importFrom(ggplot2,xlab)
importFrom(ggplot2,ylab)
importFrom(glue,glue)
Expand Down
4 changes: 2 additions & 2 deletions R/app_server.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ app_server <- function(input, output, session) {
selected_row <- mod_datatable_server("totals", selected_data)
selected_dates <- mod_date_range_server("date_range")

# TODO: refactor monthly_count and stat_numeric modules into a single module
# TODO: refactor monthly_count and summary_stat modules into a single module?n
# https://github.com/UCLH-Foundry/omop-data-catalogue/issues/30
mod_monthly_count_server("monthly_count", monthly_counts, selected_row, selected_dates)
mod_stat_numeric_server("stat_numeric", summary_stats, selected_row)
mod_summary_stat_server("summary_stat", summary_stats, selected_row)

mod_export_tab_server("export_tab", selected_data)
}
2 changes: 1 addition & 1 deletion R/app_ui.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ app_ui <- function(request) {
card(mod_datatable_ui("totals")),
layout_columns(
card(mod_monthly_count_ui("monthly_count")),
card(mod_stat_numeric_ui("stat_numeric"))
card(mod_summary_stat_ui("summary_stat"))
)
),
nav_panel(
Expand Down
49 changes: 0 additions & 49 deletions R/fct_stat_numeric_plot.R

This file was deleted.

111 changes: 111 additions & 0 deletions R/fct_summary_stat_plot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#' summary_stat_plot
#'
#' Wrapper function to generate a plot for a summary statistic depending on its type
#' (categorical or numeric).
#'
#' @param summary_stats A `data.frame` containing the summary statistics.
#' @param plot_title A `character`, to be used as title of the plot.
#'
#' @return A `ggplot2` object.
#'
#' @noRd
summary_stat_plot <- function(summary_stats, plot_title) {
if (.is_categorical(summary_stats)) {
stat_categorical_plot(summary_stats, plot_title)
} else {
stat_numeric_plot(summary_stats, plot_title)
}
}

#' stat_numeric_plot
#'
#' Generates a boxplot of the summary statistics for a numeric concept.
#' Uses pre-calculated `mean` and `sd` to generate the boxplot.
#'
#' Expects the input data to have the following columns:
#' - `concept_id`: The concept ID.
#' - `summary_attribute`: The type of the summary attribute, e.g. `mean` or `sd`.
#' - `value_as_number`: The value of the summary attribute as a numeric value.
#'
#' @param summary_stats A `data.frame` containing the summary statistics.
#' @param plot_title A `character`, to be used as title of the plot.
#'
#' @return A `ggplot2` object.
#'
#' @importFrom ggplot2 ggplot aes geom_boxplot
#' @noRd
stat_numeric_plot <- function(summary_stats, plot_title) {
processed_stats <- .process_numeric_stats(summary_stats)

mean <- sd <- concept_id <- NULL
ggplot(processed_stats, aes(x = factor(concept_id))) +
geom_boxplot(
aes(
lower = mean - sd,
upper = mean + sd,
middle = mean,
ymin = mean - 3 * sd,
ymax = mean + 3 * sd
),
stat = "identity"
) +
xlab(NULL) +
ggtitle(plot_title)
}

#' stat_categorical_plot
#'
#' Generates a bar plot of the category frequencies for a categorical concept.
#' Uses pre-calculated frequencies to generate the plot.
#'
#' Expects the input data to have the following columns:
#' - `concept_id`: The concept ID.
#' - `summary_attribute`: The type of the summary attribute, should be 'frequency'.
#' - `value_as_string`: The name of the category
#' - `value_as_number`: The value of the summary attribute as a numeric value.
#'
#' @param summary_stats A `data.frame` containing the summary statistics.
#' @param plot_title A `character`, to be used as title of the plot.
#'
#' @return A `ggplot2` object.
#'
#' @importFrom ggplot2 ggplot aes geom_col labs
#' @noRd
stat_categorical_plot <- function(summary_stats, plot_title) {
# We expect only single concept ID at this point
# NOTE: this might change when we support bundles of concepts, in which case we might want to
# display the entire batch in one plot
stopifnot("Expecting a single concept ID" = length(unique(summary_stats$concept_id)) == 1)
stopifnot(c("concept_id", "value_as_string", "value_as_number") %in% names(summary_stats))

summary_stats$value_as_string <- as.factor(summary_stats$value_as_string)
# Reorder factor levels by frequency
summary_stats$value_as_string <- forcats::fct_reorder(
summary_stats$value_as_string, summary_stats$value_as_number,
.desc = TRUE
)

value_as_string <- value_as_number <- NULL
ggplot(summary_stats, aes(value_as_string, value_as_number)) +
geom_col(aes(fill = value_as_string), show.legend = FALSE) +
labs(x = "Category", y = "Frequency") +
ggtitle(plot_title)
}

.process_numeric_stats <- function(summary_stats) {
# We expect only single concept ID at this point
# NOTE: this might change when we support bundles of concepts, in which case we might want to
# display the entire batch in one plot
stopifnot("Expecting a single concept ID" = length(unique(summary_stats$concept_id)) == 1)
stopifnot(c("concept_id", "summary_attribute", "value_as_number") %in% names(summary_stats))

tidyr::pivot_wider(summary_stats,
id_cols = "concept_id",
names_from = "summary_attribute",
values_from = "value_as_number"
)
}

.is_categorical <- function(summary_stats) {
"frequency" %in% summary_stats$summary_attribute
}
23 changes: 11 additions & 12 deletions R/mod_monthly_count.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,17 @@ mod_monthly_count_server <- function(id, data, selected_concept, selected_dates)
stopifnot(is.reactive(selected_concept))
stopifnot(is.reactive(selected_dates))


moduleServer(id, function(input, output, session) {
## Filter data based on selected concept and datea range
## Filter data based on selected concept and date range
selected_concept_id <- reactive(selected_concept()$concept_id)
selected_concept_name <- reactive(selected_concept()$concept_name)
filtered_monthly_counts <- reactive({
if (is.null(selected_concept_id())) {
return(NULL)
}
req(selected_dates()) # we expect always to have dates selected
out <- data[data$concept_id == selected_concept_id(), ]

req(selected_dates()) # we expect always to have dates selected
.filter_dates(out, selected_dates())
})

Expand All @@ -47,21 +47,20 @@ mod_monthly_count_server <- function(id, data, selected_concept, selected_dates)
if (is.null(filtered_monthly_counts()) || nrow(filtered_monthly_counts()) == 0) {
return(NULL)
}

monthly_count_plot(filtered_monthly_counts(), selected_concept_name())
})
})
}



.filter_dates <- function(monthly_counts, date_range) {
range_years <- lubridate::year(date_range)
range_months <- lubridate::month(date_range)
date_range <- as.Date(date_range)
if (date_range[2] < date_range[1]) {
stop("Invalid date range, end date is before start date")
}

date_year <- date_month <- NULL
dplyr::filter(
monthly_counts,
date_year >= range_years[1] & date_year <= range_years[2],
date_month >= range_months[1] & date_month <= range_months[2]
)
dates <- lubridate::make_date(year = monthly_counts$date_year, month = monthly_counts$date_month)
keep_dates <- dplyr::between(dates, date_range[1], date_range[2])
dplyr::filter(monthly_counts, keep_dates)
}
17 changes: 9 additions & 8 deletions R/mod_stat_numeric.R → R/mod_summary_stat.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' stat_numeric UI Function
#' summary_stat UI Function
#'
#' Displays the boxplot of the summary statistics for a numeric concept.
#'
Expand All @@ -7,14 +7,14 @@
#' @noRd
#'
#' @importFrom shiny NS tagList
mod_stat_numeric_ui <- function(id) {
mod_summary_stat_ui <- function(id) {
ns <- NS(id)
tagList(
plotOutput(ns("stat_numeric_plot"), height = 250)
plotOutput(ns("summary_stat_plot"), height = 250)
)
}

#' stat_numeric Server Functions
#' summary_stat Server Functions
#'
#' Generates the boxplot of the summary statistics for a numeric concept.
#' When no concept was selected, an empty plot is returned.
Expand All @@ -23,7 +23,7 @@ mod_stat_numeric_ui <- function(id) {
#' @param selected_concept Reactive value containing the selected concept, used for filtering
#'
#' @noRd
mod_stat_numeric_server <- function(id, data, selected_concept) {
mod_summary_stat_server <- function(id, data, selected_concept) {
stopifnot(is.data.frame(data))
stopifnot(is.reactive(selected_concept))

Expand All @@ -40,10 +40,11 @@ mod_stat_numeric_server <- function(id, data, selected_concept) {
data[data$concept_id == selected_concept_id(), ]
})

output$stat_numeric_plot <- renderPlot({
## Return empty plot if no data is selected
output$summary_stat_plot <- renderPlot({
## Return empty plot if no data is selected or if no data is available for the selected concept
if (is.null(filtered_summary_stats())) return(NULL)
stat_numeric_plot(filtered_summary_stats(), selected_concept_name())
if (nrow(filtered_summary_stats()) == 0) return(NULL)
summary_stat_plot(filtered_summary_stats(), selected_concept_name())
})
})
}
43 changes: 42 additions & 1 deletion tests/testthat/test-mod_monthly_count.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ mock_monthly_counts <- data.frame(
# Application-logic tests ---------------------------------------------------------------------

mock_concept_row <- reactiveVal()
mock_date_range <- reactiveVal(c("2000-01-01", "2200-12-31"))
mock_date_range <- reactiveVal(c("2019-04-01", "2024-08-01"))

test_that("mod_monthly_count_server reacts to changes in the selected concept", {
testServer(
Expand Down Expand Up @@ -62,6 +62,31 @@ test_that("mod_monthly_count_server reacts to changes in the selected date range
)
})

test_that("Date filtering works as expected", {
testServer(
mod_monthly_count_server,
args = list(data = mock_monthly_counts, selected_concept = mock_concept_row, selected_dates = mock_date_range),
{
# We have data for this concept from 2019-04 to 2020-05
mock_concept_row(list(concept_id = 40213251, concept_name = "test")) # update reactive value

# Test boundary dates, we only care up to the month level
selected_dates <- c("2019-04-01", "2020-05-01")
mock_date_range(selected_dates)
session$flushReact()
expect_equal(nrow(filtered_monthly_counts()), 3)

# This checks a previous bug where a row with date_month larger than the date range months
# would always get removed while it should be kept in case the year is within the range
# e.g. 2019-04 should be kept when the range is 2019-01 to 2020-01
selected_dates2 <- c("2019-01-01", "2020-01-01")
mock_date_range(selected_dates2)
session$flushReact()
expect_equal(nrow(filtered_monthly_counts()), 1)
}
)
})

test_that("mod_monthly_count_server generates an empty plot when no row is selected", {
testServer(
mod_monthly_count_server,
Expand All @@ -73,6 +98,17 @@ test_that("mod_monthly_count_server generates an empty plot when no row is selec
)
})

test_that("mod_monthly_count_server generates an empty plot when no data is available for the selected concept", {
testServer(
mod_monthly_count_server,
args = list(data = mock_monthly_counts, selected_concept = mock_concept_row, selected_dates = mock_date_range),
{
mock_concept_row(list(concept_id = 9999999, concept_name = "idontexist"))
expect_length(output$monthly_count_plot$coordmap$panels[[1]]$mapping, 0)
}
)
})

test_that("module ui works", {
ui <- mod_monthly_count_ui(id = "test")
golem::expect_shinytaglist(ui)
Expand All @@ -99,3 +135,8 @@ test_that("monthly_count_plot correctly parses dates", {
expect_false(is.null(p$mapping))
expect_false(is.null(p$layers))
})

test_that("Date range filtering fails for invalid date range", {
selected_dates <- c("2020-01-01", "2019-01-01")
expect_error(.filter_dates(monthly_counts, selected_dates), "Invalid date range, end date is before start date")
})
Loading

0 comments on commit ff8d57f

Please sign in to comment.