From 76c1defaf2d668570ed9f9a283ced2732047332e Mon Sep 17 00:00:00 2001 From: tbrittoborges Date: Tue, 7 Sep 2021 11:11:10 +0200 Subject: [PATCH] initial commit for script to plot and train the meta predictor --- benchmark_scripts/confmat_calling_nanopore.R | 26 ++ benchmark_scripts/confmat_calling_sirv.R | 27 ++ benchmark_scripts/eulerr_calling_sirv.R | 53 ++++ benchmark_scripts/heatmap_calling_nanopore.R | 66 ++++ benchmark_scripts/heatmap_calling_sirv.R | 74 +++++ benchmark_scripts/load_nanopore_data.R | 31 ++ benchmark_scripts/load_sirv_data.R | 36 +++ benchmark_scripts/pearson_calling_sirv.R | 59 ++++ benchmark_scripts/roc_calling.R | 46 ++- benchmark_scripts/roc_calling_nanopore.R | 49 +++ benchmark_scripts/roc_calling_sirv.R | 48 +++ .../roc_identification_nanopore.R | 49 +++ benchmark_scripts/roc_identification_sirv.R | 47 +++ benchmark_scripts/upset_calling.R | 163 +++++----- benchmark_scripts/upset_calling_sirv.R | 47 +++ meta_score/train_meta_predictor.py | 286 ++++++++++++++++++ 16 files changed, 998 insertions(+), 109 deletions(-) create mode 100644 benchmark_scripts/confmat_calling_nanopore.R create mode 100644 benchmark_scripts/confmat_calling_sirv.R create mode 100644 benchmark_scripts/eulerr_calling_sirv.R create mode 100644 benchmark_scripts/heatmap_calling_nanopore.R create mode 100644 benchmark_scripts/heatmap_calling_sirv.R create mode 100644 benchmark_scripts/load_nanopore_data.R create mode 100644 benchmark_scripts/load_sirv_data.R create mode 100644 benchmark_scripts/pearson_calling_sirv.R create mode 100644 benchmark_scripts/roc_calling_nanopore.R create mode 100644 benchmark_scripts/roc_calling_sirv.R create mode 100644 benchmark_scripts/roc_identification_nanopore.R create mode 100644 benchmark_scripts/roc_identification_sirv.R create mode 100644 benchmark_scripts/upset_calling_sirv.R create mode 100644 meta_score/train_meta_predictor.py diff --git a/benchmark_scripts/confmat_calling_nanopore.R b/benchmark_scripts/confmat_calling_nanopore.R new file mode 100644 index 0000000..d87ca4a --- /dev/null +++ b/benchmark_scripts/confmat_calling_nanopore.R @@ -0,0 +1,26 @@ +source("load_nanopore_data.R") + +cm <- lapply( + seq_along(df), + function(x) { + caret::confusionMatrix( + as.factor(ifelse(df[[x]] > 0.95, 1, 0)), + as.factor(ifelse(df$orthogonal > 0.95, 1, 0)), + positive = "1" + ) + } +) + +names(cm) <- colnames(df) + +lapply( + names(cm), + function(x) { + writeLines( + capture.output( + cm[[x]] + ), + paste0("../nanopore_benchmark/results/confmat_", x, ".tab") + ) + } +) diff --git a/benchmark_scripts/confmat_calling_sirv.R b/benchmark_scripts/confmat_calling_sirv.R new file mode 100644 index 0000000..b841de8 --- /dev/null +++ b/benchmark_scripts/confmat_calling_sirv.R @@ -0,0 +1,27 @@ +source("load_sirv_data.R") + + +cm <- lapply( + seq_along(df), + function(x) { + caret::confusionMatrix( + as.factor(ifelse(df[[x]] > 0.95, 1, 0)), + as.factor(df$orthogonal), + positive = "1" + ) + } +) + +names(cm) <- colnames(df) + +lapply( + names(cm), + function(x) { + writeLines( + capture.output( + cm[[x]] + ), + paste0("../sirv_benchmark/results/confmat_", x, ".tab") + ) + } +) diff --git a/benchmark_scripts/eulerr_calling_sirv.R b/benchmark_scripts/eulerr_calling_sirv.R new file mode 100644 index 0000000..b73ddfd --- /dev/null +++ b/benchmark_scripts/eulerr_calling_sirv.R @@ -0,0 +1,53 @@ +library(eulerr) + +source("~/Baltica/benchmark_scripts/load_sirv_data.R") + +mat <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL + ) %>% + 
as.matrix(df) + +mat[mat > 0.95] <- 1 +mat[mat < 0.95] <- 0 +mat[is.na(mat)] <- 0 +mat <- mat[rowSums(mat) >= 1, ] + +fit <- euler(mat, shape = "ellipse") + +plot( + fit, + fills = color_list$method, + edges = F, + fontsize = 14, + quantities = list(fontsize = 16), + labels = FALSE, + legend = list(nrow = 1, ncol = 5, side = "bottom", fontsize = 16), + main = "Intesection of SJ per method" +) + +library(VennDiagram) + +input_sj <- lapply( + as.data.frame(mat), + function(i) unique(coordinates[i]) +) + +venn.diagram( + x = as.list(mat), + category.names = colnames(mat), + filename = "venn_sirv_calling.png" + # output=TRUE +) + +input <- lapply(as.data.frame(mat), function(i) unique(gene_name[i])) + +input$junctionseq + +venn.diagram( + x = input, + # category.names = colnames(mat), + filename = "venn_sirv_calling.png", + output = TRUE +) diff --git a/benchmark_scripts/heatmap_calling_nanopore.R b/benchmark_scripts/heatmap_calling_nanopore.R new file mode 100644 index 0000000..da12395 --- /dev/null +++ b/benchmark_scripts/heatmap_calling_nanopore.R @@ -0,0 +1,66 @@ +source("load_nanopore_data.R") + +library(ComplexHeatmap) +library(RColorBrewer) + +set.seed(123) + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +methods <- str_extract(colnames(df), str_c(methods, collapse = "|")) +methods <- methods[!is.na(methods)] + +mat <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL + ) %>% + as.matrix(df) + +idx <- data.frame( + is=ifelse(df$orthogonal>0.95, 1, 0) +) + +idx <- idx %>% + mutate(n=row_number()) %>% + group_by(is) %>% + sample_n(500) + +method <- str_extract(colnames(mat), str_c(methods, collapse = "|")) +method_cols <- setNames(brewer.pal(n = length(methods), name = "Set2"), methods) + +mat <- mat[idx$n, ] +ca=HeatmapAnnotation( + method_names=anno_block( + gp = gpar(fill = method_cols[method]), + labels = method, + labels_gp = gpar(col = "white", fontsize = 9)) +) + +lengend_method = Legend( + labels = unique(method), + title = "method", + legend_gp = gpar( + fontsize = 14, + fontface = "bold", + fill = method_cols[unique(method)])) + +col_fun = circlize::colorRamp2(c(0, 0.95, 1), c("blue", "white", "red")) + +hm=Heatmap( + mat, + name='score', + col = col_fun, + top_annotation = ca, + cluster_columns = F, + cluster_rows = F, + column_split = 1:5, + border = TRUE, + column_title = NULL, + show_column_names=F) + +png("~/Baltica/nanopore_benchmark/results/heatmap_calling.pdf", width = 8, height = 4, units = "in", res = 200) +draw(hm, + heatmap_legend_list = list(lengend_method)) +dev.off() + + diff --git a/benchmark_scripts/heatmap_calling_sirv.R b/benchmark_scripts/heatmap_calling_sirv.R new file mode 100644 index 0000000..cec690d --- /dev/null +++ b/benchmark_scripts/heatmap_calling_sirv.R @@ -0,0 +1,74 @@ +source("load_sirv_data.R") + +library(ComplexHeatmap) +library(RColorBrewer) + +set.seed(123) + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +methods <- str_extract(colnames(df), str_c(methods, collapse = "|")) +methods <- methods[!is.na(methods)] + +mat <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL + ) %>% + as.matrix(df) + +is_SIRV <- startsWith(coordinates, 'SIRV') + +idx <- data_frame( + is_SIRV=is_SIRV) + +idx <- idx %>% + mutate(n=row_number()) %>% + group_by(is_SIRV) %>% + sample_n(504) + +method <- str_extract(colnames(mat), str_c(methods, collapse = "|")) +method_cols <- setNames(brewer.pal(n = length(methods), name = "Set2"), methods) + +mat <- 
mat[idx$n, ] +ca=HeatmapAnnotation( + method_names=anno_block( + gp = gpar(fill = method_cols[method]), + labels = method, + labels_gp = gpar(col = "white", fontsize = 9)) +) + +lengend_method = Legend( + labels = unique(method), + title = "method", + legend_gp = gpar( + fontsize = 14, + fontface = "bold", + fill = method_cols[unique(method)])) + +ra = HeatmapAnnotation( + SIRV = is_SIRV[idx$n], + col=list(SIRV=c('TRUE'='black', 'FALSE'='white')), + which = 'row') + +col_fun = circlize::colorRamp2(c(0, 0.95, 1), c("blue", "white", "red")) + +hm=Heatmap( + mat, + name='score', + col = col_fun, + left_annotation = ra, + top_annotation = ca, + cluster_columns = F, + cluster_rows = F, + column_split = 1:5, + row_split = is_SIRV[idx$n], + border = TRUE, + column_title = NULL, + show_column_names=F) + +png("~/Baltica/sirv_benchmark/results/heatmap_calling.pdf", width = 8, height = 4, units = "in", res = 200) +draw(hm, + heatmap_legend_list = list(lengend_method)) +dev.off() + + diff --git a/benchmark_scripts/load_nanopore_data.R b/benchmark_scripts/load_nanopore_data.R new file mode 100644 index 0000000..94e2a08 --- /dev/null +++ b/benchmark_scripts/load_nanopore_data.R @@ -0,0 +1,31 @@ +library(tidyverse) + +df <- readr::read_csv( + "../nanopore_benchmark/results/SJ_annotated.csv", + col_types = cols( + .default = col_double(), + is_novel = col_logical(), + gene_name = col_character(), + transcript_name = col_character(), + class_code = col_character(), + exon_number = col_character(), + coordinates = col_character() + ) +) + +gene_name <- df$gene_name +coordinates <- df$coordinates +is_novel <- df$is_novel + +colnames(df) <- gsub(x=colnames(df), pattern = "_SMG7KO2SMG6KD-vs-control", '') +colnames(df) <- gsub(x=colnames(df), pattern = "_NA", '') + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +df <- df %>% dplyr::select(matches(methods)) + +color_list <- list( + method = setNames( + scales::brewer_pal(palette = "Set2")(length(methods)), + methods + ) +) diff --git a/benchmark_scripts/load_sirv_data.R b/benchmark_scripts/load_sirv_data.R new file mode 100644 index 0000000..f309ab9 --- /dev/null +++ b/benchmark_scripts/load_sirv_data.R @@ -0,0 +1,36 @@ +library(tidyverse) + +df <- readr::read_csv( + "../sirv_benchmark/results/SJ_annotated.csv", + col_types = cols( + .default = col_double(), + is_novel = col_logical(), + gene_name = col_character(), + transcript_name = col_character(), + class_code = col_character(), + exon_number = col_character(), + coordinates = col_character() + ) +) + +df <- df %>% + # pivot to remove the mixes + pivot_longer(matches("-vs-"), + names_to = c(".value", 'comparison'), + names_pattern = "(.+)_(.+)" + ) + +gene_name <- df$gene_name +coordinates <- df$coordinates +is_novel <- df$is_novel + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") + +df <- df %>% dplyr::select(matches(methods)) + +color_list <- list( + method = setNames( + scales::brewer_pal(palette = "Set2")(length(methods)), + methods + ) +) diff --git a/benchmark_scripts/pearson_calling_sirv.R b/benchmark_scripts/pearson_calling_sirv.R new file mode 100644 index 0000000..4b7e950 --- /dev/null +++ b/benchmark_scripts/pearson_calling_sirv.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(reshape2) + +df <- readr::read_csv( + "../sirv_benchmark/results/SJ_annotated.csv", + col_types = cols( + .default = col_double(), + is_novel = col_logical(), + gene_name = col_character(), + transcript_name = col_character(), + class_code = col_character(), + 
exon_number = col_character(), + coordinates = col_character() + ) +) + +df <- df %>% + dplyr::select(matches("-vs-"), "coordinates", "gene_name") + +coordinates <- df$coordinates +gene_name <- df$gene_name +df$coordinates <- NULL +df$gene_name <- NULL +df <- df %>% mutate( + across(everything(), ~ replace_na(.x, 0)) +) + +cormat <- apply(df, 2, function(x){ -log10(x+1e-10) }) +cormat <- round( + cor(cormat, method='pearson', use="pairwise.complete.obs"), + 2) +cormat[lower.tri(cormat)]=NA +cormat=melt(cormat) +cormat=cormat[!is.na(cormat$value),] +ggplot(cormat, aes(Var2, Var1, fill = value))+ + geom_tile(color = "white")+ + scale_fill_gradient2(low = "blue", high = "red", mid = "white", + midpoint = 0, limit = c(-1,1), space = "Lab", + name="Pearson\nCorrelation") + + theme_minimal()+ + theme(axis.text.x = element_text(angle = 45, vjust = 1, + size = 12, hjust = 1))+ + coord_fixed() + + geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) + + theme( + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + panel.grid.major = element_blank(), + panel.border = element_blank(), + panel.background = element_blank(), + axis.ticks = element_blank(), + legend.justification = c(1, 0), + legend.position = c(0.7, 0.7), + legend.direction = "horizontal")+ + guides(fill = guide_colorbar(barwidth = 7, barheight = 1, + title.position = "top", title.hjust = 0.5)) +ggsave('../sirv_benchmark/results/heatmap_pearson.png') diff --git a/benchmark_scripts/roc_calling.R b/benchmark_scripts/roc_calling.R index fbbafcf..3545d1d 100644 --- a/benchmark_scripts/roc_calling.R +++ b/benchmark_scripts/roc_calling.R @@ -1,10 +1,5 @@ library(tidyverse) -# input -# output -# method = c("majiq", "leafcutter", "rmats", "junctionseq"), -# group = c("mix2-vs-mix1", "mix3-vs-mix1", "mix3-vs-mix2") - df <- readr::read_csv( "../sirv_benchmark/results/SJ_annotated.csv", col_types = cols( @@ -16,18 +11,21 @@ df <- readr::read_csv( exon_number = col_character(), coordinates = col_character() ) -) - -df <- df %>% - dplyr::select(matches("-vs-"), "coordinates", "gene_name") +) %>% + dplyr::select(matches("-vs-")) %>% + # NA are not called + mutate( + across(everything(), ~ replace_na(.x, 0)) + ) %>% # calling task + mutate_all( + list(~ if_else(. > 0.05, T, F)) + ) %>% + pivot_longer(everything(), + names_to = c(".value", 'comparison'), + names_pattern = "(.+)_(.+)" + ) -coordinates <- df$coordinates -gene_name <- df$gene_name -df$coordinates <- NULL -df$gene_name <- NULL -df <- df %>% mutate( - across(everything(), ~ replace_na(.x, 0)) -) +df$comparison <- NULL compute_cm <- function(col, ref) { case_when( @@ -39,20 +37,13 @@ compute_cm <- function(col, ref) { ) } -pars <- expand_grid( - method = c("majiq", "leafcutter", "rmats", "junctionseq"), - group = c("mix2-vs-mix1", "mix3-vs-mix1", "mix3-vs-mix2") -) -pars <- pars %>% - mutate(col = str_glue("{method}_{group}")) -pars <- pars %>% - mutate(ref = str_glue("orthogonal_{group}")) -pars$method <- NULL -pars$group <- NULL +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) cm_result <- purrr::pmap(pars, compute_cm) names(cm_result) <- pars$col -table(cm_result$`majiq_mix2-vs-mix1`) cm_result %>% purrr::map(~ table(.)) %>% @@ -63,6 +54,7 @@ cm_result %>% library(ROCR) library(ggplot2) library(cowplot) + compute_prediction <- function(col, ref) { .x <- df[, c(col, ref)] .x <- filter_all(.x, any_vars(. 
!= 0)) diff --git a/benchmark_scripts/roc_calling_nanopore.R b/benchmark_scripts/roc_calling_nanopore.R new file mode 100644 index 0000000..97ac0d6 --- /dev/null +++ b/benchmark_scripts/roc_calling_nanopore.R @@ -0,0 +1,49 @@ +source("load_nanopore_data.R") + +library(ROCR) +library(ggplot2) +library(cowplot) + +# NA are not called +df <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL +) + +compute_prediction <- function(col, ref) { + .x <- df[, c(col, ref)] + .x <- filter_all(.x, any_vars(. != 0)) + prediction(.x[[col]], ifelse(1-.x[[ref]] > .95, 0, 1)) +} + + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(paste0(names(method), " (AUC=", auc[names(method)], ")"), method) +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + labs(title = "SIRV benchmark - calling task") + + theme(legend.position = c(0.60, 0.3)) + + scale_fill_manual(values = color_list$method) +p + +ggsave("../nanopore_benchmark/results/roc_calling.pdf") + diff --git a/benchmark_scripts/roc_calling_sirv.R b/benchmark_scripts/roc_calling_sirv.R new file mode 100644 index 0000000..e9887c6 --- /dev/null +++ b/benchmark_scripts/roc_calling_sirv.R @@ -0,0 +1,48 @@ +source("load_sirv_data.R") + +library(ROCR) +library(ggplot2) +library(cowplot) + +# NA are not called +df <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL +) + +compute_prediction <- function(col, ref) { + .x <- df[, c(col, ref)] + .x <- filter_all(.x, any_vars(. 
!= 0)) + prediction(.x[[col]], 1 -.x[[ref]]) +} + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(paste0(names(method), " (AUC=", auc[names(method)], ")"), method) +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + labs(title = "SIRV benchmark - calling task") + + theme(legend.position = c(0.60, 0.3)) + + scale_fill_manual(values = color_list$method) +p + +ggsave("../sirv_benchmark/results/roc_calling.pdf") + diff --git a/benchmark_scripts/roc_identification_nanopore.R b/benchmark_scripts/roc_identification_nanopore.R new file mode 100644 index 0000000..91079c3 --- /dev/null +++ b/benchmark_scripts/roc_identification_nanopore.R @@ -0,0 +1,49 @@ +source("load_nanopore_data.R") + +library(ROCR) +library(ggplot2) +library(cowplot) + +# NA are not called +df <- df %>% + mutate( + across(everything(), ~ifelse(is.na(.), 0, 1)), + comparison = NULL + ) + +compute_prediction <- function(col, ref) { + .x <- df[, c(col, ref)] + .x <- filter_all(.x, any_vars(. != 0)) + prediction( + ifelse(.x[[col]] > .95, 1, 0), + ifelse(1 - .x[[ref]] > .95, 1, 0)) +} + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(paste0(names(method), " (AUC=", auc[names(method)], ")"), method) +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + labs(title = "SIRV benchmark - identification task") + + theme(legend.position = c(0.60, 0.3)) +p + +ggsave("../nanopore_benchmark/results/roc_identification.pdf") + diff --git a/benchmark_scripts/roc_identification_sirv.R b/benchmark_scripts/roc_identification_sirv.R new file mode 100644 index 0000000..dec2d23 --- /dev/null +++ b/benchmark_scripts/roc_identification_sirv.R @@ -0,0 +1,47 @@ +source("load_sirv_data.R") + +library(ROCR) +library(ggplot2) +library(cowplot) + +# NA are not called +df <- df %>% + mutate( + across(everything(), ~ifelse(is.na(.), 0, 1)), + comparison = NULL + ) + +compute_prediction <- function(col, ref) { + .x <- df[, c(col, ref)] + .x <- filter_all(.x, any_vars(. 
!= 0)) + prediction(.x[[col]], .x[[ref]]) +} + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(paste0(names(method), " (AUC=", auc[names(method)], ")"), method) +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + labs(title = "SIRV benchmark - identification task") + + theme(legend.position = c(0.60, 0.3)) +p + +ggsave("../sirv_benchmark/results/roc_identification.pdf") + diff --git a/benchmark_scripts/upset_calling.R b/benchmark_scripts/upset_calling.R index ed137a2..26631c8 100644 --- a/benchmark_scripts/upset_calling.R +++ b/benchmark_scripts/upset_calling.R @@ -1,82 +1,81 @@ -library(tidyverse) - -df <- readr::read_csv( - "results/SJ_annotated.csv", - col_types = cols( - .default = col_double(), - is_novel = col_logical(), - gene_name = col_character(), - transcript_name = col_character(), - class_code = col_character(), - exon_number = col_character(), - coordinates = col_character() - ) -) -df <- df %>% - dplyr::select(matches("-vs-"), "coordinates", "gene_name") - -coordinates <- df$coordinates -gene_name <- df$gene_name -df$coordinates <- NULL -df$gene_name <- NULL - -## Upset -library(ComplexHeatmap) -library(RColorBrewer) - -comparisons <- c("mix3-vs-mix1", "mix2-vs-mix1", "mix3-vs-mix2") -methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") - -mat <- df %>% - dplyr::select(matches("-vs-")) %>% - as.matrix(.) 
- -mat[mat > 0.95] <- 1 -mat[mat < 0.95] <- 0 -mat[is.na(mat)] <- 0 -mat <- mat[rowSums(mat)>=1,] - -comb_mat <- make_comb_mat(mat) - -group = str_extract(colnames(mat), str_c(comparisons, collapse = "|")) -method = str_extract(colnames(mat), str_c(methods, collapse = "|")) - -comb_mat <- comb_mat[comb_degree(comb_mat) > 1] - -p2 <- UpSet( - comb_mat, - comb_order = order(rev(comb_degree(comb_mat))), - pt_size = unit(2, "mm"), lwd = 1.5, - right_annotation = rowAnnotation( - group = group, - method = method, - col = list( - group = setNames(brewer.pal(n = length(comparisons), name = "Set2"), comparisons) , - method = setNames(brewer.pal(n = length(methods), name = "Set1"), methods) - ) - ) -) - - -p2 <- UpSet( - comb_mat, - comb_order = order(rev(comb_degree(comb_mat))), - pt_size = unit(2, "mm"), lwd = 1.5, - right_annotation = rowAnnotation( - group = group, - method = method, - col = list( - group = setNames(brewer.pal(n = length(comparisons), name = "Set2"), comparisons) , - method = setNames(brewer.pal(n = length(methods), name = "Set1"), methods) - ) - ) -) -pdf( - "upset_calling.pdf", - width = 10, - height = 5, - useDingbats = FALSE - ) -p2 -dev.off() - +# library(tidyverse) +# +# df <- readr::read_csv( +# "results/SJ_annotated.csv", +# col_types = cols( +# .default = col_double(), +# is_novel = col_logical(), +# gene_name = col_character(), +# transcript_name = col_character(), +# class_code = col_character(), +# exon_number = col_character(), +# coordinates = col_character() +# ) +# ) +# df <- df %>% +# dplyr::select(matches("-vs-"), "coordinates", "gene_name") +# +# coordinates <- df$coordinates +# gene_name <- df$gene_name +# df$coordinates <- NULL +# df$gene_name <- NULL +# +# ## Upset +# library(ComplexHeatmap) +# library(RColorBrewer) +# +# comparisons <- c("mix3-vs-mix1", "mix2-vs-mix1", "mix3-vs-mix2") +# methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +# +# mat <- df %>% +# dplyr::select(matches("-vs-")) %>% +# as.matrix(.) 
+# +# mat[mat > 0.95] <- 1 +# mat[mat < 0.95] <- 0 +# mat[is.na(mat)] <- 0 +# mat <- mat[rowSums(mat) >= 1, ] +# +# comb_mat <- make_comb_mat(mat) +# +# group <- str_extract(colnames(mat), str_c(comparisons, collapse = "|")) +# method <- str_extract(colnames(mat), str_c(methods, collapse = "|")) +# +# comb_mat <- comb_mat[comb_degree(comb_mat) > 1] +# +# p2 <- UpSet( +# comb_mat, +# comb_order = order(rev(comb_degree(comb_mat))), +# pt_size = unit(2, "mm"), lwd = 1.5, +# right_annotation = rowAnnotation( +# group = group, +# method = method, +# col = list( +# group = setNames(brewer.pal(n = length(comparisons), name = "Set2"), comparisons), +# method = setNames(brewer.pal(n = length(methods), name = "Set1"), methods) +# ) +# ) +# ) +# +# +# p2 <- UpSet( +# comb_mat, +# comb_order = order(rev(comb_degree(comb_mat))), +# pt_size = unit(2, "mm"), lwd = 1.5, +# right_annotation = rowAnnotation( +# group = group, +# method = method, +# col = list( +# group = setNames(brewer.pal(n = length(comparisons), name = "Set2"), comparisons), +# method = setNames(brewer.pal(n = length(methods), name = "Set1"), methods) +# ) +# ) +# ) +# pdf( +# "upset_calling.pdf", +# width = 10, +# height = 5, +# useDingbats = FALSE +# ) +# p2 +# dev.off() diff --git a/benchmark_scripts/upset_calling_sirv.R b/benchmark_scripts/upset_calling_sirv.R new file mode 100644 index 0000000..5fd74a5 --- /dev/null +++ b/benchmark_scripts/upset_calling_sirv.R @@ -0,0 +1,47 @@ +source("load_sirv_data.R") + +library(ComplexHeatmap) +library(RColorBrewer) + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +methods <- str_extract(colnames(df), str_c(methods, collapse = "|")) +methods <- methods[!is.na(methods)] + +mat <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL + ) %>% + as.matrix(df) + +mat[mat > 0.95] <- 1 +mat[mat < 0.95] <- 0 +mat[is.na(mat)] <- 0 +mat <- mat[rowSums(mat) >= 1, ] +comb_mat <- make_comb_mat(mat) +method <- str_extract(colnames(mat), str_c(methods, collapse = "|")) +color_list <- list( + method = setNames( + scales::brewer_pal(palette = "Set2")(length(methods)), + methods + ) +) +comb_mat <- comb_mat[comb_degree(comb_mat) > 1] + +UpSet( + comb_mat, + comb_order = order(rev(comb_degree(comb_mat))), + top_annotation=upset_top_annotation(comb_mat, add_numbers = T), + right_annotation = upset_right_annotation( + comb_mat, add_numbers = T), + lwd = 0.8, + pt_size = unit(2, "mm")) + +pdf( + "../sirv_benchmark/results/upset_calling_sirv.pdf", + width = 6, + height = 4, + useDingbats = FALSE +) +draw(us) +dev.off() diff --git a/meta_score/train_meta_predictor.py b/meta_score/train_meta_predictor.py new file mode 100644 index 0000000..ba47805 --- /dev/null +++ b/meta_score/train_meta_predictor.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.model_selection import GridSearchCV +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.linear_model import LogisticRegression + +from mlxtend.feature_selection import ColumnSelector + +from joblib import dump + + +# ## Parameters + +# In[ ]: + + +INPUT='/home/tbrittoborges/Baltica/nanopore_benchmark/results/SJ_annotated.csv' +TEST_SIZE=0.2 +RANDOM=1982 +NANOPORE_CUTOFF=0.95 + +np.random.seed(RANDOM) +# ## Setup + +# In[ ]: + + + +# In[ ]: + + +data = pd.read_csv(INPUT) + + +# In[ ]: + + +data.head() + + +# In[ ]: + + 
+data.columns = data.columns.str.lower() + + +# In[ ]: + + +data = data.drop(columns=['is_novel', 'gene_name', 'transcript_name', 'class_code', 'exon_number', 'coordinates']) + + +# In[ ]: + + +data = data.fillna(0) + + +# In[ ]: + + +data.head() + + +# In[ ]: + + +y = data['orthogonal_na'] +y = np.where(y > 0.95, 1, 0) # classification problem +x = data.drop(columns='orthogonal_na') + + +# ## Data split and preprocess + +# In[ ]: + + +X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=TEST_SIZE, random_state=RANDOM) + + +# In[ ]: + + +X_train.info() + + +# In[ ]: + + +np.unique(y_train, return_counts=True)[1] + + +# In[ ]: + + +np.unique(y_test, return_counts=True) + + +# ## Pipeline contruction + +# ColumnSelector API http://rasbt.github.io/mlxtend/user_guide/feature_selection/ColumnSelector/#api + +# In[ ]: + + +import itertools + + +# In[ ]: + + +col_selector = list(itertools.chain.from_iterable([itertools.combinations(range(4), i + 1) for i in range(4)])) + + +# In[ ]: + + +col_selector + + +# ## GBC + +# In[ ]: + + +pipe = Pipeline( + [('select', ColumnSelector()), + ('xboost', GradientBoostingClassifier(random_state=RANDOM))]) + + +# In[ ]: + + +param_grid = { + "select__cols": col_selector, + "xboost__learning_rate": [1, 0.5, 0.25, 0.1, 0.05, 0.01], + "xboost__max_depth": [1, 2, 4, 8], + "xboost__subsample": [0.5, 0.8, 1.0], + "xboost__min_samples_split": [0.2, 0.4, 0.8], + "xboost__n_estimators": [1, 2, 4, 8, 16, 32, 64, 100, 200]} +print('Starting GCB grid search') +grid = GridSearchCV(pipe, param_grid, cv=10, n_jobs=-1, scoring='roc_auc') + + +# In[ ]: + + +grid.fit(X_train, y_train) +print('Best parameters:', grid.best_params_) +print('Best performance:', grid.best_score_) + + +# In[ ]: + + +df_grid_results_1 = pd.DataFrame(grid.cv_results_) + + +# In[ ]: + + +df_grid_results_1 = df_grid_results_1.sort_values('rank_test_score') + + +# In[ ]: + + +df_grid_results_1 + + +# In[ ]: + +print('Starting GCB grid search') +df_grid_results_1.to_csv('sklearn_result.csv') + + +# In[ ]: + + +df_grid_results_1.shape + + +# In[ ]: + + +df_grid_results_1 + + +# In[ ]: + + +df_grid_results_1.loc[ + :,['param_select__cols', 'mean_test_score', 'rank_test_score']] + + +# In[ ]: + + +df_grid_results_1.loc[ + :,['param_select__cols']].value_counts() + + +# In[ ]: + + +df_grid_results_1.loc[:, "param_select__cols"] = df_grid_results_1.loc[:, "param_select__cols"].astype(str) + + +# In[ ]: + + +df_grid_results_1.query("param_select__cols != '(0, 1, 2, 3)'").loc[ + :, ["param_select__cols", "mean_test_score", "rank_test_score"]] + + +# In[ ]: + + +dump(grid.best_estimator_, 'gcb_meta_best_estimator.joblib') +dump(grid, 'gcb_grid_object.joblib') + + +# ## Logistic regression + +# In[ ]: + + +pipe2 = Pipeline( + [('select', ColumnSelector()), + ('log_reg', LogisticRegression(random_state=RANDOM))]) + + +# In[ ]: +print("Starting LR grid search") + +param_grid2= { + "select__cols": col_selector, + "log_reg__C": np.logspace(-3, 4, 5), + "log_reg__penalty":["l1", "l2"] } + + +# In[ ]: + + +grid2 = GridSearchCV(pipe2, param_grid2, cv=10, n_jobs=-1, scoring='roc_auc') + + +# In[ ]: + +print("Saving LR grid search") +grid2.fit(X_train, y_train) +print('Best parameters:', grid2.best_params_) +print('Best performance:', grid2.best_score_) + + +# In[ ]: + + +df_grid2_results = pd.DataFrame(grid2.cv_results_) + + +# In[ ]: + + +df_grid2_results = df_grid2_results.sort_values('rank_test_score') + + +# In[ ]: + + +df_grid2_results.groupby('param_select__cols').first() + + +# In[ ]: + + 
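# Hedged sketch (not in the original script): persist the logistic-regression
# search the same way the GBC branch does above. The three output file names
# are assumptions, chosen to mirror 'sklearn_result.csv',
# 'gcb_meta_best_estimator.joblib' and 'gcb_grid_object.joblib'.
print("Saving LR grid search results")
df_grid2_results.to_csv("sklearn_result_logreg.csv")
dump(grid2.best_estimator_, "logreg_meta_best_estimator.joblib")
dump(grid2, "logreg_grid_object.joblib")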
+ +
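A minimal downstream-usage sketch for the estimator dumped by meta_score/train_meta_predictor.py. It repeats the script's preprocessing (lower-case column names, drop the annotation columns, fill NA with 0) and writes a per-junction meta-score. The input path, the output file name, the 'meta_call' column, and the assumption that the method-score columns appear in the same order as at fit time are illustrative, not taken from the patch.

import pandas as pd
from joblib import load

# Pipeline saved by train_meta_predictor.py:
# ColumnSelector + GradientBoostingClassifier chosen by the grid search.
model = load("gcb_meta_best_estimator.joblib")

# Assumed input: a new SJ_annotated.csv with the same method-score columns
# used at fit time, in the same order (ColumnSelector selects by position).
sj = pd.read_csv("SJ_annotated.csv")
sj.columns = sj.columns.str.lower()
features = sj.drop(
    columns=["is_novel", "gene_name", "transcript_name", "class_code",
             "exon_number", "coordinates", "orthogonal_na"],
    errors="ignore",
).fillna(0)

# predict_proba(...)[:, 1] is the meta-score: the estimated probability that
# the junction is supported by the orthogonal (nanopore) evidence.
sj["meta_score"] = model.predict_proba(features)[:, 1]
sj["meta_call"] = (sj["meta_score"] > 0.95).astype(int)  # reuses the script's 0.95 cutoff
sj.to_csv("SJ_annotated_with_meta_score.csv", index=False)  # assumed output name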