From 76c1defaf2d668570ed9f9a283ced2732047332e Mon Sep 17 00:00:00 2001 From: tbrittoborges Date: Tue, 7 Sep 2021 11:11:10 +0200 Subject: [PATCH] initial commit for script to plot and train the meta predictor --- benchmark_scripts/confmat_calling_nanopore.R | 26 ++ benchmark_scripts/confmat_calling_sirv.R | 27 ++ benchmark_scripts/eulerr_calling_sirv.R | 53 ++++ benchmark_scripts/heatmap_calling_nanopore.R | 66 ++++ benchmark_scripts/heatmap_calling_sirv.R | 74 +++++ benchmark_scripts/load_nanopore_data.R | 31 ++ benchmark_scripts/load_sirv_data.R | 36 +++ benchmark_scripts/pearson_calling_sirv.R | 59 ++++ benchmark_scripts/roc_calling.R | 46 ++- benchmark_scripts/roc_calling_nanopore.R | 49 +++ benchmark_scripts/roc_calling_sirv.R | 48 +++ .../roc_identification_nanopore.R | 49 +++ benchmark_scripts/roc_identification_sirv.R | 47 +++ benchmark_scripts/upset_calling.R | 163 +++++----- benchmark_scripts/upset_calling_sirv.R | 47 +++ meta_score/train_meta_predictor.py | 286 ++++++++++++++++++ 16 files changed, 998 insertions(+), 109 deletions(-) create mode 100644 benchmark_scripts/confmat_calling_nanopore.R create mode 100644 benchmark_scripts/confmat_calling_sirv.R create mode 100644 benchmark_scripts/eulerr_calling_sirv.R create mode 100644 benchmark_scripts/heatmap_calling_nanopore.R create mode 100644 benchmark_scripts/heatmap_calling_sirv.R create mode 100644 benchmark_scripts/load_nanopore_data.R create mode 100644 benchmark_scripts/load_sirv_data.R create mode 100644 benchmark_scripts/pearson_calling_sirv.R create mode 100644 benchmark_scripts/roc_calling_nanopore.R create mode 100644 benchmark_scripts/roc_calling_sirv.R create mode 100644 benchmark_scripts/roc_identification_nanopore.R create mode 100644 benchmark_scripts/roc_identification_sirv.R create mode 100644 benchmark_scripts/upset_calling_sirv.R create mode 100644 meta_score/train_meta_predictor.py diff --git a/benchmark_scripts/confmat_calling_nanopore.R b/benchmark_scripts/confmat_calling_nanopore.R new file mode 100644 index 0000000..d87ca4a --- /dev/null +++ b/benchmark_scripts/confmat_calling_nanopore.R @@ -0,0 +1,26 @@ +source("load_nanopore_data.R") + +cm <- lapply( + seq_along(df), + function(x) { + caret::confusionMatrix( + as.factor(ifelse(df[[x]] > 0.95, 1, 0)), + as.factor(ifelse(df$orthogonal > 0.95, 1, 0)), + positive = "1" + ) + } +) + +names(cm) <- colnames(df) + +lapply( + names(cm), + function(x) { + writeLines( + capture.output( + cm[[x]] + ), + paste0("../nanopore_benchmark/results/confmat_", x, ".tab") + ) + } +) diff --git a/benchmark_scripts/confmat_calling_sirv.R b/benchmark_scripts/confmat_calling_sirv.R new file mode 100644 index 0000000..b841de8 --- /dev/null +++ b/benchmark_scripts/confmat_calling_sirv.R @@ -0,0 +1,27 @@ +source("load_sirv_data.R") + + +cm <- lapply( + seq_along(df), + function(x) { + caret::confusionMatrix( + as.factor(ifelse(df[[x]] > 0.95, 1, 0)), + as.factor(df$orthogonal), + positive = "1" + ) + } +) + +names(cm) <- colnames(df) + +lapply( + names(cm), + function(x) { + writeLines( + capture.output( + cm[[x]] + ), + paste0("../sirv_benchmark/results/confmat_", x, ".tab") + ) + } +) diff --git a/benchmark_scripts/eulerr_calling_sirv.R b/benchmark_scripts/eulerr_calling_sirv.R new file mode 100644 index 0000000..b73ddfd --- /dev/null +++ b/benchmark_scripts/eulerr_calling_sirv.R @@ -0,0 +1,53 @@ +library(eulerr) + +source("~/Baltica/benchmark_scripts/load_sirv_data.R") + +mat <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL + ) %>% + 
as.matrix(df) + +mat[mat > 0.95] <- 1 +mat[mat < 0.95] <- 0 +mat[is.na(mat)] <- 0 +mat <- mat[rowSums(mat) >= 1, ] + +fit <- euler(mat, shape = "ellipse") + +plot( + fit, + fills = color_list$method, + edges = F, + fontsize = 14, + quantities = list(fontsize = 16), + labels = FALSE, + legend = list(nrow = 1, ncol = 5, side = "bottom", fontsize = 16), + main = "Intesection of SJ per method" +) + +library(VennDiagram) + +input_sj <- lapply( + as.data.frame(mat), + function(i) unique(coordinates[i]) +) + +venn.diagram( + x = as.list(mat), + category.names = colnames(mat), + filename = "venn_sirv_calling.png" + # output=TRUE +) + +input <- lapply(as.data.frame(mat), function(i) unique(gene_name[i])) + +input$junctionseq + +venn.diagram( + x = input, + # category.names = colnames(mat), + filename = "venn_sirv_calling.png", + output = TRUE +) diff --git a/benchmark_scripts/heatmap_calling_nanopore.R b/benchmark_scripts/heatmap_calling_nanopore.R new file mode 100644 index 0000000..da12395 --- /dev/null +++ b/benchmark_scripts/heatmap_calling_nanopore.R @@ -0,0 +1,66 @@ +source("load_nanopore_data.R") + +library(ComplexHeatmap) +library(RColorBrewer) + +set.seed(123) + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +methods <- str_extract(colnames(df), str_c(methods, collapse = "|")) +methods <- methods[!is.na(methods)] + +mat <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL + ) %>% + as.matrix(df) + +idx <- data.frame( + is=ifelse(df$orthogonal>0.95, 1, 0) +) + +idx <- idx %>% + mutate(n=row_number()) %>% + group_by(is) %>% + sample_n(500) + +method <- str_extract(colnames(mat), str_c(methods, collapse = "|")) +method_cols <- setNames(brewer.pal(n = length(methods), name = "Set2"), methods) + +mat <- mat[idx$n, ] +ca=HeatmapAnnotation( + method_names=anno_block( + gp = gpar(fill = method_cols[method]), + labels = method, + labels_gp = gpar(col = "white", fontsize = 9)) +) + +lengend_method = Legend( + labels = unique(method), + title = "method", + legend_gp = gpar( + fontsize = 14, + fontface = "bold", + fill = method_cols[unique(method)])) + +col_fun = circlize::colorRamp2(c(0, 0.95, 1), c("blue", "white", "red")) + +hm=Heatmap( + mat, + name='score', + col = col_fun, + top_annotation = ca, + cluster_columns = F, + cluster_rows = F, + column_split = 1:5, + border = TRUE, + column_title = NULL, + show_column_names=F) + +png("~/Baltica/nanopore_benchmark/results/heatmap_calling.pdf", width = 8, height = 4, units = "in", res = 200) +draw(hm, + heatmap_legend_list = list(lengend_method)) +dev.off() + + diff --git a/benchmark_scripts/heatmap_calling_sirv.R b/benchmark_scripts/heatmap_calling_sirv.R new file mode 100644 index 0000000..cec690d --- /dev/null +++ b/benchmark_scripts/heatmap_calling_sirv.R @@ -0,0 +1,74 @@ +source("load_sirv_data.R") + +library(ComplexHeatmap) +library(RColorBrewer) + +set.seed(123) + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +methods <- str_extract(colnames(df), str_c(methods, collapse = "|")) +methods <- methods[!is.na(methods)] + +mat <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL + ) %>% + as.matrix(df) + +is_SIRV <- startsWith(coordinates, 'SIRV') + +idx <- data_frame( + is_SIRV=is_SIRV) + +idx <- idx %>% + mutate(n=row_number()) %>% + group_by(is_SIRV) %>% + sample_n(504) + +method <- str_extract(colnames(mat), str_c(methods, collapse = "|")) +method_cols <- setNames(brewer.pal(n = length(methods), name = "Set2"), methods) + +mat <- 
mat[idx$n, ] +ca=HeatmapAnnotation( + method_names=anno_block( + gp = gpar(fill = method_cols[method]), + labels = method, + labels_gp = gpar(col = "white", fontsize = 9)) +) + +lengend_method = Legend( + labels = unique(method), + title = "method", + legend_gp = gpar( + fontsize = 14, + fontface = "bold", + fill = method_cols[unique(method)])) + +ra = HeatmapAnnotation( + SIRV = is_SIRV[idx$n], + col=list(SIRV=c('TRUE'='black', 'FALSE'='white')), + which = 'row') + +col_fun = circlize::colorRamp2(c(0, 0.95, 1), c("blue", "white", "red")) + +hm=Heatmap( + mat, + name='score', + col = col_fun, + left_annotation = ra, + top_annotation = ca, + cluster_columns = F, + cluster_rows = F, + column_split = 1:5, + row_split = is_SIRV[idx$n], + border = TRUE, + column_title = NULL, + show_column_names=F) + +png("~/Baltica/sirv_benchmark/results/heatmap_calling.pdf", width = 8, height = 4, units = "in", res = 200) +draw(hm, + heatmap_legend_list = list(lengend_method)) +dev.off() + + diff --git a/benchmark_scripts/load_nanopore_data.R b/benchmark_scripts/load_nanopore_data.R new file mode 100644 index 0000000..94e2a08 --- /dev/null +++ b/benchmark_scripts/load_nanopore_data.R @@ -0,0 +1,31 @@ +library(tidyverse) + +df <- readr::read_csv( + "../nanopore_benchmark/results/SJ_annotated.csv", + col_types = cols( + .default = col_double(), + is_novel = col_logical(), + gene_name = col_character(), + transcript_name = col_character(), + class_code = col_character(), + exon_number = col_character(), + coordinates = col_character() + ) +) + +gene_name <- df$gene_name +coordinates <- df$coordinates +is_novel <- df$is_novel + +colnames(df) <- gsub(x=colnames(df), pattern = "_SMG7KO2SMG6KD-vs-control", '') +colnames(df) <- gsub(x=colnames(df), pattern = "_NA", '') + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +df <- df %>% dplyr::select(matches(methods)) + +color_list <- list( + method = setNames( + scales::brewer_pal(palette = "Set2")(length(methods)), + methods + ) +) diff --git a/benchmark_scripts/load_sirv_data.R b/benchmark_scripts/load_sirv_data.R new file mode 100644 index 0000000..f309ab9 --- /dev/null +++ b/benchmark_scripts/load_sirv_data.R @@ -0,0 +1,36 @@ +library(tidyverse) + +df <- readr::read_csv( + "../sirv_benchmark/results/SJ_annotated.csv", + col_types = cols( + .default = col_double(), + is_novel = col_logical(), + gene_name = col_character(), + transcript_name = col_character(), + class_code = col_character(), + exon_number = col_character(), + coordinates = col_character() + ) +) + +df <- df %>% + # pivot to remove the mixes + pivot_longer(matches("-vs-"), + names_to = c(".value", 'comparison'), + names_pattern = "(.+)_(.+)" + ) + +gene_name <- df$gene_name +coordinates <- df$coordinates +is_novel <- df$is_novel + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") + +df <- df %>% dplyr::select(matches(methods)) + +color_list <- list( + method = setNames( + scales::brewer_pal(palette = "Set2")(length(methods)), + methods + ) +) diff --git a/benchmark_scripts/pearson_calling_sirv.R b/benchmark_scripts/pearson_calling_sirv.R new file mode 100644 index 0000000..4b7e950 --- /dev/null +++ b/benchmark_scripts/pearson_calling_sirv.R @@ -0,0 +1,59 @@ +library(tidyverse) +library(reshape2) + +df <- readr::read_csv( + "../sirv_benchmark/results/SJ_annotated.csv", + col_types = cols( + .default = col_double(), + is_novel = col_logical(), + gene_name = col_character(), + transcript_name = col_character(), + class_code = col_character(), + 
exon_number = col_character(), + coordinates = col_character() + ) +) + +df <- df %>% + dplyr::select(matches("-vs-"), "coordinates", "gene_name") + +coordinates <- df$coordinates +gene_name <- df$gene_name +df$coordinates <- NULL +df$gene_name <- NULL +df <- df %>% mutate( + across(everything(), ~ replace_na(.x, 0)) +) + +cormat <- apply(df, 2, function(x){ -log10(x+1e-10) }) +cormat <- round( + cor(cormat, method='pearson', use="pairwise.complete.obs"), + 2) +cormat[lower.tri(cormat)]=NA +cormat=melt(cormat) +cormat=cormat[!is.na(cormat$value),] +ggplot(cormat, aes(Var2, Var1, fill = value))+ + geom_tile(color = "white")+ + scale_fill_gradient2(low = "blue", high = "red", mid = "white", + midpoint = 0, limit = c(-1,1), space = "Lab", + name="Pearson\nCorrelation") + + theme_minimal()+ + theme(axis.text.x = element_text(angle = 45, vjust = 1, + size = 12, hjust = 1))+ + coord_fixed() + + geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) + + theme( + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + panel.grid.major = element_blank(), + panel.border = element_blank(), + panel.background = element_blank(), + axis.ticks = element_blank(), + legend.justification = c(1, 0), + legend.position = c(0.7, 0.7), + legend.direction = "horizontal")+ + guides(fill = guide_colorbar(barwidth = 7, barheight = 1, + title.position = "top", title.hjust = 0.5)) +ggsave('../sirv_benchmark/results/heatmap_pearson.png') diff --git a/benchmark_scripts/roc_calling.R b/benchmark_scripts/roc_calling.R index fbbafcf..3545d1d 100644 --- a/benchmark_scripts/roc_calling.R +++ b/benchmark_scripts/roc_calling.R @@ -1,10 +1,5 @@ library(tidyverse) -# input -# output -# method = c("majiq", "leafcutter", "rmats", "junctionseq"), -# group = c("mix2-vs-mix1", "mix3-vs-mix1", "mix3-vs-mix2") - df <- readr::read_csv( "../sirv_benchmark/results/SJ_annotated.csv", col_types = cols( @@ -16,18 +11,21 @@ df <- readr::read_csv( exon_number = col_character(), coordinates = col_character() ) -) - -df <- df %>% - dplyr::select(matches("-vs-"), "coordinates", "gene_name") +) %>% + dplyr::select(matches("-vs-")) %>% + # NA are not called + mutate( + across(everything(), ~ replace_na(.x, 0)) + ) %>% # calling task + mutate_all( + list(~ if_else(. > 0.05, T, F)) + ) %>% + pivot_longer(everything(), + names_to = c(".value", 'comparison'), + names_pattern = "(.+)_(.+)" + ) -coordinates <- df$coordinates -gene_name <- df$gene_name -df$coordinates <- NULL -df$gene_name <- NULL -df <- df %>% mutate( - across(everything(), ~ replace_na(.x, 0)) -) +df$comparison <- NULL compute_cm <- function(col, ref) { case_when( @@ -39,20 +37,13 @@ compute_cm <- function(col, ref) { ) } -pars <- expand_grid( - method = c("majiq", "leafcutter", "rmats", "junctionseq"), - group = c("mix2-vs-mix1", "mix3-vs-mix1", "mix3-vs-mix2") -) -pars <- pars %>% - mutate(col = str_glue("{method}_{group}")) -pars <- pars %>% - mutate(ref = str_glue("orthogonal_{group}")) -pars$method <- NULL -pars$group <- NULL +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) cm_result <- purrr::pmap(pars, compute_cm) names(cm_result) <- pars$col -table(cm_result$`majiq_mix2-vs-mix1`) cm_result %>% purrr::map(~ table(.)) %>% @@ -63,6 +54,7 @@ cm_result %>% library(ROCR) library(ggplot2) library(cowplot) + compute_prediction <- function(col, ref) { .x <- df[, c(col, ref)] .x <- filter_all(.x, any_vars(. 
!= 0)) diff --git a/benchmark_scripts/roc_calling_nanopore.R b/benchmark_scripts/roc_calling_nanopore.R new file mode 100644 index 0000000..97ac0d6 --- /dev/null +++ b/benchmark_scripts/roc_calling_nanopore.R @@ -0,0 +1,49 @@ +source("load_nanopore_data.R") + +library(ROCR) +library(ggplot2) +library(cowplot) + +# NA are not called +df <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL +) + +compute_prediction <- function(col, ref) { + .x <- df[, c(col, ref)] + .x <- filter_all(.x, any_vars(. != 0)) + prediction(.x[[col]], ifelse(1-.x[[ref]] > .95, 0, 1)) +} + + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(paste0(names(method), " (AUC=", auc[names(method)], ")"), method) +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + labs(title = "SIRV benchmark - calling task") + + theme(legend.position = c(0.60, 0.3)) + + scale_fill_manual(values = color_list$method) +p + +ggsave("../nanopore_benchmark/results/roc_calling.pdf") + diff --git a/benchmark_scripts/roc_calling_sirv.R b/benchmark_scripts/roc_calling_sirv.R new file mode 100644 index 0000000..e9887c6 --- /dev/null +++ b/benchmark_scripts/roc_calling_sirv.R @@ -0,0 +1,48 @@ +source("load_sirv_data.R") + +library(ROCR) +library(ggplot2) +library(cowplot) + +# NA are not called +df <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL +) + +compute_prediction <- function(col, ref) { + .x <- df[, c(col, ref)] + .x <- filter_all(.x, any_vars(. 
!= 0)) + prediction(.x[[col]], 1 -.x[[ref]]) +} + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(paste0(names(method), " (AUC=", auc[names(method)], ")"), method) +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + labs(title = "SIRV benchmark - calling task") + + theme(legend.position = c(0.60, 0.3)) + + scale_fill_manual(values = color_list$method) +p + +ggsave("../sirv_benchmark/results/roc_calling.pdf") + diff --git a/benchmark_scripts/roc_identification_nanopore.R b/benchmark_scripts/roc_identification_nanopore.R new file mode 100644 index 0000000..91079c3 --- /dev/null +++ b/benchmark_scripts/roc_identification_nanopore.R @@ -0,0 +1,49 @@ +source("load_nanopore_data.R") + +library(ROCR) +library(ggplot2) +library(cowplot) + +# NA are not called +df <- df %>% + mutate( + across(everything(), ~ifelse(is.na(.), 0, 1)), + comparison = NULL + ) + +compute_prediction <- function(col, ref) { + .x <- df[, c(col, ref)] + .x <- filter_all(.x, any_vars(. != 0)) + prediction( + ifelse(.x[[col]] > .95, 1, 0), + ifelse(1 - .x[[ref]] > .95, 1, 0)) +} + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(paste0(names(method), " (AUC=", auc[names(method)], ")"), method) +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + labs(title = "SIRV benchmark - identification task") + + theme(legend.position = c(0.60, 0.3)) +p + +ggsave("../nanopore_benchmark/results/roc_identification.pdf") + diff --git a/benchmark_scripts/roc_identification_sirv.R b/benchmark_scripts/roc_identification_sirv.R new file mode 100644 index 0000000..dec2d23 --- /dev/null +++ b/benchmark_scripts/roc_identification_sirv.R @@ -0,0 +1,47 @@ +source("load_sirv_data.R") + +library(ROCR) +library(ggplot2) +library(cowplot) + +# NA are not called +df <- df %>% + mutate( + across(everything(), ~ifelse(is.na(.), 0, 1)), + comparison = NULL + ) + +compute_prediction <- function(col, ref) { + .x <- df[, c(col, ref)] + .x <- filter_all(.x, any_vars(. 
!= 0)) + prediction(.x[[col]], .x[[ref]]) +} + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(paste0(names(method), " (AUC=", auc[names(method)], ")"), method) +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + labs(title = "SIRV benchmark - identification task") + + theme(legend.position = c(0.60, 0.3)) +p + +ggsave("../sirv_benchmark/results/roc_identification.pdf") + diff --git a/benchmark_scripts/upset_calling.R b/benchmark_scripts/upset_calling.R index ed137a2..26631c8 100644 --- a/benchmark_scripts/upset_calling.R +++ b/benchmark_scripts/upset_calling.R @@ -1,82 +1,81 @@ -library(tidyverse) - -df <- readr::read_csv( - "results/SJ_annotated.csv", - col_types = cols( - .default = col_double(), - is_novel = col_logical(), - gene_name = col_character(), - transcript_name = col_character(), - class_code = col_character(), - exon_number = col_character(), - coordinates = col_character() - ) -) -df <- df %>% - dplyr::select(matches("-vs-"), "coordinates", "gene_name") - -coordinates <- df$coordinates -gene_name <- df$gene_name -df$coordinates <- NULL -df$gene_name <- NULL - -## Upset -library(ComplexHeatmap) -library(RColorBrewer) - -comparisons <- c("mix3-vs-mix1", "mix2-vs-mix1", "mix3-vs-mix2") -methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") - -mat <- df %>% - dplyr::select(matches("-vs-")) %>% - as.matrix(.) 
- -mat[mat > 0.95] <- 1 -mat[mat < 0.95] <- 0 -mat[is.na(mat)] <- 0 -mat <- mat[rowSums(mat)>=1,] - -comb_mat <- make_comb_mat(mat) - -group = str_extract(colnames(mat), str_c(comparisons, collapse = "|")) -method = str_extract(colnames(mat), str_c(methods, collapse = "|")) - -comb_mat <- comb_mat[comb_degree(comb_mat) > 1] - -p2 <- UpSet( - comb_mat, - comb_order = order(rev(comb_degree(comb_mat))), - pt_size = unit(2, "mm"), lwd = 1.5, - right_annotation = rowAnnotation( - group = group, - method = method, - col = list( - group = setNames(brewer.pal(n = length(comparisons), name = "Set2"), comparisons) , - method = setNames(brewer.pal(n = length(methods), name = "Set1"), methods) - ) - ) -) - - -p2 <- UpSet( - comb_mat, - comb_order = order(rev(comb_degree(comb_mat))), - pt_size = unit(2, "mm"), lwd = 1.5, - right_annotation = rowAnnotation( - group = group, - method = method, - col = list( - group = setNames(brewer.pal(n = length(comparisons), name = "Set2"), comparisons) , - method = setNames(brewer.pal(n = length(methods), name = "Set1"), methods) - ) - ) -) -pdf( - "upset_calling.pdf", - width = 10, - height = 5, - useDingbats = FALSE - ) -p2 -dev.off() - +# library(tidyverse) +# +# df <- readr::read_csv( +# "results/SJ_annotated.csv", +# col_types = cols( +# .default = col_double(), +# is_novel = col_logical(), +# gene_name = col_character(), +# transcript_name = col_character(), +# class_code = col_character(), +# exon_number = col_character(), +# coordinates = col_character() +# ) +# ) +# df <- df %>% +# dplyr::select(matches("-vs-"), "coordinates", "gene_name") +# +# coordinates <- df$coordinates +# gene_name <- df$gene_name +# df$coordinates <- NULL +# df$gene_name <- NULL +# +# ## Upset +# library(ComplexHeatmap) +# library(RColorBrewer) +# +# comparisons <- c("mix3-vs-mix1", "mix2-vs-mix1", "mix3-vs-mix2") +# methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +# +# mat <- df %>% +# dplyr::select(matches("-vs-")) %>% +# as.matrix(.) 
+# +# mat[mat > 0.95] <- 1 +# mat[mat < 0.95] <- 0 +# mat[is.na(mat)] <- 0 +# mat <- mat[rowSums(mat) >= 1, ] +# +# comb_mat <- make_comb_mat(mat) +# +# group <- str_extract(colnames(mat), str_c(comparisons, collapse = "|")) +# method <- str_extract(colnames(mat), str_c(methods, collapse = "|")) +# +# comb_mat <- comb_mat[comb_degree(comb_mat) > 1] +# +# p2 <- UpSet( +# comb_mat, +# comb_order = order(rev(comb_degree(comb_mat))), +# pt_size = unit(2, "mm"), lwd = 1.5, +# right_annotation = rowAnnotation( +# group = group, +# method = method, +# col = list( +# group = setNames(brewer.pal(n = length(comparisons), name = "Set2"), comparisons), +# method = setNames(brewer.pal(n = length(methods), name = "Set1"), methods) +# ) +# ) +# ) +# +# +# p2 <- UpSet( +# comb_mat, +# comb_order = order(rev(comb_degree(comb_mat))), +# pt_size = unit(2, "mm"), lwd = 1.5, +# right_annotation = rowAnnotation( +# group = group, +# method = method, +# col = list( +# group = setNames(brewer.pal(n = length(comparisons), name = "Set2"), comparisons), +# method = setNames(brewer.pal(n = length(methods), name = "Set1"), methods) +# ) +# ) +# ) +# pdf( +# "upset_calling.pdf", +# width = 10, +# height = 5, +# useDingbats = FALSE +# ) +# p2 +# dev.off() diff --git a/benchmark_scripts/upset_calling_sirv.R b/benchmark_scripts/upset_calling_sirv.R new file mode 100644 index 0000000..5fd74a5 --- /dev/null +++ b/benchmark_scripts/upset_calling_sirv.R @@ -0,0 +1,47 @@ +source("load_sirv_data.R") + +library(ComplexHeatmap) +library(RColorBrewer) + +methods <- c("rmats", "junctionseq", "majiq", "leafcutter", "orthogonal") +methods <- str_extract(colnames(df), str_c(methods, collapse = "|")) +methods <- methods[!is.na(methods)] + +mat <- df %>% + mutate( + across(everything(), ~ replace_na(.x, 0)), + comparison = NULL + ) %>% + as.matrix(df) + +mat[mat > 0.95] <- 1 +mat[mat < 0.95] <- 0 +mat[is.na(mat)] <- 0 +mat <- mat[rowSums(mat) >= 1, ] +comb_mat <- make_comb_mat(mat) +method <- str_extract(colnames(mat), str_c(methods, collapse = "|")) +color_list <- list( + method = setNames( + scales::brewer_pal(palette = "Set2")(length(methods)), + methods + ) +) +comb_mat <- comb_mat[comb_degree(comb_mat) > 1] + +UpSet( + comb_mat, + comb_order = order(rev(comb_degree(comb_mat))), + top_annotation=upset_top_annotation(comb_mat, add_numbers = T), + right_annotation = upset_right_annotation( + comb_mat, add_numbers = T), + lwd = 0.8, + pt_size = unit(2, "mm")) + +pdf( + "../sirv_benchmark/results/upset_calling_sirv.pdf", + width = 6, + height = 4, + useDingbats = FALSE +) +draw(us) +dev.off() diff --git a/meta_score/train_meta_predictor.py b/meta_score/train_meta_predictor.py new file mode 100644 index 0000000..ba47805 --- /dev/null +++ b/meta_score/train_meta_predictor.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.model_selection import GridSearchCV +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.linear_model import LogisticRegression + +from mlxtend.feature_selection import ColumnSelector + +from joblib import dump + + +# ## Parameters + +# In[ ]: + + +INPUT='/home/tbrittoborges/Baltica/nanopore_benchmark/results/SJ_annotated.csv' +TEST_SIZE=0.2 +RANDOM=1982 +NANOPORE_CUTOFF=0.95 + +np.random.seed(RANDOM) +# ## Setup + +# In[ ]: + + + +# In[ ]: + + +data = pd.read_csv(INPUT) + + +# In[ ]: + + +data.head() + + +# In[ ]: + + 
+data.columns = data.columns.str.lower() + + +# In[ ]: + + +data = data.drop(columns=['is_novel', 'gene_name', 'transcript_name', 'class_code', 'exon_number', 'coordinates']) + + +# In[ ]: + + +data = data.fillna(0) + + +# In[ ]: + + +data.head() + + +# In[ ]: + + +y = data['orthogonal_na'] +y = np.where(y > 0.95, 1, 0) # classification problem +x = data.drop(columns='orthogonal_na') + + +# ## Data split and preprocess + +# In[ ]: + + +X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=TEST_SIZE, random_state=RANDOM) + + +# In[ ]: + + +X_train.info() + + +# In[ ]: + + +np.unique(y_train, return_counts=True)[1] + + +# In[ ]: + + +np.unique(y_test, return_counts=True) + + +# ## Pipeline contruction + +# ColumnSelector API http://rasbt.github.io/mlxtend/user_guide/feature_selection/ColumnSelector/#api + +# In[ ]: + + +import itertools + + +# In[ ]: + + +col_selector = list(itertools.chain.from_iterable([itertools.combinations(range(4), i + 1) for i in range(4)])) + + +# In[ ]: + + +col_selector + + +# ## GBC + +# In[ ]: + + +pipe = Pipeline( + [('select', ColumnSelector()), + ('xboost', GradientBoostingClassifier(random_state=RANDOM))]) + + +# In[ ]: + + +param_grid = { + "select__cols": col_selector, + "xboost__learning_rate": [1, 0.5, 0.25, 0.1, 0.05, 0.01], + "xboost__max_depth": [1, 2, 4, 8], + "xboost__subsample": [0.5, 0.8, 1.0], + "xboost__min_samples_split": [0.2, 0.4, 0.8], + "xboost__n_estimators": [1, 2, 4, 8, 16, 32, 64, 100, 200]} +print('Starting GCB grid search') +grid = GridSearchCV(pipe, param_grid, cv=10, n_jobs=-1, scoring='roc_auc') + + +# In[ ]: + + +grid.fit(X_train, y_train) +print('Best parameters:', grid.best_params_) +print('Best performance:', grid.best_score_) + + +# In[ ]: + + +df_grid_results_1 = pd.DataFrame(grid.cv_results_) + + +# In[ ]: + + +df_grid_results_1 = df_grid_results_1.sort_values('rank_test_score') + + +# In[ ]: + + +df_grid_results_1 + + +# In[ ]: + +print('Starting GCB grid search') +df_grid_results_1.to_csv('sklearn_result.csv') + + +# In[ ]: + + +df_grid_results_1.shape + + +# In[ ]: + + +df_grid_results_1 + + +# In[ ]: + + +df_grid_results_1.loc[ + :,['param_select__cols', 'mean_test_score', 'rank_test_score']] + + +# In[ ]: + + +df_grid_results_1.loc[ + :,['param_select__cols']].value_counts() + + +# In[ ]: + + +df_grid_results_1.loc[:, "param_select__cols"] = df_grid_results_1.loc[:, "param_select__cols"].astype(str) + + +# In[ ]: + + +df_grid_results_1.query("param_select__cols != '(0, 1, 2, 3)'").loc[ + :, ["param_select__cols", "mean_test_score", "rank_test_score"]] + + +# In[ ]: + + +dump(grid.best_estimator_, 'gcb_meta_best_estimator.joblib') +dump(grid, 'gcb_grid_object.joblib') + + +# ## Logistic regression + +# In[ ]: + + +pipe2 = Pipeline( + [('select', ColumnSelector()), + ('log_reg', LogisticRegression(random_state=RANDOM))]) + + +# In[ ]: +print("Starting LR grid search") + +param_grid2= { + "select__cols": col_selector, + "log_reg__C": np.logspace(-3, 4, 5), + "log_reg__penalty":["l1", "l2"] } + + +# In[ ]: + + +grid2 = GridSearchCV(pipe2, param_grid2, cv=10, n_jobs=-1, scoring='roc_auc') + + +# In[ ]: + +print("Saving LR grid search") +grid2.fit(X_train, y_train) +print('Best parameters:', grid2.best_params_) +print('Best performance:', grid2.best_score_) + + +# In[ ]: + + +df_grid2_results = pd.DataFrame(grid2.cv_results_) + + +# In[ ]: + + +df_grid2_results = df_grid2_results.sort_values('rank_test_score') + + +# In[ ]: + + +df_grid2_results.groupby('param_select__cols').first() + + +# In[ ]: + + 
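# Hedged sketch (not in the original script): persist the logistic-regression
# search the same way the GBC branch does above. The three output file names
# are assumptions, chosen to mirror 'sklearn_result.csv',
# 'gcb_meta_best_estimator.joblib' and 'gcb_grid_object.joblib'.
print("Saving LR grid search results")
df_grid2_results.to_csv("sklearn_result_logreg.csv")
dump(grid2.best_estimator_, "logreg_meta_best_estimator.joblib")
dump(grid2, "logreg_grid_object.joblib")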
+ +
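A minimal downstream-usage sketch for the estimator dumped by meta_score/train_meta_predictor.py. It repeats the script's preprocessing (lower-case column names, drop the annotation columns, fill NA with 0) and writes a per-junction meta-score. The input path, the output file name, the 'meta_call' column, and the assumption that the method-score columns appear in the same order as at fit time are illustrative, not taken from the patch.

import pandas as pd
from joblib import load

# Pipeline saved by train_meta_predictor.py:
# ColumnSelector + GradientBoostingClassifier chosen by the grid search.
model = load("gcb_meta_best_estimator.joblib")

# Assumed input: a new SJ_annotated.csv with the same method-score columns
# used at fit time, in the same order (ColumnSelector selects by position).
sj = pd.read_csv("SJ_annotated.csv")
sj.columns = sj.columns.str.lower()
features = sj.drop(
    columns=["is_novel", "gene_name", "transcript_name", "class_code",
             "exon_number", "coordinates", "orthogonal_na"],
    errors="ignore",
).fillna(0)

# predict_proba(...)[:, 1] is the meta-score: the estimated probability that
# the junction is supported by the orthogonal (nanopore) evidence.
sj["meta_score"] = model.predict_proba(features)[:, 1]
sj["meta_call"] = (sj["meta_score"] > 0.95).astype(int)  # reuses the script's 0.95 cutoff
sj.to_csv("SJ_annotated_with_meta_score.csv", index=False)  # assumed output name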