diff --git a/benchmark_scripts/parse_suppa_sup.R b/benchmark_scripts/parse_suppa_sup.R index bd314e2..63270ff 100644 --- a/benchmark_scripts/parse_suppa_sup.R +++ b/benchmark_scripts/parse_suppa_sup.R @@ -1,3 +1,8 @@ +library(tidyverse) +library(GenomicFeatures) +library(rtracklayer) +library(GenomicRanges) + parse_suppa_sup = function(url, sheet, event_id=TRUE, startRow = 3){ x = openxlsx::read.xlsx(url, sheet = sheet, startRow = startRow) if (isTRUE(event_id)) { @@ -18,18 +23,10 @@ url='https://static-content.springer.com/esm/art%3A10.1186%2Fs13059-018-1417-1/M # 44 negative pos = parse_suppa_sup(url, 4, event_id = TRUE, startRow = 3) neg = parse_suppa_sup(url, 10, event_id = FALSE, 3) -pos$score = 1 -neg$score = 0 - -library(tidyverse) neg <- neg %>% rownames_to_column('event_n') %>% - dplyr::rename(chr=Chr, Gene=Gene_symbol) %>% - mutate( - coord = str_glue_data(., "{Exon_start}-{Exon_end}"), - Exon_start = NULL, - Exon_end = NULL) + dplyr::rename(chr=Chr, Gene=Gene_symbol) reference <- rtracklayer::import.gff("/biodb/genomes/homo_sapiens/GRCh38_102/GRCh38.102.SIRV.gtf") @@ -53,23 +50,24 @@ get_introns <- function(ex_tx) { introns <- unlist(introns) introns } +ch <- rtracklayer::import.chain('~/GRCh37_to_GRCh38_2.chain') ref_ex_tx <- filter_multi_exon(reference) ref_introns <- get_introns(ref_ex_tx) ref_introns <- ref_introns[width(ref_introns) > 2, ] neg_gr <- GRanges( - seqnames = neg$Chr, - ranges = IRanges(neg$Exon_start, pos$Exon_end)) + seqnames = neg$chr, + ranges = IRanges(neg$Exon_start, neg$Exon_end)) seqlevelsStyle(neg_gr) <- 'ensembl' +neg_gr <- rtracklayer::liftOver(neg_gr, ch) +neg_gr <- unlist(neg_gr) + neg_introns <- subsetByOverlaps(ref_introns, neg_gr) -perc_olaps <- width(pintersect(ref_introns[from(neg_olaps)], neg_gr[to(neg_olaps)])) / width(neg_gr[to(neg_olaps)]) -neg_introns <- neg_introns[ perc_olaps == 1 ] +# perc_olaps <- width(pintersect(ref_introns[from(neg_olaps)], neg_gr[to(neg_olaps)])) / width(neg_gr[to(neg_olaps)]) +# neg_introns <- neg_introns[ perc_olaps == 1 ] score(neg_introns) <- 0 -library(GenomicFeatures) -library(rtracklayer) - pos <- pos %>% rownames_to_column('event_n') %>% mutate( @@ -88,17 +86,17 @@ pos <- pos %>% gr <- GRanges(seqnames = pos$chr, ranges = IRanges(pos$coord), strand = pos$strand) seqlevelsStyle(gr) <- 'ensembl' # https://gist.github.com/tbrittoborges/f0bc89cd7192d0955679d5105fd39475 -ch <- rtracklayer::import.chain('~/GRCh37_to_GRCh38_2.chain') gr <- rtracklayer::liftOver(gr, ch) gr <- unlist(gr) + +pairs <- findOverlapPairs(ref_introns, gr) +same_end <- end(pairs@first) - end(pairs@second) == -1 +same_start <- start(pairs@first) - start(pairs@second) == 1 +length(unique(pairs[same_start & same_end]@second)) +pairs <- pairs[same_start & same_end] +gr <- unique(pairs@first) mcols(gr)$score <- 1 names(gr) <- seq_along(gr) -neg_gr <- rtracklayer::liftOver(neg_gr, ch) -neg_gr <- unlist(neg_gr) - gr <- c(neg_introns, gr) export.bed(gr, '/prj/Niels_Gehring/baltica_benchmark/best_et_al/baltica/ortho.bed') -head(gr) - - diff --git a/benchmark_scripts/roc_calling_suppa.R b/benchmark_scripts/roc_calling_suppa.R new file mode 100644 index 0000000..8aac630 --- /dev/null +++ b/benchmark_scripts/roc_calling_suppa.R @@ -0,0 +1,51 @@ +library(ROCR) +library(cowplot) + +source("~/Baltica/benchmark_scripts/load_suppa_data.R") + +length(unique(coordinates[!is.na(df$orthogonal) ])) +table(df$orthogonal) + +compute_prediction <- function(col, ref) { + col <- df[, col] + ref <- df[, ref] + col <- col[!is.na(ref)] + ref <- ref[!is.na(ref)] + col <- replace(col, is.na(col), 0) + prediction(col, ref) +} + +pars <- tibble( + col = c("majiq", "leafcutter", "rmats", "junctionseq"), + ref = rep("orthogonal", 4) +) +prediction <- purrr::pmap(pars, compute_prediction) +names(prediction) <- pars$col +tpr_fpr <- lapply(prediction, performance, measure = "tpr", x.measure = "fpr") +auc <- lapply(prediction, performance, measure = "auc") + +method <- unlist(lapply(tpr_fpr, function(x) length(slot(x, "x.values")[[1]]))) +auc <- lapply(auc, function(x) round(slot(x, "y.values")[[1]][[1]], 2)) + +roc_data <- tibble( + FPR = unlist(lapply(tpr_fpr, function(x) slot(x, "x.values")[[1]])), + TPR = unlist(lapply(tpr_fpr, function(x) slot(x, "y.values")[[1]])), + Method = rep(names(method), method), +) + +p <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Method)) + + geom_line() + + geom_point(size=0.5) + + geom_abline(aes(slope = 1, intercept = 0), linetype = "dashed") + + theme_cowplot(14) + + theme(legend.position="bottom") + + scale_color_manual( + name = NULL, + labels = setNames(nm=names(auc), paste(names(auc), ' AUCROC=', auc, sep="")), + values = color_list$method[1:4]) + + guides(color=guide_legend(nrow=2,byrow=TRUE)) + +p + +ggsave( + "results/roc_suppa_calling.pdf", width = 15, height = 15, units = 'cm')