-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcallNearestGenes.R
executable file
·140 lines (108 loc) · 6.44 KB
/
callNearestGenes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/Rscript
# AAVengeR/buildStdFragments.R
# John K. Everett, Ph.D.
#
# This script accepts input from buildSites or modules following it and annotates
# nearest genomic features. Details such as distance to nearest features and their
# orientation are included.
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(GenomicRanges))
# Read in the configuration file and perform basic sanity checks.
args <- commandArgs(trailingOnly=TRUE)
if(length(args) == 0) stop('Expected at least one command line argument')
source(file.path(yaml::read_yaml(args[1])$softwareDir, 'lib.R'))
opt <- startModule(args)
createOuputDir()
dir.create(file.path(opt$outputDir, opt$callNearestGenes_outputDir))
# Start log.
opt$defaultLogFile <- file.path(opt$outputDir, opt$callNearestGenes_outputDir, 'log')
logo <- readLines(file.path(opt$softwareDir, 'figures', 'ASCII_logo.txt'))
write(logo, opt$defaultLogFile, append = FALSE)
write(paste0('version: ', readLines(file.path(opt$softwareDir, 'version', 'version')), "\n"), opt$defaultLogFile, append = TRUE)
quitOnErorr <- function(msg){
updateLog(msg)
message(msg)
message(paste0('See log for more details: ', opt$defaultLogFile))
q(save = 'no', status = 1, runLast = FALSE)
}
updateLog(paste0('Starting callNearestGenes using ', opt$callNearestGenes_CPUs, ' CPUs.'))
if(! file.exists(file.path(opt$outputDir, opt$callNearestGenes_inputFile))){
updateLog('Error - input file does not exist.')
q(save = 'no', status = 1, runLast = FALSE)
}
sites <- readRDS(file.path(opt$outputDir, opt$callNearestGenes_inputFile))
if(nrow(sites) == 0){
updateLog('Error - sites file was read and contains zero rows of data.')
q(save = 'no', status = 1, runLast = FALSE)
}
if(! opt$callNearestGenes_addAfter %in% names(sites)){
updateLog(paste0('Error - ', opt$callNearestGenes_addAfter, ' is not a column in your input data frame.'))
q(save = 'no', status = 1, runLast = FALSE)
}
sites <- distinct(bind_rows(lapply(split(sites, sites$refGenome), function(x){
updateLog(paste0('Starting analysis of sites found in ', x$refGenome[1]))
if(! x$refGenome[1] %in% names(opt$callNearestGenes_boundaries)){
updateLog(paste0(' Error - ', x$refGenome[1], ' is not defined beneath callNearestGenes_boundaries in the config file.'))
q(save = 'no', status = 1, runLast = FALSE)
}
if(! 'TUs' %in% names(opt$callNearestGenes_boundaries[[x$refGenome[1]]])){
updateLog(paste0('Error - TUs file not defined for ', x$refGenome[1], ' in the config file.'))
q(save = 'no', status = 1, runLast = FALSE)
}
if(! 'exons' %in% names(opt$callNearestGenes_boundaries[[x$refGenome[1]]])){
updateLog(paste0(' Error - exons file not defined for ', x$refGenome[1], ' in the config file.'))
q(save = 'no', status = 1, runLast = FALSE)
}
if(! file.exists(file.path(opt$softwareDir, 'data', 'genomeAnnotations', opt$callNearestGenes_boundaries[[x$refGenome[1]]][['TUs']]))){
updateLog(paste0('Error - TUs file set for ', x$refGenome[1], ' could not be found.'))
q(save = 'no', status = 1, runLast = FALSE)
}
if(! file.exists(file.path(opt$softwareDir, 'data', 'genomeAnnotations', opt$callNearestGenes_boundaries[[x$refGenome[1]]][['exons']]))){
updateLog(paste0(' Error - exons file set for ', x$refGenome[1], ' could not be found.'))
q(save = 'no', status = 1, runLast = FALSE)
}
genes <- readRDS(file.path(opt$softwareDir, 'data', 'genomeAnnotations', opt$callNearestGenes_boundaries[[x$refGenome[1]]][['TUs']]))
exons <- readRDS(file.path(opt$softwareDir, 'data', 'genomeAnnotations', opt$callNearestGenes_boundaries[[x$refGenome[1]]][['exons']]))
if(tolower(opt$callNearestGenes_geneList) != 'none'){
geneList <- toupper(readLines(opt$callNearestGenes_geneList))
genes <- genes[toupper(genes$name2) %in% geneList]
exons <- exons[toupper(exons$name2) %in% geneList]
}
# Collapse TUs to single positions if requested.
# Remove exons since they would not be relevant.
if(opt$callNearestGenes_TU_position != 'either'){
options(warn=-1)
if (opt$callNearestGenes_TU_position == "start") genes <- GenomicRanges::flank(genes, width = -1)
if (opt$callNearestGenes_TU_position == "end") genes <- GenomicRanges::flank(genes, width = -1, start = FALSE)
if (opt$callNearestGenes_TU_position == "center") ranges(genes) <- IRanges(mid(ranges(genes)), width = 1)
options(warn=0)
exons <- GenomicRanges::GRanges()
}
posids <- x$posid
posids <- unique(sub('\\.\\S+$', '', posids))
updateLog(paste0('Calling nearestGene() for ', n_distinct(posids), ' sites.'))
n <- nearestGene(posids, genes, exons, CPUs = opt$callNearestGenes_CPUs)
n$posid2 <- paste0(n$chromosome, n$strand, n$position)
n <- n[!duplicated(n$posid2),]
x$posid2 <- sub('\\.\\S+$', '', x$posid)
len <- length(x)
n <- select(n, nearestGene, nearestGeneStrand, nearestGeneDist, inGene, inExon, beforeNearestGene, posid2)
if(tolower(opt$callNearestGenes_columnPrefix) != 'none'){
names(n) <- paste0(opt$callNearestGenes_columnPrefix, names(n))
x <- left_join(x, n, by = c('posid2' = paste0(opt$callNearestGenes_columnPrefix, 'posid2'))) %>% select(-posid2)
} else {
x <- left_join(x, n, by = 'posid2') %>% select(-posid2)
}
dplyr::relocate(x, names(n)[! grepl('posid', names(n))], .after = opt$callNearestGenes_addAfter)
})))
sites <- arrange(sites, desc(sonicLengths))
if(tolower(opt$databaseConfigGroup) != 'none'){
suppressPackageStartupMessages(library(RMariaDB))
uploadSitesToDB(sites)
}
saveRDS(sites, file.path(opt$outputDir, opt$callNearestGenes_outputDir, 'sites.rds'))
openxlsx::write.xlsx(sites, file.path(opt$outputDir, opt$callNearestGenes_outputDir, 'sites.xlsx'))
readr::write_tsv(sites, file.path(opt$outputDir, opt$callNearestGenes_outputDir, 'sites.tsv.gz'))
updateLog('callNearestGenes completed.')
q(save = 'no', status = 0, runLast = FALSE)