-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcollect_ranges.R
225 lines (181 loc) · 7.34 KB
/
collect_ranges.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#'
#' Script to collect all associated ids (expr, meth and geno)
#' for a specific sentinel snp
#'
#' @author Johann Hawe
#'
#' @date 2017/03/04
#'
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")
# ------------------------------------------------------------------------------
# load packages, source scripts
# ------------------------------------------------------------------------------
library(GenomicRanges)
library(GenomicFeatures)
library(FDb.InfiniumMethylation.hg19)
library(data.table)
library(illuminaHumanv3.db)
library(rtracklayer)
library(graph)
library(RBGL) # for shortest paths
library(Matrix)
source("scripts/lib.R")
source("scripts/collect_ranges_methods.R")
# ------------------------------------------------------------------------------
# Get snakemake params
# ------------------------------------------------------------------------------
fcosmo <- snakemake@input$tcosmo
fmeqtl <- snakemake@input$meqtl
fppi_db <- snakemake@input$ppi_db
fprio_tab <- snakemake@input$priorization
fgene_annot <- snakemake@input$gene_annot
# TODO: create this file from scratch!
fcpgcontext <- snakemake@input$cpgcontext
ofile <- snakemake@output[[1]]
sentinel <- snakemake@wildcards$sentinel
# ------------------------------------------------------------------------------
# Load and preprocess data
# ------------------------------------------------------------------------------
print("Loading data.")
gene_annot <- load_gene_annotation(fgene_annot)
gene_annot$ids <- probes.from.symbols(gene_annot$SYMBOL,
as_list=T)
ppi_db <- readRDS(fppi_db)
# load trans-meQTL table
trans_meQTL = read.csv(fmeqtl,
sep="\t", stringsAsFactors=F)
# load trans-cosmo information
cosmo <- readRDS(fcosmo)
# load priorization table
prio <- read.table(fprio_tab, sep="\t", header=T, stringsAsFactors = F)
# get trans-genes which should be used for shortest path extraction
prio <- prio[prio$sentinel == sentinel,,drop=F]
if(nrow(prio) > 0) {
best_trans <- unique(prio$trans.gene)
} else {
best_trans <- NULL
}
# ------------------------------------------------------------------------------
# Collect and save ranges
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
print("Collecting SNP and CpG ranges.")
# ------------------------------------------------------------------------------
pairs = which(trans_meQTL[,"sentinel.snp"] == sentinel)
# get sentinel idx
sentinel_idx <- which(cosmo$snp==sentinel)[1]
# the large interval range for the sentinel
chr <- paste0("chr", trans_meQTL[pairs,"chr.snp"][1])
start <- trans_meQTL[pairs,"interval.start.snp"][1]
end <- trans_meQTL[pairs,"interval.end.snp"][1]
sentinel_extrange <- GRanges(chr, IRanges(start,end))
sentinel_range <- with(cosmo[sentinel_idx,],
GRanges(paste0("chr", snp.chr),
IRanges(snp.pos, width=1)))
names(sentinel_range) <- names(sentinel_extrange) <- sentinel
# get cosmo subset
idxs <- get.trans.cpgs(sentinel, trans_meQTL, cosmo)
# get related genes, i.e. genes near meQTL loci (snp+cpg)
# extend cpgs
cosmosub <- cosmo[idxs,]
croi <- with(cosmosub, GRanges(paste0("chr", cpg.chr),
IRanges(cpg.pos,width=2)))
names(croi) <- as.character(cosmosub[,"cpg"])
croi <- unique(croi)
# extended sentinel region
sroi <- sentinel_extrange
names(sroi) <- sentinel
# ------------------------------------------------------------------------------
print("Retrieving SNP and CpG genes.")
# ------------------------------------------------------------------------------
# get the relevant snp genes by overlapping with our sentinel region
genes_sroi <- subsetByOverlaps(gene_annot, sroi, ignore.strand=T)
# get genes near our cpg regions
genes_by_cpg <- get.nearby.ranges(croi, promoters(gene_annot))
names(genes_by_cpg) <- names(croi)
# get original ranges (not promoters)
genes_by_cpg <- lapply(names(genes_by_cpg), function(cg) {
gs <- genes_by_cpg[[cg]]
gene_annot[gs$hit_idx]
})
names(genes_by_cpg) <- names(croi)
# get single list of all cpg genes
genes_croi <- unique(unlist(GRangesList(unlist(genes_by_cpg))))
# ------------------------------------------------------------------------------
print("Collecting TFs and shortest path genes.")
# ------------------------------------------------------------------------------
tfs <- NULL
sp <- NULL
# get cpg ids and SNP gene symbols
cpgs <- names(croi)
snp_genes <- unique(genes_sroi$SYMBOL)
# modify ppi_db to contain our CpGs
# load the cpg-tf context
tfbs_ann <- get_tfbs_context(names(croi), fcpgcontext)
cpgs_with_tfbs <- cpgs[cpgs %in% rownames(tfbs_ann[rowSums(tfbs_ann)>0,])]
snp_genes_in_string <- snp_genes[snp_genes %in% nodes(ppi_db)]
# get locus graph
locus_graph <- add.to.graphs(list(ppi_db), sentinel, snp_genes,
cpgs_with_tfbs, tfbs_ann)[[1]]
# get tfs connected to cpgs
tf_syms = unique(unlist(adj(locus_graph, cpgs_with_tfbs)))
print(paste0("Annotated TFs: ", paste(tf_syms, collapse=", ")))
if(length(tf_syms) < 1 | length(snp_genes_in_string) < 1) {
warning(paste0("No TFs or none of the SNP genes are in PPI DB. ",
"Skipping shortest paths calculation."))
# still, we want to keep the available TFs if there are no SNP genes
# within the PPI DB (would get adjusted using shortest paths below)
if(length(snp_genes_in_string) >= 1) {
tfs <- gene_annot[gene_annot$SYMBOL %in% tf_syms]
}
} else {
# the nodes we want to keep
# in the original meQTL paper we removed KAP1 from the TF symbols
nodeset <- c(nodes(ppi_db), tf_syms,
snp_genes_in_string, cpgs_with_tfbs)
locus_graph <- subGraph(intersect(nodes(locus_graph), nodeset), locus_graph)
shortest_paths <- get_shortest_paths(cis = cpgs_with_tfbs,
trans=unique(c(snp_genes_in_string,
tf_syms)),
snp_genes=snp_genes_in_string,
locus_graph,
tf_syms,
best_trans)
non_tf_sp <- shortest_paths$non_tf_sp
tf_sp <- shortest_paths$tf_sp
if(length(non_tf_sp) < 1){
warning("No shortest path genes.")
} else {
sp <- gene_annot[gene_annot$SYMBOL %in% non_tf_sp]
}
if(length(tf_sp) < 1) {
# This should not happen -> sanity check
stop("No TFs on shortest paths!")
} else {
tfs <- gene_annot[gene_annot$SYMBOL %in% tf_sp]
}
}
print(paste0("Annotated TFs after shortest path calculations: ",
paste(tfs$SYMBOL, collapse=", ")))
# ------------------------------------------------------------------------------
print("Finalizing and saving results.")
# ------------------------------------------------------------------------------
result <- list(cpgs=croi,sentinel=sentinel_range,
sentinel_ext_range=sentinel_extrange,
snp_genes=genes_sroi, cpg_genes=genes_croi,
cpg_genes_by_cpg=genes_by_cpg)
if(!is.null(sp)){
result$spath <- sp
}
if(!is.null(tfs)){
result$tfs <- tfs
}
# set seed name
result$seed <- "meqtl"
saveRDS(file=ofile, result)
# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()