#!/usr/bin/env nextflow
def helpMessage() {
log.info"""
=========================================
microscope_automated_analysis.nf
=========================================
Usage:
The typical command for running the pipeline on sdag is as follows:
nextflow run microscope_automated_analysis.nf -c MICROSCOPE.config -profile eva,archgen,automated_analysis --batch <batch_name> --outdir /mnt/archgen/MICROSCOPE/automated_analysis
Mandatory arguments:
-profile [str] Institution or personal hardware config to use (e.g. standard, docker, singularity, conda, aws). Ask your system admin if unsure, or check documentation.
--batch [str] The sequencing batch name to process.
--outdir [str] The desired directory within which all output files will be placed. One directory per sequencing batch will be created within this directory, which in turn will contain one directory per analysis.
--phenotype_annotation [str] The path to the desired SNP annotation file for phenotypic analysis.
""".stripIndent()
}
nextflow.enable.dsl=1 // Force DSL1 syntax
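// NOTE: DSL1 support was dropped in later Nextflow releases; this pipeline needs a
// version that still provides it (up to the 22.10.x series, to the best of our knowledge).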
///////////////////////////////////////////////////////////////////////////////
/* -- SET UP CONFIGURATION VARIABLES -- */
///////////////////////////////////////////////////////////////////////////////
// Show help message
params.help = false
if (params.help){
helpMessage()
exit 0
}
// Small console separator to make it easier to read errors after launch
println ""
/////////////////////////////////////////////////////////
/* -- Create input channel -- */
/////////////////////////////////////////////////////////
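// Glob the genotype files of the batch's Poseidon package. toSortedList() returns the
// files in alphabetical order (.geno, .ind, .snp), so the map below reorders them into
// the (geno, snp, ind) layout expected downstream.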
Channel.fromPath("/mnt/archgen/MICROSCOPE/poseidon_packages/${params.batch}/*.{geno,snp,ind}")
.toSortedList()
.map {
it ->
def geno=it[0]
def snp=it[2]
def ind=it[1]
[geno, snp, ind]
}
.into { ch_input_for_pmmr; ch_input_dummy }
process pmmrCalculator {
conda 'bioconda::pmmrcalculator=1.1.0'
tag "${params.batch}"
publishDir "${params.outdir}/${params.batch}/pmmr", mode: 'copy'
memory '1GB'
cpus 1
input:
tuple path(geno), path(snp), path(ind) from ch_input_for_pmmr.dump(tag:"input_pmmr")
output:
file "${params.batch}.pmmr.txt"
script:
"""
## Older local fixed version used for testing.
# /r1/people/thiseas_christos_lamnidis/Software/github/TCLamnidis/pMMRCalculator/pMMRCalculator.py -i ${params.batch} -o ${params.batch}.pmmr.txt
pmmrcalculator -i ${params.batch} -o ${params.batch}.pmmr.txt
"""
}
process forge_package {
conda 'bioconda::poseidon-trident=0.26.1'
memory '1GB'
cpus 1
// No inputs means this will run each time. forge_pca_package.sh already knows not to run if not needed.
// Dummy output to force this to run before smartpca is attempted, since running smartpca without the forged package fails the job.
output:
val "dummy" into ch_dummy_for_pca_dependency_eurasia,ch_dummy_for_pca_dependency_europe
script:
"""
${projectDir}/plink_mds/forge_pca_package.sh
"""
}
// Create channel with the microscope PCA geno/snp/ind files for the PCA runs
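// As above, toSortedList() yields alphabetical order (.geno, .ind, .snp), which the map reorders to (geno, snp, ind).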
Channel.fromPath("/mnt/archgen/MICROSCOPE/forged_packages/microscope_pca/*.{geno,snp,ind}")
.toSortedList()
.map {
it ->
def geno=it[0]
def snp=it[2]
def ind=it[1]
[geno, snp, ind]
}
.into{ ch_for_smartpca_europe; ch_for_smartpca_eurasia }
process microscope_pca_eurasia {
conda 'bioconda::eigensoft=7.2.1'
publishDir "/mnt/archgen/MICROSCOPE/automated_analysis/microscope_pca/", mode: 'copy'
memory '20GB'
cpus 4
input:
tuple path(geno), path(snp), path(ind) from ch_for_smartpca_eurasia.dump(tag:"input_pca_eurasia")
val dummy from ch_dummy_for_pca_dependency_eurasia
output:
file 'West_Eurasian_pca.evec' optional true
file 'West_Eurasian_pca.eval' optional true
file 'West_Eurasian_pca.weights' optional true
script:
"""
${projectDir}/plink_mds/run_smartpca.sh ${geno} ${snp} ${ind} ${projectDir}/plink_mds/west_eurasian_poplist.txt ${task.cpus} West_Eurasian_pca
"""
}
process microscope_pca_europe {
conda 'bioconda::eigensoft=7.2.1'
publishDir "/mnt/archgen/MICROSCOPE/automated_analysis/microscope_pca/", mode: 'copy'
memory '20GB'
cpus 4
input:
tuple path(geno), path(snp), path(ind) from ch_for_smartpca_europe.dump(tag:"input_pca_europe")
val dummy from ch_dummy_for_pca_dependency_europe
output:
file 'Europe_only_pca.eval' optional true
file 'Europe_only_pca.evec' optional true
file 'Europe_only_pca.weights' optional true
script:
"""
${projectDir}/plink_mds/run_smartpca.sh ${geno} ${snp} ${ind} ${projectDir}/plink_mds/europe_poplist.txt ${task.cpus} Europe_only_pca
"""
}
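// Create channel with the batch PLINK bed/bim/fam files for READ kinship analysis.
// Here the alphabetical order from toSortedList() (.bed, .bim, .fam) already matches the desired layout.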
Channel.fromPath("/mnt/archgen/MICROSCOPE/poseidon_packages/${params.batch}/*.{bed,bim,fam}")
.toSortedList()
.map {
it ->
def bed=it[0]
def bim=it[1]
def fam=it[2]
[bed, bim, fam]
}
.into { ch_input_for_read; ch_input_dummy2 }
process kinship_read {
conda 'conda-forge::python=2.7.15 conda-forge::r-base=4.0.3 bioconda::plink=1.90b6.21'
tag "${params.batch}"
publishDir "${params.outdir}/${params.batch}/read", mode: 'copy'
memory '1GB'
cpus 1
input:
tuple path(bed), path(bim), path(fam) from ch_input_for_read.dump(tag:"input_read")
output:
file "${params.batch}.read.txt"
file "${params.batch}.read.plot.pdf"
script:
"""
## Keep only autosomal genotypes, and drop samples covered at fewer than roughly 10,000 SNPs (--mind 0.9918 removes samples whose genotype missingness exceeds 99.18%).
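## (Rough arithmetic, assuming genotypes on the ~1.24M-site 1240k capture panel:
## 1,240,000 * (1 - 0.9918) ≈ 10,168 covered SNPs required.)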
plink --bfile ${bed.baseName} --make-bed --out ${bed.baseName}.autosomes --autosome --mind 0.9918
## Create map file from bim
## cut -f 1-4 ${bim.baseName}.autosomes.bim >${bed.baseName}.autosomes.recoded.map
## Transpose the plink data
plink --bfile ${bed.baseName}.autosomes --out ${bed.baseName}.autosomes.recoded --recode --transpose
## If only one individual remains after filtering, READ cannot estimate relatedness, so stop execution with the ignored exit code.
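## (Exit status 11 is presumably whitelisted for this process in MICROSCOPE.config,
## e.g. via the validExitStatus directive, so the rest of the pipeline carries on.)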
if [[ \$(wc -l ${bed.baseName}.autosomes.recoded.tfam | cut -f1 -d " ") == "1" ]]; then
exit 11
fi
python /r1/people/thiseas_christos_lamnidis/Software/bitbucket/tguenther/read/READ.py ${bed.baseName}.autosomes.recoded
mv READ_results ${params.batch}.read.txt
mv READ_results_plot.pdf ${params.batch}.read.plot.pdf
"""
}
// Create channel with all trimmed bams. Will contain duplicates and multiple libraries etc in older batches.
ch_bams_for_phenotypes = Channel
.fromFilePairs("//mnt/archgen/MICROSCOPE/eager_outputs/${params.batch}/trimmed_bam/*bam{,.bai}")
// Filter out very old *.SG1.1 batches. Unfortunately, unmerged trimmed libraries that later got merged and re-trimmed will still be picked up.
.filter {! (it[0] =~ /.*1.1.trimmed/) }
.map{
def ind_name = it[0].substring(0, 9) // 9 is the length of the individual name if it has the _ss suffix.
if (ind_name.endsWith("_ss")) {
ind_name = ind_name.substring(0, 6)
}
def sample_name = it[0].minus(".trimmed")
def bam = it[1][0]
def bai = it[1][1]
[ind_name, sample_name, bam, bai]
}
// To avoid picking up older unmerged bams, we group by ind_name, then keep only the most recently modified sample_name/bam/bai combination.
.groupTuple(by: 0)
.map {
ind_name, names, bams, bais ->
// Pick the most recently modified bam without reordering the list in place, so the
// index remains valid for the names and bais lists too.
def newest_bam = bams.max { it.lastModified() } // Get the newest bam
def idx = bams.indexOf(newest_bam) // Get the index of the newest bam
[names[idx], bams[idx], bais[idx]] // Return the newest bam and its corresponding name and bai
}
.dump(tag:"phenotypes")
process phenotypic_analysis {
tag "${params.batch}"
conda "bioconda::samtools=1.14 conda-forge::python=3.10.2"
publishDir "${params.outdir}/${params.batch}/phenotypes", mode: 'copy'
memory '1GB'
cpus 1
input:
// Apparently using groupTuple with an out-of-range index flips columns and rows of the channel... :shrug:
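// The net effect: all per-sample tuples are gathered into a single emission of three lists, ([names], [bams], [bais]).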
tuple bam_name, path(bams), path(bais) from ch_bams_for_phenotypes.groupTuple(by: [4]).dump(tag:"phenotypes_input")
output:
file("${params.batch}.phenotypes.txt")
file("${params.batch}.*.hirisplex.csv")
script:
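// bam_name holds the full list of sample names after the transpose above;
// flatten() guards against nesting before joining them for the shell loop below.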
def name_list = bam_name.flatten().join(" ")
"""
## Create samplelist
for name in ${name_list}; do
echo \${name} >>samplelist.txt
done
## get mpileup results (set max depth to 100 to limit memory footprint.)
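## The -l positions file keeps columns 3 and 4 of the annotation file (presumably
## chromosome and position), with the header line dropped by tail.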
samtools mpileup -d 100 -a -Q 30 -B -q 30 -l <(awk -v OFS="\t" -F "\t" '{print \$3,\$4}' ${params.phenotype_annotation} | tail -n +2) ${bams} >${params.batch}.mpileup.q30.Q30.B.txt
## Check phenotypes
${baseDir}/phenotypic_snps/infer_phenotypes.py -a ${params.phenotype_annotation} -f samplelist.txt ${params.batch}.mpileup.q30.Q30.B.txt >${params.batch}.phenotypes.txt
## Create HIrisPlex CSV with minimum read depth of 0, 5, and 10 per SNP.
${baseDir}/phenotypic_snps/phenotypes2irisplex.py -m 0 ${params.batch}.phenotypes.txt >${params.batch}.minDepth0.hirisplex.csv
${baseDir}/phenotypic_snps/phenotypes2irisplex.py -m 5 ${params.batch}.phenotypes.txt >${params.batch}.minDepth5.hirisplex.csv
${baseDir}/phenotypic_snps/phenotypes2irisplex.py -m 10 ${params.batch}.phenotypes.txt >${params.batch}.minDepth10.hirisplex.csv
"""
}