diff --git a/README.md b/README.md index b165f58..ee1f216 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,9 @@ each run is the input directory, which can be specified with the --work_dir flag command line. - Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta] -- Reference genome used to align reads to during guided assembly [./config/Ref.gb] +- One of the two following configurations is required to describe the reference genome: + 1. A single GenBank file passed to `--ref` describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR + 2. A reference sequence in FASTA format under `--ref` AND a reference annotation in GFF format under `--ref_anno`. - File containing metadata for sequences under analysis [./data/metadata.csv] - Excluded strains/ samples [./config/dropped_strains.txt] - Colors used in final auspice visualization [./config/colors.csv] diff --git a/bin/fix_aa_muts.py b/bin/fix_aa_muts.py new file mode 100755 index 0000000..be14eff --- /dev/null +++ b/bin/fix_aa_muts.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +import os, json +import argparse + + +def init_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('-a', '--aa_json', required=True, help='aa_muts.json produced by the Augur translate step') + parser.add_argument('-n', '--nt_json', required=True, help='nt_muts.json produced by the Augur ancestral step. Needed to calculate the length of the full nucleotide sequence.') + parser.add_argument('-o', '--outpath', required=True, help='Fixed amino acid mutation node data in JSON format.') + + return parser + + +def main(args): + + with open(args.aa_json, 'r') as infile: + aa_data = json.load(infile) + + with open(args.nt_json, 'r') as infile: + reference_length = len(json.load(infile)['reference']['nuc']) + + if 'nuc' not in aa_data['annotations']: + key = list(aa_data['annotations'].keys())[0] + name = aa_data['annotations'][key]['seqid'] + + aa_data['annotations']['nuc'] = { + "end": reference_length, + "seqid": name, + "start": 1, + "strand": "+", + "type": "source" + } + + with open(args.outpath, 'w') as outfile: + json.dump(aa_data, outfile, indent=4) + +if __name__ == "__main__": + args = init_parser().parse_args() + main(args) \ No newline at end of file diff --git a/main.nf b/main.nf index 24dcea1..8d09fd5 100644 --- a/main.nf +++ b/main.nf @@ -26,7 +26,8 @@ Mandatory arguments: Optional arguments: --seqs Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta] - --ref Reference genome used to align reads to during guided assembly [./config/Ref.gb] + --ref Reference genome used to align reads to during guided assembly [./config/Ref.gb (default); *.fasta file also accepted here] + --ref_anno Reference genome annotation file; only required when using a FASTA under --ref [*.gb / *.gff file accepted here] --meta File containing metadata for sequences under analysis [./data/metadata.csv] --drop_strains Excluded strains/ samples [./config/dropped_strains.txt] --colors Colors used in final auspice visualization [./config/colors.csv] @@ -126,7 +127,7 @@ process refine { publishDir "${params.work_dir}/results/", mode: 'copy' input: - tuple file(tree), file(msa) + tuple file(tree), file(msa), file(metadata) output: tuple path("tree.nwk"), path("branch_lengths.json") @@ -135,7 +136,7 @@ process refine { augur refine \ --tree ${tree} \ --alignment ${msa} \ - --metadata ${params.meta} \ + --metadata ${metadata} \ --timetree \ --keep-root \ --divergence-units ${params.divergence_units} \ @@ -154,14 +155,14 @@ process ancestral { output: path("nt_muts.json") - """ - augur ancestral \ - --tree ${refine_tree} \ - --alignment ${msa} \ - --output-node-data nt_muts.json \ - --keep-overhangs \ - --keep-ambiguous - """ + """ + augur ancestral \ + --tree ${refine_tree} \ + --alignment ${msa} \ + --output-node-data nt_muts.json \ + --keep-overhangs \ + --keep-ambiguous + """ } process translate { @@ -183,30 +184,46 @@ process translate { """ } +process fix_aa_json { + tag "Fixing aa_muts.json when using a GFF3 file for augur translate." + publishDir "${params.work_dir}/results/", mode: 'copy' + + input: + tuple file(aa_muts_json), file(nt_muts_json) + + output: + file("aa_muts_fix.json") + + """ + fix_aa_muts.py --aa_json ${aa_muts_json} --nt_json ${nt_muts_json} --outpath aa_muts_fix.json + """ +} + process export { tag "Exporting data files for auspice" publishDir "${params.work_dir}/auspice", mode: 'copy' input: - tuple file(refine_tree), file(branch_len), file(nt_muts),\ - file(aa_muts) + tuple file(refine_tree), file(branch_len), file(nt_muts), file(aa_muts) + file(metadata) + tuple file(auspice_config), file(colors), file(lat_long) output: - file("flu_na.json") + file("flu.json") - """ - export AUGUR_RECURSION_LIMIT=${params.recursion_limit} - - augur export v2 \ - --tree ${refine_tree} \ - --metadata ${params.meta} \ - --node-data ${branch_len} ${nt_muts} ${aa_muts} \ - --colors ${params.colors} \ - --lat-longs ${params.lat_long} \ - --minify-json \ - --auspice-config ${params.auspice} \ - --output flu_na.json - """ + """ + export AUGUR_RECURSION_LIMIT=${params.recursion_limit} + + augur export v2 \ + --tree ${refine_tree} \ + --metadata ${metadata} \ + --node-data ${branch_len} ${nt_muts} ${aa_muts} \ + --colors ${colors} \ + --lat-longs ${lat_long} \ + --minify-json \ + --auspice-config ${auspice_config} \ + --output flu.json + """ } /** @@ -218,13 +235,39 @@ workflow workflow { seq_ch = Channel.fromPath(params.seqs, checkIfExists:true) ref_ch = Channel.fromPath(params.ref, checkIfExists:true) + meta_ch = Channel.fromPath(params.meta, checkIfExists:true) + config_ch = Channel.fromPath([params.auspice, params.colors, params.lat_long], checkIfExists:true).collect() + + + // Catch invalid reference input combinations + ref_gb_format = (params.ref =~ /.+\.[Gg]b$/) + + if (!ref_gb_format && params.ref_anno == 'NO_FILE' ){ // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format + error "ERROR: Parameter --ref_anno (.gff3 or .gb format) must be specified if non-GenBank formatted reference is provided under --ref." + } + if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){ // Can only have .gff or .gb formats in the --ref_anno parameter + error "ERROR: Parameter --ref_anno must be in either .gff or .gb (GenBank) format." + } + + // Load the ref_anno_ch channel appropriately + if (ref_gb_format){ // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch) + ref_anno_ch = ref_ch + }else{ // Load new channel from scratch if different reference annotation format specified + ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true) + } align(seq_ch.combine(ref_ch)) | tree msa_ch = align.out - refine(tree.out.combine(align.out)) + refine(tree.out.combine(msa_ch).combine(meta_ch)) ancestral(refine.out.combine(msa_ch)) - translate(ancestral.out.combine(refine.out.combine(ref_ch))) - export(refine.out.combine(ancestral.out.combine(translate.out))) + translate(ancestral.out.combine(refine.out).combine(ref_anno_ch)) + + ch_aa_muts = translate.out + + if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) { // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error) + ch_aa_muts = fix_aa_json(ch_aa_muts.combine(ancestral.out)) + } + export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch) } /** diff --git a/nextflow.config b/nextflow.config index a97e069..b4cb71f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,44 +11,47 @@ manifest { params { -//help message -help = null + //help message + help = null -//version number -version = null + //version number + version = null -//conda env local cache -conda_cache = null + //conda env local cache + conda_cache = null -//directory containing config/ & data/ folders -work_dir = "/path/to/data/" + //directory containing config/ & data/ folders + work_dir = "/path/to/data/" -//reference for alignment -ref = "${params.work_dir}config/Ref.gb" + //reference for alignment + ref = "${params.work_dir}/config/Ref.gb" -//input sequences -seqs = "${params.work_dir}data/sequences.fasta" + //reference annotation + ref_anno = "NO_FILE" -//metadata of input sequences -meta = "${params.work_dir}data/metadata.csv" + //input sequences + seqs = "${params.work_dir}/data/sequences.fasta" -//strains that are excluded -drop_strains = "${params.work_dir}config/dropped_strains.txt" + //metadata of input sequences + meta = "${params.work_dir}/data/metadata.csv" -//colors used in final auspice visualization -colors = "${params.work_dir}config/colors.csv" + //strains that are excluded + drop_strains = "${params.work_dir}/config/dropped_strains.txt" -//latitude and longitudes -lat_long = "${params.work_dir}config/lat_longs.csv" + //colors used in final auspice visualization + colors = "${params.work_dir}/config/colors.csv" -//details for auspice visualization -auspice = "${params.work_dir}config/auspice_config.json" + //latitude and longitudes + lat_long = "${params.work_dir}/config/lat_longs.csv" -//refining phylogeny -divergence_units = "mutations" + //details for auspice visualization + auspice = "${params.work_dir}/config/auspice_config.json" -//env variable AUGUR_RECURSION_LIMIT -recursion_limit = 10000 + //refining phylogeny + divergence_units = "mutations" + + //env variable AUGUR_RECURSION_LIMIT + recursion_limit = 10000 } @@ -56,12 +59,12 @@ recursion_limit = 10000 //the process section of the config file. ex. AWS, SLURM, sun grid engine: process { -withName: align { - cpus = 28 + withName: align { + cpus = 28 + } + withName: tree { + cpus = 28 } -withName: tree { - cpus = 28 - } // penv='smp' // executor='sge' // memory='30 GB' @@ -83,17 +86,17 @@ profiles { //html displaying breakdown of time taken to execute workflow timeline { enabled = true - file = "${params.work_dir}reports/fluflo_timeline.html" + file = "${params.work_dir}/reports/fluflo_timeline.html" } //html of cpu/mem usage report { enabled = true - file = "${params.work_dir}reports/fluflo_usage.html" + file = "${params.work_dir}/reports/fluflo_usage.html" } //dag of beast-flow workflow dag { enabled = true - file = "${params.work_dir}reports/fluflo_dag.html" + file = "${params.work_dir}/reports/fluflo_dag.html" }