From 456829e56d3821d71e3b92b26cf66c96c9da5796 Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Wed, 6 Dec 2023 14:25:08 -0800 Subject: [PATCH 1/9] changes --- bin/extract_sequences.py | 77 +++++++++++++++++++++++++ bin/fix_aa_muts.py | 35 ++++++++++++ main.nf | 118 ++++++++++++++++++++++++++++++--------- nextflow.config | 79 ++++++++++++++------------ 4 files changed, 247 insertions(+), 62 deletions(-) create mode 100755 bin/extract_sequences.py create mode 100755 bin/fix_aa_muts.py diff --git a/bin/extract_sequences.py b/bin/extract_sequences.py new file mode 100755 index 0000000..7d6fe46 --- /dev/null +++ b/bin/extract_sequences.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +import os, sys +from Bio import SeqIO +import re +import argparse +from glob import glob + + +def init_parser(): + parser = argparse.ArgumentParser(description="Concatenate sequences from files.") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("-s", "--segment", default='NONE', type=str, help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.") + group.add_argument("-c", "--concat", action='store_true', help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.") + + parser.add_argument("-i", "--inpath", help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.") + parser.add_argument('-o', "--outpath", type=str, required=True, help="Output file name") + + return parser + +def concatenate_sequences(filepath): + sequences = list(SeqIO.parse(filepath, "fasta")) + + if len(sequences) == 8: + concatenated_seq = sequences[0] + for seq in sequences[1:]: + concatenated_seq += seq + + # rename sequences + concatenated_seq.id = re.split("_fluviewer|_segment", sequences[0].id)[0] + concatenated_seq.name = concatenated_seq.id + concatenated_seq.description = concatenated_seq.id + return concatenated_seq + + print(f"WARN: Skipping {filepath}. Segment sequences are missing") + + return None + +def extract_sequences(filepath, target_segment): + sequences = list(SeqIO.parse(filepath, "fasta")) + + for seq in sequences: + segment_search = re.search("(?:_fluviewer\||_segment\d_)([A-Z0-9]+)", seq.id) + + if segment_search and segment_search.group(1) == target_segment: + return seq + + print(f"WARN: Skipping {filepath}. Could not find segment of interest.") + return None + +def main(args): + + if os.path.isfile(args.inpath): + print("ERROR: Expected directory of consensus.fa files as input.", file=sys.stderr) + exit(1) + + filepaths = glob(os.path.join(args.inpath, '*consensus*')) + + if args.concat: + output_sequences = [] + for filepath in filepaths: + concatenated_seq = concatenate_sequences(filepath) + if concatenated_seq: + output_sequences.append(concatenated_seq) + + + else: + output_sequences = [] + for filepath in filepaths: + extracted_seq = extract_sequences(filepath, args.segment) + if extracted_seq: + output_sequences.append(extracted_seq) + + SeqIO.write(output_sequences, args.outpath, "fasta") + +if __name__ == "__main__": + args = init_parser().parse_args() + main(args) \ No newline at end of file diff --git a/bin/fix_aa_muts.py b/bin/fix_aa_muts.py new file mode 100755 index 0000000..27d58be --- /dev/null +++ b/bin/fix_aa_muts.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +import os, json +import argparse + + +def init_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( 'json', help='aa_muts.json produced by the Augur translate step') + parser.add_argument( 'outpath', help='Fixed amino acid mutation node data in JSON format.') + + return parser + + +def main(args): + + with open(args.json, 'r') as infile: + data = json.load(infile) + + + name = data['annotations']['HA']['seqid'] + + data['annotations']['nuc'] = { + "end": 13590, + "seqid": name, + "start": 1, + "strand": "+", + "type": "source" + } + + with open(args.outpath, 'w') as outfile: + json.dump(data, outfile, indent=4) + +if __name__ == "__main__": + args = init_parser().parse_args() + main(args) \ No newline at end of file diff --git a/main.nf b/main.nf index 24dcea1..cb03e69 100644 --- a/main.nf +++ b/main.nf @@ -79,6 +79,34 @@ process definition --------------------------------------------------------------------------------- */ +process extract_seqs{ + tag "Optional process to automatically extract relevant segments from FluViewer consensus.fa" + publishDir "${params.work_dir}/results/", mode: 'copy' + + input: + path(sequences_path) + + output: + path("extracted_seqs.fasta") + + script: + concat = params.concat ? '--concat' : '' + segment = params.segment != 'NONE' ? "--segment ${params.segment}" : '' + + """ + if [ ! -d ${sequences_path} ]; then + echo "ERROR: Provided path must be a directory when running automatic sequence extraction with --concat or --segment." + exit 1 + fi + + extract_sequences.py \ + --inpath ${sequences_path} \ + --outpath extracted_seqs.fasta \ + ${concat} \ + ${segment} + """ +} + process align { tag "Aligning sequences to ${params.ref} & filling gaps with N" @@ -126,7 +154,7 @@ process refine { publishDir "${params.work_dir}/results/", mode: 'copy' input: - tuple file(tree), file(msa) + tuple file(tree), file(msa), file(metadata) output: tuple path("tree.nwk"), path("branch_lengths.json") @@ -135,7 +163,7 @@ process refine { augur refine \ --tree ${tree} \ --alignment ${msa} \ - --metadata ${params.meta} \ + --metadata ${metadata} \ --timetree \ --keep-root \ --divergence-units ${params.divergence_units} \ @@ -154,14 +182,14 @@ process ancestral { output: path("nt_muts.json") - """ - augur ancestral \ - --tree ${refine_tree} \ - --alignment ${msa} \ - --output-node-data nt_muts.json \ - --keep-overhangs \ - --keep-ambiguous - """ + """ + augur ancestral \ + --tree ${refine_tree} \ + --alignment ${msa} \ + --output-node-data nt_muts.json \ + --keep-overhangs \ + --keep-ambiguous + """ } process translate { @@ -183,6 +211,21 @@ process translate { """ } +process fix_aa_json { + tag "Fixing aa_muts.json for concatenated workflow." + publishDir "${params.work_dir}/results/", mode: 'copy' + + input: + file(aa_muts_json) + + output: + file("aa_muts_fix.json") + + """ + fix_aa_muts.py ${aa_muts_json} aa_muts_fix.json + """ +} + process export { tag "Exporting data files for auspice" publishDir "${params.work_dir}/auspice", mode: 'copy' @@ -190,23 +233,25 @@ process export { input: tuple file(refine_tree), file(branch_len), file(nt_muts),\ file(aa_muts) + file(metadata) + tuple file(auspice_config), file(colors), file(lat_long) output: - file("flu_na.json") + file("flu.json") - """ - export AUGUR_RECURSION_LIMIT=${params.recursion_limit} - - augur export v2 \ - --tree ${refine_tree} \ - --metadata ${params.meta} \ - --node-data ${branch_len} ${nt_muts} ${aa_muts} \ - --colors ${params.colors} \ - --lat-longs ${params.lat_long} \ - --minify-json \ - --auspice-config ${params.auspice} \ - --output flu_na.json - """ + """ + export AUGUR_RECURSION_LIMIT=${params.recursion_limit} + + augur export v2 \ + --tree ${refine_tree} \ + --metadata ${metadata} \ + --node-data ${branch_len} ${nt_muts} ${aa_muts} \ + --colors ${colors} \ + --lat-longs ${lat_long} \ + --minify-json \ + --auspice-config ${auspice_config} \ + --output flu.json + """ } /** @@ -218,13 +263,32 @@ workflow workflow { seq_ch = Channel.fromPath(params.seqs, checkIfExists:true) ref_ch = Channel.fromPath(params.ref, checkIfExists:true) + meta_ch = Channel.fromPath(params.meta, checkIfExists:true) + config_ch = Channel.fromPath([params.auspice, params.colors, params.lat_long], checkIfExists:true).collect() + + if (params.concat) { + ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true) + seq_ch = extract_seqs(seq_ch) + + }else if (params.segment != 'NONE') { + ref_anno_ch = ref_ch + seq_ch = extract_seqs(seq_ch) + + }else{ + ref_anno_ch = ref_ch + } align(seq_ch.combine(ref_ch)) | tree msa_ch = align.out - refine(tree.out.combine(align.out)) + refine(tree.out.combine(msa_ch).combine(meta_ch)) ancestral(refine.out.combine(msa_ch)) - translate(ancestral.out.combine(refine.out.combine(ref_ch))) - export(refine.out.combine(ancestral.out.combine(translate.out))) + translate(ancestral.out.combine(refine.out.combine(ref_anno_ch))) + + ch_aa_muts = translate.out + if (params.concat) { + ch_aa_muts = fix_aa_json(ch_aa_muts) + } + export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch) } /** diff --git a/nextflow.config b/nextflow.config index a97e069..2ea64c0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,44 +11,53 @@ manifest { params { -//help message -help = null + //help message + help = null -//version number -version = null + //version number + version = null -//conda env local cache -conda_cache = null + //conda env local cache + conda_cache = null -//directory containing config/ & data/ folders -work_dir = "/path/to/data/" + //directory containing config/ & data/ folders + work_dir = "/path/to/data/" -//reference for alignment -ref = "${params.work_dir}config/Ref.gb" + //reference for alignment + ref = "${params.work_dir}/config/Ref.gb" -//input sequences -seqs = "${params.work_dir}data/sequences.fasta" + //reference annotation (needed for concatenated workflow only) + ref_anno = "${params.work_dir}/config/Ref.gff3" -//metadata of input sequences -meta = "${params.work_dir}data/metadata.csv" + //input sequences + seqs = "${params.work_dir}/data/sequences.fasta" -//strains that are excluded -drop_strains = "${params.work_dir}config/dropped_strains.txt" + //metadata of input sequences + meta = "${params.work_dir}/data/metadata.csv" -//colors used in final auspice visualization -colors = "${params.work_dir}config/colors.csv" + //strains that are excluded + drop_strains = "${params.work_dir}/config/dropped_strains.txt" -//latitude and longitudes -lat_long = "${params.work_dir}config/lat_longs.csv" + //colors used in final auspice visualization + colors = "${params.work_dir}/config/colors.csv" -//details for auspice visualization -auspice = "${params.work_dir}config/auspice_config.json" + //latitude and longitudes + lat_long = "${params.work_dir}/config/lat_longs.csv" -//refining phylogeny -divergence_units = "mutations" + //details for auspice visualization + auspice = "${params.work_dir}/config/auspice_config.json" -//env variable AUGUR_RECURSION_LIMIT -recursion_limit = 10000 + //refining phylogeny + divergence_units = "mutations" + + //run workflow for concatenated sequences instead of single segment + concat = false + + //if not NONE, automate the extraction of a specific segment from fluviewer_consensus.fa files + segment = 'NONE' + + //env variable AUGUR_RECURSION_LIMIT + recursion_limit = 10000 } @@ -56,12 +65,12 @@ recursion_limit = 10000 //the process section of the config file. ex. AWS, SLURM, sun grid engine: process { -withName: align { - cpus = 28 + withName: align { + cpus = 28 + } + withName: tree { + cpus = 28 } -withName: tree { - cpus = 28 - } // penv='smp' // executor='sge' // memory='30 GB' @@ -70,7 +79,7 @@ withName: tree { profiles { conda { - process.conda = "${projectDir}/environments/ENV.yml" + process.conda = "${projectDir}/environments/environment.yml" conda.createTimeout = '1 h' if (params.conda_cache) { conda.cacheDir = params.conda_cache @@ -83,17 +92,17 @@ profiles { //html displaying breakdown of time taken to execute workflow timeline { enabled = true - file = "${params.work_dir}reports/fluflo_timeline.html" + file = "${params.work_dir}/reports/fluflo_timeline.html" } //html of cpu/mem usage report { enabled = true - file = "${params.work_dir}reports/fluflo_usage.html" + file = "${params.work_dir}/reports/fluflo_usage.html" } //dag of beast-flow workflow dag { enabled = true - file = "${params.work_dir}reports/fluflo_dag.html" + file = "${params.work_dir}/reports/fluflo_dag.html" } From a47f4ed74844e2305a1a933c6479ca02c5c37f38 Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Wed, 6 Dec 2023 15:30:38 -0800 Subject: [PATCH 2/9] revert change to default conda --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 2ea64c0..393c369 100644 --- a/nextflow.config +++ b/nextflow.config @@ -79,7 +79,7 @@ process { profiles { conda { - process.conda = "${projectDir}/environments/environment.yml" + process.conda = "${projectDir}/environments/ENV.yml" conda.createTimeout = '1 h' if (params.conda_cache) { conda.cacheDir = params.conda_cache From d585131ad51366fc88ce3a17787d34720ee461e7 Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Thu, 7 Dec 2023 12:27:15 -0800 Subject: [PATCH 3/9] updated extract_sequences.py --- bin/extract_sequences.py | 95 +++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/bin/extract_sequences.py b/bin/extract_sequences.py index 7d6fe46..eac635f 100755 --- a/bin/extract_sequences.py +++ b/bin/extract_sequences.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import os, sys -from Bio import SeqIO import re import argparse from glob import glob @@ -17,35 +16,64 @@ def init_parser(): return parser -def concatenate_sequences(filepath): - sequences = list(SeqIO.parse(filepath, "fasta")) +def parse_fasta(filepath): + seqlist = [] + seq = '' + header = '' + with open(filepath, 'r') as handle: + for line in handle.readlines(): + if line[0] == '>': + if header != '': + seqlist.append((header, seq)) + + header = line.strip().lstrip('>') + seq = '' + else: + seq += line.strip() + seqlist.append((header, seq)) - if len(sequences) == 8: - concatenated_seq = sequences[0] - for seq in sequences[1:]: - concatenated_seq += seq - - # rename sequences - concatenated_seq.id = re.split("_fluviewer|_segment", sequences[0].id)[0] - concatenated_seq.name = concatenated_seq.id - concatenated_seq.description = concatenated_seq.id - return concatenated_seq + return seqlist + +def write_fasta(seqs, outpath): + with open(outpath, 'w') as outfile: + for header, seq in seqs: + outfile.write(">" + header + '\n') + outfile.write(seq + '\n') + +def concatenate_sequences(filepath): + sequences = parse_fasta(filepath) + + filepath_fields = re.split("(?:_fluviewer)?[\._]consensus", os.path.basename(filepath)) + + if len(filepath_fields) < 2: + print(f"WARN: Could not extract isolate name from file path {filepath}") + return None + + header = filepath_fields[0] + + if len(sequences) != 8: + print(f"WARN: Skipping {filepath}. Segment sequences are missing") + return None - print(f"WARN: Skipping {filepath}. Segment sequences are missing") + concatenated_seq = '' + for _ , seq in sequences: + concatenated_seq += seq + + return (header, concatenated_seq) - return None def extract_sequences(filepath, target_segment): - sequences = list(SeqIO.parse(filepath, "fasta")) + sequences = parse_fasta(filepath) - for seq in sequences: - segment_search = re.search("(?:_fluviewer\||_segment\d_)([A-Z0-9]+)", seq.id) + search_segments = [x for x in sequences if re.search(target_segment, x[0])] - if segment_search and segment_search.group(1) == target_segment: - return seq + if len(search_segments) != 1: + print("ERROR: Could not locate a singular sequences that matches the segment of interest. You may need to specify a more advanced regex with --segment") + print(search_segments) + return None + + return search_segments[0] - print(f"WARN: Skipping {filepath}. Could not find segment of interest.") - return None def main(args): @@ -54,23 +82,20 @@ def main(args): exit(1) filepaths = glob(os.path.join(args.inpath, '*consensus*')) + output_sequences = [] + + for filepath in filepaths: - if args.concat: - output_sequences = [] - for filepath in filepaths: - concatenated_seq = concatenate_sequences(filepath) - if concatenated_seq: - output_sequences.append(concatenated_seq) + if args.concat: + seq = concatenate_sequences(filepath) + else: + seq = extract_sequences(filepath, args.segment) - else: - output_sequences = [] - for filepath in filepaths: - extracted_seq = extract_sequences(filepath, args.segment) - if extracted_seq: - output_sequences.append(extracted_seq) + if seq: + output_sequences.append(seq) - SeqIO.write(output_sequences, args.outpath, "fasta") + write_fasta(output_sequences, args.outpath) if __name__ == "__main__": args = init_parser().parse_args() From 6bd2ad4644a369bfb1841795f244cd94606b541f Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Wed, 13 Dec 2023 08:31:38 -0800 Subject: [PATCH 4/9] reduced changes to only those necessary for concat workflow --- main.nf | 56 +++++++++++++------------------------------------ nextflow.config | 8 +------ 2 files changed, 16 insertions(+), 48 deletions(-) diff --git a/main.nf b/main.nf index cb03e69..81b43ff 100644 --- a/main.nf +++ b/main.nf @@ -79,34 +79,6 @@ process definition --------------------------------------------------------------------------------- */ -process extract_seqs{ - tag "Optional process to automatically extract relevant segments from FluViewer consensus.fa" - publishDir "${params.work_dir}/results/", mode: 'copy' - - input: - path(sequences_path) - - output: - path("extracted_seqs.fasta") - - script: - concat = params.concat ? '--concat' : '' - segment = params.segment != 'NONE' ? "--segment ${params.segment}" : '' - - """ - if [ ! -d ${sequences_path} ]; then - echo "ERROR: Provided path must be a directory when running automatic sequence extraction with --concat or --segment." - exit 1 - fi - - extract_sequences.py \ - --inpath ${sequences_path} \ - --outpath extracted_seqs.fasta \ - ${concat} \ - ${segment} - """ -} - process align { tag "Aligning sequences to ${params.ref} & filling gaps with N" @@ -212,7 +184,7 @@ process translate { } process fix_aa_json { - tag "Fixing aa_muts.json for concatenated workflow." + tag "Fixing aa_muts.json when using a GFF3 file for augur translate." publishDir "${params.work_dir}/results/", mode: 'copy' input: @@ -231,8 +203,7 @@ process export { publishDir "${params.work_dir}/auspice", mode: 'copy' input: - tuple file(refine_tree), file(branch_len), file(nt_muts),\ - file(aa_muts) + tuple file(refine_tree), file(branch_len), file(nt_muts), file(aa_muts) file(metadata) tuple file(auspice_config), file(colors), file(lat_long) @@ -266,26 +237,29 @@ workflow { meta_ch = Channel.fromPath(params.meta, checkIfExists:true) config_ch = Channel.fromPath([params.auspice, params.colors, params.lat_long], checkIfExists:true).collect() - if (params.concat) { - ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true) - seq_ch = extract_seqs(seq_ch) - - }else if (params.segment != 'NONE') { - ref_anno_ch = ref_ch - seq_ch = extract_seqs(seq_ch) - }else{ + // Catch invalid reference input combinations + if (!(params.ref =~ /.+\.[Gg]b$/) && params.ref_anno == 'NO_FILE' ){ // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format + error "ERROR: Extra parameter --ref_anno (.gff3 or .gb format) must be specified for augur translate if non-GenBank formatted reference is provided." + }else if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){ // Can only have .gff or .gb formats in the --ref_anno parameter + error "ERROR: Extra parameter --ref_anno must be in either .gff or .gb (GenBank) format." + } + + // Load the ref_anno_ch channel appropriately + if (params.ref_anno == 'NO_FILE'){ // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch) ref_anno_ch = ref_ch + }else{ // Load new channel from scratch if different reference annotation format specified + ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true) } align(seq_ch.combine(ref_ch)) | tree msa_ch = align.out refine(tree.out.combine(msa_ch).combine(meta_ch)) ancestral(refine.out.combine(msa_ch)) - translate(ancestral.out.combine(refine.out.combine(ref_anno_ch))) + translate(ancestral.out.combine(refine.out).combine(ref_anno_ch)) ch_aa_muts = translate.out - if (params.concat) { + if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) { // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error) ch_aa_muts = fix_aa_json(ch_aa_muts) } export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch) diff --git a/nextflow.config b/nextflow.config index 393c369..e7577fc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,7 +27,7 @@ params { ref = "${params.work_dir}/config/Ref.gb" //reference annotation (needed for concatenated workflow only) - ref_anno = "${params.work_dir}/config/Ref.gff3" + ref_anno = "NO_FILE" //input sequences seqs = "${params.work_dir}/data/sequences.fasta" @@ -50,12 +50,6 @@ params { //refining phylogeny divergence_units = "mutations" - //run workflow for concatenated sequences instead of single segment - concat = false - - //if not NONE, automate the extraction of a specific segment from fluviewer_consensus.fa files - segment = 'NONE' - //env variable AUGUR_RECURSION_LIMIT recursion_limit = 10000 From f4aedbab6022b140ed01df113d2bb9a3bf638096 Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Wed, 13 Dec 2023 08:37:05 -0800 Subject: [PATCH 5/9] remove extract_sequences.py --- bin/extract_sequences.py | 102 --------------------------------------- 1 file changed, 102 deletions(-) delete mode 100755 bin/extract_sequences.py diff --git a/bin/extract_sequences.py b/bin/extract_sequences.py deleted file mode 100755 index eac635f..0000000 --- a/bin/extract_sequences.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python3 -import os, sys -import re -import argparse -from glob import glob - - -def init_parser(): - parser = argparse.ArgumentParser(description="Concatenate sequences from files.") - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument("-s", "--segment", default='NONE', type=str, help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.") - group.add_argument("-c", "--concat", action='store_true', help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.") - - parser.add_argument("-i", "--inpath", help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.") - parser.add_argument('-o', "--outpath", type=str, required=True, help="Output file name") - - return parser - -def parse_fasta(filepath): - seqlist = [] - seq = '' - header = '' - with open(filepath, 'r') as handle: - for line in handle.readlines(): - if line[0] == '>': - if header != '': - seqlist.append((header, seq)) - - header = line.strip().lstrip('>') - seq = '' - else: - seq += line.strip() - seqlist.append((header, seq)) - - return seqlist - -def write_fasta(seqs, outpath): - with open(outpath, 'w') as outfile: - for header, seq in seqs: - outfile.write(">" + header + '\n') - outfile.write(seq + '\n') - -def concatenate_sequences(filepath): - sequences = parse_fasta(filepath) - - filepath_fields = re.split("(?:_fluviewer)?[\._]consensus", os.path.basename(filepath)) - - if len(filepath_fields) < 2: - print(f"WARN: Could not extract isolate name from file path {filepath}") - return None - - header = filepath_fields[0] - - if len(sequences) != 8: - print(f"WARN: Skipping {filepath}. Segment sequences are missing") - return None - - concatenated_seq = '' - for _ , seq in sequences: - concatenated_seq += seq - - return (header, concatenated_seq) - - -def extract_sequences(filepath, target_segment): - sequences = parse_fasta(filepath) - - search_segments = [x for x in sequences if re.search(target_segment, x[0])] - - if len(search_segments) != 1: - print("ERROR: Could not locate a singular sequences that matches the segment of interest. You may need to specify a more advanced regex with --segment") - print(search_segments) - return None - - return search_segments[0] - - -def main(args): - - if os.path.isfile(args.inpath): - print("ERROR: Expected directory of consensus.fa files as input.", file=sys.stderr) - exit(1) - - filepaths = glob(os.path.join(args.inpath, '*consensus*')) - output_sequences = [] - - for filepath in filepaths: - - - if args.concat: - seq = concatenate_sequences(filepath) - else: - seq = extract_sequences(filepath, args.segment) - - if seq: - output_sequences.append(seq) - - write_fasta(output_sequences, args.outpath) - -if __name__ == "__main__": - args = init_parser().parse_args() - main(args) \ No newline at end of file From ad79c832e2f154c8cbd1ec42dade960c1b5b8c24 Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Wed, 13 Dec 2023 09:42:33 -0800 Subject: [PATCH 6/9] improved fix_aa_muts.py, made it more flexible --- bin/fix_aa_muts.py | 35 ++++++++++++++++++++--------------- main.nf | 8 ++++---- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/bin/fix_aa_muts.py b/bin/fix_aa_muts.py index 27d58be..be14eff 100755 --- a/bin/fix_aa_muts.py +++ b/bin/fix_aa_muts.py @@ -5,30 +5,35 @@ def init_parser(): parser = argparse.ArgumentParser() - parser.add_argument( 'json', help='aa_muts.json produced by the Augur translate step') - parser.add_argument( 'outpath', help='Fixed amino acid mutation node data in JSON format.') + parser.add_argument('-a', '--aa_json', required=True, help='aa_muts.json produced by the Augur translate step') + parser.add_argument('-n', '--nt_json', required=True, help='nt_muts.json produced by the Augur ancestral step. Needed to calculate the length of the full nucleotide sequence.') + parser.add_argument('-o', '--outpath', required=True, help='Fixed amino acid mutation node data in JSON format.') return parser def main(args): - with open(args.json, 'r') as infile: - data = json.load(infile) + with open(args.aa_json, 'r') as infile: + aa_data = json.load(infile) - - name = data['annotations']['HA']['seqid'] - - data['annotations']['nuc'] = { - "end": 13590, - "seqid": name, - "start": 1, - "strand": "+", - "type": "source" - } + with open(args.nt_json, 'r') as infile: + reference_length = len(json.load(infile)['reference']['nuc']) + + if 'nuc' not in aa_data['annotations']: + key = list(aa_data['annotations'].keys())[0] + name = aa_data['annotations'][key]['seqid'] + + aa_data['annotations']['nuc'] = { + "end": reference_length, + "seqid": name, + "start": 1, + "strand": "+", + "type": "source" + } with open(args.outpath, 'w') as outfile: - json.dump(data, outfile, indent=4) + json.dump(aa_data, outfile, indent=4) if __name__ == "__main__": args = init_parser().parse_args() diff --git a/main.nf b/main.nf index 81b43ff..f164a47 100644 --- a/main.nf +++ b/main.nf @@ -188,13 +188,13 @@ process fix_aa_json { publishDir "${params.work_dir}/results/", mode: 'copy' input: - file(aa_muts_json) + tuple file(aa_muts_json), file(nt_muts_json) output: file("aa_muts_fix.json") """ - fix_aa_muts.py ${aa_muts_json} aa_muts_fix.json + fix_aa_muts.py --aa_json ${aa_muts_json} --nt_json ${nt_muts_json} --outpath aa_muts_fix.json """ } @@ -259,8 +259,8 @@ workflow { translate(ancestral.out.combine(refine.out).combine(ref_anno_ch)) ch_aa_muts = translate.out - if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) { // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error) - ch_aa_muts = fix_aa_json(ch_aa_muts) + if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) { // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error) + ch_aa_muts = fix_aa_json(ch_aa_muts.combine(ancestral.out)) } export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch) } From ecaa086b69a29abb099b1ea68bf875605bc9ab48 Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Mon, 18 Dec 2023 20:21:53 -0800 Subject: [PATCH 7/9] updated README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b165f58..4fef0dd 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,9 @@ each run is the input directory, which can be specified with the --work_dir flag command line. - Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta] -- Reference genome used to align reads to during guided assembly [./config/Ref.gb] +- One of the two following configurations is required to describe the reference genome: + 1. A single GenBank file describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR + 2. A reference sequence in FASTA format [./config/Ref.fasta] AND a reference annotation in GFF format [./config/Ref.gff3]. - File containing metadata for sequences under analysis [./data/metadata.csv] - Excluded strains/ samples [./config/dropped_strains.txt] - Colors used in final auspice visualization [./config/colors.csv] From bb94881bd36fc0c9e4d48212c3309132ae4a0a6f Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Tue, 19 Dec 2023 06:49:19 -0800 Subject: [PATCH 8/9] minor improvements / changes --- main.nf | 17 +++++++++++------ nextflow.config | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index f164a47..8d09fd5 100644 --- a/main.nf +++ b/main.nf @@ -26,7 +26,8 @@ Mandatory arguments: Optional arguments: --seqs Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta] - --ref Reference genome used to align reads to during guided assembly [./config/Ref.gb] + --ref Reference genome used to align reads to during guided assembly [./config/Ref.gb (default); *.fasta file also accepted here] + --ref_anno Reference genome annotation file; only required when using a FASTA under --ref [*.gb / *.gff file accepted here] --meta File containing metadata for sequences under analysis [./data/metadata.csv] --drop_strains Excluded strains/ samples [./config/dropped_strains.txt] --colors Colors used in final auspice visualization [./config/colors.csv] @@ -239,14 +240,17 @@ workflow { // Catch invalid reference input combinations - if (!(params.ref =~ /.+\.[Gg]b$/) && params.ref_anno == 'NO_FILE' ){ // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format - error "ERROR: Extra parameter --ref_anno (.gff3 or .gb format) must be specified for augur translate if non-GenBank formatted reference is provided." - }else if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){ // Can only have .gff or .gb formats in the --ref_anno parameter - error "ERROR: Extra parameter --ref_anno must be in either .gff or .gb (GenBank) format." + ref_gb_format = (params.ref =~ /.+\.[Gg]b$/) + + if (!ref_gb_format && params.ref_anno == 'NO_FILE' ){ // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format + error "ERROR: Parameter --ref_anno (.gff3 or .gb format) must be specified if non-GenBank formatted reference is provided under --ref." + } + if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){ // Can only have .gff or .gb formats in the --ref_anno parameter + error "ERROR: Parameter --ref_anno must be in either .gff or .gb (GenBank) format." } // Load the ref_anno_ch channel appropriately - if (params.ref_anno == 'NO_FILE'){ // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch) + if (ref_gb_format){ // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch) ref_anno_ch = ref_ch }else{ // Load new channel from scratch if different reference annotation format specified ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true) @@ -259,6 +263,7 @@ workflow { translate(ancestral.out.combine(refine.out).combine(ref_anno_ch)) ch_aa_muts = translate.out + if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) { // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error) ch_aa_muts = fix_aa_json(ch_aa_muts.combine(ancestral.out)) } diff --git a/nextflow.config b/nextflow.config index e7577fc..b4cb71f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,7 +26,7 @@ params { //reference for alignment ref = "${params.work_dir}/config/Ref.gb" - //reference annotation (needed for concatenated workflow only) + //reference annotation ref_anno = "NO_FILE" //input sequences From 38cfe5b337f7614bb83d2532921c890dd0621bc2 Mon Sep 17 00:00:00 2001 From: jpalmer37 Date: Tue, 19 Dec 2023 06:54:02 -0800 Subject: [PATCH 9/9] minor README changes --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4fef0dd..ee1f216 100644 --- a/README.md +++ b/README.md @@ -84,8 +84,8 @@ command line. - Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta] - One of the two following configurations is required to describe the reference genome: - 1. A single GenBank file describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR - 2. A reference sequence in FASTA format [./config/Ref.fasta] AND a reference annotation in GFF format [./config/Ref.gff3]. + 1. A single GenBank file passed to `--ref` describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR + 2. A reference sequence in FASTA format under `--ref` AND a reference annotation in GFF format under `--ref_anno`. - File containing metadata for sequences under analysis [./data/metadata.csv] - Excluded strains/ samples [./config/dropped_strains.txt] - Colors used in final auspice visualization [./config/colors.csv]