From 456829e56d3821d71e3b92b26cf66c96c9da5796 Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Wed, 6 Dec 2023 14:25:08 -0800
Subject: [PATCH 1/9] changes

---
 bin/extract_sequences.py |  77 +++++++++++++++++++++++++
 bin/fix_aa_muts.py       |  35 ++++++++++++
 main.nf                  | 118 ++++++++++++++++++++++++++++++---------
 nextflow.config          |  79 ++++++++++++++------------
 4 files changed, 247 insertions(+), 62 deletions(-)
 create mode 100755 bin/extract_sequences.py
 create mode 100755 bin/fix_aa_muts.py

diff --git a/bin/extract_sequences.py b/bin/extract_sequences.py
new file mode 100755
index 0000000..7d6fe46
--- /dev/null
+++ b/bin/extract_sequences.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3 
+import os, sys
+from Bio import SeqIO
+import re
+import argparse
+from glob import glob 
+
+
+def init_parser():
+	parser = argparse.ArgumentParser(description="Concatenate sequences from files.")
+	group = parser.add_mutually_exclusive_group(required=True)
+	group.add_argument("-s", "--segment", default='NONE', type=str, help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.")
+	group.add_argument("-c", "--concat", action='store_true', help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.")
+	
+	parser.add_argument("-i", "--inpath",  help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.")
+	parser.add_argument('-o', "--outpath", type=str, required=True, help="Output file name")
+	
+	return parser
+
+def concatenate_sequences(filepath):
+	sequences = list(SeqIO.parse(filepath, "fasta"))
+	
+	if len(sequences) == 8:
+		concatenated_seq = sequences[0]
+		for seq in sequences[1:]:
+			concatenated_seq += seq
+
+		# rename sequences
+		concatenated_seq.id = re.split("_fluviewer|_segment", sequences[0].id)[0]
+		concatenated_seq.name = concatenated_seq.id
+		concatenated_seq.description = concatenated_seq.id
+		return concatenated_seq
+	
+	print(f"WARN: Skipping {filepath}. Segment sequences are missing")
+
+	return None
+
+def extract_sequences(filepath, target_segment):
+	sequences = list(SeqIO.parse(filepath, "fasta"))
+
+	for seq in sequences:
+		segment_search = re.search("(?:_fluviewer\||_segment\d_)([A-Z0-9]+)", seq.id)
+
+		if segment_search and segment_search.group(1) == target_segment:
+			return seq
+
+	print(f"WARN: Skipping {filepath}. Could not find segment of interest.")
+	return None
+
+def main(args):
+
+	if os.path.isfile(args.inpath):
+		print("ERROR: Expected directory of consensus.fa files as input.", file=sys.stderr)
+		exit(1)
+	
+	filepaths = glob(os.path.join(args.inpath, '*consensus*'))
+
+	if args.concat:
+		output_sequences = []
+		for filepath in filepaths:
+			concatenated_seq = concatenate_sequences(filepath)
+			if concatenated_seq:
+				output_sequences.append(concatenated_seq)
+
+
+	else:
+		output_sequences = []
+		for filepath in filepaths:
+			extracted_seq = extract_sequences(filepath, args.segment)
+			if extracted_seq:
+				output_sequences.append(extracted_seq)
+
+	SeqIO.write(output_sequences, args.outpath, "fasta")
+
+if __name__ == "__main__":
+	args = init_parser().parse_args()
+	main(args)
\ No newline at end of file
diff --git a/bin/fix_aa_muts.py b/bin/fix_aa_muts.py
new file mode 100755
index 0000000..27d58be
--- /dev/null
+++ b/bin/fix_aa_muts.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+import os, json
+import argparse
+
+
+def init_parser():
+	parser = argparse.ArgumentParser()
+	parser.add_argument( 'json',  help='aa_muts.json produced by the Augur translate step')	
+	parser.add_argument( 'outpath',  help='Fixed amino acid mutation node data in JSON format.')	
+	
+	return parser
+
+
+def main(args):
+
+	with open(args.json, 'r') as infile:
+		data = json.load(infile)
+
+
+	name = data['annotations']['HA']['seqid']
+
+	data['annotations']['nuc'] = {
+      "end": 13590,
+      "seqid": name,
+      "start": 1,
+      "strand": "+",
+      "type": "source"
+    }
+
+	with open(args.outpath, 'w') as outfile:
+		json.dump(data, outfile, indent=4)
+
+if __name__ == "__main__":
+	args = init_parser().parse_args()
+	main(args)
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 24dcea1..cb03e69 100644
--- a/main.nf
+++ b/main.nf
@@ -79,6 +79,34 @@ process definition
 ---------------------------------------------------------------------------------
 */
 
+process extract_seqs{
+  tag "Optional process to automatically extract relevant segments from FluViewer consensus.fa"
+  publishDir "${params.work_dir}/results/", mode: 'copy'
+
+  input:
+  path(sequences_path)
+
+  output:
+  path("extracted_seqs.fasta")
+
+  script:
+  concat = params.concat ? '--concat' : ''
+  segment = params.segment != 'NONE' ? "--segment ${params.segment}" : ''
+  
+  """
+  if [ ! -d ${sequences_path} ]; then
+    echo "ERROR: Provided path must be a directory when running automatic sequence extraction with --concat or --segment."
+    exit 1
+  fi
+
+  extract_sequences.py \
+  --inpath ${sequences_path} \
+  --outpath extracted_seqs.fasta \
+  ${concat} \
+  ${segment}
+  """
+}
+
 process align {
 
   tag "Aligning sequences to ${params.ref} & filling gaps with N"
@@ -126,7 +154,7 @@ process refine {
   publishDir "${params.work_dir}/results/", mode: 'copy'
 
   input:
-  tuple file(tree), file(msa)
+  tuple file(tree), file(msa), file(metadata)
 
   output:
   tuple path("tree.nwk"), path("branch_lengths.json")
@@ -135,7 +163,7 @@ process refine {
   augur refine \
       --tree ${tree} \
       --alignment ${msa} \
-      --metadata ${params.meta} \
+      --metadata ${metadata} \
       --timetree \
       --keep-root \
       --divergence-units ${params.divergence_units} \
@@ -154,14 +182,14 @@ process ancestral {
     output:
     path("nt_muts.json")
 
-        """
-        augur ancestral \
-            --tree ${refine_tree} \
-            --alignment ${msa} \
-            --output-node-data nt_muts.json \
-            --keep-overhangs \
-            --keep-ambiguous
-        """
+    """
+    augur ancestral \
+        --tree ${refine_tree} \
+        --alignment ${msa} \
+        --output-node-data nt_muts.json \
+        --keep-overhangs \
+        --keep-ambiguous
+    """
 }
 
 process translate {
@@ -183,6 +211,21 @@ process translate {
       """
 }
 
+process fix_aa_json {
+  tag "Fixing aa_muts.json for concatenated workflow."
+  publishDir "${params.work_dir}/results/", mode: 'copy'
+
+  input:
+  file(aa_muts_json)
+
+  output:
+  file("aa_muts_fix.json")
+
+  """
+  fix_aa_muts.py ${aa_muts_json} aa_muts_fix.json
+  """
+}
+
 process export {
   tag "Exporting data files for auspice"
   publishDir "${params.work_dir}/auspice", mode: 'copy'
@@ -190,23 +233,25 @@ process export {
   input:
   tuple file(refine_tree), file(branch_len), file(nt_muts),\
   file(aa_muts)
+  file(metadata)
+  tuple file(auspice_config), file(colors), file(lat_long)
 
   output:
-  file("flu_na.json")
+  file("flu.json")
 
-      """
-      export AUGUR_RECURSION_LIMIT=${params.recursion_limit}
-
-      augur export v2 \
-          --tree ${refine_tree} \
-          --metadata ${params.meta} \
-          --node-data ${branch_len} ${nt_muts} ${aa_muts} \
-          --colors ${params.colors} \
-          --lat-longs ${params.lat_long} \
-          --minify-json \
-          --auspice-config ${params.auspice} \
-          --output flu_na.json
-      """
+  """
+  export AUGUR_RECURSION_LIMIT=${params.recursion_limit}
+
+  augur export v2 \
+      --tree ${refine_tree} \
+      --metadata ${metadata} \
+      --node-data ${branch_len} ${nt_muts} ${aa_muts} \
+      --colors ${colors} \
+      --lat-longs ${lat_long} \
+      --minify-json \
+      --auspice-config ${auspice_config} \
+      --output flu.json
+  """
 }
 
 /**
@@ -218,13 +263,32 @@ workflow
 workflow {
   seq_ch = Channel.fromPath(params.seqs, checkIfExists:true)
   ref_ch = Channel.fromPath(params.ref, checkIfExists:true)
+  meta_ch = Channel.fromPath(params.meta, checkIfExists:true)
+  config_ch = Channel.fromPath([params.auspice, params.colors, params.lat_long], checkIfExists:true).collect()
+
+  if (params.concat) {
+    ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true)
+    seq_ch = extract_seqs(seq_ch)
+
+  }else if (params.segment != 'NONE') { 
+    ref_anno_ch = ref_ch
+    seq_ch = extract_seqs(seq_ch)
+
+  }else{
+    ref_anno_ch = ref_ch
+  }
 
   align(seq_ch.combine(ref_ch)) | tree
   msa_ch = align.out
-  refine(tree.out.combine(align.out))
+  refine(tree.out.combine(msa_ch).combine(meta_ch))
   ancestral(refine.out.combine(msa_ch))
-  translate(ancestral.out.combine(refine.out.combine(ref_ch)))
-  export(refine.out.combine(ancestral.out.combine(translate.out)))
+  translate(ancestral.out.combine(refine.out.combine(ref_anno_ch)))
+  
+  ch_aa_muts = translate.out
+  if (params.concat) {
+    ch_aa_muts = fix_aa_json(ch_aa_muts)
+  }
+  export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch)
 }
 
 /**
diff --git a/nextflow.config b/nextflow.config
index a97e069..2ea64c0 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -11,44 +11,53 @@ manifest {
 
 params {
 
-//help message
-help = null
+  //help message
+  help = null
 
-//version number
-version = null
+  //version number
+  version = null
 
-//conda env local cache
-conda_cache = null 
+  //conda env local cache
+  conda_cache = null 
 
-//directory containing config/ & data/ folders
-work_dir = "/path/to/data/"
+  //directory containing config/ & data/ folders
+  work_dir = "/path/to/data/"
 
-//reference for alignment
-ref = "${params.work_dir}config/Ref.gb"
+  //reference for alignment
+  ref = "${params.work_dir}/config/Ref.gb"
 
-//input sequences
-seqs = "${params.work_dir}data/sequences.fasta"
+  //reference annotation (needed for concatenated workflow only)
+  ref_anno = "${params.work_dir}/config/Ref.gff3"
 
-//metadata of input sequences
-meta = "${params.work_dir}data/metadata.csv"
+  //input sequences
+  seqs = "${params.work_dir}/data/sequences.fasta"
 
-//strains that are excluded
-drop_strains = "${params.work_dir}config/dropped_strains.txt"
+  //metadata of input sequences
+  meta = "${params.work_dir}/data/metadata.csv"
 
-//colors used in final auspice visualization
-colors = "${params.work_dir}config/colors.csv"
+  //strains that are excluded
+  drop_strains = "${params.work_dir}/config/dropped_strains.txt"
 
-//latitude and longitudes
-lat_long = "${params.work_dir}config/lat_longs.csv"
+  //colors used in final auspice visualization
+  colors = "${params.work_dir}/config/colors.csv"
 
-//details for auspice visualization
-auspice = "${params.work_dir}config/auspice_config.json"
+  //latitude and longitudes
+  lat_long = "${params.work_dir}/config/lat_longs.csv"
 
-//refining phylogeny
-divergence_units = "mutations"
+  //details for auspice visualization
+  auspice = "${params.work_dir}/config/auspice_config.json"
 
-//env variable AUGUR_RECURSION_LIMIT
-recursion_limit = 10000
+  //refining phylogeny
+  divergence_units = "mutations"
+
+  //run workflow for concatenated sequences instead of single segment
+  concat = false
+
+  //if not NONE, automate the extraction of a specific segment from fluviewer_consensus.fa files
+  segment = 'NONE'
+
+  //env variable AUGUR_RECURSION_LIMIT
+  recursion_limit = 10000
 
 }
 
@@ -56,12 +65,12 @@ recursion_limit = 10000
 //the process section of the config file. ex. AWS, SLURM, sun grid engine:
 
 process {
-withName: align {
-    cpus = 28
+  withName: align {
+      cpus = 28
+    }
+  withName: tree {
+      cpus = 28
   }
-withName: tree {
-    cpus = 28
- }
 //  penv='smp'
 //  executor='sge'
 //  memory='30 GB'
@@ -70,7 +79,7 @@ withName: tree {
 
 profiles {
     conda {
-      process.conda = "${projectDir}/environments/ENV.yml"
+      process.conda = "${projectDir}/environments/environment.yml"
       conda.createTimeout = '1 h'
       if (params.conda_cache) {
          conda.cacheDir = params.conda_cache
@@ -83,17 +92,17 @@ profiles {
 //html displaying breakdown of time taken to execute workflow
 timeline {
   enabled = true
-  file = "${params.work_dir}reports/fluflo_timeline.html"
+  file = "${params.work_dir}/reports/fluflo_timeline.html"
 }
 
 //html of cpu/mem usage
 report {
   enabled = true
-  file = "${params.work_dir}reports/fluflo_usage.html"
+  file = "${params.work_dir}/reports/fluflo_usage.html"
 }
 
 //dag of beast-flow workflow
 dag {
     enabled = true
-    file = "${params.work_dir}reports/fluflo_dag.html"
+    file = "${params.work_dir}/reports/fluflo_dag.html"
 }

From a47f4ed74844e2305a1a933c6479ca02c5c37f38 Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Wed, 6 Dec 2023 15:30:38 -0800
Subject: [PATCH 2/9] revert change to default conda

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 2ea64c0..393c369 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -79,7 +79,7 @@ process {
 
 profiles {
     conda {
-      process.conda = "${projectDir}/environments/environment.yml"
+      process.conda = "${projectDir}/environments/ENV.yml"
       conda.createTimeout = '1 h'
       if (params.conda_cache) {
          conda.cacheDir = params.conda_cache

From d585131ad51366fc88ce3a17787d34720ee461e7 Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Thu, 7 Dec 2023 12:27:15 -0800
Subject: [PATCH 3/9] updated extract_sequences.py

---
 bin/extract_sequences.py | 95 +++++++++++++++++++++++++---------------
 1 file changed, 60 insertions(+), 35 deletions(-)

diff --git a/bin/extract_sequences.py b/bin/extract_sequences.py
index 7d6fe46..eac635f 100755
--- a/bin/extract_sequences.py
+++ b/bin/extract_sequences.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3 
 import os, sys
-from Bio import SeqIO
 import re
 import argparse
 from glob import glob 
@@ -17,35 +16,64 @@ def init_parser():
 	
 	return parser
 
-def concatenate_sequences(filepath):
-	sequences = list(SeqIO.parse(filepath, "fasta"))
+def parse_fasta(filepath):	
+	seqlist = []
+	seq = ''
+	header = ''
+	with open(filepath, 'r') as handle:
+		for line in handle.readlines():
+			if line[0] == '>':
+				if header != '':
+					seqlist.append((header, seq))
+
+				header = line.strip().lstrip('>')
+				seq = ''
+			else:
+				seq += line.strip()
+		seqlist.append((header, seq))
 	
-	if len(sequences) == 8:
-		concatenated_seq = sequences[0]
-		for seq in sequences[1:]:
-			concatenated_seq += seq
-
-		# rename sequences
-		concatenated_seq.id = re.split("_fluviewer|_segment", sequences[0].id)[0]
-		concatenated_seq.name = concatenated_seq.id
-		concatenated_seq.description = concatenated_seq.id
-		return concatenated_seq
+	return seqlist
+
+def write_fasta(seqs, outpath):
+	with open(outpath, 'w') as outfile:
+		for header, seq in seqs:
+			outfile.write(">" + header + '\n')
+			outfile.write(seq + '\n')
+
+def concatenate_sequences(filepath):
+	sequences = parse_fasta(filepath)
+
+	filepath_fields = re.split("(?:_fluviewer)?[\._]consensus", os.path.basename(filepath))
+
+	if len(filepath_fields) < 2:
+		print(f"WARN: Could not extract isolate name from file path {filepath}")
+		return None 
+
+	header = filepath_fields[0]
+
+	if len(sequences) != 8:
+		print(f"WARN: Skipping {filepath}. Segment sequences are missing")
+		return None
 	
-	print(f"WARN: Skipping {filepath}. Segment sequences are missing")
+	concatenated_seq = ''
+	for _ , seq in sequences:
+		concatenated_seq += seq
+
+	return (header, concatenated_seq)
 
-	return None
 
 def extract_sequences(filepath, target_segment):
-	sequences = list(SeqIO.parse(filepath, "fasta"))
+	sequences = parse_fasta(filepath)
 
-	for seq in sequences:
-		segment_search = re.search("(?:_fluviewer\||_segment\d_)([A-Z0-9]+)", seq.id)
+	search_segments = [x for x in sequences if re.search(target_segment, x[0])] 
 
-		if segment_search and segment_search.group(1) == target_segment:
-			return seq
+	if len(search_segments) != 1: 
+		print("ERROR: Could not locate a singular sequences that matches the segment of interest. You may need to specify a more advanced regex with --segment")
+		print(search_segments)
+		return None 
+
+	return search_segments[0]
 
-	print(f"WARN: Skipping {filepath}. Could not find segment of interest.")
-	return None
 
 def main(args):
 
@@ -54,23 +82,20 @@ def main(args):
 		exit(1)
 	
 	filepaths = glob(os.path.join(args.inpath, '*consensus*'))
+	output_sequences = []
+
+	for filepath in filepaths:
 
-	if args.concat:
-		output_sequences = []
-		for filepath in filepaths:
-			concatenated_seq = concatenate_sequences(filepath)
-			if concatenated_seq:
-				output_sequences.append(concatenated_seq)
 
+		if args.concat:
+			seq = concatenate_sequences(filepath)
+		else:
+			seq = extract_sequences(filepath, args.segment)
 
-	else:
-		output_sequences = []
-		for filepath in filepaths:
-			extracted_seq = extract_sequences(filepath, args.segment)
-			if extracted_seq:
-				output_sequences.append(extracted_seq)
+		if seq:
+			output_sequences.append(seq)
 
-	SeqIO.write(output_sequences, args.outpath, "fasta")
+	write_fasta(output_sequences, args.outpath)
 
 if __name__ == "__main__":
 	args = init_parser().parse_args()

From 6bd2ad4644a369bfb1841795f244cd94606b541f Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Wed, 13 Dec 2023 08:31:38 -0800
Subject: [PATCH 4/9] reduced changes to only those necessary for concat
 workflow

---
 main.nf         | 56 +++++++++++++------------------------------------
 nextflow.config |  8 +------
 2 files changed, 16 insertions(+), 48 deletions(-)

diff --git a/main.nf b/main.nf
index cb03e69..81b43ff 100644
--- a/main.nf
+++ b/main.nf
@@ -79,34 +79,6 @@ process definition
 ---------------------------------------------------------------------------------
 */
 
-process extract_seqs{
-  tag "Optional process to automatically extract relevant segments from FluViewer consensus.fa"
-  publishDir "${params.work_dir}/results/", mode: 'copy'
-
-  input:
-  path(sequences_path)
-
-  output:
-  path("extracted_seqs.fasta")
-
-  script:
-  concat = params.concat ? '--concat' : ''
-  segment = params.segment != 'NONE' ? "--segment ${params.segment}" : ''
-  
-  """
-  if [ ! -d ${sequences_path} ]; then
-    echo "ERROR: Provided path must be a directory when running automatic sequence extraction with --concat or --segment."
-    exit 1
-  fi
-
-  extract_sequences.py \
-  --inpath ${sequences_path} \
-  --outpath extracted_seqs.fasta \
-  ${concat} \
-  ${segment}
-  """
-}
-
 process align {
 
   tag "Aligning sequences to ${params.ref} & filling gaps with N"
@@ -212,7 +184,7 @@ process translate {
 }
 
 process fix_aa_json {
-  tag "Fixing aa_muts.json for concatenated workflow."
+  tag "Fixing aa_muts.json when using a GFF3 file for augur translate."
   publishDir "${params.work_dir}/results/", mode: 'copy'
 
   input:
@@ -231,8 +203,7 @@ process export {
   publishDir "${params.work_dir}/auspice", mode: 'copy'
 
   input:
-  tuple file(refine_tree), file(branch_len), file(nt_muts),\
-  file(aa_muts)
+  tuple file(refine_tree), file(branch_len), file(nt_muts), file(aa_muts) 
   file(metadata)
   tuple file(auspice_config), file(colors), file(lat_long)
 
@@ -266,26 +237,29 @@ workflow {
   meta_ch = Channel.fromPath(params.meta, checkIfExists:true)
   config_ch = Channel.fromPath([params.auspice, params.colors, params.lat_long], checkIfExists:true).collect()
 
-  if (params.concat) {
-    ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true)
-    seq_ch = extract_seqs(seq_ch)
-
-  }else if (params.segment != 'NONE') { 
-    ref_anno_ch = ref_ch
-    seq_ch = extract_seqs(seq_ch)
 
-  }else{
+  // Catch invalid reference input combinations 
+  if (!(params.ref =~ /.+\.[Gg]b$/) && params.ref_anno == 'NO_FILE' ){                         // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format
+    error "ERROR: Extra parameter --ref_anno (.gff3 or .gb format) must be specified for augur translate if non-GenBank formatted reference is provided."
+  }else if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){     // Can only have .gff or .gb formats in the --ref_anno parameter
+    error "ERROR: Extra parameter --ref_anno must be in either .gff or .gb (GenBank) format."
+  }
+  
+  // Load the ref_anno_ch channel appropriately 
+  if (params.ref_anno == 'NO_FILE'){                                          // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch)
     ref_anno_ch = ref_ch
+  }else{                                                                      // Load new channel from scratch if different reference annotation format specified
+    ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true)
   }
 
   align(seq_ch.combine(ref_ch)) | tree
   msa_ch = align.out
   refine(tree.out.combine(msa_ch).combine(meta_ch))
   ancestral(refine.out.combine(msa_ch))
-  translate(ancestral.out.combine(refine.out.combine(ref_anno_ch)))
+  translate(ancestral.out.combine(refine.out).combine(ref_anno_ch))
   
   ch_aa_muts = translate.out
-  if (params.concat) {
+  if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) {                                     // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error)
     ch_aa_muts = fix_aa_json(ch_aa_muts)
   }
   export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch)
diff --git a/nextflow.config b/nextflow.config
index 393c369..e7577fc 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -27,7 +27,7 @@ params {
   ref = "${params.work_dir}/config/Ref.gb"
 
   //reference annotation (needed for concatenated workflow only)
-  ref_anno = "${params.work_dir}/config/Ref.gff3"
+  ref_anno = "NO_FILE"
 
   //input sequences
   seqs = "${params.work_dir}/data/sequences.fasta"
@@ -50,12 +50,6 @@ params {
   //refining phylogeny
   divergence_units = "mutations"
 
-  //run workflow for concatenated sequences instead of single segment
-  concat = false
-
-  //if not NONE, automate the extraction of a specific segment from fluviewer_consensus.fa files
-  segment = 'NONE'
-
   //env variable AUGUR_RECURSION_LIMIT
   recursion_limit = 10000
 

From f4aedbab6022b140ed01df113d2bb9a3bf638096 Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Wed, 13 Dec 2023 08:37:05 -0800
Subject: [PATCH 5/9] remove extract_sequences.py

---
 bin/extract_sequences.py | 102 ---------------------------------------
 1 file changed, 102 deletions(-)
 delete mode 100755 bin/extract_sequences.py

diff --git a/bin/extract_sequences.py b/bin/extract_sequences.py
deleted file mode 100755
index eac635f..0000000
--- a/bin/extract_sequences.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/usr/bin/env python3 
-import os, sys
-import re
-import argparse
-from glob import glob 
-
-
-def init_parser():
-	parser = argparse.ArgumentParser(description="Concatenate sequences from files.")
-	group = parser.add_mutually_exclusive_group(required=True)
-	group.add_argument("-s", "--segment", default='NONE', type=str, help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.")
-	group.add_argument("-c", "--concat", action='store_true', help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.")
-	
-	parser.add_argument("-i", "--inpath",  help="Input folder containing one consensus.fa per isolate, with up to 8 segments per file.")
-	parser.add_argument('-o', "--outpath", type=str, required=True, help="Output file name")
-	
-	return parser
-
-def parse_fasta(filepath):	
-	seqlist = []
-	seq = ''
-	header = ''
-	with open(filepath, 'r') as handle:
-		for line in handle.readlines():
-			if line[0] == '>':
-				if header != '':
-					seqlist.append((header, seq))
-
-				header = line.strip().lstrip('>')
-				seq = ''
-			else:
-				seq += line.strip()
-		seqlist.append((header, seq))
-	
-	return seqlist
-
-def write_fasta(seqs, outpath):
-	with open(outpath, 'w') as outfile:
-		for header, seq in seqs:
-			outfile.write(">" + header + '\n')
-			outfile.write(seq + '\n')
-
-def concatenate_sequences(filepath):
-	sequences = parse_fasta(filepath)
-
-	filepath_fields = re.split("(?:_fluviewer)?[\._]consensus", os.path.basename(filepath))
-
-	if len(filepath_fields) < 2:
-		print(f"WARN: Could not extract isolate name from file path {filepath}")
-		return None 
-
-	header = filepath_fields[0]
-
-	if len(sequences) != 8:
-		print(f"WARN: Skipping {filepath}. Segment sequences are missing")
-		return None
-	
-	concatenated_seq = ''
-	for _ , seq in sequences:
-		concatenated_seq += seq
-
-	return (header, concatenated_seq)
-
-
-def extract_sequences(filepath, target_segment):
-	sequences = parse_fasta(filepath)
-
-	search_segments = [x for x in sequences if re.search(target_segment, x[0])] 
-
-	if len(search_segments) != 1: 
-		print("ERROR: Could not locate a singular sequences that matches the segment of interest. You may need to specify a more advanced regex with --segment")
-		print(search_segments)
-		return None 
-
-	return search_segments[0]
-
-
-def main(args):
-
-	if os.path.isfile(args.inpath):
-		print("ERROR: Expected directory of consensus.fa files as input.", file=sys.stderr)
-		exit(1)
-	
-	filepaths = glob(os.path.join(args.inpath, '*consensus*'))
-	output_sequences = []
-
-	for filepath in filepaths:
-
-
-		if args.concat:
-			seq = concatenate_sequences(filepath)
-		else:
-			seq = extract_sequences(filepath, args.segment)
-
-		if seq:
-			output_sequences.append(seq)
-
-	write_fasta(output_sequences, args.outpath)
-
-if __name__ == "__main__":
-	args = init_parser().parse_args()
-	main(args)
\ No newline at end of file

From ad79c832e2f154c8cbd1ec42dade960c1b5b8c24 Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Wed, 13 Dec 2023 09:42:33 -0800
Subject: [PATCH 6/9] improved fix_aa_muts.py, made it more flexible

---
 bin/fix_aa_muts.py | 35 ++++++++++++++++++++---------------
 main.nf            |  8 ++++----
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/bin/fix_aa_muts.py b/bin/fix_aa_muts.py
index 27d58be..be14eff 100755
--- a/bin/fix_aa_muts.py
+++ b/bin/fix_aa_muts.py
@@ -5,30 +5,35 @@
 
 def init_parser():
 	parser = argparse.ArgumentParser()
-	parser.add_argument( 'json',  help='aa_muts.json produced by the Augur translate step')	
-	parser.add_argument( 'outpath',  help='Fixed amino acid mutation node data in JSON format.')	
+	parser.add_argument('-a', '--aa_json', required=True,  help='aa_muts.json produced by the Augur translate step')	
+	parser.add_argument('-n',  '--nt_json', required=True,  help='nt_muts.json produced by the Augur ancestral step. Needed to calculate the length of the full nucleotide sequence.')	
+	parser.add_argument('-o', '--outpath', required=True, help='Fixed amino acid mutation node data in JSON format.')	
 	
 	return parser
 
 
 def main(args):
 
-	with open(args.json, 'r') as infile:
-		data = json.load(infile)
+	with open(args.aa_json, 'r') as infile:
+		aa_data = json.load(infile)
 
-
-	name = data['annotations']['HA']['seqid']
-
-	data['annotations']['nuc'] = {
-      "end": 13590,
-      "seqid": name,
-      "start": 1,
-      "strand": "+",
-      "type": "source"
-    }
+	with open(args.nt_json, 'r') as infile:
+		reference_length = len(json.load(infile)['reference']['nuc'])
+	
+	if 'nuc' not in aa_data['annotations']:
+		key = list(aa_data['annotations'].keys())[0]
+		name = aa_data['annotations'][key]['seqid']
+
+		aa_data['annotations']['nuc'] = {
+		"end": reference_length,
+		"seqid": name,
+		"start": 1,
+		"strand": "+",
+		"type": "source"
+		}
 
 	with open(args.outpath, 'w') as outfile:
-		json.dump(data, outfile, indent=4)
+		json.dump(aa_data, outfile, indent=4)
 
 if __name__ == "__main__":
 	args = init_parser().parse_args()
diff --git a/main.nf b/main.nf
index 81b43ff..f164a47 100644
--- a/main.nf
+++ b/main.nf
@@ -188,13 +188,13 @@ process fix_aa_json {
   publishDir "${params.work_dir}/results/", mode: 'copy'
 
   input:
-  file(aa_muts_json)
+  tuple file(aa_muts_json), file(nt_muts_json)
 
   output:
   file("aa_muts_fix.json")
 
   """
-  fix_aa_muts.py ${aa_muts_json} aa_muts_fix.json
+  fix_aa_muts.py --aa_json ${aa_muts_json} --nt_json ${nt_muts_json} --outpath aa_muts_fix.json
   """
 }
 
@@ -259,8 +259,8 @@ workflow {
   translate(ancestral.out.combine(refine.out).combine(ref_anno_ch))
   
   ch_aa_muts = translate.out
-  if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) {                                     // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error)
-    ch_aa_muts = fix_aa_json(ch_aa_muts)
+  if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) {        // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error)
+    ch_aa_muts = fix_aa_json(ch_aa_muts.combine(ancestral.out))
   }
   export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch)
 }

From ecaa086b69a29abb099b1ea68bf875605bc9ab48 Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Mon, 18 Dec 2023 20:21:53 -0800
Subject: [PATCH 7/9] updated README

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b165f58..4fef0dd 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,9 @@ each run is the input directory, which can be specified with the --work_dir flag
 command line.
 
 - Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta]
-- Reference genome used to align reads to during guided assembly [./config/Ref.gb]
+- One of the two following configurations is required to describe the reference genome:
+  1. A single GenBank file describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR 
+  2. A reference sequence in FASTA format [./config/Ref.fasta] AND a reference annotation in GFF format [./config/Ref.gff3].
 - File containing metadata for sequences under analysis [./data/metadata.csv]
 - Excluded strains/ samples [./config/dropped_strains.txt]
 - Colors used in final auspice visualization [./config/colors.csv]

From bb94881bd36fc0c9e4d48212c3309132ae4a0a6f Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Tue, 19 Dec 2023 06:49:19 -0800
Subject: [PATCH 8/9] minor improvements / changes

---
 main.nf         | 17 +++++++++++------
 nextflow.config |  2 +-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/main.nf b/main.nf
index f164a47..8d09fd5 100644
--- a/main.nf
+++ b/main.nf
@@ -26,7 +26,8 @@ Mandatory arguments:
 
 Optional arguments:
  --seqs                         Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta]
- --ref                          Reference genome used to align reads to during guided assembly [./config/Ref.gb]
+ --ref                          Reference genome used to align reads to during guided assembly [./config/Ref.gb (default); *.fasta file also accepted here]
+ --ref_anno                     Reference genome annotation file; only required when using a FASTA under --ref [*.gb / *.gff file accepted here]
  --meta                         File containing metadata for sequences under analysis [./data/metadata.csv]
  --drop_strains                 Excluded strains/ samples [./config/dropped_strains.txt]
  --colors                       Colors used in final auspice visualization [./config/colors.csv]
@@ -239,14 +240,17 @@ workflow {
 
 
   // Catch invalid reference input combinations 
-  if (!(params.ref =~ /.+\.[Gg]b$/) && params.ref_anno == 'NO_FILE' ){                         // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format
-    error "ERROR: Extra parameter --ref_anno (.gff3 or .gb format) must be specified for augur translate if non-GenBank formatted reference is provided."
-  }else if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){     // Can only have .gff or .gb formats in the --ref_anno parameter
-    error "ERROR: Extra parameter --ref_anno must be in either .gff or .gb (GenBank) format."
+  ref_gb_format = (params.ref =~ /.+\.[Gg]b$/)
+
+  if (!ref_gb_format && params.ref_anno == 'NO_FILE' ){                         // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format
+    error "ERROR: Parameter --ref_anno (.gff3 or .gb format) must be specified if non-GenBank formatted reference is provided under --ref."
+  }
+  if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){     // Can only have .gff or .gb formats in the --ref_anno parameter
+    error "ERROR: Parameter --ref_anno must be in either .gff or .gb (GenBank) format."
   }
   
   // Load the ref_anno_ch channel appropriately 
-  if (params.ref_anno == 'NO_FILE'){                                          // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch)
+  if (ref_gb_format){                                                         // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch)
     ref_anno_ch = ref_ch
   }else{                                                                      // Load new channel from scratch if different reference annotation format specified
     ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true)
@@ -259,6 +263,7 @@ workflow {
   translate(ancestral.out.combine(refine.out).combine(ref_anno_ch))
   
   ch_aa_muts = translate.out
+
   if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) {        // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error)
     ch_aa_muts = fix_aa_json(ch_aa_muts.combine(ancestral.out))
   }
diff --git a/nextflow.config b/nextflow.config
index e7577fc..b4cb71f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -26,7 +26,7 @@ params {
   //reference for alignment
   ref = "${params.work_dir}/config/Ref.gb"
 
-  //reference annotation (needed for concatenated workflow only)
+  //reference annotation 
   ref_anno = "NO_FILE"
 
   //input sequences

From 38cfe5b337f7614bb83d2532921c890dd0621bc2 Mon Sep 17 00:00:00 2001
From: jpalmer37 <john.palmer1288@gmail.com>
Date: Tue, 19 Dec 2023 06:54:02 -0800
Subject: [PATCH 9/9] minor README changes

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4fef0dd..ee1f216 100644
--- a/README.md
+++ b/README.md
@@ -84,8 +84,8 @@ command line.
 
 - Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta]
 - One of the two following configurations is required to describe the reference genome:
-  1. A single GenBank file describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR 
-  2. A reference sequence in FASTA format [./config/Ref.fasta] AND a reference annotation in GFF format [./config/Ref.gff3].
+  1. A single GenBank file passed to `--ref` describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR 
+  2. A reference sequence in FASTA format under `--ref` AND a reference annotation in GFF format under `--ref_anno`.
 - File containing metadata for sequences under analysis [./data/metadata.csv]
 - Excluded strains/ samples [./config/dropped_strains.txt]
 - Colors used in final auspice visualization [./config/colors.csv]