Merge pull request #6 from jpalmer37/dev

separate reference parameters to allow non-genbank file formats
BCCDC-PHL · Dec 27, 2023 · 16878b9 · 16878b9
2 parents 1092768 + 38cfe5b
commit 16878b9
Show file tree

Hide file tree

Showing 4 changed files with 153 additions and 65 deletions.
diff --git a/README.md b/README.md
@@ -83,7 +83,9 @@ each run is the input directory, which can be specified with the --work_dir flag
 command line.
 
 - Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta]
-- Reference genome used to align reads to during guided assembly [./config/Ref.gb]
+- One of the two following configurations is required to describe the reference genome:
+  1. A single GenBank file passed to `--ref` describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR 
+  2. A reference sequence in FASTA format under `--ref` AND a reference annotation in GFF format under `--ref_anno`.
 - File containing metadata for sequences under analysis [./data/metadata.csv]
 - Excluded strains/ samples [./config/dropped_strains.txt]
 - Colors used in final auspice visualization [./config/colors.csv]

diff --git a/bin/fix_aa_muts.py b/bin/fix_aa_muts.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+import os, json
+import argparse
+
+
+def init_parser():
+	parser = argparse.ArgumentParser()
+	parser.add_argument('-a', '--aa_json', required=True,  help='aa_muts.json produced by the Augur translate step')	
+	parser.add_argument('-n',  '--nt_json', required=True,  help='nt_muts.json produced by the Augur ancestral step. Needed to calculate the length of the full nucleotide sequence.')	
+	parser.add_argument('-o', '--outpath', required=True, help='Fixed amino acid mutation node data in JSON format.')	
+
+	return parser
+
+
+def main(args):
+
+	with open(args.aa_json, 'r') as infile:
+		aa_data = json.load(infile)
+
+	with open(args.nt_json, 'r') as infile:
+		reference_length = len(json.load(infile)['reference']['nuc'])
+
+	if 'nuc' not in aa_data['annotations']:
+		key = list(aa_data['annotations'].keys())[0]
+		name = aa_data['annotations'][key]['seqid']
+
+		aa_data['annotations']['nuc'] = {
+		"end": reference_length,
+		"seqid": name,
+		"start": 1,
+		"strand": "+",
+		"type": "source"
+		}
+
+	with open(args.outpath, 'w') as outfile:
+		json.dump(aa_data, outfile, indent=4)
+
+if __name__ == "__main__":
+	args = init_parser().parse_args()
+	main(args)
diff --git a/main.nf b/main.nf
@@ -26,7 +26,8 @@ Mandatory arguments:
 
 Optional arguments:
  --seqs                         Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta]
- --ref                          Reference genome used to align reads to during guided assembly [./config/Ref.gb]
+ --ref                          Reference genome used to align reads to during guided assembly [./config/Ref.gb (default); *.fasta file also accepted here]
+ --ref_anno                     Reference genome annotation file; only required when using a FASTA under --ref [*.gb / *.gff file accepted here]
  --meta                         File containing metadata for sequences under analysis [./data/metadata.csv]
  --drop_strains                 Excluded strains/ samples [./config/dropped_strains.txt]
  --colors                       Colors used in final auspice visualization [./config/colors.csv]
@@ -126,7 +127,7 @@ process refine {
   publishDir "${params.work_dir}/results/", mode: 'copy'
 
   input:
-  tuple file(tree), file(msa)
+  tuple file(tree), file(msa), file(metadata)
 
   output:
   tuple path("tree.nwk"), path("branch_lengths.json")
@@ -135,7 +136,7 @@ process refine {
   augur refine \
       --tree ${tree} \
       --alignment ${msa} \
-      --metadata ${params.meta} \
+      --metadata ${metadata} \
       --timetree \
       --keep-root \
       --divergence-units ${params.divergence_units} \
@@ -154,14 +155,14 @@ process ancestral {
     output:
     path("nt_muts.json")
 
-        """
-        augur ancestral \
-            --tree ${refine_tree} \
-            --alignment ${msa} \
-            --output-node-data nt_muts.json \
-            --keep-overhangs \
-            --keep-ambiguous
-        """
+    """
+    augur ancestral \
+        --tree ${refine_tree} \
+        --alignment ${msa} \
+        --output-node-data nt_muts.json \
+        --keep-overhangs \
+        --keep-ambiguous
+    """
 }
 
 process translate {
@@ -183,30 +184,46 @@ process translate {
       """
 }
 
+process fix_aa_json {
+  tag "Fixing aa_muts.json when using a GFF3 file for augur translate."
+  publishDir "${params.work_dir}/results/", mode: 'copy'
+
+  input:
+  tuple file(aa_muts_json), file(nt_muts_json)
+
+  output:
+  file("aa_muts_fix.json")
+
+  """
+  fix_aa_muts.py --aa_json ${aa_muts_json} --nt_json ${nt_muts_json} --outpath aa_muts_fix.json
+  """
+}
+
 process export {
   tag "Exporting data files for auspice"
   publishDir "${params.work_dir}/auspice", mode: 'copy'
 
   input:
-  tuple file(refine_tree), file(branch_len), file(nt_muts),\
-  file(aa_muts)
+  tuple file(refine_tree), file(branch_len), file(nt_muts), file(aa_muts) 
+  file(metadata)
+  tuple file(auspice_config), file(colors), file(lat_long)
 
   output:
-  file("flu_na.json")
+  file("flu.json")
 
-      """
-      export AUGUR_RECURSION_LIMIT=${params.recursion_limit}
-
-      augur export v2 \
-          --tree ${refine_tree} \
-          --metadata ${params.meta} \
-          --node-data ${branch_len} ${nt_muts} ${aa_muts} \
-          --colors ${params.colors} \
-          --lat-longs ${params.lat_long} \
-          --minify-json \
-          --auspice-config ${params.auspice} \
-          --output flu_na.json
-      """
+  """
+  export AUGUR_RECURSION_LIMIT=${params.recursion_limit}
+
+  augur export v2 \
+      --tree ${refine_tree} \
+      --metadata ${metadata} \
+      --node-data ${branch_len} ${nt_muts} ${aa_muts} \
+      --colors ${colors} \
+      --lat-longs ${lat_long} \
+      --minify-json \
+      --auspice-config ${auspice_config} \
+      --output flu.json
+  """
 }
 
 /**
@@ -218,13 +235,39 @@ workflow
 workflow {
   seq_ch = Channel.fromPath(params.seqs, checkIfExists:true)
   ref_ch = Channel.fromPath(params.ref, checkIfExists:true)
+  meta_ch = Channel.fromPath(params.meta, checkIfExists:true)
+  config_ch = Channel.fromPath([params.auspice, params.colors, params.lat_long], checkIfExists:true).collect()
+
+
+  // Catch invalid reference input combinations 
+  ref_gb_format = (params.ref =~ /.+\.[Gg]b$/)
+
+  if (!ref_gb_format && params.ref_anno == 'NO_FILE' ){                         // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format
+    error "ERROR: Parameter --ref_anno (.gff3 or .gb format) must be specified if non-GenBank formatted reference is provided under --ref."
+  }
+  if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){     // Can only have .gff or .gb formats in the --ref_anno parameter
+    error "ERROR: Parameter --ref_anno must be in either .gff or .gb (GenBank) format."
+  }
+
+  // Load the ref_anno_ch channel appropriately 
+  if (ref_gb_format){                                                         // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch)
+    ref_anno_ch = ref_ch
+  }else{                                                                      // Load new channel from scratch if different reference annotation format specified
+    ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true)
+  }
 
   align(seq_ch.combine(ref_ch)) | tree
   msa_ch = align.out
-  refine(tree.out.combine(align.out))
+  refine(tree.out.combine(msa_ch).combine(meta_ch))
   ancestral(refine.out.combine(msa_ch))
-  translate(ancestral.out.combine(refine.out.combine(ref_ch)))
-  export(refine.out.combine(ancestral.out.combine(translate.out)))
+  translate(ancestral.out.combine(refine.out).combine(ref_anno_ch))
+
+  ch_aa_muts = translate.out
+
+  if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) {        // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error)
+    ch_aa_muts = fix_aa_json(ch_aa_muts.combine(ancestral.out))
+  }
+  export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch)
 }
 
 /**

diff --git a/nextflow.config b/nextflow.config
@@ -11,57 +11,60 @@ manifest {
 
 params {
 
-//help message
-help = null
+  //help message
+  help = null
 
-//version number
-version = null
+  //version number
+  version = null
 
-//conda env local cache
-conda_cache = null 
+  //conda env local cache
+  conda_cache = null 
 
-//directory containing config/ & data/ folders
-work_dir = "/path/to/data/"
+  //directory containing config/ & data/ folders
+  work_dir = "/path/to/data/"
 
-//reference for alignment
-ref = "${params.work_dir}config/Ref.gb"
+  //reference for alignment
+  ref = "${params.work_dir}/config/Ref.gb"
 
-//input sequences
-seqs = "${params.work_dir}data/sequences.fasta"
+  //reference annotation 
+  ref_anno = "NO_FILE"
 
-//metadata of input sequences
-meta = "${params.work_dir}data/metadata.csv"
+  //input sequences
+  seqs = "${params.work_dir}/data/sequences.fasta"
 
-//strains that are excluded
-drop_strains = "${params.work_dir}config/dropped_strains.txt"
+  //metadata of input sequences
+  meta = "${params.work_dir}/data/metadata.csv"
 
-//colors used in final auspice visualization
-colors = "${params.work_dir}config/colors.csv"
+  //strains that are excluded
+  drop_strains = "${params.work_dir}/config/dropped_strains.txt"
 
-//latitude and longitudes
-lat_long = "${params.work_dir}config/lat_longs.csv"
+  //colors used in final auspice visualization
+  colors = "${params.work_dir}/config/colors.csv"
 
-//details for auspice visualization
-auspice = "${params.work_dir}config/auspice_config.json"
+  //latitude and longitudes
+  lat_long = "${params.work_dir}/config/lat_longs.csv"
 
-//refining phylogeny
-divergence_units = "mutations"
+  //details for auspice visualization
+  auspice = "${params.work_dir}/config/auspice_config.json"
 
-//env variable AUGUR_RECURSION_LIMIT
-recursion_limit = 10000
+  //refining phylogeny
+  divergence_units = "mutations"
+
+  //env variable AUGUR_RECURSION_LIMIT
+  recursion_limit = 10000
 
 }
 
 //seamlessly run pipeline on different execution systems by modifying
 //the process section of the config file. ex. AWS, SLURM, sun grid engine:
 
 process {
-withName: align {
-    cpus = 28
+  withName: align {
+      cpus = 28
+    }
+  withName: tree {
+      cpus = 28
   }
-withName: tree {
-    cpus = 28
- }
 //  penv='smp'
 //  executor='sge'
 //  memory='30 GB'
@@ -83,17 +86,17 @@ profiles {
 //html displaying breakdown of time taken to execute workflow
 timeline {
   enabled = true
-  file = "${params.work_dir}reports/fluflo_timeline.html"
+  file = "${params.work_dir}/reports/fluflo_timeline.html"
 }
 
 //html of cpu/mem usage
 report {
   enabled = true
-  file = "${params.work_dir}reports/fluflo_usage.html"
+  file = "${params.work_dir}/reports/fluflo_usage.html"
 }
 
 //dag of beast-flow workflow
 dag {
     enabled = true
-    file = "${params.work_dir}reports/fluflo_dag.html"
+    file = "${params.work_dir}/reports/fluflo_dag.html"
 }