Skip to content

Commit

Permalink
Merge pull request #6 from jpalmer37/dev
Browse files Browse the repository at this point in the history
separate reference parameters to allow non-genbank file formats
  • Loading branch information
j3551ca authored Dec 27, 2023
2 parents 1092768 + 38cfe5b commit 16878b9
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 65 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ each run is the input directory, which can be specified with the --work_dir flag
command line.

- Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta]
- Reference genome used to align reads to during guided assembly [./config/Ref.gb]
- One of the two following configurations is required to describe the reference genome:
1. A single GenBank file passed to `--ref` describing the reference genome used for both alignment and amino acid annotation [./config/Ref.gb] OR
2. A reference sequence in FASTA format under `--ref` AND a reference annotation in GFF format under `--ref_anno`.
- File containing metadata for sequences under analysis [./data/metadata.csv]
- Excluded strains/ samples [./config/dropped_strains.txt]
- Colors used in final auspice visualization [./config/colors.csv]
Expand Down
40 changes: 40 additions & 0 deletions bin/fix_aa_muts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
import os, json
import argparse


def init_parser():
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--aa_json', required=True, help='aa_muts.json produced by the Augur translate step')
parser.add_argument('-n', '--nt_json', required=True, help='nt_muts.json produced by the Augur ancestral step. Needed to calculate the length of the full nucleotide sequence.')
parser.add_argument('-o', '--outpath', required=True, help='Fixed amino acid mutation node data in JSON format.')

return parser


def main(args):

with open(args.aa_json, 'r') as infile:
aa_data = json.load(infile)

with open(args.nt_json, 'r') as infile:
reference_length = len(json.load(infile)['reference']['nuc'])

if 'nuc' not in aa_data['annotations']:
key = list(aa_data['annotations'].keys())[0]
name = aa_data['annotations'][key]['seqid']

aa_data['annotations']['nuc'] = {
"end": reference_length,
"seqid": name,
"start": 1,
"strand": "+",
"type": "source"
}

with open(args.outpath, 'w') as outfile:
json.dump(aa_data, outfile, indent=4)

if __name__ == "__main__":
args = init_parser().parse_args()
main(args)
103 changes: 73 additions & 30 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ Mandatory arguments:
Optional arguments:
--seqs Multi-fasta file containing consensus sequences of interest [./data/sequences.fasta]
--ref Reference genome used to align reads to during guided assembly [./config/Ref.gb]
--ref Reference genome used to align reads to during guided assembly [./config/Ref.gb (default); *.fasta file also accepted here]
--ref_anno Reference genome annotation file; only required when using a FASTA under --ref [*.gb / *.gff file accepted here]
--meta File containing metadata for sequences under analysis [./data/metadata.csv]
--drop_strains Excluded strains/ samples [./config/dropped_strains.txt]
--colors Colors used in final auspice visualization [./config/colors.csv]
Expand Down Expand Up @@ -126,7 +127,7 @@ process refine {
publishDir "${params.work_dir}/results/", mode: 'copy'

input:
tuple file(tree), file(msa)
tuple file(tree), file(msa), file(metadata)

output:
tuple path("tree.nwk"), path("branch_lengths.json")
Expand All @@ -135,7 +136,7 @@ process refine {
augur refine \
--tree ${tree} \
--alignment ${msa} \
--metadata ${params.meta} \
--metadata ${metadata} \
--timetree \
--keep-root \
--divergence-units ${params.divergence_units} \
Expand All @@ -154,14 +155,14 @@ process ancestral {
output:
path("nt_muts.json")

"""
augur ancestral \
--tree ${refine_tree} \
--alignment ${msa} \
--output-node-data nt_muts.json \
--keep-overhangs \
--keep-ambiguous
"""
"""
augur ancestral \
--tree ${refine_tree} \
--alignment ${msa} \
--output-node-data nt_muts.json \
--keep-overhangs \
--keep-ambiguous
"""
}

process translate {
Expand All @@ -183,30 +184,46 @@ process translate {
"""
}

process fix_aa_json {
tag "Fixing aa_muts.json when using a GFF3 file for augur translate."
publishDir "${params.work_dir}/results/", mode: 'copy'

input:
tuple file(aa_muts_json), file(nt_muts_json)

output:
file("aa_muts_fix.json")

"""
fix_aa_muts.py --aa_json ${aa_muts_json} --nt_json ${nt_muts_json} --outpath aa_muts_fix.json
"""
}

process export {
tag "Exporting data files for auspice"
publishDir "${params.work_dir}/auspice", mode: 'copy'

input:
tuple file(refine_tree), file(branch_len), file(nt_muts),\
file(aa_muts)
tuple file(refine_tree), file(branch_len), file(nt_muts), file(aa_muts)
file(metadata)
tuple file(auspice_config), file(colors), file(lat_long)

output:
file("flu_na.json")
file("flu.json")

"""
export AUGUR_RECURSION_LIMIT=${params.recursion_limit}
augur export v2 \
--tree ${refine_tree} \
--metadata ${params.meta} \
--node-data ${branch_len} ${nt_muts} ${aa_muts} \
--colors ${params.colors} \
--lat-longs ${params.lat_long} \
--minify-json \
--auspice-config ${params.auspice} \
--output flu_na.json
"""
"""
export AUGUR_RECURSION_LIMIT=${params.recursion_limit}
augur export v2 \
--tree ${refine_tree} \
--metadata ${metadata} \
--node-data ${branch_len} ${nt_muts} ${aa_muts} \
--colors ${colors} \
--lat-longs ${lat_long} \
--minify-json \
--auspice-config ${auspice_config} \
--output flu.json
"""
}

/**
Expand All @@ -218,13 +235,39 @@ workflow
workflow {
seq_ch = Channel.fromPath(params.seqs, checkIfExists:true)
ref_ch = Channel.fromPath(params.ref, checkIfExists:true)
meta_ch = Channel.fromPath(params.meta, checkIfExists:true)
config_ch = Channel.fromPath([params.auspice, params.colors, params.lat_long], checkIfExists:true).collect()


// Catch invalid reference input combinations
ref_gb_format = (params.ref =~ /.+\.[Gg]b$/)

if (!ref_gb_format && params.ref_anno == 'NO_FILE' ){ // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format
error "ERROR: Parameter --ref_anno (.gff3 or .gb format) must be specified if non-GenBank formatted reference is provided under --ref."
}
if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){ // Can only have .gff or .gb formats in the --ref_anno parameter
error "ERROR: Parameter --ref_anno must be in either .gff or .gb (GenBank) format."
}

// Load the ref_anno_ch channel appropriately
if (ref_gb_format){ // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch)
ref_anno_ch = ref_ch
}else{ // Load new channel from scratch if different reference annotation format specified
ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true)
}

align(seq_ch.combine(ref_ch)) | tree
msa_ch = align.out
refine(tree.out.combine(align.out))
refine(tree.out.combine(msa_ch).combine(meta_ch))
ancestral(refine.out.combine(msa_ch))
translate(ancestral.out.combine(refine.out.combine(ref_ch)))
export(refine.out.combine(ancestral.out.combine(translate.out)))
translate(ancestral.out.combine(refine.out).combine(ref_anno_ch))

ch_aa_muts = translate.out

if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) { // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error)
ch_aa_muts = fix_aa_json(ch_aa_muts.combine(ancestral.out))
}
export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch)
}

/**
Expand Down
71 changes: 37 additions & 34 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,57 +11,60 @@ manifest {

params {

//help message
help = null
//help message
help = null

//version number
version = null
//version number
version = null

//conda env local cache
conda_cache = null
//conda env local cache
conda_cache = null

//directory containing config/ & data/ folders
work_dir = "/path/to/data/"
//directory containing config/ & data/ folders
work_dir = "/path/to/data/"

//reference for alignment
ref = "${params.work_dir}config/Ref.gb"
//reference for alignment
ref = "${params.work_dir}/config/Ref.gb"

//input sequences
seqs = "${params.work_dir}data/sequences.fasta"
//reference annotation
ref_anno = "NO_FILE"

//metadata of input sequences
meta = "${params.work_dir}data/metadata.csv"
//input sequences
seqs = "${params.work_dir}/data/sequences.fasta"

//strains that are excluded
drop_strains = "${params.work_dir}config/dropped_strains.txt"
//metadata of input sequences
meta = "${params.work_dir}/data/metadata.csv"

//colors used in final auspice visualization
colors = "${params.work_dir}config/colors.csv"
//strains that are excluded
drop_strains = "${params.work_dir}/config/dropped_strains.txt"

//latitude and longitudes
lat_long = "${params.work_dir}config/lat_longs.csv"
//colors used in final auspice visualization
colors = "${params.work_dir}/config/colors.csv"

//details for auspice visualization
auspice = "${params.work_dir}config/auspice_config.json"
//latitude and longitudes
lat_long = "${params.work_dir}/config/lat_longs.csv"

//refining phylogeny
divergence_units = "mutations"
//details for auspice visualization
auspice = "${params.work_dir}/config/auspice_config.json"

//env variable AUGUR_RECURSION_LIMIT
recursion_limit = 10000
//refining phylogeny
divergence_units = "mutations"

//env variable AUGUR_RECURSION_LIMIT
recursion_limit = 10000

}

//seamlessly run pipeline on different execution systems by modifying
//the process section of the config file. ex. AWS, SLURM, sun grid engine:

process {
withName: align {
cpus = 28
withName: align {
cpus = 28
}
withName: tree {
cpus = 28
}
withName: tree {
cpus = 28
}
// penv='smp'
// executor='sge'
// memory='30 GB'
Expand All @@ -83,17 +86,17 @@ profiles {
//html displaying breakdown of time taken to execute workflow
timeline {
enabled = true
file = "${params.work_dir}reports/fluflo_timeline.html"
file = "${params.work_dir}/reports/fluflo_timeline.html"
}

//html of cpu/mem usage
report {
enabled = true
file = "${params.work_dir}reports/fluflo_usage.html"
file = "${params.work_dir}/reports/fluflo_usage.html"
}

//dag of beast-flow workflow
dag {
enabled = true
file = "${params.work_dir}reports/fluflo_dag.html"
file = "${params.work_dir}/reports/fluflo_dag.html"
}

0 comments on commit 16878b9

Please sign in to comment.