Skip to content

Commit

Permalink
feat: output genome annotations for complex CDS
Browse files Browse the repository at this point in the history
Auspice now supports multi-part CDS annotations
This PR makes `augur translate` output the correct annotation for complex CDSs

Complex CDSs are detected through `type(feat.location)` being `SeqFeature.CompountLocation`

Currently, this only works with genbank files, as our load_features utility function does not
deal with compound CDS in GFF files correctly

I tested this with MERSs complex ORF1ab

Partially resolves #1280
  • Loading branch information
corneliusroemer committed Nov 4, 2023
1 parent 378353e commit 6fa5ed5
Showing 1 changed file with 11 additions and 5 deletions.
16 changes: 11 additions & 5 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import CompoundLocation
from .utils import read_tree, InvalidTreeError, write_json, get_json_name
from treetime.vcf_utils import read_vcf, write_vcf
from collections import defaultdict
Expand Down Expand Up @@ -332,14 +333,19 @@ def run(args):
node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)

anc_seqs['reference'][gene] = aa_result['root_seq']
# FIXME: Note that this is calculating the end of the CDS as 3*length of translation
# this is equivalent to the annotation for single segment CDS, but not for cds
# with splicing and slippage. But auspice can't handle the latter at the moment.
anc_seqs['annotations'][gene] = {'seqid':args.annotation,
'type':feat.type,
'start': int(feat.location.start)+1,
'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
if feat.type == "CDS" and type(feat.location) == CompoundLocation:
location = []
for seg in feat.location.parts:
location.append({"start": int(seg.start)+1, "end": int(seg.end)})
anc_seqs['annotations'][gene]["segments"] = location
else:
anc_seqs['annotations'][gene].update({
'start': int(feat.location.start)+1,
'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
})

# Save ancestral amino acid sequences to FASTA.
if args.output_translations:
Expand Down

0 comments on commit 6fa5ed5

Please sign in to comment.