feat: output genome annotations for complex CDS

Auspice now supports multi-part CDS annotations This PR makes `augur translate` output the correct annotation for complex CDSs Complex CDSs are detected through `type(feat.location)` being `SeqFeature.CompountLocation` Currently, this only works with genbank files, as our load_features utility function does not deal with compound CDS in GFF files correctly I tested this with MERSs complex ORF1ab Partially resolves #1280
nextstrain · Nov 4, 2023 · 6fa5ed5 · 6fa5ed5
1 parent 378353e
commit 6fa5ed5
Showing 1 changed file with 11 additions and 5 deletions.
diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -28,6 +28,7 @@
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import CompoundLocation
 from .utils import read_tree, InvalidTreeError, write_json, get_json_name
 from treetime.vcf_utils import read_vcf, write_vcf
 from collections import defaultdict
@@ -332,14 +333,19 @@ def run(args):
                     node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)
 
             anc_seqs['reference'][gene] = aa_result['root_seq']
-            # FIXME: Note that this is calculating the end of the CDS as 3*length of translation
-            # this is equivalent to the annotation for single segment CDS, but not for cds
-            # with splicing and slippage. But auspice can't handle the latter at the moment.
             anc_seqs['annotations'][gene] = {'seqid':args.annotation,
                                              'type':feat.type,
-                                             'start': int(feat.location.start)+1,
-                                             'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
                                              'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
+            if feat.type == "CDS" and type(feat.location) == CompoundLocation:
+                location = []
+                for seg in feat.location.parts:
+                    location.append({"start": int(seg.start)+1, "end": int(seg.end)})
+                anc_seqs['annotations'][gene]["segments"] = location
+            else:
+                anc_seqs['annotations'][gene].update({
+                    'start': int(feat.location.start)+1,
+                    'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
+                })
 
             # Save ancestral amino acid sequences to FASTA.
             if args.output_translations: