Merge TranslateGene

labgem · Apr 11, 2024 · 126580c · 126580c
2 parents 16d933e + 9aaac23
commit 126580c
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 29 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -77,7 +77,7 @@ jobs:
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot -c 1 --keep_tmp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud --threads 1 --keep_tmp 
 
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log

diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
@@ -20,7 +20,6 @@
 from ppanggolin.genome import Organism, Gene, RNA, Contig
 from ppanggolin.utils import is_compressed, read_compressed_or_not
 
-
 contig_counter: Value = Value('i', 0)
 
 
@@ -104,7 +103,7 @@ def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int =
 
     if not use_meta:
         gene_finder.train(*contig_sequences.values(), force_nonsd=False,
-                        translation_table=code)  # -g: Specify a translation table to use (default 11).
+                          translation_table=code)  # -g: Specify a translation table to use (default 11).
     gene_counter = 1
     for contig_name, sequence in sequences.items():
         for pred in gene_finder.find_genes(sequence):
@@ -293,8 +292,7 @@ def overlap_filter(all_genes: defaultdict, allow_overlap: bool = False) -> defau
 
 
 def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str:
-    """
-    Return the gene sequence
+    """Return the gene sequence
 
     :param contig_seq: Contig sequence
     :param gene: Gene
@@ -304,14 +302,15 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str:
 
     # check contig coordinate is in scope of contig seq length
     highest_position = max((stop for _, stop in gene.coordinates))
-    assert highest_position <= len(contig_seq), f"Gene coordinates exceed contig length. gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}"
+    assert highest_position <= len(
+        contig_seq), f"Gene coordinates exceed contig length. gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}"
 
     # Extract gene seq
     seq = ''.join([contig_seq[start - 1:stop] for start, stop in gene.coordinates])
-    
+
     # check length of extracted seq
     assert len(seq) == len(gene), ("The gene sequence extracted from the contig does not have the expected length: "
-                                  f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt ")
+                                   f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt ")
 
     if gene.strand == "+":
         return seq
@@ -348,7 +347,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str
         max_contig_len = max(len(contig) for contig in org.contigs)
         if max_contig_len < 20000:  # case of short sequence
             use_meta = True
-            logging.getLogger("PPanGGOLiN").info(f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.")
+            logging.getLogger("PPanGGOLiN").info(
+                f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.")
 
         else:
             use_meta = False

diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py
@@ -15,6 +15,7 @@
 from tqdm import tqdm
 
 # local libraries
+from ppanggolin.genome import Gene
 from ppanggolin.geneFamily import GeneFamily
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.utils import mk_outdir, restricted_float
@@ -95,33 +96,36 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core",
         return families
 
 
-def translate(seq: str, code: Dict[str, Dict[str, str]]) -> str:
+def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]:
     """translates the given dna sequence with the given translation table
 
-    :param seq: given dna sequence
+    :param gene: given gene
     :param code: translation table corresponding to genetic code to use
 
     :return: protein sequence
     """
     # code:  https://www.bioinformatics.org/sms/iupac.html
     start_table = code["start_table"]
     table = code["trans_table"]
-
-    if len(seq) % 3 == 0:
-        protein = start_table[seq[0: 3]]
-        for i in range(3, len(seq), 3):
-            codon = seq[i: i + 3]
-            try:
-                protein += table[codon]
-            except KeyError:  # codon was not planned for. Probably can't determine it.
-                protein += 'X'  # X is for unknown
-    else:
-        raise IndexError("Given sequence length modulo 3 was different than 0, which is unexpected.")
-    return protein
+    mod = len(gene.dna) % 3
+    partial = False
+    if mod != 0:
+        partial = True
+        msg = (f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}"
+               f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.")
+        logging.getLogger("PPANGGOLIN").debug(msg)
+    protein = start_table[gene.dna[0: 3]]
+    for i in range(3, len(gene.dna) - mod, 3):
+        codon = gene.dna[i: i + 3]
+        try:
+            protein += table[codon]
+        except KeyError:  # codon was not planned for. Probably can't determine it.
+            protein += 'X'  # X is for unknown
+    return protein, partial
 
 
 def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: Dict[str, Dict[str, str]],
-                         source: str = 'protein', use_gene_id: bool = False) -> Path:
+                         source: str = 'protein', use_gene_id: bool = False) -> Tuple[Path, bool]:
     """Write fasta files for each gene family
 
     :param family: gene family to write
@@ -142,6 +146,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
         if len(genes) == 1:
             single_copy_genes.extend(genes)
 
+    partial = False
     for gene in single_copy_genes:
         if use_gene_id:
             f_obj.write('>' + gene.ID + "\n")
@@ -150,12 +155,14 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
         if source == "dna":
             f_obj.write(gene.dna + '\n')
         elif source == "protein":
-            f_obj.write(translate(gene.dna, code_table) + "\n")
+            protein, part = translate(gene, code_table)
+            if not partial:
+                partial = part
+            f_obj.write(protein + "\n")
         else:
             raise AssertionError("Unknown sequence source given (expected 'dna' or 'protein')")
     f_obj.flush()
-
-    return f_name
+    return f_name, partial
 
 
 def launch_mafft(fname: Path, output: Path, fam_name: str):
@@ -203,12 +210,19 @@ def compute_msa(families: Set[GeneFamily], output: Path, tmpdir: Path, cpu: int
     logging.getLogger("PPanGGOLiN").info("Preparing input files for MSA...")
     code_table = genetic_codes(code)
 
+    partial = False
     for family in tqdm(families, unit="family", disable=disable_bar):
         start_write = time.time()
-        fname = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id)
+        fname, part = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id)
+        if not partial:
+            partial = part
         write_total = write_total + (time.time() - start_write)
         args.append((fname, output, family.name))
 
+    if partial:
+        logging.getLogger("PPanGGOLiN").warning("Partial gene was found during translation. "
+                                                "Last nucleotides were removed to translate. "
+                                                "Use --verbose 2 to see genes that are partial")
     logging.getLogger("PPanGGOLiN").info("Computing the MSA ...")
     with get_context('fork').Pool(cpu) as p:
         with tqdm(total=len(families), unit="family", disable=disable_bar) as bar:
@@ -229,7 +243,7 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path
     """
 
     # sort familes by ID, so the gene order is consistent
-    families = sorted(families, key = lambda fam: fam.ID)
+    families = sorted(families, key=lambda f: f.ID)
 
     phylo_dict = {}
     for org in pangenome.organisms: