Skip to content

Commit

Permalink
Merge TranslateGene
Browse files Browse the repository at this point in the history
  • Loading branch information
jpjarnoux committed Apr 11, 2024
2 parents 16d933e + 9aaac23 commit 126580c
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ jobs:
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot -c 1 --keep_tmp
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud --threads 1 --keep_tmp
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
Expand Down
16 changes: 8 additions & 8 deletions ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from ppanggolin.genome import Organism, Gene, RNA, Contig
from ppanggolin.utils import is_compressed, read_compressed_or_not


contig_counter: Value = Value('i', 0)


Expand Down Expand Up @@ -104,7 +103,7 @@ def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int =

if not use_meta:
gene_finder.train(*contig_sequences.values(), force_nonsd=False,
translation_table=code) # -g: Specify a translation table to use (default 11).
translation_table=code) # -g: Specify a translation table to use (default 11).
gene_counter = 1
for contig_name, sequence in sequences.items():
for pred in gene_finder.find_genes(sequence):
Expand Down Expand Up @@ -293,8 +292,7 @@ def overlap_filter(all_genes: defaultdict, allow_overlap: bool = False) -> defau


def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str:
"""
Return the gene sequence
"""Return the gene sequence
:param contig_seq: Contig sequence
:param gene: Gene
Expand All @@ -304,14 +302,15 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str:

# check contig coordinate is in scope of contig seq length
highest_position = max((stop for _, stop in gene.coordinates))
assert highest_position <= len(contig_seq), f"Gene coordinates exceed contig length. gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}"
assert highest_position <= len(
contig_seq), f"Gene coordinates exceed contig length. gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}"

# Extract gene seq
seq = ''.join([contig_seq[start - 1:stop] for start, stop in gene.coordinates])

# check length of extracted seq
assert len(seq) == len(gene), ("The gene sequence extracted from the contig does not have the expected length: "
f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt ")
f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt ")

if gene.strand == "+":
return seq
Expand Down Expand Up @@ -348,7 +347,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str
max_contig_len = max(len(contig) for contig in org.contigs)
if max_contig_len < 20000: # case of short sequence
use_meta = True
logging.getLogger("PPanGGOLiN").info(f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.")
logging.getLogger("PPanGGOLiN").info(
f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.")

else:
use_meta = False
Expand Down
54 changes: 34 additions & 20 deletions ppanggolin/formats/writeMSA.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from tqdm import tqdm

# local libraries
from ppanggolin.genome import Gene
from ppanggolin.geneFamily import GeneFamily
from ppanggolin.pangenome import Pangenome
from ppanggolin.utils import mk_outdir, restricted_float
Expand Down Expand Up @@ -95,33 +96,36 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core",
return families


def translate(seq: str, code: Dict[str, Dict[str, str]]) -> str:
def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]:
"""translates the given dna sequence with the given translation table
:param seq: given dna sequence
:param gene: given gene
:param code: translation table corresponding to genetic code to use
:return: protein sequence
"""
# code: https://www.bioinformatics.org/sms/iupac.html
start_table = code["start_table"]
table = code["trans_table"]

if len(seq) % 3 == 0:
protein = start_table[seq[0: 3]]
for i in range(3, len(seq), 3):
codon = seq[i: i + 3]
try:
protein += table[codon]
except KeyError: # codon was not planned for. Probably can't determine it.
protein += 'X' # X is for unknown
else:
raise IndexError("Given sequence length modulo 3 was different than 0, which is unexpected.")
return protein
mod = len(gene.dna) % 3
partial = False
if mod != 0:
partial = True
msg = (f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}"
f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.")
logging.getLogger("PPANGGOLIN").debug(msg)
protein = start_table[gene.dna[0: 3]]
for i in range(3, len(gene.dna) - mod, 3):
codon = gene.dna[i: i + 3]
try:
protein += table[codon]
except KeyError: # codon was not planned for. Probably can't determine it.
protein += 'X' # X is for unknown
return protein, partial


def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: Dict[str, Dict[str, str]],
source: str = 'protein', use_gene_id: bool = False) -> Path:
source: str = 'protein', use_gene_id: bool = False) -> Tuple[Path, bool]:
"""Write fasta files for each gene family
:param family: gene family to write
Expand All @@ -142,6 +146,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
if len(genes) == 1:
single_copy_genes.extend(genes)

partial = False
for gene in single_copy_genes:
if use_gene_id:
f_obj.write('>' + gene.ID + "\n")
Expand All @@ -150,12 +155,14 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory
if source == "dna":
f_obj.write(gene.dna + '\n')
elif source == "protein":
f_obj.write(translate(gene.dna, code_table) + "\n")
protein, part = translate(gene, code_table)
if not partial:
partial = part
f_obj.write(protein + "\n")
else:
raise AssertionError("Unknown sequence source given (expected 'dna' or 'protein')")
f_obj.flush()

return f_name
return f_name, partial


def launch_mafft(fname: Path, output: Path, fam_name: str):
Expand Down Expand Up @@ -203,12 +210,19 @@ def compute_msa(families: Set[GeneFamily], output: Path, tmpdir: Path, cpu: int
logging.getLogger("PPanGGOLiN").info("Preparing input files for MSA...")
code_table = genetic_codes(code)

partial = False
for family in tqdm(families, unit="family", disable=disable_bar):
start_write = time.time()
fname = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id)
fname, part = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id)
if not partial:
partial = part
write_total = write_total + (time.time() - start_write)
args.append((fname, output, family.name))

if partial:
logging.getLogger("PPanGGOLiN").warning("Partial gene was found during translation. "
"Last nucleotides were removed to translate. "
"Use --verbose 2 to see genes that are partial")
logging.getLogger("PPanGGOLiN").info("Computing the MSA ...")
with get_context('fork').Pool(cpu) as p:
with tqdm(total=len(families), unit="family", disable=disable_bar) as bar:
Expand All @@ -229,7 +243,7 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path
"""

# sort familes by ID, so the gene order is consistent
families = sorted(families, key = lambda fam: fam.ID)
families = sorted(families, key=lambda f: f.ID)

phylo_dict = {}
for org in pangenome.organisms:
Expand Down

0 comments on commit 126580c

Please sign in to comment.