diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 347c0d83..73d93922 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,7 +77,7 @@ jobs: ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot -c 1 --keep_tmp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud --threads 1 --keep_tmp ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 866c98aa..38786cf6 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -20,7 +20,6 @@ from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.utils import is_compressed, read_compressed_or_not - contig_counter: Value = Value('i', 0) @@ -104,7 +103,7 @@ def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int = if not use_meta: gene_finder.train(*contig_sequences.values(), force_nonsd=False, - translation_table=code) # -g: Specify a translation table to use (default 11). + translation_table=code) # -g: Specify a translation table to use (default 11). gene_counter = 1 for contig_name, sequence in sequences.items(): for pred in gene_finder.find_genes(sequence): @@ -293,8 +292,7 @@ def overlap_filter(all_genes: defaultdict, allow_overlap: bool = False) -> defau def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str: - """ - Return the gene sequence + """Return the gene sequence :param contig_seq: Contig sequence :param gene: Gene @@ -304,14 +302,15 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str: # check contig coordinate is in scope of contig seq length highest_position = max((stop for _, stop in gene.coordinates)) - assert highest_position <= len(contig_seq), f"Gene coordinates exceed contig length. gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}" + assert highest_position <= len( + contig_seq), f"Gene coordinates exceed contig length. gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}" # Extract gene seq seq = ''.join([contig_seq[start - 1:stop] for start, stop in gene.coordinates]) - + # check length of extracted seq assert len(seq) == len(gene), ("The gene sequence extracted from the contig does not have the expected length: " - f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt ") + f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt ") if gene.strand == "+": return seq @@ -348,7 +347,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str max_contig_len = max(len(contig) for contig in org.contigs) if max_contig_len < 20000: # case of short sequence use_meta = True - logging.getLogger("PPanGGOLiN").info(f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.") + logging.getLogger("PPanGGOLiN").info( + f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.") else: use_meta = False diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 0e419205..e1382316 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -15,6 +15,7 @@ from tqdm import tqdm # local libraries +from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily from ppanggolin.pangenome import Pangenome from ppanggolin.utils import mk_outdir, restricted_float @@ -95,10 +96,10 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", return families -def translate(seq: str, code: Dict[str, Dict[str, str]]) -> str: +def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]: """translates the given dna sequence with the given translation table - :param seq: given dna sequence + :param gene: given gene :param code: translation table corresponding to genetic code to use :return: protein sequence @@ -106,22 +107,25 @@ def translate(seq: str, code: Dict[str, Dict[str, str]]) -> str: # code: https://www.bioinformatics.org/sms/iupac.html start_table = code["start_table"] table = code["trans_table"] - - if len(seq) % 3 == 0: - protein = start_table[seq[0: 3]] - for i in range(3, len(seq), 3): - codon = seq[i: i + 3] - try: - protein += table[codon] - except KeyError: # codon was not planned for. Probably can't determine it. - protein += 'X' # X is for unknown - else: - raise IndexError("Given sequence length modulo 3 was different than 0, which is unexpected.") - return protein + mod = len(gene.dna) % 3 + partial = False + if mod != 0: + partial = True + msg = (f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}" + f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.") + logging.getLogger("PPANGGOLIN").debug(msg) + protein = start_table[gene.dna[0: 3]] + for i in range(3, len(gene.dna) - mod, 3): + codon = gene.dna[i: i + 3] + try: + protein += table[codon] + except KeyError: # codon was not planned for. Probably can't determine it. + protein += 'X' # X is for unknown + return protein, partial def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: Dict[str, Dict[str, str]], - source: str = 'protein', use_gene_id: bool = False) -> Path: + source: str = 'protein', use_gene_id: bool = False) -> Tuple[Path, bool]: """Write fasta files for each gene family :param family: gene family to write @@ -142,6 +146,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory if len(genes) == 1: single_copy_genes.extend(genes) + partial = False for gene in single_copy_genes: if use_gene_id: f_obj.write('>' + gene.ID + "\n") @@ -150,12 +155,14 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory if source == "dna": f_obj.write(gene.dna + '\n') elif source == "protein": - f_obj.write(translate(gene.dna, code_table) + "\n") + protein, part = translate(gene, code_table) + if not partial: + partial = part + f_obj.write(protein + "\n") else: raise AssertionError("Unknown sequence source given (expected 'dna' or 'protein')") f_obj.flush() - - return f_name + return f_name, partial def launch_mafft(fname: Path, output: Path, fam_name: str): @@ -203,12 +210,19 @@ def compute_msa(families: Set[GeneFamily], output: Path, tmpdir: Path, cpu: int logging.getLogger("PPanGGOLiN").info("Preparing input files for MSA...") code_table = genetic_codes(code) + partial = False for family in tqdm(families, unit="family", disable=disable_bar): start_write = time.time() - fname = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id) + fname, part = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id) + if not partial: + partial = part write_total = write_total + (time.time() - start_write) args.append((fname, output, family.name)) + if partial: + logging.getLogger("PPanGGOLiN").warning("Partial gene was found during translation. " + "Last nucleotides were removed to translate. " + "Use --verbose 2 to see genes that are partial") logging.getLogger("PPanGGOLiN").info("Computing the MSA ...") with get_context('fork').Pool(cpu) as p: with tqdm(total=len(families), unit="family", disable=disable_bar) as bar: @@ -229,7 +243,7 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path """ # sort familes by ID, so the gene order is consistent - families = sorted(families, key = lambda fam: fam.ID) + families = sorted(families, key=lambda f: f.ID) phylo_dict = {} for org in pangenome.organisms: