From 31e50f5197aa2f2bbef6eb82c3f8564eddbfdbce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 16:56:50 +0100 Subject: [PATCH 1/6] Add option to write proteins sequences from genes in pangenome --- ppanggolin/cluster/cluster.py | 14 +---- ppanggolin/formats/writeSequences.py | 86 ++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 14418e2a..9ce34b2a 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -23,7 +23,7 @@ from ppanggolin.utils import read_compressed_or_not, restricted_float from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file -from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations +from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes from ppanggolin.utils import mk_outdir @@ -68,7 +68,6 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force "or provide a way to access the gene sequence during the annotation step " "(having the fasta in the gff files, or providing the fasta files through the --fasta option)") - def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8, identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]: """ @@ -84,16 +83,7 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = :return: path to representative sequence file and path to tsv clustering result """ - seq_nucdb = tmpdir / 'nucleotid_sequences_db' - cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - logging.getLogger("PPanGGOLiN").info("Creating sequence database...") - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") - seqdb = tmpdir / 'aa_db' - cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + seqdb = translate_genes(sequences, tmpdir, cpu, code) logging.getLogger("PPanGGOLiN").info("Clustering sequences...") cludb = tmpdir / 'cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id", diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 900e3496..63a78231 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -5,8 +5,11 @@ import argparse import logging import re +import subprocess from pathlib import Path from typing import TextIO, Dict, Set, Iterable +import tempfile +import shutil # installed libraries from tqdm import tqdm @@ -18,6 +21,7 @@ from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file + module_regex = re.compile(r'^module_\d+') #\d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." @@ -43,6 +47,20 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.flush() +def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path: + seq_nucdb = tmpdir / 'nucleotid_sequences_db' + cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Creating sequence database...") + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") + seqdb = tmpdir / 'aa_db' + cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + return seqdb + + def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95, compress: bool = False, disable_bar: bool = False): """ @@ -78,6 +96,40 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") +def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95, + compress: bool = False, keep_tmp: bool = False, tmp: Path = None, + cpu: int = 1, code: int = 11, disable_bar: bool = False): + """ Write all amino acid sequences from given genes in pangenome + + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param genes_prot: Selected partition of gene + :param soft_core: Soft core threshold to use + :param compress: Compress the file in .gz + :param keep_tmp: Keep temporary directory + :param tmp: Path to temporary directory + :param cpu: Number of CPU available + :param code: Genetic code use to translate nucleotide sequences to protein sequences + :param disable_bar: Disable progress bar + """ + tmpdir = tmp/"translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") + mk_outdir(tmpdir, True, True) + + write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) + + with open(tmpdir/f"{genes_prot}_genes.fna") as sequences: + translate_db = translate_genes(sequences, tmpdir, cpu, code) + outpath = output/f"{genes_prot}_protein_genes.fna" + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") + + if not keep_tmp: + logging.getLogger("PPanGGOLiN").debug("Clean temporary directory") + shutil.rmtree(tmpdir) + + def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: """ function used to filter down families to the given partition @@ -93,13 +145,13 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c if partition == 'all': logging.getLogger("PPanGGOLiN").info(f"Writing all of the {type_name}...") genefams = pangenome.gene_families - + elif partition in ['persistent', 'shell', 'cloud']: logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} of the {partition}...") for fam in pangenome.gene_families: if fam.named_partition == partition: genefams.add(fam) - + elif partition == "rgp": logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} in RGPs...") for region in pangenome.regions: @@ -342,8 +394,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, - soft_core: float = 0.95, regions: str = None, genes: str = None, gene_families: str = None, - prot_families: str = None, compress: bool = False, disable_bar: bool = False): + soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None, + gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False): """ Main function to write sequence file from pangenome @@ -359,7 +411,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ - if not any(x for x in [regions, genes, prot_families, gene_families]): + if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]): raise Exception("You did not indicate what file you wanted to write.") need_annotations = False @@ -375,7 +427,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, if prot_families in ["core", "softcore"]: need_annotations = True - if any(x is not None for x in [regions, genes, gene_families]): + if any(x is not None for x in [regions, genes, genes_prot, gene_families]): need_annotations = True need_families = True if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): @@ -394,6 +446,8 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, provided_filter = '' if genes is not None: provided_filter = genes + if genes_prot is not None: + provided_filter = genes_prot if gene_families is not None: provided_filter = gene_families if prot_families is not None: @@ -422,6 +476,9 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar) if genes is not None: write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) + if genes_prot is not None: + write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress, + disable_bar=disable_bar) if regions is not None: write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) @@ -439,7 +496,7 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, gene_families=args.gene_families, + regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar) @@ -477,7 +534,7 @@ def parser_seq(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - + required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") @@ -493,7 +550,7 @@ def parser_seq(parser: argparse.ArgumentParser): help="A tab-separated file listing the genome names, and the gff/gbff filepath of its " "annotations (the files can be compressed with gzip). One line per genome. " "If this is provided, those annotations will be used.") - + onereq = parser.add_argument_group(title="Output file", description="At least one of the following argument is required. " "Indicating 'all' writes all elements. Writing a partition " @@ -502,11 +559,13 @@ def parser_seq(parser: argparse.ArgumentParser): ) onereq.add_argument("--genes", required=False, type=filter_values, help=f"Write all nucleotide CDS sequences. {poss_values_log}") + onereq.add_argument("--genes_prot", required=False, type=filter_values, + help=f"Write representative amino acid sequences of genes. {poss_values_log}") onereq.add_argument("--prot_families", required=False, type=filter_values, help=f"Write representative amino acid sequences of gene families. {poss_values_log}") onereq.add_argument("--gene_families", required=False, type=filter_values, help=f"Write representative nucleotide sequences of gene families. {poss_values_log}") - + optional = parser.add_argument_group(title="Optional arguments") # could make choice to allow customization optional.add_argument("--regions", required=False, type=str, choices=["all", "complete"], @@ -515,6 +574,13 @@ def parser_seq(parser: argparse.ArgumentParser): optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, help="Soft core threshold to use if 'softcore' partition is chosen") optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") + optional.add_argument("--translation_table", required=False, default="11", + help="Translation table (genetic code) to use.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + help="directory for storing temporary files") + optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", + help="Keeping temporary files (useful for debugging).") if __name__ == '__main__': From e54565c9adbdc46383b03c78fb9144a0271ac20f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:25:52 +0100 Subject: [PATCH 2/6] Split function to add check function --- ppanggolin/formats/writeSequences.py | 155 ++++++++++++++++----------- 1 file changed, 93 insertions(+), 62 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 63a78231..a3753d46 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -27,6 +27,94 @@ poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." +def check_write_sequences_args(args: argparse.Namespace) -> None: + """Check arguments compatibility in CLI + + :param args: argparse namespace arguments from CLI + + :raises argparse.ArgumentTypeError: if region is given but neither fasta or anno is given + """ + if args.regions is not None and args.fasta is None and args.anno is None: + raise argparse.ArgumentError(argument=None, + message="The --regions options requires the use of --anno or --fasta " + "(You need to provide the same file used to compute the pangenome)") + + +def check_pangenome_to_write_sequences(pangenome: Pangenome, regions: str = None, genes: str = None, + genes_prot: str = None, gene_families: str = None, + prot_families: str = None, disable_bar: bool = False): + """Check and load required information from pangenome + + :param pangenome: Empty pangenome + :param regions: Check and load the RGP + :param genes: Check and load the genes + :param genes_prot: Write amino acid CDS sequences. + :param gene_families: Check and load the gene families to write representative nucleotide sequences. + :param prot_families: Check and load the gene families to write representative amino acid sequences. + :param disable_bar: Disable progress bar + + :raises AssertionError: if not any arguments to write any file is given + :raises ValueError: if the given filter is not recognized + :raises AttributeError: if the pangenome does not contain the required information + """ + if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]): + raise AssertionError("You did not indicate what file you wanted to write.") + + need_annotations = False + need_families = False + need_graph = False + need_partitions = False + need_spots = False + need_regions = False + need_modules = False + + if prot_families is not None: + need_families = True + if prot_families in ["core", "softcore"]: + need_annotations = True + + if any(x is not None for x in [regions, genes, genes_prot, gene_families]): + need_annotations = True + need_families = True + if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): + need_annotations = True + need_regions = True + if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)): + need_partitions = True + for x in (genes, gene_families, prot_families): + if x is not None and 'module_' in x: + need_modules = True + + if not (need_annotations or need_families or need_graph or need_partitions or + need_spots or need_regions or need_modules): + # then nothing is needed, then something is wrong. + # find which filter was provided + provided_filter = '' + if genes is not None: + provided_filter = genes + if genes_prot is not None: + provided_filter = genes_prot + if gene_families is not None: + provided_filter = gene_families + if prot_families is not None: + provided_filter = prot_families + if regions is not None: + provided_filter = regions + raise ValueError(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. " + f"{poss_values_log}") + + if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families): + raise AttributeError("The provided pangenome has no gene sequences. " + "This is not compatible with any of the following options : --genes, --gene_families") + if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families: + raise AttributeError("The provided pangenome has no gene families. This is not compatible with any of " + "the following options : --prot_families, --gene_families") + + check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families, + need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions, + need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar) + + def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_obj: TextIO, add: str = '', disable_bar: bool = False): """ @@ -46,7 +134,6 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.write(f'{gene.dna}\n') file_obj.flush() - def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path: seq_nucdb = tmpdir / 'nucleotid_sequences_db' cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) @@ -395,7 +482,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None, - gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False): + gene_families: str = None, prot_families: str = None, compress: bool = False, + disable_bar: bool = False): """ Main function to write sequence file from pangenome @@ -406,69 +494,14 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, :param soft_core: Soft core threshold to use :param regions: Write the RGP nucleotide sequences :param genes: Write all nucleotide CDS sequences + :param genes_prot: Write amino acid CDS sequences. :param gene_families: Write representative nucleotide sequences of gene families. :param prot_families: Write representative amino acid sequences of gene families. :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ - if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]): - raise Exception("You did not indicate what file you wanted to write.") - need_annotations = False - need_families = False - need_graph = False - need_partitions = False - need_spots = False - need_regions = False - need_modules = False - - if prot_families is not None: - need_families = True - if prot_families in ["core", "softcore"]: - need_annotations = True - - if any(x is not None for x in [regions, genes, genes_prot, gene_families]): - need_annotations = True - need_families = True - if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): - need_annotations = True - need_regions = True - if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)): - need_partitions = True - for x in (genes, gene_families, prot_families): - if x is not None and 'module_' in x: - need_modules = True - - if not (need_annotations or need_families or need_graph or need_partitions or - need_spots or need_regions or need_modules): - # then nothing is needed, then something is wrong. - # find which filter was provided - provided_filter = '' - if genes is not None: - provided_filter = genes - if genes_prot is not None: - provided_filter = genes_prot - if gene_families is not None: - provided_filter = gene_families - if prot_families is not None: - provided_filter = prot_families - if regions is not None: - provided_filter = regions - raise Exception(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. " - f"{poss_values_log}") - ex_gene_sequences = Exception("The provided pangenome has no gene sequences. " - "This is not compatible with any of the following options : --genes, --gene_families") - ex_gene_family_sequences = Exception("The provided pangenome has no gene families. " - "This is not compatible with any of the following options : " - "--prot_families, --gene_families") - if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families): - raise ex_gene_sequences - if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families: - raise ex_gene_family_sequences - - check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families, - need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions, - need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar) + check_pangenome_to_write_sequences(pangenome, regions, genes, genes_prot, gene_families, prot_families, disable_bar) if prot_families is not None: write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar) @@ -489,9 +522,7 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ - if args.regions is not None and args.fasta is None and args.anno is None: - raise Exception("The --regions options requires the use of --anno or --fasta " - "(You need to provide the same file used to compute the pangenome)") + check_write_sequences_args(args) mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) From 035330ecaafe34544579d69cb03e6d5e0c17d96f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:34:49 +0100 Subject: [PATCH 3/6] Little refactoring --- ppanggolin/formats/writeSequences.py | 47 +++++++++++++++++++--------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index a3753d46..c68eac5e 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -18,11 +18,11 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Organism -from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype +from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, \ + detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file - -module_regex = re.compile(r'^module_\d+') #\d == [0-9] +module_regex = re.compile(r'^module_\d+') # \d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." @@ -134,8 +134,18 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.write(f'{gene.dna}\n') file_obj.flush() + def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path: - seq_nucdb = tmpdir / 'nucleotid_sequences_db' + """Translate nucleotide sequences into MMSeqs2 amino acid sequences database + + :param sequences: File with the nucleotide sequences + :param tmpdir: Temporary directory to save the MMSeqs2 files + :param cpu: Number of available CPU cores to use + :param code: Translation code to use + + :return: Path to the MMSeqs2 database + """ + seq_nucdb = tmpdir / 'nucleotide_sequences_db' cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) logging.getLogger("PPanGGOLiN").info("Creating sequence database...") @@ -159,6 +169,8 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co :param soft_core: Soft core threshold to use :param compress: Compress the file in .gz :param disable_bar: Disable progress bar + + :raises AttributeError: If the pangenome does not contain gene sequences """ logging.getLogger("PPanGGOLiN").info("Writing all the gene nucleotide sequences...") @@ -179,7 +191,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co write_gene_sequences_from_annotations(genes_to_write, fasta, disable_bar=disable_bar) else: # this should never happen if the pangenome has been properly checked before launching this function. - raise Exception("The pangenome does not include gene sequences") + raise AttributeError("The pangenome does not include gene sequences") logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") @@ -199,14 +211,14 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: :param code: Genetic code use to translate nucleotide sequences to protein sequences :param disable_bar: Disable progress bar """ - tmpdir = tmp/"translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") + tmpdir = tmp / "translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") mk_outdir(tmpdir, True, True) write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) - with open(tmpdir/f"{genes_prot}_genes.fna") as sequences: + with open(tmpdir / f"{genes_prot}_genes.fna") as sequences: translate_db = translate_genes(sequences, tmpdir, cpu, code) - outpath = output/f"{genes_prot}_protein_genes.fna" + outpath = output / f"{genes_prot}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) @@ -396,6 +408,9 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: :param organism: organism object :return: Dictionary with all sequences associated to contig + + :raises TypeError: If the file containing sequences is not recognized + :raises KeyError: If their inconsistency between pangenome contigs and the given contigs """ filetype = detect_filetype(genome_file) if filetype in ["fasta", "gff"]: @@ -403,12 +418,12 @@ def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: elif filetype == "gbff": contig_to_sequence = read_fasta_gbk(genome_file) else: - raise Exception(f"Unknown filetype detected: '{genome_file}'") + raise TypeError(f"Unknown filetype detected: '{genome_file}'") # check_contig_names if set(contig_to_sequence) != {contig.name for contig in organism.contigs}: - raise Exception(f"Contig name inconsistency detected in genome '{organism.name}' between the " - f"information stored in the pangenome file and the contigs found in '{genome_file}'.") + raise KeyError(f"Contig name inconsistency detected in genome '{organism.name}' between the " + f"information stored in the pangenome file and the contigs found in '{genome_file}'.") return contig_to_sequence @@ -420,7 +435,7 @@ def write_spaced_fasta(sequence: str, space: int = 60) -> str: :param sequence: sequence to write :param space: maximum of size for one line - :return: a sequence of maximum space caracter + :return: a sequence of maximum space character """ seq = "" j = 0 @@ -442,6 +457,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa :param anno: A tab-separated file listing the organism names, and the gff/gbff filepath of its annotations :param compress: Compress the file in .gz :param disable_bar: Disable progress bar + + :raises SyntaxError: if no tabulation are found in list genomes file """ assert fasta is not None or anno is not None, "Write regions requires to use anno or fasta, not any provided" @@ -450,7 +467,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - raise Exception(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") + raise SyntaxError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") org_dict[elements[0]] = Path(elements[1]) if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) @@ -527,7 +544,8 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, gene_families=args.gene_families, + regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, + gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar) @@ -543,6 +561,7 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser parser_seq(parser) return parser + def filter_values(arg_value: str): """ Check filter value to ensure they are in the expected format. From 788cd5cfc471c844dbf3c5452dcc43472acb3076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:42:33 +0100 Subject: [PATCH 4/6] Add translate arguments as kwargs --- ppanggolin/formats/writeSequences.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index c68eac5e..379601a1 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -500,7 +500,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None, gene_families: str = None, prot_families: str = None, compress: bool = False, - disable_bar: bool = False): + disable_bar: bool = False, **translate_kwgs): """ Main function to write sequence file from pangenome @@ -528,7 +528,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) if genes_prot is not None: write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress, - disable_bar=disable_bar) + disable_bar=disable_bar, **translate_kwgs) if regions is not None: write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) @@ -540,13 +540,17 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ check_write_sequences_args(args) + translate_kwgs = {"code": args.translation_table, + "cpu": args.cpu, + "tmp": args.tmpdir, + "keep_tmp": args.keep_tmp} mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, - gene_families=args.gene_families, - prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar) + gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, + disable_bar=args.disable_prog_bar, **translate_kwgs) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -627,7 +631,7 @@ def parser_seq(parser: argparse.ArgumentParser): optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") From 9cb7b3155dbce1407288e232854a3904ee70de78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:56:18 +0100 Subject: [PATCH 5/6] Replace cpu by threads to be more accurate --- ppanggolin/formats/writeSequences.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 379601a1..9e03f87b 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -32,7 +32,7 @@ def check_write_sequences_args(args: argparse.Namespace) -> None: :param args: argparse namespace arguments from CLI - :raises argparse.ArgumentTypeError: if region is given but neither fasta or anno is given + :raises argparse.ArgumentTypeError: if region is given but neither fasta nor anno is given """ if args.regions is not None and args.fasta is None and args.anno is None: raise argparse.ArgumentError(argument=None, @@ -135,12 +135,12 @@ def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_o file_obj.flush() -def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11) -> Path: +def translate_genes(sequences: TextIO, tmpdir: Path, threads: int = 1, code: int = 11) -> Path: """Translate nucleotide sequences into MMSeqs2 amino acid sequences database :param sequences: File with the nucleotide sequences :param tmpdir: Temporary directory to save the MMSeqs2 files - :param cpu: Number of available CPU cores to use + :param threads: Number of available threads to use :param code: Translation code to use :return: Path to the MMSeqs2 database @@ -152,7 +152,7 @@ def translate_genes(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 1 subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") seqdb = tmpdir / 'aa_db' - cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) + cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return seqdb @@ -197,7 +197,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95, compress: bool = False, keep_tmp: bool = False, tmp: Path = None, - cpu: int = 1, code: int = 11, disable_bar: bool = False): + threads: int = 1, code: int = 11, disable_bar: bool = False): """ Write all amino acid sequences from given genes in pangenome :param pangenome: Pangenome object with gene families sequences @@ -207,7 +207,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: :param compress: Compress the file in .gz :param keep_tmp: Keep temporary directory :param tmp: Path to temporary directory - :param cpu: Number of CPU available + :param threads: Number of threads available :param code: Genetic code use to translate nucleotide sequences to protein sequences :param disable_bar: Disable progress bar """ @@ -217,7 +217,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) with open(tmpdir / f"{genes_prot}_genes.fna") as sequences: - translate_db = translate_genes(sequences, tmpdir, cpu, code) + translate_db = translate_genes(sequences, tmpdir, threads, code) outpath = output / f"{genes_prot}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) @@ -541,7 +541,7 @@ def launch(args: argparse.Namespace): """ check_write_sequences_args(args) translate_kwgs = {"code": args.translation_table, - "cpu": args.cpu, + "threads": args.threads, "tmp": args.tmpdir, "keep_tmp": args.keep_tmp} mk_outdir(args.output, args.force) @@ -630,7 +630,7 @@ def parser_seq(parser: argparse.ArgumentParser): optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") - optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + optional.add_argument("--threads", required=False, default=1, type=int, help="Number of available threads") optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", From 12186a5ba59a8239491d7f817e1a2d99fb295b3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 28 Mar 2024 17:56:30 +0100 Subject: [PATCH 6/6] Write test and documentation for the new option --- .github/workflows/main.yml | 1 + docs/user/writeFasta.md | 28 +++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 67c4ad2d..5db304dd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,6 +77,7 @@ jobs: ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot -c 1 --keep_tmp ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md index ac1ece26..e0ba5013 100644 --- a/docs/user/writeFasta.md +++ b/docs/user/writeFasta.md @@ -18,7 +18,9 @@ When using the `softcore` filter, the `--soft_core` option can be used to modify ## Genes -This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all of the genes of the pangenome for example: +### Nucleotide sequences + +This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all the genes of the pangenome for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all @@ -30,8 +32,25 @@ Or to write only the persistent genes: ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent ``` +### Protein sequences + +This option can be used to write the amino acid CDS sequences. It can be used as such, to write all the genes of the pangenome for example: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot all +``` + +Or to write only the cloud genes: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot cloud +``` + +To translate the genes sequences, PPanGGOLiN use [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. So for this option you can give multiple threads with `--threads`. You can also specify the translation table to use with `--translate_table`. Finally, you can keep the [MMSeqs2](https://github.com/soedinglab/MMseqs2) that are generated in the temporary directory (that you can also be specified with `--tmpdir`) by indicate the option `--keep_tmp`. -## Protein families +## Gene families + +### Protein sequences This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families: @@ -39,14 +58,13 @@ This option can be used to write the protein sequences of the representative seq ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all ``` -or for all of the shell families for example: +or for all the shell families for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell ``` - -## Gene families +### Nucleotide sequences This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such: