From 08f860527b5fdc1e2120a4574f89178a986e5b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 6 Jun 2024 10:58:42 +0200 Subject: [PATCH 01/14] Add a run_process function to print error message from subprocess --- ppanggolin/cluster/cluster.py | 21 ++++----- ppanggolin/formats/writeMSA.py | 12 +---- ppanggolin/utils.py | 80 +++++++++++++++++++++------------- 3 files changed, 59 insertions(+), 54 deletions(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index d41589db..010e2b95 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -20,7 +20,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import read_compressed_or_not, restricted_float +from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations @@ -100,22 +100,18 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = cludb = tmpdir / 'cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id", identity, "-c", coverage, "--threads", cpu, "--kmer-per-seq", 80, "--max-seqs", 300])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs2 cluster failed with the following error:\n") logging.getLogger("PPanGGOLiN").info("Extracting cluster representatives...") repdb = tmpdir / 'representative_db' cmd = list(map(str, ["mmseqs", "result2repseq", seqdb, cludb, repdb])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs2 result2repseq failed with the following error:\n") reprfa = tmpdir / 'representative_sequences.fasta' cmd = list(map(str, ["mmseqs", "result2flat", seqdb, seqdb, repdb, reprfa, "--use-fasta-header"])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs2 result2flat failed with the following error:\n") logging.getLogger("PPanGGOLiN").info("Writing gene to family informations") outtsv = tmpdir / 'families_tsv' cmd = list(map(str, ["mmseqs", "createtsv", seqdb, seqdb, cludb, outtsv, "--threads", cpu, "--full-header"])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs2 createtsv failed with the following error:\n") return reprfa, outtsv @@ -150,6 +146,7 @@ def align_rep(faa_file: Path, tmpdir: Path, cpu: int = 1, coverage: float = 0.8, :return: Result of alignment """ + # TODO use create_db function logging.getLogger("PPanGGOLiN").debug("Create database") seqdb = tmpdir / 'rep_sequence_db' cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", 1, faa_file, seqdb])) @@ -159,14 +156,12 @@ def align_rep(faa_file: Path, tmpdir: Path, cpu: int = 1, coverage: float = 0.8, alndb = tmpdir / 'rep_alignment_db' cmd = list(map(str, ["mmseqs", "search", seqdb, seqdb, alndb, tmpdir, "-a", "--min-seq-id", identity, "-c", coverage, "--cov-mode", 1, "--threads", cpu])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs2 search failed with the following error:\n") logging.getLogger("PPanGGOLiN").info("Extracting alignments...") outfile = tmpdir / 'rep_families.tsv' cmd = list(map(str, ["mmseqs", "convertalis", seqdb, seqdb, alndb, outfile, "--format-output", "query,target,qlen,tlen,bits"])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs2 convertalis failed with the following error:\n") return outfile diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index b8b77c70..1448a134 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -5,7 +5,6 @@ import argparse import logging import tempfile -import subprocess import time from multiprocessing import get_context from pathlib import Path @@ -17,7 +16,7 @@ # local libraries from ppanggolin.geneFamily import GeneFamily from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import mk_outdir, restricted_float +from ppanggolin.utils import mk_outdir, restricted_float, run_subprocess from ppanggolin.formats.readBinaries import check_pangenome_info from ppanggolin.genetic_codes import genetic_codes @@ -151,14 +150,7 @@ def launch_mafft(fname: Path, output: Path, fam_name: str): outname = output / f"{fam_name}.aln" cmd = ["mafft", "--thread", "1", fname.absolute().as_posix()] logging.getLogger("PPanGGOLiN").debug("command: " + " ".join(cmd)) - result = subprocess.run(cmd, capture_output=True) - - with open(outname, "w") as fout: - fout.write(result.stdout.decode('utf-8')) - - if result.stderr and result.returncode != 0: - raise Exception("mafft failed with the following error:\n" - f"{result.stderr.decode('utf-8')}") + run_subprocess(cmd, outname, msg="mafft failed with the following error:\n") def launch_multi_mafft(args: List[Tuple[Path, Path, str]]): diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 3a6d9cdf..ac9ef3a9 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -15,6 +15,7 @@ import time from itertools import zip_longest import re +import subprocess import networkx as nx from importlib.metadata import distribution @@ -27,8 +28,8 @@ from collections import defaultdict # all input params that exists in ppanggolin -ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome', - "fasta_file", "annot_file", "genome_name"] # the last three params is for projection cmd +ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome', + "fasta_file", "annot_file", "genome_name"] # the last three params is for projection cmd # all params that should be in the general_parameters section of the config file ALL_GENERAL_PARAMS = ['output', 'basename', 'rarefaction', 'no_flat_files', 'tmpdir', 'verbose', 'log', @@ -42,8 +43,8 @@ # Inside a workflow command, write output default is overwrite to output some flat files WRITE_PAN_FLAG_DEFAULT_IN_WF = ["csv", "Rtab", "gexf", "light_gexf", - 'stats', 'json', 'partitions', 'regions', - 'borders', 'modules', 'spot_modules', "spots", "families_tsv"] + 'stats', 'json', 'partitions', 'regions', + 'borders', 'modules', 'spot_modules', "spots", "families_tsv"] WRITE_GENOME_FLAG_DEFAULT_IN_WF = ['table', 'proksee', "gff"] DRAW_FLAG_DEFAULT_IN_WF = ["tile_plot", "ucurve", "draw_spots"] @@ -249,7 +250,7 @@ def is_compressed(file_or_file_path: Union[Path, TextIO, gzip.GzipFile]): return False -def mk_outdir(output: Path, force: bool = False, exist_ok:bool=False): +def mk_outdir(output: Path, force: bool = False, exist_ok: bool = False): """ Create a directory at the given output if it doesn't exist already :param output: Path where to create directory @@ -266,23 +267,25 @@ def mk_outdir(output: Path, force: bool = False, exist_ok:bool=False): raise FileExistsError( f"{output} already exists. Use -f if you want to overwrite the files in the directory") + @contextmanager def create_tmpdir(main_dir, basename="tmpdir", keep_tmp=False): - if keep_tmp: - dir_name = basename + time.strftime("_%Y-%m-%d_%H.%M.%S",time.localtime()) + dir_name = basename + time.strftime("_%Y-%m-%d_%H.%M.%S", time.localtime()) new_tmpdir = main_dir / dir_name - logging.getLogger("PPanGGOLiN").debug(f'Creating a temporary directory: {new_tmpdir.as_posix()}. This directory will be retained.') + logging.getLogger("PPanGGOLiN").debug( + f'Creating a temporary directory: {new_tmpdir.as_posix()}. This directory will be retained.') mk_outdir(new_tmpdir, force=True) yield new_tmpdir - + else: with tempfile.TemporaryDirectory(dir=main_dir, prefix=basename) as new_tmpdir: - logging.getLogger("PPanGGOLiN").debug(f"Creating a temporary directory: {new_tmpdir}. This directory won't be retained.") + logging.getLogger("PPanGGOLiN").debug( + f"Creating a temporary directory: {new_tmpdir}. This directory won't be retained.") yield Path(new_tmpdir) - + def mk_file_name(basename: str, output: Path, force: bool = False) -> Path: """Returns a usable filename for a ppanggolin output file, or crashes. @@ -317,7 +320,8 @@ def detect_filetype(filename: Path) -> str: first_line = f.readline() if first_line.startswith("LOCUS "): # then this is probably a gbff/gbk file return "gbff" - elif re.match(r"##gff-version\s{1,3}3", first_line): # prodigal gff header has two spaces between gff-version and 3... some gff user can have a tab + elif re.match(r"##gff-version\s{1,3}3", + first_line): # prodigal gff header has two spaces between gff-version and 3... some gff user can have a tab return 'gff' elif first_line.startswith(">"): return 'fasta' @@ -374,7 +378,7 @@ def connected_components(g: nx.Graph, removed: set, weight: float): removed.update(c) -def _plain_bfs(g: nx.Graph, source:Any, removed: set, weight: float): +def _plain_bfs(g: nx.Graph, source: Any, removed: set, weight: float): """ A fast BFS node generator, copied from networkx then adapted to the current use case @@ -449,7 +453,7 @@ def parse_config_file(yaml_config_file: str) -> dict: with yaml_config_file as yaml_fh: config = yaml.safe_load(yaml_fh) - + if config is None: config = {} @@ -560,7 +564,6 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names return args - def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): """ Combine two args object. @@ -580,8 +583,8 @@ def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): return args -def get_args_that_differe_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, - param_to_ignore: Union[List[str], Set[str]] = None) -> dict: +def get_args_that_different_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, + param_to_ignore: Union[List[str], Set[str]] = None) -> dict: """ Get the parameters that have different value than default values. @@ -659,7 +662,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # cli > config > default args = overwrite_args(default_args, config_args, cli_args) - params_that_differ = get_args_that_differe_from_default(default_args, args, input_params) + params_that_differ = get_args_that_different_from_default(default_args, args, input_params) if params_that_differ: params_that_differ_str = ', '.join([f'{p}={v}' for p, v in params_that_differ.items()]) @@ -700,7 +703,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ step_args = overwrite_args(default_step_args, config_step_args, cli_args) - step_params_that_differ = get_args_that_differe_from_default(default_step_args, step_args) + step_params_that_differ = get_args_that_different_from_default(default_step_args, step_args) if step_params_that_differ: step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) @@ -1025,12 +1028,13 @@ def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, Union[Pa genome_name = elements[0] putative_circular_contigs = elements[2:] - if not genome_file_path.exists(): + if not genome_file_path.exists(): # Check if the file path doesn't exist and try an alternative path. genome_file_path_alt = path_list_file.parent.joinpath(genome_file_path) if not genome_file_path_alt.exists(): - raise FileNotFoundError(f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.") + raise FileNotFoundError( + f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.") else: genome_file_path = genome_file_path_alt @@ -1041,7 +1045,7 @@ def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, Union[Pa if len(genome_name_to_genome_path) == 0: raise Exception(f"There are no genomes in the provided file: {path_list_file} ") - + return genome_name_to_genome_path @@ -1065,6 +1069,7 @@ def flatten(dictionary, parent_key=''): flatten(nested_dict) return flat_dict + def get_major_version(version: str) -> int: """ Extracts the major version number from a version string. @@ -1121,7 +1126,7 @@ def find_consecutive_sequences(sequence: List[int]) -> List[List[int]]: else: # there is a break in the consecutivity consecutive_sequences.append([index]) - + return consecutive_sequences @@ -1136,10 +1141,10 @@ def find_region_border_position(region_positions: List[int], contig_gene_count: """ consecutive_region_positions = get_consecutive_region_positions(region_positions, contig_gene_count) - + return consecutive_region_positions[0][0], consecutive_region_positions[-1][-1] - - + + def get_consecutive_region_positions(region_positions: List[int], contig_gene_count: int) -> List[List[int]]: """ Order integers position of the region considering circularity of the contig. @@ -1153,12 +1158,12 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co """ if len(region_positions) == 0: raise ValueError('Region has no position. This is unexpected.') - + consecutive_sequences = sorted(find_consecutive_sequences(region_positions)) - + if len(consecutive_sequences) == 0: raise ValueError('No consecutive sequences found in the region. This is unexpected.') - + elif len(consecutive_sequences) == 1: return consecutive_sequences @@ -1173,9 +1178,22 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co f'indicate an overlap on the edge of the contig, but neither ends at the end of the contig ({contig_gene_count - 1}).') return [consecutive_sequences[-1], consecutive_sequences[0]] - + elif len(consecutive_sequences) > 2: raise ValueError(f'More than two consecutive sequences found ({len(consecutive_sequences)}). ' f'This is unexpected. Consecutive sequences: {consecutive_sequences}. ' 'The region should consist of consecutive positions along the contig.') - \ No newline at end of file + + +def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess failed with the following error:\n"): + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + if output is None: + result = subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + else: + result = subprocess.run(cmd, capture_output=True, check=True) + + with open(output, "w") as fout: + fout.write(result.stdout.decode('utf-8')) + + if result.stderr and result.returncode != 0: + raise Exception(msg + f"{result.stderr.decode('utf-8')}") \ No newline at end of file From eb732be659584f2ca54d9dc9b513bfd809715bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 6 Jun 2024 11:25:28 +0200 Subject: [PATCH 02/14] Change subprocess.run for run_subprocess --- ppanggolin/align/alignOnPang.py | 13 +++++-------- ppanggolin/cluster/cluster.py | 9 ++------- ppanggolin/formats/writeSequences.py | 14 +++++--------- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 0742a876..1c97c75d 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -17,7 +17,7 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import mk_outdir, read_compressed_or_not, create_tmpdir +from ppanggolin.utils import mk_outdir, read_compressed_or_not, create_tmpdir, run_subprocess from ppanggolin.pangenome import Pangenome from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph @@ -73,27 +73,24 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: + logging.getLogger("PPanGGOLiN").info("Aligning sequences") cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", "--min-seq-id", str(identity), "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"] - logging.getLogger("PPanGGOLiN").info("Aligning sequences") - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - start = time.time() - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs search failed with the following error:\n") align_time = time.time() - start logging.getLogger("PPanGGOLiN").info(f"Done aligning sequences in {round(align_time, 2)} seconds") with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix=".tsv", delete=False) as outfile: + logging.getLogger("PPanGGOLiN").info("Extracting alignments...") cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, "--format-mode", "2"] - logging.getLogger("PPanGGOLiN").info("Extracting alignments...") - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs convertalis failed with the following error:\n") return Path(outfile.name) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index b4c441b0..966c029d 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -4,7 +4,6 @@ # default libraries import logging import tempfile -import subprocess from collections import defaultdict import os import argparse @@ -23,7 +22,7 @@ from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file -from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes +from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db from ppanggolin.utils import mk_outdir @@ -136,12 +135,8 @@ def align_rep(faa_file: Path, tmpdir: Path, cpu: int = 1, coverage: float = 0.8, :return: Result of alignment """ - # TODO use create_db function - logging.getLogger("PPanGGOLiN").debug("Create database") - seqdb = tmpdir / 'rep_sequence_db' + seqdb = create_mmseqs_db(faa_file, 'rep_sequence_db', tmpdir, db_mode=1, db_type=1) cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", 1, faa_file, seqdb])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) logging.getLogger("PPanGGOLiN").info("Aligning cluster representatives...") alndb = tmpdir / 'rep_alignment_db' cmd = list(map(str, ["mmseqs", "search", seqdb, seqdb, alndb, tmpdir, "-a", "--min-seq-id", identity, diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index f27d44ba..c3a8435e 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -5,8 +5,6 @@ import argparse import logging import re -import subprocess -import sys from pathlib import Path from typing import TextIO, Dict, Set, Iterable, Union import tempfile @@ -19,7 +17,8 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Organism -from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype +from ppanggolin.utils import (write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, + detect_filetype, run_subprocess) from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file module_regex = re.compile(r'^module_\d+') # \d == [0-9] @@ -153,9 +152,8 @@ def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_m cmd = ["mmseqs", "createdb", "--createdb-mode", db_mode, "--dbtype", db_type] cmd += [seq.absolute().as_posix() for seq in sequences] + [seq_nucdb] cmd = list(map(str, cmd)) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) logging.getLogger("PPanGGOLiN").info("Creating sequence database...") - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs createdb failed with the following error:\n") return seq_nucdb @@ -177,8 +175,7 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], db_name: str, tmpdir logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") seqdb = tmpdir / db_name cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs translatenucs failed with the following error:\n") return seqdb @@ -250,8 +247,7 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s translate_db = translate_genes(pangenome_sequences, 'aa_db', tmpdir, threads, True, code) outpath = output / f"{proteins}_protein_genes.fna" cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n") logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") if not keep_tmp: From 9e7a5e2cb14db0f5b1ad51629eb741c1a51f06e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 6 Jun 2024 12:30:42 +0200 Subject: [PATCH 03/14] Improve logging error subprocess handling --- ppanggolin/cluster/cluster.py | 3 +-- ppanggolin/utils.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 966c029d..5acf74b2 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -135,8 +135,7 @@ def align_rep(faa_file: Path, tmpdir: Path, cpu: int = 1, coverage: float = 0.8, :return: Result of alignment """ - seqdb = create_mmseqs_db(faa_file, 'rep_sequence_db', tmpdir, db_mode=1, db_type=1) - cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", 1, faa_file, seqdb])) + seqdb = create_mmseqs_db([faa_file], 'rep_sequence_db', tmpdir, db_mode=1, db_type=1) logging.getLogger("PPanGGOLiN").info("Aligning cluster representatives...") alndb = tmpdir / 'rep_alignment_db' cmd = list(map(str, ["mmseqs", "search", seqdb, seqdb, alndb, tmpdir, "-a", "--min-seq-id", identity, diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index ac9ef3a9..9f879d5c 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -1187,13 +1187,13 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess failed with the following error:\n"): logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - if output is None: - result = subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + try: + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + except subprocess.CalledProcessError as subprocess_err: + for line in subprocess_err.stdout.split("\n"): + logging.getLogger("PPanGGOLiN").error(line) + raise Exception(msg + subprocess_err.stderr) else: - result = subprocess.run(cmd, capture_output=True, check=True) - - with open(output, "w") as fout: - fout.write(result.stdout.decode('utf-8')) - - if result.stderr and result.returncode != 0: - raise Exception(msg + f"{result.stderr.decode('utf-8')}") \ No newline at end of file + if output is not None: + with open(output, 'w') as fout: + fout.write(result.stdout) \ No newline at end of file From ba04fe77811867282ca499e92bd9c80c2178d06c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 6 Jun 2024 15:00:42 +0200 Subject: [PATCH 04/14] Add infer_singletons to workflow --- ppanggolin/utils.py | 3 +++ ppanggolin/workflow/all.py | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 9f879d5c..1784f0f6 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -441,6 +441,9 @@ def check_option_workflow(args): if not any([args.fasta, args.anno]): raise Exception("At least one of --fasta or --anno must be given") + if args.infer_singletons and args.clusters is None: + logging.getLogger("PPanGGOLiN").warning("--infer_singleton works only with --clusters given.") + def parse_config_file(yaml_config_file: str) -> dict: """ diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 702efe40..da91697b 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -54,7 +54,8 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, start_anno = time.time() read_annotations(pangenome, args.anno, pseudo=args.annotate.use_pseudo, - cpu=args.annotate.cpu, translation_table=args.annotate.translation_table, disable_bar=args.disable_prog_bar) + cpu=args.annotate.cpu, translation_table=args.annotate.translation_table, + disable_bar=args.disable_prog_bar) anno_time = time.time() - start_anno start_writing = time.time() @@ -63,6 +64,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, if args.clusters is not None: start_clust = time.time() + print(args.cluster) read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar, infer_singleton=args.cluster.infer_singletons) else: # args.cluster is None @@ -362,6 +364,10 @@ def add_workflow_args(parser: argparse.ArgumentParser): optional.add_argument("--identity", required=False, type=restricted_float, default=0.8, help="Minimal identity percent for two proteins to be in the same cluster") + optional.add_argument("--infer_singletons", required=False, action="store_true", + help="When reading a clustering result with --clusters," + " if a gene is not in the provided file") + optional.add_argument("-K", "--nb_of_partitions", required=False, default=-1, type=int, help="Number of partitions to use. Must be at least 2. If under 2, " "it will be detected automatically.") From 2862164faf6b30d4c80379204e112765cb889020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 6 Jun 2024 15:00:59 +0200 Subject: [PATCH 05/14] Add gene names and coordinates of the RGP --- docs/user/RGP/rgpOutputs.md | 104 ++++++++++++++--------- ppanggolin/formats/writeFlatPangenome.py | 8 +- ppanggolin/region.py | 8 +- 3 files changed, 76 insertions(+), 44 deletions(-) diff --git a/docs/user/RGP/rgpOutputs.md b/docs/user/RGP/rgpOutputs.md index 26a87cfa..317230c1 100644 --- a/docs/user/RGP/rgpOutputs.md +++ b/docs/user/RGP/rgpOutputs.md @@ -2,42 +2,52 @@ ### RGP -The `regions_of_genomic_plasticity.tsv` is a tsv file that lists all the detected Regions of Genome Plasticity. This requires to have run the RGP detection analysis by either using the `panrgp` command or the `rgp` command. +The `regions_of_genomic_plasticity.tsv` is a tsv file that lists all the detected Regions of Genome Plasticity. This +requires to have run the RGP detection analysis by either using the `panrgp` command or the `rgp` command. It can be written with the following command: + ```bash ppanggolin write_pangenome -p pangenome.h5 --regions -o rgp_outputs ``` The file has the following format : -| Column | Description | -|--------|-------------| -|region| A unique identifier for the region. This is usually built from the contig it is on, with a number after it.| -|genome| The genome it is in. This is the genome name provided by the user.| -|start| The start position of the RGP in the contig.| -|stop| The stop position of the RGP in the contig.| -|genes| The number of genes included in the RGP.| -|contigBorder| This is a boolean column. If the RGP is on a contig border it will be True, otherwise, it will be False. This often can indicate that, if an RGP is on a contig border it is probably not complete.| -|wholeContig| This is a boolean column. If the RGP is an entire contig, it will be True, and False otherwise. If a RGP is an entire contig it can possibly be a plasmid, a region flanked with repeat sequences or a contaminant.| +| Column | Description | +|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| region | A unique identifier for the region. This is usually built from the contig it is on, with a number after it. | +| genome | The genome it is in. This is the genome name provided by the user. | +| contig | Name of the contig | +| genes | The number of genes included in the RGP. | +| first_gene | Name of the first gene of the region | +| last_gene | Name of the last gene of the region | +| start | The start position of the RGP in the contig. | +| stop | The stop position of the RGP in the contig. | +| length | The length of the RGP in nucleotide | +| coordinates | The coordinates of the region. If the region overlap the contig edges will be right with join coordinates syntax (*i.e* 1523..1758,1..57) | +| contigBorder | This is a boolean column. If the RGP is on a contig border it will be True, otherwise, it will be False. This often can indicate that, if an RGP is on a contig border it is probably not complete. | +| wholeContig | This is a boolean column. If the RGP is an entire contig, it will be True, and False otherwise. If a RGP is an entire contig it can possibly be a plasmid, a region flanked with repeat sequences or a contaminant. | ### Spots -The `spots.tsv` is a tsv file that links the spots in `summarize_spots.tsv` with the RGPs in `regions_of_genomic_plasticity.tsv`. +The `spots.tsv` is a tsv file that links the spots in `summarize_spots.tsv` with the RGPs +in `regions_of_genomic_plasticity.tsv`. It can be created with the following command: + ```bash ppanggolin write_pangenome -p pangenome.h5 --spots -o rgp_outputs ``` -|Column|Description| -|------|------------| -|spot_id| The spot identifier (found in the 'spot' column of `summarize_spots.tsv`).| -|rgp_id| The RGP identifier (found in 'region' column of `regions_of_genomic_plasticity.tsv`).| +| Column | Description | +|---------|---------------------------------------------------------------------------------------| +| spot_id | The spot identifier (found in the 'spot' column of `summarize_spots.tsv`). | +| rgp_id | The RGP identifier (found in 'region' column of `regions_of_genomic_plasticity.tsv`). | ### Summarize spots -The `summarize_spots.tsv` file is a tsv file that will associate each spot with multiple metrics that can indicate the dynamic of the spot. +The `summarize_spots.tsv` file is a tsv file that will associate each spot with multiple metrics that can indicate the +dynamic of the spot. It can be created with the following command: @@ -45,20 +55,21 @@ It can be created with the following command: ppanggolin write_pangenome -p pangenome.h5 --spots -o rgp_outputs ``` -|Column|Description| -|-------|------------| -|spot|The spot identifier. It is unique in the pangenome.| -|nb_rgp|The number of RGPs present in the spot.| -|nb_families| The number of different gene families that are found in the spot.| -|nb_unique_family_sets| The number of RGPs with different gene family content. If two RGPs are identical, they will be counted only once. The difference between this number and the one provided in 'nb_rgp' can be a strong indicator on whether their is a high turnover in gene content in this area or not.| -|mean_nb_genes| The mean number of genes on RGPs in the spot.| -|stdev_nb_genes| The standard deviation of the number of genes in the spot.| -|max_nb_genes| The longest RGP in number of genes of the spot.| -|min_nb_genes| The shortest RGP in number of genes of the spot.| +| Column | Description | +|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| spot | The spot identifier. It is unique in the pangenome. | +| nb_rgp | The number of RGPs present in the spot. | +| nb_families | The number of different gene families that are found in the spot. | +| nb_unique_family_sets | The number of RGPs with different gene family content. If two RGPs are identical, they will be counted only once. The difference between this number and the one provided in 'nb_rgp' can be a strong indicator on whether their is a high turnover in gene content in this area or not. | +| mean_nb_genes | The mean number of genes on RGPs in the spot. | +| stdev_nb_genes | The standard deviation of the number of genes in the spot. | +| max_nb_genes | The longest RGP in number of genes of the spot. | +| min_nb_genes | The shortest RGP in number of genes of the spot. | ### Borders -Each spot has at least one set of gene families bordering them. To write the list of gene families bordering spots, you can use the `--borders` option as follow: +Each spot has at least one set of gene families bordering them. To write the list of gene families bordering spots, you +can use the `--borders` option as follow: ```bash ppanggolin write_pangenome -p pangenome.h5 --borders -o rgp_outputs @@ -66,29 +77,35 @@ ppanggolin write_pangenome -p pangenome.h5 --borders -o rgp_outputs It will write a .tsv file with 4 columns: -|Column|Description| -|-------|------------| -|spot_id| The spot identifier. It is unique in the pangenome.| -|number| The number of RGPs present in the spot that have those bordering genes.| -|border1| Comma-separated list of gene families of the 1st border.| -|border2| Comma-separated list of gene families of the 2nd border.| +| Column | Description | +|---------|-------------------------------------------------------------------------| +| spot_id | The spot identifier. It is unique in the pangenome. | +| number | The number of RGPs present in the spot that have those bordering genes. | +| border1 | Comma-separated list of gene families of the 1st border. | +| border2 | Comma-separated list of gene families of the 2nd border. | -Since there can be some variation in the borders, some spots will have multiple borders and thus multiple lines in this file. +Since there can be some variation in the borders, some spots will have multiple borders and thus multiple lines in this +file. The sum of the number for each spot_id should be exactly the number of RGPs in the spot. -The flag `--borders` also creates a file call `border_protein_genes.fasta` that are the protein sequences of the gene family found in borders. +The flag `--borders` also creates a file call `border_protein_genes.fasta` that are the protein sequences of the gene +family found in borders. -In addition, the `--borders` option also generates a file named `border_protein_genes.fasta`, containing protein sequences corresponding to the gene families of the spot borders. +In addition, the `--borders` option also generates a file named `border_protein_genes.fasta`, containing protein +sequences corresponding to the gene families of the spot borders. ### Draw spots -The `draw` command with the option `--draw_spots` can draw specific spots of interest, whose ID are provided, or all the spots if you wish. -It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is basically a subgraph of the pangenome, consisting of the spot itself. +The `draw` command with the option `--draw_spots` can draw specific spots of interest, whose ID are provided, or all the +spots if you wish. +It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is +basically a subgraph of the pangenome, consisting of the spot itself. The command can be used as such: ```bash ppanggolin draw -p pangenome.h5 --draw_spots --spots all ``` + This command draws an interactive `.html` figure and a `.gexf` graph file for all the spots. If you are interested in only a single spot, you can use its identifier to draw it. For example for the `spot_34`: @@ -103,8 +120,13 @@ The interactive figures that are drawn look like this: :align: center ``` -The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called `spot_X_identical_rgps.tsv`, with X the spot_id. +The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene +organization, the organization is represented only once (the represented RGP is picked at random among all identical +RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure +called `spot_X_identical_rgps.tsv`, with X the spot_id. -They can be edited using the sliders and the radio buttons, to change various graphical parameters, and then the plot itself can be saved using the save button on the right of the screen, if need be. +They can be edited using the sliders and the radio buttons, to change various graphical parameters, and then the plot +itself can be saved using the save button on the right of the screen, if need be. -For the gexf file, you can see how to visualize it in the section about the [pangenome gexf](../PangenomeAnalyses/pangenomeGraphOut.md#pangenome-graph-output). +For the gexf file, you can see how to visualize it in the section about +the [pangenome gexf](../PangenomeAnalyses/pangenomeGraphOut.md#pangenome-graph-output). diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index dfac8013..63f71e47 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -840,15 +840,19 @@ def write_rgp_table(regions: Set[Region], "region": region.name, "genome": region.organism, "contig": region.contig, + "genes": len(region), + "first_gene": region.starter, + "last_gene": region.stopper, "start": region.start, "stop": region.stop, - "genes": len(region), + "length": region.length, + "coordinates": region.string_coordinates(), "contigBorder": region.is_contig_border, "wholeContig": region.is_whole_contig } - writer.writerow(row) + def spot2rgp(spots: set, output: Path, compress: bool = False): """Write a tsv file providing association between spot and rgp diff --git a/ppanggolin/region.py b/ppanggolin/region.py index d32e163c..c327a260 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -237,6 +237,12 @@ def coordinates(self) -> List[Tuple[int]]: self.identify_rgp_last_and_first_genes() return self._coordinates + def string_coordinates(self) -> str: + """ + Return a string representation of the coordinates + """ + return ','.join([f'{start}..{stop}' for start, stop in self.coordinates]) + @property def overlaps_contig_edge(self) -> bool: if self._overlaps_contig_edge is None: @@ -339,7 +345,7 @@ def length(self): :return: Size of the region """ - return sum([(stop - start +1) for start, stop in self.coordinates ]) + return sum([(stop - start +1) for start, stop in self.coordinates]) @property def organism(self) -> Organism: From 5d2de99505ed4f243edfc13490cc74aef26d6edb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 10 Jun 2024 15:07:54 +0200 Subject: [PATCH 06/14] Fix field name in regions output --- ppanggolin/formats/writeFlatPangenome.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index 63f71e47..a7d29b6e 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -826,8 +826,8 @@ def write_rgp_table(regions: Set[Region], """ fname = output / "regions_of_genomic_plasticity.tsv" with write_compressed_or_not(fname, compress) as tab: - fieldnames = ["region", "genome", "contig", "start", - "stop", "genes", "contigBorder", "wholeContig"] + fieldnames = ["region", "genome", "contig", "genes", "first_gene", "last_gene", + "start", "stop", "length", "coordinates", "contigBorder", "wholeContig"] writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t') writer.writeheader() From 6ddc9739cca8eb10a6c5aec29667c6a8847ae82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 10 Jun 2024 23:05:22 +0200 Subject: [PATCH 07/14] Add the protein sequence to gene family when reading clustering --- .github/workflows/main.yml | 4 +- .../PangenomeAnalyses/pangenomeCluster.md | 6 ++ ppanggolin/cluster/cluster.py | 56 +++++++++++++++---- ppanggolin/geneFamily.py | 7 +++ ppanggolin/workflow/all.py | 8 ++- 5 files changed, 64 insertions(+), 17 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 88088f66..2b722c0c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -120,9 +120,9 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS + ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS - ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS + ppanggolin cluster --clusters clusters.tsv --write_sequences -p readclusters/pangenome.h5 --cpu $NUM_CPUS ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS cd - - name: testing rgp_cluster command diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md index eca6a630..805be2d2 100644 --- a/docs/user/PangenomeAnalyses/pangenomeCluster.md +++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md @@ -55,6 +55,12 @@ You can do this from the command line: An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) +When you provide your clustering, by default, the pangenome will be without sequences for gene families. +PPanGGOLiN can get the protein sequence of each family and write it in the HDF5 file with the option `--write_sequences`. +The sequence can be important for some [outputs](./pangenomeAnalyses.md#pan-output). + + + ### Defragmentation Without performing additional steps, most cloud genes in the pangenome are fragments of 'shell' or 'persistent' genes. Therefore, they do not provide informative data on the pangenome's diversity. diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index aed0a4dc..5c7835df 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -19,7 +19,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir, mk_outdir +from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db @@ -61,7 +61,8 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: elif pangenome.status["geneSequences"] == "inFile": logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file") write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_", - compress=False, disable_bar=disable_bar) # write CDS sequences to the tmpFile + compress=False, + disable_bar=disable_bar) # write CDS sequences to the tmpFile else: raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " "the genes in gene families. Either provide clustering results (see --clusters), " @@ -286,7 +287,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path: - sequence_path = tmp_path/'nucleotide_sequences.fna' + sequence_path = tmp_path / 'nucleotide_sequences.fna' check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode) @@ -356,8 +357,32 @@ def infer_singletons(pangenome: Pangenome): logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families") -def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False, - disable_bar: bool = False): +def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1, + tmpdir: Path = None, keep_tmp: bool = False): + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir + with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp: + repres_path = tmp / "representative.fna" + with open(repres_path, "w") as repres_seq: + for family in pangenome.gene_families: + repres_seq.write(f">{family.name}\n") + repres_seq.write(f"{family.representative.dna}\n") + translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu, + is_single_line_fasta=True, code=code) + outpath = tmp / "representative_protein_genes.fna" + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) + run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n") + with open(outpath, "r") as repres_prot: + lines = repres_prot.readlines() + while len(lines) > 0: + family_name = lines.pop(0).strip()[1:] + family_seq = lines.pop(0).strip() + family = pangenome.get_gene_family(family_name) + family.add_sequence(family_seq) + + +def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, + write_sequences: bool = False, code: int = 11, cpu: int = 1, tmpdir: Path = None, + keep_tmp: bool = False, force: bool = False, disable_bar: bool = False): """ Get the pangenome information, the gene families and the genes with an associated gene family. Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome. @@ -369,7 +394,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet :param disable_bar: Allow to disable progress bar """ check_pangenome_former_clustering(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar) + check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=write_sequences, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size @@ -421,10 +446,14 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet f"You can either update your cluster file to ensure each gene has a cluster assignment, " f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene." ) + if write_sequences: + get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) pangenome.status["genesClustered"] = "Computed" if frag: # if there was fragment information in the file. pangenome.status["defragmented"] = "Computed" + if write_sequences: + pangenome.status["geneFamilySequences"] = "Computed" pangenome.parameters["cluster"] = {} pangenome.parameters["cluster"]["# read_clustering_from_file"] = True pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton @@ -450,7 +479,8 @@ def launch(args: argparse.Namespace): if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, args.coverage, args.identity, args.mode]: logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.") - read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar) + read_clustering(pangenome, args.clusters, args.infer_singletons, args.write_sequences, args.translation_table, + args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar) logging.getLogger("PPanGGOLiN").info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) @@ -488,12 +518,8 @@ def parser_clust(parser: argparse.ArgumentParser): clust.add_argument('--no_defrag', required=False, default=False, action="store_true", help="DO NOT Use the defragmentation strategy to link potential fragments " "with their original gene family.") - clust.add_argument("--translation_table", required=False, default="11", - help="Translation table (genetic code) to use.") # clust.add_argument("--compress") - clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - read = parser.add_argument_group(title="Read clustering arguments") read.add_argument('--clusters', required=False, type=Path, help="A tab-separated list containing the result of a clustering. One line per gene. " @@ -501,8 +527,14 @@ def parser_clust(parser: argparse.ArgumentParser): read.add_argument("--infer_singletons", required=False, action="store_true", help="When reading a clustering result with --clusters, if a gene is not in the provided file" " it will be placed in a cluster where the gene is the only member.") + read.add_argument("--write_sequences", action="store_true", + help="Get the protein sequence of the representative gene of each gene family " + "and write it in the pangenome file.") optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--translation_table", required=False, default="11", + help="Translation table (genetic code) to use.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index b856fd6f..708a3dd6 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -188,6 +188,13 @@ def remove(self, identifier): del self[identifier] + @property + def representative(self) -> Gene: + """Get the representative gene of the family + :return: The representative gene of the family + """ + return self.get(self.name) + def contains_gene_id(self, identifier): """ Check if the family contains already a gene id diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 702efe40..436bf3ba 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -63,8 +63,10 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, if args.clusters is not None: start_clust = time.time() - read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar, - infer_singleton=args.cluster.infer_singletons) + read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons, + write_sequences=True, code=args.cluster.translation_table, cpu=args.cluster.cpu, + tmpdir=args.tmpdir, keep_tmp=args.cluster.keep_tmp, + force=args.force, disable_bar=args.disable_prog_bar) else: # args.cluster is None if pangenome.status["geneSequences"] == "No": if args.fasta is None: @@ -78,7 +80,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, disable_bar=args.disable_prog_bar, defrag=not args.cluster.no_defrag, code=args.cluster.translation_table, coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode, - keep_tmp_files=True) + keep_tmp_files=args.cluster.keep_tmp) clust_time = time.time() - start_clust elif args.fasta is not None: From 9e62cd2ca56769a2eafebc879b8dde3e8a7fef54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 11:19:47 +0200 Subject: [PATCH 08/14] Add a warning message if gene families are without sequences --- ppanggolin/formats/writeFlatPangenome.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index dfac8013..259b6bb0 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -1143,7 +1143,12 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, if regions: processes.append(p.apply_async(func=write_regions, args=(output, compress))) if borders: - processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) + if pangenome.status["geneFamilySequences"] == "No": + logging.getLogger("PPanGGOLiN").warning("Gene families were not associated with protein sequences. " + "This may be due to the use of external clustering. " + "Please refer to the documentation or submit an issue.") + else: + processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) if modules: processes.append(p.apply_async(func=write_modules, args=(output, compress))) processes.append(p.apply_async(func=write_module_summary, args=(output, compress))) From 86569f1f6c67268e6f685b6b9b954693c265d5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 17:46:09 +0200 Subject: [PATCH 09/14] Add docstring --- ppanggolin/utils.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 9f879d5c..9ace2282 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -583,8 +583,8 @@ def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): return args -def get_args_that_different_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, - param_to_ignore: Union[List[str], Set[str]] = None) -> dict: +def get_args_differing_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, + param_to_ignore: Union[List[str], Set[str]] = None) -> dict: """ Get the parameters that have different value than default values. @@ -662,7 +662,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # cli > config > default args = overwrite_args(default_args, config_args, cli_args) - params_that_differ = get_args_that_different_from_default(default_args, args, input_params) + params_that_differ = get_args_differing_from_default(default_args, args, input_params) if params_that_differ: params_that_differ_str = ', '.join([f'{p}={v}' for p, v in params_that_differ.items()]) @@ -703,7 +703,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ step_args = overwrite_args(default_step_args, config_step_args, cli_args) - step_params_that_differ = get_args_that_different_from_default(default_step_args, step_args) + step_params_that_differ = get_args_differing_from_default(default_step_args, step_args) if step_params_that_differ: step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) @@ -1186,6 +1186,16 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess failed with the following error:\n"): + """Run a subprocess command and write the output to the given path. + + :param cmd: list of program arguments + :param output: path to write the subprocess output + :param msg: message to print if the subprocess fails + + :return: + + :raises subprocess.CalledProcessError: raise when the subprocess return a non-zero exit code + """ logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) try: result = subprocess.run(cmd, check=True, capture_output=True, text=True) @@ -1196,4 +1206,4 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f else: if output is not None: with open(output, 'w') as fout: - fout.write(result.stdout) \ No newline at end of file + fout.write(result.stdout) From 96653a9694a3972d428a2e78ef916a7f65ae43c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 18:00:25 +0200 Subject: [PATCH 10/14] Translate representative gene when read clustering everytime --- docs/user/PangenomeAnalyses/pangenomeCluster.md | 7 +++---- ppanggolin/cluster/cluster.py | 16 ++++++++-------- ppanggolin/formats/writeFlatPangenome.py | 7 +------ ppanggolin/workflow/all.py | 5 ++--- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md index 805be2d2..53c118ae 100644 --- a/docs/user/PangenomeAnalyses/pangenomeCluster.md +++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md @@ -54,10 +54,9 @@ You can do this from the command line: An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) - -When you provide your clustering, by default, the pangenome will be without sequences for gene families. -PPanGGOLiN can get the protein sequence of each family and write it in the HDF5 file with the option `--write_sequences`. -The sequence can be important for some [outputs](./pangenomeAnalyses.md#pan-output). +```{note} +When you provide your clustering, *PPanGGOLiN* will translate the representative gene sequence of each family and write it in the HDF5 file. +``` diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 5c7835df..1d7a779a 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -381,8 +381,8 @@ def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cp def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, - write_sequences: bool = False, code: int = 11, cpu: int = 1, tmpdir: Path = None, - keep_tmp: bool = False, force: bool = False, disable_bar: bool = False): + code: int = 11, cpu: int = 1, tmpdir: Path = None, keep_tmp: bool = False, + force: bool = False, disable_bar: bool = False): """ Get the pangenome information, the gene families and the genes with an associated gene family. Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome. @@ -390,6 +390,10 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet :param pangenome: Input Pangenome :param families_tsv_file: MMseqs2 clustering results :param infer_singleton: creates a new family for each gene with no associated family + :param code: Genetic code used for sequence translation. + :param cpu: Number of CPU cores to use for clustering. + :param tmpdir: Path to a temporary directory for intermediate files. + :param keep_tmp: Keep temporary files (useful for debugging). :param force: force to write in the pangenome :param disable_bar: Allow to disable progress bar """ @@ -446,8 +450,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet f"You can either update your cluster file to ensure each gene has a cluster assignment, " f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene." ) - if write_sequences: - get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) + get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) pangenome.status["genesClustered"] = "Computed" if frag: # if there was fragment information in the file. @@ -479,7 +482,7 @@ def launch(args: argparse.Namespace): if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, args.coverage, args.identity, args.mode]: logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.") - read_clustering(pangenome, args.clusters, args.infer_singletons, args.write_sequences, args.translation_table, + read_clustering(pangenome, args.clusters, args.infer_singletons, args.translation_table, args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar) logging.getLogger("PPanGGOLiN").info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) @@ -527,9 +530,6 @@ def parser_clust(parser: argparse.ArgumentParser): read.add_argument("--infer_singletons", required=False, action="store_true", help="When reading a clustering result with --clusters, if a gene is not in the provided file" " it will be placed in a cluster where the gene is the only member.") - read.add_argument("--write_sequences", action="store_true", - help="Get the protein sequence of the representative gene of each gene family " - "and write it in the pangenome file.") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index 259b6bb0..dfac8013 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -1143,12 +1143,7 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, if regions: processes.append(p.apply_async(func=write_regions, args=(output, compress))) if borders: - if pangenome.status["geneFamilySequences"] == "No": - logging.getLogger("PPanGGOLiN").warning("Gene families were not associated with protein sequences. " - "This may be due to the use of external clustering. " - "Please refer to the documentation or submit an issue.") - else: - processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) + processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) if modules: processes.append(p.apply_async(func=write_modules, args=(output, compress))) processes.append(p.apply_async(func=write_module_summary, args=(output, compress))) diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 436bf3ba..dec41fd8 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -64,9 +64,8 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, if args.clusters is not None: start_clust = time.time() read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons, - write_sequences=True, code=args.cluster.translation_table, cpu=args.cluster.cpu, - tmpdir=args.tmpdir, keep_tmp=args.cluster.keep_tmp, - force=args.force, disable_bar=args.disable_prog_bar) + code=args.cluster.translation_table, cpu=args.cluster.cpu, tmpdir=args.tmpdir, + keep_tmp=args.cluster.keep_tmp, force=args.force, disable_bar=args.disable_prog_bar) else: # args.cluster is None if pangenome.status["geneSequences"] == "No": if args.fasta is None: From b9a542c39bfd7f486f48de4fac8c8e5f84770557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 18:27:23 +0200 Subject: [PATCH 11/14] get_genes return one gene if begin==end --- ppanggolin/genome.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 504197fe..87e2ef46 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -591,7 +591,7 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) raise TypeError(f"Expected type int for 'begin' and 'end', " f"but received types '{type(begin)}' and '{type(end)}'.") - if begin >= end: + if begin > end: raise ValueError("The 'begin' position must be less than the 'end' position.") if end > self._genes_position[-1].position: @@ -603,7 +603,10 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) if end == self._genes_position[-1].position: return self._genes_position[begin:] else: - return self._genes_position[begin: end] + if begin == end: + return self._genes_position[begin] + else: + return self._genes_position[begin: end] @property def number_of_genes(self) -> int: From edb09e623579066234a3ccd52e10f5f1128cc943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 19:38:25 +0200 Subject: [PATCH 12/14] remove deprecated option --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2b722c0c..2d9e9106 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -122,7 +122,7 @@ jobs: cd testingDataset ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS - ppanggolin cluster --clusters clusters.tsv --write_sequences -p readclusters/pangenome.h5 --cpu $NUM_CPUS + ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS cd - - name: testing rgp_cluster command From ed629cfe98f5bcb5bcc5574cd85397ad6ab5bd7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 22:27:59 +0200 Subject: [PATCH 13/14] fix remaining deprecated args --- ppanggolin/cluster/cluster.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index c30cac84..ab1046b9 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -398,7 +398,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet :param disable_bar: Allow to disable progress bar """ check_pangenome_former_clustering(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=write_sequences, disable_bar=disable_bar) + check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=True, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size @@ -455,8 +455,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet pangenome.status["genesClustered"] = "Computed" if frag: # if there was fragment information in the file. pangenome.status["defragmented"] = "Computed" - if write_sequences: - pangenome.status["geneFamilySequences"] = "Computed" + pangenome.status["geneFamilySequences"] = "Computed" pangenome.parameters["cluster"] = {} pangenome.parameters["cluster"]["# read_clustering_from_file"] = True pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton From 7054f63dc482bed860a5b9187068a27b69b0c3d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 12 Jun 2024 13:53:50 +0200 Subject: [PATCH 14/14] Fix help message --- ppanggolin/utils.py | 2 +- ppanggolin/workflow/all.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index f9d9e839..d6f1259c 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -442,7 +442,7 @@ def check_option_workflow(args): raise Exception("At least one of --fasta or --anno must be given") if args.infer_singletons and args.clusters is None: - logging.getLogger("PPanGGOLiN").warning("--infer_singleton works only with --clusters given.") + logging.getLogger("PPanGGOLiN").warning("--infer_singleton works only with --clusters.") def parse_config_file(yaml_config_file: str) -> dict: diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 3b2bc937..e884f0bf 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -365,8 +365,9 @@ def add_workflow_args(parser: argparse.ArgumentParser): help="Minimal identity percent for two proteins to be in the same cluster") optional.add_argument("--infer_singletons", required=False, action="store_true", - help="When reading a clustering result with --clusters," - " if a gene is not in the provided file") + help="Use this option together with --clusters. " + "If a gene is not present in the provided clustering result file, " + "it will be assigned to its own unique cluster as a singleton.") optional.add_argument("-K", "--nb_of_partitions", required=False, default=-1, type=int, help="Number of partitions to use. Must be at least 2. If under 2, "