diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 88088f66..2d9e9106 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -120,7 +120,7 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS + ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md index eca6a630..53c118ae 100644 --- a/docs/user/PangenomeAnalyses/pangenomeCluster.md +++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md @@ -54,6 +54,11 @@ You can do this from the command line: An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) +```{note} +When you provide your clustering, *PPanGGOLiN* will translate the representative gene sequence of each family and write it in the HDF5 file. +``` + + ### Defragmentation diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index b34995b4..ab1046b9 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -19,7 +19,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir, mk_outdir +from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db @@ -61,7 +61,8 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: elif pangenome.status["geneSequences"] == "inFile": logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file") write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_", - compress=False, disable_bar=disable_bar) # write CDS sequences to the tmpFile + compress=False, + disable_bar=disable_bar) # write CDS sequences to the tmpFile else: raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " "the genes in gene families. Either provide clustering results (see --clusters), " @@ -286,7 +287,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path: - sequence_path = tmp_path/'nucleotide_sequences.fna' + sequence_path = tmp_path / 'nucleotide_sequences.fna' check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode) @@ -356,8 +357,32 @@ def infer_singletons(pangenome: Pangenome): logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families") -def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False, - disable_bar: bool = False): +def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1, + tmpdir: Path = None, keep_tmp: bool = False): + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir + with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp: + repres_path = tmp / "representative.fna" + with open(repres_path, "w") as repres_seq: + for family in pangenome.gene_families: + repres_seq.write(f">{family.name}\n") + repres_seq.write(f"{family.representative.dna}\n") + translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu, + is_single_line_fasta=True, code=code) + outpath = tmp / "representative_protein_genes.fna" + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) + run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n") + with open(outpath, "r") as repres_prot: + lines = repres_prot.readlines() + while len(lines) > 0: + family_name = lines.pop(0).strip()[1:] + family_seq = lines.pop(0).strip() + family = pangenome.get_gene_family(family_name) + family.add_sequence(family_seq) + + +def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, + code: int = 11, cpu: int = 1, tmpdir: Path = None, keep_tmp: bool = False, + force: bool = False, disable_bar: bool = False): """ Get the pangenome information, the gene families and the genes with an associated gene family. Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome. @@ -365,11 +390,15 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet :param pangenome: Input Pangenome :param families_tsv_file: MMseqs2 clustering results :param infer_singleton: creates a new family for each gene with no associated family + :param code: Genetic code used for sequence translation. + :param cpu: Number of CPU cores to use for clustering. + :param tmpdir: Path to a temporary directory for intermediate files. + :param keep_tmp: Keep temporary files (useful for debugging). :param force: force to write in the pangenome :param disable_bar: Allow to disable progress bar """ check_pangenome_former_clustering(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar) + check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=True, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size @@ -421,10 +450,12 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet f"You can either update your cluster file to ensure each gene has a cluster assignment, " f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene." ) + get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) pangenome.status["genesClustered"] = "Computed" if frag: # if there was fragment information in the file. pangenome.status["defragmented"] = "Computed" + pangenome.status["geneFamilySequences"] = "Computed" pangenome.parameters["cluster"] = {} pangenome.parameters["cluster"]["# read_clustering_from_file"] = True pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton @@ -450,7 +481,8 @@ def launch(args: argparse.Namespace): if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, args.coverage, args.identity, args.mode]: logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.") - read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar) + read_clustering(pangenome, args.clusters, args.infer_singletons, args.translation_table, + args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar) logging.getLogger("PPanGGOLiN").info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) @@ -488,10 +520,6 @@ def parser_clust(parser: argparse.ArgumentParser): clust.add_argument('--no_defrag', required=False, default=False, action="store_true", help="DO NOT Use the defragmentation strategy to link potential fragments " "with their original gene family.") - clust.add_argument("--translation_table", required=False, default="11", - help="Translation table (genetic code) to use.") - - clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") read = parser.add_argument_group(title="Read clustering arguments") read.add_argument('--clusters', required=False, type=Path, @@ -501,7 +529,10 @@ def parser_clust(parser: argparse.ArgumentParser): help="When reading a clustering result with --clusters, if a gene is not in the provided file" " it will be placed in a cluster where the gene is the only member.") optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--translation_table", required=False, default="11", + help="Translation table (genetic code) to use.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index b856fd6f..708a3dd6 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -188,6 +188,13 @@ def remove(self, identifier): del self[identifier] + @property + def representative(self) -> Gene: + """Get the representative gene of the family + :return: The representative gene of the family + """ + return self.get(self.name) + def contains_gene_id(self, identifier): """ Check if the family contains already a gene id diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 504197fe..87e2ef46 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -591,7 +591,7 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) raise TypeError(f"Expected type int for 'begin' and 'end', " f"but received types '{type(begin)}' and '{type(end)}'.") - if begin >= end: + if begin > end: raise ValueError("The 'begin' position must be less than the 'end' position.") if end > self._genes_position[-1].position: @@ -603,7 +603,10 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) if end == self._genes_position[-1].position: return self._genes_position[begin:] else: - return self._genes_position[begin: end] + if begin == end: + return self._genes_position[begin] + else: + return self._genes_position[begin: end] @property def number_of_genes(self) -> int: diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 1784f0f6..f9d9e839 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -586,8 +586,8 @@ def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): return args -def get_args_that_different_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, - param_to_ignore: Union[List[str], Set[str]] = None) -> dict: +def get_args_differing_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, + param_to_ignore: Union[List[str], Set[str]] = None) -> dict: """ Get the parameters that have different value than default values. @@ -665,7 +665,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # cli > config > default args = overwrite_args(default_args, config_args, cli_args) - params_that_differ = get_args_that_different_from_default(default_args, args, input_params) + params_that_differ = get_args_differing_from_default(default_args, args, input_params) if params_that_differ: params_that_differ_str = ', '.join([f'{p}={v}' for p, v in params_that_differ.items()]) @@ -706,7 +706,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ step_args = overwrite_args(default_step_args, config_step_args, cli_args) - step_params_that_differ = get_args_that_different_from_default(default_step_args, step_args) + step_params_that_differ = get_args_differing_from_default(default_step_args, step_args) if step_params_that_differ: step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) @@ -1189,6 +1189,16 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess failed with the following error:\n"): + """Run a subprocess command and write the output to the given path. + + :param cmd: list of program arguments + :param output: path to write the subprocess output + :param msg: message to print if the subprocess fails + + :return: + + :raises subprocess.CalledProcessError: raise when the subprocess return a non-zero exit code + """ logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) try: result = subprocess.run(cmd, check=True, capture_output=True, text=True) @@ -1199,4 +1209,4 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f else: if output is not None: with open(output, 'w') as fout: - fout.write(result.stdout) \ No newline at end of file + fout.write(result.stdout) diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index da91697b..3b2bc937 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -64,9 +64,9 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, if args.clusters is not None: start_clust = time.time() - print(args.cluster) - read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar, - infer_singleton=args.cluster.infer_singletons) + read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons, + code=args.cluster.translation_table, cpu=args.cluster.cpu, tmpdir=args.tmpdir, + keep_tmp=args.cluster.keep_tmp, force=args.force, disable_bar=args.disable_prog_bar) else: # args.cluster is None if pangenome.status["geneSequences"] == "No": if args.fasta is None: @@ -80,7 +80,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, disable_bar=args.disable_prog_bar, defrag=not args.cluster.no_defrag, code=args.cluster.translation_table, coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode, - keep_tmp_files=True) + keep_tmp_files=args.cluster.keep_tmp) clust_time = time.time() - start_clust elif args.fasta is not None: