From 6ddc9739cca8eb10a6c5aec29667c6a8847ae82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 10 Jun 2024 23:05:22 +0200 Subject: [PATCH 1/7] Add the protein sequence to gene family when reading clustering --- .github/workflows/main.yml | 4 +- .../PangenomeAnalyses/pangenomeCluster.md | 6 ++ ppanggolin/cluster/cluster.py | 56 +++++++++++++++---- ppanggolin/geneFamily.py | 7 +++ ppanggolin/workflow/all.py | 8 ++- 5 files changed, 64 insertions(+), 17 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 88088f66..2b722c0c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -120,9 +120,9 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS + ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS - ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS + ppanggolin cluster --clusters clusters.tsv --write_sequences -p readclusters/pangenome.h5 --cpu $NUM_CPUS ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS cd - - name: testing rgp_cluster command diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md index eca6a630..805be2d2 100644 --- a/docs/user/PangenomeAnalyses/pangenomeCluster.md +++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md @@ -55,6 +55,12 @@ You can do this from the command line: An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) +When you provide your clustering, by default, the pangenome will be without sequences for gene families. +PPanGGOLiN can get the protein sequence of each family and write it in the HDF5 file with the option `--write_sequences`. +The sequence can be important for some [outputs](./pangenomeAnalyses.md#pan-output). + + + ### Defragmentation Without performing additional steps, most cloud genes in the pangenome are fragments of 'shell' or 'persistent' genes. Therefore, they do not provide informative data on the pangenome's diversity. diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index aed0a4dc..5c7835df 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -19,7 +19,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir, mk_outdir +from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db @@ -61,7 +61,8 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: elif pangenome.status["geneSequences"] == "inFile": logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file") write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_", - compress=False, disable_bar=disable_bar) # write CDS sequences to the tmpFile + compress=False, + disable_bar=disable_bar) # write CDS sequences to the tmpFile else: raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " "the genes in gene families. Either provide clustering results (see --clusters), " @@ -286,7 +287,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path: - sequence_path = tmp_path/'nucleotide_sequences.fna' + sequence_path = tmp_path / 'nucleotide_sequences.fna' check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode) @@ -356,8 +357,32 @@ def infer_singletons(pangenome: Pangenome): logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families") -def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False, - disable_bar: bool = False): +def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1, + tmpdir: Path = None, keep_tmp: bool = False): + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir + with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp: + repres_path = tmp / "representative.fna" + with open(repres_path, "w") as repres_seq: + for family in pangenome.gene_families: + repres_seq.write(f">{family.name}\n") + repres_seq.write(f"{family.representative.dna}\n") + translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu, + is_single_line_fasta=True, code=code) + outpath = tmp / "representative_protein_genes.fna" + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) + run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n") + with open(outpath, "r") as repres_prot: + lines = repres_prot.readlines() + while len(lines) > 0: + family_name = lines.pop(0).strip()[1:] + family_seq = lines.pop(0).strip() + family = pangenome.get_gene_family(family_name) + family.add_sequence(family_seq) + + +def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, + write_sequences: bool = False, code: int = 11, cpu: int = 1, tmpdir: Path = None, + keep_tmp: bool = False, force: bool = False, disable_bar: bool = False): """ Get the pangenome information, the gene families and the genes with an associated gene family. Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome. @@ -369,7 +394,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet :param disable_bar: Allow to disable progress bar """ check_pangenome_former_clustering(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar) + check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=write_sequences, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size @@ -421,10 +446,14 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet f"You can either update your cluster file to ensure each gene has a cluster assignment, " f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene." ) + if write_sequences: + get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) pangenome.status["genesClustered"] = "Computed" if frag: # if there was fragment information in the file. pangenome.status["defragmented"] = "Computed" + if write_sequences: + pangenome.status["geneFamilySequences"] = "Computed" pangenome.parameters["cluster"] = {} pangenome.parameters["cluster"]["# read_clustering_from_file"] = True pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton @@ -450,7 +479,8 @@ def launch(args: argparse.Namespace): if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, args.coverage, args.identity, args.mode]: logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.") - read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar) + read_clustering(pangenome, args.clusters, args.infer_singletons, args.write_sequences, args.translation_table, + args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar) logging.getLogger("PPanGGOLiN").info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) @@ -488,12 +518,8 @@ def parser_clust(parser: argparse.ArgumentParser): clust.add_argument('--no_defrag', required=False, default=False, action="store_true", help="DO NOT Use the defragmentation strategy to link potential fragments " "with their original gene family.") - clust.add_argument("--translation_table", required=False, default="11", - help="Translation table (genetic code) to use.") # clust.add_argument("--compress") - clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - read = parser.add_argument_group(title="Read clustering arguments") read.add_argument('--clusters', required=False, type=Path, help="A tab-separated list containing the result of a clustering. One line per gene. " @@ -501,8 +527,14 @@ def parser_clust(parser: argparse.ArgumentParser): read.add_argument("--infer_singletons", required=False, action="store_true", help="When reading a clustering result with --clusters, if a gene is not in the provided file" " it will be placed in a cluster where the gene is the only member.") + read.add_argument("--write_sequences", action="store_true", + help="Get the protein sequence of the representative gene of each gene family " + "and write it in the pangenome file.") optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--translation_table", required=False, default="11", + help="Translation table (genetic code) to use.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index b856fd6f..708a3dd6 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -188,6 +188,13 @@ def remove(self, identifier): del self[identifier] + @property + def representative(self) -> Gene: + """Get the representative gene of the family + :return: The representative gene of the family + """ + return self.get(self.name) + def contains_gene_id(self, identifier): """ Check if the family contains already a gene id diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 702efe40..436bf3ba 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -63,8 +63,10 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, if args.clusters is not None: start_clust = time.time() - read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar, - infer_singleton=args.cluster.infer_singletons) + read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons, + write_sequences=True, code=args.cluster.translation_table, cpu=args.cluster.cpu, + tmpdir=args.tmpdir, keep_tmp=args.cluster.keep_tmp, + force=args.force, disable_bar=args.disable_prog_bar) else: # args.cluster is None if pangenome.status["geneSequences"] == "No": if args.fasta is None: @@ -78,7 +80,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, disable_bar=args.disable_prog_bar, defrag=not args.cluster.no_defrag, code=args.cluster.translation_table, coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode, - keep_tmp_files=True) + keep_tmp_files=args.cluster.keep_tmp) clust_time = time.time() - start_clust elif args.fasta is not None: From 9e62cd2ca56769a2eafebc879b8dde3e8a7fef54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 11:19:47 +0200 Subject: [PATCH 2/7] Add a warning message if gene families are without sequences --- ppanggolin/formats/writeFlatPangenome.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index dfac8013..259b6bb0 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -1143,7 +1143,12 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, if regions: processes.append(p.apply_async(func=write_regions, args=(output, compress))) if borders: - processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) + if pangenome.status["geneFamilySequences"] == "No": + logging.getLogger("PPanGGOLiN").warning("Gene families were not associated with protein sequences. " + "This may be due to the use of external clustering. " + "Please refer to the documentation or submit an issue.") + else: + processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) if modules: processes.append(p.apply_async(func=write_modules, args=(output, compress))) processes.append(p.apply_async(func=write_module_summary, args=(output, compress))) From 86569f1f6c67268e6f685b6b9b954693c265d5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 17:46:09 +0200 Subject: [PATCH 3/7] Add docstring --- ppanggolin/utils.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 9f879d5c..9ace2282 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -583,8 +583,8 @@ def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): return args -def get_args_that_different_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, - param_to_ignore: Union[List[str], Set[str]] = None) -> dict: +def get_args_differing_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, + param_to_ignore: Union[List[str], Set[str]] = None) -> dict: """ Get the parameters that have different value than default values. @@ -662,7 +662,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # cli > config > default args = overwrite_args(default_args, config_args, cli_args) - params_that_differ = get_args_that_different_from_default(default_args, args, input_params) + params_that_differ = get_args_differing_from_default(default_args, args, input_params) if params_that_differ: params_that_differ_str = ', '.join([f'{p}={v}' for p, v in params_that_differ.items()]) @@ -703,7 +703,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ step_args = overwrite_args(default_step_args, config_step_args, cli_args) - step_params_that_differ = get_args_that_different_from_default(default_step_args, step_args) + step_params_that_differ = get_args_differing_from_default(default_step_args, step_args) if step_params_that_differ: step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) @@ -1186,6 +1186,16 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess failed with the following error:\n"): + """Run a subprocess command and write the output to the given path. + + :param cmd: list of program arguments + :param output: path to write the subprocess output + :param msg: message to print if the subprocess fails + + :return: + + :raises subprocess.CalledProcessError: raise when the subprocess return a non-zero exit code + """ logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) try: result = subprocess.run(cmd, check=True, capture_output=True, text=True) @@ -1196,4 +1206,4 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f else: if output is not None: with open(output, 'w') as fout: - fout.write(result.stdout) \ No newline at end of file + fout.write(result.stdout) From 96653a9694a3972d428a2e78ef916a7f65ae43c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 18:00:25 +0200 Subject: [PATCH 4/7] Translate representative gene when read clustering everytime --- docs/user/PangenomeAnalyses/pangenomeCluster.md | 7 +++---- ppanggolin/cluster/cluster.py | 16 ++++++++-------- ppanggolin/formats/writeFlatPangenome.py | 7 +------ ppanggolin/workflow/all.py | 5 ++--- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md index 805be2d2..53c118ae 100644 --- a/docs/user/PangenomeAnalyses/pangenomeCluster.md +++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md @@ -54,10 +54,9 @@ You can do this from the command line: An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) - -When you provide your clustering, by default, the pangenome will be without sequences for gene families. -PPanGGOLiN can get the protein sequence of each family and write it in the HDF5 file with the option `--write_sequences`. -The sequence can be important for some [outputs](./pangenomeAnalyses.md#pan-output). +```{note} +When you provide your clustering, *PPanGGOLiN* will translate the representative gene sequence of each family and write it in the HDF5 file. +``` diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 5c7835df..1d7a779a 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -381,8 +381,8 @@ def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cp def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, - write_sequences: bool = False, code: int = 11, cpu: int = 1, tmpdir: Path = None, - keep_tmp: bool = False, force: bool = False, disable_bar: bool = False): + code: int = 11, cpu: int = 1, tmpdir: Path = None, keep_tmp: bool = False, + force: bool = False, disable_bar: bool = False): """ Get the pangenome information, the gene families and the genes with an associated gene family. Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome. @@ -390,6 +390,10 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet :param pangenome: Input Pangenome :param families_tsv_file: MMseqs2 clustering results :param infer_singleton: creates a new family for each gene with no associated family + :param code: Genetic code used for sequence translation. + :param cpu: Number of CPU cores to use for clustering. + :param tmpdir: Path to a temporary directory for intermediate files. + :param keep_tmp: Keep temporary files (useful for debugging). :param force: force to write in the pangenome :param disable_bar: Allow to disable progress bar """ @@ -446,8 +450,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet f"You can either update your cluster file to ensure each gene has a cluster assignment, " f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene." ) - if write_sequences: - get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) + get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) pangenome.status["genesClustered"] = "Computed" if frag: # if there was fragment information in the file. @@ -479,7 +482,7 @@ def launch(args: argparse.Namespace): if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, args.coverage, args.identity, args.mode]: logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.") - read_clustering(pangenome, args.clusters, args.infer_singletons, args.write_sequences, args.translation_table, + read_clustering(pangenome, args.clusters, args.infer_singletons, args.translation_table, args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar) logging.getLogger("PPanGGOLiN").info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) @@ -527,9 +530,6 @@ def parser_clust(parser: argparse.ArgumentParser): read.add_argument("--infer_singletons", required=False, action="store_true", help="When reading a clustering result with --clusters, if a gene is not in the provided file" " it will be placed in a cluster where the gene is the only member.") - read.add_argument("--write_sequences", action="store_true", - help="Get the protein sequence of the representative gene of each gene family " - "and write it in the pangenome file.") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index 259b6bb0..dfac8013 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -1143,12 +1143,7 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, if regions: processes.append(p.apply_async(func=write_regions, args=(output, compress))) if borders: - if pangenome.status["geneFamilySequences"] == "No": - logging.getLogger("PPanGGOLiN").warning("Gene families were not associated with protein sequences. " - "This may be due to the use of external clustering. " - "Please refer to the documentation or submit an issue.") - else: - processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) + processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress))) if modules: processes.append(p.apply_async(func=write_modules, args=(output, compress))) processes.append(p.apply_async(func=write_module_summary, args=(output, compress))) diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 436bf3ba..dec41fd8 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -64,9 +64,8 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, if args.clusters is not None: start_clust = time.time() read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons, - write_sequences=True, code=args.cluster.translation_table, cpu=args.cluster.cpu, - tmpdir=args.tmpdir, keep_tmp=args.cluster.keep_tmp, - force=args.force, disable_bar=args.disable_prog_bar) + code=args.cluster.translation_table, cpu=args.cluster.cpu, tmpdir=args.tmpdir, + keep_tmp=args.cluster.keep_tmp, force=args.force, disable_bar=args.disable_prog_bar) else: # args.cluster is None if pangenome.status["geneSequences"] == "No": if args.fasta is None: From b9a542c39bfd7f486f48de4fac8c8e5f84770557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 18:27:23 +0200 Subject: [PATCH 5/7] get_genes return one gene if begin==end --- ppanggolin/genome.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 504197fe..87e2ef46 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -591,7 +591,7 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) raise TypeError(f"Expected type int for 'begin' and 'end', " f"but received types '{type(begin)}' and '{type(end)}'.") - if begin >= end: + if begin > end: raise ValueError("The 'begin' position must be less than the 'end' position.") if end > self._genes_position[-1].position: @@ -603,7 +603,10 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) if end == self._genes_position[-1].position: return self._genes_position[begin:] else: - return self._genes_position[begin: end] + if begin == end: + return self._genes_position[begin] + else: + return self._genes_position[begin: end] @property def number_of_genes(self) -> int: From edb09e623579066234a3ccd52e10f5f1128cc943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 19:38:25 +0200 Subject: [PATCH 6/7] remove deprecated option --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2b722c0c..2d9e9106 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -122,7 +122,7 @@ jobs: cd testingDataset ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS - ppanggolin cluster --clusters clusters.tsv --write_sequences -p readclusters/pangenome.h5 --cpu $NUM_CPUS + ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS cd - - name: testing rgp_cluster command From ed629cfe98f5bcb5bcc5574cd85397ad6ab5bd7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 11 Jun 2024 22:27:59 +0200 Subject: [PATCH 7/7] fix remaining deprecated args --- ppanggolin/cluster/cluster.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index c30cac84..ab1046b9 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -398,7 +398,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet :param disable_bar: Allow to disable progress bar """ check_pangenome_former_clustering(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=write_sequences, disable_bar=disable_bar) + check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=True, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size @@ -455,8 +455,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet pangenome.status["genesClustered"] = "Computed" if frag: # if there was fragment information in the file. pangenome.status["defragmented"] = "Computed" - if write_sequences: - pangenome.status["geneFamilySequences"] = "Computed" + pangenome.status["geneFamilySequences"] = "Computed" pangenome.parameters["cluster"] = {} pangenome.parameters["cluster"]["# read_clustering_from_file"] = True pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton