From 6ddc9739cca8eb10a6c5aec29667c6a8847ae82a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Mon, 10 Jun 2024 23:05:22 +0200
Subject: [PATCH 1/7] Add the protein sequence to gene family when reading
 clustering

---
 .github/workflows/main.yml                    |  4 +-
 .../PangenomeAnalyses/pangenomeCluster.md     |  6 ++
 ppanggolin/cluster/cluster.py                 | 56 +++++++++++++++----
 ppanggolin/geneFamily.py                      |  7 +++
 ppanggolin/workflow/all.py                    |  8 ++-
 5 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 88088f66..2b722c0c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -120,9 +120,9 @@ jobs:
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS
+        ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang  --cpu $NUM_CPUS 
         ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS
-        ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS
+        ppanggolin cluster --clusters clusters.tsv --write_sequences -p readclusters/pangenome.h5 --cpu $NUM_CPUS
         ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS
         cd -
     - name: testing rgp_cluster command
diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md
index eca6a630..805be2d2 100644
--- a/docs/user/PangenomeAnalyses/pangenomeCluster.md
+++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md
@@ -55,6 +55,12 @@ You can do this from the command line:
 An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv)
 
 
+When you provide your clustering, by default, the pangenome will be without sequences for gene families. 
+PPanGGOLiN can get the protein sequence of each family and write it in the HDF5 file with the option `--write_sequences`.
+The sequence can be important for some [outputs](./pangenomeAnalyses.md#pan-output).
+
+
+
 ### Defragmentation
 
 Without performing additional steps, most cloud genes in the pangenome are fragments of 'shell' or 'persistent' genes. Therefore, they do not provide informative data on the pangenome's diversity. 
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index aed0a4dc..5c7835df 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -19,7 +19,7 @@
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.genome import Gene
 from ppanggolin.geneFamily import GeneFamily
-from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir, mk_outdir
+from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir
 from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome
 from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file
 from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db
@@ -61,7 +61,8 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force:
     elif pangenome.status["geneSequences"] == "inFile":
         logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file")
         write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_",
-                                                 compress=False, disable_bar=disable_bar)  # write CDS sequences to the tmpFile
+                                                 compress=False,
+                                                 disable_bar=disable_bar)  # write CDS sequences to the tmpFile
     else:
         raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster "
                         "the genes in gene families. Either provide clustering results (see --clusters), "
@@ -286,7 +287,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
     date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime())
     dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}'
     with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path:
-        sequence_path = tmp_path/'nucleotide_sequences.fna'
+        sequence_path = tmp_path / 'nucleotide_sequences.fna'
         check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar)
         logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")
         rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode)
@@ -356,8 +357,32 @@ def infer_singletons(pangenome: Pangenome):
     logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families")
 
 
-def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False,
-                    disable_bar: bool = False):
+def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1,
+                                        tmpdir: Path = None, keep_tmp: bool = False):
+    tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir
+    with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp:
+        repres_path = tmp / "representative.fna"
+        with open(repres_path, "w") as repres_seq:
+            for family in pangenome.gene_families:
+                repres_seq.write(f">{family.name}\n")
+                repres_seq.write(f"{family.representative.dna}\n")
+        translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu,
+                                       is_single_line_fasta=True, code=code)
+        outpath = tmp / "representative_protein_genes.fna"
+        cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
+        run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n")
+        with open(outpath, "r") as repres_prot:
+            lines = repres_prot.readlines()
+            while len(lines) > 0:
+                family_name = lines.pop(0).strip()[1:]
+                family_seq = lines.pop(0).strip()
+                family = pangenome.get_gene_family(family_name)
+                family.add_sequence(family_seq)
+
+
+def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False,
+                    write_sequences: bool = False, code: int = 11, cpu: int = 1, tmpdir: Path = None,
+                    keep_tmp: bool = False, force: bool = False, disable_bar: bool = False):
     """
     Get the pangenome information, the gene families and the genes with an associated gene family.
     Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
@@ -369,7 +394,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
     :param disable_bar: Allow to disable progress bar
     """
     check_pangenome_former_clustering(pangenome, force)
-    check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar)
+    check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=write_sequences, disable_bar=disable_bar)
 
     logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...")
     filesize = os.stat(families_tsv_file).st_size
@@ -421,10 +446,14 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
                     f"You can either update your cluster file to ensure each gene has a cluster assignment, "
                     f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene."
                 )
+    if write_sequences:
+        get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp)
 
     pangenome.status["genesClustered"] = "Computed"
     if frag:  # if there was fragment information in the file.
         pangenome.status["defragmented"] = "Computed"
+    if write_sequences:
+        pangenome.status["geneFamilySequences"] = "Computed"
     pangenome.parameters["cluster"] = {}
     pangenome.parameters["cluster"]["# read_clustering_from_file"] = True
     pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton
@@ -450,7 +479,8 @@ def launch(args: argparse.Namespace):
         if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table,
                     args.coverage, args.identity, args.mode]:
             logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.")
-        read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar)
+        read_clustering(pangenome, args.clusters, args.infer_singletons, args.write_sequences, args.translation_table,
+                        args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar)
         logging.getLogger("PPanGGOLiN").info("Done reading the cluster file")
     write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar)
 
@@ -488,12 +518,8 @@ def parser_clust(parser: argparse.ArgumentParser):
     clust.add_argument('--no_defrag', required=False, default=False, action="store_true",
                        help="DO NOT Use the defragmentation strategy to link potential fragments "
                             "with their original gene family.")
-    clust.add_argument("--translation_table", required=False, default="11",
-                       help="Translation table (genetic code) to use.")
     # clust.add_argument("--compress")
 
-    clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
-
     read = parser.add_argument_group(title="Read clustering arguments")
     read.add_argument('--clusters', required=False, type=Path,
                       help="A tab-separated list containing the result of a clustering. One line per gene. "
@@ -501,8 +527,14 @@ def parser_clust(parser: argparse.ArgumentParser):
     read.add_argument("--infer_singletons", required=False, action="store_true",
                       help="When reading a clustering result with --clusters, if a gene is not in the provided file"
                            " it will be placed in a cluster where the gene is the only member.")
+    read.add_argument("--write_sequences", action="store_true",
+                      help="Get the protein sequence of the representative gene of each gene family "
+                           "and write it in the pangenome file.")
     optional = parser.add_argument_group(title="Optional arguments")
-    optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
+    optional.add_argument("--translation_table", required=False, default="11",
+                          help="Translation table (genetic code) to use.")
+    optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
+    optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
                           help="directory for storing temporary files")
     optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
                           help="Keeping temporary files (useful for debugging).")
diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py
index b856fd6f..708a3dd6 100644
--- a/ppanggolin/geneFamily.py
+++ b/ppanggolin/geneFamily.py
@@ -188,6 +188,13 @@ def remove(self, identifier):
         del self[identifier]
 
 
+    @property
+    def representative(self) -> Gene:
+        """Get the representative gene of the family
+        :return: The representative gene of the family
+        """
+        return self.get(self.name)
+
     def contains_gene_id(self, identifier):
         """
         Check if the family contains already a gene id
diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py
index 702efe40..436bf3ba 100644
--- a/ppanggolin/workflow/all.py
+++ b/ppanggolin/workflow/all.py
@@ -63,8 +63,10 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,
 
         if args.clusters is not None:
             start_clust = time.time()
-            read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar,
-                            infer_singleton=args.cluster.infer_singletons)
+            read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons,
+                            write_sequences=True, code=args.cluster.translation_table, cpu=args.cluster.cpu,
+                            tmpdir=args.tmpdir, keep_tmp=args.cluster.keep_tmp,
+                            force=args.force, disable_bar=args.disable_prog_bar)
         else:  # args.cluster is None
             if pangenome.status["geneSequences"] == "No":
                 if args.fasta is None:
@@ -78,7 +80,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,
                        disable_bar=args.disable_prog_bar,
                        defrag=not args.cluster.no_defrag, code=args.cluster.translation_table,
                        coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode,
-                       keep_tmp_files=True)
+                       keep_tmp_files=args.cluster.keep_tmp)
         clust_time = time.time() - start_clust
 
     elif args.fasta is not None:

From 9e62cd2ca56769a2eafebc879b8dde3e8a7fef54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 11 Jun 2024 11:19:47 +0200
Subject: [PATCH 2/7] Add a warning message if gene families are without
 sequences

---
 ppanggolin/formats/writeFlatPangenome.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py
index dfac8013..259b6bb0 100644
--- a/ppanggolin/formats/writeFlatPangenome.py
+++ b/ppanggolin/formats/writeFlatPangenome.py
@@ -1143,7 +1143,12 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1,
         if regions:
             processes.append(p.apply_async(func=write_regions, args=(output, compress)))
         if borders:
-            processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress)))
+            if pangenome.status["geneFamilySequences"] == "No":
+                logging.getLogger("PPanGGOLiN").warning("Gene families were not associated with protein sequences. "
+                                                        "This may be due to the use of external clustering. "
+                                                        "Please refer to the documentation or submit an issue.")
+            else:
+                processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress)))
         if modules:
             processes.append(p.apply_async(func=write_modules, args=(output, compress)))
             processes.append(p.apply_async(func=write_module_summary, args=(output, compress)))

From 86569f1f6c67268e6f685b6b9b954693c265d5e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 11 Jun 2024 17:46:09 +0200
Subject: [PATCH 3/7] Add docstring

---
 ppanggolin/utils.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py
index 9f879d5c..9ace2282 100755
--- a/ppanggolin/utils.py
+++ b/ppanggolin/utils.py
@@ -583,8 +583,8 @@ def combine_args(args: argparse.Namespace, another_args: argparse.Namespace):
     return args
 
 
-def get_args_that_different_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace,
-                                         param_to_ignore: Union[List[str], Set[str]] = None) -> dict:
+def get_args_differing_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace,
+                                    param_to_ignore: Union[List[str], Set[str]] = None) -> dict:
     """
     Get the parameters that have different value than default values.
 
@@ -662,7 +662,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_
     # cli > config > default
 
     args = overwrite_args(default_args, config_args, cli_args)
-    params_that_differ = get_args_that_different_from_default(default_args, args, input_params)
+    params_that_differ = get_args_differing_from_default(default_args, args, input_params)
 
     if params_that_differ:
         params_that_differ_str = ', '.join([f'{p}={v}' for p, v in params_that_differ.items()])
@@ -703,7 +703,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_
 
             step_args = overwrite_args(default_step_args, config_step_args, cli_args)
 
-            step_params_that_differ = get_args_that_different_from_default(default_step_args, step_args)
+            step_params_that_differ = get_args_differing_from_default(default_step_args, step_args)
 
             if step_params_that_differ:
                 step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()])
@@ -1186,6 +1186,16 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co
 
 
 def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess failed with the following error:\n"):
+    """Run a subprocess command and write the output to the given path.
+
+    :param cmd: list of program arguments
+    :param output: optional path; when given, the subprocess stdout is written to this file
+    :param msg: message to print if the subprocess fails
+
+    :return:
+
+    :raises subprocess.CalledProcessError: raised when the subprocess returns a non-zero exit code
+    """
     logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
     try:
         result = subprocess.run(cmd, check=True, capture_output=True, text=True)
@@ -1196,4 +1206,4 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f
     else:
         if output is not None:
             with open(output, 'w') as fout:
-                fout.write(result.stdout)
\ No newline at end of file
+                fout.write(result.stdout)

From 96653a9694a3972d428a2e78ef916a7f65ae43c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 11 Jun 2024 18:00:25 +0200
Subject: [PATCH 4/7] Always translate the representative gene when reading a
 clustering

---
 docs/user/PangenomeAnalyses/pangenomeCluster.md |  7 +++----
 ppanggolin/cluster/cluster.py                   | 16 ++++++++--------
 ppanggolin/formats/writeFlatPangenome.py        |  7 +------
 ppanggolin/workflow/all.py                      |  5 ++---
 4 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md
index 805be2d2..53c118ae 100644
--- a/docs/user/PangenomeAnalyses/pangenomeCluster.md
+++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md
@@ -54,10 +54,9 @@ You can do this from the command line:
 
 An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv)
 
-
-When you provide your clustering, by default, the pangenome will be without sequences for gene families. 
-PPanGGOLiN can get the protein sequence of each family and write it in the HDF5 file with the option `--write_sequences`.
-The sequence can be important for some [outputs](./pangenomeAnalyses.md#pan-output).
+```{note}
+When you provide your own clustering, *PPanGGOLiN* translates the representative gene sequence of each family and writes the resulting protein sequence to the pangenome HDF5 file.
+```
 
 
 
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index 5c7835df..1d7a779a 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -381,8 +381,8 @@ def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cp
 
 
 def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False,
-                    write_sequences: bool = False, code: int = 11, cpu: int = 1, tmpdir: Path = None,
-                    keep_tmp: bool = False, force: bool = False, disable_bar: bool = False):
+                    code: int = 11, cpu: int = 1, tmpdir: Path = None, keep_tmp: bool = False,
+                    force: bool = False, disable_bar: bool = False):
     """
     Get the pangenome information, the gene families and the genes with an associated gene family.
     Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
@@ -390,6 +390,10 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
     :param pangenome: Input Pangenome
     :param families_tsv_file: MMseqs2 clustering results
     :param infer_singleton: creates a new family for each gene with no associated family
+    :param code: Genetic code used for sequence translation.
+    :param cpu: Number of CPU cores to use when translating the representative gene sequences.
+    :param tmpdir: Path to a temporary directory for intermediate files.
+    :param keep_tmp: Keep temporary files (useful for debugging).
     :param force: force to write in the pangenome
     :param disable_bar: Allow to disable progress bar
     """
@@ -446,8 +450,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
                     f"You can either update your cluster file to ensure each gene has a cluster assignment, "
                     f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene."
                 )
-    if write_sequences:
-        get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp)
+    get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp)
 
     pangenome.status["genesClustered"] = "Computed"
     if frag:  # if there was fragment information in the file.
@@ -479,7 +482,7 @@ def launch(args: argparse.Namespace):
         if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table,
                     args.coverage, args.identity, args.mode]:
             logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.")
-        read_clustering(pangenome, args.clusters, args.infer_singletons, args.write_sequences, args.translation_table,
+        read_clustering(pangenome, args.clusters, args.infer_singletons, args.translation_table,
                         args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar)
         logging.getLogger("PPanGGOLiN").info("Done reading the cluster file")
     write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar)
@@ -527,9 +530,6 @@ def parser_clust(parser: argparse.ArgumentParser):
     read.add_argument("--infer_singletons", required=False, action="store_true",
                       help="When reading a clustering result with --clusters, if a gene is not in the provided file"
                            " it will be placed in a cluster where the gene is the only member.")
-    read.add_argument("--write_sequences", action="store_true",
-                      help="Get the protein sequence of the representative gene of each gene family "
-                           "and write it in the pangenome file.")
     optional = parser.add_argument_group(title="Optional arguments")
     optional.add_argument("--translation_table", required=False, default="11",
                           help="Translation table (genetic code) to use.")
diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py
index 259b6bb0..dfac8013 100644
--- a/ppanggolin/formats/writeFlatPangenome.py
+++ b/ppanggolin/formats/writeFlatPangenome.py
@@ -1143,12 +1143,7 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1,
         if regions:
             processes.append(p.apply_async(func=write_regions, args=(output, compress)))
         if borders:
-            if pangenome.status["geneFamilySequences"] == "No":
-                logging.getLogger("PPanGGOLiN").warning("Gene families were not associated with protein sequences. "
-                                                        "This may be due to the use of external clustering. "
-                                                        "Please refer to the documentation or submit an issue.")
-            else:
-                processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress)))
+            processes.append(p.apply_async(func=write_borders, args=(output, dup_margin, compress)))
         if modules:
             processes.append(p.apply_async(func=write_modules, args=(output, compress)))
             processes.append(p.apply_async(func=write_module_summary, args=(output, compress)))
diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py
index 436bf3ba..dec41fd8 100644
--- a/ppanggolin/workflow/all.py
+++ b/ppanggolin/workflow/all.py
@@ -64,9 +64,8 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,
         if args.clusters is not None:
             start_clust = time.time()
             read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons,
-                            write_sequences=True, code=args.cluster.translation_table, cpu=args.cluster.cpu,
-                            tmpdir=args.tmpdir, keep_tmp=args.cluster.keep_tmp,
-                            force=args.force, disable_bar=args.disable_prog_bar)
+                            code=args.cluster.translation_table, cpu=args.cluster.cpu, tmpdir=args.tmpdir,
+                            keep_tmp=args.cluster.keep_tmp, force=args.force, disable_bar=args.disable_prog_bar)
         else:  # args.cluster is None
             if pangenome.status["geneSequences"] == "No":
                 if args.fasta is None:

From b9a542c39bfd7f486f48de4fac8c8e5f84770557 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 11 Jun 2024 18:27:23 +0200
Subject: [PATCH 5/7] get_genes returns one gene if begin == end

---
 ppanggolin/genome.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py
index 504197fe..87e2ef46 100644
--- a/ppanggolin/genome.py
+++ b/ppanggolin/genome.py
@@ -591,7 +591,7 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False)
             raise TypeError(f"Expected type int for 'begin' and 'end', "
                             f"but received types '{type(begin)}' and '{type(end)}'.")
 
-        if begin >= end:
+        if begin > end:
             raise ValueError("The 'begin' position must be less than the 'end' position.")
 
         if end > self._genes_position[-1].position:
@@ -603,7 +603,10 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False)
         if end == self._genes_position[-1].position:
             return self._genes_position[begin:]
         else:
-            return self._genes_position[begin: end]
+            if begin == end:
+                return self._genes_position[begin]
+            else:
+                return self._genes_position[begin: end]
 
     @property
     def number_of_genes(self) -> int:

From edb09e623579066234a3ccd52e10f5f1128cc943 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 11 Jun 2024 19:38:25 +0200
Subject: [PATCH 6/7] remove deprecated option

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2b722c0c..2d9e9106 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -122,7 +122,7 @@ jobs:
         cd testingDataset
         ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang  --cpu $NUM_CPUS 
         ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS
-        ppanggolin cluster --clusters clusters.tsv --write_sequences -p readclusters/pangenome.h5 --cpu $NUM_CPUS
+        ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS
         ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS
         cd -
     - name: testing rgp_cluster command

From ed629cfe98f5bcb5bcc5574cd85397ad6ab5bd7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 11 Jun 2024 22:27:59 +0200
Subject: [PATCH 7/7] fix remaining deprecated args

---
 ppanggolin/cluster/cluster.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index c30cac84..ab1046b9 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -398,7 +398,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
     :param disable_bar: Allow to disable progress bar
     """
     check_pangenome_former_clustering(pangenome, force)
-    check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=write_sequences, disable_bar=disable_bar)
+    check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=True, disable_bar=disable_bar)
 
     logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...")
     filesize = os.stat(families_tsv_file).st_size
@@ -455,8 +455,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
     pangenome.status["genesClustered"] = "Computed"
     if frag:  # if there was fragment information in the file.
         pangenome.status["defragmented"] = "Computed"
-    if write_sequences:
-        pangenome.status["geneFamilySequences"] = "Computed"
+    pangenome.status["geneFamilySequences"] = "Computed"
     pangenome.parameters["cluster"] = {}
     pangenome.parameters["cluster"]["# read_clustering_from_file"] = True
     pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton