Skip to content

Commit

Permalink
Merge pull request #238 from labgem/ReadClustSeq
Browse files Browse the repository at this point in the history
Add the protein sequence to gene family when reading clustering
  • Loading branch information
jpjarnoux authored Jun 12, 2024
2 parents 6a1905c + ed629cf commit d9563a5
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ jobs:
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS
ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS
ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS
ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS
ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS
Expand Down
5 changes: 5 additions & 0 deletions docs/user/PangenomeAnalyses/pangenomeCluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ You can do this from the command line:

An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv)

```{note}
When you provide your clustering, *PPanGGOLiN* will translate the representative gene sequence of each family and write it in the HDF5 file.
```



### Defragmentation

Expand Down
55 changes: 43 additions & 12 deletions ppanggolin/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Gene
from ppanggolin.geneFamily import GeneFamily
from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir, mk_outdir
from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir
from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome
from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file
from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db
Expand Down Expand Up @@ -61,7 +61,8 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force:
elif pangenome.status["geneSequences"] == "inFile":
logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file")
write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_",
compress=False, disable_bar=disable_bar) # write CDS sequences to the tmpFile
compress=False,
disable_bar=disable_bar) # write CDS sequences to the tmpFile
else:
raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster "
"the genes in gene families. Either provide clustering results (see --clusters), "
Expand Down Expand Up @@ -286,7 +287,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime())
dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}'
with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path:
sequence_path = tmp_path/'nucleotide_sequences.fna'
sequence_path = tmp_path / 'nucleotide_sequences.fna'
check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar)
logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")
rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode)
Expand Down Expand Up @@ -356,20 +357,48 @@ def infer_singletons(pangenome: Pangenome):
logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families")


def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False,
disable_bar: bool = False):
def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1,
tmpdir: Path = None, keep_tmp: bool = False):
tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir
with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp:
repres_path = tmp / "representative.fna"
with open(repres_path, "w") as repres_seq:
for family in pangenome.gene_families:
repres_seq.write(f">{family.name}\n")
repres_seq.write(f"{family.representative.dna}\n")
translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu,
is_single_line_fasta=True, code=code)
outpath = tmp / "representative_protein_genes.fna"
cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n")
with open(outpath, "r") as repres_prot:
lines = repres_prot.readlines()
while len(lines) > 0:
family_name = lines.pop(0).strip()[1:]
family_seq = lines.pop(0).strip()
family = pangenome.get_gene_family(family_name)
family.add_sequence(family_seq)


def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False,
code: int = 11, cpu: int = 1, tmpdir: Path = None, keep_tmp: bool = False,
force: bool = False, disable_bar: bool = False):
"""
Get the pangenome information, the gene families and the genes with an associated gene family.
Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
:param pangenome: Input Pangenome
:param families_tsv_file: MMseqs2 clustering results
:param infer_singleton: creates a new family for each gene with no associated family
:param code: Genetic code used for sequence translation.
:param cpu: Number of CPU cores to use for clustering.
:param tmpdir: Path to a temporary directory for intermediate files.
:param keep_tmp: Keep temporary files (useful for debugging).
:param force: force to write in the pangenome
:param disable_bar: Allow to disable progress bar
"""
check_pangenome_former_clustering(pangenome, force)
check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar)
check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=True, disable_bar=disable_bar)

logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...")
filesize = os.stat(families_tsv_file).st_size
Expand Down Expand Up @@ -421,10 +450,12 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
f"You can either update your cluster file to ensure each gene has a cluster assignment, "
f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene."
)
get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp)

pangenome.status["genesClustered"] = "Computed"
if frag: # if there was fragment information in the file.
pangenome.status["defragmented"] = "Computed"
pangenome.status["geneFamilySequences"] = "Computed"
pangenome.parameters["cluster"] = {}
pangenome.parameters["cluster"]["# read_clustering_from_file"] = True
pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton
Expand All @@ -450,7 +481,8 @@ def launch(args: argparse.Namespace):
if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table,
args.coverage, args.identity, args.mode]:
logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.")
read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar)
read_clustering(pangenome, args.clusters, args.infer_singletons, args.translation_table,
args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar)
logging.getLogger("PPanGGOLiN").info("Done reading the cluster file")
write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar)

Expand Down Expand Up @@ -488,10 +520,6 @@ def parser_clust(parser: argparse.ArgumentParser):
clust.add_argument('--no_defrag', required=False, default=False, action="store_true",
help="DO NOT Use the defragmentation strategy to link potential fragments "
"with their original gene family.")
clust.add_argument("--translation_table", required=False, default="11",
help="Translation table (genetic code) to use.")

clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")

read = parser.add_argument_group(title="Read clustering arguments")
read.add_argument('--clusters', required=False, type=Path,
Expand All @@ -501,7 +529,10 @@ def parser_clust(parser: argparse.ArgumentParser):
help="When reading a clustering result with --clusters, if a gene is not in the provided file"
" it will be placed in a cluster where the gene is the only member.")
optional = parser.add_argument_group(title="Optional arguments")
optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
optional.add_argument("--translation_table", required=False, default="11",
help="Translation table (genetic code) to use.")
optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
help="directory for storing temporary files")
optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
help="Keeping temporary files (useful for debugging).")
Expand Down
7 changes: 7 additions & 0 deletions ppanggolin/geneFamily.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,13 @@ def remove(self, identifier):
del self[identifier]


@property
def representative(self) -> Gene:
"""Get the representative gene of the family
:return: The representative gene of the family
"""
return self.get(self.name)

def contains_gene_id(self, identifier):
"""
Check if the family contains already a gene id
Expand Down
7 changes: 5 additions & 2 deletions ppanggolin/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,7 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False)
raise TypeError(f"Expected type int for 'begin' and 'end', "
f"but received types '{type(begin)}' and '{type(end)}'.")

if begin >= end:
if begin > end:
raise ValueError("The 'begin' position must be less than the 'end' position.")

if end > self._genes_position[-1].position:
Expand All @@ -603,7 +603,10 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False)
if end == self._genes_position[-1].position:
return self._genes_position[begin:]
else:
return self._genes_position[begin: end]
if begin == end:
return self._genes_position[begin]
else:
return self._genes_position[begin: end]

@property
def number_of_genes(self) -> int:
Expand Down
7 changes: 4 additions & 3 deletions ppanggolin/workflow/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,

if args.clusters is not None:
start_clust = time.time()
read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar,
infer_singleton=args.cluster.infer_singletons)
read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons,
code=args.cluster.translation_table, cpu=args.cluster.cpu, tmpdir=args.tmpdir,
keep_tmp=args.cluster.keep_tmp, force=args.force, disable_bar=args.disable_prog_bar)
else: # args.cluster is None
if pangenome.status["geneSequences"] == "No":
if args.fasta is None:
Expand All @@ -78,7 +79,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,
disable_bar=args.disable_prog_bar,
defrag=not args.cluster.no_defrag, code=args.cluster.translation_table,
coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode,
keep_tmp_files=True)
keep_tmp_files=args.cluster.keep_tmp)
clust_time = time.time() - start_clust

elif args.fasta is not None:
Expand Down

0 comments on commit d9563a5

Please sign in to comment.