Skip to content

Commit

Permalink
Add the protein sequence to gene family when reading clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
jpjarnoux committed Jun 10, 2024
1 parent 277aeef commit 6ddc973
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 17 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ jobs:
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS
ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS
ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS
ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS
ppanggolin cluster --clusters clusters.tsv --write_sequences -p readclusters/pangenome.h5 --cpu $NUM_CPUS
ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS
cd -
- name: testing rgp_cluster command
Expand Down
6 changes: 6 additions & 0 deletions docs/user/PangenomeAnalyses/pangenomeCluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ You can do this from the command line:
An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv)


When you provide your clustering, by default, the pangenome will be without sequences for gene families.
PPanGGOLiN can get the protein sequence of each family and write it in the HDF5 file with the option `--write_sequences`.
The sequence can be important for some [outputs](./pangenomeAnalyses.md#pan-output).



### Defragmentation

Without performing additional steps, most cloud genes in the pangenome are fragments of 'shell' or 'persistent' genes. Therefore, they do not provide informative data on the pangenome's diversity.
Expand Down
56 changes: 44 additions & 12 deletions ppanggolin/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Gene
from ppanggolin.geneFamily import GeneFamily
from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir, mk_outdir
from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir
from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome
from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file
from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db
Expand Down Expand Up @@ -61,7 +61,8 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force:
elif pangenome.status["geneSequences"] == "inFile":
logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file")
write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_",
compress=False, disable_bar=disable_bar) # write CDS sequences to the tmpFile
compress=False,
disable_bar=disable_bar) # write CDS sequences to the tmpFile
else:
raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster "
"the genes in gene families. Either provide clustering results (see --clusters), "
Expand Down Expand Up @@ -286,7 +287,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime())
dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}'
with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path:
sequence_path = tmp_path/'nucleotide_sequences.fna'
sequence_path = tmp_path / 'nucleotide_sequences.fna'
check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar)
logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")
rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode)
Expand Down Expand Up @@ -356,8 +357,32 @@ def infer_singletons(pangenome: Pangenome):
logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families")


def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False,
disable_bar: bool = False):
def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1,
tmpdir: Path = None, keep_tmp: bool = False):
tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir
with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp:
repres_path = tmp / "representative.fna"
with open(repres_path, "w") as repres_seq:
for family in pangenome.gene_families:
repres_seq.write(f">{family.name}\n")
repres_seq.write(f"{family.representative.dna}\n")
translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu,
is_single_line_fasta=True, code=code)
outpath = tmp / "representative_protein_genes.fna"
cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n")
with open(outpath, "r") as repres_prot:
lines = repres_prot.readlines()
while len(lines) > 0:
family_name = lines.pop(0).strip()[1:]
family_seq = lines.pop(0).strip()
family = pangenome.get_gene_family(family_name)
family.add_sequence(family_seq)


def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False,
write_sequences: bool = False, code: int = 11, cpu: int = 1, tmpdir: Path = None,
keep_tmp: bool = False, force: bool = False, disable_bar: bool = False):
"""
Get the pangenome information, the gene families and the genes with an associated gene family.
Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
Expand All @@ -369,7 +394,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
:param disable_bar: Allow to disable progress bar
"""
check_pangenome_former_clustering(pangenome, force)
check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar)
check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=write_sequences, disable_bar=disable_bar)

logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...")
filesize = os.stat(families_tsv_file).st_size
Expand Down Expand Up @@ -421,10 +446,14 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
f"You can either update your cluster file to ensure each gene has a cluster assignment, "
f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene."
)
if write_sequences:
get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp)

pangenome.status["genesClustered"] = "Computed"
if frag: # if there was fragment information in the file.
pangenome.status["defragmented"] = "Computed"
if write_sequences:
pangenome.status["geneFamilySequences"] = "Computed"
pangenome.parameters["cluster"] = {}
pangenome.parameters["cluster"]["# read_clustering_from_file"] = True
pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton
Expand All @@ -450,7 +479,8 @@ def launch(args: argparse.Namespace):
if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table,
args.coverage, args.identity, args.mode]:
logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.")
read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar)
read_clustering(pangenome, args.clusters, args.infer_singletons, args.write_sequences, args.translation_table,
args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar)
logging.getLogger("PPanGGOLiN").info("Done reading the cluster file")
write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar)

Expand Down Expand Up @@ -488,21 +518,23 @@ def parser_clust(parser: argparse.ArgumentParser):
clust.add_argument('--no_defrag', required=False, default=False, action="store_true",
help="DO NOT Use the defragmentation strategy to link potential fragments "
"with their original gene family.")
clust.add_argument("--translation_table", required=False, default="11",
help="Translation table (genetic code) to use.")
# clust.add_argument("--compress")

clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")

read = parser.add_argument_group(title="Read clustering arguments")
read.add_argument('--clusters', required=False, type=Path,
help="A tab-separated list containing the result of a clustering. One line per gene. "
"First column is cluster ID, and second is gene ID")
read.add_argument("--infer_singletons", required=False, action="store_true",
help="When reading a clustering result with --clusters, if a gene is not in the provided file"
" it will be placed in a cluster where the gene is the only member.")
read.add_argument("--write_sequences", action="store_true",
help="Get the protein sequence of the representative gene of each gene family "
"and write it in the pangenome file.")
optional = parser.add_argument_group(title="Optional arguments")
optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
optional.add_argument("--translation_table", required=False, default="11",
help="Translation table (genetic code) to use.")
optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
help="directory for storing temporary files")
optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
help="Keeping temporary files (useful for debugging).")
Expand Down
7 changes: 7 additions & 0 deletions ppanggolin/geneFamily.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,13 @@ def remove(self, identifier):
del self[identifier]


@property
def representative(self) -> Gene:
"""Get the representative gene of the family
:return: The representative gene of the family
"""
return self.get(self.name)

def contains_gene_id(self, identifier):
"""
Check if the family contains already a gene id
Expand Down
8 changes: 5 additions & 3 deletions ppanggolin/workflow/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,

if args.clusters is not None:
start_clust = time.time()
read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar,
infer_singleton=args.cluster.infer_singletons)
read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons,
write_sequences=True, code=args.cluster.translation_table, cpu=args.cluster.cpu,
tmpdir=args.tmpdir, keep_tmp=args.cluster.keep_tmp,
force=args.force, disable_bar=args.disable_prog_bar)
else: # args.cluster is None
if pangenome.status["geneSequences"] == "No":
if args.fasta is None:
Expand All @@ -78,7 +80,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,
disable_bar=args.disable_prog_bar,
defrag=not args.cluster.no_defrag, code=args.cluster.translation_table,
coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode,
keep_tmp_files=True)
keep_tmp_files=args.cluster.keep_tmp)
clust_time = time.time() - start_clust

elif args.fasta is not None:
Expand Down

0 comments on commit 6ddc973

Please sign in to comment.