diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cb5b0f47..88088f66 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -89,18 +89,18 @@ jobs: ppanggolin rarefaction --output stepbystep -f -p stepbystep/pangenome.h5 --depth 5 --min 1 --max 50 -ms 10 -fd -ck 30 -K 3 --soft_core 0.9 -se $RANDOM ppanggolin draw -p stepbystep/pangenome.h5 --tile_plot --nocloud --soft_core 0.92 --ucurve --output stepbystep -f ppanggolin rgp -p stepbystep/pangenome.h5 --persistent_penalty 2 --variable_gain 1 --min_score 3 --dup_margin 0.05 - ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 + ppanggolin spot -p stepbystep/pangenome.h5 --output stepbystep --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 -f ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05 ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --regions --borders --families_tsv --cpu 1 ppanggolin write_genomes -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp --compress ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families softcore --gene_families softcore ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 
--output stepbystep -f --prot_families core - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 - ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes_prot cloud --threads 1 --keep_tmp + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes core --proteins cloud + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 --compress + ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --cpu $NUM_CPUS --keep_tmp --compress ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log @@ -180,6 +180,8 @@ jobs: head genomes.gbff.list | sed 's/^/input_genome_/g' > genomes.gbff.head.list ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno genomes.gbff.head.list --gff --proksee --cpu $NUM_CPUS + head genomes.fasta.list | sed 's/^/input_genome_/g' > genomes.fasta.head.list + ppanggolin projection --pangenome myannopang/pangenome.h5 -o projection_from_list_of_fasta --fasta genomes.fasta.head.list --gff --proksee --cpu $NUM_CPUS ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \ --genome_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md index e0ba5013..10725d5b 100644 --- a/docs/user/writeFasta.md +++ b/docs/user/writeFasta.md @@ -20,7 +20,8 @@ When using the `softcore` filter, the `--soft_core` option can be used to modify ### Nucleotide sequences -This option can be used to write the nucleotide CDS sequences. 
It can be used as such, to write all the genes of the pangenome for example: +With the `--genes partition` option PPanGGOLiN will write the nucleotide CDS sequences for the given partition. +It can be used as such, to write all the genes of the pangenome for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all @@ -34,10 +35,11 @@ ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent ### Protein sequences -This option can be used to write the amino acid CDS sequences. It can be used as such, to write all the genes of the pangenome for example: +With the `--proteins partition` option PPanGGOLiN will write the amino acid CDS sequences for the given partition. +It can be used as such, to write all the genes of the pangenome for example: ```bash -ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot all +ppanggolin fasta -p pangenome.h5 --output MY_GENES --proteins all ``` Or to write only the cloud genes: @@ -46,19 +48,23 @@ Or to write only the cloud genes: ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes_prot cloud ``` -To translate the genes sequences, PPanGGOLiN use [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. So for this option you can give multiple threads with `--threads`. You can also specify the translation table to use with `--translate_table`. Finally, you can keep the [MMSeqs2](https://github.com/soedinglab/MMseqs2) that are generated in the temporary directory (that you can also be specified with `--tmpdir`) by indicate the option `--keep_tmp`. +To translate the gene sequences, PPanGGOLiN uses the [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. +So for this option you can specify multiple threads with `--cpu`. +You can also specify the translation table to use with `--translate_table`. +The temporary directory can be specified with `--tmpdir` to store the [MMSeqs2](https://github.com/soedinglab/MMseqs2) database and other files. 
Temporary files will be deleted at the end of the execution. To keep them, you can use the `--keep_tmp` option. ## Gene families ### Protein sequences -This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families: +With the `--prot_families partition` option PPanGGOLiN will write the protein sequences of the representative gene for each family for the given partition. +It can be used as such for all families: ```bash ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all ``` -or for all the shell families for example: +Or for all the shell families for example: ```bash ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell @@ -66,16 +72,33 @@ ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell ### Nucleotide sequences -This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such: +With the `--gene_families partition` option PPanGGOLiN will write the nucleotide sequences of the representative gene for each family for the given partition. +It can be used as such for all families: ```bash ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families all ``` -or for the cloud families for example: +Or for the core families for example: ```bash -ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families cloud +ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families core +``` + + +## Modules +All the preceding commands accept a module as partition. 
+ +So you can write the protein sequences for the family in module_X as such: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --prot_families module_X +``` + +Or the nucleotide sequence of all genes in module_X: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --genes module_X ``` ## Regions @@ -91,4 +114,4 @@ It can be used as such: ```bash ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --regions all --fasta genomes.fasta.list -``` +``` \ No newline at end of file diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 247545ae..ca72ed9b 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,7 +9,7 @@ import subprocess import argparse from collections import defaultdict, Counter -from typing import List, Tuple, Set, Dict, IO, Iterable +from typing import List, Tuple, Set, Dict, Union, Iterable, Any from pathlib import Path from tqdm import tqdm @@ -22,55 +22,13 @@ from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph from ppanggolin.formats.readBinaries import get_non_redundant_gene_sequences_from_file +from ppanggolin.formats.writeSequences import translate_genes, create_mmseqs_db -def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequences") -> Path: - """ - Create a MMseqs2 sequence database with the given fasta files. - - :param seq_files: An iterable of path of FASTA files. - :param tmpdir: Path to the temporary directory where the database will be created. - :param basename: Prefix for the database file (default: "sequences"). - - :return: Path to the created MMseqs2 database file. 
- """ - - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb: - cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0'] - - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL) - - return Path(seqdb.name) - - -def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: Path) -> Path: - """ - Translate nucleotide sequences in an MMseqs2 sequence database to amino acid sequences. - - :param seqdb: Path to the input MMseqs2 sequence database containing nucleotide sequences. - :param translation_table: The translation table to use for conversion. - :param cpu: Number of CPU cores to use for translation. - :param tmpdir: Path to the temporary directory for intermediate files. - - :return: Path to the new MMseqs2 sequence database containing translated amino acid sequences. - """ - - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem, - suffix=".aa.DB") as seqdb_aa: - cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table", - f"{translation_table}", "--threads", str(cpu)] - - logging.getLogger().debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - - return Path(seqdb_aa.name) - - -def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path], - tmpdir: Path, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, - is_query_nt: bool = False, is_target_nt: bool = False, translation_table: int = None) -> Path: +def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], + tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, + query_type: str = "unknow", is_query_slf: bool = False, target_type: str = "unknow", + is_target_slf: bool = False, 
translation_table: int = None) -> Path: """ Align fasta sequence to pangenome sequences. @@ -81,25 +39,36 @@ def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path], :param no_defrag: Do not apply defragmentation :param identity: minimal identity threshold for the alignment :param coverage: minimal identity threshold for the alignment - :param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs - :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs + :param query_type: Sequences type of the file (query). [nucleotide, protein, unknow] + :param is_query_slf: Is the sequence file (query) with single line fasta. If True, MMSeqs2 database will be with soft link + :param target_type: Sequences type of pangenome (target). [nucleotide, aminoacid, protein] + :param is_target_slf: Is the sequences of pangenome (target) with single line fasta. If True, MMSeqs2 database will be with soft link :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. 
- :return: Alignement result file + :return: Alignment result file """ - target_db = create_mmseqs_db([target_seq_file], tmpdir, basename="target_sequences") - query_db = create_mmseqs_db(query_seq_files, tmpdir, basename="query_sequences") - - if is_target_nt: - logging.getLogger().debug( - f"Target sequences will be translated by mmseqs with translation table {translation_table}") - target_db = translate_with_mmseqs(target_db, translation_table, cpu, tmpdir) - - if is_query_nt: - logging.getLogger().debug( - f"Query sequences will be translated by mmseqs with translation table {translation_table}") - query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir) + if target_type == "nucleotide": + logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with " + f"translation table {translation_table}") + with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir: + # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same + target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table) + else: + db_type = 1 if target_type == "protein" else 0 + target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file, + 'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=db_type) + + if query_type == "nucleotide": + logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs " + f"with translation table {translation_table}") + with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir: + # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same + query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table) + else: + db_type = 1 if query_type == "protein" else 0 + query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files, + 
'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=db_type) cov_mode = "2" # coverage of query if no_defrag: @@ -111,25 +80,24 @@ def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path], with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB", delete=False) as aln_db: cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a", - "--min-seq-id", str(identity), - "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), + "--min-seq-id", str(identity), "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu), "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"] - logging.getLogger().info("Aligning sequences") - logging.getLogger().debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Aligning sequences") + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) start = time.time() subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) align_time = time.time() - start - logging.getLogger().info(f"Done aligning sequences in {round(align_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Done aligning sequences in {round(align_time, 2)} seconds") with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix=".tsv", delete=False) as outfile: cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name, "--format-mode", "2"] - logging.getLogger().info("Extracting alignments...") - logging.getLogger().debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Extracting alignments...") + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return Path(outfile.name) @@ -141,16 +109,16 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, Read alignment result to link input sequences to pangenome gene family. 
Alignment have been made against all genes of the pangenome. - :param aln_res: Alignement result file + :param aln_res: Alignment result file :param outdir: Output directory :param pangenome: Input pangenome - :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file + :return: Dictionary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} aln_file_clean = outdir / "alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file - logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') + logging.getLogger("PPanGGOLiN").debug(f'Writing alignment file in {aln_file_clean}') with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: @@ -171,21 +139,21 @@ def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path, def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, - pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: + pangenome: Pangenome) -> Tuple[Dict[Any, GeneFamily], Path]: """ Read alignment result to link input sequences to pangenome gene family. Alignment have been made against representative sequence of gene families of the pangenome. 
- :param aln_res: Alignement result file + :param aln_res: Alignment result file :param outdir: Output directory :param pangenome: Input pangenome - :return: Dictionnary with sequence link to pangenome gene families and actual path to the cleaned alignment file + :return: Dictionary with sequence link to pangenome gene families and actual path to the cleaned alignment file """ seq2pang = {} aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file - logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}') + logging.getLogger("PPanGGOLiN").debug(f'Writing alignment file in {aln_file_clean}') with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl: for line in alnFile: @@ -205,7 +173,7 @@ def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path, return seq2pang, aln_file_clean -def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: +def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool, bool]: """ Get sequence IDs from a sequence input file in FASTA format and guess the sequence type based on the first sequences. 
@@ -217,49 +185,54 @@ def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]: seq_set = set() seq_count = 0 first_seq_concat = "" - + single_line_fasta = True + count_fasta_line = 0 for line in seq_file: if line.startswith(">"): seq_set.add(line[1:].split()[0].strip()) seq_count += 1 - elif seq_count <= 20: - first_seq_concat += line.strip() + if count_fasta_line > 1: # Allow to know if we can use soft link with createdb from MMSeqs2 + single_line_fasta = False + count_fasta_line = 0 + else: + count_fasta_line += 1 + if seq_count <= 20: + first_seq_concat += line.strip() char_counter = Counter(first_seq_concat) is_nucleotide = all(char in dna_expected_char for char in char_counter) - return seq_set, is_nucleotide + return seq_set, is_nucleotide, single_line_fasta -def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False): +def write_gene_fam_sequences(pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False): """ Export the sequence of gene families :param pangenome: Pangenome containing families - :param file_obj: Temporary file where sequences will be written + :param output: Path to file where sequences will be written :param add: Add prefix to sequence name :param disable_bar: disable progress bar """ - for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, - total=pangenome.number_of_gene_families): - file_obj.write(">" + add + fam.name + "\n") - file_obj.write(fam.sequence + "\n") - # file_obj.flush() + with open(output, "w") as file_obj: + for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar, + total=pangenome.number_of_gene_families): + file_obj.write(">" + add + fam.name + "\n") + file_obj.write(fam.sequence + "\n") -def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False): +def write_all_gene_sequences(pangenome: Pangenome, output: Path, add: str = "", disable_bar: bool = False): 
""" Export the sequence of pangenome genes :param pangenome: Pangenome containing genes - :param file_obj: Temporary file where sequences will be written + :param output: Path to file where sequences will be written :param add: Add prefix to sequence name :param disable_bar: disable progress bar - """ if pangenome.status["geneSequences"] == "inFile": - get_non_redundant_gene_sequences_from_file(pangenome.file, file_obj, add=add, disable_bar=disable_bar) + get_non_redundant_gene_sequences_from_file(pangenome.file, output, add=add, disable_bar=disable_bar) else: # this should never happen if the pangenome has been properly checked before launching this function. raise Exception("The pangenome does not include gene sequences") @@ -269,7 +242,7 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq """ Project the partition of each sequence from the input file and write them in a file - :param seqid_to_gene_family: dictionnary which link sequence and pangenome gene family + :param seqid_to_gene_family: dictionary which link sequence and pangenome gene family :param seq_set: input sequences :param output: Path of the output directory @@ -289,7 +262,7 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s """ Write input gene to pangenome gene family. 
- :param seqid_to_gene_family: dictionnary which links input sequence and pangenome gene family + :param seqid_to_gene_family: dictionary which links input sequence and pangenome gene family :param seq_set: input sequences :param output: Path of the output directory @@ -317,7 +290,7 @@ def get_fam_to_rgp(pangenome, multigenics: set) -> dict: :param pangenome: Input pangenome :param multigenics: multigenics families - :return: Dictionnary link families to RGP + :return: Dictionary link families to RGP """ fam2rgp = defaultdict(list) for rgp in pangenome.regions: @@ -357,18 +330,6 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \ return fam2spot, fam2border -def add_spot_str(spot: Spot) -> str: - # TODO define as self.__str__ in spot - """ - allow to map spot set - - :param spot: spot which will be return - - :return: Str with spot ID - """ - return "spot_" + str(spot.ID) - - def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict, set_size: int = 3): """ Draw a gexf graph of the spot @@ -376,7 +337,7 @@ def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict, :param spots: spot find in the alignment between pangenome and input sequences :param output: Path of the output directory :param multigenics: multigenics families - :param fam_to_mod: dictionnary which link families and modules + :param fam_to_mod: dictionary which link families and modules :param set_size: """ for spot in spots: @@ -398,18 +359,18 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel logging.getLogger("PPanGGOLiN").info("Writing RGP and spot information related to hits in the pangenome") multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"]) - finfo = open(output / "info_input_seq.tsv", "w") - finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n") - fam2rgp = get_fam_to_rgp(pangenome, multigenics) - fam2spot, fam2border = 
get_fam_to_spot(pangenome, multigenics) - spot_list = set() - for seq, panfam in seq_to_pang.items(): - finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join( - map(add_spot_str, fam2spot[panfam])) + "\t" + ",".join( - map(add_spot_str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n") - spot_list |= set(fam2spot[panfam]) - spot_list |= set(fam2border[panfam]) - finfo.close() + with open(output / "info_input_seq.tsv", "w") as finfo: + finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n") + fam2rgp = get_fam_to_rgp(pangenome, multigenics) + fam2spot, fam2border = get_fam_to_spot(pangenome, multigenics) + spot_list = set() + for seq, panfam in seq_to_pang.items(): + finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join( + map(str, fam2spot[panfam])) + "\t" + ",".join( + map(str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n") + spot_list |= set(fam2spot[panfam]) + spot_list |= set(fam2border[panfam]) + if draw_related: drawn_spots = set() for spot in spot_list: @@ -433,10 +394,11 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel f"{output / 'info_input_seq.tsv'}") -def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, - tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, - disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: +def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, + tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1, + no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, + translation_table: int = 11, disable_bar: bool = False + ) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome 
to input sequences. @@ -447,7 +409,8 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Itera :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. - :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param input_type: Type of input sequence file. [nucleotide, aminoacid, unknow] + :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). :param identity: Minimum identity threshold for the alignment (default: 0.8). @@ -459,27 +422,25 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Itera and a dictionary mapping input sequences to gene families. """ - # delete False to be able to keep tmp file. 
If they are not keep tmpdir will be destroyed so no need to delete tmpfile - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, - prefix="representative_genes", suffix=".faa") as tmp_pang_file: - logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}') - write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) + pangenome_sequences = tmpdir / "proteins_families.faa" + logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}') + write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, - no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_target_nt=False, - translation_table=translation_table) + align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True, + target_type="protein", translation_table=translation_table) - seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) + seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) return align_file, seq2pang -def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path, - tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, - disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]: +def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, + tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = 
False, cpu: int = 1, + no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, + translation_table: int = 11, disable_bar: bool = False + ) -> Tuple[Path, Dict[str, GeneFamily]]: """ Assign gene families from a pangenome to input sequences. @@ -490,7 +451,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. - :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param input_type: Sequences type of the file (query). [nucleotide, protein, unknow] + :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). :param identity: Minimum identity threshold for the alignment (default: 0.8). @@ -501,26 +463,23 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera :return: A tuple containing the path to the alignment result file, and a dictionary mapping input sequences to gene families. 
""" + pangenome_sequences = tmpdir / "nucleotide_genes.fna" + logging.getLogger("PPanGGOLiN").debug(f'Write all pangenome gene sequences in {pangenome_sequences.absolute()}') + write_all_gene_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar) - with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False, - prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file: - logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}') - write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar) - - align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, - no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_target_nt=True, - translation_table=translation_table) + align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True, + target_type="nucleotide", translation_table=translation_table) - seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) + seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) return align_file, seq2pang def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, - use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, + use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): """ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. 
@@ -559,21 +518,23 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar) with read_compressed_or_not(sequence_file) as seqFileObj: - seq_set, is_nucleotide = get_seq_ids(seqFileObj) + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj) with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - + input_type = "nucleotide" if is_nucleotide else "unknow" if use_representatives: - align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir, - is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, identity=identity, + align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, + input_type=input_type, + is_input_slf=single_line_fasta, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: - align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file], + align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, output=output, tmpdir=new_tmpdir, - is_input_seq_nt=is_nucleotide, + input_type=input_type, + is_input_slf=single_line_fasta, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, @@ -583,9 +544,10 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) part_proj = project_and_write_partition(seq2pang, seq_set, output) # write the partition assignation only - logging.getLogger().info(f"sequences partition projection : '{part_proj}'") - logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") - 
logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file}'") + logging.getLogger("PPanGGOLiN").info(f"sequences partition projection : '{part_proj}'") + logging.getLogger("PPanGGOLiN").info( + f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") + logging.getLogger("PPanGGOLiN").info(f"Blast-tab file of the alignment : '{align_file}'") def launch(args: argparse.Namespace): @@ -657,7 +619,7 @@ def parser_align(parser: argparse.ArgumentParser): help="In the context of provided annotation, use this option to read pseudogenes. " "(Default behavior is to ignore them)") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 730ce6c6..b6fac5e2 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -511,13 +511,11 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li rna_counter += 1 genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_to_metadata) - - genome_meta_obj = Metadata(source='annotation_file', **genome_metadata) - organism.add_metadata(source="annotation_file", metadata=genome_meta_obj) + + organism.add_metadata(metadata=Metadata(source='annotation_file', **genome_metadata)) for contig, metadata_dict in contig_to_uniq_metadata.items(): - contigs_meta_obj = Metadata(source='annotation_file', **metadata_dict) - contig.add_metadata(source="annotation_file", metadata=contigs_meta_obj) + 
contig.add_metadata(Metadata(source='annotation_file', **metadata_dict)) if used_transl_table_arg: logging.getLogger("PPanGGOLiN").info( @@ -792,8 +790,7 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, str], org: if len(contig_name_to_region_info) == org.number_of_contigs: genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_name_to_region_info) if genome_metadata: - genome_meta_obj = Metadata(source='annotation_file', **genome_metadata) - org.add_metadata(source="annotation_file", metadata=genome_meta_obj) + org.add_metadata(Metadata(source='annotation_file', **genome_metadata)) else: logging.getLogger("PPanGGOLiN").warning( f"Inconsistent data in GFF file {gff_file_path}: " @@ -805,9 +802,8 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, str], org: contig = org.get(contig_name) except KeyError: raise ValueError(f"Contig '{contig_name}' does not exist in the genome object created from GFF file {gff_file_path}.") - - contigs_meta_obj = Metadata(source='annotation_file', **metadata_dict) - contig.add_metadata(source="annotation_file", metadata=contigs_meta_obj) + + contig.add_metadata(Metadata(source='annotation_file', **metadata_dict)) def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separation: int = 10): diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 69d69f61..8e0abc7e 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -8,7 +8,7 @@ from collections import defaultdict import os import argparse -from typing import TextIO, Tuple, Dict, Set +from typing import Tuple, Dict, Set from pathlib import Path import time @@ -20,11 +20,10 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import read_compressed_or_not, restricted_float +from ppanggolin.utils import read_compressed_or_not, restricted_float, create_tmpdir from 
ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes -from ppanggolin.utils import mk_outdir # Global functions @@ -43,14 +42,14 @@ def check_pangenome_former_clustering(pangenome: Pangenome, force: bool = False) # Clustering functions -def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force: bool = False, +def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: bool = False, disable_bar: bool = False): """ Check the pangenome statuses and write the gene sequences in the provided tmpFile. (whether they are written in the .h5 file or currently in memory) :param pangenome: Annotated Pangenome - :param tmp_file: Temporary file + :param sequences: Path to write the sequences :param force: Force to write on existing pangenome information :param disable_bar: Allow to disable progress bar """ @@ -58,19 +57,20 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force if pangenome.status["geneSequences"] in ["Computed", "Loaded"]: logging.getLogger("PPanGGOLiN").debug("Write sequences from annotation loaded in pangenome") # we append the gene ids by 'ppanggolin' to avoid crashes from mmseqs when sequence IDs are only numeric. 
- write_gene_sequences_from_annotations(pangenome.genes, tmp_file, add="ppanggolin_", disable_bar=disable_bar) + write_gene_sequences_from_annotations(pangenome.genes, sequences, add="ppanggolin_", + compress=False, disable_bar=disable_bar) elif pangenome.status["geneSequences"] == "inFile": logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file") - write_gene_sequences_from_pangenome_file(pangenome.file, tmp_file, add="ppanggolin_", - disable_bar=disable_bar) # write CDS sequences to the tmpFile + write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_", + compress=False, disable_bar=disable_bar) # write CDS sequences to the tmpFile else: - tmp_file.close() # closing the tmp file since an exception will be raised. raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " "the genes in gene families. Either provide clustering results (see --clusters), " "or provide a way to access the gene sequence during the annotation step " "(having the fasta in the gff files, or providing the fasta files through the --fasta option)") -def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8, + +def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11, coverage: float = 0.8, identity: float = 0.8, mode: int = 1) -> Tuple[Path, Path]: """ Make a first clustering of all sequences in pangenome @@ -85,7 +85,9 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = :return: path to representative sequence file and path to tsv clustering result """ - seqdb = translate_genes(sequences, tmpdir, cpu, code) + + seqdb = translate_genes(sequences=sequences, tmpdir=tmpdir, cpu=cpu, + is_single_line_fasta=True, code=code) logging.getLogger("PPanGGOLiN").info("Clustering sequences...") cludb = tmpdir / 'cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, 
"--min-seq-id", @@ -293,33 +295,23 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = :param disable_bar: Disable the progress bar during clustering. :param keep_tmp_files: Keep temporary files (useful for debugging). """ - - if keep_tmp_files: - date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) - - dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' - tmp_path = Path(tmpdir) / dir_name - mk_outdir(tmp_path, force=True) - else: - newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - tmp_path = Path(newtmpdir.name) - - with open(tmp_path/'nucleotid_sequences', "w") as sequence_file: - check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) + date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) + dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' + with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path: + sequence_path = tmp_path/'nucleotide_sequences.fna' + check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") - rep, tsv = first_clustering(sequence_file, tmp_path, cpu, code, coverage, identity, mode) + rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode) - fam2seq = read_faa(rep) - if not defrag: - logging.getLogger("PPanGGOLiN").debug("No defragmentation") - genes2fam, _ = read_tsv(tsv) - else: - logging.getLogger("PPanGGOLiN").info("Associating fragments to their original gene family...") - aln = align_rep(rep, tmp_path, cpu, coverage, identity) - genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq) - pangenome.status["defragmented"] = "Computed" - if not keep_tmp_files: - newtmpdir.cleanup() + fam2seq = read_faa(rep) + if not defrag: + logging.getLogger("PPanGGOLiN").debug("No defragmentation") + genes2fam, _ = read_tsv(tsv) + else: + logging.getLogger("PPanGGOLiN").info("Associating 
fragments to their original gene family...") + aln = align_rep(rep, tmp_path, cpu, coverage, identity) + genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq) + pangenome.status["defragmented"] = "Computed" read_fam2seq(pangenome, fam2seq) read_gene2fam(pangenome, genes2fam, disable_bar=disable_bar) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index c8c60eb3..86c644a8 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -108,34 +108,31 @@ def align_sequences_to_families(pangenome: Pangenome, output: Path, sequence_fil """ # Alignment of sequences on pangenome families families_of_interest = set() - with read_compressed_or_not(sequence_file) as seqFileObj: - seq_set, is_nucleotide = get_seq_ids(seqFileObj) - - logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences") - if tmpdir is None: - tmpdir = Path(tempfile.gettempdir()) - logging.getLogger("PPanGGOLiN").warning("Temporary directory is not specified. 
" - f"Using default temporary directory: {tmpdir}.") - with create_tmpdir(tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - - if use_representatives: - _, seqid2fam = get_input_seq_to_family_with_rep(pangenome=pangenome, - sequence_files=[sequence_file], - output=output, tmpdir=new_tmpdir, - is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, - identity=identity, coverage=coverage, - translation_table=translation_table, - disable_bar=disable_bar) - else: - _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, - sequence_files=[sequence_file], - output=output, tmpdir=new_tmpdir, - is_input_seq_nt=is_nucleotide, - cpu=cpu, no_defrag=no_defrag, - identity=identity, coverage=coverage, - translation_table=translation_table, - disable_bar=disable_bar) + family_2_input_seqid = {} + if sequence_file is not None: + # Alignment of sequences on pangenome families + with read_compressed_or_not(sequence_file) as seqFileObj: + seq_set, is_nucleotide, is_slf = get_seq_ids(seqFileObj) + + logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences") + + with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: + input_type = "nucleotide" if is_nucleotide else "unknow" + if use_representatives: + _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, + input_type=input_type, is_input_slf=is_slf, cpu=cpu, + no_defrag=no_defrag, identity=identity, + coverage=coverage, translation_table=translation_table, + disable_bar=disable_bar) + else: + _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, + output=output, tmpdir=new_tmpdir, + input_type=input_type, is_input_slf=is_slf, + cpu=cpu, no_defrag=no_defrag, + identity=identity, coverage=coverage, + translation_table=translation_table, + disable_bar=disable_bar) + project_and_write_partition(seqid2fam, seq_set, output) 
write_gene_to_gene_family(seqid2fam, seq_set, output) diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 53b486e2..ec90d1d7 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -155,7 +155,7 @@ def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: new_classify = set() -def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = 3, +def subgraph(spot: Spot, outname: Path, with_border: bool = True, set_size: int = 3, multigenics: set = None, fam_to_mod: dict = None): """ Write a pangeome subgraph of the gene families of a spot in gexf format @@ -211,7 +211,7 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = if "name" in g.nodes[node]: g.nodes[node]["name"] = g.nodes[node]["name"].most_common(1)[0][0] - nx.write_gexf(g, outname) + nx.write_gexf(g, outname.absolute().as_posix()) def is_gene_list_ordered(genes:List[Feature]): """ @@ -537,7 +537,7 @@ def add_genome_tools(fig, gene_recs: GlyphRenderer, genome_recs: GlyphRenderer, return column(genome_header, slider_spacing, slider_font, slider_offset) -def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam_col: dict, file_name: str): +def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam_col: dict, output: Path): """ :param gene_lists: @@ -549,7 +549,7 @@ def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam """ # Prepare the source data - output_file(file_name + ".html") + output_file(output) # generate the figure and add some tools to it wheel_zoom = WheelZoomTool() @@ -607,10 +607,10 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: fam2mod[fam] = f"module_{mod.ID}" for spot in tqdm(selected_spots, total=len(selected_spots), unit="spot", disable=disable_bar): - fname = output / f"spot_{str(spot.ID)}" - + basename = f"spot_{str(spot.ID)}" + identical_rgp_out = 
output / (basename + '_identical_rgps.tsv') # write rgps representatives and the rgps they are identical to - with open(fname.absolute().as_posix() + '_identical_rgps.tsv', 'w') as out_struc: + with open(identical_rgp_out, 'w') as out_struc: out_struc.write('representative_rgp\trepresentative_rgp_genome\tidentical_rgp\tidentical_rgp_genome\n') for key_rgp, other_rgps in spot.get_uniq_to_rgp().items(): for rgp in other_rgps: @@ -665,9 +665,10 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: uniq_gene_lists.append(genelist) ordered_counts.append(curr_genelist_count) - draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, fname.absolute().as_posix()) - subgraph(spot, fname.absolute().as_posix() + ".gexf", set_size=set_size, - multigenics=multigenics, fam_to_mod=fam2mod) + draw_spot_out = output / (basename + ".html") + subgraph_out = output / (basename + ".gexf") + draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, draw_spot_out) + subgraph(spot, subgraph_out, set_size=set_size, multigenics=multigenics, fam_to_mod=fam2mod) logging.getLogger("PPanGGOLiN").info(f"Done drawing spot(s), they can be found in the directory: '{output}'") diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index bb0f02d5..85c1e7ac 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -4,7 +4,7 @@ # default libraries import logging from pathlib import Path -from typing import TextIO, Dict, Any, Set, List, Tuple +from typing import Dict, Any, Set, List, Tuple from collections import defaultdict # installed libraries @@ -17,6 +17,7 @@ from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module from ppanggolin.metadata import Metadata +from ppanggolin.utils import write_compressed_or_not class Genedata: @@ -26,7 +27,7 @@ class Genedata: """ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, 
name: str, product: str, - genetic_code: int, coordinates:List[Tuple[int]] = None): + genetic_code: int, coordinates: List[Tuple[int]] = None): """Constructor method :param start: Gene start position @@ -60,6 +61,7 @@ def __eq__(self, other): and self.genetic_code == other.genetic_code \ and self.coordinates == other.coordinates \ + def __hash__(self): return hash((self.start, self.stop, self.strand, self.gene_type, self.position, self.name, self.product, self.genetic_code, tuple(self.coordinates))) @@ -172,9 +174,10 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: for row in read_chunks(table, chunk=20000): start = int(row["start"]) stop = int(row["stop"]) - - if "has_joined_coordinates" in row.dtype.names and row["has_joined_coordinates"]: # manage gene with joined coordinates if the info exists - + + if "has_joined_coordinates" in row.dtype.names and row["has_joined_coordinates"]: + # manage gene with joined coordinates if the info exists + try: coordinates = genedata_id_to_coordinates[row["genedata_id"]] except KeyError: @@ -183,7 +186,6 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: else: coordinates = [(start, stop)] - genedata = Genedata(start=start, stop=stop, strand=row["strand"].decode(), @@ -199,6 +201,7 @@ def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: return genedata_id2genedata + def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]: """ Read join coordinates from a HDF5 file and return a dictionary mapping genedata_id to coordinates. @@ -207,7 +210,7 @@ def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]: :return: A dictionary mapping genedata_id to a list of tuples representing start and stop coordinates. 
""" genedata_id_to_coordinates = defaultdict(list) - + if not hasattr(h5f.root.annotations, "joinedCoordinates"): # then the pangenome file has no joined annotations # or has been made before the joined annotations coordinates @@ -218,7 +221,8 @@ def read_join_coordinates(h5f: tables.File) -> Dict[str, List[Tuple[int, int]]]: for row in read_chunks(table, chunk=20000): genedata_id = row["genedata_id"] - genedata_id_to_coordinates[genedata_id].append((int(row["coordinate_rank"]), int(row["start"]), int(row["stop"]))) + genedata_id_to_coordinates[genedata_id].append( + (int(row["coordinate_rank"]), int(row["start"]), int(row["stop"]))) # sort coordinate by their rank genedata_id_to_sorted_coordinates = {} @@ -242,21 +246,21 @@ def read_sequences(h5f: tables.File) -> dict: return seqid2seq -def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, add: str = '', +def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, output: Path, add: str = '', disable_bar: bool = False): """ Writes the non-redundant CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. 
:param pangenome_filename: Name of the pangenome file - :param file_obj: Name of the output file + :param output: Path to the output file :param add: Add a prefix to sequence header :param disable_bar: disable progress bar """ - logging.getLogger("PPanGGOLiN").info( - f"Extracting and writing non redundant CDS sequences from {pangenome_filename} to {file_obj.name}") + logging.getLogger("PPanGGOLiN").info(f"Extracting and writing non redundant CDS sequences from {pangenome_filename}" + f" to {output.absolute()}") with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: @@ -269,41 +273,41 @@ def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj seqid2cds_name[row["seqid"]] = row["gene"].decode() table = h5f.root.annotations.sequences - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - cds_name = seqid2cds_name[row["seqid"]] - file_obj.write(f'>{add}{cds_name}\n') - file_obj.write(f'{row["dna"].decode()}\n') + with open(output, "w") as file_obj: + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + cds_name = seqid2cds_name[row["seqid"]] + file_obj.write(f'>{add}{cds_name}\n') + file_obj.write(f'{row["dna"].decode()}\n') - file_obj.flush() - -def write_gene_sequences_from_pangenome_file(pangenome_filename: str, file_obj: TextIO, list_cds: iter = None, - add: str = '', disable_bar: bool = False): +def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Path, list_cds: iter = None, + add: str = '', compress: bool = False, disable_bar: bool = False): """ Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. 
:param pangenome_filename: Name of the pangenome file - :param file_obj: Name of the output file + :param output: Path to the sequences file :param list_cds: An iterable object of CDS :param add: Add a prefix to sequence header + :param compress: Compress the output file :param disable_bar: Prevent to print disable progress bar """ - logging.getLogger("PPanGGOLiN").info( - f"Extracting and writing CDS sequences from a {pangenome_filename} file to a fasta file...") - h5f = tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) - - table = h5f.root.annotations.geneSequences - list_cds = set(list_cds) if list_cds is not None else None - seqid2seq = read_sequences(h5f) - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - # Read the table chunk per chunk otherwise RAM dies on big pangenomes - name_cds = row["gene"].decode() - if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds): - file_obj.write('>' + add + name_cds + "\n") - file_obj.write(seqid2seq[row["seqid"]] + "\n") - file_obj.flush() - h5f.close() + logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {pangenome_filename} " + "file to a fasta file...") + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: + table = h5f.root.annotations.geneSequences + list_cds = set(list_cds) if list_cds is not None else None + seqid2seq = read_sequences(h5f) + with write_compressed_or_not(output, compress) as file_obj: + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + # Read the table chunk per chunk otherwise RAM dies on big pangenomes + name_cds = row["gene"].decode() + if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds): + file_obj.write('>' + add + name_cds + "\n") + file_obj.write(seqid2seq[row["seqid"]] + "\n") + logging.getLogger("PPanGGOLiN").debug("Gene sequences from pangenome file was written 
to " + f"{output.absolute()}{'.gz' if compress else ''}") def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): @@ -520,7 +524,8 @@ def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[in local = "" gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, - genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local, coordinates=genedata.coordinates) + genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local, + coordinates=genedata.coordinates) gene.is_fragment = row["is_fragment"] if link: contig = pangenome.get_contig(identifier=int(row["contig"])) @@ -677,7 +682,11 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, source_table = metadata_group._f_get_child(source) for row in tqdm(read_chunks(source_table), total=source_table.nrows, unit='metadata', disable=disable_bar): meta_dict = {'source': source} - identifier = row["ID"].decode() if isinstance(row["ID"], bytes) else int(row["ID"]) + try: + meta_id = row["metadata_id"] + except KeyError: + meta_id = None + identifier = row["ID"].decode("utf-8") if isinstance(row["ID"], bytes) else int(row["ID"]) # else: # identifier = row["name"].decode() if metatype == "families": @@ -701,8 +710,7 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, for field in row.dtype.names: if field not in ["ID", "name"]: meta_dict[field] = row[field].decode() if isinstance(row[field], bytes) else row[field] - meta = Metadata(**meta_dict) - element.add_metadata(source=source, metadata=meta) + element.add_metadata(metadata=Metadata(**meta_dict), metadata_id=meta_id) pangenome.status["metadata"][metatype] = "Loaded" diff --git a/ppanggolin/formats/writeFlatMetadata.py b/ppanggolin/formats/writeFlatMetadata.py index f4b91697..cfd58d12 100644 --- 
a/ppanggolin/formats/writeFlatMetadata.py +++ b/ppanggolin/formats/writeFlatMetadata.py @@ -89,7 +89,7 @@ def write_flat_metadata_files(pangenome: Pangenome, output: Path, for element in getattr(pangenome, element_type_to_attribute[element_type]): for source in set(element.sources) & set(sources): - for metadata in element.get_metadata_by_source(source): + for metadata in element.get_metadata_by_source(source).values(): metadata_dict = metadata.to_dict() if element_type == "spots": metadata_dict[element_type] = f"spot_{element.ID}" diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index e5e6ad2a..868aaf2d 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -974,11 +974,6 @@ def write_spot_modules(output: Path, compress: bool = False): """ logging.getLogger("PPanGGOLiN").info("Writing modules to spot associations...") - fam2mod = {} - for mod in pan.modules: - for fam in mod.families: - fam2mod[fam] = mod - with write_compressed_or_not(output / "modules_spots.tsv", compress) as fout: fout.write("module_id\tspot_id\n") @@ -986,14 +981,14 @@ def write_spot_modules(output: Path, compress: bool = False): curr_mods = defaultdict(set) for rgp in spot.get_uniq_content(): for fam in rgp.families: - mod = fam2mod.get(fam) - if mod is not None: - curr_mods[mod].add(fam) + if fam.module is not None: + curr_mods[fam.module].add(fam) + + for module, mod_families_in_spot in curr_mods.items(): - for mod in curr_mods: - if curr_mods[mod] == mod.families: + if mod_families_in_spot == set(module.families): # if all the families in the module are found in the spot, write the association - fout.write(f"module_{mod.ID}\tspot_{spot.ID}\n") + fout.write(f"module_{module.ID}\tspot_{spot.ID}\n") logging.getLogger("PPanGGOLiN").info( f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") @@ -1239,7 +1234,7 @@ def parser_flat(parser: argparse.ArgumentParser): 
optional.add_argument("--modules", required=False, action="store_true", help="Write a tsv file listing functional modules and the families that belong to them") optional.add_argument("--spot_modules", required=False, action="store_true", - help="writes 3 files comparing the presence of modules within spots") + help="writes 2 files comparing the presence of modules within spots") optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 15091d28..8582abfa 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -92,8 +92,9 @@ def translate(gene: Gene, code: Dict[str, Dict[str, str]]) -> Tuple[str, bool]: partial = False if mod != 0: partial = True - msg = (f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}" - f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.") + msg = ( + f"Gene {gene.ID} {'' if gene.local_identifier == '' else 'with local identifier ' + gene.local_identifier}" + f" has a sequence length of {len(gene.dna)} which modulo 3 was different than 0.") logging.getLogger("PPANGGOLIN").debug(msg) protein = start_table[gene.dna[0: 3]] for i in range(3, len(gene.dna) - mod, 3): @@ -126,7 +127,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory for _, genes in family.get_org_dict().items(): if len(genes) == 1: single_copy_genes.extend(genes) - + with open(f_name, "w") as f_obj: partial = False for gene in single_copy_genes: @@ -160,12 +161,12 @@ def launch_mafft(fname: Path, output: Path, fam_name: str): logging.getLogger("PPanGGOLiN").debug("command: " + " ".join(cmd)) result = subprocess.run(cmd, capture_output=True) - with open(outname, "w") as fout: + with open(outname, "w") as 
fout: fout.write(result.stdout.decode('utf-8')) if result.stderr and result.returncode != 0: raise Exception("mafft failed with the following error:\n" - f"{result.stderr.decode('utf-8')}") + f"{result.stderr.decode('utf-8')}") def launch_multi_mafft(args: List[Tuple[Path, Path, str]]): diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index e72cfec2..dbc95f78 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -5,7 +5,6 @@ import logging from typing import Dict, List, Tuple, Union - # installed libraries from tqdm import tqdm import numpy @@ -58,7 +57,7 @@ def write_metadata_status(pangenome: Pangenome, h5f: tables.File, status_group: if metastatus["modules"] in ["Computed", "Loaded", "inFile"]: metadata_group._v_attrs.modules = True metasources_group._v_attrs.modules = metasources["modules"] - + return True if any(metadata_group._v_attrs._f_list()) else False @@ -99,12 +98,13 @@ def get_metadata_contig_len(select_ctg: List[Contig], source: str) -> Tuple[Dict :return: Maximum type and size of each element """ - type_dict = {"ID": tables.Int64Col()} + type_dict = {"metadata_id": tables.Int64Col(), + "ID": tables.Int64Col()} max_len_dict = {} expected_rows = 0 for contig in select_ctg: - for metadata in contig.get_metadata_by_source(source): + for metadata in contig.get_metadata_by_source(source).values(): for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): if isinstance(value, bytes): value = value.decode('UTF-8') @@ -146,10 +146,13 @@ def write_metadata_contig(h5f: tables.File, source: str, select_contigs: List[Co source_table = h5f.create_table(metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1]) meta_row = source_table.row for contig in tqdm(select_contigs, unit="contigs", desc=f'Source = {source}', disable=disable_bar): - for metadata in contig.get_metadata_by_source(source): + for meta_id, metadata in 
contig.get_metadata_by_source(source).items(): + meta_row["metadata_id"] = meta_id for desc in source_table.colnames: if desc == "ID": meta_row[desc] = contig.ID + elif desc == "metadata_id": + meta_row[desc] = meta_id else: if hasattr(metadata, desc): value = metadata.__getattribute__(desc) @@ -172,7 +175,7 @@ def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFam :return: Maximum type and size of each element """ - type_dict = {} + type_dict = {"metadata_id": tables.Int64Col()} max_len_dict = {} expected_rows = 0 @@ -191,7 +194,7 @@ def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFam else: raise Exception("Unexpected attribute. A recent change could create this error." " Please report the error on our github.") - for metadata in element.get_metadata_by_source(source): + for metadata in element.get_metadata_by_source(source).values(): for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): if isinstance(value, bytes): value = value.decode('UTF-8') @@ -233,17 +236,18 @@ def write_metadata_metatype(h5f: tables.File, source: str, metatype: str, """ metatype_group = write_metadata_group(h5f, metatype) meta_len = get_metadata_len(select_elements, source) - # h5f.remove_node(metatype_group, source) source_table = h5f.create_table(metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1]) meta_row = source_table.row for element in tqdm(select_elements, unit=metatype, desc=f'Source = {source}', disable=disable_bar): - for metadata in element.get_metadata_by_source(source): + for meta_id, metadata in element.get_metadata_by_source(source).items(): for desc in source_table.colnames: if desc == "ID": if hasattr(element, 'name') and len(element.name) > 0: meta_row[desc] = element.name else: meta_row[desc] = element.ID + elif desc == "metadata_id": + meta_row[desc] = meta_id else: if hasattr(metadata, desc): value = metadata.__getattribute__(desc) @@ -301,55 +305,55 @@ def 
write_metadata(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = F """ if pangenome.status["metadata"]["families"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing gene families metadata in pangenome") - select_gf = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["families"][-1], - metatype="families")) + select_gf = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["families"][-1], + metatype="families")) write_metadata_metatype(h5f, pangenome.status["metasources"]["families"][-1], "families", select_gf, disable_bar) pangenome.status["metadata"]["families"] = "Loaded" if pangenome.status["metadata"]["genomes"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing genomes metadata in pangenome") - select_genomes = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["genomes"][-1], - metatype="genomes")) + select_genomes = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["genomes"][-1], + metatype="genomes")) write_metadata_metatype(h5f, pangenome.status["metasources"]["genomes"][-1], "genomes", select_genomes, disable_bar) pangenome.status["metadata"]["genomes"] = "Loaded" if pangenome.status["metadata"]["contigs"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing contigs metadata in pangenome") - select_contigs = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["contigs"][-1], - metatype="contigs")) + select_contigs = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["contigs"][-1], + metatype="contigs")) write_metadata_contig(h5f, pangenome.status["metasources"]["contigs"][-1], select_contigs, disable_bar) pangenome.status["metadata"]["contigs"] = "Loaded" if pangenome.status["metadata"]["genes"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing genes metadata in pangenome") - select_genes = 
list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["genes"][-1], - metatype="genes")) + select_genes = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["genes"][-1], + metatype="genes")) write_metadata_metatype(h5f, pangenome.status["metasources"]["genes"][-1], "genes", select_genes, disable_bar) pangenome.status["metadata"]["genes"] = "Loaded" if pangenome.status["metadata"]["RGPs"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing RGPs metadata in pangenome") - select_rgps = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["RGPs"][-1], - metatype="RGPs")) + select_rgps = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["RGPs"][-1], + metatype="RGPs")) write_metadata_metatype(h5f, pangenome.status["metasources"]["RGPs"][-1], "RGPs", select_rgps, disable_bar) pangenome.status["metadata"]["RGPs"] = "Loaded" if pangenome.status["metadata"]["spots"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing spots metadata in pangenome") - select_spots = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["spots"][-1], - metatype="spots")) + select_spots = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["spots"][-1], + metatype="spots")) write_metadata_metatype(h5f, pangenome.status["metasources"]["spots"][-1], "spots", select_spots, disable_bar) pangenome.status["metadata"]["spots"] = "Loaded" if pangenome.status["metadata"]["modules"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing modules metadata in pangenome") - select_modules = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["modules"][-1], - metatype="modules")) + select_modules = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["modules"][-1], + metatype="modules")) write_metadata_metatype(h5f, pangenome.status["metasources"]["modules"][-1], "modules", select_modules, disable_bar) 
pangenome.status["metadata"]["modules"] = "Loaded" diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index b8057b57..1f31d298 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -7,7 +7,7 @@ import re import subprocess from pathlib import Path -from typing import TextIO, Dict, Set, Iterable +from typing import Dict, Set, Iterable, Union import tempfile import shutil @@ -18,8 +18,9 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Organism -from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, \ - detect_filetype + +from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not, + restricted_float, detect_filetype) from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file module_regex = re.compile(r'^module_\d+') # \d == [0-9] @@ -115,52 +116,75 @@ def check_pangenome_to_write_sequences(pangenome: Pangenome, regions: str = None need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar) -def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_obj: TextIO, add: str = '', - disable_bar: bool = False): +def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], output: Path, add: str = '', + compress: bool = False, disable_bar: bool = False): """ Writes the CDS sequences to a File object, and adds the string provided through `add` in front of it. Loads the sequences from previously computed or loaded annotations. :param genes_to_write: Genes to write. - :param file_obj: Output file to write sequences. + :param output: Path to output file to write sequences. :param add: Add prefix to gene ID. + :param compress: Compress the file in .gz :param disable_bar: Disable progress bar. 
""" - logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {file_obj.name}") - for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar): - if gene.type == "CDS": - file_obj.write(f'>{add}{gene.ID}\n') - file_obj.write(f'{gene.dna}\n') - file_obj.flush() + logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {output.absolute()}") + with write_compressed_or_not(output, compress) as file_obj: + for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar): + if gene.type == "CDS": + file_obj.write(f'>{add}{gene.ID}\n') + file_obj.write(f'{gene.dna}\n') -def translate_genes(sequences: TextIO, tmpdir: Path, threads: int = 1, code: int = 11) -> Path: - """Translate nucleotide sequences into MMSeqs2 amino acid sequences database +def create_mmseqs_db(sequences: Iterable[Path], db_name: str, tmpdir: Path, db_mode: int = 0, db_type: int = 0) -> Path: + """Create a MMseqs2 database from a sequences file. - :param sequences: File with the nucleotide sequences + :param sequences: File with the sequences + :param db_name: name of the database :param tmpdir: Temporary directory to save the MMSeqs2 files - :param threads: Number of available threads to use - :param code: Translation code to use + :param db_mode: Createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q) + :param db_type: Database type 0: auto, 1: amino acid 2: nucleotides :return: Path to the MMSeqs2 database """ - seq_nucdb = tmpdir / 'nucleotide_sequences_db' - cmd = list(map(str, ["mmseqs", "createdb", "--createdb-mode", 1, sequences.name, seq_nucdb])) + assert db_mode in [0, 1], f"Createdb mode must be 0 or 1, given {db_mode}" + assert db_type in [0, 1, 2], f"dbtype must be 0, 1 or 2, given {db_type}" + + seq_nucdb = tmpdir / db_name + cmd = ["mmseqs", "createdb", "--createdb-mode", db_mode, "--dbtype", db_type] + cmd += [seq.absolute().as_posix() for seq in sequences] + [seq_nucdb.absolute()] + cmd = list(map(str, cmd)) 
logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) logging.getLogger("PPanGGOLiN").info("Creating sequence database...") subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + return seq_nucdb + + +def translate_genes(sequences: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1, + is_single_line_fasta: bool = False, code: int = 11) -> Path: + """Translate nucleotide sequences into MMSeqs2 amino acid sequences database + + :param sequences: File with the nucleotide sequences + :param tmpdir: Temporary directory to save the MMSeqs2 files + :param cpu: Number of available threads to use + :param is_single_line_fasta: Allow to use soft link in MMSeqs2 database + :param code: Translation code to use + + :return: Path to the MMSeqs2 database + """ + seq_nucdb = create_mmseqs_db([sequences] if isinstance(sequences, Path) else sequences, 'nucleotides_db', + tmpdir, db_mode=1 if is_single_line_fasta else 0, db_type=2) logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") - seqdb = tmpdir / 'aa_db' - cmd = list( - map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", threads, "--translation-table", code])) + seqdb = tmpdir / 'translate_db' + cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return seqdb def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95, - compress: bool = False, disable_bar: bool = False): + compress: bool = False, disable_bar: bool = False) -> Path: """ Write all nucleotide CDS sequences @@ -190,50 +214,54 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co genes_to_write.extend(fam.genes) logging.getLogger("PPanGGOLiN").info(f"There are {len(genes_to_write)} genes to write") - with write_compressed_or_not(outpath, compress) as fasta: - if 
pangenome.status["geneSequences"] in ["inFile"]: - write_gene_sequences_from_pangenome_file(pangenome.file, fasta, set([gene.ID for gene in genes_to_write]), - disable_bar=disable_bar) - elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]: - write_gene_sequences_from_annotations(genes_to_write, fasta, disable_bar=disable_bar) - else: - # this should never happen if the pangenome has been properly checked before launching this function. - raise AttributeError("The pangenome does not include gene sequences") - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") + if pangenome.status["geneSequences"] in ["inFile"]: + write_gene_sequences_from_pangenome_file(pangenome.file, outpath, set([gene.ID for gene in genes_to_write]), + compress=compress, disable_bar=disable_bar) + elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]: + write_gene_sequences_from_annotations(genes_to_write, outpath, compress=compress, disable_bar=disable_bar) + else: + # this should never happen if the pangenome has been properly checked before launching this function. 
+ raise AttributeError("The pangenome does not include gene sequences") + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}{'.gz' if compress else ''}'") + return outpath -def write_gene_protein_sequences(pangenome: Pangenome, output: Path, genes_prot: str, soft_core: float = 0.95, +def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95, compress: bool = False, keep_tmp: bool = False, tmp: Path = None, - threads: int = 1, code: int = 11, disable_bar: bool = False): + cpu: int = 1, code: int = 11, disable_bar: bool = False): """ Write all amino acid sequences from given genes in pangenome :param pangenome: Pangenome object with gene families sequences :param output: Path to output directory - :param genes_prot: Selected partition of gene + :param proteins: Selected partition of gene :param soft_core: Soft core threshold to use :param compress: Compress the file in .gz :param keep_tmp: Keep temporary directory :param tmp: Path to temporary directory - :param threads: Number of threads available + :param cpu: Number of threads available :param code: Genetic code use to translate nucleotide sequences to protein sequences :param disable_bar: Disable progress bar """ - tmpdir = tmp / "translateGenes" if tmp is not None else Path(f"{tempfile.gettempdir()}/translateGenes") - mk_outdir(tmpdir, True, True) - - write_gene_sequences(pangenome, tmpdir, genes_prot, soft_core, compress, disable_bar) - - with open(tmpdir / f"{genes_prot}_genes.fna") as sequences: - translate_db = translate_genes(sequences, tmpdir, threads, code) - outpath = output / f"{genes_prot}_protein_genes.fna" - cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) - logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") - - if not keep_tmp: - 
logging.getLogger("PPanGGOLiN").debug("Clean temporary directory") - shutil.rmtree(tmpdir) + with create_tmpdir(tmp if tmp is not None else Path(tempfile.gettempdir()), + basename="translateGenes", keep_tmp=keep_tmp) as tmpdir: + + write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar) + + pangenome_sequences = tmpdir / f"{proteins}_genes.fna{'.gz' if compress else ''}" + translate_db = translate_genes(sequences=pangenome_sequences, tmpdir=tmpdir, + cpu=cpu, is_single_line_fasta=True, code=code) + outpath = output / f"{proteins}_protein_genes.fna" + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + if compress: + with write_compressed_or_not(outpath, compress) as compress_file: + with open(outpath, "r") as sequence_file: + shutil.copyfileobj(sequence_file, compress_file) + outpath.unlink() + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}.gz'") + else: + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: @@ -307,12 +335,11 @@ def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str, genefams = select_families(pangenome, gene_families, "representative nucleotide sequences of the gene families", soft_core) - with write_compressed_or_not(outpath, compress) as fasta: - write_gene_sequences_from_pangenome_file(pangenome.file, fasta, [fam.name for fam in genefams], - disable_bar=disable_bar) + write_gene_sequences_from_pangenome_file(pangenome.file, outpath, [fam.name for fam in genefams], + compress=compress, disable_bar=disable_bar) - logging.getLogger("PPanGGOLiN").info( - f"Done writing the representative nucleotide sequences of the gene families : '{outpath}'") + 
logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " + f"of the gene families : '{outpath}{'.gz' if compress else ''}'") def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, soft_core: float = 0.95, @@ -337,8 +364,8 @@ def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, for fam in tqdm(genefams, unit="prot families", disable=disable_bar): fasta.write('>' + fam.name + "\n") fasta.write(fam.sequence + "\n") - logging.getLogger("PPanGGOLiN").info( - f"Done writing the representative amino acid sequences of the gene families : '{outpath}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families: " + f"'{outpath}{'.gz' if compress else ''}'") def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: @@ -475,7 +502,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - raise SyntaxError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") + raise ValueError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") org_dict[elements[0]] = Path(elements[1]) if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) @@ -502,11 +529,12 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa fasta.write(f">{region.name}\n") fasta.write( write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60)) - logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: " + f"'{outname}{'.gz' if compress else ''}'") def 
write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, - soft_core: float = 0.95, regions: str = None, genes: str = None, genes_prot: str = None, + soft_core: float = 0.95, regions: str = None, genes: str = None, proteins: str = None, gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False, **translate_kwgs): """ @@ -519,14 +547,14 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, :param soft_core: Soft core threshold to use :param regions: Write the RGP nucleotide sequences :param genes: Write all nucleotide CDS sequences - :param genes_prot: Write amino acid CDS sequences. + :param proteins: Write amino acid CDS sequences. :param gene_families: Write representative nucleotide sequences of gene families. :param prot_families: Write representative amino acid sequences of gene families. :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ - check_pangenome_to_write_sequences(pangenome, regions, genes, genes_prot, gene_families, prot_families, disable_bar) + check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar) if prot_families is not None: write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar) @@ -534,9 +562,9 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar) if genes is not None: write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) - if genes_prot is not None: - write_gene_protein_sequences(pangenome, output, genes_prot, soft_core, compress, - disable_bar=disable_bar, **translate_kwgs) + if proteins is not None: + write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar, + **translate_kwgs) if regions is not None: 
write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) @@ -549,14 +577,14 @@ def launch(args: argparse.Namespace): """ check_write_sequences_args(args) translate_kwgs = {"code": args.translation_table, - "threads": args.threads, + "cpu": args.cpu, "tmp": args.tmpdir, "keep_tmp": args.keep_tmp} mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, genes_prot=args.genes_prot, + regions=args.regions, genes=args.genes, proteins=args.proteins, gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar, **translate_kwgs) @@ -621,7 +649,7 @@ def parser_seq(parser: argparse.ArgumentParser): ) onereq.add_argument("--genes", required=False, type=filter_values, help=f"Write all nucleotide CDS sequences. {poss_values_log}") - onereq.add_argument("--genes_prot", required=False, type=filter_values, + onereq.add_argument("--proteins", required=False, type=filter_values, help=f"Write representative amino acid sequences of genes. {poss_values_log}") onereq.add_argument("--prot_families", required=False, type=filter_values, help=f"Write representative amino acid sequences of gene families. 
{poss_values_log}") @@ -638,7 +666,7 @@ def parser_seq(parser: argparse.ArgumentParser): optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") optional.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") - optional.add_argument("--threads", required=False, default=1, type=int, help="Number of available threads") + optional.add_argument("--cpu", required=False, default=1, type=int, help="Number of available threads") optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 87d9d82c..cacf5534 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -135,8 +135,7 @@ def check_duplicate_contig_name(): else: logging.getLogger().debug(f"{metatype}: {row[metatype]} doesn't exist") else: - meta = Metadata(source=source, **{k: v for k, v in row.to_dict().items() if k != metatype}) - element.add_metadata(source=source, metadata=meta) + element.add_metadata(Metadata(source=source, **{k: v for k, v in row.to_dict().items() if k != metatype})) pangenome.status["metadata"][metatype] = "Computed" pangenome.status["metasources"][metatype].append(source) diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index 2d6b5c73..b0021b64 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -76,7 +76,7 @@ def fields(self) -> List[str]: fields = list(self.__dict__) fields.remove("source") return fields - + def to_dict(self) -> Dict[str, Any]: """ Get metadata in dict format. 
@@ -103,7 +103,13 @@ class MetaFeatures: def __init__(self): """Constructor method """ - self._metadata_getter = defaultdict(list) + self._metadata_getter = defaultdict(dict) + + @property + def number_of_metadata(self) -> int: + """Get the number of metadata associated to feature + """ + return sum(len(meta_dict) for meta_dict in self._metadata_getter.values()) @property def metadata(self) -> Generator[Metadata, None, None]: @@ -112,8 +118,8 @@ def metadata(self) -> Generator[Metadata, None, None]: :return: Metadata from all sources """ - for meta_list in self._metadata_getter.values(): - for metadata in meta_list: + for meta_dict in self._metadata_getter.values(): + for metadata in meta_dict.values(): yield metadata @property @@ -147,21 +153,33 @@ def formatted_metadata_dict(self, separator: str = "|") -> Dict[str, str]: return {source_field: separator.join(values) for source_field, values in source_field_2_values.items()} - - def add_metadata(self, source, metadata): + def add_metadata(self, metadata: Metadata, metadata_id: int = None) -> None: """Add metadata to metadata getter - :param source: Name of the metadata source :param metadata: metadata value to add for the source + :param metadata_id: metadata identifier :raises AssertionError: Source or metadata is not with the correct type """ assert isinstance(metadata, Metadata), f"Metadata is not with type Metadata but with {type(metadata)}" - assert isinstance(source, str), f"Source is not a string but with {type(source)}" - - self._metadata_getter[source].append(metadata) - def get_metadata_by_source(self, source: str) -> Union[List[Metadata], None]: + # Metadata_id should not already exist because the metadata are added from scratch to a new source, + # or they are ridden + if metadata_id is None: + if self.has_source(metadata.source): + metadata_id = max([meta_id for meta_id in self._metadata_getter[metadata.source].keys()]) + 1 + else: + metadata_id = 0 + + try: + 
self._metadata_getter[metadata.source][metadata_id] + except KeyError: + self._metadata_getter[metadata.source][metadata_id] = metadata + else: + raise KeyError(f"A metadata with ID {metadata_id} already exist " + f"for source {metadata.source} in {str(self)}") + + def get_metadata_by_source(self, source: str) -> Union[Dict[int, Metadata], None]: """Get all the metadata feature corresponding to the source :param source: Name of the source to get @@ -200,16 +218,15 @@ def del_metadata_by_source(self, source: str): def del_metadata_by_attribute(self, **kwargs): """Remove a source from the feature - - :param source: Name of the source to delete """ - for source, metadata in self._metadata_getter.items(): + for source, metadata_dict in self._metadata_getter.items(): for attr, value in kwargs.items(): - if hasattr(metadata, attr): - # BUG If value is a list, the join block detection. - # It would be better to keep a list and change in writing and reading metadata to join the list - if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: - self._metadata_getter[source].remove(metadata) + for meta_id, metadata in metadata_dict.items(): + if hasattr(metadata, attr): + # BUG If value is a list, the join block detection. 
+ # It would be better to keep a list and change in writing and reading metadata to join the list + if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: + del self._metadata_getter[source][meta_id] def max_metadata_by_source(self) -> Tuple[str, int]: """Get the maximum number of metadata for one source @@ -226,4 +243,13 @@ def has_metadata(self) -> bool: :return: True if it has metadata else False """ - return True if len(self._metadata_getter) > 0 else False \ No newline at end of file + return True if self.number_of_metadata > 0 else False + + def has_source(self, source: str) -> bool: + """Check if the source is in the metadata feature + + :param source: name of the source + + :return: True if the source is in the metadata feature else False + """ + return True if source in self._metadata_getter else False diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index d61873a3..37cb3b84 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -827,15 +827,15 @@ def metadata_sources(self, metatype: str) -> Set[str]: def metadata(self, metatype: str) -> Generator[Metadata, None, None]: """Create a generator with all metadatas in the pangenome - :param metatype: Select to which pangenome element metadata should be generate + :param metatype: Select to which pangenome element metadata should be generate :return: Set of metadata source """ for elem in self.select_elem(metatype): yield elem.metadata - def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ - Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: + def get_elem_by_metadata(self, metatype: str, **kwargs + ) -> Generator[Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: """Get element in pangenome with metadata attribute expected :param metatype: Select to which pangenome element metadata @@ -847,16 +847,16 @@ def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ if 
len(list(elem.get_metadata_by_attribute(**kwargs))) > 0: yield elem - def get_elem_by_sources(self, source: List[str], - metatype: str) -> Generator[Union[GeneFamily, Gene, Contig, Organism, - Region, Spot, Module], None, None]: - """ Get gene famlies with a specific source in pangenome + def get_elem_by_source(self, source: str, metatype: str + ) -> Generator[Union[GeneFamily, Gene, Contig, Organism, Region, Spot, Module], None, None]: + """ Get gene families with a specific source in pangenome :param source: Name of the source + :param metatype: select to which pangenome element metadata should be written :return: Gene families with the source """ assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): - if elem.get_metadata_by_source(source) is not None: + if elem.has_source(source): yield elem diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index a21cb85e..8a734201 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -19,7 +19,6 @@ from tqdm import tqdm import networkx as nx import yaml -import pandas as pd # # local libraries from ppanggolin.annotate.synta import read_fasta, get_dna_sequence @@ -39,8 +38,10 @@ from ppanggolin.genome import Organism from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module -from ppanggolin.formats.writeFlatGenomes import write_proksee_organism, manage_module_colors, write_gff_file, write_tsv_genome_file -from ppanggolin.formats.writeFlatPangenome import summarize_spots, summarize_genome, write_summaries_in_tsv, write_rgp_table +from ppanggolin.formats.writeFlatGenomes import write_proksee_organism, manage_module_colors, write_gff_file, \ + write_tsv_genome_file +from ppanggolin.formats.writeFlatPangenome import summarize_spots, summarize_genome, write_summaries_in_tsv, \ + write_rgp_table from ppanggolin.formats.writeSequences import 
read_genome_file @@ -53,7 +54,8 @@ class NewSpot(Spot): def __str__(self): return f'new_spot_{str(self.ID)}' -def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): + +def check_pangenome_for_projection(pangenome: Pangenome, fast_aln: bool): """ Check the status of a pangenome and determine whether projection is possible. @@ -76,7 +78,6 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): predict_rgp = True project_spots = True - if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: raise NameError("The provided pangenome has not been partitioned. " "Annotation of an external genome is therefore not possible. " @@ -99,7 +100,7 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): "Projection of modules into the provided genome will not be performed.") project_modules = False - + if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not fast_aln: raise Exception("The provided pangenome has no gene sequences. " "Projection is still possible with the --fast option to use representative " @@ -108,22 +109,19 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"]: raise Exception("The provided pangenome has no gene families sequences. 
" "This is not possible to annotate an input genome to this pangenome.") - + return predict_rgp, project_spots, project_modules -def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, - organism_name, circular_contigs, pangenome_params, - cpu, use_pseudo, disable_bar, tmpdir, config): - """ - """ +def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, organism_name, circular_contigs, + pangenome_params, cpu, use_pseudo, disable_bar, tmpdir, config): genome_name_to_path = None if input_mode == "multiple": if anno: input_type = "annotation" genome_name_to_path = parse_input_paths_file(anno) - + elif fasta: input_type = "fasta" genome_name_to_path = parse_input_paths_file(fasta) @@ -134,12 +132,12 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, if anno: input_type = "annotation" genome_name_to_path = {organism_name: {"path": anno, - "circular_contigs": circular_contigs}} - + "circular_contigs": circular_contigs}} + elif fasta: input_type = "fasta" genome_name_to_path = {organism_name: {"path": fasta, - "circular_contigs": circular_contigs}} + "circular_contigs": circular_contigs}} if input_type == "annotation": check_input_names(pangenome, genome_name_to_path) @@ -155,9 +153,9 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, else: raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " - "genomes without associated sequence data, and you did not provide " - "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. " - f"The following genomes have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") + "genomes without associated sequence data, and you did not provide " + "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. 
" + f"The following genomes have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") elif input_type == "fasta": annotate_param_names = ["norna", "kingdom", @@ -166,22 +164,25 @@ def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, config) check_input_names(pangenome, genome_name_to_path) - organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu, - translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, - allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar) + organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu, + translation_table=int(pangenome_params.cluster.translation_table), + norna=annotate_params.norna, kingdom=annotate_params.kingdom, + allow_overlap=annotate_params.allow_overlap, + procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar) return organisms, genome_name_to_path, input_type -def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input_org_2_rgps:Dict[Organism, Set[Region]], - input_org_to_spots:Dict[Organism, Set[Spot]], - input_orgs_to_modules:Dict[Organism, Set[Module]] , - input_org_to_lonely_genes_count:Dict[Organism, int], - write_proksee:bool, write_gff:bool, write_table:bool, - add_sequences:bool, - genome_name_to_path:Dict[str,dict], input_type:str, - output_dir:Path, dup_margin:float, soft_core:float, - metadata_sep:str, compress:bool, - need_regions:bool, need_spots:bool, need_modules:bool): +def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], + input_org_2_rgps: Dict[Organism, Set[Region]], + input_org_to_spots: Dict[Organism, Set[Spot]], + input_orgs_to_modules: Dict[Organism, Set[Module]], + 
input_org_to_lonely_genes_count: Dict[Organism, int], + write_proksee: bool, write_gff: bool, write_table: bool, + add_sequences: bool, + genome_name_to_path: Dict[str, dict], input_type: str, + output_dir: Path, dup_margin: float, soft_core: float, + metadata_sep: str, compress: bool, + need_regions: bool, need_spots: bool, need_modules: bool): """ Write the results of the projection of pangneome onto input genomes. @@ -215,14 +216,12 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input # single_copy_families = get_single_copy_families(pangenome, dup_margin) # multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - - - # dup margin value here is specified in argument and is used to compute completeness. + # dup margin value here is specified in argument and is used to compute completeness. # Thats mean it can be different than dup margin used in spot and RGPS. - pangenome_persistent_single_copy_families = pangenome.get_single_copy_persistent_families(dup_margin = dup_margin, exclude_fragments=True) + pangenome_persistent_single_copy_families = pangenome.get_single_copy_persistent_families(dup_margin=dup_margin, + exclude_fragments=True) pangenome_persistent_count = len([fam for fam in pangenome.gene_families if fam.named_partition == "persistent"]) - soft_core_families = pangenome.soft_core_families(soft_core) exact_core_families = pangenome.exact_core_families() @@ -235,17 +234,17 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input singleton_gene_count = input_org_to_lonely_genes_count[organism] org_summary = summarize_projected_genome(organism, - pangenome_persistent_count, - pangenome_persistent_single_copy_families, - soft_core_families=soft_core_families, - exact_core_families=exact_core_families, - input_org_rgps=input_org_2_rgps.get(organism, None), - input_org_spots=input_org_to_spots.get(organism, None), - input_org_modules=input_orgs_to_modules.get(organism, None), - 
pangenome_file=pangenome.file, - singleton_gene_count=singleton_gene_count) + pangenome_persistent_count, + pangenome_persistent_single_copy_families, + soft_core_families=soft_core_families, + exact_core_families=exact_core_families, + input_org_rgps=input_org_2_rgps.get(organism, None), + input_org_spots=input_org_to_spots.get(organism, None), + input_org_modules=input_orgs_to_modules.get(organism, None), + pangenome_file=pangenome.file, + singleton_gene_count=singleton_gene_count) summaries.append(org_summary) - + yaml_outputfile = output_dir / organism.name / "projection_summary.yaml" write_summary_in_yaml(org_summary, yaml_outputfile) @@ -265,7 +264,7 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input genome_sequences=genome_sequences, compress=compress) if write_gff: - if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN + if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN annotation_sources = {"rRNA": "external", "tRNA": "external", "CDS": "external"} @@ -279,14 +278,13 @@ def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input if write_table: write_tsv_genome_file(organism, output_dir / organism.name, compress=compress, metadata_sep=metadata_sep, - need_regions=need_regions, need_spots=need_spots, need_modules=need_modules) + need_regions=need_regions, need_spots=need_spots, need_modules=need_modules) output_file = output_dir / "summary_projection.tsv" write_summaries_in_tsv(summaries, output_file=output_file, dup_margin=dup_margin, soft_core=soft_core) - def summarize_projected_genome(organism: Organism, @@ -496,7 +494,7 @@ def check_input_names(pangenome, input_names): f"{len(duplicated_names)} provided genome name(s) already exist in the given pangenome: {' '.join(duplicated_names)}") -def write_summary_in_yaml(summary_info: Dict[str, Any], output_file: Path): +def write_summary_in_yaml(summary_info: Dict[str, Any], output_file: 
Path): """ Write summary information to a YAML file. @@ -538,42 +536,36 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :return: Number of genes that do not cluster with any of the gene families of the pangenome. """ - seq_fasta_files = [] + logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.') - logging.getLogger('PPanGGOLiN').info('Writting gene sequences of input genomes.') + input_genes = [gene for org in input_organisms for gene in org.genes] - for input_organism in input_organisms: - seq_outdir = output / input_organism.name - mk_outdir(seq_outdir, force=True) - - seq_fasta_file = seq_outdir / "cds_sequences.fasta" - - with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, - add="ppanggolin_") + seq_fasta_file = output / 'input_genes.fasta' - seq_fasta_files.append(seq_fasta_file) - - with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: + write_gene_sequences_from_annotations(input_genes, seq_fasta_file, disable_bar=True, add='ppanggolin_') + with create_tmpdir(main_dir=tmpdir, basename="projection_tmp", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: - _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output=new_tmpdir, - tmpdir=new_tmpdir, is_input_seq_nt=True, - cpu=cpu, no_defrag=no_defrag, identity=identity, - coverage=coverage, + _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome, + sequence_files=seq_fasta_file, output=new_tmpdir, + tmpdir=new_tmpdir, input_type="nucleotide", + is_input_slf=True, cpu=cpu, no_defrag=no_defrag, + identity=identity, coverage=coverage, translation_table=translation_table) else: _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, - sequence_files=seq_fasta_files, + sequence_files=seq_fasta_file, output=new_tmpdir, 
tmpdir=new_tmpdir, - is_input_seq_nt=True, + input_type="nucleotide", is_input_slf=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) + input_org_to_lonely_genes_count = {} for input_organism in input_organisms: org_outdir = output / input_organism.name + mk_outdir(org_outdir, force=True) seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes} @@ -606,7 +598,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org gene_family = GeneFamily(pangenome.max_fam_id, new_name) pangenome.add_gene_family(gene_family) - + gene_family.partition = "Cloud" lonely_genes.add(gene) @@ -616,7 +608,7 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org "The input genome contains a gene that aligns to a pangenome family " f"which already contains a gene with the same ID ({gene_id}). " f"The genome name has been appended to the family name: {new_name}") - + gene.ID = new_name # Add the gene to the gene family @@ -624,7 +616,6 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org pangenome._mk_gene_getter() # re-build the gene getter - logging.getLogger('PPanGGOLiN').info( f"{input_organism.name} has {len(lonely_genes)}/{input_organism.number_of_genes()} " "specific genes that do not align to any gene of the pangenome.") @@ -923,7 +914,7 @@ def predict_spot_in_one_organism( overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1, - compress:bool = False) -> Set[Spot]: + compress: bool = False) -> Set[Spot]: """ Predict spots for input organism RGPs. 
@@ -960,10 +951,11 @@ def predict_spot_in_one_organism( input_org_node_to_rgps[border_node].add(rgp) if len(input_org_node_to_rgps) == 0: - logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: no RGPs of the input genome will be associated with any spot of insertion " - "as they are on a contig border (or have " - f"less than {set_size} persistent gene families until the contig border). " - "Projection of spots stops here") + logging.getLogger("PPanGGOLiN").debug( + f"{organism_name}: no RGPs of the input genome will be associated with any spot of insertion " + "as they are on a contig border (or have " + f"less than {set_size} persistent gene families until the contig border). " + "Projection of spots stops here") return set() # remove node that were already in the graph @@ -1048,7 +1040,7 @@ def predict_spot_in_one_organism( filename='input_genome_rgp_to_spot.tsv', compress=compress) input_org_spots = {spot for spots in input_rgp_to_spots.values() - for spot in spots } + for spot in spots} new_spots = {spot for spot in input_org_spots if isinstance(spot, NewSpot)} logging.getLogger('PPanGGOLiN').debug( @@ -1189,8 +1181,6 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume return input_mode - - def launch(args: argparse.Namespace): """ Command launcher @@ -1208,41 +1198,43 @@ def launch(args: argparse.Namespace): predict_rgp, project_spots, project_modules = check_pangenome_for_projection(pangenome, args.fast) - need_graph =True if args.table else False + need_graph = True if args.table else False check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, need_spots=project_spots, need_graph=need_graph) - + logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') pangenome_params = argparse.Namespace( **{step: argparse.Namespace(**k_v) for step, k_v in 
pangenome.parameters.items()}) if predict_rgp: - #computing multigenics for rgp prediction first to have original family.number_of_genomes + # computing multigenics for rgp prediction first to have original family.number_of_genomes # and the same multigenics list as when rgp and spot were predicted multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - - organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, - input_mode=args.input_mode, - anno=args.anno, fasta=args.fasta, - organism_name=args.genome_name, - circular_contigs=args.circular_contigs, - pangenome_params=pangenome_params, - cpu=args.cpu, use_pseudo=args.use_pseudo, - disable_bar=args.disable_prog_bar, - tmpdir= args.tmpdir, config=args.config) - - + + organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, + input_mode=args.input_mode, + anno=args.anno, fasta=args.fasta, + organism_name=args.genome_name, + circular_contigs=args.circular_contigs, + pangenome_params=pangenome_params, + cpu=args.cpu, + use_pseudo=args.use_pseudo, + disable_bar=args.disable_prog_bar, + tmpdir=args.tmpdir, config=args.config) input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, - output=output_dir, cpu=args.cpu, use_representatives=args.fast, - no_defrag=args.no_defrag, identity=args.identity, - coverage=args.coverage, tmpdir=args.tmpdir, - translation_table=int(pangenome_params.cluster.translation_table), - keep_tmp=args.keep_tmp, - disable_bar=args.disable_prog_bar) - + output=output_dir, cpu=args.cpu, + use_representatives=args.fast, + no_defrag=args.no_defrag, + identity=args.identity, + coverage=args.coverage, + tmpdir=args.tmpdir, + translation_table=int( + pangenome_params.cluster.translation_table), + keep_tmp=args.keep_tmp, + disable_bar=args.disable_prog_bar) input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} @@ -1250,38 
+1242,41 @@ def launch(args: argparse.Namespace): logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') - input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, - min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir, - disable_bar=args.disable_prog_bar, compress=args.compress) + input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, + variable_gain=pangenome_params.rgp.variable_gain, + min_length=pangenome_params.rgp.min_length, + min_score=pangenome_params.rgp.min_score, multigenics=multigenics, + output_dir=output_dir, + disable_bar=args.disable_prog_bar, compress=args.compress) if project_spots: logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), - initial_regions=pangenome.regions, - input_org_2_rgps=input_org_2_rgps, - multigenics=multigenics, - output=output_dir, - write_graph_flag=args.spot_graph, - graph_formats=args.graph_formats, - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size, - compress=args.compress) + initial_regions=pangenome.regions, + input_org_2_rgps=input_org_2_rgps, + multigenics=multigenics, + output=output_dir, + write_graph_flag=args.spot_graph, + graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size, + compress=args.compress) if project_modules: input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir, compress=args.compress) write_projection_results(pangenome, organisms, input_org_2_rgps, - 
input_org_to_spots, - input_orgs_to_modules, - input_org_to_lonely_genes_count, - write_proksee=args.proksee, write_gff=args.gff, write_table=args.table, - add_sequences=args.add_sequences, - genome_name_to_path=genome_name_to_path, input_type=input_type, - output_dir=output_dir, dup_margin=args.dup_margin, soft_core=args.soft_core, - metadata_sep=args.metadata_sep, compress=args.compress, - need_modules=project_modules, need_spots=project_spots, need_regions=predict_rgp) - + input_org_to_spots, + input_orgs_to_modules, + input_org_to_lonely_genes_count, + write_proksee=args.proksee, write_gff=args.gff, write_table=args.table, + add_sequences=args.add_sequences, + genome_name_to_path=genome_name_to_path, input_type=input_type, + output_dir=output_dir, dup_margin=args.dup_margin, soft_core=args.soft_core, + metadata_sep=args.metadata_sep, compress=args.compress, + need_modules=project_modules, need_spots=project_spots, need_regions=predict_rgp) + def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ @@ -1355,12 +1350,12 @@ def parser_projection(parser: argparse.ArgumentParser): help="minimum ratio of genomes in which the family must have multiple genes " "for it to be considered 'duplicated'. " "This metric is used to compute completeness and duplication of the input genomes") - + optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, - help="Soft core threshold used when generating general statistics on the projected genome. " - "This threshold does not influence PPanGGOLiN's partitioning. " - "The value determines the minimum fraction of genomes that must possess a gene family " - "for it to be considered part of the soft core.") + help="Soft core threshold used when generating general statistics on the projected genome. " + "This threshold does not influence PPanGGOLiN's partitioning. 
" + "The value determines the minimum fraction of genomes that must possess a gene family " + "for it to be considered part of the soft core.") optional.add_argument("--spot_graph", required=False, action="store_true", help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs " @@ -1374,13 +1369,13 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("--proksee", required=False, action="store_true", help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input genome.") - + optional.add_argument("--table", required=False, action="store_true", help="Generate a tsv file for each input genome with pangenome annotations.") - + optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") - + optional.add_argument("--add_sequences", required=False, action="store_true", help="Include input genome DNA sequences in GFF and Proksee output.") @@ -1392,21 +1387,21 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") - + optional.add_argument("--add_metadata", - required=False, - action="store_true", - help="Include metadata information in the output files " - "if any have been added to pangenome elements (see ppanggolin metadata command).") + required=False, + action="store_true", + help="Include metadata information in the output files " + "if any have been added to pangenome elements (see ppanggolin metadata command).") optional.add_argument("--metadata_sources", - default=None, - nargs="+", - help="Which source of metadata should be written. " - "By default all metadata sources are included.") + default=None, + nargs="+", + help="Which source of metadata should be written. 
" + "By default all metadata sources are included.") optional.add_argument("--metadata_sep", - required=False, - default='|', - help="The separator used to join multiple metadata values for elements with multiple metadata" - " values from the same source. This character should not appear in metadata values.") + required=False, + default='|', + help="The separator used to join multiple metadata values for elements with multiple metadata" + " values from the same source. This character should not appear in metadata values.") diff --git a/tests/align/test_align.py b/tests/align/test_align.py new file mode 100644 index 00000000..42d1cd5f --- /dev/null +++ b/tests/align/test_align.py @@ -0,0 +1,82 @@ +import pytest +from typing import List +from random import choice, randint + +from ppanggolin.align.alignOnPang import get_seq_ids + + +@pytest.fixture +def single_line_fasta_nt() -> List: + + return ['>Gene_1 seq_description\n', + 'ATGCGTTGTCGTTG\n', + ">Gene_2\n", + "TGTGACCTGCT\n" + ] + +@pytest.fixture +def single_line_fasta_aa() -> List: + + return ['>Gene_1 seq_description\n', + 'YWTPRPFFYAAEYNN\n', + ">Gene_2\n", + "YWTPRPSYWTPAAEYNN\n" + ] + +@pytest.fixture +def multi_line_fasta_nt() -> List: + + return ['>Gene_1 seq_description\n', + 'ATGCGT\n', + 'TGTCGTTG\n', + ">Gene_2\n", + "TGTGACCTGCT\n" + ] + +@pytest.fixture +def multi_line_fasta_aa() -> List: + + return ['>Gene_1 seq_description\n', + 'AAEYNN\n', + 'YWTPRPFFY\n', + ">Gene_2\n", + "YWTPRPS\n", + "YWTPAAEYNN\n" + ] + + +def test_get_seq_ids_single_line_nt(single_line_fasta_nt): + + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_nt) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert is_nucleotide + assert single_line_fasta + +def test_get_seq_ids_single_line_aa(single_line_fasta_aa): + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_aa) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert not 
is_nucleotide + assert single_line_fasta + +def test_get_seq_ids_multi_line_nt(multi_line_fasta_nt): + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_nt) + + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert is_nucleotide + assert not single_line_fasta + +def test_get_seq_ids_multi_line_aa(multi_line_fasta_aa): + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_aa) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert not is_nucleotide + assert not single_line_fasta + + + diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 00000000..393e1386 --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + nucleotides: marks tests to run with nucleotides alphabets + aminoacids: marks tests to run with amino-acids alphabets \ No newline at end of file diff --git a/tests/test_metadata.py b/tests/test_metadata.py index d4bab2e4..95928b05 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -83,24 +83,24 @@ def metafeatures(self, metadata) -> Generator[MetaFeatures, None, None]: """ metafeatures = MetaFeatures() for meta in metadata: - metafeatures.add_metadata(meta.source, meta) + metafeatures.add_metadata(meta) yield metafeatures def test_add_metadata(self, metafeatures, metadata): """Tests that metadata can be added to the metadata getter """ - assert all(metafeatures._metadata_getter[meta.source] == [meta] for meta in metadata) + assert all(list(metafeatures._metadata_getter[meta.source].values()) == [meta] for meta in metadata) def test_get_metadata_feature_corresponding_to_source(self, metafeatures, metadata): """Tests that all the metadata features corresponding to a source can be retrieved """ - assert all(metafeatures.get_metadata_by_source(meta.source) == [meta] for meta in metadata) + assert all(list(metafeatures.get_metadata_by_source(meta.source).values()) == [meta] for meta in metadata) def 
test_remove_source_from_feature(self, metafeatures): """Tests that a source can be removed from the feature """ metadata = Metadata("source_del", attribute1="value") - metafeatures.add_metadata("source_del", metadata) + metafeatures.add_metadata(metadata) metafeatures.del_metadata_by_source("source_del") assert metafeatures.get_metadata_by_source("source_del") is None @@ -114,7 +114,7 @@ def test_get_metadata_by_attribute_values(self, metafeatures): """ meta = Metadata("source_test", attribute1="value_to_retrieve") # meta_list = Metadata("source_list", attribute1=["val_1", "val_2"]) - metafeatures.add_metadata(meta.source, meta) + metafeatures.add_metadata(meta) # metafeatures[meta_list.source] = meta_list assert list(metafeatures.get_metadata_by_attribute(attribute1="value_to_retrieve")) == [meta] # assert list(metafeatures.get_metadata(attribute1="val_1")) == [meta_list] @@ -124,26 +124,12 @@ def test_get_maximum_number_of_metadata_for_one_source(self, metafeatures, metad """ metadata1 = Metadata("source_max", attribute1="value1") metadata2 = Metadata("source_max", attribute2="value2") - metafeatures.add_metadata("source_max", metadata1) - metafeatures.add_metadata("source_max", metadata2) + metafeatures.add_metadata(metadata1) + metafeatures.add_metadata(metadata2) assert metafeatures.max_metadata_by_source() == ("source_max", 2) def test_metadata_is_not_with_type_metadata(self, metafeatures): """Tests that an AssertionError is raised when metadata is not with type Metadata """ with pytest.raises(AssertionError): - metafeatures.add_metadata("source1", "not_metadata") - - def test_source_is_not_a_string(self, metafeatures): - """Tests that an AssertionError is raised when the source is not a string - """ - - metadata = Metadata("source1", attribute1="value1") - with pytest.raises(AssertionError): - metafeatures.add_metadata(1, metadata) - - def test_source_or_metadata_is_not_with_correct_type(self, metafeatures, metadata): - """Tests that an AssertionError is 
raised when the source or metadata is not with the correct type - """ - with pytest.raises(AssertionError): - metafeatures.add_metadata(1, "not_metadata") + metafeatures.add_metadata("not_metadata") diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index d67eb1ce..90483079 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -804,25 +804,25 @@ def add_element_to_pangenome(self, pangenome): """ metadata = Metadata(source="source", attribute="attr") family = GeneFamily(family_id=pangenome.max_fam_id, name="Fam") - family.add_metadata(source=metadata.source, metadata=metadata) + family.add_metadata(metadata=metadata) pangenome.add_gene_family(family) org = Organism("Org") - org.add_metadata(source=metadata.source, metadata=metadata) + org.add_metadata(metadata=metadata) ctg = Contig(0, "Ctg") org.add(ctg) gene = Gene("Gene") - gene.position, gene.start = (0, 0) - gene.add_metadata(source=metadata.source, metadata=metadata) - ctg[gene.start] = gene + gene.fill_annotations(start=0, stop=100, position=0, strand='+') + gene.add_metadata(metadata=metadata) + ctg.add(gene) pangenome.add_organism(org) rgp = Region("RGP") - rgp.add_metadata(source=metadata.source, metadata=metadata) + rgp.add_metadata(metadata=metadata) pangenome.add_region(rgp) spot = Spot(0) - spot.add_metadata(source=metadata.source, metadata=metadata) + spot.add_metadata(metadata=metadata) pangenome.add_spot(spot) module = Module(0) - module.add_metadata(source=metadata.source, metadata=metadata) + module.add_metadata(metadata=metadata) pangenome.add_module(module) def test_select_elem(self, add_element_to_pangenome, pangenome): @@ -876,7 +876,7 @@ def test_get_elem_by_metadata(self, add_element_to_pangenome, pangenome): assert isinstance(metadata, Metadata) assert metadata.source == 'source' - def test_get_elem_by_sources(self, add_element_to_pangenome, pangenome): + def test_get_elem_by_source(self, add_element_to_pangenome, pangenome): """Tests the metadata generator 
filtered by source of the Pangenome class. :param add_element_to_pangenome: Add elements to the pangenome @@ -884,7 +884,7 @@ def test_get_elem_by_sources(self, add_element_to_pangenome, pangenome): """ for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, "spots": Spot, "modules": Module}.items(): - for elem in pangenome.get_elem_by_sources(source='source', metatype=metatype): + for elem in pangenome.get_elem_by_source(source='source', metatype=metatype): assert isinstance(elem, expected_type) for metadata in elem.metadata: assert isinstance(metadata, Metadata)