diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 88088f66..2d9e9106 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -120,7 +120,7 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS + ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md index eca6a630..53c118ae 100644 --- a/docs/user/PangenomeAnalyses/pangenomeCluster.md +++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md @@ -54,6 +54,11 @@ You can do this from the command line: An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) +```{note} +When you provide your clustering, *PPanGGOLiN* will translate the representative gene sequence of each family and write it in the HDF5 file. +``` + + ### Defragmentation diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index a99ba3a4..eee073b8 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -27,7 +27,7 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]], tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, - is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False, + query_type: str = "unknow", is_query_slf: bool = False, target_type: str = "unknow", is_target_slf: bool = False, translation_table: int = None) -> Path: """ Align fasta sequence to pangenome sequences. @@ -39,34 +39,36 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi :param no_defrag: Do not apply defragmentation :param identity: minimal identity threshold for the alignment :param coverage: minimal identity threshold for the alignment - :param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs + :param query_type: Sequences type of the file (query). [nucleotide, protein, unknow] :param is_query_slf: Is the sequence file (query) with single line fasta. If True, MMSeqs2 database will be with soft link - :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs + :param target_type: Sequences type of pangenome (target). [nucleotide, aminoacid, protein] :param is_target_slf: Is the sequences of pangenome (target) with single line fasta. If True, MMSeqs2 database will be with soft link :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated. :return: Alignment result file """ - if is_target_nt: + if target_type == "nucleotide": logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with " f"translation table {translation_table}") with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir: # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table) else: + db_type = 1 if target_type == "protein" else 0 target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file, - 'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1) + 'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=db_type) - if is_query_nt: + if query_type == "nucleotide": logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs " f"with translation table {translation_table}") with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir: # Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table) else: + db_type = 1 if query_type == "protein" else 0 query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files, - 'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1) + 'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=db_type) cov_mode = "2" # coverage of query if no_defrag: @@ -390,7 +392,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, - tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1, + tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar: bool = False ) -> Tuple[Path, Dict[str, GeneFamily]]: @@ -404,7 +406,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. - :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param input_type: Type of input sequence file. [nucleotide, aminoacid, unknow] :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). @@ -421,10 +423,10 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}') write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar) - align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, - tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, - is_target_nt=False, is_target_slf=True, translation_table=translation_table) + align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, + cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, + query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True, + target_type="protein", translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome) @@ -432,7 +434,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path, - tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1, + tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11, disable_bar: bool = False ) -> Tuple[Path, Dict[str, GeneFamily]]: @@ -446,7 +448,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Unio :param sequence_files: Iterable of paths of FASTA files containing input sequences to align. :param output: Path to the output directory where alignment results will be stored. :param tmpdir: Temporary directory for intermediate files. - :param is_input_seq_nt: Is input sequence file nucleotide sequences. + :param input_type: Sequences type of the file (query). [nucleotide, protein, unknow] :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link :param cpu: Number of CPU cores to use for the alignment (default: 1). :param no_defrag: If True, the defragmentation workflow is skipped (default: False). @@ -464,8 +466,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Unio align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, - is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True, - is_target_slf=True, translation_table=translation_table) + query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True, + target_type="nucleotide", translation_table=translation_table) seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome) @@ -516,19 +518,19 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj) with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - + input_type = "nucleotide" if is_nucleotide else "unknow" if use_representatives: align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, - is_input_seq_nt=is_nucleotide, - is_input_slf=single_line_fasta, - cpu=cpu, no_defrag=no_defrag, identity=identity, + input_type=input_type, + is_input_slf=single_line_fasta, cpu=cpu, + no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, output=output, tmpdir=new_tmpdir, - is_input_seq_nt=is_nucleotide, + input_type=input_type, is_input_slf=single_line_fasta, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 6bf37a12..659954c9 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -510,13 +510,11 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li rna_counter += 1 genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_to_metadata) - - genome_meta_obj = Metadata(source='annotation_file', **genome_metadata) - organism.add_metadata(source="annotation_file", metadata=genome_meta_obj) + + organism.add_metadata(metadata=Metadata(source='annotation_file', **genome_metadata)) for contig, metadata_dict in contig_to_uniq_metadata.items(): - contigs_meta_obj = Metadata(source='annotation_file', **metadata_dict) - contig.add_metadata(source="annotation_file", metadata=contigs_meta_obj) + contig.add_metadata(Metadata(source='annotation_file', **metadata_dict)) if used_transl_table_arg: logging.getLogger("PPanGGOLiN").info( @@ -791,8 +789,7 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, str], org: if len(contig_name_to_region_info) == org.number_of_contigs: genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_name_to_region_info) if genome_metadata: - genome_meta_obj = Metadata(source='annotation_file', **genome_metadata) - org.add_metadata(source="annotation_file", metadata=genome_meta_obj) + org.add_metadata(Metadata(source='annotation_file', **genome_metadata)) else: logging.getLogger("PPanGGOLiN").warning( f"Inconsistent data in GFF file {gff_file_path}: " @@ -804,9 +801,8 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, str], org: contig = org.get(contig_name) except KeyError: raise ValueError(f"Contig '{contig_name}' does not exist in the genome object created from GFF file {gff_file_path}.") - - contigs_meta_obj = Metadata(source='annotation_file', **metadata_dict) - contig.add_metadata(source="annotation_file", metadata=contigs_meta_obj) + + contig.add_metadata(Metadata(source='annotation_file', **metadata_dict)) def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separation: int = 10): diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index aed0a4dc..ab1046b9 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -19,7 +19,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene from ppanggolin.geneFamily import GeneFamily -from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir, mk_outdir +from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db @@ -61,7 +61,8 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force: elif pangenome.status["geneSequences"] == "inFile": logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file") write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_", - compress=False, disable_bar=disable_bar) # write CDS sequences to the tmpFile + compress=False, + disable_bar=disable_bar) # write CDS sequences to the tmpFile else: raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " "the genes in gene families. Either provide clustering results (see --clusters), " @@ -286,7 +287,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime()) dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}' with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path: - sequence_path = tmp_path/'nucleotide_sequences.fna' + sequence_path = tmp_path / 'nucleotide_sequences.fna' check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode) @@ -356,8 +357,32 @@ def infer_singletons(pangenome: Pangenome): logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families") -def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False, - disable_bar: bool = False): +def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1, + tmpdir: Path = None, keep_tmp: bool = False): + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir + with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp: + repres_path = tmp / "representative.fna" + with open(repres_path, "w") as repres_seq: + for family in pangenome.gene_families: + repres_seq.write(f">{family.name}\n") + repres_seq.write(f"{family.representative.dna}\n") + translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu, + is_single_line_fasta=True, code=code) + outpath = tmp / "representative_protein_genes.fna" + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) + run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n") + with open(outpath, "r") as repres_prot: + lines = repres_prot.readlines() + while len(lines) > 0: + family_name = lines.pop(0).strip()[1:] + family_seq = lines.pop(0).strip() + family = pangenome.get_gene_family(family_name) + family.add_sequence(family_seq) + + +def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, + code: int = 11, cpu: int = 1, tmpdir: Path = None, keep_tmp: bool = False, + force: bool = False, disable_bar: bool = False): """ Get the pangenome information, the gene families and the genes with an associated gene family. Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome. @@ -365,11 +390,15 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet :param pangenome: Input Pangenome :param families_tsv_file: MMseqs2 clustering results :param infer_singleton: creates a new family for each gene with no associated family + :param code: Genetic code used for sequence translation. + :param cpu: Number of CPU cores to use for clustering. + :param tmpdir: Path to a temporary directory for intermediate files. + :param keep_tmp: Keep temporary files (useful for debugging). :param force: force to write in the pangenome :param disable_bar: Allow to disable progress bar """ check_pangenome_former_clustering(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar) + check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=True, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size @@ -421,10 +450,12 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet f"You can either update your cluster file to ensure each gene has a cluster assignment, " f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene." ) + get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp) pangenome.status["genesClustered"] = "Computed" if frag: # if there was fragment information in the file. pangenome.status["defragmented"] = "Computed" + pangenome.status["geneFamilySequences"] = "Computed" pangenome.parameters["cluster"] = {} pangenome.parameters["cluster"]["# read_clustering_from_file"] = True pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton @@ -450,7 +481,8 @@ def launch(args: argparse.Namespace): if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, args.coverage, args.identity, args.mode]: logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.") - read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar) + read_clustering(pangenome, args.clusters, args.infer_singletons, args.translation_table, + args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar) logging.getLogger("PPanGGOLiN").info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) @@ -488,11 +520,6 @@ def parser_clust(parser: argparse.ArgumentParser): clust.add_argument('--no_defrag', required=False, default=False, action="store_true", help="DO NOT Use the defragmentation strategy to link potential fragments " "with their original gene family.") - clust.add_argument("--translation_table", required=False, default="11", - help="Translation table (genetic code) to use.") - # clust.add_argument("--compress") - - clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") read = parser.add_argument_group(title="Read clustering arguments") read.add_argument('--clusters', required=False, type=Path, @@ -502,7 +529,10 @@ def parser_clust(parser: argparse.ArgumentParser): help="When reading a clustering result with --clusters, if a gene is not in the provided file" " it will be placed in a cluster where the gene is the only member.") optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), + optional.add_argument("--translation_table", required=False, default="11", + help="Translation table (genetic code) to use.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") optional.add_argument("--keep_tmp", required=False, default=False, action="store_true", help="Keeping temporary files (useful for debugging).") diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 508203fc..512d12a6 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -74,18 +74,17 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences") with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir: - + input_type = "nucleotide" if is_nucleotide else "unknow" if use_representatives: _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir, - is_input_seq_nt=is_nucleotide, is_input_slf=is_slf, - cpu=cpu, no_defrag=no_defrag, - identity=identity, coverage=coverage, - translation_table=translation_table, + input_type=input_type, is_input_slf=is_slf, cpu=cpu, + no_defrag=no_defrag, identity=identity, + coverage=coverage, translation_table=translation_table, disable_bar=disable_bar) else: _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file, output=output, tmpdir=new_tmpdir, - is_input_seq_nt=is_nucleotide, is_input_slf=is_slf, + input_type=input_type, is_input_slf=is_slf, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 4c90be2c..85c1e7ac 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -682,7 +682,11 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, source_table = metadata_group._f_get_child(source) for row in tqdm(read_chunks(source_table), total=source_table.nrows, unit='metadata', disable=disable_bar): meta_dict = {'source': source} - identifier = row["ID"].decode() if isinstance(row["ID"], bytes) else int(row["ID"]) + try: + meta_id = row["metadata_id"] + except KeyError: + meta_id = None + identifier = row["ID"].decode("utf-8") if isinstance(row["ID"], bytes) else int(row["ID"]) # else: # identifier = row["name"].decode() if metatype == "families": @@ -706,8 +710,7 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, for field in row.dtype.names: if field not in ["ID", "name"]: meta_dict[field] = row[field].decode() if isinstance(row[field], bytes) else row[field] - meta = Metadata(**meta_dict) - element.add_metadata(source=source, metadata=meta) + element.add_metadata(metadata=Metadata(**meta_dict), metadata_id=meta_id) pangenome.status["metadata"][metatype] = "Loaded" diff --git a/ppanggolin/formats/writeFlatMetadata.py b/ppanggolin/formats/writeFlatMetadata.py index f4b91697..cfd58d12 100644 --- a/ppanggolin/formats/writeFlatMetadata.py +++ b/ppanggolin/formats/writeFlatMetadata.py @@ -89,7 +89,7 @@ def write_flat_metadata_files(pangenome: Pangenome, output: Path, for element in getattr(pangenome, element_type_to_attribute[element_type]): for source in set(element.sources) & set(sources): - for metadata in element.get_metadata_by_source(source): + for metadata in element.get_metadata_by_source(source).values(): metadata_dict = metadata.to_dict() if element_type == "spots": metadata_dict[element_type] = f"spot_{element.ID}" diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index a7d29b6e..cabdfbd8 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -976,11 +976,6 @@ def write_spot_modules(output: Path, compress: bool = False): """ logging.getLogger("PPanGGOLiN").info("Writing modules to spot associations...") - fam2mod = {} - for mod in pan.modules: - for fam in mod.families: - fam2mod[fam] = mod - with write_compressed_or_not(output / "modules_spots.tsv", compress) as fout: fout.write("module_id\tspot_id\n") @@ -988,14 +983,14 @@ def write_spot_modules(output: Path, compress: bool = False): curr_mods = defaultdict(set) for rgp in spot.get_uniq_content(): for fam in rgp.families: - mod = fam2mod.get(fam) - if mod is not None: - curr_mods[mod].add(fam) + if fam.module is not None: + curr_mods[fam.module].add(fam) + + for module, mod_families_in_spot in curr_mods.items(): - for mod in curr_mods: - if curr_mods[mod] == mod.families: + if mod_families_in_spot == set(module.families): # if all the families in the module are found in the spot, write the association - fout.write(f"module_{mod.ID}\tspot_{spot.ID}\n") + fout.write(f"module_{module.ID}\tspot_{spot.ID}\n") logging.getLogger("PPanGGOLiN").info( f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") @@ -1241,7 +1236,7 @@ def parser_flat(parser: argparse.ArgumentParser): optional.add_argument("--modules", required=False, action="store_true", help="Write a tsv file listing functional modules and the families that belong to them") optional.add_argument("--spot_modules", required=False, action="store_true", - help="writes 3 files comparing the presence of modules within spots") + help="writes 2 files comparing the presence of modules within spots") optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index e72cfec2..dbc95f78 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -5,7 +5,6 @@ import logging from typing import Dict, List, Tuple, Union - # installed libraries from tqdm import tqdm import numpy @@ -58,7 +57,7 @@ def write_metadata_status(pangenome: Pangenome, h5f: tables.File, status_group: if metastatus["modules"] in ["Computed", "Loaded", "inFile"]: metadata_group._v_attrs.modules = True metasources_group._v_attrs.modules = metasources["modules"] - + return True if any(metadata_group._v_attrs._f_list()) else False @@ -99,12 +98,13 @@ def get_metadata_contig_len(select_ctg: List[Contig], source: str) -> Tuple[Dict :return: Maximum type and size of each element """ - type_dict = {"ID": tables.Int64Col()} + type_dict = {"metadata_id": tables.Int64Col(), + "ID": tables.Int64Col()} max_len_dict = {} expected_rows = 0 for contig in select_ctg: - for metadata in contig.get_metadata_by_source(source): + for metadata in contig.get_metadata_by_source(source).values(): for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): if isinstance(value, bytes): value = value.decode('UTF-8') @@ -146,10 +146,13 @@ def write_metadata_contig(h5f: tables.File, source: str, select_contigs: List[Co source_table = h5f.create_table(metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1]) meta_row = source_table.row for contig in tqdm(select_contigs, unit="contigs", desc=f'Source = {source}', disable=disable_bar): - for metadata in contig.get_metadata_by_source(source): + for meta_id, metadata in contig.get_metadata_by_source(source).items(): + meta_row["metadata_id"] = meta_id for desc in source_table.colnames: if desc == "ID": meta_row[desc] = contig.ID + elif desc == "metadata_id": + meta_row[desc] = meta_id else: if hasattr(metadata, desc): value = metadata.__getattribute__(desc) @@ -172,7 +175,7 @@ def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFam :return: Maximum type and size of each element """ - type_dict = {} + type_dict = {"metadata_id": tables.Int64Col()} max_len_dict = {} expected_rows = 0 @@ -191,7 +194,7 @@ def get_metadata_len(select_elem: Union[List[Gene], List[Organism], List[GeneFam else: raise Exception("Unexpected attribute. A recent change could create this error." " Please report the error on our github.") - for metadata in element.get_metadata_by_source(source): + for metadata in element.get_metadata_by_source(source).values(): for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): if isinstance(value, bytes): value = value.decode('UTF-8') @@ -233,17 +236,18 @@ def write_metadata_metatype(h5f: tables.File, source: str, metatype: str, """ metatype_group = write_metadata_group(h5f, metatype) meta_len = get_metadata_len(select_elements, source) - # h5f.remove_node(metatype_group, source) source_table = h5f.create_table(metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1]) meta_row = source_table.row for element in tqdm(select_elements, unit=metatype, desc=f'Source = {source}', disable=disable_bar): - for metadata in element.get_metadata_by_source(source): + for meta_id, metadata in element.get_metadata_by_source(source).items(): for desc in source_table.colnames: if desc == "ID": if hasattr(element, 'name') and len(element.name) > 0: meta_row[desc] = element.name else: meta_row[desc] = element.ID + elif desc == "metadata_id": + meta_row[desc] = meta_id else: if hasattr(metadata, desc): value = metadata.__getattribute__(desc) @@ -301,55 +305,55 @@ def write_metadata(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = F """ if pangenome.status["metadata"]["families"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing gene families metadata in pangenome") - select_gf = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["families"][-1], - metatype="families")) + select_gf = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["families"][-1], + metatype="families")) write_metadata_metatype(h5f, pangenome.status["metasources"]["families"][-1], "families", select_gf, disable_bar) pangenome.status["metadata"]["families"] = "Loaded" if pangenome.status["metadata"]["genomes"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing genomes metadata in pangenome") - select_genomes = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["genomes"][-1], - metatype="genomes")) + select_genomes = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["genomes"][-1], + metatype="genomes")) write_metadata_metatype(h5f, pangenome.status["metasources"]["genomes"][-1], "genomes", select_genomes, disable_bar) pangenome.status["metadata"]["genomes"] = "Loaded" if pangenome.status["metadata"]["contigs"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing contigs metadata in pangenome") - select_contigs = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["contigs"][-1], - metatype="contigs")) + select_contigs = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["contigs"][-1], + metatype="contigs")) write_metadata_contig(h5f, pangenome.status["metasources"]["contigs"][-1], select_contigs, disable_bar) pangenome.status["metadata"]["contigs"] = "Loaded" if pangenome.status["metadata"]["genes"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing genes metadata in pangenome") - select_genes = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["genes"][-1], - metatype="genes")) + select_genes = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["genes"][-1], + metatype="genes")) write_metadata_metatype(h5f, pangenome.status["metasources"]["genes"][-1], "genes", select_genes, disable_bar) pangenome.status["metadata"]["genes"] = "Loaded" if pangenome.status["metadata"]["RGPs"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing RGPs metadata in pangenome") - select_rgps = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["RGPs"][-1], - metatype="RGPs")) + select_rgps = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["RGPs"][-1], + metatype="RGPs")) write_metadata_metatype(h5f, pangenome.status["metasources"]["RGPs"][-1], "RGPs", select_rgps, disable_bar) pangenome.status["metadata"]["RGPs"] = "Loaded" if pangenome.status["metadata"]["spots"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing spots metadata in pangenome") - select_spots = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["spots"][-1], - metatype="spots")) + select_spots = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["spots"][-1], + metatype="spots")) write_metadata_metatype(h5f, pangenome.status["metasources"]["spots"][-1], "spots", select_spots, disable_bar) pangenome.status["metadata"]["spots"] = "Loaded" if pangenome.status["metadata"]["modules"] == "Computed": logging.getLogger("PPanGGOLiN").info("Writing modules metadata in pangenome") - select_modules = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["modules"][-1], - metatype="modules")) + select_modules = list(pangenome.get_elem_by_source(source=pangenome.status["metasources"]["modules"][-1], + metatype="modules")) write_metadata_metatype(h5f, pangenome.status["metasources"]["modules"][-1], "modules", select_modules, disable_bar) pangenome.status["metadata"]["modules"] = "Loaded" diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 41cc3e0d..8b238411 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -497,7 +497,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - raise SyntaxError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") + raise ValueError(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") org_dict[elements[0]] = Path(elements[1]) if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index b856fd6f..708a3dd6 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -188,6 +188,13 @@ def remove(self, identifier): del self[identifier] + @property + def representative(self) -> Gene: + """Get the representative gene of the family + :return: The representative gene of the family + """ + return self.get(self.name) + def contains_gene_id(self, identifier): """ Check if the family contains already a gene id diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 504197fe..87e2ef46 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -591,7 +591,7 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) raise TypeError(f"Expected type int for 'begin' and 'end', " f"but received types '{type(begin)}' and '{type(end)}'.") - if begin >= end: + if begin > end: raise ValueError("The 'begin' position must be less than the 'end' position.") if end > self._genes_position[-1].position: @@ -603,7 +603,10 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False) if end == self._genes_position[-1].position: return self._genes_position[begin:] else: - return self._genes_position[begin: end] + if begin == end: + return self._genes_position[begin] + else: + return self._genes_position[begin: end] @property def number_of_genes(self) -> int: diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 87d9d82c..cacf5534 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -135,8 +135,7 @@ def check_duplicate_contig_name(): else: logging.getLogger().debug(f"{metatype}: {row[metatype]} doesn't exist") else: - meta = Metadata(source=source, **{k: v for k, v in row.to_dict().items() if k != metatype}) - element.add_metadata(source=source, metadata=meta) + element.add_metadata(Metadata(source=source, **{k: v for k, v in row.to_dict().items() if k != metatype})) pangenome.status["metadata"][metatype] = "Computed" pangenome.status["metasources"][metatype].append(source) diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index 2d6b5c73..b0021b64 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -76,7 +76,7 @@ def fields(self) -> List[str]: fields = list(self.__dict__) fields.remove("source") return fields - + def to_dict(self) -> Dict[str, Any]: """ Get metadata in dict format. @@ -103,7 +103,13 @@ class MetaFeatures: def __init__(self): """Constructor method """ - self._metadata_getter = defaultdict(list) + self._metadata_getter = defaultdict(dict) + + @property + def number_of_metadata(self) -> int: + """Get the number of metadata associated to feature + """ + return sum(len(meta_dict) for meta_dict in self._metadata_getter.values()) @property def metadata(self) -> Generator[Metadata, None, None]: @@ -112,8 +118,8 @@ def metadata(self) -> Generator[Metadata, None, None]: :return: Metadata from all sources """ - for meta_list in self._metadata_getter.values(): - for metadata in meta_list: + for meta_dict in self._metadata_getter.values(): + for metadata in meta_dict.values(): yield metadata @property @@ -147,21 +153,33 @@ def formatted_metadata_dict(self, separator: str = "|") -> Dict[str, str]: return {source_field: separator.join(values) for source_field, values in source_field_2_values.items()} - - def add_metadata(self, source, metadata): + def add_metadata(self, metadata: Metadata, metadata_id: int = None) -> None: """Add metadata to metadata getter - :param source: Name of the metadata source :param metadata: metadata value to add for the source + :param metadata_id: metadata identifier :raises AssertionError: Source or metadata is not with the correct type """ assert isinstance(metadata, Metadata), f"Metadata is not with type Metadata but with {type(metadata)}" - assert isinstance(source, str), f"Source is not a string but with {type(source)}" - - self._metadata_getter[source].append(metadata) - def get_metadata_by_source(self, source: str) -> Union[List[Metadata], None]: + # Metadata_id should not already exist because the metadata are added from scratch to a new source, + # or they are ridden + if metadata_id is None: + if self.has_source(metadata.source): + metadata_id = max([meta_id for meta_id in self._metadata_getter[metadata.source].keys()]) + 1 + else: + metadata_id = 0 + + try: + self._metadata_getter[metadata.source][metadata_id] + except KeyError: + self._metadata_getter[metadata.source][metadata_id] = metadata + else: + raise KeyError(f"A metadata with ID {metadata_id} already exist " + f"for source {metadata.source} in {str(self)}") + + def get_metadata_by_source(self, source: str) -> Union[Dict[int, Metadata], None]: """Get all the metadata feature corresponding to the source :param source: Name of the source to get @@ -200,16 +218,15 @@ def del_metadata_by_source(self, source: str): def del_metadata_by_attribute(self, **kwargs): """Remove a source from the feature - - :param source: Name of the source to delete """ - for source, metadata in self._metadata_getter.items(): + for source, metadata_dict in self._metadata_getter.items(): for attr, value in kwargs.items(): - if hasattr(metadata, attr): - # BUG If value is a list, the join block detection. - # It would be better to keep a list and change in writing and reading metadata to join the list - if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: - self._metadata_getter[source].remove(metadata) + for meta_id, metadata in metadata_dict.items(): + if hasattr(metadata, attr): + # BUG If value is a list, the join block detection. + # It would be better to keep a list and change in writing and reading metadata to join the list + if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: + del self._metadata_getter[source][meta_id] def max_metadata_by_source(self) -> Tuple[str, int]: """Get the maximum number of metadata for one source @@ -226,4 +243,13 @@ def has_metadata(self) -> bool: :return: True if it has metadata else False """ - return True if len(self._metadata_getter) > 0 else False \ No newline at end of file + return True if self.number_of_metadata > 0 else False + + def has_source(self, source: str) -> bool: + """Check if the source is in the metadata feature + + :param source: name of the source + + :return: True if the source is in the metadata feature else False + """ + return True if source in self._metadata_getter else False diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index d61873a3..37cb3b84 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -827,15 +827,15 @@ def metadata_sources(self, metatype: str) -> Set[str]: def metadata(self, metatype: str) -> Generator[Metadata, None, None]: """Create a generator with all metadatas in the pangenome - :param metatype: Select to which pangenome element metadata should be generate + :param metatype: Select to which pangenome element metadata should be generate :return: Set of metadata source """ for elem in self.select_elem(metatype): yield elem.metadata - def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ - Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: + def get_elem_by_metadata(self, metatype: str, **kwargs + ) -> Generator[Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: """Get element in pangenome with metadata attribute expected :param metatype: Select to which pangenome element metadata @@ -847,16 +847,16 @@ def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ if len(list(elem.get_metadata_by_attribute(**kwargs))) > 0: yield elem - def get_elem_by_sources(self, source: List[str], - metatype: str) -> Generator[Union[GeneFamily, Gene, Contig, Organism, - Region, Spot, Module], None, None]: - """ Get gene famlies with a specific source in pangenome + def get_elem_by_source(self, source: str, metatype: str + ) -> Generator[Union[GeneFamily, Gene, Contig, Organism, Region, Spot, Module], None, None]: + """ Get gene families with a specific source in pangenome :param source: Name of the source + :param metatype: select to which pangenome element metadata should be written :return: Gene families with the source """ assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): - if elem.get_metadata_by_source(source) is not None: + if elem.has_source(source): yield elem diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 08c0da5f..8a734201 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -536,21 +536,10 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org :return: Number of genes that do not cluster with any of the gene families of the pangenome. """ - seq_fasta_files = [] - logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.') input_genes = [gene for org in input_organisms for gene in org.genes] - # for input_organism in input_organisms: - # seq_outdir = output / input_organism.name - # mk_outdir(seq_outdir, force=True) - # - # seq_fasta_file = seq_outdir / "cds_sequences.fasta" - # - # write_gene_sequences_from_annotations(input_organism.genes, seq_fasta_file, disable_bar=True, add="ppanggolin_") - # - # seq_fasta_files.append(seq_fasta_file) seq_fasta_file = output / 'input_genes.fasta' write_gene_sequences_from_annotations(input_genes, seq_fasta_file, disable_bar=True, add='ppanggolin_') @@ -558,17 +547,16 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org with create_tmpdir(main_dir=tmpdir, basename="projection_tmp", keep_tmp=keep_tmp) as new_tmpdir: if use_representatives: _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome=pangenome, - sequence_files=seq_fasta_file, - output=new_tmpdir, tmpdir=new_tmpdir, - is_input_seq_nt=True, is_input_slf=True, - cpu=cpu, no_defrag=no_defrag, + sequence_files=seq_fasta_file, output=new_tmpdir, + tmpdir=new_tmpdir, input_type="nucleotide", + is_input_slf=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table) else: _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=seq_fasta_file, output=new_tmpdir, tmpdir=new_tmpdir, - is_input_seq_nt=True, is_input_slf=True, + input_type="nucleotide", is_input_slf=True, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table, diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 1784f0f6..f9d9e839 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -586,8 +586,8 @@ def combine_args(args: argparse.Namespace, another_args: argparse.Namespace): return args -def get_args_that_different_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, - param_to_ignore: Union[List[str], Set[str]] = None) -> dict: +def get_args_differing_from_default(default_args: argparse.Namespace, final_args: argparse.Namespace, + param_to_ignore: Union[List[str], Set[str]] = None) -> dict: """ Get the parameters that have different value than default values. @@ -665,7 +665,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # cli > config > default args = overwrite_args(default_args, config_args, cli_args) - params_that_differ = get_args_that_different_from_default(default_args, args, input_params) + params_that_differ = get_args_differing_from_default(default_args, args, input_params) if params_that_differ: params_that_differ_str = ', '.join([f'{p}={v}' for p, v in params_that_differ.items()]) @@ -706,7 +706,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ step_args = overwrite_args(default_step_args, config_step_args, cli_args) - step_params_that_differ = get_args_that_different_from_default(default_step_args, step_args) + step_params_that_differ = get_args_differing_from_default(default_step_args, step_args) if step_params_that_differ: step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) @@ -1189,6 +1189,16 @@ def get_consecutive_region_positions(region_positions: List[int], contig_gene_co def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess failed with the following error:\n"): + """Run a subprocess command and write the output to the given path. + + :param cmd: list of program arguments + :param output: path to write the subprocess output + :param msg: message to print if the subprocess fails + + :return: + + :raises subprocess.CalledProcessError: raise when the subprocess return a non-zero exit code + """ logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) try: result = subprocess.run(cmd, check=True, capture_output=True, text=True) @@ -1199,4 +1209,4 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f else: if output is not None: with open(output, 'w') as fout: - fout.write(result.stdout) \ No newline at end of file + fout.write(result.stdout) diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index da91697b..3b2bc937 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -64,9 +64,9 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, if args.clusters is not None: start_clust = time.time() - print(args.cluster) - read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar, - infer_singleton=args.cluster.infer_singletons) + read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons, + code=args.cluster.translation_table, cpu=args.cluster.cpu, tmpdir=args.tmpdir, + keep_tmp=args.cluster.keep_tmp, force=args.force, disable_bar=args.disable_prog_bar) else: # args.cluster is None if pangenome.status["geneSequences"] == "No": if args.fasta is None: @@ -80,7 +80,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, disable_bar=args.disable_prog_bar, defrag=not args.cluster.no_defrag, code=args.cluster.translation_table, coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode, - keep_tmp_files=True) + keep_tmp_files=args.cluster.keep_tmp) clust_time = time.time() - start_clust elif args.fasta is not None: diff --git a/pyproject.toml b/pyproject.toml index abdbf2d7..7864f533 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,9 +81,3 @@ packages = ["ppanggolin"] [tool.setuptools.dynamic] version = {file = "VERSION"} - -[tool.pytest.ini_options] -markers = [ - "nucleotides: marks tests to run with nucleotides alphabets", - "aminoacids: marks tests to run with amino-acids alphabets", -] \ No newline at end of file diff --git a/tests/align/test_align.py b/tests/align/test_align.py index e5ce2dde..42d1cd5f 100644 --- a/tests/align/test_align.py +++ b/tests/align/test_align.py @@ -6,106 +6,77 @@ @pytest.fixture -def nucleotides(): - yield ['A', 'C', 'G', 'T', 'N'] +def single_line_fasta_nt() -> List: + + return ['>Gene_1 seq_description\n', + 'ATGCGTTGTCGTTG\n', + ">Gene_2\n", + "TGTGACCTGCT\n" + ] @pytest.fixture -def aminoacids(): - amino_acid_alphabet = [ - 'A', # Alanine - 'R', # Arginine - 'N', # Asparagine - 'D', # Aspartic acid - 'C', # Cysteine - 'E', # Glutamic acid - 'Q', # Glutamine - 'G', # Glycine - 'H', # Histidine - 'I', # Isoleucine - 'L', # Leucine - 'K', # Lysine - 'M', # Methionine - 'F', # Phenylalanine - 'P', # Proline - 'S', # Serine - 'T', # Threonine - 'W', # Tryptophan - 'Y', # Tyrosine - 'V' # Valine - ] - yield amino_acid_alphabet +def single_line_fasta_aa() -> List: + + return ['>Gene_1 seq_description\n', + 'YWTPRPFFYAAEYNN\n', + ">Gene_2\n", + "YWTPRPSYWTPAAEYNN\n" + ] @pytest.fixture -def number_of_sequences(): - yield randint(4, 10) -@pytest.fixture -def single_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences, - aminoacids: List[str], nucleotides: List[str]): - if request.node.get_closest_marker('aminoacids'): - alphabet = aminoacids - fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta" - else: - alphabet = nucleotides - fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta" - with open(fasta_path, "w") as fasta: - for i in range(number_of_sequences): - fasta.write(f">Gene_{i}\n") - fasta.write("".join([choice(alphabet) for _ in range(0, randint(30, 100))])) - fasta.write("\n") - yield fasta_path +def multi_line_fasta_nt() -> List: + + return ['>Gene_1 seq_description\n', + 'ATGCGT\n', + 'TGTCGTTG\n', + ">Gene_2\n", + "TGTGACCTGCT\n" + ] @pytest.fixture -def multi_line_fasta(request, tmp_path_factory: pytest.TempPathFactory, number_of_sequences, - aminoacids: List[str], nucleotides: List[str]): - if request.node.get_closest_marker('aminoacids'): - alphabet = aminoacids - fasta_path = tmp_path_factory.getbasetemp() / "single_line_nt.fasta" - else: - alphabet = nucleotides - fasta_path = tmp_path_factory.getbasetemp() / "single_line_aa.fasta" - with open(fasta_path, "w") as fasta: - for i in range(number_of_sequences): - fasta.write(f">Gene_{i}\n") - for j in range(randint(4,10)): - fasta.write("".join([choice(alphabet) for _ in range(60)])) - fasta.write("\n") - yield fasta_path - -@pytest.mark.nucleotides -def test_get_seq_ids_single_line_nt(number_of_sequences, single_line_fasta): - with open(single_line_fasta, "r") as fasta: - seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) - assert len(seq_set) == number_of_sequences - assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} - assert is_nucleotide - assert single_line_fasta - -@pytest.mark.aminoacids -def test_get_seq_ids_single_line_aa(number_of_sequences, single_line_fasta): - with open(single_line_fasta, "r") as fasta: - seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) - assert len(seq_set) == number_of_sequences - assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} - assert not is_nucleotide - assert single_line_fasta - -@pytest.mark.nucleotides -def test_get_seq_ids_multi_line_nt(number_of_sequences, multi_line_fasta): - with open(multi_line_fasta, "r") as fasta: - seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) - assert len(seq_set) == number_of_sequences - assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} - assert is_nucleotide - assert not single_line_fasta - -@pytest.mark.aminoacids -def test_get_seq_ids_multi_line_aa(number_of_sequences, multi_line_fasta): - with open(multi_line_fasta, "r") as fasta: - seq_set, is_nucleotide, single_line_fasta = get_seq_ids(fasta) - assert len(seq_set) == number_of_sequences - assert seq_set == {f"Gene_{i}" for i in range(number_of_sequences)} - assert not is_nucleotide - assert not single_line_fasta +def multi_line_fasta_aa() -> List: + + return ['>Gene_1 seq_description\n', + 'AAEYNN\n', + 'YWTPRPFFY\n', + ">Gene_2\n", + "YWTPRPS\n", + "YWTPAAEYNN\n" + ] + + +def test_get_seq_ids_single_line_nt(single_line_fasta_nt): + + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_nt) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert is_nucleotide + assert single_line_fasta + +def test_get_seq_ids_single_line_aa(single_line_fasta_aa): + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(single_line_fasta_aa) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert not is_nucleotide + assert single_line_fasta + +def test_get_seq_ids_multi_line_nt(multi_line_fasta_nt): + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_nt) + + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert is_nucleotide + assert not single_line_fasta + +def test_get_seq_ids_multi_line_aa(multi_line_fasta_aa): + + seq_set, is_nucleotide, single_line_fasta = get_seq_ids(multi_line_fasta_aa) + assert len(seq_set) == 2 + assert seq_set == {'Gene_1', 'Gene_2'} + assert not is_nucleotide + assert not single_line_fasta diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 00000000..393e1386 --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + nucleotides: marks tests to run with nucleotides alphabets + aminoacids: marks tests to run with amino-acids alphabets \ No newline at end of file diff --git a/tests/test_metadata.py b/tests/test_metadata.py index d4bab2e4..95928b05 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -83,24 +83,24 @@ def metafeatures(self, metadata) -> Generator[MetaFeatures, None, None]: """ metafeatures = MetaFeatures() for meta in metadata: - metafeatures.add_metadata(meta.source, meta) + metafeatures.add_metadata(meta) yield metafeatures def test_add_metadata(self, metafeatures, metadata): """Tests that metadata can be added to the metadata getter """ - assert all(metafeatures._metadata_getter[meta.source] == [meta] for meta in metadata) + assert all(list(metafeatures._metadata_getter[meta.source].values()) == [meta] for meta in metadata) def test_get_metadata_feature_corresponding_to_source(self, metafeatures, metadata): """Tests that all the metadata features corresponding to a source can be retrieved """ - assert all(metafeatures.get_metadata_by_source(meta.source) == [meta] for meta in metadata) + assert all(list(metafeatures.get_metadata_by_source(meta.source).values()) == [meta] for meta in metadata) def test_remove_source_from_feature(self, metafeatures): """Tests that a source can be removed from the feature """ metadata = Metadata("source_del", attribute1="value") - metafeatures.add_metadata("source_del", metadata) + metafeatures.add_metadata(metadata) metafeatures.del_metadata_by_source("source_del") assert metafeatures.get_metadata_by_source("source_del") is None @@ -114,7 +114,7 @@ def test_get_metadata_by_attribute_values(self, metafeatures): """ meta = Metadata("source_test", attribute1="value_to_retrieve") # meta_list = Metadata("source_list", attribute1=["val_1", "val_2"]) - metafeatures.add_metadata(meta.source, meta) + metafeatures.add_metadata(meta) # metafeatures[meta_list.source] = meta_list assert list(metafeatures.get_metadata_by_attribute(attribute1="value_to_retrieve")) == [meta] # assert list(metafeatures.get_metadata(attribute1="val_1")) == [meta_list] @@ -124,26 +124,12 @@ def test_get_maximum_number_of_metadata_for_one_source(self, metafeatures, metad """ metadata1 = Metadata("source_max", attribute1="value1") metadata2 = Metadata("source_max", attribute2="value2") - metafeatures.add_metadata("source_max", metadata1) - metafeatures.add_metadata("source_max", metadata2) + metafeatures.add_metadata(metadata1) + metafeatures.add_metadata(metadata2) assert metafeatures.max_metadata_by_source() == ("source_max", 2) def test_metadata_is_not_with_type_metadata(self, metafeatures): """Tests that an AssertionError is raised when metadata is not with type Metadata """ with pytest.raises(AssertionError): - metafeatures.add_metadata("source1", "not_metadata") - - def test_source_is_not_a_string(self, metafeatures): - """Tests that an AssertionError is raised when the source is not a string - """ - - metadata = Metadata("source1", attribute1="value1") - with pytest.raises(AssertionError): - metafeatures.add_metadata(1, metadata) - - def test_source_or_metadata_is_not_with_correct_type(self, metafeatures, metadata): - """Tests that an AssertionError is raised when the source or metadata is not with the correct type - """ - with pytest.raises(AssertionError): - metafeatures.add_metadata(1, "not_metadata") + metafeatures.add_metadata("not_metadata") diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index d67eb1ce..90483079 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -804,25 +804,25 @@ def add_element_to_pangenome(self, pangenome): """ metadata = Metadata(source="source", attribute="attr") family = GeneFamily(family_id=pangenome.max_fam_id, name="Fam") - family.add_metadata(source=metadata.source, metadata=metadata) + family.add_metadata(metadata=metadata) pangenome.add_gene_family(family) org = Organism("Org") - org.add_metadata(source=metadata.source, metadata=metadata) + org.add_metadata(metadata=metadata) ctg = Contig(0, "Ctg") org.add(ctg) gene = Gene("Gene") - gene.position, gene.start = (0, 0) - gene.add_metadata(source=metadata.source, metadata=metadata) - ctg[gene.start] = gene + gene.fill_annotations(start=0, stop=100, position=0, strand='+') + gene.add_metadata(metadata=metadata) + ctg.add(gene) pangenome.add_organism(org) rgp = Region("RGP") - rgp.add_metadata(source=metadata.source, metadata=metadata) + rgp.add_metadata(metadata=metadata) pangenome.add_region(rgp) spot = Spot(0) - spot.add_metadata(source=metadata.source, metadata=metadata) + spot.add_metadata(metadata=metadata) pangenome.add_spot(spot) module = Module(0) - module.add_metadata(source=metadata.source, metadata=metadata) + module.add_metadata(metadata=metadata) pangenome.add_module(module) def test_select_elem(self, add_element_to_pangenome, pangenome): @@ -876,7 +876,7 @@ def test_get_elem_by_metadata(self, add_element_to_pangenome, pangenome): assert isinstance(metadata, Metadata) assert metadata.source == 'source' - def test_get_elem_by_sources(self, add_element_to_pangenome, pangenome): + def test_get_elem_by_source(self, add_element_to_pangenome, pangenome): """Tests the metadata generator filtered by source of the Pangenome class. :param add_element_to_pangenome: Add elements to the pangenome @@ -884,7 +884,7 @@ def test_get_elem_by_sources(self, add_element_to_pangenome, pangenome): """ for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, "spots": Spot, "modules": Module}.items(): - for elem in pangenome.get_elem_by_sources(source='source', metatype=metatype): + for elem in pangenome.get_elem_by_source(source='source', metatype=metatype): assert isinstance(elem, expected_type) for metadata in elem.metadata: assert isinstance(metadata, Metadata)