Merge branch 'dev' into issue195

labgem · Jun 11, 2024 · 21887c7 · 21887c7
2 parents a1784fe + 6a2be8b
commit 21887c7
Show file tree

Hide file tree

Showing 18 changed files with 222 additions and 258 deletions.
diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
@@ -27,7 +27,7 @@
 
 def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]],
                       tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
-                      is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False,
+                      query_type: str = "unknow", is_query_slf: bool = False, target_type: str = "unknow",
                       is_target_slf: bool = False, translation_table: int = None) -> Path:
     """
     Align fasta sequence to pangenome sequences. 
@@ -39,34 +39,36 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi
     :param no_defrag: Do not apply defragmentation
     :param identity: minimal identity threshold for the alignment
     :param coverage: minimal identity threshold for the alignment
-    :param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs
+    :param query_type: Sequences type of the file (query). [nucleotide, protein, unknow]
     :param is_query_slf: Is the sequence file (query) with single line fasta. If True, MMSeqs2 database will be with soft link
-    :param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs
+    :param target_type: Sequences type of pangenome (target). [nucleotide, aminoacid, protein]
     :param is_target_slf: Is the sequences of pangenome (target) with single line fasta. If True, MMSeqs2 database will be with soft link
     :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated.
 
     :return: Alignment result file
     """
 
-    if is_target_nt:
+    if target_type == "nucleotide":
         logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with "
                                               f"translation table {translation_table}")
         with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir:
             #  Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same
             target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table)
     else:
+        db_type = 1 if target_type == "protein" else 0
         target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file,
-                                     'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1)
+                                     'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=db_type)
 
-    if is_query_nt:
+    if query_type == "nucleotide":
         logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs "
                                               f"with translation table {translation_table}")
         with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir:
             #  Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same
             query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table)
     else:
+        db_type = 1 if query_type == "protein" else 0
         query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files,
-                                    'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1)
+                                    'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=db_type)
 
     cov_mode = "2"  # coverage of query
     if no_defrag:
@@ -390,7 +392,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
 
 
 def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path,
-                                     tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1,
+                                     tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1,
                                      no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
                                      translation_table: int = 11, disable_bar: bool = False
                                      ) -> Tuple[Path, Dict[str, GeneFamily]]:
@@ -404,7 +406,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union
     :param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
     :param output: Path to the output directory where alignment results will be stored.
     :param tmpdir: Temporary directory for intermediate files.
-    :param is_input_seq_nt: Is input sequence file nucleotide sequences.
+    :param input_type: Type of input sequence file. [nucleotide, aminoacid, unknow]
     :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link
     :param cpu: Number of CPU cores to use for the alignment (default: 1).
     :param no_defrag: If True, the defragmentation workflow is skipped (default: False).
@@ -421,18 +423,18 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union
     logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}')
     write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar)
 
-    align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files,
-                                   tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
-                                   is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf,
-                                   is_target_nt=False, is_target_slf=True, translation_table=translation_table)
+    align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
+                                   cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
+                                   query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True,
+                                   target_type="protein", translation_table=translation_table)
 
     seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome)
 
     return align_file, seq2pang
 
 
 def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files:  Union[Path, Iterable[Path]], output: Path,
-                                     tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1,
+                                     tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1,
                                      no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
                                      translation_table: int = 11, disable_bar: bool = False
                                      ) -> Tuple[Path, Dict[str, GeneFamily]]:
@@ -446,7 +448,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files:  Unio
     :param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
     :param output: Path to the output directory where alignment results will be stored.
     :param tmpdir: Temporary directory for intermediate files.
-    :param is_input_seq_nt: Is input sequence file nucleotide sequences.
+    :param input_type: Sequences type of the file (query). [nucleotide, protein, unknow]
     :param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link
     :param cpu: Number of CPU cores to use for the alignment (default: 1).
     :param no_defrag: If True, the defragmentation workflow is skipped (default: False).
@@ -464,8 +466,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files:  Unio
 
     align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
                                    cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
-                                   is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True,
-                                   is_target_slf=True, translation_table=translation_table)
+                                   query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True,
+                                   target_type="nucleotide", translation_table=translation_table)
 
     seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome)
 
@@ -516,19 +518,19 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo
         seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj)
 
     with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
-
+        input_type = "nucleotide" if is_nucleotide else "unknow"
         if use_representatives:
             align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
-                                                                    is_input_seq_nt=is_nucleotide,
-                                                                    is_input_slf=single_line_fasta,
-                                                                    cpu=cpu, no_defrag=no_defrag, identity=identity,
+                                                                    input_type=input_type,
+                                                                    is_input_slf=single_line_fasta, cpu=cpu,
+                                                                    no_defrag=no_defrag, identity=identity,
                                                                     coverage=coverage,
                                                                     translation_table=translation_table,
                                                                     disable_bar=disable_bar)
         else:
             align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
                                                                     output=output, tmpdir=new_tmpdir,
-                                                                    is_input_seq_nt=is_nucleotide,
+                                                                    input_type=input_type,
                                                                     is_input_slf=single_line_fasta,
                                                                     cpu=cpu, no_defrag=no_defrag, identity=identity,
                                                                     coverage=coverage,

diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
@@ -531,13 +531,10 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
                 rna_counter += 1
 
     genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_to_metadata)
-
-    genome_meta_obj = Metadata(source='annotation_file', **genome_metadata)
-    organism.add_metadata(source="annotation_file", metadata=genome_meta_obj)
+    organism.add_metadata(metadata=Metadata(source='annotation_file', **genome_metadata))
 
     for contig, metadata_dict in contig_to_uniq_metadata.items():
-        contigs_meta_obj = Metadata(source='annotation_file', **metadata_dict)
-        contig.add_metadata(source="annotation_file", metadata=contigs_meta_obj)
+        contig.add_metadata(Metadata(source='annotation_file', **metadata_dict))
 
     if used_transl_table_arg:
         logging.getLogger("PPanGGOLiN").info(
@@ -815,8 +812,7 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, str], org:
     if len(contig_name_to_region_info) == org.number_of_contigs:
         genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_name_to_region_info)
         if genome_metadata:
-            genome_meta_obj = Metadata(source='annotation_file', **genome_metadata)
-            org.add_metadata(source="annotation_file", metadata=genome_meta_obj)
+            org.add_metadata(Metadata(source='annotation_file', **genome_metadata))
     else:
         logging.getLogger("PPanGGOLiN").warning(
             f"Inconsistent data in GFF file {gff_file_path}: "
@@ -827,11 +823,9 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, str], org:
         try:
             contig = org.get(contig_name)
         except KeyError:
-            raise ValueError(
-                f"Contig '{contig_name}' does not exist in the genome object created from GFF file {gff_file_path}.")
+            raise ValueError(f"Contig '{contig_name}' does not exist in the genome object created from GFF file {gff_file_path}.")
 
-        contigs_meta_obj = Metadata(source='annotation_file', **metadata_dict)
-        contig.add_metadata(source="annotation_file", metadata=contigs_meta_obj)
+        contig.add_metadata(Metadata(source='annotation_file', **metadata_dict))
 
 
 def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separation: int = 10):

diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
@@ -490,7 +490,6 @@ def parser_clust(parser: argparse.ArgumentParser):
                             "with their original gene family.")
     clust.add_argument("--translation_table", required=False, default="11",
                        help="Translation table (genetic code) to use.")
-    # clust.add_argument("--compress")
 
     clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
 

diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py
@@ -74,18 +74,17 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir:
         logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences")
 
         with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
-
+            input_type = "nucleotide" if is_nucleotide else "unknow"
             if use_representatives:
                 _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
-                                                                is_input_seq_nt=is_nucleotide, is_input_slf=is_slf,
-                                                                cpu=cpu, no_defrag=no_defrag,
-                                                                identity=identity, coverage=coverage,
-                                                                translation_table=translation_table,
+                                                                input_type=input_type, is_input_slf=is_slf, cpu=cpu,
+                                                                no_defrag=no_defrag, identity=identity,
+                                                                coverage=coverage, translation_table=translation_table,
                                                                 disable_bar=disable_bar)
             else:
                 _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
                                                                 output=output, tmpdir=new_tmpdir,
-                                                                is_input_seq_nt=is_nucleotide, is_input_slf=is_slf,
+                                                                input_type=input_type, is_input_slf=is_slf,
                                                                 cpu=cpu, no_defrag=no_defrag,
                                                                 identity=identity, coverage=coverage,
                                                                 translation_table=translation_table,

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
@@ -682,7 +682,11 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str,
         source_table = metadata_group._f_get_child(source)
         for row in tqdm(read_chunks(source_table), total=source_table.nrows, unit='metadata', disable=disable_bar):
             meta_dict = {'source': source}
-            identifier = row["ID"].decode() if isinstance(row["ID"], bytes) else int(row["ID"])
+            try:
+                meta_id = row["metadata_id"]
+            except KeyError:
+                meta_id = None
+            identifier = row["ID"].decode("utf-8") if isinstance(row["ID"], bytes) else int(row["ID"])
             # else:
             #     identifier = row["name"].decode()
             if metatype == "families":
@@ -706,8 +710,7 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str,
             for field in row.dtype.names:
                 if field not in ["ID", "name"]:
                     meta_dict[field] = row[field].decode() if isinstance(row[field], bytes) else row[field]
-            meta = Metadata(**meta_dict)
-            element.add_metadata(source=source, metadata=meta)
+            element.add_metadata(metadata=Metadata(**meta_dict), metadata_id=meta_id)
     pangenome.status["metadata"][metatype] = "Loaded"
 
 

diff --git a/ppanggolin/formats/writeFlatMetadata.py b/ppanggolin/formats/writeFlatMetadata.py
@@ -89,7 +89,7 @@ def write_flat_metadata_files(pangenome: Pangenome, output: Path,
 
         for element in getattr(pangenome, element_type_to_attribute[element_type]):
             for source in set(element.sources) & set(sources):
-                for metadata in element.get_metadata_by_source(source):
+                for metadata in element.get_metadata_by_source(source).values():
                     metadata_dict = metadata.to_dict()
                     if element_type == "spots":
                         metadata_dict[element_type] = f"spot_{element.ID}"

diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py
@@ -976,26 +976,21 @@ def write_spot_modules(output: Path, compress: bool = False):
     """
     logging.getLogger("PPanGGOLiN").info("Writing modules to spot associations...")
 
-    fam2mod = {}
-    for mod in pan.modules:
-        for fam in mod.families:
-            fam2mod[fam] = mod
-
     with write_compressed_or_not(output / "modules_spots.tsv", compress) as fout:
         fout.write("module_id\tspot_id\n")
 
         for spot in pan.spots:
             curr_mods = defaultdict(set)
             for rgp in spot.get_uniq_content():
                 for fam in rgp.families:
-                    mod = fam2mod.get(fam)
-                    if mod is not None:
-                        curr_mods[mod].add(fam)
+                    if fam.module is not None:
+                        curr_mods[fam.module].add(fam)
+
+            for module, mod_families_in_spot in curr_mods.items():
 
-            for mod in curr_mods:
-                if curr_mods[mod] == mod.families:
+                if mod_families_in_spot == set(module.families):
                     # if all the families in the module are found in the spot, write the association
-                    fout.write(f"module_{mod.ID}\tspot_{spot.ID}\n")
+                    fout.write(f"module_{module.ID}\tspot_{spot.ID}\n")
 
     logging.getLogger("PPanGGOLiN").info(
         f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}")
@@ -1241,7 +1236,7 @@ def parser_flat(parser: argparse.ArgumentParser):
     optional.add_argument("--modules", required=False, action="store_true",
                           help="Write a tsv file listing functional modules and the families that belong to them")
     optional.add_argument("--spot_modules", required=False, action="store_true",
-                          help="writes 3 files comparing the presence of modules within spots")
+                          help="writes 2 files comparing the presence of modules within spots")
 
     optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz")
     optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")