Skip to content

Commit

Permalink
Merge branch 'dev' into issue195
Browse files Browse the repository at this point in the history
  • Loading branch information
jpjarnoux authored Jun 11, 2024
2 parents a1784fe + 6a2be8b commit 21887c7
Show file tree
Hide file tree
Showing 18 changed files with 222 additions and 258 deletions.
46 changes: 24 additions & 22 deletions ppanggolin/align/alignOnPang.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_files: Union[Path, Iterable[Path]],
tmpdir: Path, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
is_query_nt: bool = False, is_query_slf: bool = False, is_target_nt: bool = False,
query_type: str = "unknow", is_query_slf: bool = False, target_type: str = "unknow",
is_target_slf: bool = False, translation_table: int = None) -> Path:
"""
Align fasta sequence to pangenome sequences.
Expand All @@ -39,34 +39,36 @@ def align_seq_to_pang(target_seq_file: Union[Path, Iterable[Path]], query_seq_fi
:param no_defrag: Do not apply defragmentation
:param identity: minimal identity threshold for the alignment
:param coverage: minimal identity threshold for the alignment
:param is_query_nt: Is the sequence file (query) are nucleotide sequences. If True, sequences are translated by mmseqs
:param query_type: Sequences type of the file (query). [nucleotide, protein, unknow]
:param is_query_slf: Is the sequence file (query) with single line fasta. If True, MMSeqs2 database will be with soft link
:param is_target_nt: Is the sequences of pangenome (target) are nucleotide sequences. If True, sequences are translated by mmseqs
:param target_type: Sequences type of pangenome (target). [nucleotide, aminoacid, protein]
:param is_target_slf: Is the sequences of pangenome (target) with single line fasta. If True, MMSeqs2 database will be with soft link
:param translation_table: Translation table to use, if sequences are nucleotide and need to be translated.
:return: Alignment result file
"""

if is_target_nt:
if target_type == "nucleotide":
logging.getLogger("PPanGGOLiN").debug("Target sequences will be translated by mmseqs with "
f"translation table {translation_table}")
with create_tmpdir(tmpdir, basename="target_db", keep_tmp=True) as target_db_dir:
# Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same
target_db = translate_genes(target_seq_file, target_db_dir, cpu, is_target_slf, translation_table)
else:
db_type = 1 if target_type == "protein" else 0
target_db = create_mmseqs_db([target_seq_file] if isinstance(target_seq_file, Path) else target_seq_file,
'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=1)
'target_db', tmpdir, db_mode=1 if is_target_slf else 0, db_type=db_type)

if is_query_nt:
if query_type == "nucleotide":
logging.getLogger("PPanGGOLiN").debug("Query sequences will be translated by mmseqs "
f"with translation table {translation_table}")
with create_tmpdir(tmpdir, basename="query_db", keep_tmp=True) as query_db_dir:
# Keep is set as true because whether tmpdir is deleted or not target_db_dir will be the same
query_db = translate_genes(query_seq_files, query_db_dir, cpu, is_query_slf, translation_table)
else:
db_type = 1 if query_type == "protein" else 0
query_db = create_mmseqs_db([query_seq_files] if isinstance(query_seq_files, Path) else query_seq_files,
'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=1)
'query_db', tmpdir, db_mode=1 if is_query_slf else 0, db_type=db_type)

cov_mode = "2" # coverage of query
if no_defrag:
Expand Down Expand Up @@ -390,7 +392,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel


def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path,
tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1,
tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1,
no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
translation_table: int = 11, disable_bar: bool = False
) -> Tuple[Path, Dict[str, GeneFamily]]:
Expand All @@ -404,7 +406,7 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union
:param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
:param output: Path to the output directory where alignment results will be stored.
:param tmpdir: Temporary directory for intermediate files.
:param is_input_seq_nt: Is input sequence file nucleotide sequences.
:param input_type: Type of input sequence file. [nucleotide, aminoacid, unknow]
:param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link
:param cpu: Number of CPU cores to use for the alignment (default: 1).
:param no_defrag: If True, the defragmentation workflow is skipped (default: False).
Expand All @@ -421,18 +423,18 @@ def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Union
logging.getLogger("PPanGGOLiN").debug(f'Write gene family sequences in {pangenome_sequences.absolute()}')
write_gene_fam_sequences(pangenome, pangenome_sequences, add="ppanggolin_", disable_bar=disable_bar)

align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files,
tmpdir=tmpdir, cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf,
is_target_nt=False, is_target_slf=True, translation_table=translation_table)
align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True,
target_type="protein", translation_table=translation_table)

seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome)

return align_file, seq2pang


def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Union[Path, Iterable[Path]], output: Path,
tmpdir: Path, is_input_seq_nt: bool, is_input_slf: bool = False, cpu: int = 1,
tmpdir: Path, input_type: str = "unknow", is_input_slf: bool = False, cpu: int = 1,
no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8,
translation_table: int = 11, disable_bar: bool = False
) -> Tuple[Path, Dict[str, GeneFamily]]:
Expand All @@ -446,7 +448,7 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Unio
:param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
:param output: Path to the output directory where alignment results will be stored.
:param tmpdir: Temporary directory for intermediate files.
:param is_input_seq_nt: Is input sequence file nucleotide sequences.
:param input_type: Sequences type of the file (query). [nucleotide, protein, unknow]
:param is_input_slf: Is the sequence file with single line fasta. If True, MMSeqs2 database will be with soft link
:param cpu: Number of CPU cores to use for the alignment (default: 1).
:param no_defrag: If True, the defragmentation workflow is skipped (default: False).
Expand All @@ -464,8 +466,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Unio

align_file = align_seq_to_pang(target_seq_file=pangenome_sequences, query_seq_files=sequence_files, tmpdir=tmpdir,
cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
is_query_nt=is_input_seq_nt, is_query_slf=is_input_slf, is_target_nt=True,
is_target_slf=True, translation_table=translation_table)
query_type=input_type, is_query_slf=is_input_slf, is_target_slf=True,
target_type="nucleotide", translation_table=translation_table)

seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome)

Expand Down Expand Up @@ -516,19 +518,19 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo
seq_set, is_nucleotide, single_line_fasta = get_seq_ids(seqFileObj)

with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:

input_type = "nucleotide" if is_nucleotide else "unknow"
if use_representatives:
align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
is_input_seq_nt=is_nucleotide,
is_input_slf=single_line_fasta,
cpu=cpu, no_defrag=no_defrag, identity=identity,
input_type=input_type,
is_input_slf=single_line_fasta, cpu=cpu,
no_defrag=no_defrag, identity=identity,
coverage=coverage,
translation_table=translation_table,
disable_bar=disable_bar)
else:
align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
output=output, tmpdir=new_tmpdir,
is_input_seq_nt=is_nucleotide,
input_type=input_type,
is_input_slf=single_line_fasta,
cpu=cpu, no_defrag=no_defrag, identity=identity,
coverage=coverage,
Expand Down
16 changes: 5 additions & 11 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,13 +531,10 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
rna_counter += 1

genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_to_metadata)

genome_meta_obj = Metadata(source='annotation_file', **genome_metadata)
organism.add_metadata(source="annotation_file", metadata=genome_meta_obj)
organism.add_metadata(metadata=Metadata(source='annotation_file', **genome_metadata))

for contig, metadata_dict in contig_to_uniq_metadata.items():
contigs_meta_obj = Metadata(source='annotation_file', **metadata_dict)
contig.add_metadata(source="annotation_file", metadata=contigs_meta_obj)
contig.add_metadata(Metadata(source='annotation_file', **metadata_dict))

if used_transl_table_arg:
logging.getLogger("PPanGGOLiN").info(
Expand Down Expand Up @@ -815,8 +812,7 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, str], org:
if len(contig_name_to_region_info) == org.number_of_contigs:
genome_metadata, contig_to_uniq_metadata = combine_contigs_metadata(contig_name_to_region_info)
if genome_metadata:
genome_meta_obj = Metadata(source='annotation_file', **genome_metadata)
org.add_metadata(source="annotation_file", metadata=genome_meta_obj)
org.add_metadata(Metadata(source='annotation_file', **genome_metadata))
else:
logging.getLogger("PPanGGOLiN").warning(
f"Inconsistent data in GFF file {gff_file_path}: "
Expand All @@ -827,11 +823,9 @@ def add_metadata_from_gff_file(contig_name_to_region_info: Dict[str, str], org:
try:
contig = org.get(contig_name)
except KeyError:
raise ValueError(
f"Contig '{contig_name}' does not exist in the genome object created from GFF file {gff_file_path}.")
raise ValueError(f"Contig '{contig_name}' does not exist in the genome object created from GFF file {gff_file_path}.")

contigs_meta_obj = Metadata(source='annotation_file', **metadata_dict)
contig.add_metadata(source="annotation_file", metadata=contigs_meta_obj)
contig.add_metadata(Metadata(source='annotation_file', **metadata_dict))


def check_and_add_extra_gene_part(gene: Gene, new_gene_info: Dict, max_separation: int = 10):
Expand Down
1 change: 0 additions & 1 deletion ppanggolin/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,6 @@ def parser_clust(parser: argparse.ArgumentParser):
"with their original gene family.")
clust.add_argument("--translation_table", required=False, default="11",
help="Translation table (genetic code) to use.")
# clust.add_argument("--compress")

clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")

Expand Down
11 changes: 5 additions & 6 deletions ppanggolin/context/searchGeneContext.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,18 +74,17 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir:
logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences")

with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:

input_type = "nucleotide" if is_nucleotide else "unknow"
if use_representatives:
_, seqid2fam = get_input_seq_to_family_with_rep(pangenome, sequence_file, output, new_tmpdir,
is_input_seq_nt=is_nucleotide, is_input_slf=is_slf,
cpu=cpu, no_defrag=no_defrag,
identity=identity, coverage=coverage,
translation_table=translation_table,
input_type=input_type, is_input_slf=is_slf, cpu=cpu,
no_defrag=no_defrag, identity=identity,
coverage=coverage, translation_table=translation_table,
disable_bar=disable_bar)
else:
_, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=sequence_file,
output=output, tmpdir=new_tmpdir,
is_input_seq_nt=is_nucleotide, is_input_slf=is_slf,
input_type=input_type, is_input_slf=is_slf,
cpu=cpu, no_defrag=no_defrag,
identity=identity, coverage=coverage,
translation_table=translation_table,
Expand Down
9 changes: 6 additions & 3 deletions ppanggolin/formats/readBinaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,11 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str,
source_table = metadata_group._f_get_child(source)
for row in tqdm(read_chunks(source_table), total=source_table.nrows, unit='metadata', disable=disable_bar):
meta_dict = {'source': source}
identifier = row["ID"].decode() if isinstance(row["ID"], bytes) else int(row["ID"])
try:
meta_id = row["metadata_id"]
except KeyError:
meta_id = None
identifier = row["ID"].decode("utf-8") if isinstance(row["ID"], bytes) else int(row["ID"])
# else:
# identifier = row["name"].decode()
if metatype == "families":
Expand All @@ -706,8 +710,7 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str,
for field in row.dtype.names:
if field not in ["ID", "name"]:
meta_dict[field] = row[field].decode() if isinstance(row[field], bytes) else row[field]
meta = Metadata(**meta_dict)
element.add_metadata(source=source, metadata=meta)
element.add_metadata(metadata=Metadata(**meta_dict), metadata_id=meta_id)
pangenome.status["metadata"][metatype] = "Loaded"


Expand Down
2 changes: 1 addition & 1 deletion ppanggolin/formats/writeFlatMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def write_flat_metadata_files(pangenome: Pangenome, output: Path,

for element in getattr(pangenome, element_type_to_attribute[element_type]):
for source in set(element.sources) & set(sources):
for metadata in element.get_metadata_by_source(source):
for metadata in element.get_metadata_by_source(source).values():
metadata_dict = metadata.to_dict()
if element_type == "spots":
metadata_dict[element_type] = f"spot_{element.ID}"
Expand Down
19 changes: 7 additions & 12 deletions ppanggolin/formats/writeFlatPangenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -976,26 +976,21 @@ def write_spot_modules(output: Path, compress: bool = False):
"""
logging.getLogger("PPanGGOLiN").info("Writing modules to spot associations...")

fam2mod = {}
for mod in pan.modules:
for fam in mod.families:
fam2mod[fam] = mod

with write_compressed_or_not(output / "modules_spots.tsv", compress) as fout:
fout.write("module_id\tspot_id\n")

for spot in pan.spots:
curr_mods = defaultdict(set)
for rgp in spot.get_uniq_content():
for fam in rgp.families:
mod = fam2mod.get(fam)
if mod is not None:
curr_mods[mod].add(fam)
if fam.module is not None:
curr_mods[fam.module].add(fam)

for module, mod_families_in_spot in curr_mods.items():

for mod in curr_mods:
if curr_mods[mod] == mod.families:
if mod_families_in_spot == set(module.families):
# if all the families in the module are found in the spot, write the association
fout.write(f"module_{mod.ID}\tspot_{spot.ID}\n")
fout.write(f"module_{module.ID}\tspot_{spot.ID}\n")

logging.getLogger("PPanGGOLiN").info(
f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}")
Expand Down Expand Up @@ -1241,7 +1236,7 @@ def parser_flat(parser: argparse.ArgumentParser):
optional.add_argument("--modules", required=False, action="store_true",
help="Write a tsv file listing functional modules and the families that belong to them")
optional.add_argument("--spot_modules", required=False, action="store_true",
help="writes 3 files comparing the presence of modules within spots")
help="writes 2 files comparing the presence of modules within spots")

optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz")
optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
Expand Down
Loading

0 comments on commit 21887c7

Please sign in to comment.