Skip to content

Commit

Permalink
Merge pull request #249 from labgem/GffOverlap
Browse files Browse the repository at this point in the history
Fix overlap and frameshift gene in GFF file
  • Loading branch information
JeanMainguy authored Jul 10, 2024
2 parents dfdbf9a + b6230e8 commit e10b28a
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 40 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,16 @@ jobs:
# projection of a plasmid with chevron that have been added manually to test chevron handeling in GFF
ppanggolin projection --pangenome myannopang/pangenome.h5 --anno GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz --cpu $NUM_CPUS -o projection_plasmid_with_chevron
# projection with GFF with no sequence and fasta sequence
ppanggolin projection -p myannopang/pangenome.h5 --anno GBFF/plasmid_GCF_000093005.1_ASM9300v1.gff.gz --fasta GBFF/plasmid_GCF_000093005.1_ASM9300v1.fna.gz
# projection with GFF with no sequence and fasta sequence specified in a TSV file with other GFF (but with sequences)
head -n 3 genomes.gbff.head.list > genomes.gbff.h3_and_GFFplasmidNoSeq.list
echo GFF_plasmid_No_seq$'\t'GBFF/plasmid_GCF_000093005.1_ASM9300v1.gff.gz >> genomes.gbff.h3_and_GFFplasmidNoSeq.list
echo GFF_plasmid_No_seq$'\t'GBFF/plasmid_GCF_000093005.1_ASM9300v1.fna.gz >> genomes.fna.GFFplasmidNoSeq.list
ppanggolin projection -p myannopang/pangenome.h5 --anno genomes.gbff.h3_and_GFFplasmidNoSeq.list --fasta genomes.fna.GFFplasmidNoSeq.list
- name: testing write_genome_cmds
shell: bash -l {0}
Expand Down
16 changes: 13 additions & 3 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -915,10 +915,20 @@ def correct_putative_overlaps(contigs: Iterable[Contig]):

new_coordinates = []
for start, stop in gene.coordinates:
if start > len(contig):
raise ValueError(
f"A gene start position ({start}) is higher than contig length ({len(contig)}). This case is not handled.")

if start > len(contig):
if len(new_coordinates) == 0:
raise ValueError(f"First gene start position ({start}) is higher than contig "
f"length ({len(contig)}). This case is not handled.")

new_start = start - len(contig)
new_stop = stop - len(contig)

new_coordinates.append((new_start, new_stop))

warn_msg = (f"Start position ({start}) for gene {gene.name} is higher than contig {contig.name}"
f" length ({len(contig)}). New coordinate are {new_coordinates}")
logging.getLogger("PPanGGOLiN").warning(warn_msg)
elif stop > len(contig):
# Handle overlapping gene
new_stop = len(contig)
Expand Down
120 changes: 83 additions & 37 deletions ppanggolin/projection/projection.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,63 +112,109 @@ def check_pangenome_for_projection(pangenome: Pangenome, fast_aln: bool):

return predict_rgp, project_spots, project_modules

def manage_input_genomes_annotation(
pangenome, input_mode: str, anno: str, fasta: str, organism_name: str, circular_contigs: list,
pangenome_params, cpu: int, use_pseudo: bool, disable_bar: bool, tmpdir: str, config: dict):
"""
Manage the input genomes annotation based on the provided mode and parameters.
def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, organism_name, circular_contigs,
pangenome_params, cpu, use_pseudo, disable_bar, tmpdir, config):
:param pangenome: The pangenome object.
:param input_mode: The input mode, either 'multiple' or 'single'.
:param anno: The annotation file path or None.
:param fasta: The FASTA file path or None.
:param organism_name: The name of the organism.
:param circular_contigs: List of circular contigs.
:param pangenome_params: Parameters for pangenome processing.
:param cpu: Number of CPUs to use.
:param use_pseudo: Flag to use pseudo annotation.
:param disable_bar: Flag to disable progress bar.
:param tmpdir: Temporary directory path.
:param config: Configuration dictionary.
:return: A tuple of organisms, genome_name_to_path, and input_type.
"""

genome_name_to_path = None
input_type = None

# Determine input type and parse paths based on the input mode
if input_mode == "multiple":
if anno:
input_type = "annotation"
genome_name_to_path = parse_input_paths_file(anno)

elif fasta:
input_type = "fasta"
genome_name_to_path = parse_input_paths_file(fasta)

else: # args.input_mode == "single:

elif input_mode == "single":
circular_contigs = circular_contigs if circular_contigs else []
if anno:
input_type = "annotation"
genome_name_to_path = {organism_name: {"path": anno,
"circular_contigs": circular_contigs}}

genome_name_to_path = {
organism_name: {
"path": anno,
"circular_contigs": circular_contigs
}
}
elif fasta:
input_type = "fasta"
genome_name_to_path = {organism_name: {"path": fasta,
"circular_contigs": circular_contigs}}
genome_name_to_path = {
organism_name: {
"path": fasta,
"circular_contigs": circular_contigs
}
}
else:
raise ValueError(f"Input mode '{input_mode}' is not valid. Expected 'multiple' or 'single'.")

# Process annotation input type
if input_type == "annotation":
check_input_names(pangenome, genome_name_to_path)

organisms, org_2_has_fasta = read_annotation_files(genome_name_to_path, cpu=cpu, pseudo=use_pseudo,
translation_table=int(pangenome_params.cluster.translation_table),
disable_bar=disable_bar)

if not all((has_fasta for has_fasta in org_2_has_fasta.values())):
organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta}
if fasta:
get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_path)
organisms, org_2_has_fasta = read_annotation_files(
genome_name_to_path, cpu=cpu, pseudo=use_pseudo,
translation_table=int(pangenome_params.cluster.translation_table),
disable_bar=disable_bar
)

# Check for genomes without associated sequence data
if not all(has_fasta for has_fasta in org_2_has_fasta.values()):
organisms_with_no_fasta = {
org for org, has_fasta in org_2_has_fasta.items() if not has_fasta
}
if fasta:
if input_mode == "multiple":
genome_name_to_fasta_path = parse_input_paths_file(fasta)
else:
genome_name_to_fasta_path = {
organism_name: {
"path": fasta,
"circular_contigs": circular_contigs
}
}
get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_fasta_path)
else:
raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) "
"genomes without associated sequence data, and you did not provide "
"FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. "
f"The following genomes have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}")

raise ValueError(
f"GFF files provided for {len(organisms_with_no_fasta)} (out of {len(organisms)}) genomes without "
"associated sequence data, and no FASTA sequences provided using the --fasta option. Cannot project "
"the pangenome onto these genomes. Genomes without sequence data: "
f"{', '.join(o.name for o in organisms_with_no_fasta)}"
)

# Process fasta input type
elif input_type == "fasta":
annotate_param_names = ["norna", "kingdom",
"allow_overlap", "prodigal_procedure"]

annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"]
annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, config)

check_input_names(pangenome, genome_name_to_path)
organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu,
translation_table=int(pangenome_params.cluster.translation_table),
norna=annotate_params.norna, kingdom=annotate_params.kingdom,
allow_overlap=annotate_params.allow_overlap,
procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar)
organisms = annotate_fasta_files(
genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu,
translation_table=int(pangenome_params.cluster.translation_table),
norna=annotate_params.norna, kingdom=annotate_params.kingdom,
allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure,
disable_bar=disable_bar
)
else:
raise ValueError(f"Input type '{input_type}' is not valid. Expected 'fasta' or 'annotation'.")

return organisms, genome_name_to_path, input_type


Expand Down Expand Up @@ -461,7 +507,7 @@ def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path):
org_fasta_file = genome_name_to_annot_path[org.name]['path']

with read_compressed_or_not(org_fasta_file) as currFastaFile:
org_contig_to_seq, _ = read_fasta(org, currFastaFile)
org_contig_to_seq = read_fasta(org, currFastaFile)

for contig in org.contigs:
try:
Expand Down Expand Up @@ -711,21 +757,21 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file):
"""

with read_compressed_or_not(fasta_file) as currFastaFile:
contig_id2deq, _ = read_fasta(input_organism, currFastaFile)
contig_id2seq = read_fasta(input_organism, currFastaFile)

for contig in input_organism.contigs:
try:
for gene in contig.genes:
gene.add_dna(get_dna_sequence(
contig_id2deq[contig.name], gene))
contig_id2seq[contig.name], gene))

for rna in contig.RNAs:
rna.add_dna(get_dna_sequence(contig_id2deq[contig.name], rna))
rna.add_dna(get_dna_sequence(contig_id2seq[contig.name], rna))
except KeyError:
msg = f"Fasta file for input genome {input_organism.name} did not have the contig {contig.name} " \
f"that was read from the annotation file. "
msg += f"The provided contigs in the fasta were : " \
f"{', '.join([contig for contig in contig_id2deq.keys()])}."
f"{', '.join([contig for contig in contig_id2seq.keys()])}."
raise KeyError(msg)


Expand Down
Binary file not shown.
Binary file not shown.

0 comments on commit e10b28a

Please sign in to comment.