Commit

Refactor hmmscan hit loops
jhahnfeld committed Jun 26, 2023
1 parent a693f43 commit 0a1af01
Showing 2 changed files with 52 additions and 83 deletions.
79 changes: 29 additions & 50 deletions bakta/features/cds.py
@@ -174,60 +174,39 @@ def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]:
         'UTF-8')).digitize(alphabet) for cds in cdss
     ]

-    hits: list[dict] = []
+    pfam_hits = []
+    cds_pfams_hits = []
+    orf_by_aa_digest = orf.get_orf_dictionary(cdss)
     with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('pfam')) as hmm:
         for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads):
             for hit in top_hits:
-                hits.append(
-                    {
-                        'query': hit.name.decode(),
-                        'subject_name': hit.best_domain.alignment.hmm_name.decode(),
-                        'subject_id': hit.best_domain.alignment.hmm_accession.decode(),
-                        'bitscore': hit.score,
-                        'evalue': hit.evalue,
-                        'domain_length': len(hit.best_domain.alignment.hmm_sequence),
-                        'domain_start': hit.best_domain.alignment.hmm_from,
-                        'domain_stop': hit.best_domain.alignment.hmm_to,
-                        'aa_start': hit.best_domain.alignment.target_from,
-                        'aa_stop': hit.best_domain.alignment.target_to
-                    }
-                )
+                cds = orf_by_aa_digest[hit.name.decode()]

-    pfam_hits = []
-    cds_pfams_hits = []
-    orf_by_aa_digest = orf.get_orf_dictionary(cdss)
-    for hit in hits:
-        cds = orf_by_aa_digest[hit['query']]
-
-        domain_length = hit['domain_length']
-        domain_start = hit['domain_start']
-        domain_stop = hit['domain_stop']
-        domain_cov = (domain_stop - domain_start + 1) / domain_length
-        aa_start = hit['aa_start']
-        aa_stop = hit['aa_stop']
-        aa_cov = (aa_stop - aa_start + 1) / len(cds['aa'])
-
-        pfam = OrderedDict()
-        pfam['name'] = hit['subject_name']
-        pfam['id'] = hit['subject_id']
-        pfam['length'] = domain_length
-        pfam['aa_cov'] = aa_cov
-        pfam['hmm_cov'] = domain_cov
-        pfam['evalue'] = hit['evalue']
-        pfam['score'] = hit['bitscore']
-        pfam['start'] = aa_start
-        pfam['stop'] = aa_stop
-
-        cds.setdefault('pfams', [])
-        cds['pfams'].append(pfam)
-        cds.setdefault('db_xrefs', [])
-        cds['db_xrefs'].append(f"PFAM:{pfam['id']}")
-        pfam_hits.append(cds)
-        cds_pfams_hits.append(cds)
-        log.info(
-            'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s',
-            cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name']
-        )
+                domain_cov = (hit.best_domain.alignment.hmm_to - hit.best_domain.alignment.hmm_from + 1) / len(hit.best_domain.alignment.hmm_sequence)
+                aa_cov = (hit.best_domain.alignment.target_to - hit.best_domain.alignment.target_from + 1) / len(cds['aa'])
+
+                pfam = OrderedDict()
+                pfam['name'] = hit.best_domain.alignment.hmm_name.decode()
+                pfam['id'] = hit.best_domain.alignment.hmm_accession.decode()
+                pfam['length'] = len(hit.best_domain.alignment.hmm_sequence)
+                pfam['aa_cov'] = aa_cov
+                pfam['hmm_cov'] = domain_cov
+                pfam['evalue'] = hit.evalue
+                pfam['score'] = hit.score
+                pfam['start'] = hit.best_domain.alignment.target_from
+                pfam['stop'] = hit.best_domain.alignment.target_to
+
+                cds.setdefault('pfams', [])
+                cds['pfams'].append(pfam)
+                cds.setdefault('db_xrefs', [])
+                cds['db_xrefs'].append(f"PFAM:{pfam['id']}")
+                pfam_hits.append(cds)
+                cds_pfams_hits.append(cds)
+                log.info(
+                    'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'],
+                    pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name']
+                )
     log.info('predicted-pfams=%i, CDS-w/-pfams=%i', len(pfam_hits), len(cds_pfams_hits))
     return cds_pfams_hits
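
The change in both files follows the same pattern: instead of buffering every pyhmmer hit as a plain dict and resolving it against the ORF dictionary in a second pass, each hit is resolved and annotated while the hmmsearch loop is still running. A minimal, self-contained sketch of that single-pass pattern, independent of bakta (the search_and_annotate name, the annotate callback, the orf_by_digest mapping and the threads default are illustrative assumptions, not bakta API):

from pathlib import Path
from typing import Callable, Iterable

import pyhmmer


def search_and_annotate(
    hmm_path: Path,
    proteins: Iterable[pyhmmer.easel.DigitalSequence],
    orf_by_digest: dict[str, dict],
    annotate: Callable[[dict, pyhmmer.plan7.Hit], None],
    threads: int = 1,
) -> list[dict]:
    """Run hmmsearch and annotate ORFs directly from each hit, without an intermediate hit list."""
    annotated = []
    with pyhmmer.plan7.HMMFile(hmm_path) as hmm_file:
        # pyhmmer.hmmsearch yields one TopHits object per HMM in the file
        for top_hits in pyhmmer.hmmsearch(hmm_file, proteins, bit_cutoffs='gathering', cpus=threads):
            for hit in top_hits:
                orf = orf_by_digest[hit.name.decode()]  # hit names are bytes
                annotate(orf, hit)                      # per-hit work happens in place
                annotated.append(orf)
    return annotated

Only the per-hit annotation differs between the two files: Pfam records in cds.py, spurious-ORF discards in orf.py.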

56 changes: 23 additions & 33 deletions bakta/features/orf.py
@@ -23,42 +23,32 @@ def detect_spurious(orfs: Sequence[dict]):
         'UTF-8')).digitize(alphabet) for orf in orfs
     ]

-    hits: list[dict] = []
+    discarded_orfs = []
+    orf_by_aa_digest = get_orf_dictionary(orfs)
     with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('antifam')) as hmm:
         for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads):
             for hit in top_hits:
-                hits.append(
-                    {
-                        'query': hit.name.decode(),
-                        'subject_name': hit.best_domain.alignment.hmm_name.decode(),
-                        'subject_id': hit.best_domain.alignment.hmm_accession.decode(),
-                        'bitscore': hit.score,
-                        'evalue': hit.evalue,
-                    }
-                )
-
-    discarded_orfs = []
-    orf_by_aa_digest = get_orf_dictionary(orfs)
-    for hit in hits:
-        orf = orf_by_aa_digest[hit['query']]
-        if hit['evalue'] > 1E-5:
-            log.debug(
-                'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f',
-                orf['contig'], orf['start'], orf['stop'], orf['strand'], hit['subject_name'], hit['evalue'], hit['bitscore']
-            )
-        else:
-            discard = OrderedDict()
-            discard['type'] = bc.DISCARD_TYPE_SPURIOUS
-            discard['description'] = f'(partial) homology to spurious sequence HMM (AntiFam:{hit["subject_id"]})'
-            discard['score'] = hit['bitscore']
-            discard['evalue'] = hit['evalue']
-
-            orf['discarded'] = discard
-            discarded_orfs.append(orf)
-            log.info(
-                'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f',
-                orf['contig'], orf['start'], orf['stop'], orf['strand'], hit['subject_name'], hit['evalue'], hit['bitscore']
-            )
+                orf = orf_by_aa_digest[hit.name.decode()]
+                if hit.evalue > 1E-5:
+                    log.debug(
+                        'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f',
+                        orf['contig'], orf['start'], orf['stop'], orf['strand'],
+                        hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
+                    )
+                else:
+                    discard = OrderedDict()
+                    discard['type'] = bc.DISCARD_TYPE_SPURIOUS
+                    discard['description'] = f'(partial) homology to spurious sequence HMM ' \
+                                             f'(AntiFam:{hit.best_domain.alignment.hmm_accession.decode()})'
+                    discard['score'] = hit.score
+                    discard['evalue'] = hit.evalue
+
+                    orf['discarded'] = discard
+                    discarded_orfs.append(orf)
+                    log.info(
+                        'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f',
+                        orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
+                    )
     log.info('discarded=%i', len(discarded_orfs))
     return discarded_orfs
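
For the AntiFam check the per-hit work reduces to an E-value gate: hits with an E-value above 1E-5 are only logged at debug level, anything at or below it marks the ORF as spurious and records why. A sketch of that decision as a standalone helper (the function name and module-level constants are illustrative; DISCARD_TYPE_SPURIOUS stands in for bc.DISCARD_TYPE_SPURIOUS from the diff):

from collections import OrderedDict

import pyhmmer

DISCARD_TYPE_SPURIOUS = 'spurious'  # placeholder for bc.DISCARD_TYPE_SPURIOUS
EVALUE_CUTOFF = 1E-5


def mark_spurious(orf: dict, hit: pyhmmer.plan7.Hit) -> bool:
    """Attach a discard record to the ORF if the AntiFam hit is significant; return True if discarded."""
    if hit.evalue > EVALUE_CUTOFF:
        return False  # weak hit: keep the ORF, the caller may log it
    discard = OrderedDict()
    discard['type'] = DISCARD_TYPE_SPURIOUS
    accession = hit.best_domain.alignment.hmm_accession.decode()
    discard['description'] = f'(partial) homology to spurious sequence HMM (AntiFam:{accession})'
    discard['score'] = hit.score
    discard['evalue'] = hit.evalue
    orf['discarded'] = discard
    return True

Plugged into the single-pass search sketched after the first file, this reproduces the new control flow of detect_spurious without the intermediate hits list.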

