Skip to content

Commit

Permalink
Merge pull request #9 from apriltuesday/EVA-3248
Browse files Browse the repository at this point in the history
Map to Ensembl gene IDs and CHEBI IDs
  • Loading branch information
apriltuesday authored May 11, 2023
2 parents 7318d54 + 8c87691 commit 05783d6
Show file tree
Hide file tree
Showing 10 changed files with 314 additions and 36 deletions.
2 changes: 2 additions & 0 deletions bin/generate_evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
parser = argparse.ArgumentParser('Generates Open Targets evidence strings from PharmGKB data')
parser.add_argument('--clinical-annot-path', help='Path to clinical_annotations.tsv', required=True)
parser.add_argument('--clinical-alleles-path', help='Path to clinical_ann_alleles.tsv', required=True)
parser.add_argument('--drugs-path', help='Path to drugs.tsv', required=True)
parser.add_argument('--created-date', help='Created date of downloaded files (provided by PharmGKB)', required=True)
parser.add_argument('--output-path', help='Path to output evidence strings', required=True)

Expand All @@ -15,6 +16,7 @@
evidence_generation.pipeline(
clinical_annot_path=args.clinical_annot_path,
clinical_alleles_path=args.clinical_alleles_path,
drugs_path=args.drugs_path,
created_date=args.created_date,
output_path=args.output_path
)
83 changes: 75 additions & 8 deletions opentargets_pharmgkb/evidence_generation.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,105 @@
import json
import multiprocessing

import pandas as pd
from cmat.consequence_prediction.common.biomart import query_biomart

from opentargets_pharmgkb.ols import get_chebi_iri
from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column
from opentargets_pharmgkb.variant_coordinates import get_coordinates_for_clinical_annotation

pgkb_column_names = [

column_names = [
'Clinical Annotation ID',
'Variant/Haplotypes',
'Gene',
'split_gene',
'ensembl_gene_id',
'Drug(s)',
'split_drug',
'chebi',
'Level of Evidence',
'Phenotype Category',
'Drug(s)',
'Phenotype(s)',
'Genotype/Allele',
'all_genotypes',
'Annotation Text'
]


def pipeline(clinical_annot_path, clinical_alleles_path, created_date, output_path):
def pipeline(clinical_annot_path, clinical_alleles_path, drugs_path, created_date, output_path):
clinical_annot_table = pd.read_csv(clinical_annot_path, sep='\t')
clinical_alleles_table = pd.read_csv(clinical_alleles_path, sep='\t')
drugs_table = pd.read_csv(drugs_path, sep='\t')

merged_table = pd.merge(clinical_annot_table, clinical_alleles_table, on='Clinical Annotation ID', how='left')
rs_only_table = merged_table[merged_table['Variant/Haplotypes'].str.contains('rs')][pgkb_column_names]

# Restrict to variants with rsIDs
rs_only_table = merged_table[merged_table['Variant/Haplotypes'].str.contains('rs')]
# Also provide a column with all genotypes for a given rs
rs_only_table = pd.merge(rs_only_table, rs_only_table.groupby(by='Clinical Annotation ID').aggregate(
all_genotypes=('Genotype/Allele', list)), on='Clinical Annotation ID')

mapped_genes = explode_and_map_genes(rs_only_table)
mapped_drugs = explode_and_map_drugs(mapped_genes, drugs_table)
evidence_table = mapped_drugs[column_names]

# Generate evidence
evidence = [generate_clinical_annotation_evidence(created_date, row) for _, row in rs_only_table.iterrows()]
evidence = [generate_clinical_annotation_evidence(created_date, row) for _, row in evidence_table.iterrows()]
with open(output_path, 'w+') as output:
output.write('\n'.join(json.dumps(ev) for ev in evidence))


def explode_and_map_genes(df):
    """
    Annotate a dataframe with Ensembl gene IDs resolved via BioMart.

    Rows listing multiple gene symbols are exploded into one row per symbol.
    :param df: dataframe to annotate (must have a 'Gene' column)
    :return: dataframe with an 'ensembl_gene_id' column added
    """
    exploded = explode_column(df, 'Gene', 'split_gene')
    unique_symbols = exploded['split_gene'].drop_duplicates().tolist()
    symbol_to_ensembl = query_biomart(
        ('hgnc_symbol', 'split_gene'),
        ('ensembl_gene_id', 'ensembl_gene_id'),
        unique_symbols
    )
    annotated = pd.merge(exploded, symbol_to_ensembl, on='split_gene')
    # A single HGNC symbol can resolve to several Ensembl gene IDs, so explode once more.
    return annotated.explode('ensembl_gene_id').reset_index(drop=True)


def explode_and_map_drugs(df, drugs_table):
    """
    Maps drug names to CHEBI IRIs using OLS, falling back on primary drugs data from PharmGKB if needed.
    Explodes multiple drugs in a single row.
    :param df: dataframe to annotate (should have a 'Drug(s)' column)
    :param drugs_table: drugs dataframe (should have 'Name' and 'Cross-references' columns)
    :return: dataframe with 'chebi' column added
    """
    split_drugs = explode_column(df, 'Drug(s)', 'split_drug')
    unique_drugs = split_drugs['split_drug'].drop_duplicates().tolist()
    # Query OLS in parallel as there are no batch queries currently.
    # NB. apply_async is required for actual parallelism: Pool.apply blocks until each
    # call completes, which would serialise every query.
    with multiprocessing.Pool(processes=24) as pool:
        pending = {s: pool.apply_async(get_chebi_iri, args=(s,)) for s in unique_drugs}
        str_to_iri = {s: result.get() for s, result in pending.items()}
    mapped_drugs = pd.concat(
        split_drugs[split_drugs['split_drug'] == s].assign(chebi=none_to_nan(iri))
        for s, iri in str_to_iri.items()
    )
    # Some drugs we can't unambiguously map using OLS, so we rely on primary data provided by PharmGKB.
    # Using OLS first ensures we get up-to-date IDs wherever possible.
    # Extract IDs into a copy rather than mutating the caller's drugs_table.
    chebi_ids = drugs_table['Cross-references'].str.extract(r'CHEBI:(?P<chebi_id>\d+)', expand=False)
    mapped_drugs = pd.merge(mapped_drugs, drugs_table.assign(chebi_id=chebi_ids),
                            left_on='split_drug', right_on='Name', how='left')
    # Only fall back to a PharmGKB-derived IRI when a CHEBI cross-reference actually exists;
    # otherwise fillna would insert a bogus '...CHEBI_nan' IRI for unmapped drugs.
    fallback_iris = mapped_drugs['chebi_id'].apply(
        lambda i: chebi_id_to_iri(i) if pd.notna(i) else float('nan'))
    mapped_drugs['chebi'] = mapped_drugs['chebi'].fillna(fallback_iris)
    return mapped_drugs


def chebi_id_to_iri(id_):
    """
    Convert a numeric CHEBI identifier into its OBO PURL IRI.

    :param id_: CHEBI numeric identifier (string or number); may be None/NaN for rows
                with no cross-reference
    :return: IRI string, or None when no identifier is given — avoids fabricating a
             bogus 'CHEBI_nan' IRI from missing values
    """
    if pd.isna(id_):
        return None
    return f'http://purl.obolibrary.org/obo/CHEBI_{id_}'


def generate_clinical_annotation_evidence(created_date, row):
"""Generates an evidence string for a PharmGKB clinical annotation."""
vcf_full_coords = get_coordinates_for_clinical_annotation(row['Variant/Haplotypes'], row['all_genotypes'])
Expand All @@ -50,7 +116,7 @@ def generate_clinical_annotation_evidence(created_date, row):
# VARIANT ATTRIBUTES
'variantId': vcf_full_coords,
'variantRsId': row['Variant/Haplotypes'],
'targetFromSourceId': row['Gene'], # TODO needs to be exploded & mapped
'targetFromSourceId': row['ensembl_gene_id'],
# TODO need to use consequence prediction from clinvar repo
'variantFunctionalConsequenceId': None,
'variantOverlappingGeneId': None,
Expand All @@ -60,7 +126,8 @@ def generate_clinical_annotation_evidence(created_date, row):
'genotypeAnnotationText': row['Annotation Text'],

# PHENOTYPE ATTRIBUTES
'drugId': row['Drug(s)'], # TODO CHEBI - needs to be exploded & mapped
'drugText': row['split_drug'],
'drugId': row['chebi'],
'pgxCategory': row['Phenotype Category'],
'phenotypeFromSourceId': row['Phenotype(s)'] # TODO EFO - needs to be exploded & mapped
}
Expand Down
34 changes: 34 additions & 0 deletions opentargets_pharmgkb/ols.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import logging
import os
from functools import lru_cache

import requests
from requests import RequestException
from retry import retry

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

OLS_API_ROOT = 'https://www.ebi.ac.uk/ols/api'


@lru_cache
@retry(exceptions=(ConnectionError, RequestException), tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def get_chebi_iri(drug_name):
    """
    Search OLS for a CHEBI term whose label exactly matches the given drug name.

    :param drug_name: drug name to look up (case-insensitive exact match on label)
    :return: CHEBI IRI if exactly one unambiguous match is found, otherwise None
    """
    # Build the URL with requests' params handling rather than os.path.join: path join is
    # not for URLs (uses backslashes on Windows), and params= correctly URL-encodes drug
    # names containing spaces or special characters.
    chebi_search_url = f'{OLS_API_ROOT}/search'
    response = requests.get(
        chebi_search_url,
        params={'ontology': 'chebi', 'q': drug_name},
        timeout=30,  # don't hang worker processes indefinitely on a stalled connection
    )
    response.raise_for_status()
    data = response.json()
    if 'response' in data:
        results = data['response']['docs']
        candidates = set()
        for result in results:
            # Check that we've found the drug exactly (strict case-insensitive string match)
            if result['label'].lower() == drug_name.lower():
                candidates.add(result['iri'])
        # Only return a result if we can find it unambiguously
        if len(candidates) == 1:
            return candidates.pop()
    logger.warning(f'Could not find a CHEBI IRI for {drug_name}')
    return None
21 changes: 21 additions & 0 deletions opentargets_pharmgkb/pandas_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import numpy as np


def none_to_nan(x):
    """Return NaN in place of None, leaving every other value untouched."""
    if x is None:
        return np.nan
    return x


def explode_column(df, source_col, target_col, sep=';'):
    """
    Splits a string-valued column in dataframe and explodes on the values, storing them in the specified target
    column. Any white space around the separator will be stripped. Missing (NaN) values in source_col are
    propagated unchanged instead of raising.
    :param df: Pandas dataframe
    :param source_col: name of column in df to split
    :param target_col: destination column name for exploded values
    :param sep: string separator to split source_col by (default ';')
    :return: dataframe with target_col added
    """
    split_cols = df.assign(**{target_col: df[source_col].str.split(sep)}).explode(target_col).reset_index(drop=True)
    # .str.strip() propagates NaN, unlike map(str.strip) which raises AttributeError on missing values.
    split_cols[target_col] = split_cols[target_col].str.strip()
    return split_cols
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
numpy==1.24.3
pandas==1.5.3
pytest==7.2.2
requests==2.28.2
retry==0.9.2
retry==0.9.2
cmat @ git+https://github.com/apriltuesday/eva-opentargets.git@refactor#egg=cmat
29 changes: 29 additions & 0 deletions tests/resources/drugs.tsv

Large diffs are not rendered by default.

Loading

0 comments on commit 05783d6

Please sign in to comment.