Skip to content

Commit

Permalink
Merge pull request #9 from apriltuesday/EVA-3248
Browse files Browse the repository at this point in the history
Map to Ensembl gene IDs and CHEBI IDs
  • Loading branch information
apriltuesday authored May 11, 2023
2 parents 7318d54 + 8c87691 commit 05783d6
Show file tree
Hide file tree
Showing 10 changed files with 314 additions and 36 deletions.
2 changes: 2 additions & 0 deletions bin/generate_evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
parser = argparse.ArgumentParser('Generates Open Targets evidence strings from PharmGKB data')
parser.add_argument('--clinical-annot-path', help='Path to clinical_annotations.tsv', required=True)
parser.add_argument('--clinical-alleles-path', help='Path to clinical_ann_alleles.tsv', required=True)
parser.add_argument('--drugs-path', help='Path to drugs.tsv', required=True)
parser.add_argument('--created-date', help='Created date of downloaded files (provided by PharmGKB)', required=True)
parser.add_argument('--output-path', help='Path to output evidence strings', required=True)

Expand All @@ -15,6 +16,7 @@
evidence_generation.pipeline(
clinical_annot_path=args.clinical_annot_path,
clinical_alleles_path=args.clinical_alleles_path,
drugs_path=args.drugs_path,
created_date=args.created_date,
output_path=args.output_path
)
83 changes: 75 additions & 8 deletions opentargets_pharmgkb/evidence_generation.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,105 @@
import json
import multiprocessing

import pandas as pd
from cmat.consequence_prediction.common.biomart import query_biomart

from opentargets_pharmgkb.ols import get_chebi_iri
from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column
from opentargets_pharmgkb.variant_coordinates import get_coordinates_for_clinical_annotation

pgkb_column_names = [

column_names = [
'Clinical Annotation ID',
'Variant/Haplotypes',
'Gene',
'split_gene',
'ensembl_gene_id',
'Drug(s)',
'split_drug',
'chebi',
'Level of Evidence',
'Phenotype Category',
'Drug(s)',
'Phenotype(s)',
'Genotype/Allele',
'all_genotypes',
'Annotation Text'
]


def pipeline(clinical_annot_path, clinical_alleles_path, created_date, output_path):
def pipeline(clinical_annot_path, clinical_alleles_path, drugs_path, created_date, output_path):
clinical_annot_table = pd.read_csv(clinical_annot_path, sep='\t')
clinical_alleles_table = pd.read_csv(clinical_alleles_path, sep='\t')
drugs_table = pd.read_csv(drugs_path, sep='\t')

merged_table = pd.merge(clinical_annot_table, clinical_alleles_table, on='Clinical Annotation ID', how='left')
rs_only_table = merged_table[merged_table['Variant/Haplotypes'].str.contains('rs')][pgkb_column_names]

# Restrict to variants with rsIDs
rs_only_table = merged_table[merged_table['Variant/Haplotypes'].str.contains('rs')]
# Also provide a column with all genotypes for a given rs
rs_only_table = pd.merge(rs_only_table, rs_only_table.groupby(by='Clinical Annotation ID').aggregate(
all_genotypes=('Genotype/Allele', list)), on='Clinical Annotation ID')

mapped_genes = explode_and_map_genes(rs_only_table)
mapped_drugs = explode_and_map_drugs(mapped_genes, drugs_table)
evidence_table = mapped_drugs[column_names]

# Generate evidence
evidence = [generate_clinical_annotation_evidence(created_date, row) for _, row in rs_only_table.iterrows()]
evidence = [generate_clinical_annotation_evidence(created_date, row) for _, row in evidence_table.iterrows()]
with open(output_path, 'w+') as output:
output.write('\n'.join(json.dumps(ev) for ev in evidence))


def explode_and_map_genes(df):
    """
    Annotate a dataframe with Ensembl gene IDs resolved via BioMart.

    Rows listing multiple gene symbols are exploded into one row per symbol.
    :param df: dataframe to annotate (must have a 'Gene' column)
    :return: dataframe with an 'ensembl_gene_id' column added
    """
    exploded = explode_column(df, 'Gene', 'split_gene')
    unique_symbols = exploded['split_gene'].drop_duplicates().tolist()
    symbol_to_ensembl = query_biomart(
        ('hgnc_symbol', 'split_gene'),
        ('ensembl_gene_id', 'ensembl_gene_id'),
        unique_symbols
    )
    annotated = pd.merge(exploded, symbol_to_ensembl, on='split_gene')
    # A single HGNC symbol can resolve to several Ensembl gene IDs, so explode once more.
    return annotated.explode('ensembl_gene_id').reset_index(drop=True)


def explode_and_map_drugs(df, drugs_table):
    """
    Maps drug names to CHEBI IRIs using OLS, falling back on primary drugs data from PharmGKB if needed.
    Explodes multiple drugs in a single row.
    :param df: dataframe to annotate (should have a 'Drug(s)' column)
    :param drugs_table: drugs dataframe (should have 'Name' and 'Cross-references' columns)
    :return: dataframe with 'chebi' column added
    """
    split_drugs = explode_column(df, 'Drug(s)', 'split_drug')
    unique_drugs = split_drugs['split_drug'].drop_duplicates().tolist()
    # Query OLS in parallel as there are no batch queries currently.
    # NB. apply_async is required for actual parallelism: Pool.apply blocks until each
    # call completes, which would serialise every query.
    with multiprocessing.Pool(processes=24) as pool:
        pending = {s: pool.apply_async(get_chebi_iri, args=(s,)) for s in unique_drugs}
        str_to_iri = {s: result.get() for s, result in pending.items()}
    mapped_drugs = pd.concat(
        split_drugs[split_drugs['split_drug'] == s].assign(chebi=none_to_nan(iri))
        for s, iri in str_to_iri.items()
    )
    # Some drugs we can't unambiguously map using OLS, so we rely on primary data provided by PharmGKB.
    # Using OLS first ensures we get up-to-date IDs wherever possible.
    # Extract IDs into a copy rather than mutating the caller's drugs_table.
    chebi_ids = drugs_table['Cross-references'].str.extract(r'CHEBI:(?P<chebi_id>\d+)', expand=False)
    mapped_drugs = pd.merge(mapped_drugs, drugs_table.assign(chebi_id=chebi_ids),
                            left_on='split_drug', right_on='Name', how='left')
    # Only fall back to a PharmGKB-derived IRI when a CHEBI cross-reference actually exists;
    # otherwise fillna would insert a bogus '...CHEBI_nan' IRI for unmapped drugs.
    fallback_iris = mapped_drugs['chebi_id'].apply(
        lambda i: chebi_id_to_iri(i) if pd.notna(i) else float('nan'))
    mapped_drugs['chebi'] = mapped_drugs['chebi'].fillna(fallback_iris)
    return mapped_drugs


def chebi_id_to_iri(id_):
    """
    Convert a numeric CHEBI identifier into its OBO PURL IRI.

    :param id_: CHEBI numeric identifier (string or number); may be None/NaN for rows
                with no cross-reference
    :return: IRI string, or None when no identifier is given — avoids fabricating a
             bogus 'CHEBI_nan' IRI from missing values
    """
    if pd.isna(id_):
        return None
    return f'http://purl.obolibrary.org/obo/CHEBI_{id_}'


def generate_clinical_annotation_evidence(created_date, row):
"""Generates an evidence string for a PharmGKB clinical annotation."""
vcf_full_coords = get_coordinates_for_clinical_annotation(row['Variant/Haplotypes'], row['all_genotypes'])
Expand All @@ -50,7 +116,7 @@ def generate_clinical_annotation_evidence(created_date, row):
# VARIANT ATTRIBUTES
'variantId': vcf_full_coords,
'variantRsId': row['Variant/Haplotypes'],
'targetFromSourceId': row['Gene'], # TODO needs to be exploded & mapped
'targetFromSourceId': row['ensembl_gene_id'],
# TODO need to use consequence prediction from clinvar repo
'variantFunctionalConsequenceId': None,
'variantOverlappingGeneId': None,
Expand All @@ -60,7 +126,8 @@ def generate_clinical_annotation_evidence(created_date, row):
'genotypeAnnotationText': row['Annotation Text'],

# PHENOTYPE ATTRIBUTES
'drugId': row['Drug(s)'], # TODO CHEBI - needs to be exploded & mapped
'drugText': row['split_drug'],
'drugId': row['chebi'],
'pgxCategory': row['Phenotype Category'],
'phenotypeFromSourceId': row['Phenotype(s)'] # TODO EFO - needs to be exploded & mapped
}
Expand Down
34 changes: 34 additions & 0 deletions opentargets_pharmgkb/ols.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import logging
import os
from functools import lru_cache

import requests
from requests import RequestException
from retry import retry

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

OLS_API_ROOT = 'https://www.ebi.ac.uk/ols/api'


@lru_cache
@retry(exceptions=(ConnectionError, RequestException), tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def get_chebi_iri(drug_name):
    """
    Search OLS for a CHEBI term whose label exactly matches the given drug name.

    :param drug_name: drug name to look up (case-insensitive exact match on label)
    :return: CHEBI IRI if exactly one unambiguous match is found, otherwise None
    """
    # Build the URL with requests' params handling rather than os.path.join: path join is
    # not for URLs (uses backslashes on Windows), and params= correctly URL-encodes drug
    # names containing spaces or special characters.
    chebi_search_url = f'{OLS_API_ROOT}/search'
    response = requests.get(
        chebi_search_url,
        params={'ontology': 'chebi', 'q': drug_name},
        timeout=30,  # don't hang worker processes indefinitely on a stalled connection
    )
    response.raise_for_status()
    data = response.json()
    if 'response' in data:
        results = data['response']['docs']
        candidates = set()
        for result in results:
            # Check that we've found the drug exactly (strict case-insensitive string match)
            if result['label'].lower() == drug_name.lower():
                candidates.add(result['iri'])
        # Only return a result if we can find it unambiguously
        if len(candidates) == 1:
            return candidates.pop()
    logger.warning(f'Could not find a CHEBI IRI for {drug_name}')
    return None
21 changes: 21 additions & 0 deletions opentargets_pharmgkb/pandas_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import numpy as np


def none_to_nan(x):
    """Return NaN in place of None, leaving every other value untouched."""
    if x is None:
        return np.nan
    return x


def explode_column(df, source_col, target_col, sep=';'):
    """
    Splits a string-valued column in dataframe and explodes on the values, storing them in the specified target
    column. Any white space around the separator will be stripped. Missing (NaN) values in source_col are
    propagated unchanged instead of raising.
    :param df: Pandas dataframe
    :param source_col: name of column in df to split
    :param target_col: destination column name for exploded values
    :param sep: string separator to split source_col by (default ';')
    :return: dataframe with target_col added
    """
    split_cols = df.assign(**{target_col: df[source_col].str.split(sep)}).explode(target_col).reset_index(drop=True)
    # .str.strip() propagates NaN, unlike map(str.strip) which raises AttributeError on missing values.
    split_cols[target_col] = split_cols[target_col].str.strip()
    return split_cols
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
numpy==1.24.3
pandas==1.5.3
pytest==7.2.2
requests==2.28.2
retry==0.9.2
retry==0.9.2
cmat @ git+https://github.com/apriltuesday/eva-opentargets.git@refactor#egg=cmat
29 changes: 29 additions & 0 deletions tests/resources/drugs.tsv

Large diffs are not rendered by default.

Loading

0 comments on commit 05783d6

Please sign in to comment.