Merge pull request #11 from apriltuesday/EVA-3269
EVA-3269: Adding publications, phenotype mappings, functional consequences
apriltuesday authored Jun 15, 2023
2 parents 05783d6 + c25ba35 commit 3ae0a92
Showing 10 changed files with 405 additions and 201 deletions.
2 changes: 2 additions & 0 deletions bin/generate_evidence.py
@@ -6,6 +6,7 @@
parser = argparse.ArgumentParser('Generates Open Targets evidence strings from PharmGKB data')
parser.add_argument('--clinical-annot-path', help='Path to clinical_annotations.tsv', required=True)
parser.add_argument('--clinical-alleles-path', help='Path to clinical_ann_alleles.tsv', required=True)
parser.add_argument('--clinical-evidence-path', help='Path to clinical_ann_evidence.tsv', required=True)
parser.add_argument('--drugs-path', help='Path to drugs.tsv', required=True)
parser.add_argument('--created-date', help='Created date of downloaded files (provided by PharmGKB)', required=True)
parser.add_argument('--output-path', help='Path to output evidence strings', required=True)
@@ -16,6 +17,7 @@
evidence_generation.pipeline(
clinical_annot_path=args.clinical_annot_path,
clinical_alleles_path=args.clinical_alleles_path,
clinical_evidence_path=args.clinical_evidence_path,
drugs_path=args.drugs_path,
created_date=args.created_date,
output_path=args.output_path
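For context, a hypothetical equivalent of the call above when driving the pipeline directly from Python; the file names follow the argparse help text and the created-date value is purely illustrative:

from opentargets_pharmgkb import evidence_generation

evidence_generation.pipeline(
    clinical_annot_path='clinical_annotations.tsv',
    clinical_alleles_path='clinical_ann_alleles.tsv',
    clinical_evidence_path='clinical_ann_evidence.tsv',
    drugs_path='drugs.tsv',
    created_date='2023-06-05',  # illustrative; PharmGKB supplies the real value with the download
    output_path='evidence.json'
)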
160 changes: 119 additions & 41 deletions opentargets_pharmgkb/evidence_generation.py
@@ -1,54 +1,110 @@
import json
import multiprocessing
from itertools import zip_longest

import pandas as pd
from cmat.consequence_prediction.common.biomart import query_biomart
from cmat.consequence_prediction.snp_indel_variants.pipeline import process_variants

-from opentargets_pharmgkb.ols import get_chebi_iri
+from opentargets_pharmgkb.ontology_apis import get_chebi_iri, get_efo_iri
from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column
from opentargets_pharmgkb.variant_coordinates import get_coordinates_for_clinical_annotation

ID_COL_NAME = 'Clinical Annotation ID'

column_names = [
'Clinical Annotation ID',
'Variant/Haplotypes',
'Gene',
'split_gene',
'ensembl_gene_id',
'Drug(s)',
'split_drug',
'chebi',
'Level of Evidence',
'Phenotype Category',
'Phenotype(s)',
'Genotype/Allele',
'all_genotypes',
'Annotation Text'
]


-def pipeline(clinical_annot_path, clinical_alleles_path, drugs_path, created_date, output_path):
-    clinical_annot_table = pd.read_csv(clinical_annot_path, sep='\t')
-    clinical_alleles_table = pd.read_csv(clinical_alleles_path, sep='\t')
-    drugs_table = pd.read_csv(drugs_path, sep='\t')
-
-    merged_table = pd.merge(clinical_annot_table, clinical_alleles_table, on='Clinical Annotation ID', how='left')
+def pipeline(clinical_annot_path, clinical_alleles_path, clinical_evidence_path, drugs_path, created_date, output_path):
+    clinical_annot_table = read_tsv_to_df(clinical_annot_path)
+    clinical_alleles_table = read_tsv_to_df(clinical_alleles_path)
+    clinical_evidence_table = read_tsv_to_df(clinical_evidence_path)
+    drugs_table = read_tsv_to_df(drugs_path)
+
+    merged_table = pd.merge(clinical_annot_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
# Restrict to variants with rsIDs
rs_only_table = merged_table[merged_table['Variant/Haplotypes'].str.contains('rs')]
-    # Also provide a column with all genotypes for a given rs
-    rs_only_table = pd.merge(rs_only_table, rs_only_table.groupby(by='Clinical Annotation ID').aggregate(
-        all_genotypes=('Genotype/Allele', list)), on='Clinical Annotation ID')

-    mapped_genes = explode_and_map_genes(rs_only_table)
+    coordinates_table = get_vcf_coordinates(rs_only_table)
+    consequences_table = get_functional_consequences(coordinates_table)
+    mapped_genes = explode_and_map_genes(consequences_table)
    mapped_drugs = explode_and_map_drugs(mapped_genes, drugs_table)
-    evidence_table = mapped_drugs[column_names]
+    mapped_phenotypes = explode_and_map_phenotypes(mapped_drugs)

# Add clinical evidence with PMIDs
pmid_evidence = clinical_evidence_table[clinical_evidence_table['PMID'].notna()]
evidence_table = pd.merge(mapped_phenotypes, pmid_evidence.groupby(by=ID_COL_NAME).aggregate(
publications=('PMID', list)), on=ID_COL_NAME)
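    # Illustrative note (not part of the commit): the groupby/aggregate above collapses the
    # per-publication rows of clinical_ann_evidence.tsv into one list per clinical annotation,
    # e.g. two rows sharing an ID with PMIDs '111' and '222' become publications=['111', '222'],
    # which later populates the 'literature' field of the evidence string.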

# Generate evidence
evidence = [generate_clinical_annotation_evidence(created_date, row) for _, row in evidence_table.iterrows()]
with open(output_path, 'w+') as output:
output.write('\n'.join(json.dumps(ev) for ev in evidence))


def read_tsv_to_df(path):
return pd.read_csv(path, sep='\t', dtype=str)


def get_vcf_coordinates(df):
"""
Get VCF-style coordinates (chr_pos_ref_alt) for dataframe.
:param df: dataframe to annotate (needs 'Genotype/Allele' and 'Variant/Haplotypes' columns)
:return: dataframe with 'vcf_coords' column added
"""
# First set a column with all genotypes for a given rs
df_with_coords = pd.merge(df, df.groupby(by=ID_COL_NAME).aggregate(
all_genotypes=('Genotype/Allele', list)), on=ID_COL_NAME)
# Then get coordinates for each row
for i, row in df_with_coords.iterrows():
df_with_coords.at[i, 'vcf_coords'] = get_coordinates_for_clinical_annotation(
row['Variant/Haplotypes'], row['all_genotypes'])
return df_with_coords
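# Illustrative note (not part of the commit): the 'vcf_coords' values added here are
# underscore-separated chr_pos_ref_alt identifiers (see the coord_id_to_vep_id example below),
# or missing when the coordinates cannot be resolved; missing values are dropped before the VEP query.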


def get_functional_consequences(df):
"""
Get functional consequences from VEP.
:param df: dataframe to annotate (needs 'vcf_coords' column)
:return: dataframe with 'overlapping_gene' and 'consequence_term' columns added
"""
vep_id_to_coords = {
coord_id_to_vep_id(x): x for x in df['vcf_coords'].dropna().drop_duplicates().tolist()
}
with multiprocessing.Pool(processes=24) as pool:
all_consequences = [
pool.apply(process_to_list, args=(batch,))
for batch in grouper(vep_id_to_coords.keys(), 200)
]
mapped_consequences = pd.DataFrame(data=[
{
'vcf_coords': vep_id_to_coords[variant_id],
'overlapping_gene': gene_id,
'consequence_term': consequence_term
}
for batch in all_consequences
for variant_id, gene_id, gene_symbol, consequence_term in batch
])
return pd.merge(df, mapped_consequences, on='vcf_coords', how='left')


def coord_id_to_vep_id(coord_id):
"""Converts an underscore-separated coordinate identifier (e.g. 15_7237571_C_T) to VEP compatible one."""
id_fields = coord_id.split('_')
assert len(id_fields) == 4, 'Invalid identifier supplied (should contain exactly 4 fields)'
return '{} {} . {} {}'.format(*id_fields)
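# Illustrative example (not part of the commit):
#   coord_id_to_vep_id('15_7237571_C_T')  ->  '15 7237571 . C T'
# i.e. chromosome, position, a placeholder identifier, then reference and alternate alleles.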


def grouper(iterable, n):
    # Batch an iterable into lists of up to n items, dropping the None padding from the final batch
    args = [iter(iterable)] * n
    return [[x for x in batch if x is not None] for batch in zip_longest(*args, fillvalue=None)]
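# Illustrative example (not part of the commit), assuming the None-stripping shown above:
#   grouper('ABCDE', 2)  ->  [['A', 'B'], ['C', 'D'], ['E']]
# i.e. fixed-size batches, with the final shorter batch kept unpadded.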


def process_to_list(b):
"""Wrapper for process_variants because multiprocessing does not like generators."""
return list(process_variants(b))


def explode_and_map_genes(df):
"""
Maps gene symbols to Ensembl gene IDs using BioMart. Explodes multiple genes in single row.
@@ -59,12 +115,12 @@ def explode_and_map_genes(df):
split_genes = explode_column(df, 'Gene', 'split_gene')
ensembl_ids = query_biomart(
('hgnc_symbol', 'split_gene'),
-        ('ensembl_gene_id', 'ensembl_gene_id'),
+        ('ensembl_gene_id', 'gene_from_pgkb'),
split_genes['split_gene'].drop_duplicates().tolist()
)
mapped_genes = pd.merge(split_genes, ensembl_ids, on='split_gene')
# HGNC could map to more than one ensembl gene id, so must explode again
-    mapped_genes = mapped_genes.explode('ensembl_gene_id').reset_index(drop=True)
+    mapped_genes = mapped_genes.explode('gene_from_pgkb').reset_index(drop=True)
return mapped_genes


@@ -100,26 +156,46 @@ def chebi_id_to_iri(id_):
return f'http://purl.obolibrary.org/obo/CHEBI_{id_}'


def explode_and_map_phenotypes(df):
"""
Maps phenotype text to EFO IRIs using Zooma. Explodes multiple phenotypes in single row.
:param df: dataframe to annotate (should have a 'Phenotype(s)' column)
:return: dataframe with 'efo' column added
"""
df['Phenotype(s)'].fillna('', inplace=True)
split_phenotypes = explode_column(df, 'Phenotype(s)', 'split_phenotype')
with multiprocessing.Pool(processes=24) as pool:
str_to_iri = {
s: pool.apply(get_efo_iri, args=(s,))
for s in split_phenotypes['split_phenotype'].drop_duplicates().tolist()
}
mapped_phenotypes = pd.concat(
split_phenotypes[split_phenotypes['split_phenotype'] == s].assign(efo=none_to_nan(iri))
for s, iri in str_to_iri.items()
)
return mapped_phenotypes


def generate_clinical_annotation_evidence(created_date, row):
"""Generates an evidence string for a PharmGKB clinical annotation."""
-    vcf_full_coords = get_coordinates_for_clinical_annotation(row['Variant/Haplotypes'], row['all_genotypes'])
evidence_string = {
# DATA SOURCE ATTRIBUTES
'datasourceId': 'pharmgkb',
'datasourceVersion': created_date,

# RECORD ATTRIBUTES
'datatypeId': 'clinical_annotation',
-        'studyId': row['Clinical Annotation ID'],
+        'studyId': row[ID_COL_NAME],
'evidenceLevel': row['Level of Evidence'],
'literature': [str(x) for x in row['publications']],

# VARIANT ATTRIBUTES
-        'variantId': vcf_full_coords,
+        'variantId': row['vcf_coords'],
'variantRsId': row['Variant/Haplotypes'],
-        'targetFromSourceId': row['ensembl_gene_id'],
-        # TODO need to use consequence prediction from clinvar repo
-        'variantFunctionalConsequenceId': None,
-        'variantOverlappingGeneId': None,
+        'targetFromSourceId': row['gene_from_pgkb'],
+        'variantFunctionalConsequenceId': row['consequence_term'],
+        'variantOverlappingGeneId': row['overlapping_gene'],

# GENOTYPE ATTRIBUTES
'genotype': row['Genotype/Allele'],
@@ -129,8 +205,10 @@ def generate_clinical_annotation_evidence(created_date, row):
'drugText': row['split_drug'],
'drugId': row['chebi'],
'pgxCategory': row['Phenotype Category'],
-        'phenotypeFromSourceId': row['Phenotype(s)']  # TODO EFO - needs to be exploded & mapped
+        'phenotypeText': row['split_phenotype'],
+        'phenotypeFromSourceId': row['efo']
}
# Remove the attributes with empty values (either None or empty lists).
-    evidence_string = {key: value for key, value in evidence_string.items() if value and pd.notna(value)}
+    evidence_string = {key: value for key, value in evidence_string.items()
+                       if value and (isinstance(value, list) or pd.notna(value))}
return evidence_string
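As a usage sketch (not part of the commit), the pipeline writes one JSON evidence string per line, so the output file can be read back for inspection roughly as follows; the path is hypothetical:

import json

def read_evidence_strings(path):
    # Read newline-delimited JSON evidence strings as written by pipeline()
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

evidence = read_evidence_strings('evidence.json')
print(len(evidence), evidence[0].get('studyId'), evidence[0].get('pgxCategory'))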
34 changes: 0 additions & 34 deletions opentargets_pharmgkb/ols.py

This file was deleted.

63 changes: 63 additions & 0 deletions opentargets_pharmgkb/ontology_apis.py
@@ -0,0 +1,63 @@
import logging
import os
from functools import lru_cache

import requests
from cmat.trait_mapping.main import process_trait
from cmat.trait_mapping.trait import Trait
from requests import RequestException
from retry import retry

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

HOST = 'https://www.ebi.ac.uk'
OLS_API_ROOT = f'{HOST}/ols/api'

# Defaults from CMAT for getting EFO mappings that don't require manual curation
zooma_filters = {'ontologies': 'efo,ordo,hp,mondo',
'required': 'cttv,eva-clinvar,clinvar-xrefs,gwas',
'preferred': 'eva-clinvar,cttv,gwas,clinvar-xrefs'}
oxo_targets = ['Orphanet', 'efo', 'hp', 'mondo']
oxo_distance = 1


@lru_cache
@retry(exceptions=(ConnectionError, RequestException), tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def get_chebi_iri(drug_name):
chebi_search_url = os.path.join(OLS_API_ROOT, f'search?ontology=chebi&q={drug_name}')
response = requests.get(chebi_search_url)
response.raise_for_status()
data = response.json()
if 'response' in data:
results = data['response']['docs']
candidates = set()
for result in results:
# Check that we've found the drug exactly (strict case-insensitive string match)
if result['label'].lower() == drug_name.lower():
candidates.add(result['iri'])
# Only return a result if we can find it unambiguously
if len(candidates) == 1:
return candidates.pop()
logger.warning(f'Could not find a CHEBI IRI for {drug_name}')
return None


@lru_cache
@retry(exceptions=(ConnectionError, RequestException), tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def get_efo_iri(phenotype_name):
if not phenotype_name:
return None

# Trait to store Zooma/OxO results - other attributes not used
trait = Trait(phenotype_name, None, None)
processed_trait = process_trait(trait, zooma_filters, HOST, oxo_targets, oxo_distance)
if processed_trait.is_finished:
efo_uris = [ontology_entry.uri for ontology_entry in processed_trait.finished_mapping_set]
if len(efo_uris) > 1:
# Don't expect multiple mappings for PharmGKB phenotypes
logger.warning(f'Found multiple mappings for {phenotype_name}: {",".join(efo_uris)}')
return None
return efo_uris[0]
return None
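A hypothetical usage sketch for the two helpers above; the drug and phenotype strings are illustrative, and both functions return None when no unambiguous mapping is found:

from opentargets_pharmgkb.ontology_apis import get_chebi_iri, get_efo_iri

chebi_iri = get_chebi_iri('warfarin')   # CHEBI IRI from OLS, or None if no exact, unambiguous label match
efo_iri = get_efo_iri('Hemorrhage')     # EFO/HP/MONDO/Orphanet IRI via Zooma/OxO, or None
print(chebi_iri, efo_iri)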
3 changes: 3 additions & 0 deletions opentargets_pharmgkb/variant_coordinates.py
@@ -64,6 +64,9 @@ def get_coordinates_for_rs(rsid):
for mapping in data['mappings']:
if mapping['assembly_name'] == 'GRCh38':
chrom = mapping['seq_region_name']
# Skip things like CHR_HSCHR22_1_CTG7
if '_' in chrom:
continue
pos = mapping['start']
alleles = mapping['allele_string'].split('/')
ref = alleles[0]