Merge pull request #15 from apriltuesday/issue-5

Generate chr_pos_ref_alt variant identifiers using reference assembly
EBIvariation · Aug 24, 2023 · cdb0966 · cdb0966
2 parents 7af9590 + 73876b1
commit cdb0966
Show file tree

Hide file tree

Showing 12 changed files with 584,156 additions and 146 deletions.
diff --git a/README.md b/README.md
@@ -6,12 +6,14 @@ Pipeline to provide evidence strings for Open Targets from PharmGKB
 DATA_DIR=<directory for data>
 wget https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
 wget https://api.pharmgkb.org/v1/download/file/data/drugs.zip
+wget https://api.pharmgkb.org/v1/download/file/data/variants.zip
 
 unzip -j clinicalAnnotations.zip "*.tsv" -d $DATA_DIR
 unzip -j clinicalAnnotations.zip "CREATED*.txt" -d $DATA_DIR
 unzip -j drugs.zip "*.tsv" -d $DATA_DIR
-rm clinicalAnnotations.zip drugs.zip
+unzip -j variants.zip "*.tsv" -d $DATA_DIR
+rm clinicalAnnotations.zip drugs.zip variants.zip
 
 # Run pipeline
-generate_evidence.py --data-dir $DATA_DIR --created-date <created date> --output-path evidence.json
+generate_evidence.py --data-dir $DATA_DIR --fasta <path to fasta> --created-date <created date> --output-path evidence.json
 ```
diff --git a/bin/generate_evidence.py b/bin/generate_evidence.py
@@ -5,6 +5,7 @@
 
 parser = argparse.ArgumentParser('Generates Open Targets evidence strings from PharmGKB data')
 parser.add_argument('--data-dir', help='Directory containing necessary .tsv files from PharmGKB', required=True)
+parser.add_argument('--fasta', help='Path to FASTA file for GRCh38 (should use RefSeq contigs)')
 parser.add_argument('--created-date', help='Created date of downloaded files (provided by PharmGKB)', required=True)
 parser.add_argument('--output-path', help='Path to output evidence strings', required=True)
 
@@ -13,6 +14,7 @@
     args = parser.parse_args()
     evidence_generation.pipeline(
         data_dir=args.data_dir,
+        fasta_path=args.fasta,
         created_date=args.created_date,
         output_path=args.output_path
     )
diff --git a/opentargets_pharmgkb/evidence_generation.py b/opentargets_pharmgkb/evidence_generation.py
@@ -11,7 +11,7 @@
 from opentargets_pharmgkb.counts import ClinicalAnnotationCounts
 from opentargets_pharmgkb.ontology_apis import get_chebi_iri, get_efo_iri
 from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column
-from opentargets_pharmgkb.variant_coordinates import get_coordinates_for_clinical_annotation
+from opentargets_pharmgkb.variant_coordinates import Fasta
 
 logging.basicConfig()
 logger = logging.getLogger(__package__)
@@ -20,19 +20,21 @@
 ID_COL_NAME = 'Clinical Annotation ID'
 
 
-def pipeline(data_dir, created_date, output_path):
+def pipeline(data_dir, fasta_path, created_date, output_path, debug_path=None):
     clinical_annot_path = os.path.join(data_dir, 'clinical_annotations.tsv')
     clinical_alleles_path = os.path.join(data_dir, 'clinical_ann_alleles.tsv')
     clinical_evidence_path = os.path.join(data_dir, 'clinical_ann_evidence.tsv')
+    variants_path = os.path.join(data_dir, 'variants.tsv')
     drugs_path = os.path.join(data_dir, 'drugs.tsv')
-    for p in (clinical_annot_path, clinical_alleles_path, clinical_evidence_path, drugs_path):
+    for p in (clinical_annot_path, clinical_alleles_path, clinical_evidence_path, variants_path, drugs_path):
         if not os.path.exists(p):
             logger.error(f'Missing required data file: {p}')
             raise ValueError(f'Missing required data file: {p}')
 
     clinical_annot_table = read_tsv_to_df(clinical_annot_path)
     clinical_alleles_table = read_tsv_to_df(clinical_alleles_path)
     clinical_evidence_table = read_tsv_to_df(clinical_evidence_path)
+    variants_table = read_tsv_to_df(variants_path)
     drugs_table = read_tsv_to_df(drugs_path)
 
     # Restrict to variants with rsIDs
@@ -44,7 +46,9 @@ def pipeline(data_dir, created_date, output_path):
     counts.with_rs = len(rs_only_table)
 
     # Main processing
-    merged_with_alleles_table = pd.merge(rs_only_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
+    merged_with_variants_table = pd.merge(rs_only_table, variants_table, left_on='Variant/Haplotypes',
+                                          right_on='Variant Name', how='left')
+    merged_with_alleles_table = pd.merge(merged_with_variants_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
     counts.exploded_alleles = len(merged_with_alleles_table)
 
     mapped_drugs = explode_and_map_drugs(merged_with_alleles_table, drugs_table)
@@ -53,7 +57,7 @@ def pipeline(data_dir, created_date, output_path):
     mapped_phenotypes = explode_and_map_phenotypes(mapped_drugs)
     counts.exploded_phenotypes = len(mapped_phenotypes)
 
-    coordinates_table = get_vcf_coordinates(mapped_phenotypes)
+    coordinates_table = get_vcf_coordinates(mapped_phenotypes, fasta_path)
     consequences_table = get_functional_consequences(coordinates_table)
 
     # Add clinical evidence with PMIDs
@@ -75,28 +79,33 @@ def pipeline(data_dir, created_date, output_path):
         output.write('\n'.join(json.dumps(ev) for ev in evidence))
 
     # Final count report
-    gene_comparison_counts(evidence_table, counts, debug_path=f'{output_path.rsplit(".", 1)[0]}_genes.csv')
+    if not debug_path:
+        debug_path = f'{output_path.rsplit(".", 1)[0]}_genes.csv'
+    gene_comparison_counts(evidence_table, counts, debug_path=debug_path)
     counts.report()
 
 
 def read_tsv_to_df(path):
     return pd.read_csv(path, sep='\t', dtype=str)
 
 
-def get_vcf_coordinates(df):
+def get_vcf_coordinates(df, fasta_path):
     """
     Get VCF-style coordinates (chr_pos_ref_alt) for dataframe.
 
     :param df: dataframe to annotate (needs 'Genotype/Allele' and 'Variant/Haplotypes' columns)
     :return: dataframe with 'vcf_coords' column added
     """
+    fasta = Fasta(fasta_path)
     # First set a column with all genotypes for a given rs
     df_with_coords = pd.merge(df, df.groupby(by=ID_COL_NAME).aggregate(
         all_genotypes=('Genotype/Allele', list)), on=ID_COL_NAME)
     # Then get coordinates for each row
+    # TODO Currently this does one call per genotype per RS
+    #  Remove redundant calls once we figure out how to handle genotypes & multiple alts per RS
     for i, row in df_with_coords.iterrows():
-        df_with_coords.at[i, 'vcf_coords'] = get_coordinates_for_clinical_annotation(
-            row['Variant/Haplotypes'], row['all_genotypes'])
+        df_with_coords.at[i, 'vcf_coords'] = fasta.get_coordinates_for_clinical_annotation(
+            row['Variant/Haplotypes'], row['Location'], row['all_genotypes'])
     return df_with_coords
 
 

diff --git a/opentargets_pharmgkb/ontology_apis.py b/opentargets_pharmgkb/ontology_apis.py
@@ -13,7 +13,7 @@
 logger.setLevel(logging.INFO)
 
 HOST = 'https://www.ebi.ac.uk'
-OLS_API_ROOT = f'{HOST}/ols/api'
+OLS_API_ROOT = f'{HOST}/ols4/api'
 
 # Defaults from CMAT for getting EFO mappings that don't require manual curation
 zooma_filters = {'ontologies': 'efo,ordo,hp,mondo',
@@ -26,7 +26,7 @@
 @lru_cache
 @retry(exceptions=(ConnectionError, RequestException), tries=4, delay=2, backoff=1.2, jitter=(1, 3))
 def get_chebi_iri(drug_name):
-    chebi_search_url = os.path.join(OLS_API_ROOT, f'search?ontology=chebi&q={drug_name}')
+    chebi_search_url = os.path.join(OLS_API_ROOT, f'search?ontology=chebi&q={drug_name}&queryFields=label&exact=true')
     response = requests.get(chebi_search_url)
     response.raise_for_status()
     data = response.json()

diff --git a/opentargets_pharmgkb/variant_coordinates.py b/opentargets_pharmgkb/variant_coordinates.py
@@ -2,59 +2,16 @@
 from functools import lru_cache
 import re
 
+import pandas as pd
 import requests
+from Bio import SeqIO
 
 logger = logging.getLogger(__package__)
 
 
-def get_coordinates_for_clinical_annotation(rsid, all_genotypes):
-    """
-    Gets vcf-style coordinate string (chr_pos_ref_alt) using rsid alone, falling back on genotype information if needed.
-    Returns None if coordinates cannot be determined.
-    """
-    chrom, pos, ref, alts = get_coordinates_for_rs(rsid)
-    if not chrom or not pos or not ref or not alts:
-        return None
-    if len(alts) == 1:
-        return f'{chrom}_{pos}_{ref}_{alts[0]}'
-    # If multiple alts, check what is referred to in the clinical alleles table
-    alleles = {ref}
-    for genotype in all_genotypes:
-        # X chrom variants
-        if len(genotype) == 1:
-            alleles.add(genotype)
-            continue
-        # SNPs
-        if len(genotype) == 2:
-            alleles.add(genotype[0])
-            alleles.add(genotype[1])
-            continue
-        # short indels
-        m = re.match('([ACGT]+|del)/([ACGT]+|del)', genotype, re.IGNORECASE)
-        if not m:
-            continue
-        alleles.add(convert_allele(m.group(1)))
-        alleles.add(convert_allele(m.group(2)))
-    if len(alleles) > 2:
-        logger.warning(f'Too many alleles for {rsid}, skipping')
-        return None
-    for a in alleles:
-        if a in alts:
-            return f'{chrom}_{pos}_{ref}_{a}'
-    return None
-
-
-def convert_allele(allele):
-    # TODO "del" alleles are an issue for chr_pos_ref_alt - need context bases
-    if allele.lower() == 'del':
-        return '-'
-    return allele
-
-
 @lru_cache
-def get_coordinates_for_rs(rsid):
-    """Queries Ensembl for vcf-style coordinates for rsid. Returns None if not found."""
-    # TODO do this in bulk (batches of 200) or replace with reference FASTA check
+def get_chrom_pos_for_rs_from_ensembl(rsid):
+    """Queries Ensembl for chromosome and position of rsid. Returns None if not found."""
     if not rsid.startswith('rs'):
         rsid = f'rs{rsid}'
     ensembl_url = f'https://rest.ensembl.org/variation/human/{rsid}?content-type=application/json'
@@ -68,8 +25,110 @@ def get_coordinates_for_rs(rsid):
                 if '_' in chrom:
                     continue
                 pos = mapping['start']
-                alleles = mapping['allele_string'].split('/')
-                ref = alleles[0]
-                alts = alleles[1:]
-                return chrom, pos, ref, alts
-    return None, None, None, None
+                return chrom, pos
+    return None, None
+
+
+class Fasta:
+
+    def __init__(self, path_to_fasta):
+        self.record_dict = SeqIO.to_dict(SeqIO.parse(path_to_fasta, 'fasta'))
+
+    def get_coordinates_for_clinical_annotation(self, rsid, location, all_genotypes):
+        """
+        Gets vcf-style coordinate string (chr_pos_ref_alt) using location and genotype information.
+
+        :param rsid: rsID of the variant
+        :param location: string consisting of RefSeq accession and position separated by a colon,
+                         e.g. 'NC_000001.11:46399999'
+        :param all_genotypes: list of genotype strings, e.g. ['TT', 'TA', 'AA']
+        :return: string of form chr_pos_ref_alt, or None if coordinates cannot be determined
+        """
+        if pd.isna(location):
+            return None
+        chrom, pos = location.strip().split(':')
+        if not chrom or not pos:
+            return None
+
+        # Ranges are inclusive of both start and end
+        if '_' in pos:
+            start, end = pos.split('_')
+            start = int(start)
+            end = int(end)
+        else:
+            start = end = int(pos)
+
+        alleles = set()
+        contains_del = False
+        for genotype in all_genotypes:
+            # X chrom variants
+            if len(genotype) == 1:
+                alleles.add(genotype)
+                continue
+            # SNPs
+            if len(genotype) == 2:
+                alleles.add(genotype[0])
+                alleles.add(genotype[1])
+                continue
+            # short indels
+            m = re.match('([ACGT]+|del)/([ACGT]+|del)', genotype, re.IGNORECASE)
+            if not m:
+                logger.info(f'Could not parse genotype for {rsid}: {genotype}')
+                continue
+            if m.group(1) == 'del'.lower() or m.group(2).lower() == 'del':
+                contains_del = True
+            alleles.add(m.group(1))
+            alleles.add(m.group(2))
+
+        # Correct for deletion alleles
+        if contains_del:
+            if end == start:
+                end -= 1  # keep end == start if they began that way
+            start -= 1
+            alleles = {self.add_context_base(chrom, start, allele) for allele in alleles}
+
+        if not alleles:
+            logger.warning(f'Could not parse genotypes: {rsid}\t{",".join(all_genotypes)}')
+            return None
+        ref = self.get_ref_from_fasta(chrom, start, end)
+        # Remove ref if present among alleles; otherwise report & skip
+        if ref in alleles:
+            alts = alleles - {ref}
+        else:
+            logger.warning(f'Ref not in alleles: {rsid}\t{ref}\t{",".join(alleles)}')
+            return None
+        chrom_num = self.get_chrom_num_from_refseq(chrom)
+        for alt in sorted(alts):
+            # TODO multiple IDs for multiple alts?
+            return f'{chrom_num}_{start}_{ref}_{alt}'
+        return None
+
+    @lru_cache
+    def get_ref_from_fasta(self, chrom, start, end=None):
+        """Get reference allele at a given location."""
+        if not end:
+            end = start
+        try:
+            return str(self.record_dict[chrom][start-1:end].seq).upper()
+        except (KeyError, IndexError):
+            logger.warning(f'Could not get reference allele for {chrom}:{start}_{end}')
+            return None
+
+    @lru_cache
+    def add_context_base(self, chrom, pos, allele):
+        """Add left-hand context base to allele at a given location."""
+        context_base = self.get_ref_from_fasta(chrom, pos)
+        if context_base:
+            if allele.lower() == 'del':
+                return context_base
+            return f'{context_base}{allele}'
+        return None
+
+    @lru_cache
+    def get_chrom_num_from_refseq(self, chrom_refseq):
+        # TODO use the assembly report? API call?
+        m = re.search(r'Homo sapiens chromosome (.*?), ', self.record_dict[chrom_refseq].description)
+        if m and m.group(1):
+            return m.group(1)
+        logger.warning(f'Could not get chromosome number for {chrom_refseq}')
+        return None
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,13 @@
+import os
+
+import pytest
+
+from opentargets_pharmgkb.variant_coordinates import Fasta
+
+
+fasta_path = os.path.join(os.path.dirname(__file__), 'resources', 'chr21.fa')
+
+
+@pytest.fixture
+def fasta():
+    return Fasta(fasta_path)