Skip to content

Commit

Permalink
Merge pull request #15 from apriltuesday/issue-5
Browse files Browse the repository at this point in the history
Generate chr_pos_ref_alt variant identifiers using reference assembly
  • Loading branch information
apriltuesday authored Aug 24, 2023
2 parents 7af9590 + 73876b1 commit cdb0966
Show file tree
Hide file tree
Showing 12 changed files with 584,156 additions and 146 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ Pipeline to provide evidence strings for Open Targets from PharmGKB
DATA_DIR=<directory for data>
wget https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
wget https://api.pharmgkb.org/v1/download/file/data/drugs.zip
wget https://api.pharmgkb.org/v1/download/file/data/variants.zip
unzip -j clinicalAnnotations.zip "*.tsv" -d $DATA_DIR
unzip -j clinicalAnnotations.zip "CREATED*.txt" -d $DATA_DIR
unzip -j drugs.zip "*.tsv" -d $DATA_DIR
rm clinicalAnnotations.zip drugs.zip
unzip -j variants.zip "*.tsv" -d $DATA_DIR
rm clinicalAnnotations.zip drugs.zip variants.zip
# Run pipeline
generate_evidence.py --data-dir $DATA_DIR --created-date <created date> --output-path evidence.json
generate_evidence.py --data-dir $DATA_DIR --fasta <path to fasta> --created-date <created date> --output-path evidence.json
```
2 changes: 2 additions & 0 deletions bin/generate_evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

parser = argparse.ArgumentParser('Generates Open Targets evidence strings from PharmGKB data')
parser.add_argument('--data-dir', help='Directory containing necessary .tsv files from PharmGKB', required=True)
parser.add_argument('--fasta', help='Path to FASTA file for GRCh38 (should use RefSeq contigs)')
parser.add_argument('--created-date', help='Created date of downloaded files (provided by PharmGKB)', required=True)
parser.add_argument('--output-path', help='Path to output evidence strings', required=True)

Expand All @@ -13,6 +14,7 @@
args = parser.parse_args()
evidence_generation.pipeline(
data_dir=args.data_dir,
fasta_path=args.fasta,
created_date=args.created_date,
output_path=args.output_path
)
27 changes: 18 additions & 9 deletions opentargets_pharmgkb/evidence_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from opentargets_pharmgkb.counts import ClinicalAnnotationCounts
from opentargets_pharmgkb.ontology_apis import get_chebi_iri, get_efo_iri
from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column
from opentargets_pharmgkb.variant_coordinates import get_coordinates_for_clinical_annotation
from opentargets_pharmgkb.variant_coordinates import Fasta

logging.basicConfig()
logger = logging.getLogger(__package__)
Expand All @@ -20,19 +20,21 @@
ID_COL_NAME = 'Clinical Annotation ID'


def pipeline(data_dir, created_date, output_path):
def pipeline(data_dir, fasta_path, created_date, output_path, debug_path=None):
clinical_annot_path = os.path.join(data_dir, 'clinical_annotations.tsv')
clinical_alleles_path = os.path.join(data_dir, 'clinical_ann_alleles.tsv')
clinical_evidence_path = os.path.join(data_dir, 'clinical_ann_evidence.tsv')
variants_path = os.path.join(data_dir, 'variants.tsv')
drugs_path = os.path.join(data_dir, 'drugs.tsv')
for p in (clinical_annot_path, clinical_alleles_path, clinical_evidence_path, drugs_path):
for p in (clinical_annot_path, clinical_alleles_path, clinical_evidence_path, variants_path, drugs_path):
if not os.path.exists(p):
logger.error(f'Missing required data file: {p}')
raise ValueError(f'Missing required data file: {p}')

clinical_annot_table = read_tsv_to_df(clinical_annot_path)
clinical_alleles_table = read_tsv_to_df(clinical_alleles_path)
clinical_evidence_table = read_tsv_to_df(clinical_evidence_path)
variants_table = read_tsv_to_df(variants_path)
drugs_table = read_tsv_to_df(drugs_path)

# Restrict to variants with rsIDs
Expand All @@ -44,7 +46,9 @@ def pipeline(data_dir, created_date, output_path):
counts.with_rs = len(rs_only_table)

# Main processing
merged_with_alleles_table = pd.merge(rs_only_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
merged_with_variants_table = pd.merge(rs_only_table, variants_table, left_on='Variant/Haplotypes',
right_on='Variant Name', how='left')
merged_with_alleles_table = pd.merge(merged_with_variants_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
counts.exploded_alleles = len(merged_with_alleles_table)

mapped_drugs = explode_and_map_drugs(merged_with_alleles_table, drugs_table)
Expand All @@ -53,7 +57,7 @@ def pipeline(data_dir, created_date, output_path):
mapped_phenotypes = explode_and_map_phenotypes(mapped_drugs)
counts.exploded_phenotypes = len(mapped_phenotypes)

coordinates_table = get_vcf_coordinates(mapped_phenotypes)
coordinates_table = get_vcf_coordinates(mapped_phenotypes, fasta_path)
consequences_table = get_functional_consequences(coordinates_table)

# Add clinical evidence with PMIDs
Expand All @@ -75,28 +79,33 @@ def pipeline(data_dir, created_date, output_path):
output.write('\n'.join(json.dumps(ev) for ev in evidence))

# Final count report
gene_comparison_counts(evidence_table, counts, debug_path=f'{output_path.rsplit(".", 1)[0]}_genes.csv')
if not debug_path:
debug_path = f'{output_path.rsplit(".", 1)[0]}_genes.csv'
gene_comparison_counts(evidence_table, counts, debug_path=debug_path)
counts.report()


def read_tsv_to_df(path):
return pd.read_csv(path, sep='\t', dtype=str)


def get_vcf_coordinates(df):
def get_vcf_coordinates(df, fasta_path):
"""
Get VCF-style coordinates (chr_pos_ref_alt) for dataframe.
:param df: dataframe to annotate (needs 'Genotype/Allele' and 'Variant/Haplotypes' columns)
:return: dataframe with 'vcf_coords' column added
"""
fasta = Fasta(fasta_path)
# First set a column with all genotypes for a given rs
df_with_coords = pd.merge(df, df.groupby(by=ID_COL_NAME).aggregate(
all_genotypes=('Genotype/Allele', list)), on=ID_COL_NAME)
# Then get coordinates for each row
# TODO Currently this does one call per genotype per RS
# Remove redundant calls once we figure out how to handle genotypes & multiple alts per RS
for i, row in df_with_coords.iterrows():
df_with_coords.at[i, 'vcf_coords'] = get_coordinates_for_clinical_annotation(
row['Variant/Haplotypes'], row['all_genotypes'])
df_with_coords.at[i, 'vcf_coords'] = fasta.get_coordinates_for_clinical_annotation(
row['Variant/Haplotypes'], row['Location'], row['all_genotypes'])
return df_with_coords


Expand Down
4 changes: 2 additions & 2 deletions opentargets_pharmgkb/ontology_apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
logger.setLevel(logging.INFO)

HOST = 'https://www.ebi.ac.uk'
OLS_API_ROOT = f'{HOST}/ols/api'
OLS_API_ROOT = f'{HOST}/ols4/api'

# Defaults from CMAT for getting EFO mappings that don't require manual curation
zooma_filters = {'ontologies': 'efo,ordo,hp,mondo',
Expand All @@ -26,7 +26,7 @@
@lru_cache
@retry(exceptions=(ConnectionError, RequestException), tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def get_chebi_iri(drug_name):
chebi_search_url = os.path.join(OLS_API_ROOT, f'search?ontology=chebi&q={drug_name}')
chebi_search_url = os.path.join(OLS_API_ROOT, f'search?ontology=chebi&q={drug_name}&queryFields=label&exact=true')
response = requests.get(chebi_search_url)
response.raise_for_status()
data = response.json()
Expand Down
163 changes: 111 additions & 52 deletions opentargets_pharmgkb/variant_coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,59 +2,16 @@
from functools import lru_cache
import re

import pandas as pd
import requests
from Bio import SeqIO

logger = logging.getLogger(__package__)


def get_coordinates_for_clinical_annotation(rsid, all_genotypes):
"""
Gets vcf-style coordinate string (chr_pos_ref_alt) using rsid alone, falling back on genotype information if needed.
Returns None if coordinates cannot be determined.
"""
chrom, pos, ref, alts = get_coordinates_for_rs(rsid)
if not chrom or not pos or not ref or not alts:
return None
if len(alts) == 1:
return f'{chrom}_{pos}_{ref}_{alts[0]}'
# If multiple alts, check what is referred to in the clinical alleles table
alleles = {ref}
for genotype in all_genotypes:
# X chrom variants
if len(genotype) == 1:
alleles.add(genotype)
continue
# SNPs
if len(genotype) == 2:
alleles.add(genotype[0])
alleles.add(genotype[1])
continue
# short indels
m = re.match('([ACGT]+|del)/([ACGT]+|del)', genotype, re.IGNORECASE)
if not m:
continue
alleles.add(convert_allele(m.group(1)))
alleles.add(convert_allele(m.group(2)))
if len(alleles) > 2:
logger.warning(f'Too many alleles for {rsid}, skipping')
return None
for a in alleles:
if a in alts:
return f'{chrom}_{pos}_{ref}_{a}'
return None


def convert_allele(allele):
# TODO "del" alleles are an issue for chr_pos_ref_alt - need context bases
if allele.lower() == 'del':
return '-'
return allele


@lru_cache
def get_coordinates_for_rs(rsid):
"""Queries Ensembl for vcf-style coordinates for rsid. Returns None if not found."""
# TODO do this in bulk (batches of 200) or replace with reference FASTA check
def get_chrom_pos_for_rs_from_ensembl(rsid):
"""Queries Ensembl for chromosome and position of rsid. Returns None if not found."""
if not rsid.startswith('rs'):
rsid = f'rs{rsid}'
ensembl_url = f'https://rest.ensembl.org/variation/human/{rsid}?content-type=application/json'
Expand All @@ -68,8 +25,110 @@ def get_coordinates_for_rs(rsid):
if '_' in chrom:
continue
pos = mapping['start']
alleles = mapping['allele_string'].split('/')
ref = alleles[0]
alts = alleles[1:]
return chrom, pos, ref, alts
return None, None, None, None
return chrom, pos
return None, None


class Fasta:

def __init__(self, path_to_fasta):
self.record_dict = SeqIO.to_dict(SeqIO.parse(path_to_fasta, 'fasta'))

def get_coordinates_for_clinical_annotation(self, rsid, location, all_genotypes):
"""
Gets vcf-style coordinate string (chr_pos_ref_alt) using location and genotype information.
:param rsid: rsID of the variant
:param location: string consisting of RefSeq accession and position separated by a colon,
e.g. 'NC_000001.11:46399999'
:param all_genotypes: list of genotype strings, e.g. ['TT', 'TA', 'AA']
:return: string of form chr_pos_ref_alt, or None if coordinates cannot be determined
"""
if pd.isna(location):
return None
chrom, pos = location.strip().split(':')
if not chrom or not pos:
return None

# Ranges are inclusive of both start and end
if '_' in pos:
start, end = pos.split('_')
start = int(start)
end = int(end)
else:
start = end = int(pos)

alleles = set()
contains_del = False
for genotype in all_genotypes:
# X chrom variants
if len(genotype) == 1:
alleles.add(genotype)
continue
# SNPs
if len(genotype) == 2:
alleles.add(genotype[0])
alleles.add(genotype[1])
continue
# short indels
m = re.match('([ACGT]+|del)/([ACGT]+|del)', genotype, re.IGNORECASE)
if not m:
logger.info(f'Could not parse genotype for {rsid}: {genotype}')
continue
if m.group(1) == 'del'.lower() or m.group(2).lower() == 'del':
contains_del = True
alleles.add(m.group(1))
alleles.add(m.group(2))

# Correct for deletion alleles
if contains_del:
if end == start:
end -= 1 # keep end == start if they began that way
start -= 1
alleles = {self.add_context_base(chrom, start, allele) for allele in alleles}

if not alleles:
logger.warning(f'Could not parse genotypes: {rsid}\t{",".join(all_genotypes)}')
return None
ref = self.get_ref_from_fasta(chrom, start, end)
# Remove ref if present among alleles; otherwise report & skip
if ref in alleles:
alts = alleles - {ref}
else:
logger.warning(f'Ref not in alleles: {rsid}\t{ref}\t{",".join(alleles)}')
return None
chrom_num = self.get_chrom_num_from_refseq(chrom)
for alt in sorted(alts):
# TODO multiple IDs for multiple alts?
return f'{chrom_num}_{start}_{ref}_{alt}'
return None

@lru_cache
def get_ref_from_fasta(self, chrom, start, end=None):
"""Get reference allele at a given location."""
if not end:
end = start
try:
return str(self.record_dict[chrom][start-1:end].seq).upper()
except (KeyError, IndexError):
logger.warning(f'Could not get reference allele for {chrom}:{start}_{end}')
return None

@lru_cache
def add_context_base(self, chrom, pos, allele):
"""Add left-hand context base to allele at a given location."""
context_base = self.get_ref_from_fasta(chrom, pos)
if context_base:
if allele.lower() == 'del':
return context_base
return f'{context_base}{allele}'
return None

@lru_cache
def get_chrom_num_from_refseq(self, chrom_refseq):
# TODO use the assembly report? API call?
m = re.search(r'Homo sapiens chromosome (.*?), ', self.record_dict[chrom_refseq].description)
if m and m.group(1):
return m.group(1)
logger.warning(f'Could not get chromosome number for {chrom_refseq}')
return None
13 changes: 13 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import os

import pytest

from opentargets_pharmgkb.variant_coordinates import Fasta


fasta_path = os.path.join(os.path.dirname(__file__), 'resources', 'chr21.fa')


@pytest.fixture
def fasta():
return Fasta(fasta_path)
Loading

0 comments on commit cdb0966

Please sign in to comment.