Skip to content

Commit

Permalink
Merge pull request #13 from apriltuesday/issue-10
Browse files Browse the repository at this point in the history
Add counts and fix bugs from test run
  • Loading branch information
apriltuesday authored Jul 17, 2023
2 parents 3ae0a92 + aa24d59 commit b2fc26b
Show file tree
Hide file tree
Showing 12 changed files with 269 additions and 161 deletions.
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,17 @@
# opentargets-pharmgkb
Pipeline to provide evidence strings for Open Targets from PharmGKB

```
# Download data
DATA_DIR=<directory for data>
wget https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
wget https://api.pharmgkb.org/v1/download/file/data/drugs.zip
unzip -j clinicalAnnotations.zip "*.tsv" -d $DATA_DIR
unzip -j clinicalAnnotations.zip "CREATED*.txt" -d $DATA_DIR
unzip -j drugs.zip "*.tsv" -d $DATA_DIR
rm clinicalAnnotations.zip drugs.zip
# Run pipeline
generate_evidence.py --data-dir $DATA_DIR --created-date <created date> --output-path evidence.json
```
10 changes: 2 additions & 8 deletions bin/generate_evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,15 @@
from opentargets_pharmgkb import evidence_generation

parser = argparse.ArgumentParser('Generates Open Targets evidence strings from PharmGKB data')
parser.add_argument('--clinical-annot-path', help='Path to clinical_annotations.tsv', required=True)
parser.add_argument('--clinical-alleles-path', help='Path to clinical_ann_alleles.tsv', required=True)
parser.add_argument('--clinical-evidence-path', help='Path to clinical_ann_evidence.tsv', required=True)
parser.add_argument('--drugs-path', help='Path to drugs.tsv', required=True)
parser.add_argument('--data-dir', help='Directory containing necessary .tsv files from PharmGKB', required=True)
parser.add_argument('--created-date', help='Created date of downloaded files (provided by PharmGKB)', required=True)
parser.add_argument('--output-path', help='Path to output evidence strings', required=True)


if __name__ == '__main__':
args = parser.parse_args()
evidence_generation.pipeline(
clinical_annot_path=args.clinical_annot_path,
clinical_alleles_path=args.clinical_alleles_path,
clinical_evidence_path=args.clinical_evidence_path,
drugs_path=args.drugs_path,
data_dir=args.data_dir,
created_date=args.created_date,
output_path=args.output_path
)
41 changes: 41 additions & 0 deletions opentargets_pharmgkb/counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
class ClinicalAnnotationCounts:
"""Simple class to hold counts for generating clinical annotation evidence."""

def __init__(self):
# Input counts (before annotation and exploding by drug, etc.)
self.clinical_annotations = 0
self.with_rs = 0
# Counts after exploding by each attribute
self.exploded_alleles = 0
self.exploded_drugs = 0
self.exploded_phenotypes = 0
# Output counts (after annotation and exploding)
self.evidence_strings = 0
self.with_chebi = 0
self.with_efo = 0
self.with_consequence = 0
# self.with_pgkb_gene = 0
self.with_vep_gene = 0
# Evaluation counts - after annotation but without exploding
self.annot_with_pgkb_genes = 0
self.annot_with_vep_genes = 0
self.pgkb_vep_gene_diff = 0

def report(self):
report_str = f'\nTotal clinical annotations: {self.clinical_annotations}\n'
report_str += f'\tWith RS: {self.with_rs}\n'
report_str += f'Exploded by allele: {self.exploded_alleles}\n'
report_str += f'Exploded by drug: {self.exploded_drugs}\n'
report_str += f'Exploded by phenotype: {self.exploded_phenotypes}\n'
report_str += f'Total evidence strings: {self.evidence_strings}\n'
report_str += f'\tWith CHEBI: {self.with_chebi}\n'
report_str += f'\tWith EFO phenotype: {self.with_efo}\n'
report_str += f'\tWith functional consequence: {self.with_consequence}\n'
# report_str += f'\tWith PGKB gene: {self.with_pgkb_gene}\n'
report_str += f'\tWith VEP gene: {self.with_vep_gene}\n'
report_str += f'Gene comparisons per annotation\n'
report_str += f'\tWith PGKB genes: {self.annot_with_pgkb_genes}\n'
report_str += f'\tWith VEP genes: {self.annot_with_vep_genes}\n'
report_str += f'\tPGKB genes != VEP genes: {self.pgkb_vep_gene_diff}\n'
print(report_str)
return report_str
87 changes: 73 additions & 14 deletions opentargets_pharmgkb/evidence_generation.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,83 @@
import json
import logging
import multiprocessing
import os
from itertools import zip_longest

import pandas as pd
from cmat.consequence_prediction.common.biomart import query_biomart
from cmat.consequence_prediction.snp_indel_variants.pipeline import process_variants

from opentargets_pharmgkb.counts import ClinicalAnnotationCounts
from opentargets_pharmgkb.ontology_apis import get_chebi_iri, get_efo_iri
from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column
from opentargets_pharmgkb.variant_coordinates import get_coordinates_for_clinical_annotation

logging.basicConfig()
logger = logging.getLogger(__package__)
logger.setLevel(level=logging.DEBUG)

ID_COL_NAME = 'Clinical Annotation ID'


def pipeline(clinical_annot_path, clinical_alleles_path, clinical_evidence_path, drugs_path, created_date, output_path):
def pipeline(data_dir, created_date, output_path):
clinical_annot_path = os.path.join(data_dir, 'clinical_annotations.tsv')
clinical_alleles_path = os.path.join(data_dir, 'clinical_ann_alleles.tsv')
clinical_evidence_path = os.path.join(data_dir, 'clinical_ann_evidence.tsv')
drugs_path = os.path.join(data_dir, 'drugs.tsv')
for p in (clinical_annot_path, clinical_alleles_path, clinical_evidence_path, drugs_path):
if not os.path.exists(p):
logger.error(f'Missing required data file: {p}')
raise ValueError(f'Missing required data file: {p}')

clinical_annot_table = read_tsv_to_df(clinical_annot_path)
clinical_alleles_table = read_tsv_to_df(clinical_alleles_path)
clinical_evidence_table = read_tsv_to_df(clinical_evidence_path)
drugs_table = read_tsv_to_df(drugs_path)

merged_table = pd.merge(clinical_annot_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
# Restrict to variants with rsIDs
rs_only_table = merged_table[merged_table['Variant/Haplotypes'].str.contains('rs')]
rs_only_table = clinical_annot_table[clinical_annot_table['Variant/Haplotypes'].str.contains('rs')]

# Gather input counts
counts = ClinicalAnnotationCounts()
counts.clinical_annotations = len(clinical_annot_table)
counts.with_rs = len(rs_only_table)

# Main processing
merged_with_alleles_table = pd.merge(rs_only_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
counts.exploded_alleles = len(merged_with_alleles_table)

mapped_drugs = explode_and_map_drugs(merged_with_alleles_table, drugs_table)
counts.exploded_drugs = len(mapped_drugs)

coordinates_table = get_vcf_coordinates(rs_only_table)
consequences_table = get_functional_consequences(coordinates_table)
mapped_genes = explode_and_map_genes(consequences_table)
mapped_drugs = explode_and_map_drugs(mapped_genes, drugs_table)
mapped_phenotypes = explode_and_map_phenotypes(mapped_drugs)
counts.exploded_phenotypes = len(mapped_phenotypes)

coordinates_table = get_vcf_coordinates(mapped_phenotypes)
consequences_table = get_functional_consequences(coordinates_table)

# Add clinical evidence with PMIDs
pmid_evidence = clinical_evidence_table[clinical_evidence_table['PMID'].notna()]
evidence_table = pd.merge(mapped_phenotypes, pmid_evidence.groupby(by=ID_COL_NAME).aggregate(
evidence_table = pd.merge(consequences_table, pmid_evidence.groupby(by=ID_COL_NAME).aggregate(
publications=('PMID', list)), on=ID_COL_NAME)

# Gather output counts
counts.evidence_strings = len(evidence_table)
counts.with_chebi = evidence_table['chebi'].count()
counts.with_efo = evidence_table['efo'].count()
counts.with_consequence = evidence_table['consequence_term'].count()
# counts.with_pgkb_gene = evidence_table['gene_from_pgkb'].count()
counts.with_vep_gene = evidence_table['overlapping_gene'].count()

# Generate evidence
evidence = [generate_clinical_annotation_evidence(created_date, row) for _, row in evidence_table.iterrows()]
with open(output_path, 'w+') as output:
output.write('\n'.join(json.dumps(ev) for ev in evidence))

# Final count report
gene_comparison_counts(evidence_table, counts, debug_path=f'{output_path.rsplit(".", 1)[0]}_genes.csv')
counts.report()


def read_tsv_to_df(path):
return pd.read_csv(path, sep='\t', dtype=str)
Expand Down Expand Up @@ -116,9 +155,9 @@ def explode_and_map_genes(df):
ensembl_ids = query_biomart(
('hgnc_symbol', 'split_gene'),
('ensembl_gene_id', 'gene_from_pgkb'),
split_genes['split_gene'].drop_duplicates().tolist()
split_genes['split_gene'].dropna().drop_duplicates().tolist()
)
mapped_genes = pd.merge(split_genes, ensembl_ids, on='split_gene')
mapped_genes = pd.merge(split_genes, ensembl_ids, on='split_gene', how='left')
# HGNC could map to more than one ensembl gene id, so must explode again
mapped_genes = mapped_genes.explode('gene_from_pgkb').reset_index(drop=True)
return mapped_genes
Expand Down Expand Up @@ -153,7 +192,9 @@ def explode_and_map_drugs(df, drugs_table):


def chebi_id_to_iri(id_):
return f'http://purl.obolibrary.org/obo/CHEBI_{id_}'
if pd.notna(id_):
return f'http://purl.obolibrary.org/obo/CHEBI_{id_}'
return None


def explode_and_map_phenotypes(df):
Expand Down Expand Up @@ -193,16 +234,16 @@ def generate_clinical_annotation_evidence(created_date, row):
# VARIANT ATTRIBUTES
'variantId': row['vcf_coords'],
'variantRsId': row['Variant/Haplotypes'],
'targetFromSourceId': row['gene_from_pgkb'],
# 'originalSourceGeneId': row['gene_from_pgkb'],
'variantFunctionalConsequenceId': row['consequence_term'],
'variantOverlappingGeneId': row['overlapping_gene'],
'targetFromSourceId': row['overlapping_gene'],

# GENOTYPE ATTRIBUTES
'genotype': row['Genotype/Allele'],
'genotypeAnnotationText': row['Annotation Text'],

# PHENOTYPE ATTRIBUTES
'drugText': row['split_drug'],
'drugFromSource': row['split_drug'],
'drugId': row['chebi'],
'pgxCategory': row['Phenotype Category'],
'phenotypeText': row['split_phenotype'],
Expand All @@ -212,3 +253,21 @@ def generate_clinical_annotation_evidence(created_date, row):
evidence_string = {key: value for key, value in evidence_string.items()
if value and (isinstance(value, list) or pd.notna(value))}
return evidence_string


def gene_comparison_counts(df, counts, debug_path=None):
# Map PGKB genes
mapped_genes = explode_and_map_genes(df)
# Re-group by ID column
genes_table = mapped_genes.groupby(by=ID_COL_NAME).aggregate(
all_pgkb_genes=('gene_from_pgkb', lambda x: set(x.dropna())),
all_vep_genes=('overlapping_gene', lambda x: set(x.dropna()))
)
# Compare sets of genes
counts.annot_with_pgkb_genes = len(genes_table[genes_table['all_pgkb_genes'] != set()])
counts.annot_with_vep_genes = len(genes_table[genes_table['all_vep_genes'] != set()])
neq_genes_table = genes_table[genes_table['all_pgkb_genes'] != genes_table['all_vep_genes']]
counts.pgkb_vep_gene_diff = len(neq_genes_table)
# Debug dump genes table
if debug_path:
neq_genes_table.to_csv(debug_path)
3 changes: 2 additions & 1 deletion opentargets_pharmgkb/pandas_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd


def none_to_nan(x):
Expand All @@ -17,5 +18,5 @@ def explode_column(df, source_col, target_col, sep=';'):
:return: dataframe with target_col added
"""
split_cols = df.assign(**{target_col: df[source_col].str.split(sep)}).explode(target_col).reset_index(drop=True)
split_cols[target_col] = split_cols[target_col].map(lambda x: x.strip())
split_cols[target_col] = split_cols[target_col].map(lambda x: str(x).strip() if pd.notna(x) else np.nan)
return split_cols
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
numpy==1.24.3
pandas==1.5.3
pytest==7.2.2
requests==2.28.2
requests==2.31.0
retry==0.9.2
cmat @ git+https://github.com/apriltuesday/eva-opentargets.git@refactor#egg=cmat
cmat @ git+https://github.com/EBIvariation/eva-opentargets.git#egg=cmat
5 changes: 4 additions & 1 deletion tests/resources/clinical_ann_alleles.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,7 @@ Clinical Annotation ID Genotype/Allele Annotation Text Allele Function
1444668808 del/del Hypertensive patients with the rs1799752 del/del genotype may have a decreased response to irbesartan as compared to patients with the ATACAGTCACTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCC/ATACAGTCACTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCC genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to irbesartan.
1451114960 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2 Patients with the rs45445694 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2 or 2R/2R genotype may have an increased response or survival when treated with fluorouracil chemotherapy regimens as compared to patients with the 2R/3R or 3R/3R genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to fluorouracil chemotherapy.
1451114960 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 Patients with the rs45445694 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 or 2R/3R genotype may have an increased response or survival when treated with fluorouracil chemotherapy regimens as compared to patients with the 3R/3R genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to fluorouracil chemotherapy.
1451114960 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 Patients with the rs45445694 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 or 3R/3R genotype may have a decreased response or survival when treated with fluorouracil chemotherapy regimens as compared to patients with the 2R/2R or 2R/3R genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to fluorouracil chemotherapy.
1451114960 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 Patients with the rs45445694 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 or 3R/3R genotype may have a decreased response or survival when treated with fluorouracil chemotherapy regimens as compared to patients with the 2R/2R or 2R/3R genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to fluorouracil chemotherapy.
1449566379 AA Patients with the AA genotype and hypertension may have an increased response to hydrochlorothiazide treatment, as measured by decreases in systolic and diastolic blood pressure, as compared to patients with the AG or GG genotypes. Other genetic and clinical factors may also affect a patient's response to hydrochlorothiazide.
1449566379 AG Patients with the AG genotype and hypertension may have an increased response to hydrochlorothiazide treatment, as measured by decreases in systolic and diastolic blood pressure, as compared to patients with the GG genotype, but a decreased response as compared to patients with the AA genotype. Other genetic and clinical factors may also affect a patient's response to hydrochlorothiazide.
1449566379 GG Patients with the GG genotype and hypertension may have a decreased response to hydrochlorothiazide treatment, as measured by decreases in systolic and diastolic blood pressure, as compared to patients with the AA or AG genotypes. Other genetic and clinical factors may also affect a patient's response to hydrochlorothiazide.
1 change: 1 addition & 0 deletions tests/resources/clinical_ann_evidence.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,4 @@ Clinical Annotation ID Evidence ID Evidence Type Evidence URL PMID Summary Score
1451114960 827825053 Variant Phenotype Annotation https://www.pharmgkb.org/variantAnnotation/827825053 16249645 Genotype (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 is associated with increased survival when treated with cisplatin and fluorouracil in people with Stomach Neoplasms. 0.25
1451114960 769169082 Variant Drug Annotation https://www.pharmgkb.org/variantAnnotation/769169082 11913730 Genotype (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2 is associated with increased response to fluorouracil in people with Colorectal Neoplasms as compared to genotypes (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 + (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3. 1.5
1451114960 1449155868 Variant Phenotype Annotation https://www.pharmgkb.org/variantAnnotation/1449155868 28972045 Genotype (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2 is associated with increased overall survival when treated with cisplatin, epirubicin and fluorouracil in people with Esophageal Neoplasms or Stomach Neoplasms as compared to genotypes (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 + (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3. 2.5
1449566379 1449566369 Variant Drug Annotation https://www.pharmgkb.org/variantAnnotation/1449566369 29925376 Genotypes AA + AG is associated with increased response to hydrochlorothiazide in people with Hypertension as compared to genotype GG. 2.5
1 change: 1 addition & 0 deletions tests/resources/clinical_annotations.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ Clinical Annotation ID Variant/Haplotypes Gene Level of Evidence Level Override
1448603303 rs3766246 FAAH 3 3.0 Toxicity 1 1 morphine 2021-03-24 https://www.pharmgkb.org/clinicalAnnotation/1448603303 Pediatric
1444668808 rs1799752 ACE 3 Tier 1 VIP 0.75 Efficacy 2 2 irbesartan Hypertension;Hypertrophy, Left Ventricular 2021-09-09 https://www.pharmgkb.org/clinicalAnnotation/1444668808
1451114960 rs45445694 TYMS 3 Tier 1 VIP 3.25 Efficacy 13 13 fluorouracil;FOLFIRI;FOLFOX overall survival;progression-free survival 2021-03-24 https://www.pharmgkb.org/clinicalAnnotation/1451114960
1449566379 rs11065987 3 2.5 Efficacy 1 1 hydrochlorothiazide Hypertension 2021-03-24 https://www.pharmgkb.org/clinicalAnnotation/1449566379
Loading

0 comments on commit b2fc26b

Please sign in to comment.