Merge pull request #13 from apriltuesday/issue-10

Add counts and fix bugs from test run
EBIvariation · Jul 17, 2023 · b2fc26b · b2fc26b
2 parents 3ae0a92 + aa24d59
commit b2fc26b
Show file tree

Hide file tree

Showing 12 changed files with 269 additions and 161 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,17 @@
 # opentargets-pharmgkb
 Pipeline to provide evidence strings for Open Targets from PharmGKB
+
+```
+# Download data
+DATA_DIR=<directory for data>
+wget https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
+wget https://api.pharmgkb.org/v1/download/file/data/drugs.zip
+
+unzip -j clinicalAnnotations.zip "*.tsv" -d $DATA_DIR
+unzip -j clinicalAnnotations.zip "CREATED*.txt" -d $DATA_DIR
+unzip -j drugs.zip "*.tsv" -d $DATA_DIR
+rm clinicalAnnotations.zip drugs.zip
+
+# Run pipeline
+generate_evidence.py --data-dir $DATA_DIR --created-date <created date> --output-path evidence.json
+```
diff --git a/bin/generate_evidence.py b/bin/generate_evidence.py
@@ -4,21 +4,15 @@
 from opentargets_pharmgkb import evidence_generation
 
 parser = argparse.ArgumentParser('Generates Open Targets evidence strings from PharmGKB data')
-parser.add_argument('--clinical-annot-path', help='Path to clinical_annotations.tsv', required=True)
-parser.add_argument('--clinical-alleles-path', help='Path to clinical_ann_alleles.tsv', required=True)
-parser.add_argument('--clinical-evidence-path', help='Path to clinical_ann_evidence.tsv', required=True)
-parser.add_argument('--drugs-path', help='Path to drugs.tsv', required=True)
+parser.add_argument('--data-dir', help='Directory containing necessary .tsv files from PharmGKB', required=True)
 parser.add_argument('--created-date', help='Created date of downloaded files (provided by PharmGKB)', required=True)
 parser.add_argument('--output-path', help='Path to output evidence strings', required=True)
 
 
 if __name__ == '__main__':
     args = parser.parse_args()
     evidence_generation.pipeline(
-        clinical_annot_path=args.clinical_annot_path,
-        clinical_alleles_path=args.clinical_alleles_path,
-        clinical_evidence_path=args.clinical_evidence_path,
-        drugs_path=args.drugs_path,
+        data_dir=args.data_dir,
         created_date=args.created_date,
         output_path=args.output_path
     )
diff --git a/opentargets_pharmgkb/counts.py b/opentargets_pharmgkb/counts.py
@@ -0,0 +1,41 @@
+class ClinicalAnnotationCounts:
+    """Simple class to hold counts for generating clinical annotation evidence."""
+
+    def __init__(self):
+        # Input counts (before annotation and exploding by drug, etc.)
+        self.clinical_annotations = 0
+        self.with_rs = 0
+        # Counts after exploding by each attribute
+        self.exploded_alleles = 0
+        self.exploded_drugs = 0
+        self.exploded_phenotypes = 0
+        # Output counts (after annotation and exploding)
+        self.evidence_strings = 0
+        self.with_chebi = 0
+        self.with_efo = 0
+        self.with_consequence = 0
+        # self.with_pgkb_gene = 0
+        self.with_vep_gene = 0
+        # Evaluation counts - after annotation but without exploding
+        self.annot_with_pgkb_genes = 0
+        self.annot_with_vep_genes = 0
+        self.pgkb_vep_gene_diff = 0
+
+    def report(self):
+        report_str = f'\nTotal clinical annotations: {self.clinical_annotations}\n'
+        report_str += f'\tWith RS: {self.with_rs}\n'
+        report_str += f'Exploded by allele: {self.exploded_alleles}\n'
+        report_str += f'Exploded by drug: {self.exploded_drugs}\n'
+        report_str += f'Exploded by phenotype: {self.exploded_phenotypes}\n'
+        report_str += f'Total evidence strings: {self.evidence_strings}\n'
+        report_str += f'\tWith CHEBI: {self.with_chebi}\n'
+        report_str += f'\tWith EFO phenotype: {self.with_efo}\n'
+        report_str += f'\tWith functional consequence: {self.with_consequence}\n'
+        # report_str += f'\tWith PGKB gene: {self.with_pgkb_gene}\n'
+        report_str += f'\tWith VEP gene: {self.with_vep_gene}\n'
+        report_str += f'Gene comparisons per annotation\n'
+        report_str += f'\tWith PGKB genes: {self.annot_with_pgkb_genes}\n'
+        report_str += f'\tWith VEP genes: {self.annot_with_vep_genes}\n'
+        report_str += f'\tPGKB genes != VEP genes: {self.pgkb_vep_gene_diff}\n'
+        print(report_str)
+        return report_str
diff --git a/opentargets_pharmgkb/evidence_generation.py b/opentargets_pharmgkb/evidence_generation.py
@@ -1,44 +1,83 @@
 import json
+import logging
 import multiprocessing
+import os
 from itertools import zip_longest
 
 import pandas as pd
 from cmat.consequence_prediction.common.biomart import query_biomart
 from cmat.consequence_prediction.snp_indel_variants.pipeline import process_variants
 
+from opentargets_pharmgkb.counts import ClinicalAnnotationCounts
 from opentargets_pharmgkb.ontology_apis import get_chebi_iri, get_efo_iri
 from opentargets_pharmgkb.pandas_utils import none_to_nan, explode_column
 from opentargets_pharmgkb.variant_coordinates import get_coordinates_for_clinical_annotation
 
+logging.basicConfig()
+logger = logging.getLogger(__package__)
+logger.setLevel(level=logging.DEBUG)
+
 ID_COL_NAME = 'Clinical Annotation ID'
 
 
-def pipeline(clinical_annot_path, clinical_alleles_path, clinical_evidence_path, drugs_path, created_date, output_path):
+def pipeline(data_dir, created_date, output_path):
+    clinical_annot_path = os.path.join(data_dir, 'clinical_annotations.tsv')
+    clinical_alleles_path = os.path.join(data_dir, 'clinical_ann_alleles.tsv')
+    clinical_evidence_path = os.path.join(data_dir, 'clinical_ann_evidence.tsv')
+    drugs_path = os.path.join(data_dir, 'drugs.tsv')
+    for p in (clinical_annot_path, clinical_alleles_path, clinical_evidence_path, drugs_path):
+        if not os.path.exists(p):
+            logger.error(f'Missing required data file: {p}')
+            raise ValueError(f'Missing required data file: {p}')
+
     clinical_annot_table = read_tsv_to_df(clinical_annot_path)
     clinical_alleles_table = read_tsv_to_df(clinical_alleles_path)
     clinical_evidence_table = read_tsv_to_df(clinical_evidence_path)
     drugs_table = read_tsv_to_df(drugs_path)
 
-    merged_table = pd.merge(clinical_annot_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
     # Restrict to variants with rsIDs
-    rs_only_table = merged_table[merged_table['Variant/Haplotypes'].str.contains('rs')]
+    rs_only_table = clinical_annot_table[clinical_annot_table['Variant/Haplotypes'].str.contains('rs')]
+
+    # Gather input counts
+    counts = ClinicalAnnotationCounts()
+    counts.clinical_annotations = len(clinical_annot_table)
+    counts.with_rs = len(rs_only_table)
+
+    # Main processing
+    merged_with_alleles_table = pd.merge(rs_only_table, clinical_alleles_table, on=ID_COL_NAME, how='left')
+    counts.exploded_alleles = len(merged_with_alleles_table)
+
+    mapped_drugs = explode_and_map_drugs(merged_with_alleles_table, drugs_table)
+    counts.exploded_drugs = len(mapped_drugs)
 
-    coordinates_table = get_vcf_coordinates(rs_only_table)
-    consequences_table = get_functional_consequences(coordinates_table)
-    mapped_genes = explode_and_map_genes(consequences_table)
-    mapped_drugs = explode_and_map_drugs(mapped_genes, drugs_table)
     mapped_phenotypes = explode_and_map_phenotypes(mapped_drugs)
+    counts.exploded_phenotypes = len(mapped_phenotypes)
+
+    coordinates_table = get_vcf_coordinates(mapped_phenotypes)
+    consequences_table = get_functional_consequences(coordinates_table)
 
     # Add clinical evidence with PMIDs
     pmid_evidence = clinical_evidence_table[clinical_evidence_table['PMID'].notna()]
-    evidence_table = pd.merge(mapped_phenotypes, pmid_evidence.groupby(by=ID_COL_NAME).aggregate(
+    evidence_table = pd.merge(consequences_table, pmid_evidence.groupby(by=ID_COL_NAME).aggregate(
         publications=('PMID', list)), on=ID_COL_NAME)
 
+    # Gather output counts
+    counts.evidence_strings = len(evidence_table)
+    counts.with_chebi = evidence_table['chebi'].count()
+    counts.with_efo = evidence_table['efo'].count()
+    counts.with_consequence = evidence_table['consequence_term'].count()
+    # counts.with_pgkb_gene = evidence_table['gene_from_pgkb'].count()
+    counts.with_vep_gene = evidence_table['overlapping_gene'].count()
+
     # Generate evidence
     evidence = [generate_clinical_annotation_evidence(created_date, row) for _, row in evidence_table.iterrows()]
     with open(output_path, 'w+') as output:
         output.write('\n'.join(json.dumps(ev) for ev in evidence))
 
+    # Final count report
+    gene_comparison_counts(evidence_table, counts, debug_path=f'{output_path.rsplit(".", 1)[0]}_genes.csv')
+    counts.report()
+
 
 def read_tsv_to_df(path):
     return pd.read_csv(path, sep='\t', dtype=str)
@@ -116,9 +155,9 @@ def explode_and_map_genes(df):
     ensembl_ids = query_biomart(
         ('hgnc_symbol', 'split_gene'),
         ('ensembl_gene_id', 'gene_from_pgkb'),
-        split_genes['split_gene'].drop_duplicates().tolist()
+        split_genes['split_gene'].dropna().drop_duplicates().tolist()
     )
-    mapped_genes = pd.merge(split_genes, ensembl_ids, on='split_gene')
+    mapped_genes = pd.merge(split_genes, ensembl_ids, on='split_gene', how='left')
     # HGNC could map to more than one ensembl gene id, so must explode again
     mapped_genes = mapped_genes.explode('gene_from_pgkb').reset_index(drop=True)
     return mapped_genes
@@ -153,7 +192,9 @@ def explode_and_map_drugs(df, drugs_table):
 
 
 def chebi_id_to_iri(id_):
-    return f'http://purl.obolibrary.org/obo/CHEBI_{id_}'
+    if pd.notna(id_):
+        return f'http://purl.obolibrary.org/obo/CHEBI_{id_}'
+    return None
 
 
 def explode_and_map_phenotypes(df):
@@ -193,16 +234,16 @@ def generate_clinical_annotation_evidence(created_date, row):
         # VARIANT ATTRIBUTES
         'variantId': row['vcf_coords'],
         'variantRsId': row['Variant/Haplotypes'],
-        'targetFromSourceId': row['gene_from_pgkb'],
+        # 'originalSourceGeneId': row['gene_from_pgkb'],
         'variantFunctionalConsequenceId': row['consequence_term'],
-        'variantOverlappingGeneId': row['overlapping_gene'],
+        'targetFromSourceId': row['overlapping_gene'],
 
         # GENOTYPE ATTRIBUTES
         'genotype': row['Genotype/Allele'],
         'genotypeAnnotationText': row['Annotation Text'],
 
         # PHENOTYPE ATTRIBUTES
-        'drugText': row['split_drug'],
+        'drugFromSource': row['split_drug'],
         'drugId': row['chebi'],
         'pgxCategory': row['Phenotype Category'],
         'phenotypeText': row['split_phenotype'],
@@ -212,3 +253,21 @@ def generate_clinical_annotation_evidence(created_date, row):
     evidence_string = {key: value for key, value in evidence_string.items()
                        if value and (isinstance(value, list) or pd.notna(value))}
     return evidence_string
+
+
+def gene_comparison_counts(df, counts, debug_path=None):
+    # Map PGKB genes
+    mapped_genes = explode_and_map_genes(df)
+    # Re-group by ID column
+    genes_table = mapped_genes.groupby(by=ID_COL_NAME).aggregate(
+        all_pgkb_genes=('gene_from_pgkb', lambda x: set(x.dropna())),
+        all_vep_genes=('overlapping_gene', lambda x: set(x.dropna()))
+    )
+    # Compare sets of genes
+    counts.annot_with_pgkb_genes = len(genes_table[genes_table['all_pgkb_genes'] != set()])
+    counts.annot_with_vep_genes = len(genes_table[genes_table['all_vep_genes'] != set()])
+    neq_genes_table = genes_table[genes_table['all_pgkb_genes'] != genes_table['all_vep_genes']]
+    counts.pgkb_vep_gene_diff = len(neq_genes_table)
+    # Debug dump genes table
+    if debug_path:
+        neq_genes_table.to_csv(debug_path)
diff --git a/opentargets_pharmgkb/pandas_utils.py b/opentargets_pharmgkb/pandas_utils.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 
 
 def none_to_nan(x):
@@ -17,5 +18,5 @@ def explode_column(df, source_col, target_col, sep=';'):
     :return: dataframe with target_col added
     """
     split_cols = df.assign(**{target_col: df[source_col].str.split(sep)}).explode(target_col).reset_index(drop=True)
-    split_cols[target_col] = split_cols[target_col].map(lambda x: x.strip())
+    split_cols[target_col] = split_cols[target_col].map(lambda x: str(x).strip() if pd.notna(x) else np.nan)
     return split_cols
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 numpy==1.24.3
 pandas==1.5.3
 pytest==7.2.2
-requests==2.28.2
+requests==2.31.0
 retry==0.9.2
-cmat @ git+https://github.com/apriltuesday/eva-opentargets.git@refactor#egg=cmat
+cmat @ git+https://github.com/EBIvariation/eva-opentargets.git#egg=cmat
diff --git a/tests/resources/clinical_ann_alleles.tsv b/tests/resources/clinical_ann_alleles.tsv
@@ -26,4 +26,7 @@ Clinical Annotation ID	Genotype/Allele	Annotation Text	Allele Function
 1444668808	del/del	Hypertensive patients with the rs1799752 del/del genotype may have a decreased response to irbesartan as compared to patients with the ATACAGTCACTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCC/ATACAGTCACTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCC genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to irbesartan.	
 1451114960	(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2	Patients with the rs45445694 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2 or 2R/2R genotype may have an increased response or survival when treated with fluorouracil chemotherapy regimens as compared to patients with the 2R/3R or 3R/3R genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to fluorouracil chemotherapy.	
 1451114960	(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3	Patients with the rs45445694 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 or 2R/3R genotype may have an increased response or survival when treated with fluorouracil chemotherapy regimens as compared to patients with the 3R/3R genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to fluorouracil chemotherapy.	
-1451114960	(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3	Patients with the rs45445694 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 or 3R/3R genotype may have a decreased response or survival when treated with fluorouracil chemotherapy regimens as compared to patients with the 2R/2R or 2R/3R genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to fluorouracil chemotherapy.	
+1451114960	(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3	Patients with the rs45445694 (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 or 3R/3R genotype may have a decreased response or survival when treated with fluorouracil chemotherapy regimens as compared to patients with the 2R/2R or 2R/3R genotype. However, conflicting evidence has been reported. Other genetic and clinical factors may also influence response to fluorouracil chemotherapy.
+1449566379	AA	Patients with the AA genotype and hypertension may have an increased response to hydrochlorothiazide treatment, as measured by decreases in systolic and diastolic blood pressure, as compared to patients with the AG or GG genotypes. Other genetic and clinical factors may also affect a patient's response to hydrochlorothiazide.
+1449566379	AG	Patients with the AG genotype and hypertension may have an increased response to hydrochlorothiazide treatment, as measured by decreases in systolic and diastolic blood pressure, as compared to patients with the GG genotype, but a decreased response as compared to patients with the AA genotype. Other genetic and clinical factors may also affect a patient's response to hydrochlorothiazide.
+1449566379	GG	Patients with the GG genotype and hypertension may have a decreased response to hydrochlorothiazide treatment, as measured by decreases in systolic and diastolic blood pressure, as compared to patients with the AA or AG genotypes. Other genetic and clinical factors may also affect a patient's response to hydrochlorothiazide.
diff --git a/tests/resources/clinical_ann_evidence.tsv b/tests/resources/clinical_ann_evidence.tsv
@@ -69,3 +69,4 @@ Clinical Annotation ID	Evidence ID	Evidence Type	Evidence URL	PMID	Summary	Score
 1451114960	827825053	Variant Phenotype Annotation	https://www.pharmgkb.org/variantAnnotation/827825053	16249645	Genotype (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 is associated with increased survival when treated with cisplatin and fluorouracil in people with Stomach Neoplasms.	0.25
 1451114960	769169082	Variant Drug Annotation	https://www.pharmgkb.org/variantAnnotation/769169082	11913730	Genotype (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2 is associated with increased response to fluorouracil in people with Colorectal Neoplasms as compared to genotypes (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 + (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3.	1.5
 1451114960	1449155868	Variant Phenotype Annotation	https://www.pharmgkb.org/variantAnnotation/1449155868	28972045	Genotype (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2 is associated with increased overall survival when treated with cisplatin, epirubicin and fluorouracil in people with Esophageal Neoplasms or Stomach Neoplasms as compared to genotypes (CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3 + (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3.	2.5
+1449566379	1449566369	Variant Drug Annotation	https://www.pharmgkb.org/variantAnnotation/1449566369	29925376	Genotypes AA + AG is associated with increased response to hydrochlorothiazide in people with Hypertension as compared to genotype GG.	2.5
diff --git a/tests/resources/clinical_annotations.tsv b/tests/resources/clinical_annotations.tsv
@@ -8,3 +8,4 @@ Clinical Annotation ID	Variant/Haplotypes	Gene	Level of Evidence	Level Override
 1448603303	rs3766246	FAAH	3			3.0	Toxicity	1	1	morphine		2021-03-24	https://www.pharmgkb.org/clinicalAnnotation/1448603303	Pediatric
 1444668808	rs1799752	ACE	3		Tier 1 VIP	0.75	Efficacy	2	2	irbesartan	Hypertension;Hypertrophy, Left Ventricular	2021-09-09	https://www.pharmgkb.org/clinicalAnnotation/1444668808	
 1451114960	rs45445694	TYMS	3		Tier 1 VIP	3.25	Efficacy	13	13	fluorouracil;FOLFIRI;FOLFOX	overall survival;progression-free survival	2021-03-24	https://www.pharmgkb.org/clinicalAnnotation/1451114960	
+1449566379	rs11065987		3			2.5	Efficacy	1	1	hydrochlorothiazide	Hypertension	2021-03-24	https://www.pharmgkb.org/clinicalAnnotation/1449566379