Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
remomomo committed May 19, 2022
2 parents 54f3e3c + bfa1049 commit 5dd6a20
Show file tree
Hide file tree
Showing 23 changed files with 1,918 additions and 42 deletions.
14 changes: 13 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ import gzip

configfile: "conf/config.yaml"

#####################
# utility functions #
#####################

def get_mem_mb(mem_mb=10000, factor=1.5):
    """Build a resource callable that scales memory with retry attempts.

    Returns a function suitable for a Snakemake ``resources: mem_mb=``
    entry: attempt 1 requests ``mem_mb`` megabytes, and every retry
    multiplies the request by ``factor`` (exponential back-off).

    :param int mem_mb: base memory request in megabytes for the first attempt
    :param float factor: multiplier applied per additional attempt
    :return: callable ``(wildcards, attempt) -> int`` memory request in MB
    """
    def gm(wildcards, attempt):
        # int() because cluster schedulers expect whole megabytes;
        # factor ** 0 == 1.0 would otherwise make even attempt 1 a float
        return int(factor ** (attempt - 1) * mem_mb)
    return gm


###################################
# samplesheet for the plink files #
###################################
Expand Down Expand Up @@ -46,7 +57,7 @@ wildcard_constraints:
# local rules #
###############

localrules: link_all, install_sh, install_seak, setup_all, run_deepripe_vep_all, run_ensembl_vep_all, evep_missense_proc_all, splice_ai_filter_and_overlap_with_genotypes_all, splice_ai_vcf_to_tsv_all, filter_variants_all, export_plof_burden_all, export_missense_burden_all, assoc_baseline_scoretest_all, mac_report_all, assoc_spliceai_linw_all, assoc_deepripe_single_localcollapsing_all, pLOF_nsnp_cummac_all
localrules: link_all, link_genotypes, install_sh, install_seak, setup_all, run_deepripe_vep_all, run_ensembl_vep_all, evep_missense_proc_all, splice_ai_filter_and_overlap_with_genotypes_all, splice_ai_vcf_to_tsv_all, filter_variants_all, export_plof_burden_all, export_missense_burden_all, assoc_baseline_scoretest_all, mac_report_all, assoc_spliceai_linw_all, assoc_deepripe_single_localcollapsing_all, pLOF_nsnp_cummac_all, complete_cases_ancestry, gather_complete_cases


###############
Expand Down Expand Up @@ -156,4 +167,5 @@ include: "snake/deepripe.smk"
include: "snake/evep.smk"
include: "snake/splice_ai.smk"
include: "snake/association.smk"
include: "snake/conditional_tests.smk"
include: "snake/results_processing.smk"
1 change: 1 addition & 0 deletions env/snakemake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ dependencies:
- python=3.8
- snakemake-minimal
- pandas
- mamba
29 changes: 29 additions & 0 deletions run_cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Submit the Snakemake control job to SLURM; extra args are forwarded to snakemake.

#SBATCH --job-name=controljob_%j
#SBATCH --output=snakemake_%j.log
#SBATCH --partition=vcpu
#SBATCH --time=48:00:00
#SBATCH -c 1
#SBATCH --mem 2000

# Initialize conda so "conda activate" works in this non-interactive shell:
eval "$(conda shell.bash hook)"

snakemake_env="install/snakemake"

# Bootstrap the environments on first use.
# Fix: quote the variable — unquoted it is subject to word splitting/globbing.
if [ ! -d "$snakemake_env" ]; then
    ./install.sh
fi

#conda activate $snakemake_env

# NOTE(review): this activates a globally installed "snakemake" environment
# rather than the $snakemake_env created by install.sh — confirm intentional.
conda activate snakemake

snakemake --snakefile Snakefile \
    --configfile conf/config.yaml \
    --profile ./slurm \
    --directory "${PWD}" \
    "${@}"


179 changes: 179 additions & 0 deletions script/python/assoc_baseline_scoretest_conditional_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@



import logging
import sys

logging.basicConfig(filename=snakemake.log[0], level=logging.INFO)

import gzip

import numpy as np
import pandas as pd

from seak.scoretest import ScoretestNoK
from seak.data_loaders import intersect_ids, CovariatesLoaderCSV, VariantLoaderSnpReader
from pysnptools.snpreader import Bed
from util.association import BurdenLoaderHDF5
from util import Timer


'''
This script is the baseline that we later compare to.
It loops over genes and performs score-tests, using pre-computed indicator variables (0/1),
conditioning each test on a per-gene list of common variants.
'''


# covariates loader: covariates + the phenotype column for this run
covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype,
                                       snakemake.input.covariates_tsv,
                                       snakemake.params.covariate_column_names,
                                       sep='\t',
                                       path_to_phenotypes=snakemake.input.phenotypes_tsv)

# set up burden loaders (pre-computed 0/1 carrier indicators per gene)
bloader_lof = BurdenLoaderHDF5(snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof)
bloader_missense = BurdenLoaderHDF5(snakemake.input.h5_missense, snakemake.input.iid_missense, snakemake.input.gid_missense)

# make sure individuals are in the same order as the covariates
bloader_lof.update_individuals(covariatesloader.get_iids())
bloader_missense.update_individuals(covariatesloader.get_iids())

# gene names to iterate over
genes = np.union1d(bloader_lof.get_vids(), bloader_missense.get_vids())
if isinstance(genes, str):
    # a single gene comes back as a bare string; normalize to a list
    genes = [genes]

# keep only those that are significant!
# (1) load the baseline results table
results = pd.read_csv(snakemake.input.results_tsv, sep='\t', na_values='.')

# the p-value columns to consider
sigcols = snakemake.params.columns
if isinstance(sigcols, str):
    sigcols = [sigcols]

pvcols_score = sigcols
results = results[['gene', 'nCarrier_pLOF', 'nCarrier_missense'] + pvcols_score]
results['pheno'] = snakemake.params.phenotype
results = results.loc[results.gene.isin(genes)]
# gene ids look like "<id>_<gene_name>"; extract the name part
results['gene_name'] = results['gene'].str.split('_', expand=True)[1]
genes = results.gene.tolist()

# (2) load the list of gene/phenotype pairs that have variants to condition on
_conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0, usecols=['gene_name', 'pheno'])
_conditional_list = _conditional_list[(_conditional_list.pheno == snakemake.wildcards['pheno']) | (_conditional_list.pheno == snakemake.params.phenotype)]
_conditional_list = _conditional_list.drop_duplicates()

if len(_conditional_list) == 0:
    # fix: the original message called .format() on a string with no placeholder,
    # silently dropping the phenotype argument
    logging.info('No genes pass significance threshold for phenotype {}.'.format(snakemake.params.phenotype))
    with gzip.open(snakemake.output.results_tsv, 'wt') as outfile:
        outfile.write('# no significant hits for phenotype {}. \n'.format(snakemake.params.phenotype))
    sys.exit(0)
else:
    results = results.merge(_conditional_list, how='right', on=['gene_name', 'pheno'])

# (3) keep only genes below the significance threshold in any of the p-value columns
sig_genes = [results.gene[results[k] < snakemake.params.significance_cutoff].values for k in pvcols_score]
sig_genes = np.unique(np.concatenate(sig_genes))

if len(sig_genes) == 0:
    logging.info('No genes pass significance threshold.')
    with gzip.open(snakemake.output.results_tsv, 'wt') as outfile:
        outfile.write('# no hits below specified threshold ({}) for phenotype {}. \n'.format(snakemake.params.significance_cutoff, snakemake.params.phenotype))
    sys.exit(0)
results = results.loc[results.gene.isin(sig_genes)].copy()
genes = results.gene.tolist()

logging.info('About to evaluate variants in {} genes'.format(len(results.gene.unique())))

# conditional analysis:
# these genotypes will be used as covariates for the conditional tests
geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1))
geno_cond.update_individuals(covariatesloader.get_iids())

# this file contains the mapping of associations to SNPs to condition on
conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0)
conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno'])].drop_duplicates()
conditional_list = conditional_list.loc[conditional_list.gene_name.isin(results.gene_name)]

# restrict the genotype loader to SNPs that are both requested and available
geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids()))
logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids())))

# set up the null model (no kinship / random effect)
Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('NoK')
null_model = ScoretestNoK(Y, X)

logging.info('Phenotype: {}, Sample size: {}'.format(snakemake.params.phenotype, len(Y)))

def test_gene(gene):
    '''Run conditional score tests for a single gene.

    Tests the pLOF burden, the missense burden, and their element-wise
    maximum ("mrg") against the null model, conditioning on the common
    variants listed for this gene in ``conditional_list``.

    :param str gene: gene identifier of the form "<id>_<gene_name>"
    :return: dict with p-values, carrier counts and the SNPs conditioned on
    '''

    # conditional analysis
    # set up the function to load the variants to condition on

    def get_conditional(gene):
        # SNP ids mapped to this gene's name (second "_"-separated token)
        cond_snps_vid = conditional_list.loc[conditional_list.gene_name == gene.split('_')[1], 'snp_rsid'].unique().tolist()

        temp_genotypes, temp_vids = geno_cond.genotypes_by_id(cond_snps_vid, return_pos=False)
        # mean-center; missing genotypes then become 0 (the centered mean)
        temp_genotypes -= np.nanmean(temp_genotypes, axis=0)

        cond_snps = np.ma.masked_invalid(temp_genotypes).filled(0.)

        return cond_snps, temp_vids

    pval_dict = {}
    pval_dict['gene'] = gene

    def call(GV, G2, name):
        # calls the test, appends results to pval_dict
        if GV is None:
            pval_dict['pv_' + name] = np.nan
            pval_dict['nCarrier_' + name] = 0
        else:
            pv = null_model.pv_alt_model(GV, G2)
            if pv < 0.:
                # numerical issue in the default method; retry with saddle-point approximation
                logging.warning('negative value encountered in p-value computation for gene {}, p-value: {}, using saddle instead.'.format(gene, pv))
                pv = null_model.pv_alt_model(GV, G2, method='saddle')
            pval_dict['pv_' + name] = pv
            pval_dict['nCarrier_' + name] = int(GV.sum())

    # Load the protein lof burdens
    try:
        Glof = bloader_lof.genotypes_by_id(gene).astype(float)
    except KeyError:
        Glof = None

    # Load the missense burdens
    try:
        Gmiss = bloader_missense.genotypes_by_id(gene).astype(float)
    except KeyError:
        Gmiss = None

    G2, cond_vid = get_conditional(gene)

    call(Glof, G2, 'pLOF')
    call(Gmiss, G2, 'missense')

    if (Glof is None) or (Gmiss is None):
        # bugfix: original read call(None, 'mrg'), missing the G2 positional
        # argument -> TypeError whenever either burden was absent for a gene
        call(None, G2, 'mrg')
    else:
        # merged burden: carrier in either category
        G = np.maximum(Glof, Gmiss)
        call(G, G2, 'mrg')

    pval_dict['cond_snps'] = ','.join(cond_vid)

    return pval_dict


# run every gene through the conditional test and assemble one results table
stopwatch = Timer()
results = pd.DataFrame.from_dict(list(map(test_gene, genes))).set_index('gene')

elapsed = stopwatch.check()
logging.info('{} genes tested in {:.2f} minutes.'.format(len(results), elapsed / 60.))

# missing values (e.g. absent burdens) are written as '.'
results.to_csv(snakemake.output.results_tsv, sep='\t', index=True, na_rep='.')

5 changes: 2 additions & 3 deletions script/python/assoc_sclrt_kernels_deepripe_multiple.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,8 @@ def get_rbp(interval):
if temp_genotypes is None:
raise GotNone

nanmean = np.nanmean(temp_genotypes, axis=0)

ncarrier = np.nansum(np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes), axis=1) >= 1) # calculated differently here because alleles can be flipped!
nanmean = np.nanmean(temp_genotypes, axis=0)
ncarrier = np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes) > 0, axis=0)# calculated differently here because alleles can be flipped!

G1 = temp_genotypes - nanmean
G1 = np.ma.masked_invalid(G1).filled(0.)
Expand Down
Loading

0 comments on commit 5dd6a20

Please sign in to comment.