From 8c6c7e8ae8fc07a420302dd3f281990d7d07fd87 Mon Sep 17 00:00:00 2001 From: remomomo Date: Wed, 27 Jul 2022 09:24:08 +0200 Subject: [PATCH] added scripts to get calibrated h2-estimates (*_eval_top_hits_h2) --- ...nels_deepripe_multiple_eval_top_hits_h2.py | 548 ++++++++++++++++++ ...sclrt_kernels_missense_eval_top_hits_h2.py | 493 ++++++++++++++++ ...sclrt_kernels_spliceai_eval_top_hits_h2.py | 511 ++++++++++++++++ snake/association.smk | 138 +++++ 4 files changed, 1690 insertions(+) create mode 100644 script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py create mode 100644 script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py create mode 100644 script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py diff --git a/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py b/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py new file mode 100644 index 0000000..015fde9 --- /dev/null +++ b/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py @@ -0,0 +1,548 @@ +import os +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging + +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np +import h5py +import gzip +import pickle +import sys + +from numpy.linalg import cholesky, LinAlgError +from sklearn.metrics.pairwise import rbf_kernel, cosine_similarity + +# needed to calculate Bayesian h2 confidence intervals +from scipy.integrate import trapezoid +from scipy.interpolate import interp1d + +# seak imports +from seak.data_loaders import intersect_ids, Hdf5Loader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK + +from pysnptools.snpreader import Bed + +from util import Timer + + +class GotNone(Exception): + pass + + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) +null_model_lrt = LRTnoK(X, Y) + + +# set up function to filter variants: +def maf_filter(mac_report): + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + # note: We don't filter out the variants for which alt/ref are "flipped" for the RBPs + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0)] + + return mac_report.set_index('SNP').loc[vids] + + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + + + +def vep_filter(h5_rbp, bed_rbp): + # returns variant ids for variants that pass filtering threshold and mask for the hdf5loader + + 
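    # read the DeepRiPe diffscores and labels, restrict to the RBPs of interest, and keep
    # variants whose largest absolute diffscore reaches the impact threshold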
with h5py.File(h5_rbp, 'r') as f: + + rbp_of_interest = snakemake.params.rbp_of_interest + try: + # in newer version of h5py the strings are not decoded -> decode them + labels = [l.decode() for l in f['labels'][:]] + except AttributeError: + # this used to work in older versions of h5py: + labels = list(f['labels'][:]) + + if isinstance(rbp_of_interest, str): + rbp_of_interest = [rbp_of_interest] + + assert all([rbp in labels for rbp in rbp_of_interest]), 'Error: not all of {} are in labels: {}'.format(rbp_of_interest, labels) + + keep = list(i for i, l in enumerate(labels) if l in rbp_of_interest) + diffscores = f['diffscore'][:, keep] + + rbp_mask = np.array([i in rbp_of_interest for i in labels]) + + if np.ndim(diffscores) == 1: + diffscores = diffscores[:, np.newaxis] + + vid_mask = np.max(np.abs(diffscores), axis=1) >= snakemake.params.min_impact + + vid_df = pd.read_csv(bed_rbp, sep='\t', header=None, usecols=[3], names=['vid'], dtype={'vid': str}) + vids = vid_df.vid.str.split(pat='_[ACGT]+>[ACGT]+$', n=1, expand=True)[0].values[vid_mask] + + return vids, rbp_mask, np.array(labels)[rbp_mask].tolist() + + +def get_plof_id_func(vep_tsv): + # returns a function that takes an array of variant ids as input and returns a boolean array which indicates whether a variant is + # annotated as a protein LOF variant for any gene + + plof_ids = pd.read_csv(vep_tsv, sep='\t', usecols=['Uploaded_variation'], index_col='Uploaded_variation').index.drop_duplicates() + + def is_plof(vids): + return np.array([vid in plof_ids for vid in vids]) + + return is_plof + + +# start: kernel parameters / functions +def get_cholesky(S): + try: + chol = cholesky(S) + flag = 1 + except LinAlgError: + try: + np.fill_diagonal(S, S.diagonal() + 1e-6) # sometimes this saves it... 
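            # rescale so the largest entry is 1 and retry the factorization; if that still
            # fails, fall back to the identity matrix (flag = -1), i.e. no correlation
            # structure between variants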
+ S /= np.max(S) + chol = cholesky(S) + flag = 0 + except LinAlgError: + chol = np.eye(len(S)) + flag = -1 + return chol, flag + + +def get_simil_from_vep(V): + # cosine similarity + return cosine_similarity(V) + + +def get_simil_from_pos(pos): + # 0.5 at 50bp distance + gamma = -1*np.log(0.5)/(50**2) + return rbf_kernel(pos[:, np.newaxis], gamma=gamma) + + +def get_weights_from_vep(V): + # max norm + return np.sqrt(np.max(np.abs(V), axis=1)) + + +def get_regions(): + # load the results, keep those below a certain p-value + results = pd.read_csv(snakemake.input.results_tsv, sep='\t', comment='#') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern ] + pvcols_lrt = ['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + # get genes below threshold + genes = [results.gene[results[k] < 1e-7].values for k in pvcols_score + pvcols_lrt ] + genes = np.unique(np.concatenate(genes)) + + if len(genes) == 0: + return None + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + regions = regions.set_index('name').loc[genes] + + regions = regions.join(results.set_index('gene'), how='left').reset_index() + return regions +# end: kernel parameters / functions + + +# function to scale the kernel so that sum(diag) == N +N = Y.shape[0] +def scale_G(G): + + # calculate the scaling factor + sv = np.linalg.svd(G, full_matrices=False, compute_uv=False) + scaling_factor = np.sqrt(N/np.sum(sv**2)) + + return G * scaling_factor + + +def rollmean(x): + # returns the means of neighboring elements in an array + return (x[1:]+x[:-1])*0.5 + + +def h2_grid(low_log10=-8, high_log10=-2, left_len=1000, right_len=999): + # get the h2-grid to integrate over + grid = np.concatenate([np.zeros(1,), 10**(np.linspace(low_log10,high_log10,num=left_len, endpoint=False)), np.linspace(10**high_log10, 0.999999, num=right_len)]) + return grid, np.diff(grid) + + +def eval_nll_grid(model=None, dof=None, **kwargs): + ''' + + Evaluates the negative log likelihood on a grid of h2-values for a given model + + returns a dictionary: + + grid: h2-values + step: step size + nll: negative log likelihood values at the h2-values + sigma2: estimate for the environmental residual variance at the h2-values + integral: integral of the likelihood scaled by the maximum (for debugging) + normalized: normalized likelihood evaluated at the h2-values, i.e. 
the posterior + + ''' + Y = model.y.reshape((-1,1)) + + assert Y.shape[0] == len(model.y), "only works for single phenotype" + + + evalgrid, step = h2_grid(**kwargs) + + nll = np.zeros_like(evalgrid) + sigma2 = np.zeros_like(evalgrid) + + for i, val in enumerate(evalgrid): + res = model.nLLeval(val, dof=dof) + nll[i] = res['nLL'] + try: + sigma2[i] = res['sigma2'] + except KeyError: + sigma2[i] = np.nan + + scaled = np.exp(-1*nll + np.min(nll)) # scaled by the maximum + integral = trapezoid(scaled, evalgrid) # integral of scaled values + normalized = scaled / integral # normalized values (scaling cancels out) + + return {'grid': evalgrid, 'step': step, 'nll': nll, 'sigma2': sigma2, 'integral':integral, 'normalized': normalized} + + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len(snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.mac_report, snakemake.input.ensembl_vep_tsv) + +# get the top hits +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + sys.exit(0) + +# where we store the results +stats = [] +i_gene = 0 + +# dict with paths to input files +vep_h5 = {chrom: {'plus': p, 'minus': m} for chrom, (p, m) in zip(snakemake.params.ids, zip(snakemake.input.h5_rbp_plus, snakemake.input.h5_rbp_minus))} +vep_bed = {chrom: {'plus': p, 'minus': m} for chrom, (p, m) in zip(snakemake.params.ids, zip(snakemake.input.bed_rbp_plus, snakemake.input.bed_rbp_minus))} + + +# enter the chromosome loop: +timer = Timer() +for i, (chromosome, bed, mac_report, vep_tsv) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + # get variants that pass MAF threshold: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + # function to identify protein LOF variants + is_plof = get_plof_id_func(vep_tsv) + + # set up local collapsing + # collapser = LocalCollapsing(distance_threshold=51.) 
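    # illustrative sketch (hypothetical helper, not called by the pipeline): how the grid
    # helpers above (h2_grid / eval_nll_grid / rollmean) yield a Bayesian credible interval
    # for h2. The posterior is evaluated on the non-uniform grid, its CDF is approximated
    # with the midpoint rule (rollmean of the posterior times the step sizes), and interp1d
    # inverts that CDF to read off the 2.5% and 97.5% quantiles.
    def _h2_credible_interval_sketch(model, alpha=0.05):
        result = eval_nll_grid(model=model, dof=None)
        cdf = np.cumsum(rollmean(result['normalized']) * result['step'])  # midpoint-rule CDF
        qfun = interp1d(cdf, rollmean(result['grid']))  # approximate quantile function
        return float(qfun(alpha / 2.)), float(qfun(1. - alpha / 2.))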
+ + for strand in ['plus', 'minus']: + + # set up the regions to loop over for the chromosome + chromosome_id = chromosome.replace('chr', '') + + regions = regions_all[(regions_all.chrom == chromosome_id) & (regions_all.strand == strand)] + + if len(regions) == 0: + continue + + # get variants that pass variant effect prediction threshold: + vep_vids, vep_mask, labels = vep_filter(vep_h5[chromosome][strand], vep_bed[chromosome][strand]) + + # combine + filter_vids_chromosome = intersect_ids(vep_vids, filter_vids) + + # initialize the vep loader + veploader = Hdf5Loader(vep_bed[chromosome][strand], vep_h5[chromosome][strand], 'diffscore', from_janggu=True) + veploader.update_variants(filter_vids_chromosome) + veploader.set_mask(vep_mask) + + # set up the variant loader (rbp variants) for the chromosome + strand + plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) + plinkloader.update_variants(veploader.get_vids()) + plinkloader.update_individuals(covariatesloader.get_iids()) + + + # set up the genotype + vep loading function + def get_rbp(interval): + + temp_genotypes, temp_vids, pos = plinkloader.genotypes_by_region(interval, return_pos=True) + + if temp_genotypes is None: + raise GotNone + + nanmean = np.nanmean(temp_genotypes, axis=0) + ncarrier = np.nansum(np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes), axis=1) >= 1) # calculated differently here because alleles can be flipped! + + ncarrier = np.nansum(temp_genotypes, axis=0) + ncarrier = np.minimum(ncarrier, temp_genotypes.shape[0] - ncarrier).astype(int) + + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) + + # deepripe variant effect predictions (single RBP) + V1 = veploader.anno_by_id(temp_vids) + + weights = get_weights_from_vep(V1) + + S = get_simil_from_pos(pos) + S *= get_simil_from_vep(V1) # Schur product of two positive definite matrices is positive definite + + cummac = mac_report.loc[temp_vids].Minor + + V1 = pd.DataFrame(V1, columns=labels)[sorted(labels)] + + return G1, temp_vids, weights, S, ncarrier, cummac, pos, V1 + + + # set up the test-function for a single gene + def test_gene(interval, seed): + + interval = interval.to_dict() + + pval_dict = {} + pval_dict['gene'] = interval['name'] + called = [] + + def pv_score(GV): + pv = null_model_score.pv_alt_model(GV) + if pv < 0.: + pv = null_model_score.pv_alt_model(GV, method='saddle') + return pv + + def call_score(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + pval_dict[name] = {} + # single-marker p-values + pval_dict[name]['pv_score'] = np.array([pv_score(GV[:, i, np.newaxis]) for i in range(GV.shape[1])]) + + # single-marker coefficients + beta = [ null_model_score.coef(GV[:,i,np.newaxis]) for i in range(GV.shape[1]) ] + pval_dict[name]['beta'] = np.array([x['beta'][0,0] for x in beta]) + pval_dict[name]['betaSd'] = np.array([np.sqrt(x['var_beta'][0,0]) for x in beta]) + if vids is not None: + pval_dict[name]['vid'] = vids + + def call_lrt(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + + GV_scaled = scale_G(GV) + + # get gene parameters, test statistics and and single-marker regression weights + lik = null_model_lrt.altmodel(GV_scaled) + pval_dict[name]['nLL'] = lik['nLL'] + pval_dict[name]['sigma2'] = lik['sigma2'] + pval_dict[name]['lrtstat'] = lik['stat'] + pval_dict[name]['h2'] = lik['h2'] + + if not lik['alteqnull']: + ## start: get confidence 
intervals for h2 + try: + result = eval_nll_grid(model=null_model_lrt.model1, dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + + except ValueError as e: + # likely an interpolation-error from the quantile function because h2 is very low + + print('Error when trying to calculate h2-confidence intervals for gene {}, kernel {}. Retrying with higher resolution for low h2 values'.format(interval['name'], name)) + print(lik) + + result = eval_nll_grid(model=null_model_lrt.model1, low_log10=-9, high_log10=-2, left_len=2000, right_len=999 , dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + try: + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + except ValueError: + pval_dict[name]['h2_0.025'] = np.array([np.nan]) + pval_dict[name]['h2_0.975'] = np.array([np.nan]) + + # save values + pval_dict[name]['h2_grid'] = result['grid'] + pval_dict[name]['nLL_grid'] = result['nll'] + pval_dict[name]['sigma2_grid'] = result['sigma2'] + pval_dict[name]['normalized_likelihood'] = result['normalized'] + + ## end: get confidence intervals for h2 + + if vids is not None: + pval_dict[name]['vid'] = vids + + # load rbp variants + G, vids, weights, S, ncarrier, cummac, pos, V = get_rbp(interval) + keep = ~is_plof(vids) + + # these are common to all kernels + pval_dict['vid'] = vids + pval_dict['weights'] = weights ** 2 # get_weights_from_vep returns the square root! + pval_dict['MAC'] = cummac + pval_dict['nCarrier'] = ncarrier + pval_dict['not_LOF'] = keep + for col in V.columns: + pval_dict[col] = V[col].values.astype(np.float32) + + # # cholesky + if G.shape[1] > 1: + L, flag1 = get_cholesky(S) + else: + L, flag1 = np.eye(G.shape[1]), -1 + + # # do a score test (cholesky, and weighted cholesky) + # GL = G.dot(L) + # call_score(GL, 'lincholesky') + GWL = G.dot(np.diag(weights, k=0)).dot(L) + # call_score(GWL, 'linwcholesky') + + # single-variant p-values: + call_score(G, 'linw', vids=vids) # get's the single-variant p-values and effect-sizes -> no weighting! + + call_lrt(GWL, 'linwcholesky') # performs weighting and LRT (-> calculate h2) + + # sanity checks + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! 
expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + + if np.any(keep): + + if keep.sum() == 1: + # only single SNP is not LOF + # GL = G[:, keep] # actually just the linear kernel + GWL = G[:, keep].dot(np.diag(weights[keep], k=0)) # actually just the linear weighted kernel + else: + L, flag2 = get_cholesky(S[np.ix_(keep, keep)]) + GWL = G[:, keep].dot(np.diag(weights[keep], k=0)).dot(L) + + # call_score(G1, 'lincholesky_notLOF') + # call_score(GWL, 'linwcholesky_notLOF') + + # call_lrt(GL, 'lincholesky_notLOF') + # call_lrt(GWL, 'linwcholesky_notLOF') + # G1 = G[:,keep].dot(np.diag(weights[keep], k=0)) + # call_score(G1, 'linw_notLOF', vids=vids[keep]) + + call_lrt(GWL, 'linwcholesky_notLOF') + + return pval_dict, called + + + logging.info('loaders for chromosome {}, strand "{}" initialized in {:.1f} seconds.'.format(chromosome, strand, timer.check())) + # run tests for all genes on the chromosome / strand + for _, region in regions.iterrows(): + + try: + gene_stats, called = test_gene(region, i_gene) + except GotNone: + continue + + # build the single-variant datafame + single_var_columns = ['gene', 'vid', 'weights', 'MAC', 'nCarrier', 'not_LOF' ] + sorted(snakemake.params.rbp_of_interest) + sv_df = pd.DataFrame.from_dict({k: gene_stats[k] for k in single_var_columns}) + + sv_df['pv_score'] = gene_stats['linw']['pv_score'] # single-variant p-values estimated independently + sv_df['beta'] = gene_stats['linw']['beta'] # single-variant coeffcients estimated independently *without* weighting + sv_df['betaSd'] = gene_stats['linw']['betaSd'] # standard errors for the single-variant coefficients estimated independently *without* weighting + sv_df['pheno'] = snakemake.params.phenotype + + out_dir = os.path.join(snakemake.params.out_dir_stats, region['name']) + os.makedirs(out_dir, exist_ok=True) + + sv_df.to_csv(out_dir + '/variants.tsv.gz', sep='\t', index=False) + + for k in called: + + if k == 'variant_pvals': + continue + + results_dict = gene_stats[k] + + df_cols = ['pv_score', 'beta', 'betaSd', 'vid'] # parts of the dict that have lenght > 1 + + df = pd.DataFrame.from_dict(data={k: results_dict[k] for k in df_cols if k in results_dict}) + + if df.shape[0] > 0: + df['gene'] = gene_stats['gene'] + df['pheno'] = snakemake.params.phenotype + df.to_csv(out_dir + '/{}.tsv.gz'.format(k), sep='\t', index=False) + + # other cols ['nLL', 'sigma2', 'lrtstat', 'h2', ...] 
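            # (i.e. also 'h2_grid', 'nLL_grid', 'sigma2_grid', 'normalized_likelihood',
            #  'h2_0.025' and 'h2_0.975'; these per-gene scalars and grids go into the
            #  per-kernel pickle written below)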
+ other_cols = {k: v for k, v in results_dict.items() if k not in df_cols} + other_cols['gene'] = gene_stats['gene'] + other_cols['pheno'] = snakemake.params.phenotype + + pickle.dump(other_cols, open(out_dir + '/{}_stats.pkl'.format(k), 'wb')) + + i_gene += 1 + logging.info('tested {} genes...'.format(i_gene)) + + timer.reset() diff --git a/script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py b/script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py new file mode 100644 index 0000000..229e42b --- /dev/null +++ b/script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py @@ -0,0 +1,493 @@ + +import os +import sys +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np +import pickle +import gzip + +# needed to calculate Bayesian h2 confidence intervals +from scipy.integrate import trapezoid +from scipy.interpolate import interp1d + +# seak imports +from seak.data_loaders import intersect_ids, EnsemblVEPLoader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.kernels import LocalCollapsing +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK + +import gzip + +from pysnptools.snpreader import Bed + +from util.association import BurdenLoaderHDF5 +from util import Timer + + +class GotNone(Exception): + pass + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) +null_model_lrt = LRTnoK(X, Y) + + +# set up function to filter variants: +def maf_filter(mac_report): + + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool)) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref)] + + # this has already been done in filter_variants.py + # load the variant annotation, keep only variants in high-confidece regions + # anno = pd.read_csv(anno_tsv, sep='\t', usecols=['Name', 'hiconf_reg']) + # vids_highconf = anno.Name[anno.hiconf_reg.astype(bool).values] + # vids = np.intersect1d(vids, vids_highconf) + + return mac_report.set_index('SNP').loc[vids] + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + +def get_regions(): + # load the results, keep those below a certain p-value + results = pd.read_csv(snakemake.input.results_tsv, sep='\t') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern ] + pvcols_lrt = 
['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + # get genes below threshold + genes = [results.gene[results[k] < 1e-7].values for k in pvcols_score + pvcols_lrt ] + genes = np.unique(np.concatenate(genes)) + + if len(genes) == 0: + return None + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + regions = regions.set_index('name').loc[genes] + + regions = regions.join(results.set_index('gene'), how='left').reset_index() + return regions + + +# function to scale the kernel so that sum(diag) == N +N = Y.shape[0] +def scale_G(G): + + # calculate the scaling factor + sv = np.linalg.svd(G, full_matrices=False, compute_uv=False) + scaling_factor = np.sqrt(N/np.sum(sv**2)) + + return G * scaling_factor + + +def rollmean(x): + # returns the means of neighboring elements in an array + return (x[1:]+x[:-1])*0.5 + + +def h2_grid(low_log10=-8, high_log10=-2, left_len=1000, right_len=999): + # get the h2-grid to integrate over + grid = np.concatenate([np.zeros(1,), 10**(np.linspace(low_log10,high_log10,num=left_len, endpoint=False)), np.linspace(10**high_log10, 0.999999, num=right_len)]) + return grid, np.diff(grid) + + +def eval_nll_grid(model=None, dof=None, **kwargs): + ''' + + Evaluates the negative log likelihood on a grid of h2-values for a given model + + returns a dictionary: + + grid: h2-values + step: step size + nll: negative log likelihood values at the h2-values + sigma2: estimate for the environmental residual variance at the h2-values + integral: integral of the likelihood scaled by the maximum (for debugging) + normalized: normalized likelihood evaluated at the h2-values, i.e. 
the posterior + + ''' + Y = model.y.reshape((-1,1)) + + assert Y.shape[0] == len(model.y), "only works for single phenotype" + + + evalgrid, step = h2_grid(**kwargs) + + nll = np.zeros_like(evalgrid) + sigma2 = np.zeros_like(evalgrid) + + for i, val in enumerate(evalgrid): + res = model.nLLeval(val, dof=dof) + nll[i] = res['nLL'] + try: + sigma2[i] = res['sigma2'] + except KeyError: + sigma2[i] = np.nan + + scaled = np.exp(-1*nll + np.min(nll)) # scaled by the maximum + integral = trapezoid(scaled, evalgrid) # integral of scaled values + normalized = scaled / integral # normalized values (scaling cancels out) + + return {'grid': evalgrid, 'step': step, 'nll': nll, 'sigma2': sigma2, 'integral':integral, 'normalized': normalized} + + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len(snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.vep_tsv, snakemake.input.mac_report, snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof) + + +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + sys.exit(0) + +logging.info('About to evaluate variants in {} genes'.format(len(regions_all))) + +i_gene = 0 +# where we store the results +stats = [] +# enter the chromosome loop: +timer = Timer() +for i, (chromosome, bed, vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + # set up the ensembl vep loader for the chromosome + ensemblvepdf = pd.read_csv(vep_tsv, + sep='\t', + usecols=['Uploaded_variation', 'Location', 'Gene', 'pos_standardized', 'impact', 'ref', 'alt', 'cosine_similarity'], + index_col='Uploaded_variation') + + # get set of variants for the chromosome: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + + # filter by MAF + keep = intersect_ids(filter_vids, ensemblvepdf.index.values) + ensemblvepdf = ensemblvepdf.loc[keep] + ensemblvepdf.reset_index(inplace=True) + + # filter by impact: + ensemblvepdf = ensemblvepdf[ensemblvepdf.groupby(['Gene' ,'pos_standardized'])['impact'].transform(np.max) >= snakemake.params.min_impact ] + + # initialize the loader + eveploader = EnsemblVEPLoader(ensemblvepdf['Uploaded_variation'], ensemblvepdf['Location'], ensemblvepdf['Gene'], data = ensemblvepdf[['pos_standardized' ,'impact', 'ref', 'alt', 'cosine_similarity']].values) + + # set up the regions to loop over for the chromosome + regions = regions_all.copy() + + # discard all genes for which we don't have annotations + regions['gene'] = regions.name.str.split('_', expand=True)[0] + regions.set_index('gene', inplace=True) + genes = intersect_ids(np.unique(regions.index.values), np.unique(eveploader.pos_df.gene)) + regions = regions.loc[genes].reset_index() + regions = regions.sort_values(['chrom' ,'start' ,'end']) + + # set up the variant loader (missense variants) for the chromosome + plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) + plinkloader.update_variants(eveploader.get_vids()) + plinkloader.update_individuals(covariatesloader.get_iids()) + + # set up the protein LOF burden loader + bloader_lof = BurdenLoaderHDF5(h5_lof, iid_lof, gid_lof) + bloader_lof.update_individuals(covariatesloader.get_iids()) + + # set up local collapsing + collapser = LocalCollapsing(distance_threshold=1.) 
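    # conceptual illustration (hypothetical helper, not the seak implementation): variants
    # whose standardized (codon start) positions fall within the distance threshold are
    # merged into one cluster and their weighted genotypes combined into a single column;
    # collapse() additionally returns the per-variant cluster assignment. The simplified
    # stand-in below uses a weighted sum per cluster purely for illustration.
    def _collapse_by_position_sketch(G, pos, weights):
        clusters = np.unique(pos, return_inverse=True)[1]  # one cluster per distinct position
        G_collapsed = np.column_stack([(G[:, clusters == c] * weights[clusters == c]).sum(axis=1)
                                       for c in np.unique(clusters)])
        return G_collapsed, clusters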
+ + # set up the missense genotype + vep loading function + def get_missense(interval): + + try: + V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[0]) + except KeyError: + raise GotNone + + if V1.index.empty: + raise GotNone + + vids = V1.index.get_level_values('vid') + V1 = V1.droplevel(['gene']) + + temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False) + + ncarrier = np.nansum(temp_genotypes > 0, axis=0) + + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) + + cummac = mac_report.loc[vids].Minor + + # polyphen / sift impact + weights = V1[1].values.astype(np.float64) + + # "standardized" positions -> codon start positions + pos = V1[0].values.astype(np.int32) + + ref = V1[2].values.astype(str) + alt = V1[3].values.astype(str) + cosine_similarity = V1[4].values.astype(np.float64) + + return G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity + + # set up the protein-LOF loading function + + def get_plof(interval): + + try: + G2 = bloader_lof.genotypes_by_id(interval['name']).astype(np.float64) + except KeyError: + G2 = None + + return G2 + + # set up the test-function for a single gene + def test_gene(interval, seed): + + pval_dict = {} + pval_dict['gene'] = interval['name'] + called = [] + + def pv_score(GV): + pv = null_model_score.pv_alt_model(GV) + if pv < 0.: + pv = null_model_score.pv_alt_model(GV, method='saddle') + return pv + + def call_score(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + pval_dict[name] = {} + # single-marker p-values + pval_dict[name]['pv_score'] = np.array([pv_score(GV[:,i,np.newaxis]) for i in range(GV.shape[1])]) + + # single-marker coefficients + beta = [ null_model_score.coef(GV[:,i,np.newaxis]) for i in range(GV.shape[1]) ] + pval_dict[name]['beta'] = np.array([x['beta'][0,0] for x in beta]) + pval_dict[name]['betaSd'] = np.array([np.sqrt(x['var_beta'][0,0]) for x in beta]) + if vids is not None: + pval_dict[name]['vid'] = vids + + + def call_lrt(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + + GV_scaled = scale_G(GV) + + # get gene parameters, test statistics and and single-marker regression weights + lik = null_model_lrt.altmodel(GV_scaled) + pval_dict[name]['nLL'] = lik['nLL'] + pval_dict[name]['sigma2'] = lik['sigma2'] + pval_dict[name]['lrtstat'] = lik['stat'] + pval_dict[name]['h2'] = lik['h2'] + + if not lik['alteqnull']: + ## start: get confidence intervals for h2 + try: + result = eval_nll_grid(model=null_model_lrt.model1, dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + + except ValueError as e: + # likely an interpolation-error from the quantile function because h2 is very low + + print('Error when trying to calculate h2-confidence intervals for gene {}, kernel {}. 
Retrying with higher resolution for low h2 values'.format(interval['name'], name)) + print(lik) + + result = eval_nll_grid(model=null_model_lrt.model1, low_log10=-9, high_log10=-2, left_len=2000, right_len=999 , dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + try: + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + except ValueError: + pval_dict[name]['h2_0.025'] = np.array([np.nan]) + pval_dict[name]['h2_0.975'] = np.array([np.nan]) + + # save values + pval_dict[name]['h2_grid'] = result['grid'] + pval_dict[name]['nLL_grid'] = result['nll'] + pval_dict[name]['sigma2_grid'] = result['sigma2'] + pval_dict[name]['normalized_likelihood'] = result['normalized'] + + ## end: get confidence intervals for h2 + + if vids is not None: + pval_dict[name]['vid'] = vids + + + # load missense variants + G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity = get_missense(interval) + + # these are common to all kernels + pval_dict['vid'] = vids + pval_dict['weights'] = weights + pval_dict['MAC'] = cummac + pval_dict['nCarrier'] = ncarrier + pval_dict['pos'] = pos + pval_dict['ref'] = ref + pval_dict['alt'] = alt + pval_dict['cosine_similarity'] = cosine_similarity + + # single-variant p-values: + call_score(G1, 'variant_pvals') # score-pvalues and coeficients estimated independently + + # sanity checks + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + + # do a score burden test (max weighted), this is different than the baseline! + G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True) + + # burden + call_score(G1_burden, 'linwb') + call_lrt(G1_burden, 'linwb') + + # perform local collapsing with weights + if G1.shape[1] > 1: + G1, clusters = collapser.collapse(G1, pos, np.sqrt(weights)) # will lead to a crash when there are negative weights... 
+ else: + G1 = G1.dot(np.diag(np.sqrt(weights), k=0)) + clusters = np.array([0]) + + pval_dict['cluster_id'] = clusters + + # get the single-cluster p-values + stats + call_score(G1, 'linwcollapsed') + call_lrt(G1, 'linwcollapsed') + pval_dict['linwcollapsed']['cluster_id'] = sorted(set(clusters)) + + # load plof burden + G2 = get_plof(interval) + + if G2 is not None: + + call_score(G2, 'LOF') + call_lrt(G2, 'LOF') + + # merged (single variable) + G1_burden_mrg = np.maximum(G2, G1_burden) + call_score(G1_burden_mrg, 'linwb_mrgLOF') + call_lrt(G1_burden_mrg, 'linwb_mrgLOF') + + # concatenated + call_score(np.concatenate([G1, G2], axis=1), 'linwcollapsed_cLOF') + call_lrt(np.concatenate([G1, G2], axis=1), 'linwcollapsed_cLOF') + pval_dict['linwcollapsed_cLOF']['cluster_id'] = sorted(set(clusters)) + [-1] # -1 indicates the LOF cluster + + return pval_dict, called + + logging.info('loaders for chromosome {} initialized in {:.1f} seconds.'.format(chromosome, timer.check())) + # run tests for all genes on the chromosome + for _, region in regions.iterrows(): + + try: + gene_stats, called = test_gene(region, i_gene) + except GotNone: + continue + + + # build the single-variant datafame + single_var_columns = ['gene','vid','weights','MAC','nCarrier','pos','ref','alt','cosine_similarity','cluster_id'] + sv_df = pd.DataFrame.from_dict({k: gene_stats[k] for k in single_var_columns}) + + sv_df['pv_score'] = gene_stats['variant_pvals']['pv_score'] # single-variant p-values estimated independently + sv_df['beta'] = gene_stats['variant_pvals']['beta'] # single-variant coeffcients estimated independently *without* weighting + sv_df['betaSd'] = gene_stats['variant_pvals']['betaSd'] # standard errors for the single-variant coefficients estimated independently *without* weighting + sv_df['pheno'] = snakemake.params.phenotype + + out_dir = os.path.join(snakemake.params.out_dir_stats, region['name']) + os.makedirs(out_dir, exist_ok=True) + + sv_df.to_csv(out_dir + '/variants.tsv.gz', sep='\t', index=False) + + + for k in called: + + if k == 'variant_pvals': + continue + + results_dict = gene_stats[k] + + df_cols = ['cluster_id','pv_score', 'beta', 'betaSd', 'vid'] # parts of the dict that have lenghth == (number of variants) + + df = pd.DataFrame.from_dict(data={k: results_dict[k] for k in df_cols if k in results_dict}) + df['gene'] = gene_stats['gene'] + df['pheno'] = snakemake.params.phenotype + df.to_csv(out_dir + '/{}.tsv.gz'.format(k), sep='\t', index=False) + + # other cols ['nLL', 'sigma2', 'lrtstat', 'h2', 'h2_grid', 'nLL_grid', 'sigma2_grid', 'normalized_likelihood', 'h2_0.025', 'h2_0.975'] + other_cols = {k: v for k, v in results_dict.items() if k not in df_cols } + other_cols['gene'] = gene_stats['gene'] + other_cols['pheno'] = snakemake.params.phenotype + + pickle.dump(other_cols, open(out_dir + '/{}_stats.pkl'.format(k), 'wb')) + + + i_gene += 1 + logging.info('tested {} genes...'.format(i_gene)) + + timer.reset() + diff --git a/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py b/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py new file mode 100644 index 0000000..93d11ff --- /dev/null +++ b/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py @@ -0,0 +1,511 @@ +import os +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging + +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np +import gzip + +# needed to calculate Bayesian h2 confidence intervals +from scipy.integrate import trapezoid +from 
scipy.interpolate import interp1d + +# seak imports +from seak.data_loaders import intersect_ids, EnsemblVEPLoader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK, pv_chi2mixture, fit_chi2mixture + +from pysnptools.snpreader import Bed +import pickle +import sys + +from util.association import BurdenLoaderHDF5 +from util import Timer + + +class GotNone(Exception): + pass + + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) +null_model_lrt = LRTnoK(X, Y) + + +# set up function to filter variants: +def maf_filter(mac_report): + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool)) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool))] + + # this has already been done in filter_variants.py + # load the variant annotation, keep only variants in high-confidece regions + # anno = pd.read_csv(anno_tsv, sep='\t', usecols=['Name', 'hiconf_reg']) + # vids_highconf = anno.Name[anno.hiconf_reg.astype(bool).values] + # vids = np.intersect1d(vids, vids_highconf) + + return mac_report.set_index('SNP').loc[vids] + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + +def get_regions(): + # load the results, keep those below a certain p-value + results = pd.read_csv(snakemake.input.results_tsv, sep='\t') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern ] + pvcols_lrt = ['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + # get genes below threshold + genes = [results.gene[results[k] < 1e-7].values for k in pvcols_score + pvcols_lrt ] + genes = np.unique(np.concatenate(genes)) + + if len(genes) == 0: + return None + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + regions = regions.set_index('name').loc[genes] + + regions = regions.join(results.set_index('gene'), how='left').reset_index() + return regions + + +# function to scale the kernel so that sum(diag) == N 
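# (the squared singular values of G sum to trace(G @ G.T), so multiplying G by
#  sqrt(N / sum(sv**2)) gives an implied kernel G @ G.T with trace equal to N)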
+N = Y.shape[0] +def scale_G(G): + + # calculate the scaling factor + sv = np.linalg.svd(G, full_matrices=False, compute_uv=False) + scaling_factor = np.sqrt(N/np.sum(sv**2)) + + return G * scaling_factor + + +def rollmean(x): + # returns the means of neighboring elements in an array + return (x[1:]+x[:-1])*0.5 + + +def h2_grid(low_log10=-8, high_log10=-2, left_len=1000, right_len=999): + # get the h2-grid to integrate over + grid = np.concatenate([np.zeros(1,), 10**(np.linspace(low_log10,high_log10,num=left_len, endpoint=False)), np.linspace(10**high_log10, 0.999999, num=right_len)]) + return grid, np.diff(grid) + + +def eval_nll_grid(model=None, dof=None, **kwargs): + ''' + + Evaluates the negative log likelihood on a grid of h2-values for a given model + + returns a dictionary: + + grid: h2-values + step: step size + nll: negative log likelihood values at the h2-values + sigma2: estimate for the environmental residual variance at the h2-values + integral: integral of the likelihood scaled by the maximum (for debugging) + normalized: normalized likelihood evaluated at the h2-values, i.e. the posterior + + ''' + Y = model.y.reshape((-1,1)) + + assert Y.shape[0] == len(model.y), "only works for single phenotype" + + + evalgrid, step = h2_grid(**kwargs) + + nll = np.zeros_like(evalgrid) + sigma2 = np.zeros_like(evalgrid) + + for i, val in enumerate(evalgrid): + res = model.nLLeval(val, dof=dof) + nll[i] = res['nLL'] + try: + sigma2[i] = res['sigma2'] + except KeyError: + sigma2[i] = np.nan + + scaled = np.exp(-1*nll + np.min(nll)) # scaled by the maximum + integral = trapezoid(scaled, evalgrid) # integral of scaled values + normalized = scaled / integral # normalized values (scaling cancels out) + + return {'grid': evalgrid, 'step': step, 'nll': nll, 'sigma2': sigma2, 'integral':integral, 'normalized': normalized} + + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len (snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.vep_tsv, snakemake.input.ensembl_vep_tsv, snakemake.input.mac_report, snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof) + +# get the top hits +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + sys.exit(0) + +# where we store the results +stats = [] +i_gene = 0 +# enter the chromosome loop: + +timer = Timer() +for i, (chromosome, bed, vep_tsv, ensembl_vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + # set up the ensembl vep loader for the chromosome + spliceaidf = pd.read_csv(vep_tsv, + sep='\t', + usecols=['name', 'chrom', 'end', 'gene', 'max_effect', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL'], + index_col='name') + + # get set of variants for the chromosome: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + # filter by MAF + keep = intersect_ids(filter_vids, spliceaidf.index.values) + spliceaidf = spliceaidf.loc[keep] + spliceaidf.reset_index(inplace=True) + + # filter by impact: + spliceaidf = spliceaidf[spliceaidf.max_effect >= snakemake.params.min_impact] + + # set up the regions to loop over for the chromosome + regions = regions_all.copy() + + # discard all genes for which we don't have annotations + gene_ids = 
regions.name.str.split('_', expand=True) # table with two columns, ensembl-id and gene-name + regions['gene'] = gene_ids[1] # this is the gene name + regions['ensembl_id'] = gene_ids[0] + regions.set_index('gene', inplace=True) + genes = intersect_ids(np.unique(regions.index.values), np.unique(spliceaidf.gene)) # intersection of gene names + regions = regions.loc[genes].reset_index() # subsetting + regions = regions.sort_values(['chrom', 'start', 'end']) + + # check if the variants are protein LOF variants, load the protein LOF variants: + ensemblvepdf = pd.read_csv(ensembl_vep_tsv, sep='\t', usecols=['Uploaded_variation', 'Gene']) + + # this column will contain the gene names: + genes = intersect_ids(np.unique(ensemblvepdf.Gene.values), regions.ensembl_id) # intersection of ensembl gene ids + ensemblvepdf = ensemblvepdf.set_index('Gene').loc[genes].reset_index() + ensemblvepdf['gene'] = gene_ids.set_index(0).loc[ensemblvepdf.Gene.values].values + + # set up the merge + ensemblvepdf.drop(columns=['Gene'], inplace=True) # get rid of the ensembl ids, will use gene names instead + ensemblvepdf.rename(columns={'Uploaded_variation': 'name'}, inplace=True) + ensemblvepdf['is_plof'] = 1. + + ensemblvepdf = ensemblvepdf[~ensemblvepdf.duplicated()] # if multiple ensembl gene ids map to the same gene names, this prevents a crash. + + # we add a column to the dataframe indicating whether the variant is already annotated as protein loss of function by the ensembl variant effect predictor + spliceaidf = pd.merge(spliceaidf, ensemblvepdf, on=['name', 'gene'], how='left', validate='one_to_one') + spliceaidf['is_plof'] = spliceaidf['is_plof'].fillna(0.).astype(bool) + + # initialize the loader + # Note: we use "end" here because the start + 1 = end, and we need 1-based coordiantes (this would break if we had indels) + eveploader = EnsemblVEPLoader(spliceaidf['name'], spliceaidf['chrom'].astype('str') + ':' + spliceaidf['end'].astype('str'), spliceaidf['gene'], data=spliceaidf[['max_effect', 'is_plof', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']].values) + + # set up the variant loader (splice variants) for the chromosome + plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) + plinkloader.update_variants(eveploader.get_vids()) + plinkloader.update_individuals(covariatesloader.get_iids()) + + # set up the protein LOF burden loader + bloader_lof = BurdenLoaderHDF5(h5_lof, iid_lof, gid_lof) + bloader_lof.update_individuals(covariatesloader.get_iids()) + + + # set up the splice genotype + vep loading function + def get_splice(interval): + + try: + V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[1]) + except KeyError: + raise GotNone + + if V1.index.empty: + raise GotNone + + vids = V1.index.get_level_values('vid') + V1 = V1.droplevel(['gene']) + + temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False) + + ncarrier = np.nansum(temp_genotypes > 0, axis=0) + + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) 
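        # note: genotypes were mean-centered above, so filling the masked (missing)
        # entries with 0. amounts to mean-imputation of missing genotypes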
+ + cummac = mac_report.loc[vids].Minor + + # spliceAI max score + weights = V1[0].values.astype(np.float64) + + is_plof = V1[1].values.astype(bool) + + splice_preds_all = V1.iloc[:,2:] + splice_preds_all.columns = ['DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL'] + + # "standardized" positions -> codon start positions + # pos = V1[0].values.astype(np.int32) + + return G1, vids, weights, ncarrier, cummac, is_plof, splice_preds_all + + + # set up the protein-LOF loading function + + def get_plof(interval): + + try: + G2 = bloader_lof.genotypes_by_id(interval['name']).astype(np.float) + except KeyError: + G2 = None + + return G2 + + + # set up the test-function for a single gene + def test_gene(interval, seed): + + pval_dict = {} + pval_dict['gene'] = interval['name'] + called = [] + + def pv_score(GV): + pv = null_model_score.pv_alt_model(GV) + if pv < 0.: + pv = null_model_score.pv_alt_model(GV, method='saddle') + return pv + + def call_score(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + pval_dict[name] = {} + # single-marker p-values + pval_dict[name]['pv_score'] = np.array([pv_score(GV[:,i,np.newaxis]) for i in range(GV.shape[1])]) + + # single-marker coefficients + beta = [ null_model_score.coef(GV[:,i,np.newaxis]) for i in range(GV.shape[1]) ] + pval_dict[name]['beta'] = np.array([x['beta'][0,0] for x in beta]) + pval_dict[name]['betaSd'] = np.array([np.sqrt(x['var_beta'][0,0]) for x in beta]) + if vids is not None: + pval_dict[name]['vid'] = vids + + + def call_lrt(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + + GV_scaled = scale_G(GV) + + # get gene parameters, test statistics and and single-marker regression weights + lik = null_model_lrt.altmodel(GV_scaled) + pval_dict[name]['nLL'] = lik['nLL'] + pval_dict[name]['sigma2'] = lik['sigma2'] + pval_dict[name]['lrtstat'] = lik['stat'] + pval_dict[name]['h2'] = lik['h2'] + + + if not lik['alteqnull']: + ## start: get confidence intervals for h2 + try: + result = eval_nll_grid(model=null_model_lrt.model1, dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + + except ValueError as e: + # likely an interpolation-error from the quantile function because h2 is very low + + print('Error when trying to calculate h2-confidence intervals for gene {}, kernel {}. 
Retrying with higher resolution for low h2 values'.format(interval['name'], name)) + print(lik) + + result = eval_nll_grid(model=null_model_lrt.model1, low_log10=-9, high_log10=-2, left_len=2000, right_len=999 , dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + try: + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + except ValueError: + pval_dict[name]['h2_0.025'] = np.array([np.nan]) + pval_dict[name]['h2_0.975'] = np.array([np.nan]) + + # save values + pval_dict[name]['h2_grid'] = result['grid'] + pval_dict[name]['nLL_grid'] = result['nll'] + pval_dict[name]['sigma2_grid'] = result['sigma2'] + pval_dict[name]['normalized_likelihood'] = result['normalized'] + + ## end: get confidence intervals for h2 + + + if vids is not None: + pval_dict[name]['vid'] = vids + + + # load splice variants + G1, vids, weights, ncarrier, cummac, is_plof, splice_preds_all = get_splice(interval) + # keep indicates which variants are NOT "protein LOF" variants, i.e. variants already identified by the ensembl VEP + keep = ~is_plof + + # these are common to all kernels + pval_dict['vid'] = vids + pval_dict['weights'] = weights + pval_dict['MAC'] = cummac + pval_dict['nCarrier'] = ncarrier + pval_dict['not_LOF'] = keep + for col in splice_preds_all.columns: + pval_dict[col] = splice_preds_all[col].values.astype(np.float32) + + # single-variant p-values: + call_score(G1, 'variant_pvals') # single variant p-values and coefficients estimated independently + call_lrt(G1.dot(np.diag(np.sqrt(weights), k=0)), 'variant_pvals') # single variant coefficients estimated *jointly* after weighting + + # sanity checks + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + + # do a score burden test (max weighted), this is different than the baseline! + G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True) + call_score(G1_burden, 'linwb') + call_lrt(G1_burden, 'linwb') + + # linear weighted kernel + G1 = G1.dot(np.diag(np.sqrt(weights), k=0)) + + # do a score test (linear weighted) + call_score(G1, 'linw', vids=vids) + call_lrt(G1, 'linw') + + # load plof burden + G2 = get_plof(interval) + + if G2 is not None: + + call_score(G2, 'LOF') + call_lrt(G2, 'LOF') + + if np.any(keep): + + # merged (single variable) + G1_burden_mrg = np.maximum(G2, G1_burden) + + call_score(G1_burden_mrg, 'linwb_mrgLOF') + call_lrt(G1_burden_mrg, 'linwb_mrgLOF') + + # concatenated ( >= 2 variables) + # we separate out the ones that are already part of the protein LOF variants! 
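            # the gene-level pLOF burden G2 is appended as a single extra pseudo-variant
            # column; its variant id is recorded as -1 below to distinguish it from the
            # individual SpliceAI variants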
+
+            G1 = np.concatenate([G1[:, keep], G2], axis=1)
+            call_score(G1, 'linw_cLOF', vids=np.array(vids[keep].tolist() + [-1]))
+            call_lrt(G1, 'linw_cLOF')
+        else:
+            logging.info('All Splice-AI variants for gene {} were already identified by the Ensembl variant effect predictor'.format(interval['name']))
+
+
+        return pval_dict, called
+
+
+    logging.info('loaders for chromosome {} initialized in {:.1f} seconds.'.format(chromosome, timer.check()))
+
+    # run tests for all genes on the chromosome
+    for _, region in regions.iterrows():
+
+        try:
+            gene_stats, called = test_gene(region, i_gene)
+        except GotNone:
+            continue
+
+        # build the single-variant dataframe
+        single_var_columns = ['gene', 'vid', 'weights', 'MAC', 'nCarrier', 'not_LOF', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']
+        sv_df = pd.DataFrame.from_dict({k: gene_stats[k] for k in single_var_columns})
+
+        sv_df['pv_score'] = gene_stats['variant_pvals']['pv_score']  # single-variant p-values estimated independently
+        sv_df['beta'] = gene_stats['variant_pvals']['beta']  # single-variant coefficients estimated independently *without* weighting
+        sv_df['betaSd'] = gene_stats['variant_pvals']['betaSd']  # standard errors for the single-variant coefficients estimated independently *without* weighting
+        sv_df['pheno'] = snakemake.params.phenotype
+
+        out_dir = os.path.join(snakemake.params.out_dir_stats, region['name'])
+        os.makedirs(out_dir, exist_ok=True)
+
+        sv_df.to_csv(out_dir + '/variants.tsv.gz', sep='\t', index=False)
+
+        for k in called:
+
+            if k == 'variant_pvals':
+                continue
+
+            results_dict = gene_stats[k]
+
+            df_cols = ['pv_score', 'beta', 'betaSd', 'vid']  # parts of the dict that have length > 1
+
+            df = pd.DataFrame.from_dict(data={k: results_dict[k] for k in df_cols if k in results_dict})
+            df['gene'] = gene_stats['gene']
+            df['pheno'] = snakemake.params.phenotype
+            df.to_csv(out_dir + '/{}.tsv.gz'.format(k), sep='\t', index=False)
+
+            # other cols: ['nLL', 'sigma2', 'lrtstat', 'h2', 'h2_grid', 'nLL_grid', 'sigma2_grid', 'normalized_likelihood', 'h2_0.025', 'h2_0.975']
+            other_cols = {k: v for k, v in results_dict.items() if k not in df_cols}
+            other_cols['gene'] = gene_stats['gene']
+            other_cols['pheno'] = snakemake.params.phenotype
+
+            pickle.dump(other_cols, open(out_dir + '/{}_stats.pkl'.format(k), 'wb'))
+
+        i_gene += 1
+        logging.info('tested {} genes...'.format(i_gene))
+
+    timer.reset()
\ No newline at end of file
diff --git a/snake/association.smk b/snake/association.smk
index 550f3dc..c01154e 100644
--- a/snake/association.smk
+++ b/snake/association.smk
@@ -152,6 +152,51 @@ rule assoc_missense_localcollapsing_eval_top_hits_all:
         expand(rules.assoc_missense_localcollapsing_eval_top_hits.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
 
+rule assoc_missense_localcollapsing_eval_top_hits_h2:
+    # calculates single-variant p-values, regression coefficients and calibrated h2 estimates for the most significant genes for missense variants
+    input:
+        h5_lof = expand(rules.export_plof_burden.output.h5, id = plinkfiles.getIds(), allow_missing=True),
+        iid_lof = expand(rules.export_plof_burden.output.iid_txt, id = plinkfiles.getIds(), allow_missing=True),
+        gid_lof = expand(rules.export_plof_burden.output.gene_txt, id = plinkfiles.getIds(), allow_missing=True),
+        covariates_tsv = config['covariates'],
+        phenotypes_tsv = config['phenotypes'],
+        bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()),
+        vep_tsv = expand(rules.evep_missense_proc.output.tsv_filtered, id = plinkfiles.getIds()),
+        mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True),
+        regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed,
+        results_tsv = rules.assoc_missense_localcollapsing.output.results_tsv,
+        seak_install = rules.install_seak.output
+    output:
+        out_ok = touch('work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/top_hits_h2/all.ok')
+    params:
+        kernels = ['linwcollapsed','linwcollapsed_cLOF','linwb','linwb_mrgLOF'],  # genes with p < 1e-7 in one of these kernels will be analysed in detail
+        phenotype = lambda wc: phenotypes[ wc.pheno ],
+        covariate_column_names = config['covariate_column_names'],
+        max_maf = config['maf_cutoff'],
+        min_impact = config['min_impact'],
+        out_dir_stats = lambda wc: 'work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/top_hits_h2/'.format(filter_highconfidence=wc.filter_highconfidence, pheno=wc.pheno),
+        ids = plinkfiles.getIds(),
+        filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence]
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="06:00:00",
+        partition="hpcpu"
+    threads:
+        1
+    log:
+        'logs/association/sclrt_kernels_missense_eval_top_hits_h2/{filter_highconfidence}_{pheno}.log'
+    conda:
+        '../env/seak.yml'
+    script:
+        '../script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py'
+
+
+rule assoc_missense_localcollapsing_eval_top_hits_h2_all:
+    # runs the rule above for all phenotypes - this will effectively run the entire pipeline for missense variants
+    input:
+        expand(rules.assoc_missense_localcollapsing_eval_top_hits_h2.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
+
+
 rule assoc_missense_localcollapsing_retest_top_hits:
     # calculates gene-specific LRT p-values for the most significant genes for missense variants
     input:
@@ -337,6 +382,52 @@ rule assoc_spliceai_linw_eval_top_hits_all:
         expand(rules.assoc_spliceai_linw_eval_top_hits.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
 
+rule assoc_spliceai_linw_eval_top_hits_h2:
+    # calculates single-variant p-values, regression coefficients and calibrated h2 estimates for the most significant genes for splice variants
+    input:
+        h5_lof = expand(rules.export_plof_burden.output.h5, id = plinkfiles.getIds(), allow_missing=True),
+        iid_lof = expand(rules.export_plof_burden.output.iid_txt, id = plinkfiles.getIds(), allow_missing=True),
+        gid_lof = expand(rules.export_plof_burden.output.gene_txt, id = plinkfiles.getIds(), allow_missing=True),
+        ensembl_vep_tsv = expand(rules.process_ensembl_vep_output_highimpact.output.tsv, id = plinkfiles.getIds(), allow_missing=True),
+        covariates_tsv = config['covariates'],
+        phenotypes_tsv = config['phenotypes'],
+        bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()),
+        vep_tsv = expand(rules.splice_ai_filter_and_overlap_with_genotypes.output.tsv, id = plinkfiles.getIds()),
+        mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True),
+        regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed,
+        results_tsv = rules.assoc_spliceai_linw.output.results_tsv,
+        seak_install = rules.install_seak.output
+    output:
+        touch('work/association/sclrt_kernels_spliceai/{filter_highconfidence}/{pheno}/top_hits_h2/all.ok')
+    params:
+        kernels = ['linb','linwb','lin','linw','linb_mrgLOF','linwb_mrgLOF','lin_cLOF','linw_cLOF'],
+        phenotype = lambda wc: phenotypes[ wc.pheno ],
+        covariate_column_names = config['covariate_column_names'],
+        max_maf = config['maf_cutoff'],
+        min_impact = config['splice_ai_min_impact'],
+        out_dir_stats = lambda wc: 'work/association/sclrt_kernels_spliceai/{filter_highconfidence}/{pheno}/top_hits_h2/'.format(filter_highconfidence=wc.filter_highconfidence, pheno=wc.pheno),
+        ids = plinkfiles.getIds(),
+        filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
+        debug = False
+    log:
+        'logs/association/sclrt_kernels_spliceai_eval_top_hits_h2/{filter_highconfidence}_{pheno}.log'
+    resources:
+        mem_mb=get_mem_mb(12000,1.5),
+        time="06:00:00",
+        partition="hpcpu"
+    threads:
+        1
+    conda:
+        '../env/seak.yml'
+    script:
+        '../script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py'
+
+rule assoc_spliceai_linw_eval_top_hits_h2_all:
+    # runs the rule above for all phenotypes - this will effectively run the entire pipeline for splice variants
+    input:
+        expand(rules.assoc_spliceai_linw_eval_top_hits_h2.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
+
+
 rule assoc_spliceai_linw_retest_top_hits:
     # calculates single-variant p-values and regression coefficients and other statistics for the most significant genes for splice variants
     input:
@@ -571,6 +662,53 @@ rule assoc_deepripe_multiple_cholesky_eval_top_hits_all:
     input:
         expand(rules.assoc_deepripe_multiple_cholesky_eval_top_hits.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
 
+rule assoc_deepripe_multiple_cholesky_eval_top_hits_h2:
+    # calculates single-variant p-values, regression coefficients and calibrated h2 estimates for the most significant genes for DeepRiPe variants
+    input:
+        bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()),
+        h5_rbp_plus = expand(rules.run_deepripe_vep.output.h5, id = plinkfiles.getIds(), strand=['plus']),
+        h5_rbp_minus = expand(rules.run_deepripe_vep.output.h5, id = plinkfiles.getIds(), strand=['minus']),
+        bed_rbp_plus = expand(rules.run_deepripe_vep.output.bed, id = plinkfiles.getIds(), strand=['plus']),
+        bed_rbp_minus = expand(rules.run_deepripe_vep.output.bed, id = plinkfiles.getIds(), strand=['minus']),
+        ensembl_vep_tsv = expand(rules.process_ensembl_vep_output_highimpact.output.tsv, id = plinkfiles.getIds(), allow_missing=True),
+        mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True),
+        regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed,
+        seak_install = rules.install_seak.output,
+        covariates_tsv = config['covariates'],
+        phenotypes_tsv = config['phenotypes'],
+        results_tsv = rules.assoc_deepripe_multiple_cholesky.output.results_tsv
+    output:
+        touch('work/association/sclrt_kernels_deepripe_multiple/{filter_highconfidence}/{pheno}/top_hits_h2/all.ok')
+    params:
+        kernels = ['lincholesky', 'linwcholesky', 'lincholesky_notLOF', 'linwcholesky_notLOF'],
+        phenotype = lambda wc: phenotypes[ wc.pheno ],
+        covariate_column_names = config['covariate_column_names'],
+        max_maf = config['maf_cutoff'],
+        min_impact = config['deepripe_min_impact'],
+        out_dir_stats = lambda wc: 'work/association/sclrt_kernels_deepripe_multiple/{filter_highconfidence}/{pheno}/top_hits_h2/'.format(filter_highconfidence=wc.filter_highconfidence, pheno=wc.pheno),
+        ids = plinkfiles.getIds(),
+        filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
+        rbp_of_interest = rules.assoc_deepripe_multiple_cholesky.params.rbp_of_interest,
+        debug = False
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="03:00:00",
+        partition="hpcpu"
+    threads:
+        1
+    log:
+        'logs/association/sclrt_kernels_deepripe_multiple_eval_top_hits_h2/{filter_highconfidence}_{pheno}.log'
+    conda:
+        '../env/seak.yml'
+    script:
+        '../script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py'
+
+
+rule assoc_deepripe_multiple_cholesky_eval_top_hits_h2_all:
+    # runs the rule above for all phenotypes - this will effectively run the entire pipeline for DeepRiPe variants
+    input:
+        expand(rules.assoc_deepripe_multiple_cholesky_eval_top_hits_h2.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
+
 rule assoc_deepripe_multiple_cholesky_retest_top_hits:
     # re-tests most significant genes for DeepRiPe variants using gene-specific null-distributions
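
Note (not part of the patch): the per-gene '{kernel}_stats.pkl' files written by the three *_eval_top_hits_h2 scripts carry the calibrated h2 results ('h2', 'h2_grid', 'nLL_grid', 'h2_0.025', 'h2_0.975', ...). The sketch below shows one way such a pickle could be read downstream to re-derive the 95% credible interval from the likelihood grid. It is a minimal, illustrative example: the file path and kernel name are placeholders, and it assumes 'h2_grid' and 'nLL_grid' are 1-d arrays over the same grid of h2 values; only the key names are taken from the scripts above.

    import pickle

    import numpy as np
    from scipy.integrate import trapezoid
    from scipy.interpolate import interp1d

    # placeholder path: one gene, 'linw' kernel, written by the spliceai top-hits-h2 script
    stats_pkl = 'work/association/sclrt_kernels_spliceai/all/<pheno>/top_hits_h2/<gene>/linw_stats.pkl'

    with open(stats_pkl, 'rb') as f:
        stats = pickle.load(f)

    h2_grid = np.asarray(stats['h2_grid'])    # assumed: 1-d grid of h2 values
    nll_grid = np.asarray(stats['nLL_grid'])  # assumed: negative log-likelihood on that grid

    # turn the negative log-likelihood into a normalized density over the grid
    lik = np.exp(nll_grid.min() - nll_grid)
    lik /= trapezoid(lik, h2_grid)

    # cumulative distribution via trapezoidal increments, then invert it for the 95% credible interval
    cdf = np.concatenate([[0.0], np.cumsum(0.5 * np.diff(h2_grid) * (lik[1:] + lik[:-1]))])
    cdf /= cdf[-1]
    h2_lo, h2_hi = interp1d(cdf, h2_grid)([0.025, 0.975])

    print(stats['gene'], stats['pheno'], stats['h2'], h2_lo, h2_hi)
    # should roughly reproduce the interval stored by the scripts:
    print(stats['h2_0.025'], stats['h2_0.975'])

If the pickles also contain the 'normalized_likelihood' key listed in the scripts, comparing it against 'lik' above is a quick consistency check.
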