diff --git a/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py b/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py new file mode 100644 index 0000000..015fde9 --- /dev/null +++ b/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py @@ -0,0 +1,548 @@ +import os +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging + +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np +import h5py +import gzip +import pickle +import sys + +from numpy.linalg import cholesky, LinAlgError +from sklearn.metrics.pairwise import rbf_kernel, cosine_similarity + +# needed to calculate Bayesian h2 confidence intervals +from scipy.integrate import trapezoid +from scipy.interpolate import interp1d + +# seak imports +from seak.data_loaders import intersect_ids, Hdf5Loader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK + +from pysnptools.snpreader import Bed + +from util import Timer + + +class GotNone(Exception): + pass + + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) +null_model_lrt = LRTnoK(X, Y) + + +# set up function to filter variants: +def maf_filter(mac_report): + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + # note: We don't filter out the variants for which alt/ref are "flipped" for the RBPs + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0)] + + return mac_report.set_index('SNP').loc[vids] + + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + + + +def vep_filter(h5_rbp, bed_rbp): + # returns variant ids for variants that pass filtering threshold and mask for the hdf5loader + + with h5py.File(h5_rbp, 'r') as f: + + rbp_of_interest = snakemake.params.rbp_of_interest + try: + # in newer version of h5py the strings are not decoded -> decode them + labels = [l.decode() for l in f['labels'][:]] + except AttributeError: + # this used to work in older versions of h5py: + labels = list(f['labels'][:]) + + if isinstance(rbp_of_interest, str): + rbp_of_interest = [rbp_of_interest] + + assert all([rbp in labels for rbp in rbp_of_interest]), 'Error: not all of {} are in labels: {}'.format(rbp_of_interest, labels) + + keep = list(i for i, l in enumerate(labels) if l in rbp_of_interest) + diffscores = f['diffscore'][:, keep] + + rbp_mask = np.array([i in rbp_of_interest for i in labels]) + + if np.ndim(diffscores) == 1: + diffscores = diffscores[:, np.newaxis] + + vid_mask = np.max(np.abs(diffscores), axis=1) >= snakemake.params.min_impact + + vid_df = pd.read_csv(bed_rbp, sep='\t', header=None, usecols=[3], names=['vid'], dtype={'vid': str}) + vids = vid_df.vid.str.split(pat='_[ACGT]+>[ACGT]+$', n=1, expand=True)[0].values[vid_mask] + + return vids, rbp_mask, np.array(labels)[rbp_mask].tolist() + + +def get_plof_id_func(vep_tsv): + # returns a function that takes an array of variant ids as input and returns a boolean array which indicates whether a variant is + # annotated as a protein LOF variant for any gene + + plof_ids = pd.read_csv(vep_tsv, sep='\t', usecols=['Uploaded_variation'], index_col='Uploaded_variation').index.drop_duplicates() + + def is_plof(vids): + return np.array([vid in plof_ids for vid in vids]) + + return is_plof + + +# start: kernel parameters / functions +def get_cholesky(S): + try: + chol = cholesky(S) + flag = 1 + except LinAlgError: + try: + np.fill_diagonal(S, S.diagonal() + 1e-6) # sometimes this saves it... + S /= np.max(S) + chol = cholesky(S) + flag = 0 + except LinAlgError: + chol = np.eye(len(S)) + flag = -1 + return chol, flag + + +def get_simil_from_vep(V): + # cosine similarity + return cosine_similarity(V) + + +def get_simil_from_pos(pos): + # 0.5 at 50bp distance + gamma = -1*np.log(0.5)/(50**2) + return rbf_kernel(pos[:, np.newaxis], gamma=gamma) + + +def get_weights_from_vep(V): + # max norm + return np.sqrt(np.max(np.abs(V), axis=1)) + + +def get_regions(): + # load the results, keep those below a certain p-value + results = pd.read_csv(snakemake.input.results_tsv, sep='\t', comment='#') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern ] + pvcols_lrt = ['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + # get genes below threshold + genes = [results.gene[results[k] < 1e-7].values for k in pvcols_score + pvcols_lrt ] + genes = np.unique(np.concatenate(genes)) + + if len(genes) == 0: + return None + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + regions = regions.set_index('name').loc[genes] + + regions = regions.join(results.set_index('gene'), how='left').reset_index() + return regions +# end: kernel parameters / functions + + +# function to scale the kernel so that sum(diag) == N +N = Y.shape[0] +def scale_G(G): + + # calculate the scaling factor + sv = np.linalg.svd(G, full_matrices=False, compute_uv=False) + scaling_factor = np.sqrt(N/np.sum(sv**2)) + + return G * scaling_factor + + +def rollmean(x): + # returns the means of neighboring elements in an array + return (x[1:]+x[:-1])*0.5 + + +def h2_grid(low_log10=-8, high_log10=-2, left_len=1000, right_len=999): + # get the h2-grid to integrate over + grid = np.concatenate([np.zeros(1,), 10**(np.linspace(low_log10,high_log10,num=left_len, endpoint=False)), np.linspace(10**high_log10, 0.999999, num=right_len)]) + return grid, np.diff(grid) + + +def eval_nll_grid(model=None, dof=None, **kwargs): + ''' + + Evaluates the negative log likelihood on a grid of h2-values for a given model + + returns a dictionary: + + grid: h2-values + step: step size + nll: negative log likelihood values at the h2-values + sigma2: estimate for the environmental residual variance at the h2-values + integral: integral of the likelihood scaled by the maximum (for debugging) + normalized: normalized likelihood evaluated at the h2-values, i.e. the posterior + + ''' + Y = model.y.reshape((-1,1)) + + assert Y.shape[0] == len(model.y), "only works for single phenotype" + + + evalgrid, step = h2_grid(**kwargs) + + nll = np.zeros_like(evalgrid) + sigma2 = np.zeros_like(evalgrid) + + for i, val in enumerate(evalgrid): + res = model.nLLeval(val, dof=dof) + nll[i] = res['nLL'] + try: + sigma2[i] = res['sigma2'] + except KeyError: + sigma2[i] = np.nan + + scaled = np.exp(-1*nll + np.min(nll)) # scaled by the maximum + integral = trapezoid(scaled, evalgrid) # integral of scaled values + normalized = scaled / integral # normalized values (scaling cancels out) + + return {'grid': evalgrid, 'step': step, 'nll': nll, 'sigma2': sigma2, 'integral':integral, 'normalized': normalized} + + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len(snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.mac_report, snakemake.input.ensembl_vep_tsv) + +# get the top hits +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + sys.exit(0) + +# where we store the results +stats = [] +i_gene = 0 + +# dict with paths to input files +vep_h5 = {chrom: {'plus': p, 'minus': m} for chrom, (p, m) in zip(snakemake.params.ids, zip(snakemake.input.h5_rbp_plus, snakemake.input.h5_rbp_minus))} +vep_bed = {chrom: {'plus': p, 'minus': m} for chrom, (p, m) in zip(snakemake.params.ids, zip(snakemake.input.bed_rbp_plus, snakemake.input.bed_rbp_minus))} + + +# enter the chromosome loop: +timer = Timer() +for i, (chromosome, bed, mac_report, vep_tsv) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + # get variants that pass MAF threshold: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + # function to identify protein LOF variants + is_plof = get_plof_id_func(vep_tsv) + + # set up local collapsing + # collapser = LocalCollapsing(distance_threshold=51.) + + for strand in ['plus', 'minus']: + + # set up the regions to loop over for the chromosome + chromosome_id = chromosome.replace('chr', '') + + regions = regions_all[(regions_all.chrom == chromosome_id) & (regions_all.strand == strand)] + + if len(regions) == 0: + continue + + # get variants that pass variant effect prediction threshold: + vep_vids, vep_mask, labels = vep_filter(vep_h5[chromosome][strand], vep_bed[chromosome][strand]) + + # combine + filter_vids_chromosome = intersect_ids(vep_vids, filter_vids) + + # initialize the vep loader + veploader = Hdf5Loader(vep_bed[chromosome][strand], vep_h5[chromosome][strand], 'diffscore', from_janggu=True) + veploader.update_variants(filter_vids_chromosome) + veploader.set_mask(vep_mask) + + # set up the variant loader (rbp variants) for the chromosome + strand + plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) + plinkloader.update_variants(veploader.get_vids()) + plinkloader.update_individuals(covariatesloader.get_iids()) + + + # set up the genotype + vep loading function + def get_rbp(interval): + + temp_genotypes, temp_vids, pos = plinkloader.genotypes_by_region(interval, return_pos=True) + + if temp_genotypes is None: + raise GotNone + + nanmean = np.nanmean(temp_genotypes, axis=0) + ncarrier = np.nansum(np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes), axis=1) >= 1) # calculated differently here because alleles can be flipped! + + ncarrier = np.nansum(temp_genotypes, axis=0) + ncarrier = np.minimum(ncarrier, temp_genotypes.shape[0] - ncarrier).astype(int) + + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) + + # deepripe variant effect predictions (single RBP) + V1 = veploader.anno_by_id(temp_vids) + + weights = get_weights_from_vep(V1) + + S = get_simil_from_pos(pos) + S *= get_simil_from_vep(V1) # Schur product of two positive definite matrices is positive definite + + cummac = mac_report.loc[temp_vids].Minor + + V1 = pd.DataFrame(V1, columns=labels)[sorted(labels)] + + return G1, temp_vids, weights, S, ncarrier, cummac, pos, V1 + + + # set up the test-function for a single gene + def test_gene(interval, seed): + + interval = interval.to_dict() + + pval_dict = {} + pval_dict['gene'] = interval['name'] + called = [] + + def pv_score(GV): + pv = null_model_score.pv_alt_model(GV) + if pv < 0.: + pv = null_model_score.pv_alt_model(GV, method='saddle') + return pv + + def call_score(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + pval_dict[name] = {} + # single-marker p-values + pval_dict[name]['pv_score'] = np.array([pv_score(GV[:, i, np.newaxis]) for i in range(GV.shape[1])]) + + # single-marker coefficients + beta = [ null_model_score.coef(GV[:,i,np.newaxis]) for i in range(GV.shape[1]) ] + pval_dict[name]['beta'] = np.array([x['beta'][0,0] for x in beta]) + pval_dict[name]['betaSd'] = np.array([np.sqrt(x['var_beta'][0,0]) for x in beta]) + if vids is not None: + pval_dict[name]['vid'] = vids + + def call_lrt(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + + GV_scaled = scale_G(GV) + + # get gene parameters, test statistics and and single-marker regression weights + lik = null_model_lrt.altmodel(GV_scaled) + pval_dict[name]['nLL'] = lik['nLL'] + pval_dict[name]['sigma2'] = lik['sigma2'] + pval_dict[name]['lrtstat'] = lik['stat'] + pval_dict[name]['h2'] = lik['h2'] + + if not lik['alteqnull']: + ## start: get confidence intervals for h2 + try: + result = eval_nll_grid(model=null_model_lrt.model1, dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + + except ValueError as e: + # likely an interpolation-error from the quantile function because h2 is very low + + print('Error when trying to calculate h2-confidence intervals for gene {}, kernel {}. Retrying with higher resolution for low h2 values'.format(interval['name'], name)) + print(lik) + + result = eval_nll_grid(model=null_model_lrt.model1, low_log10=-9, high_log10=-2, left_len=2000, right_len=999 , dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + try: + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + except ValueError: + pval_dict[name]['h2_0.025'] = np.array([np.nan]) + pval_dict[name]['h2_0.975'] = np.array([np.nan]) + + # save values + pval_dict[name]['h2_grid'] = result['grid'] + pval_dict[name]['nLL_grid'] = result['nll'] + pval_dict[name]['sigma2_grid'] = result['sigma2'] + pval_dict[name]['normalized_likelihood'] = result['normalized'] + + ## end: get confidence intervals for h2 + + if vids is not None: + pval_dict[name]['vid'] = vids + + # load rbp variants + G, vids, weights, S, ncarrier, cummac, pos, V = get_rbp(interval) + keep = ~is_plof(vids) + + # these are common to all kernels + pval_dict['vid'] = vids + pval_dict['weights'] = weights ** 2 # get_weights_from_vep returns the square root! + pval_dict['MAC'] = cummac + pval_dict['nCarrier'] = ncarrier + pval_dict['not_LOF'] = keep + for col in V.columns: + pval_dict[col] = V[col].values.astype(np.float32) + + # # cholesky + if G.shape[1] > 1: + L, flag1 = get_cholesky(S) + else: + L, flag1 = np.eye(G.shape[1]), -1 + + # # do a score test (cholesky, and weighted cholesky) + # GL = G.dot(L) + # call_score(GL, 'lincholesky') + GWL = G.dot(np.diag(weights, k=0)).dot(L) + # call_score(GWL, 'linwcholesky') + + # single-variant p-values: + call_score(G, 'linw', vids=vids) # get's the single-variant p-values and effect-sizes -> no weighting! + + call_lrt(GWL, 'linwcholesky') # performs weighting and LRT (-> calculate h2) + + # sanity checks + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + + if np.any(keep): + + if keep.sum() == 1: + # only single SNP is not LOF + # GL = G[:, keep] # actually just the linear kernel + GWL = G[:, keep].dot(np.diag(weights[keep], k=0)) # actually just the linear weighted kernel + else: + L, flag2 = get_cholesky(S[np.ix_(keep, keep)]) + GWL = G[:, keep].dot(np.diag(weights[keep], k=0)).dot(L) + + # call_score(G1, 'lincholesky_notLOF') + # call_score(GWL, 'linwcholesky_notLOF') + + # call_lrt(GL, 'lincholesky_notLOF') + # call_lrt(GWL, 'linwcholesky_notLOF') + # G1 = G[:,keep].dot(np.diag(weights[keep], k=0)) + # call_score(G1, 'linw_notLOF', vids=vids[keep]) + + call_lrt(GWL, 'linwcholesky_notLOF') + + return pval_dict, called + + + logging.info('loaders for chromosome {}, strand "{}" initialized in {:.1f} seconds.'.format(chromosome, strand, timer.check())) + # run tests for all genes on the chromosome / strand + for _, region in regions.iterrows(): + + try: + gene_stats, called = test_gene(region, i_gene) + except GotNone: + continue + + # build the single-variant datafame + single_var_columns = ['gene', 'vid', 'weights', 'MAC', 'nCarrier', 'not_LOF' ] + sorted(snakemake.params.rbp_of_interest) + sv_df = pd.DataFrame.from_dict({k: gene_stats[k] for k in single_var_columns}) + + sv_df['pv_score'] = gene_stats['linw']['pv_score'] # single-variant p-values estimated independently + sv_df['beta'] = gene_stats['linw']['beta'] # single-variant coeffcients estimated independently *without* weighting + sv_df['betaSd'] = gene_stats['linw']['betaSd'] # standard errors for the single-variant coefficients estimated independently *without* weighting + sv_df['pheno'] = snakemake.params.phenotype + + out_dir = os.path.join(snakemake.params.out_dir_stats, region['name']) + os.makedirs(out_dir, exist_ok=True) + + sv_df.to_csv(out_dir + '/variants.tsv.gz', sep='\t', index=False) + + for k in called: + + if k == 'variant_pvals': + continue + + results_dict = gene_stats[k] + + df_cols = ['pv_score', 'beta', 'betaSd', 'vid'] # parts of the dict that have lenght > 1 + + df = pd.DataFrame.from_dict(data={k: results_dict[k] for k in df_cols if k in results_dict}) + + if df.shape[0] > 0: + df['gene'] = gene_stats['gene'] + df['pheno'] = snakemake.params.phenotype + df.to_csv(out_dir + '/{}.tsv.gz'.format(k), sep='\t', index=False) + + # other cols ['nLL', 'sigma2', 'lrtstat', 'h2', ...] + other_cols = {k: v for k, v in results_dict.items() if k not in df_cols} + other_cols['gene'] = gene_stats['gene'] + other_cols['pheno'] = snakemake.params.phenotype + + pickle.dump(other_cols, open(out_dir + '/{}_stats.pkl'.format(k), 'wb')) + + i_gene += 1 + logging.info('tested {} genes...'.format(i_gene)) + + timer.reset() diff --git a/script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py b/script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py new file mode 100644 index 0000000..229e42b --- /dev/null +++ b/script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py @@ -0,0 +1,493 @@ + +import os +import sys +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np +import pickle +import gzip + +# needed to calculate Bayesian h2 confidence intervals +from scipy.integrate import trapezoid +from scipy.interpolate import interp1d + +# seak imports +from seak.data_loaders import intersect_ids, EnsemblVEPLoader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.kernels import LocalCollapsing +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK + +import gzip + +from pysnptools.snpreader import Bed + +from util.association import BurdenLoaderHDF5 +from util import Timer + + +class GotNone(Exception): + pass + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) +null_model_lrt = LRTnoK(X, Y) + + +# set up function to filter variants: +def maf_filter(mac_report): + + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool)) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref)] + + # this has already been done in filter_variants.py + # load the variant annotation, keep only variants in high-confidece regions + # anno = pd.read_csv(anno_tsv, sep='\t', usecols=['Name', 'hiconf_reg']) + # vids_highconf = anno.Name[anno.hiconf_reg.astype(bool).values] + # vids = np.intersect1d(vids, vids_highconf) + + return mac_report.set_index('SNP').loc[vids] + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + +def get_regions(): + # load the results, keep those below a certain p-value + results = pd.read_csv(snakemake.input.results_tsv, sep='\t') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern ] + pvcols_lrt = ['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + # get genes below threshold + genes = [results.gene[results[k] < 1e-7].values for k in pvcols_score + pvcols_lrt ] + genes = np.unique(np.concatenate(genes)) + + if len(genes) == 0: + return None + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + regions = regions.set_index('name').loc[genes] + + regions = regions.join(results.set_index('gene'), how='left').reset_index() + return regions + + +# function to scale the kernel so that sum(diag) == N +N = Y.shape[0] +def scale_G(G): + + # calculate the scaling factor + sv = np.linalg.svd(G, full_matrices=False, compute_uv=False) + scaling_factor = np.sqrt(N/np.sum(sv**2)) + + return G * scaling_factor + + +def rollmean(x): + # returns the means of neighboring elements in an array + return (x[1:]+x[:-1])*0.5 + + +def h2_grid(low_log10=-8, high_log10=-2, left_len=1000, right_len=999): + # get the h2-grid to integrate over + grid = np.concatenate([np.zeros(1,), 10**(np.linspace(low_log10,high_log10,num=left_len, endpoint=False)), np.linspace(10**high_log10, 0.999999, num=right_len)]) + return grid, np.diff(grid) + + +def eval_nll_grid(model=None, dof=None, **kwargs): + ''' + + Evaluates the negative log likelihood on a grid of h2-values for a given model + + returns a dictionary: + + grid: h2-values + step: step size + nll: negative log likelihood values at the h2-values + sigma2: estimate for the environmental residual variance at the h2-values + integral: integral of the likelihood scaled by the maximum (for debugging) + normalized: normalized likelihood evaluated at the h2-values, i.e. the posterior + + ''' + Y = model.y.reshape((-1,1)) + + assert Y.shape[0] == len(model.y), "only works for single phenotype" + + + evalgrid, step = h2_grid(**kwargs) + + nll = np.zeros_like(evalgrid) + sigma2 = np.zeros_like(evalgrid) + + for i, val in enumerate(evalgrid): + res = model.nLLeval(val, dof=dof) + nll[i] = res['nLL'] + try: + sigma2[i] = res['sigma2'] + except KeyError: + sigma2[i] = np.nan + + scaled = np.exp(-1*nll + np.min(nll)) # scaled by the maximum + integral = trapezoid(scaled, evalgrid) # integral of scaled values + normalized = scaled / integral # normalized values (scaling cancels out) + + return {'grid': evalgrid, 'step': step, 'nll': nll, 'sigma2': sigma2, 'integral':integral, 'normalized': normalized} + + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len(snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.vep_tsv, snakemake.input.mac_report, snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof) + + +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + sys.exit(0) + +logging.info('About to evaluate variants in {} genes'.format(len(regions_all))) + +i_gene = 0 +# where we store the results +stats = [] +# enter the chromosome loop: +timer = Timer() +for i, (chromosome, bed, vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + # set up the ensembl vep loader for the chromosome + ensemblvepdf = pd.read_csv(vep_tsv, + sep='\t', + usecols=['Uploaded_variation', 'Location', 'Gene', 'pos_standardized', 'impact', 'ref', 'alt', 'cosine_similarity'], + index_col='Uploaded_variation') + + # get set of variants for the chromosome: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + + # filter by MAF + keep = intersect_ids(filter_vids, ensemblvepdf.index.values) + ensemblvepdf = ensemblvepdf.loc[keep] + ensemblvepdf.reset_index(inplace=True) + + # filter by impact: + ensemblvepdf = ensemblvepdf[ensemblvepdf.groupby(['Gene' ,'pos_standardized'])['impact'].transform(np.max) >= snakemake.params.min_impact ] + + # initialize the loader + eveploader = EnsemblVEPLoader(ensemblvepdf['Uploaded_variation'], ensemblvepdf['Location'], ensemblvepdf['Gene'], data = ensemblvepdf[['pos_standardized' ,'impact', 'ref', 'alt', 'cosine_similarity']].values) + + # set up the regions to loop over for the chromosome + regions = regions_all.copy() + + # discard all genes for which we don't have annotations + regions['gene'] = regions.name.str.split('_', expand=True)[0] + regions.set_index('gene', inplace=True) + genes = intersect_ids(np.unique(regions.index.values), np.unique(eveploader.pos_df.gene)) + regions = regions.loc[genes].reset_index() + regions = regions.sort_values(['chrom' ,'start' ,'end']) + + # set up the variant loader (missense variants) for the chromosome + plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) + plinkloader.update_variants(eveploader.get_vids()) + plinkloader.update_individuals(covariatesloader.get_iids()) + + # set up the protein LOF burden loader + bloader_lof = BurdenLoaderHDF5(h5_lof, iid_lof, gid_lof) + bloader_lof.update_individuals(covariatesloader.get_iids()) + + # set up local collapsing + collapser = LocalCollapsing(distance_threshold=1.) + + # set up the missense genotype + vep loading function + def get_missense(interval): + + try: + V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[0]) + except KeyError: + raise GotNone + + if V1.index.empty: + raise GotNone + + vids = V1.index.get_level_values('vid') + V1 = V1.droplevel(['gene']) + + temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False) + + ncarrier = np.nansum(temp_genotypes > 0, axis=0) + + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) + + cummac = mac_report.loc[vids].Minor + + # polyphen / sift impact + weights = V1[1].values.astype(np.float64) + + # "standardized" positions -> codon start positions + pos = V1[0].values.astype(np.int32) + + ref = V1[2].values.astype(str) + alt = V1[3].values.astype(str) + cosine_similarity = V1[4].values.astype(np.float64) + + return G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity + + # set up the protein-LOF loading function + + def get_plof(interval): + + try: + G2 = bloader_lof.genotypes_by_id(interval['name']).astype(np.float64) + except KeyError: + G2 = None + + return G2 + + # set up the test-function for a single gene + def test_gene(interval, seed): + + pval_dict = {} + pval_dict['gene'] = interval['name'] + called = [] + + def pv_score(GV): + pv = null_model_score.pv_alt_model(GV) + if pv < 0.: + pv = null_model_score.pv_alt_model(GV, method='saddle') + return pv + + def call_score(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + pval_dict[name] = {} + # single-marker p-values + pval_dict[name]['pv_score'] = np.array([pv_score(GV[:,i,np.newaxis]) for i in range(GV.shape[1])]) + + # single-marker coefficients + beta = [ null_model_score.coef(GV[:,i,np.newaxis]) for i in range(GV.shape[1]) ] + pval_dict[name]['beta'] = np.array([x['beta'][0,0] for x in beta]) + pval_dict[name]['betaSd'] = np.array([np.sqrt(x['var_beta'][0,0]) for x in beta]) + if vids is not None: + pval_dict[name]['vid'] = vids + + + def call_lrt(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + + GV_scaled = scale_G(GV) + + # get gene parameters, test statistics and and single-marker regression weights + lik = null_model_lrt.altmodel(GV_scaled) + pval_dict[name]['nLL'] = lik['nLL'] + pval_dict[name]['sigma2'] = lik['sigma2'] + pval_dict[name]['lrtstat'] = lik['stat'] + pval_dict[name]['h2'] = lik['h2'] + + if not lik['alteqnull']: + ## start: get confidence intervals for h2 + try: + result = eval_nll_grid(model=null_model_lrt.model1, dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + + except ValueError as e: + # likely an interpolation-error from the quantile function because h2 is very low + + print('Error when trying to calculate h2-confidence intervals for gene {}, kernel {}. Retrying with higher resolution for low h2 values'.format(interval['name'], name)) + print(lik) + + result = eval_nll_grid(model=null_model_lrt.model1, low_log10=-9, high_log10=-2, left_len=2000, right_len=999 , dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + try: + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + except ValueError: + pval_dict[name]['h2_0.025'] = np.array([np.nan]) + pval_dict[name]['h2_0.975'] = np.array([np.nan]) + + # save values + pval_dict[name]['h2_grid'] = result['grid'] + pval_dict[name]['nLL_grid'] = result['nll'] + pval_dict[name]['sigma2_grid'] = result['sigma2'] + pval_dict[name]['normalized_likelihood'] = result['normalized'] + + ## end: get confidence intervals for h2 + + if vids is not None: + pval_dict[name]['vid'] = vids + + + # load missense variants + G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity = get_missense(interval) + + # these are common to all kernels + pval_dict['vid'] = vids + pval_dict['weights'] = weights + pval_dict['MAC'] = cummac + pval_dict['nCarrier'] = ncarrier + pval_dict['pos'] = pos + pval_dict['ref'] = ref + pval_dict['alt'] = alt + pval_dict['cosine_similarity'] = cosine_similarity + + # single-variant p-values: + call_score(G1, 'variant_pvals') # score-pvalues and coeficients estimated independently + + # sanity checks + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + + # do a score burden test (max weighted), this is different than the baseline! + G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True) + + # burden + call_score(G1_burden, 'linwb') + call_lrt(G1_burden, 'linwb') + + # perform local collapsing with weights + if G1.shape[1] > 1: + G1, clusters = collapser.collapse(G1, pos, np.sqrt(weights)) # will lead to a crash when there are negative weights... + else: + G1 = G1.dot(np.diag(np.sqrt(weights), k=0)) + clusters = np.array([0]) + + pval_dict['cluster_id'] = clusters + + # get the single-cluster p-values + stats + call_score(G1, 'linwcollapsed') + call_lrt(G1, 'linwcollapsed') + pval_dict['linwcollapsed']['cluster_id'] = sorted(set(clusters)) + + # load plof burden + G2 = get_plof(interval) + + if G2 is not None: + + call_score(G2, 'LOF') + call_lrt(G2, 'LOF') + + # merged (single variable) + G1_burden_mrg = np.maximum(G2, G1_burden) + call_score(G1_burden_mrg, 'linwb_mrgLOF') + call_lrt(G1_burden_mrg, 'linwb_mrgLOF') + + # concatenated + call_score(np.concatenate([G1, G2], axis=1), 'linwcollapsed_cLOF') + call_lrt(np.concatenate([G1, G2], axis=1), 'linwcollapsed_cLOF') + pval_dict['linwcollapsed_cLOF']['cluster_id'] = sorted(set(clusters)) + [-1] # -1 indicates the LOF cluster + + return pval_dict, called + + logging.info('loaders for chromosome {} initialized in {:.1f} seconds.'.format(chromosome, timer.check())) + # run tests for all genes on the chromosome + for _, region in regions.iterrows(): + + try: + gene_stats, called = test_gene(region, i_gene) + except GotNone: + continue + + + # build the single-variant datafame + single_var_columns = ['gene','vid','weights','MAC','nCarrier','pos','ref','alt','cosine_similarity','cluster_id'] + sv_df = pd.DataFrame.from_dict({k: gene_stats[k] for k in single_var_columns}) + + sv_df['pv_score'] = gene_stats['variant_pvals']['pv_score'] # single-variant p-values estimated independently + sv_df['beta'] = gene_stats['variant_pvals']['beta'] # single-variant coeffcients estimated independently *without* weighting + sv_df['betaSd'] = gene_stats['variant_pvals']['betaSd'] # standard errors for the single-variant coefficients estimated independently *without* weighting + sv_df['pheno'] = snakemake.params.phenotype + + out_dir = os.path.join(snakemake.params.out_dir_stats, region['name']) + os.makedirs(out_dir, exist_ok=True) + + sv_df.to_csv(out_dir + '/variants.tsv.gz', sep='\t', index=False) + + + for k in called: + + if k == 'variant_pvals': + continue + + results_dict = gene_stats[k] + + df_cols = ['cluster_id','pv_score', 'beta', 'betaSd', 'vid'] # parts of the dict that have lenghth == (number of variants) + + df = pd.DataFrame.from_dict(data={k: results_dict[k] for k in df_cols if k in results_dict}) + df['gene'] = gene_stats['gene'] + df['pheno'] = snakemake.params.phenotype + df.to_csv(out_dir + '/{}.tsv.gz'.format(k), sep='\t', index=False) + + # other cols ['nLL', 'sigma2', 'lrtstat', 'h2', 'h2_grid', 'nLL_grid', 'sigma2_grid', 'normalized_likelihood', 'h2_0.025', 'h2_0.975'] + other_cols = {k: v for k, v in results_dict.items() if k not in df_cols } + other_cols['gene'] = gene_stats['gene'] + other_cols['pheno'] = snakemake.params.phenotype + + pickle.dump(other_cols, open(out_dir + '/{}_stats.pkl'.format(k), 'wb')) + + + i_gene += 1 + logging.info('tested {} genes...'.format(i_gene)) + + timer.reset() + diff --git a/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py b/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py new file mode 100644 index 0000000..93d11ff --- /dev/null +++ b/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py @@ -0,0 +1,511 @@ +import os +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging + +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np +import gzip + +# needed to calculate Bayesian h2 confidence intervals +from scipy.integrate import trapezoid +from scipy.interpolate import interp1d + +# seak imports +from seak.data_loaders import intersect_ids, EnsemblVEPLoader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK, pv_chi2mixture, fit_chi2mixture + +from pysnptools.snpreader import Bed +import pickle +import sys + +from util.association import BurdenLoaderHDF5 +from util import Timer + + +class GotNone(Exception): + pass + + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) +null_model_lrt = LRTnoK(X, Y) + + +# set up function to filter variants: +def maf_filter(mac_report): + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool)) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool))] + + # this has already been done in filter_variants.py + # load the variant annotation, keep only variants in high-confidece regions + # anno = pd.read_csv(anno_tsv, sep='\t', usecols=['Name', 'hiconf_reg']) + # vids_highconf = anno.Name[anno.hiconf_reg.astype(bool).values] + # vids = np.intersect1d(vids, vids_highconf) + + return mac_report.set_index('SNP').loc[vids] + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + +def get_regions(): + # load the results, keep those below a certain p-value + results = pd.read_csv(snakemake.input.results_tsv, sep='\t') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern ] + pvcols_lrt = ['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + # get genes below threshold + genes = [results.gene[results[k] < 1e-7].values for k in pvcols_score + pvcols_lrt ] + genes = np.unique(np.concatenate(genes)) + + if len(genes) == 0: + return None + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + regions = regions.set_index('name').loc[genes] + + regions = regions.join(results.set_index('gene'), how='left').reset_index() + return regions + + +# function to scale the kernel so that sum(diag) == N +N = Y.shape[0] +def scale_G(G): + + # calculate the scaling factor + sv = np.linalg.svd(G, full_matrices=False, compute_uv=False) + scaling_factor = np.sqrt(N/np.sum(sv**2)) + + return G * scaling_factor + + +def rollmean(x): + # returns the means of neighboring elements in an array + return (x[1:]+x[:-1])*0.5 + + +def h2_grid(low_log10=-8, high_log10=-2, left_len=1000, right_len=999): + # get the h2-grid to integrate over + grid = np.concatenate([np.zeros(1,), 10**(np.linspace(low_log10,high_log10,num=left_len, endpoint=False)), np.linspace(10**high_log10, 0.999999, num=right_len)]) + return grid, np.diff(grid) + + +def eval_nll_grid(model=None, dof=None, **kwargs): + ''' + + Evaluates the negative log likelihood on a grid of h2-values for a given model + + returns a dictionary: + + grid: h2-values + step: step size + nll: negative log likelihood values at the h2-values + sigma2: estimate for the environmental residual variance at the h2-values + integral: integral of the likelihood scaled by the maximum (for debugging) + normalized: normalized likelihood evaluated at the h2-values, i.e. the posterior + + ''' + Y = model.y.reshape((-1,1)) + + assert Y.shape[0] == len(model.y), "only works for single phenotype" + + + evalgrid, step = h2_grid(**kwargs) + + nll = np.zeros_like(evalgrid) + sigma2 = np.zeros_like(evalgrid) + + for i, val in enumerate(evalgrid): + res = model.nLLeval(val, dof=dof) + nll[i] = res['nLL'] + try: + sigma2[i] = res['sigma2'] + except KeyError: + sigma2[i] = np.nan + + scaled = np.exp(-1*nll + np.min(nll)) # scaled by the maximum + integral = trapezoid(scaled, evalgrid) # integral of scaled values + normalized = scaled / integral # normalized values (scaling cancels out) + + return {'grid': evalgrid, 'step': step, 'nll': nll, 'sigma2': sigma2, 'integral':integral, 'normalized': normalized} + + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len (snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.vep_tsv, snakemake.input.ensembl_vep_tsv, snakemake.input.mac_report, snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof) + +# get the top hits +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + sys.exit(0) + +# where we store the results +stats = [] +i_gene = 0 +# enter the chromosome loop: + +timer = Timer() +for i, (chromosome, bed, vep_tsv, ensembl_vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + # set up the ensembl vep loader for the chromosome + spliceaidf = pd.read_csv(vep_tsv, + sep='\t', + usecols=['name', 'chrom', 'end', 'gene', 'max_effect', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL'], + index_col='name') + + # get set of variants for the chromosome: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + # filter by MAF + keep = intersect_ids(filter_vids, spliceaidf.index.values) + spliceaidf = spliceaidf.loc[keep] + spliceaidf.reset_index(inplace=True) + + # filter by impact: + spliceaidf = spliceaidf[spliceaidf.max_effect >= snakemake.params.min_impact] + + # set up the regions to loop over for the chromosome + regions = regions_all.copy() + + # discard all genes for which we don't have annotations + gene_ids = regions.name.str.split('_', expand=True) # table with two columns, ensembl-id and gene-name + regions['gene'] = gene_ids[1] # this is the gene name + regions['ensembl_id'] = gene_ids[0] + regions.set_index('gene', inplace=True) + genes = intersect_ids(np.unique(regions.index.values), np.unique(spliceaidf.gene)) # intersection of gene names + regions = regions.loc[genes].reset_index() # subsetting + regions = regions.sort_values(['chrom', 'start', 'end']) + + # check if the variants are protein LOF variants, load the protein LOF variants: + ensemblvepdf = pd.read_csv(ensembl_vep_tsv, sep='\t', usecols=['Uploaded_variation', 'Gene']) + + # this column will contain the gene names: + genes = intersect_ids(np.unique(ensemblvepdf.Gene.values), regions.ensembl_id) # intersection of ensembl gene ids + ensemblvepdf = ensemblvepdf.set_index('Gene').loc[genes].reset_index() + ensemblvepdf['gene'] = gene_ids.set_index(0).loc[ensemblvepdf.Gene.values].values + + # set up the merge + ensemblvepdf.drop(columns=['Gene'], inplace=True) # get rid of the ensembl ids, will use gene names instead + ensemblvepdf.rename(columns={'Uploaded_variation': 'name'}, inplace=True) + ensemblvepdf['is_plof'] = 1. + + ensemblvepdf = ensemblvepdf[~ensemblvepdf.duplicated()] # if multiple ensembl gene ids map to the same gene names, this prevents a crash. + + # we add a column to the dataframe indicating whether the variant is already annotated as protein loss of function by the ensembl variant effect predictor + spliceaidf = pd.merge(spliceaidf, ensemblvepdf, on=['name', 'gene'], how='left', validate='one_to_one') + spliceaidf['is_plof'] = spliceaidf['is_plof'].fillna(0.).astype(bool) + + # initialize the loader + # Note: we use "end" here because the start + 1 = end, and we need 1-based coordiantes (this would break if we had indels) + eveploader = EnsemblVEPLoader(spliceaidf['name'], spliceaidf['chrom'].astype('str') + ':' + spliceaidf['end'].astype('str'), spliceaidf['gene'], data=spliceaidf[['max_effect', 'is_plof', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']].values) + + # set up the variant loader (splice variants) for the chromosome + plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) + plinkloader.update_variants(eveploader.get_vids()) + plinkloader.update_individuals(covariatesloader.get_iids()) + + # set up the protein LOF burden loader + bloader_lof = BurdenLoaderHDF5(h5_lof, iid_lof, gid_lof) + bloader_lof.update_individuals(covariatesloader.get_iids()) + + + # set up the splice genotype + vep loading function + def get_splice(interval): + + try: + V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[1]) + except KeyError: + raise GotNone + + if V1.index.empty: + raise GotNone + + vids = V1.index.get_level_values('vid') + V1 = V1.droplevel(['gene']) + + temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False) + + ncarrier = np.nansum(temp_genotypes > 0, axis=0) + + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) + + cummac = mac_report.loc[vids].Minor + + # spliceAI max score + weights = V1[0].values.astype(np.float64) + + is_plof = V1[1].values.astype(bool) + + splice_preds_all = V1.iloc[:,2:] + splice_preds_all.columns = ['DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL'] + + # "standardized" positions -> codon start positions + # pos = V1[0].values.astype(np.int32) + + return G1, vids, weights, ncarrier, cummac, is_plof, splice_preds_all + + + # set up the protein-LOF loading function + + def get_plof(interval): + + try: + G2 = bloader_lof.genotypes_by_id(interval['name']).astype(np.float) + except KeyError: + G2 = None + + return G2 + + + # set up the test-function for a single gene + def test_gene(interval, seed): + + pval_dict = {} + pval_dict['gene'] = interval['name'] + called = [] + + def pv_score(GV): + pv = null_model_score.pv_alt_model(GV) + if pv < 0.: + pv = null_model_score.pv_alt_model(GV, method='saddle') + return pv + + def call_score(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + pval_dict[name] = {} + # single-marker p-values + pval_dict[name]['pv_score'] = np.array([pv_score(GV[:,i,np.newaxis]) for i in range(GV.shape[1])]) + + # single-marker coefficients + beta = [ null_model_score.coef(GV[:,i,np.newaxis]) for i in range(GV.shape[1]) ] + pval_dict[name]['beta'] = np.array([x['beta'][0,0] for x in beta]) + pval_dict[name]['betaSd'] = np.array([np.sqrt(x['var_beta'][0,0]) for x in beta]) + if vids is not None: + pval_dict[name]['vid'] = vids + + + def call_lrt(GV, name, vids=None): + if name not in pval_dict: + pval_dict[name] = {} + called.append(name) + + GV_scaled = scale_G(GV) + + # get gene parameters, test statistics and and single-marker regression weights + lik = null_model_lrt.altmodel(GV_scaled) + pval_dict[name]['nLL'] = lik['nLL'] + pval_dict[name]['sigma2'] = lik['sigma2'] + pval_dict[name]['lrtstat'] = lik['stat'] + pval_dict[name]['h2'] = lik['h2'] + + + if not lik['alteqnull']: + ## start: get confidence intervals for h2 + try: + result = eval_nll_grid(model=null_model_lrt.model1, dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + + except ValueError as e: + # likely an interpolation-error from the quantile function because h2 is very low + + print('Error when trying to calculate h2-confidence intervals for gene {}, kernel {}. Retrying with higher resolution for low h2 values'.format(interval['name'], name)) + print(lik) + + result = eval_nll_grid(model=null_model_lrt.model1, low_log10=-9, high_log10=-2, left_len=2000, right_len=999 , dof=None) + + # approximate the cdf + cd = np.cumsum(rollmean(result['normalized'])*result['step']) # cumulative + + try: + # approximate the quantile function + qfun = interp1d(cd, rollmean(result['grid'])) + pval_dict[name]['h2_0.025'] = qfun(0.025) + pval_dict[name]['h2_0.975'] = qfun(0.975) + except ValueError: + pval_dict[name]['h2_0.025'] = np.array([np.nan]) + pval_dict[name]['h2_0.975'] = np.array([np.nan]) + + # save values + pval_dict[name]['h2_grid'] = result['grid'] + pval_dict[name]['nLL_grid'] = result['nll'] + pval_dict[name]['sigma2_grid'] = result['sigma2'] + pval_dict[name]['normalized_likelihood'] = result['normalized'] + + ## end: get confidence intervals for h2 + + + if vids is not None: + pval_dict[name]['vid'] = vids + + + # load splice variants + G1, vids, weights, ncarrier, cummac, is_plof, splice_preds_all = get_splice(interval) + # keep indicates which variants are NOT "protein LOF" variants, i.e. variants already identified by the ensembl VEP + keep = ~is_plof + + # these are common to all kernels + pval_dict['vid'] = vids + pval_dict['weights'] = weights + pval_dict['MAC'] = cummac + pval_dict['nCarrier'] = ncarrier + pval_dict['not_LOF'] = keep + for col in splice_preds_all.columns: + pval_dict[col] = splice_preds_all[col].values.astype(np.float32) + + # single-variant p-values: + call_score(G1, 'variant_pvals') # single variant p-values and coefficients estimated independently + call_lrt(G1.dot(np.diag(np.sqrt(weights), k=0)), 'variant_pvals') # single variant coefficients estimated *jointly* after weighting + + # sanity checks + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + + # do a score burden test (max weighted), this is different than the baseline! + G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True) + call_score(G1_burden, 'linwb') + call_lrt(G1_burden, 'linwb') + + # linear weighted kernel + G1 = G1.dot(np.diag(np.sqrt(weights), k=0)) + + # do a score test (linear weighted) + call_score(G1, 'linw', vids=vids) + call_lrt(G1, 'linw') + + # load plof burden + G2 = get_plof(interval) + + if G2 is not None: + + call_score(G2, 'LOF') + call_lrt(G2, 'LOF') + + if np.any(keep): + + # merged (single variable) + G1_burden_mrg = np.maximum(G2, G1_burden) + + call_score(G1_burden_mrg, 'linwb_mrgLOF') + call_lrt(G1_burden_mrg, 'linwb_mrgLOF') + + # concatenated ( >= 2 variables) + # we separate out the ones that are already part of the protein LOF variants! + + G1 = np.concatenate([G1[:, keep], G2], axis=1) + call_score(G1, 'linw_cLOF', vids=np.array(vids[keep].tolist() + [-1])) + call_lrt(G1, 'linw_cLOF') + else: + logging.info('All Splice-AI variants for gene {} where already identified by the Ensembl variant effect predictor'.format(interval['name'])) + + + return pval_dict, called + + + logging.info('loaders for chromosome {} initialized in {:.1f} seconds.'.format(chromosome, timer.check())) + # run tests for all genes on the chromosome + for _, region in regions.iterrows(): + + try: + gene_stats, called = test_gene(region, i_gene) + except GotNone: + continue + + # build the single-variant datafame + single_var_columns = ['gene', 'vid', 'weights', 'MAC', 'nCarrier', 'not_LOF', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL'] + sv_df = pd.DataFrame.from_dict({k: gene_stats[k] for k in single_var_columns}) + + sv_df['pv_score'] = gene_stats['variant_pvals']['pv_score'] # single-variant p-values estimated independently + sv_df['beta'] = gene_stats['variant_pvals']['beta'] # single-variant coeffcients estimated independently *without* weighting + sv_df['betaSd'] = gene_stats['variant_pvals']['betaSd'] # standard errors for the single-variant coefficients estimated independently *without* weighting + sv_df['pheno'] = snakemake.params.phenotype + + out_dir = os.path.join(snakemake.params.out_dir_stats, region['name']) + os.makedirs(out_dir, exist_ok=True) + + sv_df.to_csv(out_dir + '/variants.tsv.gz', sep='\t', index=False) + + for k in called: + + if k == 'variant_pvals': + continue + + results_dict = gene_stats[k] + + df_cols = ['pv_score', 'beta', 'betaSd', 'vid'] # parts of the dict that have lenght > 1 + + df = pd.DataFrame.from_dict(data={k: results_dict[k] for k in df_cols if k in results_dict}) + df['gene'] = gene_stats['gene'] + df['pheno'] = snakemake.params.phenotype + df.to_csv(out_dir + '/{}.tsv.gz'.format(k), sep='\t', index=False) + + # other cols ['nLL', 'sigma2', 'lrtstat', 'h2', 'h2_grid', 'nLL_grid', 'sigma2_grid', 'normalized_likelihood', 'h2_0.025', 'h2_0.975'] + other_cols = {k: v for k, v in results_dict.items() if k not in df_cols} + other_cols['gene'] = gene_stats['gene'] + other_cols['pheno'] = snakemake.params.phenotype + + pickle.dump(other_cols, open(out_dir + '/{}_stats.pkl'.format(k), 'wb')) + + i_gene += 1 + logging.info('tested {} genes...'.format(i_gene)) + + timer.reset() \ No newline at end of file diff --git a/snake/association.smk b/snake/association.smk index 550f3dc..c01154e 100644 --- a/snake/association.smk +++ b/snake/association.smk @@ -152,6 +152,51 @@ rule assoc_missense_localcollapsing_eval_top_hits_all: expand(rules.assoc_missense_localcollapsing_eval_top_hits.output, pheno=phenotypes.keys(), filter_highconfidence=['all']) +rule assoc_missense_localcollapsing_eval_top_hits_h2: + # calculates single-variant p-values and regression coefficients and other statistics for the most significant genes for missense variants + input: + h5_lof = expand(rules.export_plof_burden.output.h5, id = plinkfiles.getIds(), allow_missing=True), + iid_lof = expand(rules.export_plof_burden.output.iid_txt, id = plinkfiles.getIds(), allow_missing=True), + gid_lof = expand(rules.export_plof_burden.output.gene_txt, id = plinkfiles.getIds(), allow_missing=True), + covariates_tsv = config['covariates'], + phenotypes_tsv = config['phenotypes'], + bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()), + vep_tsv = expand(rules.evep_missense_proc.output.tsv_filtered, id = plinkfiles.getIds()), + mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True), + regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed, + results_tsv = rules.assoc_missense_localcollapsing.output.results_tsv, + seak_install = rules.install_seak.output + output: + out_ok = touch('work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/top_hits_h2/all.ok') + params: + kernels = ['linwcollapsed','linwcollapsed_cLOF','linwb','linwb_mrgLOF'], # genes with p < 1e-7 in one of these kernels will be analysed in detail + phenotype = lambda wc: phenotypes[ wc.pheno ], + covariate_column_names = config['covariate_column_names'], + max_maf = config['maf_cutoff'], + min_impact = config['min_impact'], + out_dir_stats = lambda wc: 'work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/top_hits_h2/'.format(filter_highconfidence=wc.filter_highconfidence, pheno=wc.pheno), + ids = plinkfiles.getIds(), + filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence] + resources: + mem_mb=get_mem_mb(10000,1.5), + time="06:00:00", + partition="hpcpu" + threads: + 1 + log: + 'logs/association/sclrt_kernels_missense_eval_top_hits_h2/{filter_highconfidence}_{pheno}.log' + conda: + '../env/seak.yml' + script: + '../script/python/assoc_sclrt_kernels_missense_eval_top_hits_h2.py' + + +rule assoc_missense_localcollapsing_eval_top_hits_h2_all: + # run above rule for all phenotypes - this rule will effectively run the entire pipeline for missense variants + input: + expand(rules.assoc_missense_localcollapsing_eval_top_hits_h2.output, pheno=phenotypes.keys(), filter_highconfidence=['all']) + + rule assoc_missense_localcollapsing_retest_top_hits: # calculates gene-specific LRT p-values for the most significant genes for missense variants input: @@ -337,6 +382,52 @@ rule assoc_spliceai_linw_eval_top_hits_all: expand(rules.assoc_spliceai_linw_eval_top_hits.output, pheno=phenotypes.keys(), filter_highconfidence=['all']) +rule assoc_spliceai_linw_eval_top_hits_h2: + # calculates single-variant p-values and regression coefficients and other statistics for the most significant genes for splice variants + input: + h5_lof = expand(rules.export_plof_burden.output.h5, id = plinkfiles.getIds(), allow_missing=True), + iid_lof = expand(rules.export_plof_burden.output.iid_txt, id = plinkfiles.getIds(), allow_missing=True), + gid_lof = expand(rules.export_plof_burden.output.gene_txt, id = plinkfiles.getIds(), allow_missing=True), + ensembl_vep_tsv = expand(rules.process_ensembl_vep_output_highimpact.output.tsv, id = plinkfiles.getIds(), allow_missing=True), + covariates_tsv = config['covariates'], + phenotypes_tsv = config['phenotypes'], + bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()), + vep_tsv = expand(rules.splice_ai_filter_and_overlap_with_genotypes.output.tsv, id = plinkfiles.getIds()), + mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True), + regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed, + results_tsv = rules.assoc_spliceai_linw.output.results_tsv, + seak_install = rules.install_seak.output + output: + touch('work/association/sclrt_kernels_spliceai/{filter_highconfidence}/{pheno}/top_hits_h2/all.ok') + params: + kernels = ['linb','linwb','lin','linw','linb_mrgLOF','linwb_mrgLOF','lin_cLOF','linw_cLOF'], + phenotype = lambda wc: phenotypes[ wc.pheno ], + covariate_column_names = config['covariate_column_names'], + max_maf = config['maf_cutoff'], + min_impact = config['splice_ai_min_impact'], + out_dir_stats = lambda wc: 'work/association/sclrt_kernels_spliceai/{filter_highconfidence}/{pheno}/top_hits_h2/'.format(filter_highconfidence=wc.filter_highconfidence, pheno=wc.pheno), + ids = plinkfiles.getIds(), + filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence], + debug = False + log: + 'logs/association/sclrt_kernels_spliceai_eval_top_hits_h2/{filter_highconfidence}_{pheno}.log' + resources: + mem_mb=get_mem_mb(12000,1.5), + time="6:00:00", + partition="hpcpu" + threads: + 1 + conda: + '../env/seak.yml' + script: + '../script/python/assoc_sclrt_kernels_spliceai_eval_top_hits_h2.py' + +rule assoc_spliceai_linw_eval_top_hits_h2_all: + # runs rule above for all phenotypes - this rule will effectively run the entire pipeline for splice variants + input: + expand(rules.assoc_spliceai_linw_eval_top_hits_h2.output, pheno=phenotypes.keys(), filter_highconfidence=['all']) + + rule assoc_spliceai_linw_retest_top_hits: # calculates single-variant p-values and regression coefficients and other statistics for the most significant genes for splice variants input: @@ -571,6 +662,53 @@ rule assoc_deepripe_multiple_cholesky_eval_top_hits_all: input: expand(rules.assoc_deepripe_multiple_cholesky_eval_top_hits.output, pheno=phenotypes.keys(), filter_highconfidence=['all']) +rule assoc_deepripe_multiple_cholesky_eval_top_hits_h2: + # calculates single-variant p-values and regression coefficients and other statistics for the most significant genes for DeepRiPe variants + input: + bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()), + h5_rbp_plus = expand(rules.run_deepripe_vep.output.h5, id = plinkfiles.getIds(), strand=['plus']), + h5_rbp_minus = expand(rules.run_deepripe_vep.output.h5, id = plinkfiles.getIds(), strand=['minus']), + bed_rbp_plus = expand(rules.run_deepripe_vep.output.bed, id = plinkfiles.getIds(), strand=['plus']), + bed_rbp_minus = expand(rules.run_deepripe_vep.output.bed, id = plinkfiles.getIds(), strand=['minus']), + ensembl_vep_tsv = expand(rules.process_ensembl_vep_output_highimpact.output.tsv, id = plinkfiles.getIds(), allow_missing=True), + mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True), + regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed, + seak_install = rules.install_seak.output, + covariates_tsv = config['covariates'], + phenotypes_tsv = config['phenotypes'], + results_tsv = rules.assoc_deepripe_multiple_cholesky.output.results_tsv + output: + touch('work/association/sclrt_kernels_deepripe_multiple/{filter_highconfidence}/{pheno}/top_hits_h2/all.ok') + params: + kernels = ['lincholesky', 'linwcholesky', 'lincholesky_notLOF', 'linwcholesky_notLOF'], + phenotype = lambda wc: phenotypes[ wc.pheno ], + covariate_column_names = config['covariate_column_names'], + max_maf = config['maf_cutoff'], + min_impact = config['deepripe_min_impact'], + out_dir_stats = lambda wc: 'work/association/sclrt_kernels_deepripe_multiple/{filter_highconfidence}/{pheno}/top_hits_h2/'.format(filter_highconfidence=wc.filter_highconfidence, pheno=wc.pheno), + ids = plinkfiles.getIds(), + filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence], + rbp_of_interest = rules.assoc_deepripe_multiple_cholesky.params.rbp_of_interest, + debug = False + resources: + mem_mb=get_mem_mb(10000,1.5), + time="03:00:00", + partition="hpcpu" + threads: + 1 + log: + 'logs/association/sclrt_kernels_deepripe_multiple_eval_top_hits_h2/{filter_highconfidence}_{pheno}.log' + conda: + '../env/seak.yml' + script: + '../script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits_h2.py' + + +rule assoc_deepripe_multiple_cholesky_eval_top_hits_h2_all: + # runs rule above for all phenotypes - this will effectively run the entire pipeline for DeepRiPe variants + input: + expand(rules.assoc_deepripe_multiple_cholesky_eval_top_hits_h2.output, pheno=phenotypes.keys(), filter_highconfidence=['all']) + rule assoc_deepripe_multiple_cholesky_retest_top_hits: # re-tests most significant genes for DeepRiPe variants using gene-specific null-distributions