diff --git a/Snakefile b/Snakefile index d91b4ce..9f9d5ae 100644 --- a/Snakefile +++ b/Snakefile @@ -7,6 +7,17 @@ import gzip configfile: "conf/config.yaml" +##################### +# utility functions # +##################### + +def get_mem_mb(mem_mb=10000, factor=1.5): + # return function that tries a higher memory limit for retries, scaling exponentially with a factor + def gm(wildcards, attempt): + return factor ** (attempt - 1) * mem_mb + return gm + + ################################### # samplesheet for the plink files # ################################### @@ -46,7 +57,7 @@ wildcard_constraints: # local rules # ############### -localrules: link_all, install_sh, install_seak, setup_all, run_deepripe_vep_all, run_ensembl_vep_all, evep_missense_proc_all, splice_ai_filter_and_overlap_with_genotypes_all, splice_ai_vcf_to_tsv_all, filter_variants_all, export_plof_burden_all, export_missense_burden_all, assoc_baseline_scoretest_all, mac_report_all, assoc_spliceai_linw_all, assoc_deepripe_single_localcollapsing_all, pLOF_nsnp_cummac_all +localrules: link_all, link_genotypes, install_sh, install_seak, setup_all, run_deepripe_vep_all, run_ensembl_vep_all, evep_missense_proc_all, splice_ai_filter_and_overlap_with_genotypes_all, splice_ai_vcf_to_tsv_all, filter_variants_all, export_plof_burden_all, export_missense_burden_all, assoc_baseline_scoretest_all, mac_report_all, assoc_spliceai_linw_all, assoc_deepripe_single_localcollapsing_all, pLOF_nsnp_cummac_all, complete_cases_ancestry, gather_complete_cases ############### @@ -156,4 +167,5 @@ include: "snake/deepripe.smk" include: "snake/evep.smk" include: "snake/splice_ai.smk" include: "snake/association.smk" +include: "snake/conditional_tests.smk" include: "snake/results_processing.smk" diff --git a/env/snakemake.yml b/env/snakemake.yml index 19e804c..4663315 100644 --- a/env/snakemake.yml +++ b/env/snakemake.yml @@ -7,3 +7,4 @@ dependencies: - python=3.8 - snakemake-minimal - pandas + - mamba diff --git a/run_cluster.sh b/run_cluster.sh new file mode 100755 index 0000000..af8e851 --- /dev/null +++ b/run_cluster.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +#SBATCH --job-name=controljob_%j +#SBATCH --output=snakemake_%j.log +#SBATCH --partition=vcpu +#SBATCH --time=48:00:00 +#SBATCH -c 1 +#SBATCH --mem 2000 + +# Initialize conda: +eval "$(conda shell.bash hook)" + +snakemake_env="install/snakemake" + +if [ ! -d $snakemake_env ]; then + ./install.sh +fi + +#conda activate $snakemake_env + +conda activate snakemake + +snakemake --snakefile Snakefile \ + --configfile conf/config.yaml \ + --profile ./slurm \ + --directory "${PWD}" \ + "${@}" + + diff --git a/script/python/assoc_baseline_scoretest_conditional_analysis.py b/script/python/assoc_baseline_scoretest_conditional_analysis.py new file mode 100644 index 0000000..a60244b --- /dev/null +++ b/script/python/assoc_baseline_scoretest_conditional_analysis.py @@ -0,0 +1,179 @@ + + + +import logging +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + + +import numpy as np +import pandas as pd +import gzip + +from seak.scoretest import ScoretestNoK +from seak.data_loaders import intersect_ids, CovariatesLoaderCSV, VariantLoaderSnpReader +from pysnptools.snpreader import Bed +from util.association import BurdenLoaderHDF5 +from util import Timer + + +''' +This script is the baseline that we later compare to. 
+It loops over genes and performs score-tests, using pre-computed indicator variables (0/1).
+'''
+
+import sys  # needed for the early exits below
+
+
+# covariates loader
+covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype,
+                                       snakemake.input.covariates_tsv,
+                                       snakemake.params.covariate_column_names,
+                                       sep='\t',
+                                       path_to_phenotypes=snakemake.input.phenotypes_tsv)
+
+# set up burden loaders
+bloader_lof = BurdenLoaderHDF5(snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof)
+bloader_missense = BurdenLoaderHDF5(snakemake.input.h5_missense, snakemake.input.iid_missense, snakemake.input.gid_missense)
+
+# make sure individuals are in the same order
+bloader_lof.update_individuals(covariatesloader.get_iids())
+bloader_missense.update_individuals(covariatesloader.get_iids())
+
+# gene names to iterate over
+genes = np.union1d(bloader_lof.get_vids(), bloader_missense.get_vids())
+if isinstance(genes, str):
+    genes = [genes]
+
+# keep only the genes that are significant:
+# (1) load the results table
+results = pd.read_csv(snakemake.input.results_tsv, sep='\t', na_values='.')
+
+# the columns to consider
+sigcols = snakemake.params.columns
+if isinstance(sigcols, str):
+    sigcols = [sigcols]
+
+pvcols_score = sigcols
+results = results[['gene', 'nCarrier_pLOF', 'nCarrier_missense'] + pvcols_score]
+results['pheno'] = snakemake.params.phenotype
+results = results.loc[results.gene.isin(genes)]
+results['gene_name'] = results['gene'].str.split('_', expand=True)[1]
+genes = results.gene.tolist()
+
+# (2) load the list of variants to condition on
+_conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0, usecols=['gene_name', 'pheno'])
+_conditional_list = _conditional_list[(_conditional_list.pheno == snakemake.wildcards['pheno']) | (_conditional_list.pheno == snakemake.params.phenotype)]
+_conditional_list = _conditional_list.drop_duplicates()
+
+if len(_conditional_list) == 0:
+    logging.info('No genes pass significance threshold for phenotype {}.'.format(snakemake.params.phenotype))
+    with gzip.open(snakemake.output.results_tsv, 'wt') as outfile:
+        outfile.write('# no significant hits for phenotype {}.\n'.format(snakemake.params.phenotype))
+    sys.exit(0)
+else:
+    results = results.merge(_conditional_list, how='right', on=['gene_name', 'pheno'])
+
+# (3) keep only genes below the significance threshold
+sig_genes = [results.gene[results[k] < snakemake.params.significance_cutoff].values for k in pvcols_score]
+sig_genes = np.unique(np.concatenate(sig_genes))
+
+if len(sig_genes) == 0:
+    logging.info('No genes pass significance threshold for phenotype {}.'.format(snakemake.params.phenotype))
+    with gzip.open(snakemake.output.results_tsv, 'wt') as outfile:
+        outfile.write('# no hits below specified threshold ({}) for phenotype {}.\n'.format(snakemake.params.significance_cutoff, snakemake.params.phenotype))
+    sys.exit(0)
+
+results = results.loc[results.gene.isin(sig_genes)].copy()
+genes = results.gene.tolist()
+
+logging.info('About to evaluate variants in {} genes'.format(len(results.gene.unique())))
+
+# conditional analysis:
+# these genotypes will be used for the conditional analysis
+geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1))
+geno_cond.update_individuals(covariatesloader.get_iids())
+
+# this file contains the mapping of associations to SNPs to condition on
+conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0)
+conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno'])].drop_duplicates()
+conditional_list = conditional_list.loc[conditional_list.gene_name.isin(results.gene_name)]
+
+geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids()))
+logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids())))
+
+# set up the null model
+Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('NoK')
+null_model = ScoretestNoK(Y, X)
+
+logging.info('Phenotype: {}, Sample size: {}'.format(snakemake.params.phenotype, len(Y)))
+
+
+def test_gene(gene):
+
+    # conditional analysis:
+    # set up the function to load the variants to condition on
+    def get_conditional(gene):
+
+        cond_snps_vid = conditional_list.loc[conditional_list.gene_name == gene.split('_')[1], 'snp_rsid'].unique().tolist()
+
+        temp_genotypes, temp_vids = geno_cond.genotypes_by_id(cond_snps_vid, return_pos=False)
+        temp_genotypes -= np.nanmean(temp_genotypes, axis=0)
+
+        # mean-impute missing genotypes (after centering, the column mean is 0)
+        cond_snps = np.ma.masked_invalid(temp_genotypes).filled(0.)
+
+        return cond_snps, temp_vids
+
+    pval_dict = {}
+    pval_dict['gene'] = gene
+
+    def call(GV, G2, name):
+        # calls the score-test and appends results to pval_dict
+        if GV is None:
+            pval_dict['pv_' + name] = np.nan
+            pval_dict['nCarrier_' + name] = 0
+        else:
+            pv = null_model.pv_alt_model(GV, G2)
+            if pv < 0.:
+                logging.warning('negative value encountered in p-value computation for gene {}, p-value: {}, using saddle instead.'.format(gene, pv))
+                pv = null_model.pv_alt_model(GV, G2, method='saddle')
+            pval_dict['pv_' + name] = pv
+            pval_dict['nCarrier_' + name] = int(GV.sum())
+
+    # load the protein LOF burdens
+    try:
+        Glof = bloader_lof.genotypes_by_id(gene).astype(float)
+    except KeyError:
+        Glof = None
+
+    # load the missense burdens
+    try:
+        Gmiss = bloader_missense.genotypes_by_id(gene).astype(float)
+    except KeyError:
+        Gmiss = None
+
+    G2, cond_vid = get_conditional(gene)
+
+    call(Glof, G2, 'pLOF')
+    call(Gmiss, G2, 'missense')
+
+    if (Glof is None) or (Gmiss is None):
+        call(None, None, 'mrg')
+    else:
+        G = np.maximum(Glof, Gmiss)
+        call(G, G2, 'mrg')
+
+    pval_dict['cond_snps'] = ','.join(cond_vid)
+
+    return pval_dict
+
+
+timer = Timer()
+results = pd.DataFrame.from_dict([test_gene(gene) for gene in genes]).set_index('gene')
+
+t = timer.check()
+logging.info('{} genes tested in {:.2f} minutes.'.format(len(results), t / 60.))
+
+results.to_csv(snakemake.output.results_tsv, sep='\t', index=True, na_rep='.')
diff --git a/script/python/assoc_sclrt_kernels_deepripe_multiple.py b/script/python/assoc_sclrt_kernels_deepripe_multiple.py
index c645c2d..7ba927e 100644
--- a/script/python/assoc_sclrt_kernels_deepripe_multiple.py
+++ b/script/python/assoc_sclrt_kernels_deepripe_multiple.py
@@ -225,9 +225,8 @@ def get_rbp(interval):
         if temp_genotypes is None:
             raise GotNone
 
-        nanmean = np.nanmean(temp_genotypes, axis=0)
-
-        ncarrier = np.nansum(np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes), axis=1) >= 1)  # calculated differently here because alleles can be flipped!
+        nanmean = np.nanmean(temp_genotypes, axis=0)
+        ncarrier = np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes) > 0, axis=0)  # calculated differently here because alleles can be flipped!
 
         G1 = temp_genotypes - nanmean
         G1 = np.ma.masked_invalid(G1).filled(0.)
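Reviewer note on the `ncarrier` change above: the new expression counts carriers per variant, flipping dosages to `abs(g - 2)` when the ALT allele is actually the major allele (mean dosage > 1), so that carriers of the *minor* allele are counted. A minimal sketch of what the expression computes, on toy data (mine, not part of the patch):

```python
import numpy as np

# 4 individuals x 2 variants; in column 1 the ALT allele is the major allele
# (mean dosage > 1), so carriers are counted on the flipped dosages abs(g - 2)
g = np.array([[0., 2.],
              [1., 2.],
              [0., 1.],
              [np.nan, 2.]])

nanmean = np.nanmean(g, axis=0)                           # [0.33, 1.75]
flipped = np.where(nanmean / 2 > 0.5, np.abs(g - 2), g)   # column 1 becomes [0, 0, 1, 0]
ncarrier = np.nansum(flipped > 0, axis=0)                 # NaN > 0 evaluates to False
print(ncarrier)                                           # [1 1]: one minor-allele carrier per variant
```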
diff --git a/script/python/assoc_sclrt_kernels_deepripe_multiple_conditional_analysis.py b/script/python/assoc_sclrt_kernels_deepripe_multiple_conditional_analysis.py new file mode 100644 index 0000000..f082c1f --- /dev/null +++ b/script/python/assoc_sclrt_kernels_deepripe_multiple_conditional_analysis.py @@ -0,0 +1,441 @@ +import os +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging + +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np +import h5py +import gzip +import pickle +import sys + +from numpy.linalg import cholesky, LinAlgError +from sklearn.metrics.pairwise import rbf_kernel, cosine_similarity + +# seak imports +from seak.data_loaders import intersect_ids, Hdf5Loader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK + +from pysnptools.snpreader import Bed + +from util import Timer + + +class GotNone(Exception): + pass + + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) + +# conditional analysis: +# we need to fit gene-specific null models for the LRT! +# null_model_lrt = LRTnoK(X, Y) +null_model_lrt = None + + +# set up function to filter variants: +def maf_filter(mac_report): + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + # note: We don't filter out the variants for which alt/ref are "flipped" for the RBPs + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0)] + + return mac_report.set_index('SNP').loc[vids] + + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + + +def vep_filter(h5_rbp, bed_rbp): + # returns variant ids for variants that pass filtering threshold and mask for the hdf5loader + + with h5py.File(h5_rbp, 'r') as f: + + rbp_of_interest = snakemake.params.rbp_of_interest + try: + # in newer version of h5py the strings are not decoded -> decode them + labels = [l.decode() for l in f['labels'][:]] + except AttributeError: + # this used to work in older versions of h5py: + labels = list(f['labels'][:]) + + if isinstance(rbp_of_interest, str): + rbp_of_interest = [rbp_of_interest] + + assert all([rbp in labels for rbp in rbp_of_interest]), 'Error: not all of {} are in labels: {}'.format(rbp_of_interest, labels) + + keep = list(i for i, l in enumerate(labels) if l in rbp_of_interest) + diffscores = f['diffscore'][:, keep] + + rbp_mask = np.array([i in rbp_of_interest for i in labels]) + + if np.ndim(diffscores) == 1: + diffscores = 
diffscores[:, np.newaxis] + + vid_mask = np.max(np.abs(diffscores), axis=1) >= snakemake.params.min_impact + + vid_df = pd.read_csv(bed_rbp, sep='\t', header=None, usecols=[3], names=['vid'], dtype={'vid': str}) + vids = vid_df.vid.str.split(pat='_[ACGT]+>[ACGT]+$', n=1, expand=True)[0].values[vid_mask] + + return vids, rbp_mask, np.array(labels)[rbp_mask].tolist() + + +def get_plof_id_func(vep_tsv): + # returns a function that takes an array of variant ids as input and returns a boolean array which indicates whether a variant is + # annotated as a protein LOF variant for any gene + + plof_ids = pd.read_csv(vep_tsv, sep='\t', usecols=['Uploaded_variation'], index_col='Uploaded_variation').index.drop_duplicates() + + def is_plof(vids): + return np.array([vid in plof_ids for vid in vids]) + + return is_plof + + +# start: kernel parameters / functions +def get_cholesky(S): + try: + chol = cholesky(S) + flag = 1 + except LinAlgError: + try: + np.fill_diagonal(S, S.diagonal() + 1e-6) # sometimes this saves it... + S /= np.max(S) + chol = cholesky(S) + flag = 0 + except LinAlgError: + chol = np.eye(len(S)) + flag = -1 + return chol, flag + + +def get_simil_from_vep(V): + # cosine similarity + return cosine_similarity(V) + + +def get_simil_from_pos(pos): + # 0.5 at 50bp distance + gamma = -1*np.log(0.5)/(50**2) + return rbf_kernel(pos[:, np.newaxis], gamma=gamma) + + +def get_weights_from_vep(V): + # max norm + return np.sqrt(np.max(np.abs(V), axis=1)) + + +def get_regions(): + # load the results, keep those below a certain p-value + present in conditional analysis list + results = pd.read_csv(snakemake.input.results_tsv, sep='\t', comment='#') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern] + pvcols_lrt = ['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + results.rename(columns={'gene':'name'}, inplace=True) + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + + # conditional analysis, keep only genes that are provided in conditional_list + regions['gene_name'] = regions['name'].str.split('_', expand=True)[1] + + _conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0, usecols=['gene_name','pheno']) + _conditional_list = _conditional_list[(_conditional_list.pheno == snakemake.wildcards['pheno']) | (_conditional_list.pheno == snakemake.params.phenotype) ] + _conditional_list = _conditional_list.drop_duplicates() + + if len(_conditional_list) == 0: + logging.info('No genes pass significance threshold for phenotype {}, exiting.'.format(snakemake.params.phenotype)) + with gzip.open(snakemake.output.results_tsv, 'wt') as outfile: + outfile.write('# no significant hits for phenotype {}. 
\n'.format(snakemake.params.phenotype)) + sys.exit(0) + else: + regions = regions.merge(_conditional_list, how='right') + + regions = regions.merge(results, how='left', on=['name'], validate='one_to_one') + + sig_genes = [results.name[results[k] < snakemake.params.significance_cutoff].values for k in pvcols_score + pvcols_lrt] + sig_genes = np.unique(np.concatenate(sig_genes)) + + if len(sig_genes) == 0: + return None + + regions = regions.loc[regions.name.isin(sig_genes)] + + return regions + + +# end: kernel parameters / functions + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len(snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.mac_report, snakemake.input.ensembl_vep_tsv) + +# get the top hits +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + with gzip.open(snakemake.output.results_tsv, 'wt') as outfile: + outfile.write('# no hits below specified threshold ({}) for phenotype {}. \n'.format(snakemake.params.significance_cutoff, snakemake.params.phenotype)) + sys.exit(0) + + +# dict with paths to input files +vep_h5 = {chrom: {'plus': p, 'minus': m} for chrom, (p, m) in zip(snakemake.params.ids, zip(snakemake.input.h5_rbp_plus, snakemake.input.h5_rbp_minus))} +vep_bed = {chrom: {'plus': p, 'minus': m} for chrom, (p, m) in zip(snakemake.params.ids, zip(snakemake.input.bed_rbp_plus, snakemake.input.bed_rbp_minus))} + +# storing all results here +results = [] + +i_gene = 0 +i_chrom = 0 + +# conditional analysis: +# these genotypes will be used for the conditional analysis +geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1)) +geno_cond.update_individuals(covariatesloader.get_iids()) + +# this file contains the mapping of associations to SNPs to condition on +conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0) +conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates() + +geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids())) +logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids()))) + +# enter the chromosome loop: +timer = Timer() +for i, (chromosome, bed, mac_report, vep_tsv) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + if snakemake.params.debug: + # process only two chromosomes if debugging... + if i_chrom > 2: + break + + timer.reset() + + # get variants that pass MAF threshold: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + # function to identify protein LOF variants + is_plof = get_plof_id_func(vep_tsv) + + # set up local collapsing + # collapser = LocalCollapsing(distance_threshold=51.) 
+ + for strand in ['plus', 'minus']: + + # set up the regions to loop over for the chromosome + chromosome_id = chromosome.replace('chr', '') + + regions = regions_all[(regions_all.chrom == chromosome_id) & (regions_all.strand == strand)] + + if len(regions) == 0: + continue + + # get variants that pass variant effect prediction threshold: + vep_vids, vep_mask, labels = vep_filter(vep_h5[chromosome][strand], vep_bed[chromosome][strand]) + + # combine + filter_vids_chromosome = intersect_ids(vep_vids, filter_vids) + + # initialize the vep loader + veploader = Hdf5Loader(vep_bed[chromosome][strand], vep_h5[chromosome][strand], 'diffscore', from_janggu=True) + veploader.update_variants(filter_vids_chromosome) + veploader.set_mask(vep_mask) + + # set up the variant loader (rbp variants) for the chromosome + strand + plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) + plinkloader.update_variants(veploader.get_vids()) + plinkloader.update_individuals(covariatesloader.get_iids()) + + + # set up the genotype + vep loading function + def get_rbp(interval): + + temp_genotypes, temp_vids, pos = plinkloader.genotypes_by_region(interval, return_pos=True) + + if temp_genotypes is None: + raise GotNone + + nanmean = np.nanmean(temp_genotypes, axis=0) + ncarrier = np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes) > 0, axis=0)# calculated differently here because alleles can be flipped! + + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) + + # deepripe variant effect predictions (single RBP) + V1 = veploader.anno_by_id(temp_vids) + + weights = get_weights_from_vep(V1) + + S = get_simil_from_pos(pos) + S *= get_simil_from_vep(V1) # Schur product of two positive definite matrices is positive definite + + cummac = mac_report.loc[temp_vids].Minor + + V1 = pd.DataFrame(V1, columns=labels)[sorted(labels)] + + return G1, temp_vids, weights, S, ncarrier, cummac, pos, V1 + + + # conditional analysis + # set up the function to load the variants to condition on + + def get_conditional(interval): + + cond_snps_vid = conditional_list.loc[ conditional_list.gene_name == interval['gene_name'], 'snp_rsid' ].unique().tolist() + + temp_genotypes, temp_vids = geno_cond.genotypes_by_id(cond_snps_vid, return_pos=False) + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + + cond_snps = np.ma.masked_invalid(temp_genotypes).filled(0.) + + return cond_snps, temp_vids + + + # set up the test-function for a single gene + def test_gene(interval, seed): + + interval = interval.to_dict() + + pval_dict = {} + pval_dict['gene'] = interval['name'] + + out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name']) + os.makedirs(out_dir, exist_ok=True) + + # conditional analysis: + # get the snps to condition on, and include them in the null model for the LRT + cond_snps, cond_snps_vid = get_conditional(interval) + null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y) + + # conditional analysis: + # the score-test takes a second argument (G2) that allows conditioning on a second set of variants... 
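+            # (reviewer note: conditioning works by passing the centered conditioning genotypes as
+            # extra fixed-effect covariates, so the reported p-values are for signal that remains
+            # after the known top SNPs of the gene have been accounted for)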
+            def pv_score(GV, G2=cond_snps):
+                # wraps the score-test; conditioning SNPs enter through G2
+                pv = null_model_score.pv_alt_model(GV, G2)
+                if pv < 0.:
+                    pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
+                return pv
+
+            def call_test(GV, name):
+                pval_dict['pv_score_' + name] = pv_score(GV)
+                altmodel = null_model_lrt.altmodel(GV)
+                res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
+                pval_dict['pv_lrt_' + name] = res['pv']
+                pval_dict['lrtstat_' + name] = altmodel['stat']
+                if 'h2' in altmodel:
+                    pval_dict['h2_' + name] = altmodel['h2']
+
+                if res['pv'] != 1.:
+                    for stat in ['scale', 'dof', 'mixture', 'imax']:
+                        pval_dict[stat + '_' + name] = res[stat]
+                    if len(res['res']) > 0:
+                        pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))
+
+            # load rbp variants
+            G, vids, weights, S, ncarrier, cummac, pos, V = get_rbp(interval)
+            keep = ~is_plof(vids)
+
+            # cholesky
+            if G.shape[1] > 1:
+                L, flag1 = get_cholesky(S)
+            else:
+                L, flag1 = np.eye(G.shape[1]), -1
+
+            # do a score test (cholesky, and weighted cholesky)
+            GWL = G.dot(np.diag(weights, k=0)).dot(L)
+            call_test(GWL, 'linwcholesky')
+
+            # sanity checks
+            assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
+            assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expected: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
+
+            if np.any(keep):
+
+                if keep.sum() == 1:
+                    # only a single SNP is not LOF
+                    GWL = G[:, keep].dot(np.diag(weights[keep], k=0))  # actually just the linear weighted kernel
+                else:
+                    L, flag2 = get_cholesky(S[np.ix_(keep, keep)])
+                    GWL = G[:, keep].dot(np.diag(weights[keep], k=0)).dot(L)
+
+                call_test(GWL, 'linwcholesky_notLOF')
+
+            # conditional analysis: keep names of SNPs that we condition on
+            pval_dict['cond_snps'] = ','.join(cond_snps_vid)
+
+            return pval_dict
+
+        logging.info('loaders for chromosome {}, strand "{}" initialized in {:.1f} seconds.'.format(chromosome, strand, timer.check()))
+
+        # run tests for all genes on the chromosome / strand
+        for _, region in regions.iterrows():
+
+            if snakemake.params.debug:
+                if i_gene > 5:
+                    continue
+
+            try:
+                results.append(test_gene(region, i_gene))
+            except GotNone:
+                continue
+
+            i_gene += 1
+            logging.info('tested {} genes...'.format(i_gene))
+
+    i_chrom += 1
+    timer.reset()
+
+# export the results in a single table
+results = pd.DataFrame(results)
+results['pheno'] = snakemake.params.phenotype
+results.to_csv(snakemake.output.results_tsv, sep='\t', index=False)
diff --git a/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits.py b/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits.py
index 765a4b3..0d739be 100644
--- a/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits.py
+++ b/script/python/assoc_sclrt_kernels_deepripe_multiple_eval_top_hits.py
@@ -261,6 +261,9 @@ def get_rbp(interval):
         if temp_genotypes is None:
             raise GotNone
 
+        nanmean = np.nanmean(temp_genotypes, axis=0)
+        ncarrier = np.nansum(np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes), axis=1) >= 1)  # calculated differently here because alleles can be flipped!
+ ncarrier = np.nansum(temp_genotypes, axis=0) ncarrier = np.minimum(ncarrier, temp_genotypes.shape[0] - ncarrier).astype(int) @@ -358,8 +361,8 @@ def call_lrt(GV, name, vids=None): call_lrt(G.dot(np.diag(weights, k=0)), 'linw') # performs weighting and joint estimation of effect-sizes # sanity checks - assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) - assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) if np.any(keep): diff --git a/script/python/assoc_sclrt_kernels_deepripe_multiple_retest_top_hits.py b/script/python/assoc_sclrt_kernels_deepripe_multiple_retest_top_hits.py index ce58987..0cafec5 100644 --- a/script/python/assoc_sclrt_kernels_deepripe_multiple_retest_top_hits.py +++ b/script/python/assoc_sclrt_kernels_deepripe_multiple_retest_top_hits.py @@ -285,8 +285,8 @@ def get_rbp(interval): if temp_genotypes is None: raise GotNone - ncarrier = np.nansum(temp_genotypes, axis=0) - ncarrier = np.minimum(ncarrier, temp_genotypes.shape[0] - ncarrier).astype(int) + nanmean = np.nanmean(temp_genotypes, axis=0) + ncarrier = np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes) > 0, axis=0)# calculated differently here because alleles can be flipped! temp_genotypes -= np.nanmean(temp_genotypes, axis=0) G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) @@ -353,8 +353,8 @@ def call_test(GV, name): call_test(GWL, 'linwcholesky') # sanity checks - assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) - assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! 
expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) if np.any(keep): diff --git a/script/python/assoc_sclrt_kernels_missense.py b/script/python/assoc_sclrt_kernels_missense.py index f99b506..1280d64 100644 --- a/script/python/assoc_sclrt_kernels_missense.py +++ b/script/python/assoc_sclrt_kernels_missense.py @@ -255,9 +255,9 @@ def call_lrt(GV, name): if G2 is not None: # merged (single variable), missense non-weighted - G1_burden_mrg = np.maximum(G1_burden_nonweighted, G2) - call_score(G1_burden_mrg, 'linb_mrgLOF') - call_lrt(G1_burden_mrg, 'linb_mrgLOF') + # G1_burden_mrg = np.maximum(G1_burden_nonweighted, G2) + # call_score(G1_burden_mrg, 'linb_mrgLOF') + # call_lrt(G1_burden_mrg, 'linb_mrgLOF') # merged (single variable) G1_burden_mrg = np.maximum(G2, G1_burden) @@ -265,9 +265,9 @@ def call_lrt(GV, name): call_lrt(G1_burden_mrg, 'linwb_mrgLOF') # concatenated ( >= 2 variables, non-weighted) - G1_nonweighted = np.concatenate([G1_nonweighted, G2], axis=1) - call_score(G1_nonweighted, 'lincollapsed_cLOF') - call_lrt(G1_nonweighted, 'lincollapsed_cLOF') + # G1_nonweighted = np.concatenate([G1_nonweighted, G2], axis=1) + # call_score(G1_nonweighted, 'lincollapsed_cLOF') + # call_lrt(G1_nonweighted, 'lincollapsed_cLOF') # concatenated ( >= 2 variables) G1 = np.concatenate([G1, G2], axis=1) @@ -309,7 +309,8 @@ def call_lrt(GV, name): stats = pd.DataFrame.from_dict(stats).set_index('gene') # these are the kernels -kern = ['linb', 'linwb', 'linb_mrgLOF', 'linwb_mrgLOF', 'lincollapsed', 'linwcollapsed', 'lincollapsed_cLOF', 'linwcollapsed_cLOF'] +# kern = ['linb', 'linwb', 'linb_mrgLOF', 'linwb_mrgLOF', 'lincollapsed', 'linwcollapsed', 'lincollapsed_cLOF', 'linwcollapsed_cLOF'] +kern = ['linb', 'linwb', 'linwb_mrgLOF', 'lincollapsed', 'linwcollapsed', 'linwcollapsed_cLOF'] # concatenate and save gene-specific simulations # ...complicated nested dict comprehension that ensures we drop empty values etc. diff --git a/script/python/assoc_sclrt_kernels_missense_conditional_analysis.py b/script/python/assoc_sclrt_kernels_missense_conditional_analysis.py new file mode 100644 index 0000000..3441a62 --- /dev/null +++ b/script/python/assoc_sclrt_kernels_missense_conditional_analysis.py @@ -0,0 +1,388 @@ + +import os +import sys +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np +import pickle +import gzip + +# seak imports +from seak.data_loaders import intersect_ids, EnsemblVEPLoader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.kernels import LocalCollapsing +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK, pv_chi2mixture, fit_chi2mixture + +from pysnptools.snpreader import Bed + +from util.association import BurdenLoaderHDF5 +from util import Timer + +import functools + + +class GotNone(Exception): + pass + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) + +# conditional analysis: +# we need to fit gene-specific null models for the LRT! 
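+# (reviewer note: the SNPs we condition on differ from gene to gene, so the LRT null model is
+# re-fit per gene inside test_gene(), with the conditioning genotypes appended to the covariates X)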
+# null_model_lrt = LRTnoK(X, Y) +null_model_lrt = None + + +# set up function to filter variants: +def maf_filter(mac_report): + + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool)) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref)] + + # this has already been done in filter_variants.py + # load the variant annotation, keep only variants in high-confidece regions + # anno = pd.read_csv(anno_tsv, sep='\t', usecols=['Name', 'hiconf_reg']) + # vids_highconf = anno.Name[anno.hiconf_reg.astype(bool).values] + # vids = np.intersect1d(vids, vids_highconf) + + return mac_report.set_index('SNP').loc[vids] + + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + + + +def get_regions(): + # load the results, keep those below a certain p-value + present in conditional analysis list + results = pd.read_csv(snakemake.input.results_tsv, sep='\t') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern] + pvcols_lrt = ['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + results.rename(columns={'gene':'name'}, inplace=True) + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + + # conditional analysis, keep only genes that are provided in conditional_list + regions['gene_name'] = regions['name'].str.split('_', expand=True)[1] + + _conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0, usecols=['gene_name','pheno']) + _conditional_list = _conditional_list[(_conditional_list.pheno == snakemake.wildcards['pheno']) | (_conditional_list.pheno == snakemake.params.phenotype) ] + _conditional_list = _conditional_list.drop_duplicates() + + if len(_conditional_list) == 0: + logging.info('No genes pass significance threshold for phenotype {}, exiting.'.format(snakemake.params.phenotype)) + with gzip.open(snakemake.output.results_tsv, 'wt') as outfile: + outfile.write('# no significant hits for phenotype {}. 
\n'.format(snakemake.params.phenotype)) + sys.exit(0) + else: + regions = regions.merge(_conditional_list, how='right') + + regions = regions.merge(results, how='left', on=['name'], validate='one_to_one') + + sig_genes = [results.name[results[k] < snakemake.params.significance_cutoff].values for k in pvcols_score + pvcols_lrt] + sig_genes = np.unique(np.concatenate(sig_genes)) + + if len(sig_genes) == 0: + return None + + regions = regions.loc[regions.name.isin(sig_genes)] + + return regions + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len(snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.vep_tsv, snakemake.input.mac_report, snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof) + +# get the top hits +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + with gzip.open(snakemake.output.results_tsv, 'wt') as outfile: + outfile.write('# no hits below specified threshold ({}) for phenotype {}. \n'.format(snakemake.params.significance_cutoff, snakemake.params.phenotype)) + sys.exit(0) + +logging.info('About to evaluate variants in {} genes'.format(len(regions_all.name.unique()))) + +# storing all results here +results = [] + +i_gene = 0 +i_chrom = 0 + +# conditional analysis: +# these genotypes will be used for the conditional analysis +geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1)) +geno_cond.update_individuals(covariatesloader.get_iids()) + +# this file contains the mapping of associations to SNPs to condition on +conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0) +conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates() + +geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids())) +logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids()))) + + +# enter the chromosome loop: +timer = Timer() +for i, (chromosome, bed, vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + if snakemake.params.debug: + # process only two chromosomes if debugging... 
+ if i_chrom > 2: + break + + timer.reset() + + # set up the ensembl vep loader for the chromosome + ensemblvepdf = pd.read_csv(vep_tsv, + sep='\t', + usecols=['Uploaded_variation', 'Location', 'Gene', 'pos_standardized', 'impact', 'ref', 'alt', 'cosine_similarity'], + index_col='Uploaded_variation') + + # get set of variants for the chromosome: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + # filter by MAF + keep = intersect_ids(filter_vids, ensemblvepdf.index.values) + ensemblvepdf = ensemblvepdf.loc[keep] + ensemblvepdf.reset_index(inplace=True) + + # filter by impact: + ensemblvepdf = ensemblvepdf[ensemblvepdf.groupby(['Gene' ,'pos_standardized'])['impact'].transform(np.max) >= snakemake.params.min_impact ] + + # initialize the loader + eveploader = EnsemblVEPLoader(ensemblvepdf['Uploaded_variation'], ensemblvepdf['Location'], ensemblvepdf['Gene'], data = ensemblvepdf[['pos_standardized' ,'impact', 'ref', 'alt', 'cosine_similarity']].values) + + # set up the regions to loop over for the chromosome + regions = regions_all.copy() + + # discard all genes for which we don't have annotations + regions['gene'] = regions.name.str.split('_', expand=True)[0] + regions.set_index('gene', inplace=True) + genes = intersect_ids(np.unique(regions.index.values), np.unique(eveploader.pos_df.gene)) + regions = regions.loc[genes].reset_index() + regions = regions.sort_values(['chrom', 'start', 'end']) + + # set up the variant loader (missense variants) for the chromosome + plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4)) + plinkloader.update_variants(eveploader.get_vids()) + plinkloader.update_individuals(covariatesloader.get_iids()) + + # set up the protein LOF burden loader + bloader_lof = BurdenLoaderHDF5(h5_lof, iid_lof, gid_lof) + bloader_lof.update_individuals(covariatesloader.get_iids()) + + # set up local collapsing + collapser = LocalCollapsing(distance_threshold=1.) + + # set up the missense genotype + vep loading function + def get_missense(interval): + + try: + V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[0]) + except KeyError: + raise GotNone + + if V1.index.empty: + raise GotNone + + vids = V1.index.get_level_values('vid') + V1 = V1.droplevel(['gene']) + + temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False) + + ncarrier = np.nansum(temp_genotypes > 0, axis=0) + + temp_genotypes -= np.nanmean(temp_genotypes, axis=0) + G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) 
+
+        cummac = mac_report.loc[vids].Minor
+
+        # polyphen / sift impact
+        weights = V1[1].values.astype(np.float64)
+
+        # "standardized" positions -> codon start positions
+        pos = V1[0].values.astype(np.int32)
+
+        ref = V1[2].values.astype(str)
+        alt = V1[3].values.astype(str)
+        cosine_similarity = V1[4].values.astype(np.float64)
+
+        return G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity
+
+    # set up the protein-LOF loading function
+    def get_plof(interval):
+
+        try:
+            G2 = bloader_lof.genotypes_by_id(interval['name']).astype(np.float64)
+        except KeyError:
+            G2 = None
+
+        return G2
+
+    # conditional analysis:
+    # set up the function to load the variants to condition on
+    def get_conditional(interval):
+
+        cond_snps_vid = conditional_list.loc[conditional_list.gene_name == interval['gene_name'], 'snp_rsid'].unique().tolist()
+
+        temp_genotypes, temp_vids = geno_cond.genotypes_by_id(cond_snps_vid, return_pos=False)
+        temp_genotypes -= np.nanmean(temp_genotypes, axis=0)
+
+        cond_snps = np.ma.masked_invalid(temp_genotypes).filled(0.)
+
+        return cond_snps, temp_vids
+
+    # set up the test-function for a single gene
+    def test_gene(interval, seed):
+
+        pval_dict = {}
+        pval_dict['gene'] = interval['name']
+
+        out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name'])
+        os.makedirs(out_dir, exist_ok=True)
+
+        # conditional analysis:
+        # get the SNPs to condition on, and include them in the null model for the LRT
+        cond_snps, cond_snps_vid = get_conditional(interval)
+        null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y)
+
+        # conditional analysis:
+        # the score-test takes a second argument (G2) that allows conditioning on a second set of variants
+        def pv_score(GV, G2=cond_snps):
+            # wraps the score-test
+            pv = null_model_score.pv_alt_model(GV, G2)
+            if pv < 0.:
+                pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
+            return pv
+
+        def call_test(GV, name):
+            pval_dict['pv_score_' + name] = pv_score(GV)
+            altmodel = null_model_lrt.altmodel(GV)
+            res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
+            pval_dict['pv_lrt_' + name] = res['pv']
+            pval_dict['lrtstat_' + name] = altmodel['stat']
+            if 'h2' in altmodel:
+                pval_dict['h2_' + name] = altmodel['h2']
+            if res['pv'] != 1.:
+                for stat in ['scale', 'dof', 'mixture', 'imax']:
+                    pval_dict[stat + '_' + name] = res[stat]
+                if len(res['res']) > 0:
+                    pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))
+
+        # load missense variants
+        G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity = get_missense(interval)
+
+        # sanity checks
+        assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
+        assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expected: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
+
+        # perform test using gene-specific distribution, gbvc
+        G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True)
+        call_test(G1_burden, 'linwb')
+
+        # perform local collapsing with weights
+        if G1.shape[1] > 1:
+            G1, clusters = collapser.collapse(G1, pos, np.sqrt(weights))
+        else:
+            G1 = G1.dot(np.diag(np.sqrt(weights), k=0))
+
+        # perform test using gene-specific distribution, kernel-based
+        call_test(G1, 'linwcollapsed')
+
+        # load plof burden
+        G2 = get_plof(interval)
+
+        if G2 is not None:
+
+            # merged (single variable)
+            G1_burden_mrg = np.maximum(G2, G1_burden)
+            call_test(G1_burden_mrg, 'linwb_mrgLOF')
+
+            # concatenated
+            call_test(np.concatenate([G1, G2], axis=1), 'linwcollapsed_cLOF')
+
+        # conditional analysis: keep names of SNPs that we condition on
+        pval_dict['cond_snps'] = ','.join(cond_snps_vid)
+
+        return pval_dict
+
+    logging.info('loaders for chromosome {} initialized in {:.1f} seconds.'.format(chromosome, timer.check()))
+
+    # run tests for all genes on the chromosome
+    for _, region in regions.iterrows():
+
+        if snakemake.params.debug:
+            if i_gene > 5:
+                continue
+
+        try:
+            results.append(test_gene(region, i_gene))  # i_gene is used as the random seed to make things reproducible
+        except GotNone:
+            continue
+
+        i_gene += 1
+        logging.info('tested {} genes...'.format(i_gene))
+
+    i_chrom += 1
+    timer.reset()
+
+# export the results in a single table
+results = pd.DataFrame(results)
+results['pheno'] = snakemake.params.phenotype
+results.to_csv(snakemake.output.results_tsv, sep='\t', index=False)
diff --git a/script/python/assoc_sclrt_kernels_missense_eval_top_hits.py b/script/python/assoc_sclrt_kernels_missense_eval_top_hits.py
index 7ca7530..7d92623 100644
--- a/script/python/assoc_sclrt_kernels_missense_eval_top_hits.py
+++ b/script/python/assoc_sclrt_kernels_missense_eval_top_hits.py
@@ -193,7 +193,7 @@ def get_missense(interval):
 
         temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False)
 
-        ncarrier = np.nansum(np.nansum(temp_genotypes, axis=1) >= 1)
+        ncarrier = np.nansum(temp_genotypes > 0, axis=0)
 
         temp_genotypes -= np.nanmean(temp_genotypes, axis=0)
         G1 = np.ma.masked_invalid(temp_genotypes).filled(0.)
@@ -287,8 +287,8 @@ def call_lrt(GV, name, vids=None):
         call_lrt(G1.dot(np.diag(np.sqrt(weights), k=0)), 'variant_pvals')  # coeficients after weight adjustment (estimated *jointly*)
 
         # sanity checks
-        assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
-        assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
+        # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
+        # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
 
         # do a score burden test (max weighted), this is different than the baseline!
G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True) diff --git a/script/python/assoc_sclrt_kernels_missense_retest_top_hits.py b/script/python/assoc_sclrt_kernels_missense_retest_top_hits.py index 8e96fc6..ffd2218 100644 --- a/script/python/assoc_sclrt_kernels_missense_retest_top_hits.py +++ b/script/python/assoc_sclrt_kernels_missense_retest_top_hits.py @@ -219,7 +219,7 @@ def get_missense(interval): temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False) - ncarrier = np.nansum(np.nansum(temp_genotypes, axis=1) >= 1) + ncarrier = np.nansum(temp_genotypes > 0, axis=0) temp_genotypes -= np.nanmean(temp_genotypes, axis=0) G1 = np.ma.masked_invalid(temp_genotypes).filled(0.) @@ -285,8 +285,8 @@ def call_test(GV, name): G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity = get_missense(interval) # sanity checks - assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) - assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) + # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids)) + # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum()) # perform test using gene-specific distribution, gbvc G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True) diff --git a/script/python/assoc_sclrt_kernels_spliceai_conditional_analysis.py b/script/python/assoc_sclrt_kernels_spliceai_conditional_analysis.py new file mode 100644 index 0000000..8ccb2bd --- /dev/null +++ b/script/python/assoc_sclrt_kernels_spliceai_conditional_analysis.py @@ -0,0 +1,424 @@ +import os +# os.environ["OMP_NUM_THREADS"] = "16" + +import logging + +logging.basicConfig(filename=snakemake.log[0], level=logging.INFO) + +import pandas as pd +import numpy as np + +# seak imports +from seak.data_loaders import intersect_ids, EnsemblVEPLoader, VariantLoaderSnpReader, CovariatesLoaderCSV +from seak.scoretest import ScoretestNoK +from seak.lrt import LRTnoK +import gzip + +from pysnptools.snpreader import Bed +import pickle +import sys + +from util.association import BurdenLoaderHDF5 +from util import Timer + + +class GotNone(Exception): + pass + + +# set up the covariatesloader + +covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype, + snakemake.input.covariates_tsv, + snakemake.params.covariate_column_names, + sep='\t', + path_to_phenotypes=snakemake.input.phenotypes_tsv) + +# initialize the null models +Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK') + +null_model_score = ScoretestNoK(Y, X) + +# conditional analysis: +# we need to fit gene-specific null models for the LRT! 
+# null_model_lrt = LRTnoK(X, Y) +null_model_lrt = None + + +# set up function to filter variants: +def maf_filter(mac_report): + # load the MAC report, keep only observed variants with MAF below threshold + mac_report = pd.read_csv(mac_report, sep='\t', usecols=['SNP', 'MAF', 'Minor', 'alt_greater_ref']) + + if snakemake.params.filter_highconfidence: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool)) & (mac_report.hiconf_reg.astype(bool))] + else: + vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool))] + + # this has already been done in filter_variants.py + # load the variant annotation, keep only variants in high-confidece regions + # anno = pd.read_csv(anno_tsv, sep='\t', usecols=['Name', 'hiconf_reg']) + # vids_highconf = anno.Name[anno.hiconf_reg.astype(bool).values] + # vids = np.intersect1d(vids, vids_highconf) + + return mac_report.set_index('SNP').loc[vids] + +def sid_filter(vids): + + if 'sid_include' in snakemake.config: + print('limiting to variants present in {}'.format(snakemake.config['sid_include'])) + + infilepath = snakemake.config['sid_include'] + + if infilepath.endswith('gz'): + with gzip.open(infilepath,'rt') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + with open(infilepath, 'r') as infile: + sid = np.array([l.rstrip() for l in infile]) + else: + return vids + + return intersect_ids(vids, sid) + + +def get_regions(): + # load the results, keep those below a certain p-value + present in conditional analysis list + results = pd.read_csv(snakemake.input.results_tsv, sep='\t') + + kern = snakemake.params.kernels + if isinstance(kern, str): + kern = [kern] + + pvcols_score = ['pv_score_' + k for k in kern] + pvcols_lrt = ['pv_lrt_' + k for k in kern] + statcols = ['lrtstat_' + k for k in kern] + results = results[['gene', 'n_snp', 'cumMAC', 'nCarrier'] + statcols + pvcols_score + pvcols_lrt] + + results.rename(columns={'gene':'name'}, inplace=True) + + # set up the regions to loop over for the chromosome + regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0 ,1 ,2 ,3, 5], dtype={0 :str, 1: np.int32, 2 :np.int32, 3 :str, 5:str}) + regions.columns = ['chrom', 'start', 'end', 'name', 'strand'] + regions['strand'] = regions.strand.map({'+': 'plus', '-': 'minus'}) + + # conditional analysis, keep only genes that are provided in conditional_list + regions['gene_name'] = regions['name'].str.split('_', expand=True)[1] + + _conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0, usecols=['gene_name','pheno']) + _conditional_list = _conditional_list[(_conditional_list.pheno == snakemake.wildcards['pheno']) | (_conditional_list.pheno == snakemake.params.phenotype) ] + _conditional_list = _conditional_list.drop_duplicates() + + if len(_conditional_list) == 0: + logging.info('No genes pass significance threshold for phenotype {}, exiting.'.format(snakemake.params.phenotype)) + with gzip.open(snakemake.output.results_tsv, 'wt') as outfile: + outfile.write('# no significant hits for phenotype {}. 
\n'.format(snakemake.params.phenotype)) + sys.exit(0) + else: + regions = regions.merge(_conditional_list, how='right') + + regions = regions.merge(results, how='left', on=['name'], validate='one_to_one') + + sig_genes = [results.name[results[k] < snakemake.params.significance_cutoff].values for k in pvcols_score + pvcols_lrt] + sig_genes = np.unique(np.concatenate(sig_genes)) + + if len(sig_genes) == 0: + return None + + regions = regions.loc[regions.name.isin(sig_genes)] + + return regions + +# genotype path, vep-path: +assert len(snakemake.params.ids) == len (snakemake.input.bed), 'Error: length of chromosome IDs does not match length of genotype files' +geno_vep = zip(snakemake.params.ids, snakemake.input.bed, snakemake.input.vep_tsv, snakemake.input.ensembl_vep_tsv, snakemake.input.mac_report, snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof) + +# get the top hits +regions_all = get_regions() +if regions_all is None: + logging.info('No genes pass significance threshold, exiting.') + import gzip + with gzip.open(snakemake.output.results_tsv, 'wt') as outfile: + outfile.write('# no hits below specified threshold ({}) for phenotype {}. \n'.format(snakemake.params.significance_cutoff, snakemake.params.phenotype)) + sys.exit(0) + +logging.info('About to evaluate variants in {} genes'.format(len(regions_all.name.unique()))) + +# storing all results here +results = [] + +i_gene = 0 +i_chrom = 0 + +# conditional analysis: +# these genotypes will be used for the conditional analysis +geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1)) +geno_cond.update_individuals(covariatesloader.get_iids()) + +# this file contains the mapping of associations to SNPs to condition on +conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0) +conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates() + +geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids())) +logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids()))) + +# enter the chromosome loop: +timer = Timer() +for i, (chromosome, bed, vep_tsv, ensembl_vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep): + + if chromosome.replace('chr','') not in regions_all.chrom.unique(): + continue + + if snakemake.params.debug: + # process only two chromosomes if debugging... 
+ if i_chrom > 2: + break + + timer.reset() + + # set up the ensembl vep loader for the chromosome + spliceaidf = pd.read_csv(vep_tsv, + sep='\t', + usecols=['name', 'chrom', 'end', 'gene', 'max_effect', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL'], + index_col='name') + + # get set of variants for the chromosome: + mac_report = maf_filter(mac_report) + filter_vids = mac_report.index.values + filter_vids = sid_filter(filter_vids) + + # filter by MAF + keep = intersect_ids(filter_vids, spliceaidf.index.values) + spliceaidf = spliceaidf.loc[keep] + spliceaidf.reset_index(inplace=True) + + # filter by impact: + spliceaidf = spliceaidf[spliceaidf.max_effect >= snakemake.params.min_impact] + + # set up the regions to loop over for the chromosome + regions = regions_all.copy() + + # discard all genes for which we don't have annotations + gene_ids = regions.name.str.split('_', expand=True) # table with two columns, ensembl-id and gene-name + regions['gene'] = gene_ids[1] # this is the gene name + regions['ensembl_id'] = gene_ids[0] + regions.set_index('gene', inplace=True) + genes = intersect_ids(np.unique(regions.index.values), np.unique(spliceaidf.gene)) # intersection of gene names + regions = regions.loc[genes].reset_index() # subsetting + regions = regions.sort_values(['chrom', 'start', 'end']) + + # check if the variants are protein LOF variants, load the protein LOF variants: + ensemblvepdf = pd.read_csv(ensembl_vep_tsv, sep='\t', usecols=['Uploaded_variation', 'Gene']) + + # this column will contain the gene names: + genes = intersect_ids(np.unique(ensemblvepdf.Gene.values), regions.ensembl_id) # intersection of ensembl gene ids + ensemblvepdf = ensemblvepdf.set_index('Gene').loc[genes].reset_index() + ensemblvepdf['gene'] = gene_ids.set_index(0).loc[ensemblvepdf.Gene.values].values + + # set up the merge + ensemblvepdf.drop(columns=['Gene'], inplace=True) # get rid of the ensembl ids, will use gene names instead + ensemblvepdf.rename(columns={'Uploaded_variation': 'name'}, inplace=True) + ensemblvepdf['is_plof'] = 1. + + ensemblvepdf = ensemblvepdf[~ensemblvepdf.duplicated()] # if multiple ensembl gene ids map to the same gene names, this prevents a crash. 
+
+    # we add a column to the dataframe indicating whether the variant is already annotated as protein loss of function by the ensembl variant effect predictor
+    spliceaidf = pd.merge(spliceaidf, ensemblvepdf, on=['name', 'gene'], how='left', validate='one_to_one')
+    spliceaidf['is_plof'] = spliceaidf['is_plof'].fillna(0.).astype(bool)
+
+    # initialize the loader
+    # Note: we use "end" here because start + 1 = end, and we need 1-based coordinates (this would break if we had indels)
+    eveploader = EnsemblVEPLoader(spliceaidf['name'], spliceaidf['chrom'].astype('str') + ':' + spliceaidf['end'].astype('str'), spliceaidf['gene'], data=spliceaidf[['max_effect', 'is_plof', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']].values)
+
+    # set up the variant loader (splice variants) for the chromosome
+    plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4))
+    plinkloader.update_variants(eveploader.get_vids())
+    plinkloader.update_individuals(covariatesloader.get_iids())
+
+    # set up the protein LOF burden loader
+    bloader_lof = BurdenLoaderHDF5(h5_lof, iid_lof, gid_lof)
+    bloader_lof.update_individuals(covariatesloader.get_iids())
+
+    # set up the splice genotype + vep loading function
+    def get_splice(interval):
+
+        try:
+            V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[1])
+        except KeyError:
+            raise GotNone
+
+        if V1.index.empty:
+            raise GotNone
+
+        vids = V1.index.get_level_values('vid')
+        V1 = V1.droplevel(['gene'])
+
+        temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False)
+
+        ncarrier = np.nansum(temp_genotypes > 0, axis=0)
+
+        # mean-impute missing genotypes (after centering, masked entries filled with 0. equal the column mean)
+        temp_genotypes -= np.nanmean(temp_genotypes, axis=0)
+        G1 = np.ma.masked_invalid(temp_genotypes).filled(0.)
+
+        cummac = mac_report.loc[vids].Minor
+
+        # spliceAI max score
+        weights = V1[0].values.astype(np.float64)
+
+        is_plof = V1[1].values.astype(bool)
+
+        splice_preds_all = V1.iloc[:, 2:]
+        splice_preds_all.columns = ['DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']
+
+        # "standardized" positions -> codon start positions
+        # pos = V1[0].values.astype(np.int32)
+
+        return G1, vids, weights, ncarrier, cummac, is_plof, splice_preds_all
+
+    # set up the protein-LOF loading function
+    def get_plof(interval):
+
+        try:
+            G2 = bloader_lof.genotypes_by_id(interval['name']).astype(np.float64)
+        except KeyError:
+            G2 = None
+
+        return G2
+
+    # conditional analysis
+    # set up the function to load the variants to condition on
+    def get_conditional(interval):
+
+        cond_snps_vid = conditional_list.loc[conditional_list.gene_name == interval['gene_name'], 'snp_rsid'].unique().tolist()
+
+        temp_genotypes, temp_vids = geno_cond.genotypes_by_id(cond_snps_vid, return_pos=False)
+        temp_genotypes -= np.nanmean(temp_genotypes, axis=0)
+
+        cond_snps = np.ma.masked_invalid(temp_genotypes).filled(0.)
+
+        return cond_snps, temp_vids
+
+    # set up the test-function for a single gene
+    def test_gene(interval, seed):
+
+        pval_dict = {}
+        pval_dict['gene'] = interval['name']
+
+        out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name'])
+        os.makedirs(out_dir, exist_ok=True)
+
+        # conditional analysis:
+        # get the snps to condition on, and include them in the null model for the LRT
+        cond_snps, cond_snps_vid = get_conditional(interval)
+        null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y)
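+
+        # why this conditions the test (sketch): the conditioning SNPs become
+        # part of the null-model covariates, so their signal is absorbed before
+        # the gene-based alternative is evaluated; only association beyond the
+        # known hits remains. Assumes Y is the phenotype array loaded earlier.
+        assert cond_snps.shape[0] == Y.shape[0], 'conditioning genotypes and phenotype must cover the same individuals'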
+
+        # conditional analysis:
+        # the score-test takes a second argument (G2) that allows conditioning on a second set of variants...
+        def pv_score(GV, G2=cond_snps):
+            # wraps the score-test
+            pv = null_model_score.pv_alt_model(GV, G2)
+            if pv < 0.:
+                pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
+            return pv
+
+        def call_test(GV, name):
+            pval_dict['pv_score_' + name] = pv_score(GV)
+
+            altmodel = null_model_lrt.altmodel(GV)
+
+            res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
+            pval_dict['pv_lrt_' + name] = res['pv']
+            pval_dict['lrtstat_' + name] = altmodel['stat']
+            if 'h2' in altmodel:
+                pval_dict['h2_' + name] = altmodel['h2']
+            if res['pv'] != 1.:
+                for stat in ['scale', 'dof', 'mixture', 'imax']:
+                    pval_dict[stat + '_' + name] = res[stat]
+                if len(res['res']) > 0:
+                    pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))
+
+        # load splice variants
+        G1, vids, weights, ncarrier, cummac, is_plof, splice_preds_all = get_splice(interval)
+        # keep indicates which variants are NOT "protein LOF" variants, i.e. variants not already identified by the ensembl VEP
+        keep = ~is_plof
+
+        # sanity checks
+        assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
+        assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expected: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
+
+        # do a score burden test (max weighted), this is different from the baseline!
+        # for every individual, the burden is sqrt(weight) of the highest-weighted variant they carry
+        G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True)
+
+        try:
+            call_test(G1_burden, 'linwb')
+        except AssertionError as err:
+            out_dump_np = '{}_{}_{{}}_covar.npy'.format(interval['name'], snakemake.params.phenotype)
+            np.save(out_dump_np.format('G1burden'), G1_burden)
+            np.save(out_dump_np.format('G1'), G1)
+            np.save(out_dump_np.format('Covar'), np.concatenate([X, cond_snps], axis=1))
+            np.save(out_dump_np.format('Y'), Y)
+            logging.error('AssertionError encountered when testing gene {}, conditioning on {}. dumping Y, covariates and G1 to {}'.format(interval['name'], ','.join(cond_snps_vid), out_dump_np.format('*')))
+            raise err
+
+        # linear weighted kernel
+        G1 = G1.dot(np.diag(np.sqrt(weights), k=0))
+
+        # do a score test (linear weighted)
+        call_test(G1, 'linw')
+
+        # load plof burden
+        G2 = get_plof(interval)
+
+        if G2 is not None:
+
+            if np.any(keep):
+
+                # merged (single variable)
+                G1_burden_mrg = np.maximum(G2, G1_burden)
+                call_test(G1_burden_mrg, 'linwb_mrgLOF')
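+
+                # the merged burden is the element-wise maximum of the 0/1 pLOF
+                # burden and the max-weighted spliceAI burden, e.g. rows [1.] and
+                # [0.7] merge to [1.] (values illustrative only); a carrier on
+                # either kernel therefore counts
+                assert np.all(G1_burden_mrg >= G1_burden), 'the merged burden dominates both inputs by construction'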
+
+                # concatenated ( >= 2 variables)
+                # we separate out the ones that are already part of the protein LOF variants!
+                G1 = np.concatenate([G1[:, keep], G2], axis=1)
+                call_test(G1, 'linw_cLOF')
+            else:
+                logging.info('All Splice-AI variants for gene {} were already identified by the Ensembl variant effect predictor'.format(interval['name']))
+
+        # conditional analysis: keep names of SNPs that we condition on
+        pval_dict['cond_snps'] = ','.join(cond_snps_vid)
+
+        return pval_dict
+
+    logging.info('loaders for chromosome {} initialized in {:.1f} seconds.'.format(chromosome, timer.check()))
+
+    # run tests for all genes on the chromosome
+    for _, region in regions.iterrows():
+
+        if snakemake.params.debug:
+            if i_gene > 2:
+                continue
+
+        try:
+            results.append(test_gene(region, i_gene))  # i_gene is used as the random seed to make things reproducible
+        except GotNone:
+            continue
+
+        i_gene += 1
+        logging.info('tested {} genes...'.format(i_gene))
+
+    i_chrom += 1
+    timer.reset()
+
+# export the results in a single table
+results = pd.DataFrame(results)
+results['pheno'] = snakemake.params.phenotype
+results.to_csv(snakemake.output.results_tsv, sep='\t', index=False)
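+
+# closing log line (added sketch): one row per tested gene, which makes failed
+# or truncated runs easy to spot in the logs
+logging.info('wrote {} rows for phenotype {} to {}'.format(len(results), snakemake.params.phenotype, snakemake.output.results_tsv))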
diff --git a/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits.py b/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits.py
index a0f8c90..cb30d6d 100644
--- a/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits.py
+++ b/script/python/assoc_sclrt_kernels_spliceai_eval_top_hits.py
@@ -208,10 +208,11 @@ def get_splice(interval):
 
     temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False)
 
+    ncarrier = np.nansum(temp_genotypes > 0, axis=0)
+
     temp_genotypes -= np.nanmean(temp_genotypes, axis=0)
     G1 = np.ma.masked_invalid(temp_genotypes).filled(0.)
-    ncarrier = np.sum(G1 > 0.5, axis=0)
 
     cummac = mac_report.loc[vids].Minor
 
     # spliceAI max score
@@ -305,8 +306,8 @@ def call_lrt(GV, name, vids=None):
     call_lrt(G1.dot(np.diag(np.sqrt(weights), k=0)), 'variant_pvals')  # single variant coefficients estimated *jointly* after weighting
 
     # sanity checks
-    assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
-    assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
+    # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
+    # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expected: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
 
     # do a score burden test (max weighted), this is different than the baseline!
     G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True)
diff --git a/script/python/assoc_sclrt_kernels_spliceai_retest_top_hits.py b/script/python/assoc_sclrt_kernels_spliceai_retest_top_hits.py
index d138006..7042016 100644
--- a/script/python/assoc_sclrt_kernels_spliceai_retest_top_hits.py
+++ b/script/python/assoc_sclrt_kernels_spliceai_retest_top_hits.py
@@ -236,7 +236,7 @@ def get_splice(interval):
 
     temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False)
 
-    ncarrier = np.nansum(np.nansum(temp_genotypes, axis=1) >= 1)
+    ncarrier = np.nansum(temp_genotypes > 0, axis=0)
 
     temp_genotypes -= np.nanmean(temp_genotypes, axis=0)
     G1 = np.ma.masked_invalid(temp_genotypes).filled(0.)
@@ -307,8 +307,8 @@ def call_test(GV, name):
     keep = ~is_plof
 
     # sanity checks
-    assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
-    assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
+    # assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {} got: {}'.format(interval['n_snp'], len(vids))
+    # assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expected: {}, got: {}'.format(interval['cumMAC'], cummac.sum())
 
     # do a score burden test (max weighted), this is different than the baseline!
diff --git a/script/python/gather_complete_cases.py b/script/python/gather_complete_cases.py
index 6a8a86d..f43a793 100644
--- a/script/python/gather_complete_cases.py
+++ b/script/python/gather_complete_cases.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import numpy as np
 from util.snake import clean_str
 
 covar = pd.read_csv(snakemake.input.covar_tsv, sep='\t')
@@ -7,13 +8,15 @@
 pheno = pd.read_csv(snakemake.input.pheno_tsv, sep='\t')
 
-not_nan = {}
-for p in pheno.columns[1:]:
-    not_nan[clean_str(p)] = pheno[[pheno.columns[0],p]].dropna()[pheno.columns[0]].values
+phenos = [ clean_str(x) for x in pheno.columns.tolist()[1:] ]
 
-for k, v in not_nan.items():
+for i, k in enumerate(phenos):
+
+    iid = pheno.iloc[:,[0,i+1]].dropna().iloc[:,0].values
+    iid = np.intersect1d(iid, covar.iloc[:,0].values)
+
     with open('data/covariates/complete_cases/{}.txt'.format(k), 'x') as outfile:
-        for value in v:
+        for value in iid:
             outfile.write('{}\n'.format(value))
 
 with open('data/covariates/complete_cases/covariates.txt', 'x') as outfile:
diff --git a/script/python/mac_report.py b/script/python/mac_report.py
index 284c06c..d21651b 100644
--- a/script/python/mac_report.py
+++ b/script/python/mac_report.py
@@ -27,6 +27,7 @@ def get_args():
     p.add_argument('--hwe', required=False, default=0.00001, type=float)
     p.add_argument('--log', required=False, default='log.txt')
     p.add_argument('--threads', default=4, type=int)
+    p.add_argument('--mem_mb', default=4000, type=int)
     args = p.parse_args()
     return(args)
@@ -73,18 +74,18 @@ def main():
     if args.hwe > 0.:
         tmpfilename_vid = '{}.snplist'.format(args.out_prefix)
         if args.iid is not None:
-            plink_command = [args.plink2_path, '--keep', tmpfilename_iid, '--bfile', args.bed, '--threads', str(args.threads), '--hwe', str(args.hwe), '--write-snplist', '--out', args.out_prefix]
+            plink_command = [args.plink2_path, '--keep', tmpfilename_iid, '--bfile', args.bed, '--memory', str(args.mem_mb-2000), '--threads', str(args.threads), '--hwe', str(args.hwe), '--write-snplist', '--out', args.out_prefix]
         else:
-            plink_command = [args.plink2_path, '--bfile', args.bed, '--threads', str(args.threads), '--hwe', str(args.hwe), '--write-snplist', '--out', args.out_prefix]
+            plink_command = [args.plink2_path, '--bfile', args.bed, '--memory', str(args.mem_mb-2000), '--threads', str(args.threads), '--hwe', str(args.hwe), '--write-snplist', '--out', args.out_prefix]
 
     logging.info('plink command: {}'.format(' '.join(plink_command)))
     _ = subprocess.run(plink_command, check=True)
 
     # plink command to create count reports
     plink_command = [args.plink_path, '--bfile', args.bed]
     if args.iid is not None:
-        plink_command += ['--keep', tmpfilename_iid, '--freq', 'counts', 'gz', '--threads', str(args.threads), '--out', args.out_prefix]
+        plink_command += ['--keep', tmpfilename_iid, '--freq', 'counts', 'gz', '--memory', str(args.mem_mb-2000), '--threads', str(args.threads), '--out', args.out_prefix]
     else:
-        plink_command += ['--freq', 'counts', 'gz', '--threads', str(args.threads), '--out', args.out_prefix ]
+        plink_command += ['--freq', 'counts', 'gz', '--threads', str(args.threads), '--memory', str(args.mem_mb-2000), '--out', args.out_prefix ]
 
     if args.hwe > 0.:
         plink_command += ['--extract', tmpfilename_vid]
str(args.threads), '--out', args.out_prefix ] + plink_command += ['--freq', 'counts', 'gz', '--threads', str(args.threads),'--memory', str(args.mem_mb-2000), '--out', args.out_prefix ] if args.hwe > 0.: plink_command += ['--extract', tmpfilename_vid] diff --git a/slurm/config.yaml b/slurm/config.yaml new file mode 100644 index 0000000..f7eca57 --- /dev/null +++ b/slurm/config.yaml @@ -0,0 +1,24 @@ +cluster: + mkdir -p logs/{rule} && + sbatch + --partition={resources.partition} + --cpus-per-task={threads} + --mem={resources.mem_mb} + --time={resources.time} + --job-name=smk-{rule}-{wildcards} + --output=logs/{rule}/{rule}-{wildcards}-%j.out +default-resources: + - partition=vcpu,hpcpu + - mem_mb=4000 + - time="01:00:00" +restart-times: 1 +max-jobs-per-second: 10 +max-status-checks-per-second: 1 +local-cores: 1 +latency-wait: 5 +jobs: 50 +keep-going: True +rerun-incomplete: True +printshellcmds: True +scheduler: greedy +use-conda: True diff --git a/snake/association.smk b/snake/association.smk index 706314f..550f3dc 100644 --- a/snake/association.smk +++ b/snake/association.smk @@ -3,6 +3,7 @@ # the {id} wildcard refers to the different chromosomes # the {pheno} wildcard refers to the different phenotypes + rule assoc_baseline_scoretest: # runs "baseline" association tests: # score tests for indicator variables @@ -25,6 +26,11 @@ rule assoc_baseline_scoretest: filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence] output: results_tsv = 'work/association/baseline_scoretest/{filter_highconfidence}/{pheno}/results_{id}.tsv.gz' + threads: + 1 + resources: + mem_mb=4000, + time='2:00:00' log: 'logs/association/baseline_scoretest/{filter_highconfidence}/{pheno}/{id}.log' conda: @@ -48,6 +54,9 @@ rule pLOF_nsnp_cummac: tsv='work/association/baseline_scoretest/{filter_highconfidence}/{pheno}/pLOF_nSNP_cumMAC.tsv.gz' conda: '../env/seak.yml' + resources: + mem_mb=4000, + time='0:30:00' script: '../script/python/pLOF_nSNP_cumMAC.py' @@ -82,6 +91,11 @@ rule assoc_missense_localcollapsing: filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence], sclrt_nominal_significance_cutoff = 0.1, debug=False + resources: + mem_mb=get_mem_mb(12000,1.5), + time="48:00:00" + threads: + 1 log: 'logs/association/sclrt_kernels_missense/{filter_highconfidence}_{pheno}.log' conda: @@ -111,7 +125,7 @@ rule assoc_missense_localcollapsing_eval_top_hits: output: out_ok = touch('work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/top_hits/all.ok') params: - kernels = ['lincollapsed','linwcollapsed','lincollapsed_cLOF','linwcollapsed_cLOF','linb','linwb','linb_mrgLOF','linwb_mrgLOF'], # genes with p < 1e-7 in one of these kernels will be analysed in detail + kernels = ['linwcollapsed','linwcollapsed_cLOF','linwb','linwb_mrgLOF'], # genes with p < 1e-7 in one of these kernels will be analysed in detail phenotype = lambda wc: phenotypes[ wc.pheno ], covariate_column_names = config['covariate_column_names'], max_maf = config['maf_cutoff'], @@ -119,6 +133,11 @@ rule assoc_missense_localcollapsing_eval_top_hits: out_dir_stats = lambda wc: 'work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/top_hits/'.format(filter_highconfidence=wc.filter_highconfidence, pheno=wc.pheno), ids = plinkfiles.getIds(), filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence] + resources: + mem_mb=get_mem_mb(10000,1.5), + time="02:30:00" + threads: + 1 log: 
diff --git a/snake/association.smk b/snake/association.smk
index 706314f..550f3dc 100644
--- a/snake/association.smk
+++ b/snake/association.smk
@@ -3,6 +3,7 @@
 # the {id} wildcard refers to the different chromosomes
 # the {pheno} wildcard refers to the different phenotypes
 
+
 rule assoc_baseline_scoretest:
     # runs "baseline" association tests:
     # score tests for indicator variables
@@ -25,6 +26,11 @@ rule assoc_baseline_scoretest:
         filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence]
     output:
         results_tsv = 'work/association/baseline_scoretest/{filter_highconfidence}/{pheno}/results_{id}.tsv.gz'
+    threads:
+        1
+    resources:
+        mem_mb=4000,
+        time='2:00:00'
     log:
         'logs/association/baseline_scoretest/{filter_highconfidence}/{pheno}/{id}.log'
     conda:
@@ -48,6 +54,9 @@ rule pLOF_nsnp_cummac:
         tsv='work/association/baseline_scoretest/{filter_highconfidence}/{pheno}/pLOF_nSNP_cumMAC.tsv.gz'
     conda:
         '../env/seak.yml'
+    resources:
+        mem_mb=4000,
+        time='0:30:00'
     script:
         '../script/python/pLOF_nSNP_cumMAC.py'
@@ -82,6 +91,11 @@ rule assoc_missense_localcollapsing:
         filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
         sclrt_nominal_significance_cutoff = 0.1,
         debug=False
+    resources:
+        mem_mb=get_mem_mb(12000,1.5),
+        time="48:00:00"
+    threads:
+        1
     log:
         'logs/association/sclrt_kernels_missense/{filter_highconfidence}_{pheno}.log'
     conda:
@@ -111,7 +125,7 @@ rule assoc_missense_localcollapsing_eval_top_hits:
     output:
         out_ok = touch('work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/top_hits/all.ok')
     params:
-        kernels = ['lincollapsed','linwcollapsed','lincollapsed_cLOF','linwcollapsed_cLOF','linb','linwb','linb_mrgLOF','linwb_mrgLOF'], # genes with p < 1e-7 in one of these kernels will be analysed in detail
+        kernels = ['linwcollapsed','linwcollapsed_cLOF','linwb','linwb_mrgLOF'], # genes with p < 1e-7 in one of these kernels will be analysed in detail
         phenotype = lambda wc: phenotypes[ wc.pheno ],
         covariate_column_names = config['covariate_column_names'],
         max_maf = config['maf_cutoff'],
@@ -119,6 +133,11 @@ rule assoc_missense_localcollapsing_eval_top_hits:
         out_dir_stats = lambda wc: 'work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/top_hits/'.format(filter_highconfidence=wc.filter_highconfidence, pheno=wc.pheno),
         ids = plinkfiles.getIds(),
         filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence]
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="02:30:00"
+    threads:
+        1
     log:
         'logs/association/sclrt_kernels_missense_eval_top_hits/{filter_highconfidence}_{pheno}.log'
     conda:
@@ -162,6 +181,11 @@ rule assoc_missense_localcollapsing_retest_top_hits:
         filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
         debug=False,
         random=False
+    resources:
+        mem_mb=get_mem_mb(15000,1.5),
+        time="02:30:00"
+    threads:
+        1
     log:
         'logs/association/sclrt_kernels_missense_retest_top_hits/{filter_highconfidence}_{pheno}.log'
     conda:
@@ -205,6 +229,11 @@ rule assoc_missense_localcollapsing_retest_random:
         filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
         debug=False,
         random=True
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="2:30:00"
+    threads:
+        1
     log:
         'logs/association/sclrt_kernels_missense_retest_random/{filter_highconfidence}_{pheno}.log'
     conda:
@@ -248,6 +277,11 @@ rule assoc_spliceai_linw:
         sclrt_nominal_significance_cutoff = 0.1
     log:
         'logs/association/sclrt_kernels_spliceai/{filter_highconfidence}_{pheno}.log'
+    resources:
+        mem_mb=get_mem_mb(12000,1.5),
+        time="24:00:00"
+    threads:
+        1
     conda:
         '../env/seak.yml'
     script:
@@ -287,6 +321,11 @@ rule assoc_spliceai_linw_eval_top_hits:
        debug = False
     log:
         'logs/association/sclrt_kernels_spliceai_eval_top_hits/{filter_highconfidence}_{pheno}.log'
+    resources:
+        mem_mb=get_mem_mb(12000,1.5),
+        time="2:30:00"
+    threads:
+        1
     conda:
         '../env/seak.yml'
     script:
@@ -330,6 +369,11 @@ rule assoc_spliceai_linw_retest_top_hits:
         random=False
     log:
         'logs/association/sclrt_kernels_spliceai_retest_top_hits/{filter_highconfidence}_{pheno}.log'
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="1:30:00"
+    threads:
+        1
     conda:
         '../env/seak.yml'
     script:
@@ -372,6 +416,11 @@ rule assoc_spliceai_linw_retest_random:
         random = True
     log:
         'logs/association/sclrt_kernels_spliceai_retest_top_hits/{filter_highconfidence}_{pheno}.log'
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="1:30:00"
+    threads:
+        1
     conda:
         '../env/seak.yml'
     script:
@@ -460,6 +509,11 @@ rule assoc_deepripe_multiple_cholesky:
         sclrt_nominal_significance_cutoff = 0.1
     log:
         'logs/association/sclrt_kernels_deepripe_multiple/{filter_highconfidence}_{pheno}.log'
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="8:00:00"
+    threads:
+        1
     conda:
         '../env/seak.yml'
     script:
@@ -499,6 +553,11 @@ rule assoc_deepripe_multiple_cholesky_eval_top_hits:
         filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
         rbp_of_interest = rules.assoc_deepripe_multiple_cholesky.params.rbp_of_interest,
         debug = False
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="01:00:00"
+    threads:
+        1
     log:
         'logs/association/sclrt_kernels_deepripe_multiple_eval_top_hits/{filter_highconfidence}_{pheno}.log'
     conda:
@@ -544,6 +603,11 @@ rule assoc_deepripe_multiple_cholesky_retest_top_hits:
         rbp_of_interest = rules.assoc_deepripe_multiple_cholesky.params.rbp_of_interest,
         debug = False,
         random = False
+    resources:
+        mem_mb=get_mem_mb(10000,1.5),
+        time="01:00:00"
+    threads:
+        1
     log:
         'logs/association/sclrt_kernels_deepripe_multiple_retest_top_hits/{filter_highconfidence}_{pheno}.log'
     conda:
@@ -602,3 +666,29 @@ rule assoc_deepripe_multiple_cholesky_retest_random_all:
     input:
         expand(rules.assoc_deepripe_multiple_cholesky_retest_random.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
+
+
+#######
+# all #
+#######
+
+
+rule all_assoc:
+    input:
+        rules.assoc_baseline_scoretest_all.input,
+        rules.assoc_missense_localcollapsing_all.input,
+        rules.assoc_spliceai_linw_all.input,
+        rules.assoc_deepripe_multiple_cholesky_all.input
+
+
+rule all_eval_top_hits:
+    input:
+        rules.assoc_missense_localcollapsing_eval_top_hits_all.input,
+        rules.assoc_spliceai_linw_eval_top_hits_all.input,
+        rules.assoc_deepripe_multiple_cholesky_eval_top_hits_all.input
+
+
+rule all_retest_top_hits:
+    input:
+        rules.assoc_missense_localcollapsing_retest_top_hits_all.input,
+        rules.assoc_spliceai_linw_retest_top_hits_all.input,
+        rules.assoc_deepripe_multiple_cholesky_retest_top_hits_all.input
diff --git a/snake/conditional_tests.smk b/snake/conditional_tests.smk
new file mode 100644
index 0000000..d0cf52c
--- /dev/null
+++ b/snake/conditional_tests.smk
@@ -0,0 +1,197 @@
+
+# TODO: add conditional_list to config.yaml
+
+
+rule assoc_baseline_scoretest_conditional_analysis:
+    # performs conditional analysis for the baseline
+    input:
+        h5_missense = rules.export_missense_burden.output.h5,
+        iid_missense = rules.export_missense_burden.output.iid_txt,
+        gid_missense = rules.export_missense_burden.output.gene_txt,
+        h5_lof = rules.export_plof_burden.output.h5,
+        iid_lof = rules.export_plof_burden.output.iid_txt,
+        gid_lof = rules.export_plof_burden.output.gene_txt,
+        covariates_tsv = config['covariates'],
+        phenotypes_tsv = config['phenotypes'],
+        results_tsv = rules.assoc_baseline_scoretest.output['results_tsv'],
+        conditional_list = config['conditional_list'],
+        conditional_geno = config['conditional_geno'] + '.bed'
+    output:
+        results_tsv = 'work/association/baseline_scoretest/{filter_highconfidence}/{pheno}/conditional_analysis/conditionalres_{id}.tsv.gz'
+    params:
+        # phenotypes is a dictionary that maps file-prefixes to column names, see ../Snakefile
+        phenotype = lambda wc: phenotypes[ wc.pheno ],
+        covariate_column_names = config['covariate_column_names'],
+        filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
+        columns = ['pv_pLOF','pv_missense','pv_mrg'],
+        significance_cutoff = 1e-7
+    threads:
+        1
+    resources:
+        mem_mb=4000,
+        time='0:10:00'
+    log:
+        'logs/association/baseline_scoretest_conditional_analysis/{filter_highconfidence}/{pheno}/{id}.log'
+    conda:
+        '../env/seak.yml'
+    script:
+        '../script/python/assoc_baseline_scoretest_conditional_analysis.py'
+
+
+rule assoc_baseline_scoretest_conditional_analysis_all:
+    input:
+        expand(rules.assoc_baseline_scoretest_conditional_analysis.output, filter_highconfidence=['all'], pheno=phenotypes.keys(), id=plinkfiles.getIds())
+
+
+rule assoc_missense_localcollapsing_conditional_analysis:
+    # calculates gene-specific LRT p-values for the most significant genes for missense variants
+    # conditions on a list of common variants
+    input:
+        h5_lof = expand(rules.export_plof_burden.output.h5, id = plinkfiles.getIds(), allow_missing=True),
+        iid_lof = expand(rules.export_plof_burden.output.iid_txt, id = plinkfiles.getIds(), allow_missing=True),
+        gid_lof = expand(rules.export_plof_burden.output.gene_txt, id = plinkfiles.getIds(), allow_missing=True),
+        covariates_tsv = config['covariates'],
+        phenotypes_tsv = config['phenotypes'],
+        bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()),
+        vep_tsv = expand(rules.evep_missense_proc.output.tsv_filtered, id = plinkfiles.getIds()),
+        mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True),
+        regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed,
+        results_tsv = rules.assoc_missense_localcollapsing.output.results_tsv,
+        seak_install = rules.install_seak.output,
+        conditional_list = config['conditional_list'],
+        conditional_geno = config['conditional_geno'] + '.bed'
+    output:
+        results_tsv = 'work/association/sclrt_kernels_missense/{filter_highconfidence}/{pheno}/conditional_analysis/lrt_conditional.tsv.gz'
+    params:
+        phenotype = lambda wc: phenotypes[ wc.pheno ],
+        covariate_column_names = config['covariate_column_names'],
+        kernels = ['linwcollapsed','linwcollapsed_cLOF','linwb','linwb_mrgLOF'], # kernels to consider
+        max_maf = config['maf_cutoff'],
+        min_impact = config['min_impact'],
+        ids = plinkfiles.getIds(),
+        filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
+        out_dir_stats = lambda wc, output: '/'.join(output['results_tsv'].split('/')[:-1]),
+        significance_cutoff = 1e-7,
+        debug = False
+    resources:
+        mem_mb=12000,
+        time="02:30:00"
+    log:
+        'logs/association/sclrt_kernels_missense_conditional_analysis/{filter_highconfidence}_{pheno}.log'
+    conda:
+        '../env/seak.yml'
+    script:
+        '../script/python/assoc_sclrt_kernels_missense_conditional_analysis.py'
+
+
+rule all_assoc_missense_localcollapsing_conditional_analysis:
+    # runs rule above for all phenotypes
+    input:
+        expand(rules.assoc_missense_localcollapsing_conditional_analysis.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
+
+
+rule assoc_spliceai_linw_conditional_analysis:
+    # calculates gene-specific LRT p-values for the most significant genes for splice variants
+    # conditions on a list of common variants
+    input:
+        h5_lof = expand(rules.export_plof_burden.output.h5, id = plinkfiles.getIds(), allow_missing=True),
+        iid_lof = expand(rules.export_plof_burden.output.iid_txt, id = plinkfiles.getIds(), allow_missing=True),
+        gid_lof = expand(rules.export_plof_burden.output.gene_txt, id = plinkfiles.getIds(), allow_missing=True),
+        ensembl_vep_tsv = expand(rules.process_ensembl_vep_output_highimpact.output.tsv, id = plinkfiles.getIds(), allow_missing=True),
+        covariates_tsv = config['covariates'],
+        phenotypes_tsv = config['phenotypes'],
+        bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()),
+        vep_tsv = expand(rules.splice_ai_filter_and_overlap_with_genotypes.output.tsv, id = plinkfiles.getIds()),
+        mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True),
+        regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed,
+        results_tsv = rules.assoc_spliceai_linw.output.results_tsv,
+        seak_install = rules.install_seak.output,
+        conditional_list = config['conditional_list'],
+        conditional_geno = config['conditional_geno'] + '.bed'
+    output:
+        results_tsv = 'work/association/sclrt_kernels_spliceai/{filter_highconfidence}/{pheno}/conditional_analysis/lrt_conditional.tsv.gz'
+    params:
+        kernels = ['linwb','linw','linwb_mrgLOF','linw_cLOF'],
+        phenotype = lambda wc: phenotypes[wc.pheno],
+        covariate_column_names = config['covariate_column_names'],
+        max_maf = config['maf_cutoff'],
+        min_impact = config['splice_ai_min_impact'],
+        out_dir_stats = lambda wc, output: '/'.join(output['results_tsv'].split('/')[:-1]),
+        ids = plinkfiles.getIds(),
+        filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
+        debug = False,
+        significance_cutoff = 1e-7
+    resources:
+        mem_mb=10000,
+        time="02:30:00"
+    log:
+        'logs/association/sclrt_kernels_spliceai_conditional_analysis/{filter_highconfidence}_{pheno}.log'
+    conda:
+        '../env/seak.yml'
+    script:
+        '../script/python/assoc_sclrt_kernels_spliceai_conditional_analysis.py'
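+
+# Expected layout of the conditional_list file (these are the columns the
+# conditional-analysis scripts read: gene_name, pheno, snp_rsid; the values
+# shown are made up):
+#
+#   gene_name   pheno     snp_rsid
+#   GENE1       pheno_a   rs0000001
+#   GENE2       pheno_b   rs0000002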
+
+
+rule all_assoc_spliceai_linw_conditional_analysis:
+    # runs rule above for all phenotypes
+    input:
+        expand(rules.assoc_spliceai_linw_conditional_analysis.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
+
+
+rule assoc_deepripe_multiple_cholesky_conditional_analysis:
+    # re-tests the most significant genes for DeepRiPe variants, conditioning on a list of common variants
+    input:
+        bed = expand(rules.link_genotypes.output.bed, id = plinkfiles.getIds()),
+        h5_rbp_plus = expand(rules.run_deepripe_vep.output.h5, id = plinkfiles.getIds(), strand=['plus']),
+        h5_rbp_minus = expand(rules.run_deepripe_vep.output.h5, id = plinkfiles.getIds(), strand=['minus']),
+        bed_rbp_plus = expand(rules.run_deepripe_vep.output.bed, id = plinkfiles.getIds(), strand=['plus']),
+        bed_rbp_minus = expand(rules.run_deepripe_vep.output.bed, id = plinkfiles.getIds(), strand=['minus']),
+        ensembl_vep_tsv = expand(rules.process_ensembl_vep_output_highimpact.output.tsv, id = plinkfiles.getIds(), allow_missing=True),
+        mac_report = expand(rules.filter_variants.output.vid_tsv, id = plinkfiles.getIds(), allow_missing=True),
+        regions_bed = rules.get_protein_coding_genes.output.pc_genes_bed,
+        seak_install = rules.install_seak.output,
+        covariates_tsv = config['covariates'],
+        phenotypes_tsv = config['phenotypes'],
+        results_tsv = rules.assoc_deepripe_multiple_cholesky.output.results_tsv,
+        conditional_list = config['conditional_list'],
+        conditional_geno = config['conditional_geno'] + '.bed'
+    output:
+        results_tsv = 'work/association/sclrt_kernels_deepripe_multiple/{filter_highconfidence}/{pheno}/conditional_analysis/lrt_conditional.tsv.gz'
+    params:
+        kernels = ['linwcholesky', 'linwcholesky_notLOF'],
+        phenotype = lambda wc: phenotypes[wc.pheno],
+        covariate_column_names = config['covariate_column_names'],
+        max_maf = config['maf_cutoff'],
+        min_impact = config['deepripe_min_impact'],
+        out_dir_stats = lambda wc, output: '/'.join(output['results_tsv'].split('/')[:-1]),
+        ids = plinkfiles.getIds(),
+        filter_highconfidence = lambda wc: {'all': False, 'highconf_only': True}[wc.filter_highconfidence],
+        rbp_of_interest = rules.assoc_deepripe_multiple_cholesky.params.rbp_of_interest,
+        debug = False,
+        random = False,
+        significance_cutoff = 1e-7
+    resources:
+        mem_mb=10000,
+        time="02:30:00"
+    log:
+        'logs/association/sclrt_kernels_deepripe_multiple_conditional_analysis/{filter_highconfidence}_{pheno}.log'
+    conda:
+        '../env/seak.yml'
+    script:
+        '../script/python/assoc_sclrt_kernels_deepripe_multiple_conditional_analysis.py'
+
+
+rule assoc_deepripe_multiple_cholesky_conditional_analysis_all:
+    # runs rule above for all phenotypes - this will effectively run the entire pipeline for DeepRiPe variants
+    input:
+        expand(rules.assoc_deepripe_multiple_cholesky_conditional_analysis.output, pheno=phenotypes.keys(), filter_highconfidence=['all'])
+
+
+rule conditional_analysis_all:
+    input:
+        rules.assoc_deepripe_multiple_cholesky_conditional_analysis_all.input,
+        rules.all_assoc_spliceai_linw_conditional_analysis.input,
+        rules.all_assoc_missense_localcollapsing_conditional_analysis.input,
+        rules.assoc_baseline_scoretest_conditional_analysis_all.input
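+
+# Typical invocation (sketch; run_cluster.sh forwards extra arguments to
+# snakemake): ./run_cluster.sh conditional_analysis_all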
diff --git a/snake/evep.smk b/snake/evep.smk
index 372760a..fc66442 100644
--- a/snake/evep.smk
+++ b/snake/evep.smk
@@ -189,6 +189,9 @@ rule export_plof_burden:
         iid_txt = 'work/gene_scores/indicator01/pLOF/{filter_highconfidence}/{id}_iid.txt'
     log:
         'logs/evep/export_plof_burden_{filter_highconfidence}_{id}.log'
+    resources:
+        mem_mb=8000,
+        time='1:00:00'
     conda:
         '../env/seak.yml'
     script:
@@ -228,6 +231,11 @@ rule export_missense_burden:
         iid_txt = 'work/gene_scores/indicator01/missense/{filter_highconfidence}/{id}_iid.txt'
     log:
         'logs/evep/export_missense_burden_{filter_highconfidence}_{id}.log'
+    resources:
+        mem_mb=12000,
+        time='1:00:00'
+    threads:
+        1
     conda:
         '../env/seak.yml'
     script:
diff --git a/snake/results_processing.smk b/snake/results_processing.smk
index ed2d0b7..cf15586 100644
--- a/snake/results_processing.smk
+++ b/snake/results_processing.smk
@@ -1,6 +1,4 @@
-
-
 rule export_results_baseline:
     # export results and calculate genomic inflation
     input:
diff --git a/snake/setup.smk b/snake/setup.smk
index 615159a..6908cab 100644
--- a/snake/setup.smk
+++ b/snake/setup.smk
@@ -81,6 +81,41 @@ rule gather_complete_cases:
     script:
         '../script/python/gather_complete_cases.py'
 
+
+rule complete_cases_ancestry:
+    # create ancestry keep-files for plink
+    input:
+        iid_txt = 'data/covariates/complete_cases/covariates.txt',
+        ancestry_tsv = config['ancestry_scoring_file'] if 'ancestry_scoring_file' in config else 'ancestry_scoring_file_is_undefined'
+    output:
+        ancestry_keep = expand('data/ancestry_keep_files/{ancestries}.keep', ancestries=['AFR','AMR','EAS','EUR','SAS'])
+    run:
+        import pandas as pd
+
+        with open(input['iid_txt'], 'r') as infile:
+            iids = [ l.rstrip() for l in infile ]
+
+        ancestry = pd.read_csv(input['ancestry_tsv'], sep='\t', dtype={0: str})
+        if 'IID' in ancestry.columns:
+            ancestry.rename(columns={'IID': 'iid'}, inplace=True)
+        ancestry.set_index('iid', inplace=True)
+
+        print('{} individuals in ancestry file'.format(len(ancestry)))
+        print('{} individuals in iid-file. Subsetting to those individuals.'.format(len(iids)))
+
+        # assumes all iids are present in the ancestry file (.loc raises a KeyError otherwise)
+        ancestry = ancestry.loc[iids]
+
+        for a in ancestry.columns:
+            outfile = 'data/ancestry_keep_files/{}.keep'.format(a)
+
+            # plink keep-files expect two columns: FID and IID
+            with open(outfile, 'w') as out:
+                for idx in ancestry[ancestry[a] > 0.5].index.values:
+                    out.write('{}\t{}\n'.format(idx, idx))
+
+            print('wrote {} iids to {}'.format((ancestry[a] > 0.5).sum(), outfile))
+
 rule mac_report:
     # filter by HWE and
@@ -117,10 +152,52 @@ rule mac_report_all:
     input:
         expand(rules.mac_report.output, id = plinkfiles.getIds())
 
+
+rule mac_report_ancestry:
+    # filter by HWE and
+    # create minor allele count report for specific ancestry and chromosome
+    input:
+        iid_txt = 'data/ancestry_keep_files/{ancestry}.keep',
+        plink = 'bin/plink',
+        bed = rules.link_genotypes.output.bed
+    output:
+        frq_tsv = temp('work/mac_report/all/{ancestry}/{id}.frq.counts.gz'),
+        tsv = 'work/mac_report/all/{ancestry}/{id}.tsv.gz',
+        plink_log = 'work/mac_report/all/{ancestry}/{id}.log'
+    params:
+        in_prefix = lambda wc, input: input.bed[:-4],
+        out_prefix = 'work/mac_report/all/{ancestry}/{id}'
+    log:
+        'logs/mac_report_ancestry/{ancestry}/{id}.log'
+    conda:
+        '../env/genomics.yml'
+    threads:
+        1
+    resources:
+        mem_mb=8000,
+        time="1:00:00"
+    shell:
+        'python script/python/mac_report.py '
+        '--iid {input.iid_txt} '
+        '--bed {params.in_prefix} '
+        '-o {params.out_prefix} '
+        '--plink_path {input.plink} '
+        '--log {log} '
+        '--threads {threads} '
+        '--mem_mb {resources.mem_mb} '
+        '&> {log} '
+
+
+rule mac_report_ancestry_all:
+    # create minor allele count reports for all ancestries and all chromosomes
+    input:
+        expand(rules.mac_report_ancestry.output, id = plinkfiles.getIds(), ancestry=['AFR','AMR','EAS','EUR','SAS'])
+
+
 rule filter_variants:
     # apply missingness filters and
     # high-confidence region filter for phenotype-complete cases
-    # returns the filtered variant ids, their MAF (calculated on the covariate-complete cases) and their MAC (calculated on the phenotype-complete cases)
+    # returns the filtered variant ids, their MAF (calculated on the covariate-complete cases) and their MAC (calculated on the *phenotype*-complete cases)
     # the MAF filter is applied while performing the association tests, NOT here.
     input:
         mac_tsv = rules.mac_report.output.tsv,