Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
remomomo committed May 19, 2022
2 parents 54f3e3c + bfa1049 commit 5dd6a20
Show file tree
Hide file tree
Showing 23 changed files with 1,918 additions and 42 deletions.
14 changes: 13 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ import gzip

configfile: "conf/config.yaml"

#####################
# utility functions #
#####################

def get_mem_mb(mem_mb=10000, factor=1.5):
    """Build a resource callable that scales memory with retry attempts.

    Returns a function suitable for a Snakemake ``resources: mem_mb=``
    entry: attempt 1 requests ``mem_mb`` megabytes, and every retry
    multiplies the request by ``factor`` (exponential back-off).

    :param int mem_mb: base memory request in megabytes for the first attempt
    :param float factor: multiplier applied per additional attempt
    :return: callable ``(wildcards, attempt) -> int`` memory request in MB
    """
    def gm(wildcards, attempt):
        # int() because cluster schedulers expect whole megabytes;
        # factor ** 0 == 1.0 would otherwise make even attempt 1 a float
        return int(factor ** (attempt - 1) * mem_mb)
    return gm


###################################
# samplesheet for the plink files #
###################################
Expand Down Expand Up @@ -46,7 +57,7 @@ wildcard_constraints:
# local rules #
###############

localrules: link_all, install_sh, install_seak, setup_all, run_deepripe_vep_all, run_ensembl_vep_all, evep_missense_proc_all, splice_ai_filter_and_overlap_with_genotypes_all, splice_ai_vcf_to_tsv_all, filter_variants_all, export_plof_burden_all, export_missense_burden_all, assoc_baseline_scoretest_all, mac_report_all, assoc_spliceai_linw_all, assoc_deepripe_single_localcollapsing_all, pLOF_nsnp_cummac_all
localrules: link_all, link_genotypes, install_sh, install_seak, setup_all, run_deepripe_vep_all, run_ensembl_vep_all, evep_missense_proc_all, splice_ai_filter_and_overlap_with_genotypes_all, splice_ai_vcf_to_tsv_all, filter_variants_all, export_plof_burden_all, export_missense_burden_all, assoc_baseline_scoretest_all, mac_report_all, assoc_spliceai_linw_all, assoc_deepripe_single_localcollapsing_all, pLOF_nsnp_cummac_all, complete_cases_ancestry, gather_complete_cases


###############
Expand Down Expand Up @@ -156,4 +167,5 @@ include: "snake/deepripe.smk"
include: "snake/evep.smk"
include: "snake/splice_ai.smk"
include: "snake/association.smk"
include: "snake/conditional_tests.smk"
include: "snake/results_processing.smk"
1 change: 1 addition & 0 deletions env/snakemake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ dependencies:
- python=3.8
- snakemake-minimal
- pandas
- mamba
29 changes: 29 additions & 0 deletions run_cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Submit the Snakemake control job to SLURM; extra args are forwarded to snakemake.

#SBATCH --job-name=controljob_%j
#SBATCH --output=snakemake_%j.log
#SBATCH --partition=vcpu
#SBATCH --time=48:00:00
#SBATCH -c 1
#SBATCH --mem 2000

# Initialize conda so "conda activate" works in this non-interactive shell:
eval "$(conda shell.bash hook)"

snakemake_env="install/snakemake"

# Bootstrap the environments on first use.
# Fix: quote the variable — unquoted it is subject to word splitting/globbing.
if [ ! -d "$snakemake_env" ]; then
    ./install.sh
fi

#conda activate $snakemake_env

# NOTE(review): this activates a globally installed "snakemake" environment
# rather than the $snakemake_env created by install.sh — confirm intentional.
conda activate snakemake

snakemake --snakefile Snakefile \
    --configfile conf/config.yaml \
    --profile ./slurm \
    --directory "${PWD}" \
    "${@}"


179 changes: 179 additions & 0 deletions script/python/assoc_baseline_scoretest_conditional_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@



import logging
import sys

logging.basicConfig(filename=snakemake.log[0], level=logging.INFO)

import gzip

import numpy as np
import pandas as pd

from seak.scoretest import ScoretestNoK
from seak.data_loaders import intersect_ids, CovariatesLoaderCSV, VariantLoaderSnpReader
from pysnptools.snpreader import Bed
from util.association import BurdenLoaderHDF5
from util import Timer


'''
This script is the baseline that we later compare to.
It loops over genes and performs score-tests, using pre-computed indicator variables (0/1),
conditioning each test on a per-gene list of common variants.
'''


# covariates loader: covariates + the phenotype column for this run
covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype,
                                       snakemake.input.covariates_tsv,
                                       snakemake.params.covariate_column_names,
                                       sep='\t',
                                       path_to_phenotypes=snakemake.input.phenotypes_tsv)

# set up burden loaders (pre-computed 0/1 carrier indicators per gene)
bloader_lof = BurdenLoaderHDF5(snakemake.input.h5_lof, snakemake.input.iid_lof, snakemake.input.gid_lof)
bloader_missense = BurdenLoaderHDF5(snakemake.input.h5_missense, snakemake.input.iid_missense, snakemake.input.gid_missense)

# make sure individuals are in the same order as the covariates
bloader_lof.update_individuals(covariatesloader.get_iids())
bloader_missense.update_individuals(covariatesloader.get_iids())

# gene names to iterate over
genes = np.union1d(bloader_lof.get_vids(), bloader_missense.get_vids())
if isinstance(genes, str):
    # a single gene comes back as a bare string; normalize to a list
    genes = [genes]

# keep only those that are significant!
# (1) load the baseline results table
results = pd.read_csv(snakemake.input.results_tsv, sep='\t', na_values='.')

# the p-value columns to consider
sigcols = snakemake.params.columns
if isinstance(sigcols, str):
    sigcols = [sigcols]

pvcols_score = sigcols
results = results[['gene', 'nCarrier_pLOF', 'nCarrier_missense'] + pvcols_score]
results['pheno'] = snakemake.params.phenotype
results = results.loc[results.gene.isin(genes)]
# gene ids look like "<id>_<gene_name>"; extract the name part
results['gene_name'] = results['gene'].str.split('_', expand=True)[1]
genes = results.gene.tolist()

# (2) load the list of gene/phenotype pairs that have variants to condition on
_conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0, usecols=['gene_name', 'pheno'])
_conditional_list = _conditional_list[(_conditional_list.pheno == snakemake.wildcards['pheno']) | (_conditional_list.pheno == snakemake.params.phenotype)]
_conditional_list = _conditional_list.drop_duplicates()

if len(_conditional_list) == 0:
    # fix: the original message called .format() on a string with no placeholder,
    # silently dropping the phenotype argument
    logging.info('No genes pass significance threshold for phenotype {}.'.format(snakemake.params.phenotype))
    with gzip.open(snakemake.output.results_tsv, 'wt') as outfile:
        outfile.write('# no significant hits for phenotype {}. \n'.format(snakemake.params.phenotype))
    sys.exit(0)
else:
    results = results.merge(_conditional_list, how='right', on=['gene_name', 'pheno'])

# (3) keep only genes below the significance threshold in any of the p-value columns
sig_genes = [results.gene[results[k] < snakemake.params.significance_cutoff].values for k in pvcols_score]
sig_genes = np.unique(np.concatenate(sig_genes))

if len(sig_genes) == 0:
    logging.info('No genes pass significance threshold.')
    with gzip.open(snakemake.output.results_tsv, 'wt') as outfile:
        outfile.write('# no hits below specified threshold ({}) for phenotype {}. \n'.format(snakemake.params.significance_cutoff, snakemake.params.phenotype))
    sys.exit(0)
results = results.loc[results.gene.isin(sig_genes)].copy()
genes = results.gene.tolist()

logging.info('About to evaluate variants in {} genes'.format(len(results.gene.unique())))

# conditional analysis:
# these genotypes will be used as covariates for the conditional tests
geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1))
geno_cond.update_individuals(covariatesloader.get_iids())

# this file contains the mapping of associations to SNPs to condition on
conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0)
conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno'])].drop_duplicates()
conditional_list = conditional_list.loc[conditional_list.gene_name.isin(results.gene_name)]

# restrict the genotype loader to SNPs that are both requested and available
geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids()))
logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids())))

# set up the null model (no kinship / random effect)
Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('NoK')
null_model = ScoretestNoK(Y, X)

logging.info('Phenotype: {}, Sample size: {}'.format(snakemake.params.phenotype, len(Y)))

def test_gene(gene):
    '''Run conditional score tests for a single gene.

    Tests the pLOF burden, the missense burden, and their element-wise
    maximum ("mrg") against the null model, conditioning on the common
    variants listed for this gene in ``conditional_list``.

    :param str gene: gene identifier of the form "<id>_<gene_name>"
    :return: dict with p-values, carrier counts and the SNPs conditioned on
    '''

    # conditional analysis
    # set up the function to load the variants to condition on

    def get_conditional(gene):
        # SNP ids mapped to this gene's name (second "_"-separated token)
        cond_snps_vid = conditional_list.loc[conditional_list.gene_name == gene.split('_')[1], 'snp_rsid'].unique().tolist()

        temp_genotypes, temp_vids = geno_cond.genotypes_by_id(cond_snps_vid, return_pos=False)
        # mean-center; missing genotypes then become 0 (the centered mean)
        temp_genotypes -= np.nanmean(temp_genotypes, axis=0)

        cond_snps = np.ma.masked_invalid(temp_genotypes).filled(0.)

        return cond_snps, temp_vids

    pval_dict = {}
    pval_dict['gene'] = gene

    def call(GV, G2, name):
        # calls the test, appends results to pval_dict
        if GV is None:
            pval_dict['pv_' + name] = np.nan
            pval_dict['nCarrier_' + name] = 0
        else:
            pv = null_model.pv_alt_model(GV, G2)
            if pv < 0.:
                # numerical issue in the default method; retry with saddle-point approximation
                logging.warning('negative value encountered in p-value computation for gene {}, p-value: {}, using saddle instead.'.format(gene, pv))
                pv = null_model.pv_alt_model(GV, G2, method='saddle')
            pval_dict['pv_' + name] = pv
            pval_dict['nCarrier_' + name] = int(GV.sum())

    # Load the protein lof burdens
    try:
        Glof = bloader_lof.genotypes_by_id(gene).astype(float)
    except KeyError:
        Glof = None

    # Load the missense burdens
    try:
        Gmiss = bloader_missense.genotypes_by_id(gene).astype(float)
    except KeyError:
        Gmiss = None

    G2, cond_vid = get_conditional(gene)

    call(Glof, G2, 'pLOF')
    call(Gmiss, G2, 'missense')

    if (Glof is None) or (Gmiss is None):
        # bugfix: original read call(None, 'mrg'), missing the G2 positional
        # argument -> TypeError whenever either burden was absent for a gene
        call(None, G2, 'mrg')
    else:
        # merged burden: carrier in either category
        G = np.maximum(Glof, Gmiss)
        call(G, G2, 'mrg')

    pval_dict['cond_snps'] = ','.join(cond_vid)

    return pval_dict


# run every gene through the conditional test and assemble one results table
stopwatch = Timer()
results = pd.DataFrame.from_dict(list(map(test_gene, genes))).set_index('gene')

elapsed = stopwatch.check()
logging.info('{} genes tested in {:.2f} minutes.'.format(len(results), elapsed / 60.))

# missing values (e.g. absent burdens) are written as '.'
results.to_csv(snakemake.output.results_tsv, sep='\t', index=True, na_rep='.')

5 changes: 2 additions & 3 deletions script/python/assoc_sclrt_kernels_deepripe_multiple.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,8 @@ def get_rbp(interval):
if temp_genotypes is None:
raise GotNone

nanmean = np.nanmean(temp_genotypes, axis=0)

ncarrier = np.nansum(np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes), axis=1) >= 1) # calculated differently here because alleles can be flipped!
nanmean = np.nanmean(temp_genotypes, axis=0)
ncarrier = np.nansum(np.where((nanmean / 2 > 0.5), abs(temp_genotypes - 2), temp_genotypes) > 0, axis=0)# calculated differently here because alleles can be flipped!

G1 = temp_genotypes - nanmean
G1 = np.ma.masked_invalid(G1).filled(0.)
Expand Down
Loading

0 comments on commit 5dd6a20

Please sign in to comment.