Skip to content

Commit

Permalink
Merge pull request #46 from BackofenLab/workMalek
Browse files Browse the repository at this point in the history
Updated functional enrichment (With genes now)
  • Loading branch information
Maluuck authored Nov 8, 2023
2 parents d5e8ae6 + 2722b3d commit 0f923a3
Show file tree
Hide file tree
Showing 24 changed files with 79,066 additions and 82,753 deletions.
42 changes: 21 additions & 21 deletions backend/src/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,32 +13,32 @@
from util.stopwatch import Stopwatch


def calc_proteins_pval(curr, alpha, in_pr, bg_proteins, num_in_prot):
def calc_genes_pval(curr, alpha, in_gene, bg_genes, num_in_genes):
# Lists are read as strings, evaluate to lists using JSON.
# alternative is using eval() which is slower
prot_list = curr.replace("'", '"')
prot_list = json.loads(prot_list)
gene_list = curr.replace("'", '"')
gene_list = json.loads(gene_list)

# get the protein length of term
num_term_prot = len(prot_list)
num_term_prot = len(gene_list)

# Get intersection of proteins
prots_term = list(set(prot_list) & in_pr)
num_inter = len(prots_term)
gene_term = list(set(gene_list) & in_gene)
num_inter = len(gene_term)

if num_inter == 0:
# Condition reduces runtime by 60%, needs to be tested carefully!
# Check if enriched terms with and without that condition are
# more or less the same
p_value = alpha
else:
p_value = hypergeo_testing(num_inter, bg_proteins, num_term_prot, num_in_prot)
p_value = hypergeo_testing(num_inter, bg_genes, num_term_prot, num_in_genes)

return (p_value, prots_term)
return (p_value, gene_term)


@lru_cache(maxsize=None)
def hypergeo_testing(intersec, total_proteins, term_proteins, in_proteins):
def hypergeo_testing(intersec, total_genes, term_genes, in_genes):
"""Perfoms hypergeometric testing and returns a p_value
Args:
intersec(int): number of intersections between input and
Expand All @@ -52,16 +52,16 @@ def hypergeo_testing(intersec, total_proteins, term_proteins, in_proteins):
mpmath.mp.dps = 64

# Do the relevant calculations for the hypergeometric testing
term_one = mpmath.binomial(term_proteins, intersec)
term_two = mpmath.binomial(total_proteins - term_proteins, in_proteins - intersec)
term_three = mpmath.binomial(total_proteins, in_proteins)
term_one = mpmath.binomial(term_genes, intersec)
term_two = mpmath.binomial(total_genes - term_genes, in_genes - intersec)
term_three = mpmath.binomial(total_genes, in_genes)

# Calculate final p_value
p_value = (term_one * term_two) / term_three
return float(p_value)


def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
def functional_enrichment(driver: neo4j.Driver, in_genes, species_id: Any):
"""inhouse functional enrichment - performs gene set enrichment analysis
for a given set of proteins. Calculates p-value and Benjamini-Hochberg FDR
for every functional term
Expand All @@ -76,9 +76,9 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
stopwatch = Stopwatch()

# Get number of all proteins in the organism (from Cypher)
bg_proteins = queries.get_number_of_proteins(driver, species_id)
num_in_prot = len(in_proteins)
prots = set(in_proteins)
bg_proteins = queries.get_number_of_genes(driver, species_id)
num_in_gene = len(in_genes)
genes = set(in_genes)
# pandas DataFrames for nodes and edges
csv.field_size_limit(sys.maxsize)

Expand All @@ -92,18 +92,18 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
alpha = 0.05

# calculate p_value for all terms
new_prots = []
new_genes = []
new_p = []
arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]]
arguments = [(value, alpha, genes, bg_proteins, num_in_gene) for value in df_terms["symbols"]]

with multiprocessing.Pool() as pool:
# Apply the function to each input value in parallel and collect the results
for a, b in pool.starmap(calc_proteins_pval, arguments):
for a, b in pool.starmap(calc_genes_pval, arguments):
new_p.append(a)
new_prots.append(b)
new_genes.append(b)

# Update Dataframe and sort by p_value (descending)
df_terms["proteins"] = new_prots
df_terms["genes"] = new_genes
df_terms["p_value"] = new_p
df_terms.sort_values(by="p_value", ascending=False, inplace=True)
df_terms = df_terms.reset_index(drop=True)
Expand Down
8 changes: 5 additions & 3 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ def files(path):
@app.route("/api/subgraph/enrichment", methods=["POST"])
def proteins_enrichment():
driver = database.get_driver()
proteins = request.form.get("proteins").split(",")
genes = request.form.get("genes").split(",")
species_id = int(request.form.get("species_id"))

# in-house functional enrichment
list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id)
list_enrichment = enrichment.functional_enrichment(driver, genes, species_id)

# STRING API functional enrichment
"""df_enrichment = stringdb.functional_enrichment(proteins, species_id)
Expand Down Expand Up @@ -106,7 +106,9 @@ def proteins_subgraph_api():

stopwatch.round("Neo4j")

nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id")
nodes = (
pd.DataFrame(proteins).rename(columns={"ENSEMBL_PROTEIN": "external_id"}).drop_duplicates(subset="external_id")
)

edges = pd.DataFrame({"source": source, "target": target, "score": score})
edges = edges.drop_duplicates(subset=["source", "target"])
Expand Down
Binary file modified backend/src/pathway_data/data/AllPathways_human.csv.gz
Binary file not shown.
Binary file modified backend/src/pathway_data/data/AllPathways_mouse.csv.gz
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
time cost: 1.7737935715251498 hours
time cost: 2.39641605105665 hours
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
time cost: 1.7013362871938282 hours
time cost: 2.5310714825656677 hours
Binary file modified backend/src/pathway_data/data/bader_human.csv.gz
Binary file not shown.
Binary file modified backend/src/pathway_data/data/bader_mouse.csv.gz
Binary file not shown.
38,248 changes: 17,377 additions & 20,871 deletions backend/src/pathway_data/data/human_all_pathways.gmt

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions backend/src/pathway_data/data/kegg_compounds.human.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ C00036 Oxaloacetate
C00068 Thiamin diphosphate
C00074 Phosphoenolpyruvate
C00084 Acetaldehyde
C00085 D-Fructose 6-phosphate
C00103 D-Glucose 1-phosphate
C00111 Glycerone phosphate
C00118 D-Glyceraldehyde 3-phosphate
Expand All @@ -15,15 +16,14 @@ C00197 3-Phospho-D-glycerate
C00221 beta-D-Glucose
C00236 3-Phospho-D-glyceroyl phosphate
C00267 alpha-D-Glucose
C00354 D-Fructose 1,6-bisphosphate
C00469 Ethanol
C00631 2-Phospho-D-glycerate
C00668 alpha-D-Glucose 6-phosphate
C01159 2,3-Bisphospho-D-glycerate
C01172 beta-D-Glucose 6-phosphate
C01451 Salicin
C05125 2-(alpha-Hydroxyethyl)thiamine diphosphate
C05345 beta-D-Fructose 6-phosphate
C05378 beta-D-Fructose 1,6-bisphosphate
C06186 Arbutin
C06187 Arbutin 6-phosphate
C06188 Salicin 6-phosphate
Expand Down Expand Up @@ -61,6 +61,7 @@ C01182 D-Ribulose 1,5-bisphosphate
C01218 6-Phospho-2-dehydro-D-gluconate
C01236 D-Glucono-1,5-lactone 6-phosphate
C01801 Deoxyribose
C02076 Sedoheptulose
C03752 2-Amino-2-deoxy-D-gluconate
C04442 2-Dehydro-3-deoxy-6-phospho-D-gluconate
C05382 Sedoheptulose 7-phosphate
Expand Down Expand Up @@ -118,6 +119,7 @@ C14899 3-Dehydro-L-gulonate 6-phosphate
C15930 L-Galactonate
C20680 2-Dehydro-3-deoxy-L-galactonate
C22337 D-Ribulose 1-phosphate
C22712 4-Deoxy-L-erythro-hex-4-enopyranuronate
C00095 D-Fructose
C00096 GDP-mannose
C00159 D-Mannose
Expand Down Expand Up @@ -167,7 +169,6 @@ C18096 D-Allulose 6-phosphate
C20781 2,4-Diketo-3-deoxy-L-fuconate
C20836 beta-L-Fucopyranose
C00052 UDP-alpha-D-galactose
C00085 D-Fructose 6-phosphate
C00089 Sucrose
C00124 D-Galactose
C00137 myo-Inositol
Expand Down Expand Up @@ -285,7 +286,6 @@ C01623 UDP-apiose
C01674 Chitobiose
C01788 CDP-abequose
C02199 UDP-L-rhamnose
C02336 beta-D-Fructose
C02352 1,4-beta-D-Xylan
C02474 Arabinan
C02713 N-Acetylmuramate
Expand Down Expand Up @@ -1798,7 +1798,6 @@ C00230 3,4-Dihydroxybenzoate
C00251 Chorismate
C00254 Prephenate
C00296 Quinate
C00354 D-Fructose 1,6-bisphosphate
C00493 Shikimate
C00587 3-Hydroxybenzoate
C00826 L-Arogenate
Expand Down Expand Up @@ -3175,6 +3174,7 @@ C15891 Endomorphin-2
C15995 Bombesin
C16511 L-Homocysteic acid
C16512 Palmitoylethanolamide
C18178 Resolvin D1
C11378 Ubiquinone-10
C16844 Hydroxyl radical
C21478 Erastin
Expand Down
10 changes: 5 additions & 5 deletions backend/src/pathway_data/data/kegg_compounds.mouse.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ C00036 Oxaloacetate
C00068 Thiamin diphosphate
C00074 Phosphoenolpyruvate
C00084 Acetaldehyde
C00085 D-Fructose 6-phosphate
C00103 D-Glucose 1-phosphate
C00111 Glycerone phosphate
C00118 D-Glyceraldehyde 3-phosphate
Expand All @@ -15,15 +16,14 @@ C00197 3-Phospho-D-glycerate
C00221 beta-D-Glucose
C00236 3-Phospho-D-glyceroyl phosphate
C00267 alpha-D-Glucose
C00354 D-Fructose 1,6-bisphosphate
C00469 Ethanol
C00631 2-Phospho-D-glycerate
C00668 alpha-D-Glucose 6-phosphate
C01159 2,3-Bisphospho-D-glycerate
C01172 beta-D-Glucose 6-phosphate
C01451 Salicin
C05125 2-(alpha-Hydroxyethyl)thiamine diphosphate
C05345 beta-D-Fructose 6-phosphate
C05378 beta-D-Fructose 1,6-bisphosphate
C06186 Arbutin
C06187 Arbutin 6-phosphate
C06188 Salicin 6-phosphate
Expand Down Expand Up @@ -61,6 +61,7 @@ C01182 D-Ribulose 1,5-bisphosphate
C01218 6-Phospho-2-dehydro-D-gluconate
C01236 D-Glucono-1,5-lactone 6-phosphate
C01801 Deoxyribose
C02076 Sedoheptulose
C03752 2-Amino-2-deoxy-D-gluconate
C04442 2-Dehydro-3-deoxy-6-phospho-D-gluconate
C05382 Sedoheptulose 7-phosphate
Expand Down Expand Up @@ -118,6 +119,7 @@ C14899 3-Dehydro-L-gulonate 6-phosphate
C15930 L-Galactonate
C20680 2-Dehydro-3-deoxy-L-galactonate
C22337 D-Ribulose 1-phosphate
C22712 4-Deoxy-L-erythro-hex-4-enopyranuronate
C00095 D-Fructose
C00096 GDP-mannose
C00159 D-Mannose
Expand Down Expand Up @@ -167,7 +169,6 @@ C18096 D-Allulose 6-phosphate
C20781 2,4-Diketo-3-deoxy-L-fuconate
C20836 beta-L-Fucopyranose
C00052 UDP-alpha-D-galactose
C00085 D-Fructose 6-phosphate
C00089 Sucrose
C00124 D-Galactose
C00137 myo-Inositol
Expand Down Expand Up @@ -285,7 +286,6 @@ C01623 UDP-apiose
C01674 Chitobiose
C01788 CDP-abequose
C02199 UDP-L-rhamnose
C02336 beta-D-Fructose
C02352 1,4-beta-D-Xylan
C02474 Arabinan
C02713 N-Acetylmuramate
Expand Down Expand Up @@ -1798,7 +1798,6 @@ C00230 3,4-Dihydroxybenzoate
C00251 Chorismate
C00254 Prephenate
C00296 Quinate
C00354 D-Fructose 1,6-bisphosphate
C00493 Shikimate
C00587 3-Hydroxybenzoate
C00826 L-Arogenate
Expand Down Expand Up @@ -3175,6 +3174,7 @@ C15891 Endomorphin-2
C15995 Bombesin
C16511 L-Homocysteic acid
C16512 Palmitoylethanolamide
C18178 Resolvin D1
C11378 Ubiquinone-10
C16844 Hydroxyl radical
C21478 Erastin
Expand Down
Loading

0 comments on commit 0f923a3

Please sign in to comment.