Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated functional enrichment (With genes now) #46

Merged
merged 3 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 21 additions & 21 deletions backend/src/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,32 +13,32 @@
from util.stopwatch import Stopwatch


def calc_proteins_pval(curr, alpha, in_pr, bg_proteins, num_in_prot):
def calc_genes_pval(curr, alpha, in_gene, bg_genes, num_in_genes):
# Lists are read as strings, evaluate to lists using JSON.
# alternative is using eval() which is slower
prot_list = curr.replace("'", '"')
prot_list = json.loads(prot_list)
gene_list = curr.replace("'", '"')
gene_list = json.loads(gene_list)

# get the protein length of term
num_term_prot = len(prot_list)
num_term_prot = len(gene_list)

# Get intersection of proteins
prots_term = list(set(prot_list) & in_pr)
num_inter = len(prots_term)
gene_term = list(set(gene_list) & in_gene)
num_inter = len(gene_term)

if num_inter == 0:
# Condition reduces runtime by 60%, needs to be tested carefully!
# Check if enriched terms with and without that condition are
# more or less the same
p_value = alpha
else:
p_value = hypergeo_testing(num_inter, bg_proteins, num_term_prot, num_in_prot)
p_value = hypergeo_testing(num_inter, bg_genes, num_term_prot, num_in_genes)

return (p_value, prots_term)
return (p_value, gene_term)


@lru_cache(maxsize=None)
def hypergeo_testing(intersec, total_proteins, term_proteins, in_proteins):
def hypergeo_testing(intersec, total_genes, term_genes, in_genes):
"""Perfoms hypergeometric testing and returns a p_value
Args:
intersec(int): number of intersections between input and
Expand All @@ -52,16 +52,16 @@ def hypergeo_testing(intersec, total_proteins, term_proteins, in_proteins):
mpmath.mp.dps = 64

# Do the relevant calculations for the hypergeometric testing
term_one = mpmath.binomial(term_proteins, intersec)
term_two = mpmath.binomial(total_proteins - term_proteins, in_proteins - intersec)
term_three = mpmath.binomial(total_proteins, in_proteins)
term_one = mpmath.binomial(term_genes, intersec)
term_two = mpmath.binomial(total_genes - term_genes, in_genes - intersec)
term_three = mpmath.binomial(total_genes, in_genes)

# Calculate final p_value
p_value = (term_one * term_two) / term_three
return float(p_value)


def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
def functional_enrichment(driver: neo4j.Driver, in_genes, species_id: Any):
"""inhouse functional enrichment - performs gene set enrichment analysis
for a given set of proteins. Calculates p-value and Benjamini-Hochberg FDR
for every functional term
Expand All @@ -76,9 +76,9 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
stopwatch = Stopwatch()

# Get number of all proteins in the organism (from Cypher)
bg_proteins = queries.get_number_of_proteins(driver, species_id)
num_in_prot = len(in_proteins)
prots = set(in_proteins)
bg_proteins = queries.get_number_of_genes(driver, species_id)
num_in_gene = len(in_genes)
genes = set(in_genes)
# pandas DataFrames for nodes and edges
csv.field_size_limit(sys.maxsize)

Expand All @@ -92,18 +92,18 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
alpha = 0.05

# calculate p_value for all terms
new_prots = []
new_genes = []
new_p = []
arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]]
arguments = [(value, alpha, genes, bg_proteins, num_in_gene) for value in df_terms["symbols"]]

with multiprocessing.Pool() as pool:
# Apply the function to each input value in parallel and collect the results
for a, b in pool.starmap(calc_proteins_pval, arguments):
for a, b in pool.starmap(calc_genes_pval, arguments):
new_p.append(a)
new_prots.append(b)
new_genes.append(b)

# Update Dataframe and sort by p_value (descending)
df_terms["proteins"] = new_prots
df_terms["genes"] = new_genes
df_terms["p_value"] = new_p
df_terms.sort_values(by="p_value", ascending=False, inplace=True)
df_terms = df_terms.reset_index(drop=True)
Expand Down
8 changes: 5 additions & 3 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ def files(path):
@app.route("/api/subgraph/enrichment", methods=["POST"])
def proteins_enrichment():
driver = database.get_driver()
proteins = request.form.get("proteins").split(",")
genes = request.form.get("genes").split(",")
species_id = int(request.form.get("species_id"))

# in-house functional enrichment
list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id)
list_enrichment = enrichment.functional_enrichment(driver, genes, species_id)

# STRING API functional enrichment
"""df_enrichment = stringdb.functional_enrichment(proteins, species_id)
Expand Down Expand Up @@ -106,7 +106,9 @@ def proteins_subgraph_api():

stopwatch.round("Neo4j")

nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id")
nodes = (
pd.DataFrame(proteins).rename(columns={"ENSEMBL_PROTEIN": "external_id"}).drop_duplicates(subset="external_id")
)

edges = pd.DataFrame({"source": source, "target": target, "score": score})
edges = edges.drop_duplicates(subset=["source", "target"])
Expand Down
Binary file modified backend/src/pathway_data/data/AllPathways_human.csv.gz
Binary file not shown.
Binary file modified backend/src/pathway_data/data/AllPathways_mouse.csv.gz
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
time cost: 1.7737935715251498 hours
time cost: 2.39641605105665 hours
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
time cost: 1.7013362871938282 hours
time cost: 2.5310714825656677 hours
Binary file modified backend/src/pathway_data/data/bader_human.csv.gz
Binary file not shown.
Binary file modified backend/src/pathway_data/data/bader_mouse.csv.gz
Binary file not shown.
38,248 changes: 17,377 additions & 20,871 deletions backend/src/pathway_data/data/human_all_pathways.gmt

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions backend/src/pathway_data/data/kegg_compounds.human.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ C00036 Oxaloacetate
C00068 Thiamin diphosphate
C00074 Phosphoenolpyruvate
C00084 Acetaldehyde
C00085 D-Fructose 6-phosphate
C00103 D-Glucose 1-phosphate
C00111 Glycerone phosphate
C00118 D-Glyceraldehyde 3-phosphate
Expand All @@ -15,15 +16,14 @@ C00197 3-Phospho-D-glycerate
C00221 beta-D-Glucose
C00236 3-Phospho-D-glyceroyl phosphate
C00267 alpha-D-Glucose
C00354 D-Fructose 1,6-bisphosphate
C00469 Ethanol
C00631 2-Phospho-D-glycerate
C00668 alpha-D-Glucose 6-phosphate
C01159 2,3-Bisphospho-D-glycerate
C01172 beta-D-Glucose 6-phosphate
C01451 Salicin
C05125 2-(alpha-Hydroxyethyl)thiamine diphosphate
C05345 beta-D-Fructose 6-phosphate
C05378 beta-D-Fructose 1,6-bisphosphate
C06186 Arbutin
C06187 Arbutin 6-phosphate
C06188 Salicin 6-phosphate
Expand Down Expand Up @@ -61,6 +61,7 @@ C01182 D-Ribulose 1,5-bisphosphate
C01218 6-Phospho-2-dehydro-D-gluconate
C01236 D-Glucono-1,5-lactone 6-phosphate
C01801 Deoxyribose
C02076 Sedoheptulose
C03752 2-Amino-2-deoxy-D-gluconate
C04442 2-Dehydro-3-deoxy-6-phospho-D-gluconate
C05382 Sedoheptulose 7-phosphate
Expand Down Expand Up @@ -118,6 +119,7 @@ C14899 3-Dehydro-L-gulonate 6-phosphate
C15930 L-Galactonate
C20680 2-Dehydro-3-deoxy-L-galactonate
C22337 D-Ribulose 1-phosphate
C22712 4-Deoxy-L-erythro-hex-4-enopyranuronate
C00095 D-Fructose
C00096 GDP-mannose
C00159 D-Mannose
Expand Down Expand Up @@ -167,7 +169,6 @@ C18096 D-Allulose 6-phosphate
C20781 2,4-Diketo-3-deoxy-L-fuconate
C20836 beta-L-Fucopyranose
C00052 UDP-alpha-D-galactose
C00085 D-Fructose 6-phosphate
C00089 Sucrose
C00124 D-Galactose
C00137 myo-Inositol
Expand Down Expand Up @@ -285,7 +286,6 @@ C01623 UDP-apiose
C01674 Chitobiose
C01788 CDP-abequose
C02199 UDP-L-rhamnose
C02336 beta-D-Fructose
C02352 1,4-beta-D-Xylan
C02474 Arabinan
C02713 N-Acetylmuramate
Expand Down Expand Up @@ -1798,7 +1798,6 @@ C00230 3,4-Dihydroxybenzoate
C00251 Chorismate
C00254 Prephenate
C00296 Quinate
C00354 D-Fructose 1,6-bisphosphate
C00493 Shikimate
C00587 3-Hydroxybenzoate
C00826 L-Arogenate
Expand Down Expand Up @@ -3175,6 +3174,7 @@ C15891 Endomorphin-2
C15995 Bombesin
C16511 L-Homocysteic acid
C16512 Palmitoylethanolamide
C18178 Resolvin D1
C11378 Ubiquinone-10
C16844 Hydroxyl radical
C21478 Erastin
Expand Down
10 changes: 5 additions & 5 deletions backend/src/pathway_data/data/kegg_compounds.mouse.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ C00036 Oxaloacetate
C00068 Thiamin diphosphate
C00074 Phosphoenolpyruvate
C00084 Acetaldehyde
C00085 D-Fructose 6-phosphate
C00103 D-Glucose 1-phosphate
C00111 Glycerone phosphate
C00118 D-Glyceraldehyde 3-phosphate
Expand All @@ -15,15 +16,14 @@ C00197 3-Phospho-D-glycerate
C00221 beta-D-Glucose
C00236 3-Phospho-D-glyceroyl phosphate
C00267 alpha-D-Glucose
C00354 D-Fructose 1,6-bisphosphate
C00469 Ethanol
C00631 2-Phospho-D-glycerate
C00668 alpha-D-Glucose 6-phosphate
C01159 2,3-Bisphospho-D-glycerate
C01172 beta-D-Glucose 6-phosphate
C01451 Salicin
C05125 2-(alpha-Hydroxyethyl)thiamine diphosphate
C05345 beta-D-Fructose 6-phosphate
C05378 beta-D-Fructose 1,6-bisphosphate
C06186 Arbutin
C06187 Arbutin 6-phosphate
C06188 Salicin 6-phosphate
Expand Down Expand Up @@ -61,6 +61,7 @@ C01182 D-Ribulose 1,5-bisphosphate
C01218 6-Phospho-2-dehydro-D-gluconate
C01236 D-Glucono-1,5-lactone 6-phosphate
C01801 Deoxyribose
C02076 Sedoheptulose
C03752 2-Amino-2-deoxy-D-gluconate
C04442 2-Dehydro-3-deoxy-6-phospho-D-gluconate
C05382 Sedoheptulose 7-phosphate
Expand Down Expand Up @@ -118,6 +119,7 @@ C14899 3-Dehydro-L-gulonate 6-phosphate
C15930 L-Galactonate
C20680 2-Dehydro-3-deoxy-L-galactonate
C22337 D-Ribulose 1-phosphate
C22712 4-Deoxy-L-erythro-hex-4-enopyranuronate
C00095 D-Fructose
C00096 GDP-mannose
C00159 D-Mannose
Expand Down Expand Up @@ -167,7 +169,6 @@ C18096 D-Allulose 6-phosphate
C20781 2,4-Diketo-3-deoxy-L-fuconate
C20836 beta-L-Fucopyranose
C00052 UDP-alpha-D-galactose
C00085 D-Fructose 6-phosphate
C00089 Sucrose
C00124 D-Galactose
C00137 myo-Inositol
Expand Down Expand Up @@ -285,7 +286,6 @@ C01623 UDP-apiose
C01674 Chitobiose
C01788 CDP-abequose
C02199 UDP-L-rhamnose
C02336 beta-D-Fructose
C02352 1,4-beta-D-Xylan
C02474 Arabinan
C02713 N-Acetylmuramate
Expand Down Expand Up @@ -1798,7 +1798,6 @@ C00230 3,4-Dihydroxybenzoate
C00251 Chorismate
C00254 Prephenate
C00296 Quinate
C00354 D-Fructose 1,6-bisphosphate
C00493 Shikimate
C00587 3-Hydroxybenzoate
C00826 L-Arogenate
Expand Down Expand Up @@ -3175,6 +3174,7 @@ C15891 Endomorphin-2
C15995 Bombesin
C16511 L-Homocysteic acid
C16512 Palmitoylethanolamide
C18178 Resolvin D1
C11378 Ubiquinone-10
C16844 Hydroxyl radical
C21478 Erastin
Expand Down
Loading
Loading