Skip to content

Commit

Permalink
NN-380 Fixed enrichment and linted
Browse files Browse the repository at this point in the history
  • Loading branch information
Maluuck committed Nov 8, 2023
1 parent 05dba8d commit 8c6eb0e
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 41 deletions.
42 changes: 21 additions & 21 deletions backend/src/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,32 +13,32 @@
from util.stopwatch import Stopwatch


def calc_proteins_pval(curr, alpha, in_pr, bg_proteins, num_in_prot):
def calc_genes_pval(curr, alpha, in_gene, bg_genes, num_in_genes):
# Lists are read as strings, evaluate to lists using JSON.
# alternative is using eval() which is slower
prot_list = curr.replace("'", '"')
prot_list = json.loads(prot_list)
gene_list = curr.replace("'", '"')
gene_list = json.loads(gene_list)

# get the protein length of term
num_term_prot = len(prot_list)
num_term_prot = len(gene_list)

# Get intersection of proteins
prots_term = list(set(prot_list) & in_pr)
num_inter = len(prots_term)
gene_term = list(set(gene_list) & in_gene)
num_inter = len(gene_term)

if num_inter == 0:
# Condition reduces runtime by 60%, needs to be tested carefully!
# Check if enriched terms with and without that condition are
# more or less the same
p_value = alpha
else:
p_value = hypergeo_testing(num_inter, bg_proteins, num_term_prot, num_in_prot)
p_value = hypergeo_testing(num_inter, bg_genes, num_term_prot, num_in_genes)

return (p_value, prots_term)
return (p_value, gene_term)


@lru_cache(maxsize=None)
def hypergeo_testing(intersec, total_proteins, term_proteins, in_proteins):
def hypergeo_testing(intersec, total_genes, term_genes, in_genes):
"""Perfoms hypergeometric testing and returns a p_value
Args:
intersec(int): number of intersections between input and
Expand All @@ -52,16 +52,16 @@ def hypergeo_testing(intersec, total_proteins, term_proteins, in_proteins):
mpmath.mp.dps = 64

# Do the relevant calculations for the hypergeometric testing
term_one = mpmath.binomial(term_proteins, intersec)
term_two = mpmath.binomial(total_proteins - term_proteins, in_proteins - intersec)
term_three = mpmath.binomial(total_proteins, in_proteins)
term_one = mpmath.binomial(term_genes, intersec)
term_two = mpmath.binomial(total_genes - term_genes, in_genes - intersec)
term_three = mpmath.binomial(total_genes, in_genes)

# Calculate final p_value
p_value = (term_one * term_two) / term_three
return float(p_value)


def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
def functional_enrichment(driver: neo4j.Driver, in_genes, species_id: Any):
"""inhouse functional enrichment - performs gene set enrichment analysis
for a given set of proteins. Calculates p-value and Benjamini-Hochberg FDR
for every functional term
Expand All @@ -76,9 +76,9 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
stopwatch = Stopwatch()

# Get number of all proteins in the organism (from Cypher)
bg_proteins = queries.get_number_of_proteins(driver, species_id)
num_in_prot = len(in_proteins)
prots = set(in_proteins)
bg_proteins = queries.get_number_of_genes(driver, species_id)
num_in_gene = len(in_genes)
genes = set(in_genes)
# pandas DataFrames for nodes and edges
csv.field_size_limit(sys.maxsize)

Expand All @@ -92,18 +92,18 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
alpha = 0.05

# calculate p_value for all terms
new_prots = []
new_genes = []
new_p = []
arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]]
arguments = [(value, alpha, genes, bg_proteins, num_in_gene) for value in df_terms["symbols"]]

with multiprocessing.Pool() as pool:
# Apply the function to each input value in parallel and collect the results
for a, b in pool.starmap(calc_proteins_pval, arguments):
for a, b in pool.starmap(calc_genes_pval, arguments):
new_p.append(a)
new_prots.append(b)
new_genes.append(b)

# Update Dataframe and sort by p_value (descending)
df_terms["proteins"] = new_prots
df_terms["genes"] = new_genes
df_terms["p_value"] = new_p
df_terms.sort_values(by="p_value", ascending=False, inplace=True)
df_terms = df_terms.reset_index(drop=True)
Expand Down
8 changes: 5 additions & 3 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ def files(path):
@app.route("/api/subgraph/enrichment", methods=["POST"])
def proteins_enrichment():
driver = database.get_driver()
proteins = request.form.get("proteins").split(",")
genes = request.form.get("genes").split(",")
species_id = int(request.form.get("species_id"))

# in-house functional enrichment
list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id)
list_enrichment = enrichment.functional_enrichment(driver, genes, species_id)

# STRING API functional enrichment
"""df_enrichment = stringdb.functional_enrichment(proteins, species_id)
Expand Down Expand Up @@ -106,7 +106,9 @@ def proteins_subgraph_api():

stopwatch.round("Neo4j")

nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id")
nodes = (
pd.DataFrame(proteins).rename(columns={"ENSEMBL_PROTEIN": "external_id"}).drop_duplicates(subset="external_id")
)

edges = pd.DataFrame({"source": source, "target": target, "score": score})
edges = edges.drop_duplicates(subset=["source", "target"])
Expand Down
1 change: 0 additions & 1 deletion backend/src/pathway_data/kegg.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,4 +224,3 @@ def scrapping(path, species):
diseases_file.close()
drugs_file.close()
compounds_file.close()

2 changes: 1 addition & 1 deletion backend/src/pathway_data/pathway_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def read_kegg_data(specifier):
kegg_df = pd.read_csv(
f"data/kegg_pathways.{specifier}.tsv",
delimiter="\t",
usecols=["id", "name", "symbols","genes_external_ids"],
usecols=["id", "name", "symbols", "genes_external_ids"],
)
kegg_df.insert(loc=2, column="category", value="KEGG")
kegg_df = kegg_df.rename(columns={"genes_external_ids": "genes"})
Expand Down
28 changes: 14 additions & 14 deletions backend/src/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id
query = f"""
MATCH (protein:Protein:{species})
WHERE protein.SYMBOL IN {str([n.title() for n in names])}
OR protein.ENSEMBL IN {str([n.title() for n in names])}
RETURN protein, protein.ENSEMBL AS id
OR protein.ENSEMBL_PROTEIN IN {str([n.title() for n in names])}
RETURN protein, protein.ENSEMBL_PROTEIN AS id
"""
with driver.session() as session:
result = session.run(query)
Expand All @@ -60,8 +60,8 @@ def get_protein_neighbours(
# unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
query = f"""
MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
WHERE source.ENSEMBL IN {protein_ids}
AND target.ENSEMBL IN {protein_ids}
WHERE source.ENSEMBL_PROTEIN IN {protein_ids}
AND target.ENSEMBL_PROTEIN IN {protein_ids}
AND association.combined >= {threshold}
RETURN source, target, association.combined AS score
"""
Expand All @@ -85,8 +85,8 @@ def get_protein_associations(
# unsafe parameters are needed because otherwise this query takes 10s with neo4j for unknown reasons
query = f"""
MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
WHERE source.ENSEMBL IN {protein_ids}
AND target.ENSEMBL IN {protein_ids}
WHERE source.ENSEMBL_PROTEIN IN {protein_ids}
AND target.ENSEMBL_PROTEIN IN {protein_ids}
AND association.Score >= {threshold}
RETURN source, target, association.Score AS score
"""
Expand All @@ -103,28 +103,28 @@ def get_enrichment_terms(driver: neo4j.Driver, species_id: int) -> list[dict[str

query = f"""
MATCH (term:FT:{species})
RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Proteins AS proteins
RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Symbols AS symbols
"""

with driver.session() as session:
result = session.run(query)
return result.data()


def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int:
def get_number_of_genes(driver: neo4j.Driver, species_id: int) -> int:
if species_id == 10090:
species = "Mus_Musculus"
elif species_id == 9606:
species = "Homo_Sapiens"

query = f"""
MATCH (n:Protein:{species})
RETURN count(n) AS num_proteins
MATCH (n:TG:{species})
RETURN count(n) AS num_genes
"""
with driver.session() as session:
result = session.run(query)
num_proteins = result.single(strict=True)["num_proteins"]
return int(num_proteins)
num_genes = result.single(strict=True)["num_genes"]
return int(num_genes)


def _convert_to_protein_id(result: neo4j.Result) -> (list, list[str]):
Expand All @@ -144,8 +144,8 @@ def _convert_to_connection_info_score(
nodes.append(row["source"])
nodes.append(row["target"])
if protein:
source.append(row["source"].get("ENSEMBL"))
target.append(row["target"].get("ENSEMBL"))
source.append(row["source"].get("ENSEMBL_PROTEIN"))
target.append(row["target"].get("ENSEMBL_PROTEIN"))
else:
source.append(row["source"].get("Term"))
target.append(row["target"].get("Term"))
Expand Down
3 changes: 2 additions & 1 deletion frontend/src/components/enrichment/EnrichmentTool.vue
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@
formData.append('proteins', com.gephi_data.nodes.map(node => node.id))
formData.append('genes', com.gephi_data.nodes.map(node => node.attributes["Name"]))
formData.append('species_id', com.gephi_data.nodes[0].species)
Expand Down Expand Up @@ -420,4 +421,4 @@
display: inline-flex;
}
</style>
</style>

0 comments on commit 8c6eb0e

Please sign in to comment.