NN-380 Fixed enrichment and linted

BackofenLab · Nov 8, 2023 · 8c6eb0e · 8c6eb0e
1 parent 05dba8d
commit 8c6eb0e
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 41 deletions.
diff --git a/backend/src/enrichment.py b/backend/src/enrichment.py
@@ -13,32 +13,32 @@
 from util.stopwatch import Stopwatch
 
 
-def calc_proteins_pval(curr, alpha, in_pr, bg_proteins, num_in_prot):
+def calc_genes_pval(curr, alpha, in_gene, bg_genes, num_in_genes):
     # Lists are read as strings, evaluate to lists using JSON.
     # alternative is using eval() which is slower
-    prot_list = curr.replace("'", '"')
-    prot_list = json.loads(prot_list)
+    gene_list = curr.replace("'", '"')
+    gene_list = json.loads(gene_list)
 
     # get the protein length of term
-    num_term_prot = len(prot_list)
+    num_term_prot = len(gene_list)
 
     # Get intersection of proteins
-    prots_term = list(set(prot_list) & in_pr)
-    num_inter = len(prots_term)
+    gene_term = list(set(gene_list) & in_gene)
+    num_inter = len(gene_term)
 
     if num_inter == 0:
         # Condition reduces runtime by 60%, needs to be tested carefully!
         # Check if enriched terms with and without that condition are
         # more or less the same
         p_value = alpha
     else:
-        p_value = hypergeo_testing(num_inter, bg_proteins, num_term_prot, num_in_prot)
+        p_value = hypergeo_testing(num_inter, bg_genes, num_term_prot, num_in_genes)
 
-    return (p_value, prots_term)
+    return (p_value, gene_term)
 
 
 @lru_cache(maxsize=None)
-def hypergeo_testing(intersec, total_proteins, term_proteins, in_proteins):
+def hypergeo_testing(intersec, total_genes, term_genes, in_genes):
     """Perfoms hypergeometric testing and returns a p_value
     Args:
         intersec(int): number of intersections between input and
@@ -52,16 +52,16 @@ def hypergeo_testing(intersec, total_proteins, term_proteins, in_proteins):
     mpmath.mp.dps = 64
 
     # Do the relevant calculations for the hypergeometric testing
-    term_one = mpmath.binomial(term_proteins, intersec)
-    term_two = mpmath.binomial(total_proteins - term_proteins, in_proteins - intersec)
-    term_three = mpmath.binomial(total_proteins, in_proteins)
+    term_one = mpmath.binomial(term_genes, intersec)
+    term_two = mpmath.binomial(total_genes - term_genes, in_genes - intersec)
+    term_three = mpmath.binomial(total_genes, in_genes)
 
     # Calculate final p_value
     p_value = (term_one * term_two) / term_three
     return float(p_value)
 
 
-def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
+def functional_enrichment(driver: neo4j.Driver, in_genes, species_id: Any):
     """inhouse functional enrichment - performs gene set enrichment analysis
     for a given set of proteins. Calculates p-value and Benjamini-Hochberg FDR
     for every functional term
@@ -76,9 +76,9 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
     stopwatch = Stopwatch()
 
     # Get number of all proteins in the organism (from Cypher)
-    bg_proteins = queries.get_number_of_proteins(driver, species_id)
-    num_in_prot = len(in_proteins)
-    prots = set(in_proteins)
+    bg_proteins = queries.get_number_of_genes(driver, species_id)
+    num_in_gene = len(in_genes)
+    genes = set(in_genes)
     # pandas DataFrames for nodes and edges
     csv.field_size_limit(sys.maxsize)
 
@@ -92,18 +92,18 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
     alpha = 0.05
 
     # calculate p_value for all terms
-    new_prots = []
+    new_genes = []
     new_p = []
-    arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]]
+    arguments = [(value, alpha, genes, bg_proteins, num_in_gene) for value in df_terms["symbols"]]
 
     with multiprocessing.Pool() as pool:
         # Apply the function to each input value in parallel and collect the results
-        for a, b in pool.starmap(calc_proteins_pval, arguments):
+        for a, b in pool.starmap(calc_genes_pval, arguments):
             new_p.append(a)
-            new_prots.append(b)
+            new_genes.append(b)
 
     # Update Dataframe and sort by p_value (descending)
-    df_terms["proteins"] = new_prots
+    df_terms["genes"] = new_genes
     df_terms["p_value"] = new_p
     df_terms.sort_values(by="p_value", ascending=False, inplace=True)
     df_terms = df_terms.reset_index(drop=True)

diff --git a/backend/src/main.py b/backend/src/main.py
@@ -50,11 +50,11 @@ def files(path):
 @app.route("/api/subgraph/enrichment", methods=["POST"])
 def proteins_enrichment():
     driver = database.get_driver()
-    proteins = request.form.get("proteins").split(",")
+    genes = request.form.get("genes").split(",")
     species_id = int(request.form.get("species_id"))
 
     # in-house functional enrichment
-    list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id)
+    list_enrichment = enrichment.functional_enrichment(driver, genes, species_id)
 
     # STRING API functional enrichment
     """df_enrichment = stringdb.functional_enrichment(proteins, species_id)
@@ -106,7 +106,9 @@ def proteins_subgraph_api():
 
     stopwatch.round("Neo4j")
 
-    nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id")
+    nodes = (
+        pd.DataFrame(proteins).rename(columns={"ENSEMBL_PROTEIN": "external_id"}).drop_duplicates(subset="external_id")
+    )
 
     edges = pd.DataFrame({"source": source, "target": target, "score": score})
     edges = edges.drop_duplicates(subset=["source", "target"])

diff --git a/backend/src/pathway_data/kegg.py b/backend/src/pathway_data/kegg.py
@@ -224,4 +224,3 @@ def scrapping(path, species):
     diseases_file.close()
     drugs_file.close()
     compounds_file.close()
-
diff --git a/backend/src/pathway_data/pathway_data.py b/backend/src/pathway_data/pathway_data.py
@@ -154,7 +154,7 @@ def read_kegg_data(specifier):
     kegg_df = pd.read_csv(
         f"data/kegg_pathways.{specifier}.tsv",
         delimiter="\t",
-        usecols=["id", "name", "symbols","genes_external_ids"],
+        usecols=["id", "name", "symbols", "genes_external_ids"],
     )
     kegg_df.insert(loc=2, column="category", value="KEGG")
     kegg_df = kegg_df.rename(columns={"genes_external_ids": "genes"})

diff --git a/backend/src/queries.py b/backend/src/queries.py
@@ -38,8 +38,8 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id
     query = f"""
         MATCH (protein:Protein:{species})
         WHERE protein.SYMBOL IN {str([n.title() for n in names])} 
-            OR protein.ENSEMBL IN {str([n.title() for n in names])} 
-        RETURN protein, protein.ENSEMBL AS id
+            OR protein.ENSEMBL_PROTEIN IN {str([n.title() for n in names])} 
+        RETURN protein, protein.ENSEMBL_PROTEIN AS id
     """
     with driver.session() as session:
         result = session.run(query)
@@ -60,8 +60,8 @@ def get_protein_neighbours(
     # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
     query = f"""
         MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
-        WHERE source.ENSEMBL IN {protein_ids}
-            AND target.ENSEMBL IN {protein_ids}
+        WHERE source.ENSEMBL_PROTEIN IN {protein_ids}
+            AND target.ENSEMBL_PROTEIN IN {protein_ids}
             AND association.combined >= {threshold}
         RETURN source, target, association.combined AS score
     """
@@ -85,8 +85,8 @@ def get_protein_associations(
     # unsafe parameters are needed because otherwise this query takes 10s with neo4j for unknown reasons
     query = f"""
         MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
-        WHERE source.ENSEMBL IN {protein_ids}
-            AND target.ENSEMBL IN {protein_ids}
+        WHERE source.ENSEMBL_PROTEIN IN {protein_ids}
+            AND target.ENSEMBL_PROTEIN IN {protein_ids}
             AND association.Score >= {threshold}
         RETURN source, target, association.Score AS score
     """
@@ -103,28 +103,28 @@ def get_enrichment_terms(driver: neo4j.Driver, species_id: int) -> list[dict[str
 
     query = f"""
         MATCH (term:FT:{species})
-        RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Proteins AS proteins
+        RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Symbols AS symbols
     """
 
     with driver.session() as session:
         result = session.run(query)
         return result.data()
 
 
-def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int:
+def get_number_of_genes(driver: neo4j.Driver, species_id: int) -> int:
     if species_id == 10090:
         species = "Mus_Musculus"
     elif species_id == 9606:
         species = "Homo_Sapiens"
 
     query = f"""
-        MATCH (n:Protein:{species})
-        RETURN count(n) AS num_proteins
+        MATCH (n:TG:{species})
+        RETURN count(n) AS num_genes
     """
     with driver.session() as session:
         result = session.run(query)
-        num_proteins = result.single(strict=True)["num_proteins"]
-        return int(num_proteins)
+        num_genes = result.single(strict=True)["num_genes"]
+        return int(num_genes)
 
 
 def _convert_to_protein_id(result: neo4j.Result) -> (list, list[str]):
@@ -144,8 +144,8 @@ def _convert_to_connection_info_score(
         nodes.append(row["source"])
         nodes.append(row["target"])
         if protein:
-            source.append(row["source"].get("ENSEMBL"))
-            target.append(row["target"].get("ENSEMBL"))
+            source.append(row["source"].get("ENSEMBL_PROTEIN"))
+            target.append(row["target"].get("ENSEMBL_PROTEIN"))
         else:
             source.append(row["source"].get("Term"))
             target.append(row["target"].get("Term"))

diff --git a/frontend/src/components/enrichment/EnrichmentTool.vue b/frontend/src/components/enrichment/EnrichmentTool.vue
@@ -268,6 +268,7 @@
 
 
             formData.append('proteins', com.gephi_data.nodes.map(node => node.id))
+            formData.append('genes', com.gephi_data.nodes.map(node => node.attributes["Name"]))
             formData.append('species_id', com.gephi_data.nodes[0].species)
                 
 
@@ -420,4 +421,4 @@
         display: inline-flex;
     }
 
-</style>
+</style>
Original file line number	Diff line number	Diff line change
Expand Up		@@ -224,4 +224,3 @@ def scrapping(path, species):
		diseases_file.close()
		drugs_file.close()
		compounds_file.close()