From 07313d013393ff0abfd478cbb1c0f120472ab529 Mon Sep 17 00:00:00 2001 From: Vincent Kataikko Date: Mon, 11 Sep 2023 16:14:41 +0200 Subject: [PATCH 1/6] Rewriting of Frontend, Backend and queries to use new DB --- backend/src/enrichment.py | 11 +-- backend/src/enrichment_graph.py | 16 +-- backend/src/main.py | 19 ++-- backend/src/queries.py | 97 ++++++++++++------- .../components/enrichment/EnrichmentTool.vue | 1 + 5 files changed, 89 insertions(+), 55 deletions(-) diff --git a/backend/src/enrichment.py b/backend/src/enrichment.py index 204b4cea..8bd08eda 100644 --- a/backend/src/enrichment.py +++ b/backend/src/enrichment.py @@ -16,14 +16,12 @@ def calc_proteins_pval(curr, alpha, in_pr, bg_proteins, num_in_prot): # Lists are read as strings, evaluate to lists using JSON. # alternative is using eval() which is slower - prot_list = curr.replace("'", '"') - prot_list = json.loads(prot_list) # get the protein length of term - num_term_prot = len(prot_list) + num_term_prot = len(curr) # Get intersection of proteins - prots_term = list(set(prot_list) & in_pr) + prots_term = list(set(curr) & in_pr) num_inter = len(prots_term) if num_inter == 0: @@ -76,14 +74,14 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any): stopwatch = Stopwatch() # Get number of all proteins in the organism (from Cypher) - bg_proteins = queries.get_number_of_proteins(driver) + bg_proteins = queries.get_number_of_proteins(driver, species_id) num_in_prot = len(in_proteins) prots = set(in_proteins) # pandas DataFrames for nodes and edges csv.field_size_limit(sys.maxsize) # Read Terms and put into Dataframe - df_terms = pd.DataFrame(queries.get_enrichment_terms(driver)) + df_terms = pd.DataFrame(queries.get_enrichment_terms(driver, species_id)) tot_tests = len(df_terms) stopwatch.round("setup_enrichment") @@ -95,6 +93,7 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any): new_prots = [] new_p = [] arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]] + with multiprocessing.Pool() as pool: # Apply the function to each input value in parallel and collect the results for a, b in pool.starmap(calc_proteins_pval, arguments): diff --git a/backend/src/enrichment_graph.py b/backend/src/enrichment_graph.py index 398a521b..552e127c 100644 --- a/backend/src/enrichment_graph.py +++ b/backend/src/enrichment_graph.py @@ -14,7 +14,7 @@ _BACKEND_JAR_PATH = "../gephi/target/gephi.backend-1.0-SNAPSHOT.jar" -def get_functional_graph(list_enrichment): +def get_functional_graph(list_enrichment, species_id): stopwatch = Stopwatch() list_term = [] @@ -24,19 +24,19 @@ def get_functional_graph(list_enrichment): driver = database.get_driver() # Execute the query and retrieve the CSV data - terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term) + terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term, species_id) stopwatch.round("Neo4j") - nodes = pd.DataFrame(terms).drop_duplicates(subset="external_id") + nodes = pd.DataFrame(terms).rename(columns={"Term": "external_id"}).drop_duplicates(subset="external_id") nodesterm = pd.DataFrame(list_enrichment) - df2 = nodesterm.rename({"id": "external_id"}, axis=1) + df2 = nodesterm.rename(columns={"id": "external_id"}) merged = pd.merge(df2[["external_id", "fdr_rate", "p_value"]], nodes, on="external_id") # Add the two columns to df2 - nodes = merged + nodes = merged.drop_duplicates() nodes["fdr_rate"] = nodes["fdr_rate"].fillna(0) nodes["p_value"] = nodes["p_value"].fillna(0) @@ -100,9 +100,9 @@ def get_functional_graph(list_enrichment): node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id]) node["attributes"]["PageRank"] = str(pagerank[mapped_node_id]) node["attributes"]["Ensembl ID"] = df_node.external_id - node["attributes"]["Name"] = df_node.name - node["label"] = df_node.name # Comment this out if you want no node labels displayed - node["attributes"]["Category"] = df_node.category + node["attributes"]["Name"] = df_node.Name + node["label"] = df_node.Name # Comment this out if you want no node labels displayed + node["attributes"]["Category"] = df_node.Category node["attributes"]["FDR"] = df_node.fdr_rate node["attributes"]["P Value"] = df_node.p_value diff --git a/backend/src/main.py b/backend/src/main.py index 47e9fbfd..ab2d20c2 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -51,7 +51,7 @@ def files(path): def proteins_enrichment(): driver = database.get_driver() proteins = request.form.get("proteins").split(",") - species_id = request.form.get("species_id") + species_id = int(request.form.get("species_id")) # in-house functional enrichment list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id) @@ -100,14 +100,16 @@ def proteins_subgraph_api(): stopwatch.round("Setup") if len(protein_ids) > 1: - proteins, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold) + proteins, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold, species_id) else: - proteins, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold) + proteins, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold, species_id) stopwatch.round("Neo4j") - nodes = pd.DataFrame(proteins).drop_duplicates(subset="external_id") + # TODO: make better (Vincent) + nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id") + edges = pd.DataFrame({"source": source, "target": target, "score": score}) edges = edges.drop_duplicates(subset=["source", "target"]) @@ -167,14 +169,14 @@ def proteins_subgraph_api(): # Use node mapping to add corresponding values of betweenness and pagerank node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id]) node["attributes"]["PageRank"] = str(pagerank[mapped_node_id]) - node["attributes"]["Description"] = df_node.description + node["attributes"]["Description"] = df_node.annotation node["attributes"]["Ensembl ID"] = df_node.external_id - node["attributes"]["Name"] = df_node.name + node["attributes"]["Name"] = df_node.SYMBOL if not (request.files.get("file") is None): if selected_d != None: for column in selected_d: node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.name, column].item() - node["label"] = df_node.name + node["label"] = df_node.SYMBOL node["species"] = str(10090) # Identify subgraph nodes and update their attributes @@ -213,8 +215,9 @@ def terms_subgraph_api(): # Functional terms list_enrichment = ast.literal_eval(request.form.get("func-terms")) + species_id = int(request.form.get("species_id")) - json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment) + json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment, species_id=species_id) stopwatch.total("terms_subgraph_api") diff --git a/backend/src/queries.py b/backend/src/queries.py index 564cbffb..8d544990 100644 --- a/backend/src/queries.py +++ b/backend/src/queries.py @@ -7,29 +7,36 @@ import neo4j -def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str]): +def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], species_id: int): """:returns: terms, source, target, score""" + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + query = f""" - MATCH (source:Terms)-[association:OVERLAP]->(target:Terms) - WHERE source.external_id IN {term_ids} - AND target.external_id IN {term_ids} - AND source.category IN ["KEGG", "Reactome Pathways"] - AND target.category IN ["KEGG", "Reactome Pathways"] + MATCH (source:FT:{species})-[association:OVERLAP]->(target:FT:{species}) + WHERE source.Term IN {term_ids} + AND target.Term IN {term_ids} RETURN source, target, association.Score AS score; """ with driver.session() as session: result = session.run(query) # custom conversion is needed because otherwise it takes 10s with neo4j (for unknown reasons) - return _convert_to_connection_info_score(result=result, _int=False) + return _convert_to_connection_info_score(result=result, _int=False, protein=False) def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> list[str]: # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + query = f""" - MATCH (protein:Protein) - WHERE protein.species_id = {species_id} - AND protein.name IN {str([n.upper() for n in names])} - WITH collect(protein.external_id) AS ids + MATCH (protein:Protein:{species}) + WHERE protein.SYMBOL IN {str([n.title() for n in names])} + WITH collect(protein.ENSEMBL) AS ids RETURN ids """ with driver.session() as session: @@ -37,48 +44,63 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id def get_protein_neighbours( - driver: neo4j.Driver, protein_ids: list[str], threshold: int + driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int ) -> (list[str], list[str], list[str], list[int]): """ :returns: proteins, source_ids, target_ids, scores """ + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons query = f""" - MATCH (source:Protein)-[association:ASSOCIATION]->(target:Protein) - WHERE source.external_id IN {protein_ids} - AND target.external_id IN {protein_ids} + MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species}) + WHERE source.ENSEMBL IN {protein_ids} + AND target.ENSEMBL IN {protein_ids} AND association.combined >= {threshold} RETURN source, target, association.combined AS score """ with driver.session() as session: result = session.run(query).single(strict=True).value() - return _convert_to_connection_info_score(result=result, _int=True) + return _convert_to_connection_info_score(result=result, _int=True, protein=False) def get_protein_associations( - driver: neo4j.Driver, protein_ids: list[str], threshold: int + driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int ) -> (list[str], list[str], list[str], list[int]): """ :returns: proteins (nodes), source_ids, target_ids, score """ + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + # unsafe parameters are needed because otherwise this query takes 10s with neo4j for unknown reasons query = f""" - MATCH (source:Protein)-[association:ASSOCIATION]->(target:Protein) - WHERE source.external_id IN {protein_ids} - AND target.external_id IN {protein_ids} - AND association.combined >= {threshold} - RETURN source, target, association.combined AS score + MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species}) + WHERE source.ENSEMBL IN {protein_ids} + AND target.ENSEMBL IN {protein_ids} + AND association.Score >= {threshold} + RETURN source, target, association.Score AS score """ with driver.session() as session: result = session.run(query) - return _convert_to_connection_info_score(result=result, _int=True) + return _convert_to_connection_info_score(result=result, _int=True, protein=True) -def get_enrichment_terms(driver: neo4j.Driver) -> list[dict[str, Any]]: - query = """ - MATCH (term:Terms) - RETURN term.external_id AS id, term.name AS name, term.category AS category, term.proteins AS proteins +def get_enrichment_terms(driver: neo4j.Driver, species_id: int) -> list[dict[str, Any]]: + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + + query = f""" + MATCH (term:FT:{species}) + RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Proteins AS proteins """ with driver.session() as session: @@ -86,9 +108,14 @@ def get_enrichment_terms(driver: neo4j.Driver) -> list[dict[str, Any]]: return result.data() -def get_number_of_proteins(driver: neo4j.Driver) -> int: - query = """ - MATCH (n:Protein) +def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int: + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + + query = f""" + MATCH (n:Protein:{species}) RETURN count(n) AS num_proteins """ with driver.session() as session: @@ -97,14 +124,18 @@ def get_number_of_proteins(driver: neo4j.Driver) -> int: return int(num_proteins) -def _convert_to_connection_info_score(result: neo4j.Result, _int: bool) -> (list[str], list[str], list[str], list[int]): +def _convert_to_connection_info_score(result: neo4j.Result, _int: bool, protein:bool) -> (list[str], list[str], list[str], list[int]): nodes, source, target, score = list(), list(), list(), list() for row in result: nodes.append(row["source"]) nodes.append(row["target"]) - source.append(row["source"].get("external_id")) - target.append(row["target"].get("external_id")) + if protein: + source.append(row["source"].get("ENSEMBL")) + target.append(row["target"].get("ENSEMBL")) + else: + source.append(row["source"].get("Term")) + target.append(row["target"].get("Term")) if _int: score.append(int(row["score"])) else: diff --git a/frontend/src/components/enrichment/EnrichmentTool.vue b/frontend/src/components/enrichment/EnrichmentTool.vue index 168c3bfb..84db68bf 100644 --- a/frontend/src/components/enrichment/EnrichmentTool.vue +++ b/frontend/src/components/enrichment/EnrichmentTool.vue @@ -88,6 +88,7 @@ var formData = new FormData() formData.append('func-terms', JSON.stringify(com.terms)) + formData.append('species_id', com.gephi_data.nodes[0].species) this.axios .post("/api/subgraph/terms", formData) From f61b260791dab09647ede76a15bd6a18d54cc853 Mon Sep 17 00:00:00 2001 From: Vincent Kataikko Date: Mon, 11 Sep 2023 17:55:36 +0200 Subject: [PATCH 2/6] reverting back to previous version --- backend/src/enrichment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/src/enrichment.py b/backend/src/enrichment.py index 8bd08eda..35d6cfa0 100644 --- a/backend/src/enrichment.py +++ b/backend/src/enrichment.py @@ -16,12 +16,14 @@ def calc_proteins_pval(curr, alpha, in_pr, bg_proteins, num_in_prot): # Lists are read as strings, evaluate to lists using JSON. # alternative is using eval() which is slower + prot_list = curr.replace("'", '"') + prot_list = json.loads(prot_list) # get the protein length of term - num_term_prot = len(curr) + num_term_prot = len(prot_list) # Get intersection of proteins - prots_term = list(set(curr) & in_pr) + prots_term = list(set(prot_list) & in_pr) num_inter = len(prots_term) if num_inter == 0: From f0fb1d3be6d9529fa05d7435173e42b978648732 Mon Sep 17 00:00:00 2001 From: Vincent Kataikko Date: Tue, 12 Sep 2023 12:43:15 +0200 Subject: [PATCH 3/6] Backend updated; works with newest DB version --- backend/src/enrichment_graph.py | 3 ++- backend/src/main.py | 4 ++-- backend/src/queries.py | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/src/enrichment_graph.py b/backend/src/enrichment_graph.py index 552e127c..64898f34 100644 --- a/backend/src/enrichment_graph.py +++ b/backend/src/enrichment_graph.py @@ -35,6 +35,7 @@ def get_functional_graph(list_enrichment, species_id): df2 = nodesterm.rename(columns={"id": "external_id"}) merged = pd.merge(df2[["external_id", "fdr_rate", "p_value"]], nodes, on="external_id") + print(df2[df2.duplicated(subset=["external_id"], keep=False)].sort_values(by="external_id")["external_id"].to_list()) # Add the two columns to df2 nodes = merged.drop_duplicates() @@ -65,7 +66,7 @@ def get_functional_graph(list_enrichment, species_id): nodes_sub = graph.create_nodes_subgraph(nk_graph, nodes) stopwatch.round("Enrichment") - + if len(nodes.index) == 0: sigmajs_data = {"nodes": [], "edges": []} else: diff --git a/backend/src/main.py b/backend/src/main.py index ab2d20c2..1d7e1e78 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -132,7 +132,7 @@ def proteins_subgraph_api(): # D-Value categorize via percentage if not (request.files.get("file") is None): panda_file.rename(columns={"SYMBOL": "name"}, inplace=True) - panda_file["name"] = panda_file["name"].str.upper() + panda_file["name"] = panda_file["name"].str.title() stopwatch.round("Enrichment") @@ -175,7 +175,7 @@ def proteins_subgraph_api(): if not (request.files.get("file") is None): if selected_d != None: for column in selected_d: - node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.name, column].item() + node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.SYMBOL, column].item() node["label"] = df_node.SYMBOL node["species"] = str(10090) diff --git a/backend/src/queries.py b/backend/src/queries.py index 8d544990..56e0b51e 100644 --- a/backend/src/queries.py +++ b/backend/src/queries.py @@ -18,6 +18,8 @@ def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], sp MATCH (source:FT:{species})-[association:OVERLAP]->(target:FT:{species}) WHERE source.Term IN {term_ids} AND target.Term IN {term_ids} + AND NOT source.Category IN ["GOCC", "GOMF", "GOBP"] + AND NOT target.Category IN ["GOCC", "GOMF", "GOBP"] RETURN source, target, association.Score AS score; """ with driver.session() as session: From e8ec0e092279b3ae977189b2e9a024d094d9aaff Mon Sep 17 00:00:00 2001 From: Vincent Kataikko Date: Tue, 12 Sep 2023 13:17:13 +0200 Subject: [PATCH 4/6] lint changes and removed unnessecary print --- backend/src/enrichment_graph.py | 3 +-- backend/src/main.py | 1 - backend/src/queries.py | 4 +++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/src/enrichment_graph.py b/backend/src/enrichment_graph.py index 64898f34..552e127c 100644 --- a/backend/src/enrichment_graph.py +++ b/backend/src/enrichment_graph.py @@ -35,7 +35,6 @@ def get_functional_graph(list_enrichment, species_id): df2 = nodesterm.rename(columns={"id": "external_id"}) merged = pd.merge(df2[["external_id", "fdr_rate", "p_value"]], nodes, on="external_id") - print(df2[df2.duplicated(subset=["external_id"], keep=False)].sort_values(by="external_id")["external_id"].to_list()) # Add the two columns to df2 nodes = merged.drop_duplicates() @@ -66,7 +65,7 @@ def get_functional_graph(list_enrichment, species_id): nodes_sub = graph.create_nodes_subgraph(nk_graph, nodes) stopwatch.round("Enrichment") - + if len(nodes.index) == 0: sigmajs_data = {"nodes": [], "edges": []} else: diff --git a/backend/src/main.py b/backend/src/main.py index 1d7e1e78..d400e6b2 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -109,7 +109,6 @@ def proteins_subgraph_api(): # TODO: make better (Vincent) nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id") - edges = pd.DataFrame({"source": source, "target": target, "score": score}) edges = edges.drop_duplicates(subset=["source", "target"]) diff --git a/backend/src/queries.py b/backend/src/queries.py index 56e0b51e..2d7712a4 100644 --- a/backend/src/queries.py +++ b/backend/src/queries.py @@ -126,7 +126,9 @@ def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int: return int(num_proteins) -def _convert_to_connection_info_score(result: neo4j.Result, _int: bool, protein:bool) -> (list[str], list[str], list[str], list[int]): +def _convert_to_connection_info_score( + result: neo4j.Result, _int: bool, protein: bool +) -> (list[str], list[str], list[str], list[int]): nodes, source, target, score = list(), list(), list(), list() for row in result: From 87cce6c5932bab802a695fdd7cdcf7bee831b02a Mon Sep 17 00:00:00 2001 From: Vincent Kataikko Date: Mon, 25 Sep 2023 11:32:03 +0200 Subject: [PATCH 5/6] NN-362: Overlap query includes all Terms, Protein filter also on ENSEMBL Protein ID --- backend/src/main.py | 1 - backend/src/queries.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/backend/src/main.py b/backend/src/main.py index d400e6b2..2ce4161c 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -106,7 +106,6 @@ def proteins_subgraph_api(): stopwatch.round("Neo4j") - # TODO: make better (Vincent) nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id") edges = pd.DataFrame({"source": source, "target": target, "score": score}) diff --git a/backend/src/queries.py b/backend/src/queries.py index 2d7712a4..eaebcd94 100644 --- a/backend/src/queries.py +++ b/backend/src/queries.py @@ -18,8 +18,6 @@ def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], sp MATCH (source:FT:{species})-[association:OVERLAP]->(target:FT:{species}) WHERE source.Term IN {term_ids} AND target.Term IN {term_ids} - AND NOT source.Category IN ["GOCC", "GOMF", "GOBP"] - AND NOT target.Category IN ["GOCC", "GOMF", "GOBP"] RETURN source, target, association.Score AS score; """ with driver.session() as session: @@ -37,7 +35,8 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id query = f""" MATCH (protein:Protein:{species}) - WHERE protein.SYMBOL IN {str([n.title() for n in names])} + WHERE protein.SYMBOL IN {str([n.title() if not n.startswith("ENS") else n.upper() for n in names ])} + OR protein.ENSEMBL IN {str([n.title() if not n.startswith("ENS") else n.upper() for n in names ])} WITH collect(protein.ENSEMBL) AS ids RETURN ids """ From b1baa4446493a1b9c9c83e52d881ca4795fc9c13 Mon Sep 17 00:00:00 2001 From: Vincent Kataikko Date: Thu, 26 Oct 2023 16:01:35 +0200 Subject: [PATCH 6/6] newest changes, protein nodes also unconnected --- backend/src/main.py | 6 +++--- backend/src/queries.py | 24 ++++++++++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/backend/src/main.py b/backend/src/main.py index 2ce4161c..0c5dfc43 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -95,14 +95,14 @@ def proteins_subgraph_api(): selected_d = request.form.get("selected_d").split(",") if request.form.get("selected_d") else None threshold = int(float(request.form.get("threshold")) * 1000) - protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id) + proteins, protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id) stopwatch.round("Setup") if len(protein_ids) > 1: - proteins, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold, species_id) + _, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold, species_id) else: - proteins, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold, species_id) + _, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold, species_id) stopwatch.round("Neo4j") diff --git a/backend/src/queries.py b/backend/src/queries.py index eaebcd94..30b1e311 100644 --- a/backend/src/queries.py +++ b/backend/src/queries.py @@ -26,7 +26,7 @@ def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], sp return _convert_to_connection_info_score(result=result, _int=False, protein=False) -def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> list[str]: +def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> (list, list[str]): # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons if species_id == 10090: species = "Mus_Musculus" @@ -35,13 +35,13 @@ def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id query = f""" MATCH (protein:Protein:{species}) - WHERE protein.SYMBOL IN {str([n.title() if not n.startswith("ENS") else n.upper() for n in names ])} - OR protein.ENSEMBL IN {str([n.title() if not n.startswith("ENS") else n.upper() for n in names ])} - WITH collect(protein.ENSEMBL) AS ids - RETURN ids + WHERE protein.SYMBOL IN {str([n.title() for n in names])} + OR protein.ENSEMBL IN {str([n.title() for n in names])} + RETURN protein, protein.ENSEMBL AS id """ with driver.session() as session: - return session.run(query).single(strict=True).value() + result = session.run(query) + return _convert_to_protein_id(result) def get_protein_neighbours( @@ -65,8 +65,8 @@ def get_protein_neighbours( """ with driver.session() as session: - result = session.run(query).single(strict=True).value() - return _convert_to_connection_info_score(result=result, _int=True, protein=False) + result = session.run(query) + return _convert_to_connection_info_score(result=result, _int=True, protein=True) def get_protein_associations( @@ -125,6 +125,14 @@ def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int: return int(num_proteins) +def _convert_to_protein_id(result: neo4j.Result) -> (list, list[str]): + proteins, ids = list(), list() + for row in result: + proteins.append(row["protein"]) + ids.append(row["id"]) + return proteins, ids + + def _convert_to_connection_info_score( result: neo4j.Result, _int: bool, protein: bool ) -> (list[str], list[str], list[str], list[int]):