diff --git a/backend/src/enrichment.py b/backend/src/enrichment.py index 204b4cea..35d6cfa0 100644 --- a/backend/src/enrichment.py +++ b/backend/src/enrichment.py @@ -76,14 +76,14 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any): stopwatch = Stopwatch() # Get number of all proteins in the organism (from Cypher) - bg_proteins = queries.get_number_of_proteins(driver) + bg_proteins = queries.get_number_of_proteins(driver, species_id) num_in_prot = len(in_proteins) prots = set(in_proteins) # pandas DataFrames for nodes and edges csv.field_size_limit(sys.maxsize) # Read Terms and put into Dataframe - df_terms = pd.DataFrame(queries.get_enrichment_terms(driver)) + df_terms = pd.DataFrame(queries.get_enrichment_terms(driver, species_id)) tot_tests = len(df_terms) stopwatch.round("setup_enrichment") @@ -95,6 +95,7 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any): new_prots = [] new_p = [] arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]] + with multiprocessing.Pool() as pool: # Apply the function to each input value in parallel and collect the results for a, b in pool.starmap(calc_proteins_pval, arguments): diff --git a/backend/src/enrichment_graph.py b/backend/src/enrichment_graph.py index 398a521b..552e127c 100644 --- a/backend/src/enrichment_graph.py +++ b/backend/src/enrichment_graph.py @@ -14,7 +14,7 @@ _BACKEND_JAR_PATH = "../gephi/target/gephi.backend-1.0-SNAPSHOT.jar" -def get_functional_graph(list_enrichment): +def get_functional_graph(list_enrichment, species_id): stopwatch = Stopwatch() list_term = [] @@ -24,19 +24,19 @@ def get_functional_graph(list_enrichment): driver = database.get_driver() # Execute the query and retrieve the CSV data - terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term) + terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term, species_id) stopwatch.round("Neo4j") - nodes = pd.DataFrame(terms).drop_duplicates(subset="external_id") + nodes = pd.DataFrame(terms).rename(columns={"Term": "external_id"}).drop_duplicates(subset="external_id") nodesterm = pd.DataFrame(list_enrichment) - df2 = nodesterm.rename({"id": "external_id"}, axis=1) + df2 = nodesterm.rename(columns={"id": "external_id"}) merged = pd.merge(df2[["external_id", "fdr_rate", "p_value"]], nodes, on="external_id") # Add the two columns to df2 - nodes = merged + nodes = merged.drop_duplicates() nodes["fdr_rate"] = nodes["fdr_rate"].fillna(0) nodes["p_value"] = nodes["p_value"].fillna(0) @@ -100,9 +100,9 @@ def get_functional_graph(list_enrichment): node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id]) node["attributes"]["PageRank"] = str(pagerank[mapped_node_id]) node["attributes"]["Ensembl ID"] = df_node.external_id - node["attributes"]["Name"] = df_node.name - node["label"] = df_node.name # Comment this out if you want no node labels displayed - node["attributes"]["Category"] = df_node.category + node["attributes"]["Name"] = df_node.Name + node["label"] = df_node.Name # Comment this out if you want no node labels displayed + node["attributes"]["Category"] = df_node.Category node["attributes"]["FDR"] = df_node.fdr_rate node["attributes"]["P Value"] = df_node.p_value diff --git a/backend/src/main.py b/backend/src/main.py index 47e9fbfd..0c5dfc43 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -51,7 +51,7 @@ def files(path): def proteins_enrichment(): driver = database.get_driver() proteins = request.form.get("proteins").split(",") - species_id = request.form.get("species_id") + species_id = int(request.form.get("species_id")) # in-house functional enrichment list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id) @@ -95,18 +95,18 @@ def proteins_subgraph_api(): selected_d = request.form.get("selected_d").split(",") if request.form.get("selected_d") else None threshold = int(float(request.form.get("threshold")) * 1000) - protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id) + proteins, protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id) stopwatch.round("Setup") if len(protein_ids) > 1: - proteins, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold) + _, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold, species_id) else: - proteins, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold) + _, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold, species_id) stopwatch.round("Neo4j") - nodes = pd.DataFrame(proteins).drop_duplicates(subset="external_id") + nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id") edges = pd.DataFrame({"source": source, "target": target, "score": score}) edges = edges.drop_duplicates(subset=["source", "target"]) @@ -130,7 +130,7 @@ def proteins_subgraph_api(): # D-Value categorize via percentage if not (request.files.get("file") is None): panda_file.rename(columns={"SYMBOL": "name"}, inplace=True) - panda_file["name"] = panda_file["name"].str.upper() + panda_file["name"] = panda_file["name"].str.title() stopwatch.round("Enrichment") @@ -167,14 +167,14 @@ def proteins_subgraph_api(): # Use node mapping to add corresponding values of betweenness and pagerank node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id]) node["attributes"]["PageRank"] = str(pagerank[mapped_node_id]) - node["attributes"]["Description"] = df_node.description + node["attributes"]["Description"] = df_node.annotation node["attributes"]["Ensembl ID"] = df_node.external_id - node["attributes"]["Name"] = df_node.name + node["attributes"]["Name"] = df_node.SYMBOL if not (request.files.get("file") is None): if selected_d != None: for column in selected_d: - node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.name, column].item() - node["label"] = df_node.name + node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.SYMBOL, column].item() + node["label"] = df_node.SYMBOL node["species"] = str(10090) # Identify subgraph nodes and update their attributes @@ -213,8 +213,9 @@ def terms_subgraph_api(): # Functional terms list_enrichment = ast.literal_eval(request.form.get("func-terms")) + species_id = int(request.form.get("species_id")) - json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment) + json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment, species_id=species_id) stopwatch.total("terms_subgraph_api") diff --git a/backend/src/queries.py b/backend/src/queries.py index 564cbffb..30b1e311 100644 --- a/backend/src/queries.py +++ b/backend/src/queries.py @@ -7,78 +7,101 @@ import neo4j -def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str]): +def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], species_id: int): """:returns: terms, source, target, score""" + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + query = f""" - MATCH (source:Terms)-[association:OVERLAP]->(target:Terms) - WHERE source.external_id IN {term_ids} - AND target.external_id IN {term_ids} - AND source.category IN ["KEGG", "Reactome Pathways"] - AND target.category IN ["KEGG", "Reactome Pathways"] + MATCH (source:FT:{species})-[association:OVERLAP]->(target:FT:{species}) + WHERE source.Term IN {term_ids} + AND target.Term IN {term_ids} RETURN source, target, association.Score AS score; """ with driver.session() as session: result = session.run(query) # custom conversion is needed because otherwise it takes 10s with neo4j (for unknown reasons) - return _convert_to_connection_info_score(result=result, _int=False) + return _convert_to_connection_info_score(result=result, _int=False, protein=False) -def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> list[str]: +def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> (list, list[str]): # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + query = f""" - MATCH (protein:Protein) - WHERE protein.species_id = {species_id} - AND protein.name IN {str([n.upper() for n in names])} - WITH collect(protein.external_id) AS ids - RETURN ids + MATCH (protein:Protein:{species}) + WHERE protein.SYMBOL IN {str([n.title() for n in names])} + OR protein.ENSEMBL IN {str([n.title() for n in names])} + RETURN protein, protein.ENSEMBL AS id """ with driver.session() as session: - return session.run(query).single(strict=True).value() + result = session.run(query) + return _convert_to_protein_id(result) def get_protein_neighbours( - driver: neo4j.Driver, protein_ids: list[str], threshold: int + driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int ) -> (list[str], list[str], list[str], list[int]): """ :returns: proteins, source_ids, target_ids, scores """ + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons query = f""" - MATCH (source:Protein)-[association:ASSOCIATION]->(target:Protein) - WHERE source.external_id IN {protein_ids} - AND target.external_id IN {protein_ids} + MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species}) + WHERE source.ENSEMBL IN {protein_ids} + AND target.ENSEMBL IN {protein_ids} AND association.combined >= {threshold} RETURN source, target, association.combined AS score """ with driver.session() as session: - result = session.run(query).single(strict=True).value() - return _convert_to_connection_info_score(result=result, _int=True) + result = session.run(query) + return _convert_to_connection_info_score(result=result, _int=True, protein=True) def get_protein_associations( - driver: neo4j.Driver, protein_ids: list[str], threshold: int + driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int ) -> (list[str], list[str], list[str], list[int]): """ :returns: proteins (nodes), source_ids, target_ids, score """ + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + # unsafe parameters are needed because otherwise this query takes 10s with neo4j for unknown reasons query = f""" - MATCH (source:Protein)-[association:ASSOCIATION]->(target:Protein) - WHERE source.external_id IN {protein_ids} - AND target.external_id IN {protein_ids} - AND association.combined >= {threshold} - RETURN source, target, association.combined AS score + MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species}) + WHERE source.ENSEMBL IN {protein_ids} + AND target.ENSEMBL IN {protein_ids} + AND association.Score >= {threshold} + RETURN source, target, association.Score AS score """ with driver.session() as session: result = session.run(query) - return _convert_to_connection_info_score(result=result, _int=True) + return _convert_to_connection_info_score(result=result, _int=True, protein=True) + +def get_enrichment_terms(driver: neo4j.Driver, species_id: int) -> list[dict[str, Any]]: + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" -def get_enrichment_terms(driver: neo4j.Driver) -> list[dict[str, Any]]: - query = """ - MATCH (term:Terms) - RETURN term.external_id AS id, term.name AS name, term.category AS category, term.proteins AS proteins + query = f""" + MATCH (term:FT:{species}) + RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Proteins AS proteins """ with driver.session() as session: @@ -86,9 +109,14 @@ def get_enrichment_terms(driver: neo4j.Driver) -> list[dict[str, Any]]: return result.data() -def get_number_of_proteins(driver: neo4j.Driver) -> int: - query = """ - MATCH (n:Protein) +def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int: + if species_id == 10090: + species = "Mus_Musculus" + elif species_id == 9606: + species = "Homo_Sapiens" + + query = f""" + MATCH (n:Protein:{species}) RETURN count(n) AS num_proteins """ with driver.session() as session: @@ -97,14 +125,28 @@ def get_number_of_proteins(driver: neo4j.Driver) -> int: return int(num_proteins) -def _convert_to_connection_info_score(result: neo4j.Result, _int: bool) -> (list[str], list[str], list[str], list[int]): +def _convert_to_protein_id(result: neo4j.Result) -> (list, list[str]): + proteins, ids = list(), list() + for row in result: + proteins.append(row["protein"]) + ids.append(row["id"]) + return proteins, ids + + +def _convert_to_connection_info_score( + result: neo4j.Result, _int: bool, protein: bool +) -> (list[str], list[str], list[str], list[int]): nodes, source, target, score = list(), list(), list(), list() for row in result: nodes.append(row["source"]) nodes.append(row["target"]) - source.append(row["source"].get("external_id")) - target.append(row["target"].get("external_id")) + if protein: + source.append(row["source"].get("ENSEMBL")) + target.append(row["target"].get("ENSEMBL")) + else: + source.append(row["source"].get("Term")) + target.append(row["target"].get("Term")) if _int: score.append(int(row["score"])) else: diff --git a/frontend/src/components/enrichment/EnrichmentTool.vue b/frontend/src/components/enrichment/EnrichmentTool.vue index 168c3bfb..84db68bf 100644 --- a/frontend/src/components/enrichment/EnrichmentTool.vue +++ b/frontend/src/components/enrichment/EnrichmentTool.vue @@ -88,6 +88,7 @@ var formData = new FormData() formData.append('func-terms', JSON.stringify(com.terms)) + formData.append('species_id', com.gephi_data.nodes[0].species) this.axios .post("/api/subgraph/terms", formData)