Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update of queries to use new database #45

Merged
merged 6 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions backend/src/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,14 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
stopwatch = Stopwatch()

# Get number of all proteins in the organism (from Cypher)
bg_proteins = queries.get_number_of_proteins(driver)
bg_proteins = queries.get_number_of_proteins(driver, species_id)
num_in_prot = len(in_proteins)
prots = set(in_proteins)
# pandas DataFrames for nodes and edges
csv.field_size_limit(sys.maxsize)

# Read Terms and put into Dataframe
df_terms = pd.DataFrame(queries.get_enrichment_terms(driver))
df_terms = pd.DataFrame(queries.get_enrichment_terms(driver, species_id))
tot_tests = len(df_terms)

stopwatch.round("setup_enrichment")
Expand All @@ -95,6 +95,7 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
new_prots = []
new_p = []
arguments = [(value, alpha, prots, bg_proteins, num_in_prot) for value in df_terms["proteins"]]

with multiprocessing.Pool() as pool:
# Apply the function to each input value in parallel and collect the results
for a, b in pool.starmap(calc_proteins_pval, arguments):
Expand Down
16 changes: 8 additions & 8 deletions backend/src/enrichment_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
_BACKEND_JAR_PATH = "../gephi/target/gephi.backend-1.0-SNAPSHOT.jar"


def get_functional_graph(list_enrichment):
def get_functional_graph(list_enrichment, species_id):
stopwatch = Stopwatch()

list_term = []
Expand All @@ -24,19 +24,19 @@ def get_functional_graph(list_enrichment):
driver = database.get_driver()

# Execute the query and retrieve the CSV data
terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term)
terms, source, target, score = queries.get_terms_connected_by_overlap(driver, list_term, species_id)

stopwatch.round("Neo4j")

nodes = pd.DataFrame(terms).drop_duplicates(subset="external_id")
nodes = pd.DataFrame(terms).rename(columns={"Term": "external_id"}).drop_duplicates(subset="external_id")

nodesterm = pd.DataFrame(list_enrichment)

df2 = nodesterm.rename({"id": "external_id"}, axis=1)
df2 = nodesterm.rename(columns={"id": "external_id"})
merged = pd.merge(df2[["external_id", "fdr_rate", "p_value"]], nodes, on="external_id")

# Add the two columns to df2
nodes = merged
nodes = merged.drop_duplicates()

nodes["fdr_rate"] = nodes["fdr_rate"].fillna(0)
nodes["p_value"] = nodes["p_value"].fillna(0)
Expand Down Expand Up @@ -100,9 +100,9 @@ def get_functional_graph(list_enrichment):
node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id])
node["attributes"]["PageRank"] = str(pagerank[mapped_node_id])
node["attributes"]["Ensembl ID"] = df_node.external_id
node["attributes"]["Name"] = df_node.name
node["label"] = df_node.name # Comment this out if you want no node labels displayed
node["attributes"]["Category"] = df_node.category
node["attributes"]["Name"] = df_node.Name
node["label"] = df_node.Name # Comment this out if you want no node labels displayed
node["attributes"]["Category"] = df_node.Category
node["attributes"]["FDR"] = df_node.fdr_rate
node["attributes"]["P Value"] = df_node.p_value

Expand Down
23 changes: 12 additions & 11 deletions backend/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def files(path):
def proteins_enrichment():
driver = database.get_driver()
proteins = request.form.get("proteins").split(",")
species_id = request.form.get("species_id")
species_id = int(request.form.get("species_id"))

# in-house functional enrichment
list_enrichment = enrichment.functional_enrichment(driver, proteins, species_id)
Expand Down Expand Up @@ -95,18 +95,18 @@ def proteins_subgraph_api():
selected_d = request.form.get("selected_d").split(",") if request.form.get("selected_d") else None
threshold = int(float(request.form.get("threshold")) * 1000)

protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id)
proteins, protein_ids = queries.get_protein_ids_for_names(driver, protein_names, species_id)

stopwatch.round("Setup")

if len(protein_ids) > 1:
proteins, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold)
_, source, target, score = queries.get_protein_associations(driver, protein_ids, threshold, species_id)
else:
proteins, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold)
_, source, target, score = queries.get_protein_neighbours(driver, protein_ids, threshold, species_id)

stopwatch.round("Neo4j")

nodes = pd.DataFrame(proteins).drop_duplicates(subset="external_id")
nodes = pd.DataFrame(proteins).rename(columns={"ENSEMBL": "external_id"}).drop_duplicates(subset="external_id")

edges = pd.DataFrame({"source": source, "target": target, "score": score})
edges = edges.drop_duplicates(subset=["source", "target"])
Expand All @@ -130,7 +130,7 @@ def proteins_subgraph_api():
# D-Value categorize via percentage
if not (request.files.get("file") is None):
panda_file.rename(columns={"SYMBOL": "name"}, inplace=True)
panda_file["name"] = panda_file["name"].str.upper()
panda_file["name"] = panda_file["name"].str.title()

stopwatch.round("Enrichment")

Expand Down Expand Up @@ -167,14 +167,14 @@ def proteins_subgraph_api():
# Use node mapping to add corresponding values of betweenness and pagerank
node["attributes"]["Betweenness Centrality"] = str(betweenness[mapped_node_id])
node["attributes"]["PageRank"] = str(pagerank[mapped_node_id])
node["attributes"]["Description"] = df_node.description
node["attributes"]["Description"] = df_node.annotation
node["attributes"]["Ensembl ID"] = df_node.external_id
node["attributes"]["Name"] = df_node.name
node["attributes"]["Name"] = df_node.SYMBOL
if not (request.files.get("file") is None):
if selected_d != None:
for column in selected_d:
node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.name, column].item()
node["label"] = df_node.name
node["attributes"][column] = panda_file.loc[panda_file["name"] == df_node.SYMBOL, column].item()
node["label"] = df_node.SYMBOL
node["species"] = str(10090)

# Identify subgraph nodes and update their attributes
Expand Down Expand Up @@ -213,8 +213,9 @@ def terms_subgraph_api():

# Functional terms
list_enrichment = ast.literal_eval(request.form.get("func-terms"))
species_id = int(request.form.get("species_id"))

json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment)
json_str = enrichment_graph.get_functional_graph(list_enrichment=list_enrichment, species_id=species_id)

stopwatch.total("terms_subgraph_api")

Expand Down
116 changes: 79 additions & 37 deletions backend/src/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,88 +7,116 @@
import neo4j


def get_terms_connected_by_overlap(driver: neo4j.Driver, term_ids: list[str], species_id: int):
    """Fetch OVERLAP edges between the given functional terms for one species.

    :param driver: Neo4j driver used to execute the Cypher query.
    :param term_ids: term identifiers (``Term`` property) to connect.
    :param species_id: NCBI taxonomy id (10090 = mouse, 9606 = human).
    :returns: terms, source, target, score
    :raises ValueError: if ``species_id`` is not a supported organism.
    """
    if species_id == 10090:
        species = "Mus_Musculus"
    elif species_id == 9606:
        species = "Homo_Sapiens"
    else:
        # Previously an unsupported id fell through and raised a confusing
        # NameError on `species`; fail fast with a clear message instead.
        raise ValueError(f"Unsupported species_id: {species_id}")

    # unsafe string-interpolated parameters because otherwise this query
    # takes 10s with neo4j for unknown reasons
    query = f"""
        MATCH (source:FT:{species})-[association:OVERLAP]->(target:FT:{species})
        WHERE source.Term IN {term_ids}
            AND target.Term IN {term_ids}
        RETURN source, target, association.Score AS score;
    """
    with driver.session() as session:
        result = session.run(query)
        # custom conversion is needed because otherwise it takes 10s with neo4j (for unknown reasons)
        return _convert_to_connection_info_score(result=result, _int=False, protein=False)


def get_protein_ids_for_names(driver: neo4j.Driver, names: list[str], species_id: int) -> tuple[list, list[str]]:
    """Resolve gene symbols or ENSEMBL ids to protein nodes and ENSEMBL ids.

    :param driver: Neo4j driver used to execute the Cypher query.
    :param names: gene symbols or ENSEMBL identifiers to look up.
    :param species_id: NCBI taxonomy id (10090 = mouse, 9606 = human).
    :returns: ``(proteins, ids)`` — matched protein nodes and their ENSEMBL ids.
    :raises ValueError: if ``species_id`` is not a supported organism.
    """
    if species_id == 10090:
        species = "Mus_Musculus"
    elif species_id == 9606:
        species = "Homo_Sapiens"
    else:
        raise ValueError(f"Unsupported species_id: {species_id}")

    # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
    # Build the candidate list once instead of twice.
    candidates = str([n.title() for n in names])
    # NOTE(review): .title() lowercases the tail of ENSEMBL ids
    # (e.g. "ENSMUSG..." -> "Ensmusg...") — confirm the DB stores them that way.
    query = f"""
        MATCH (protein:Protein:{species})
        WHERE protein.SYMBOL IN {candidates}
            OR protein.ENSEMBL IN {candidates}
        RETURN protein, protein.ENSEMBL AS id
    """
    with driver.session() as session:
        result = session.run(query)
        return _convert_to_protein_id(result)


def get_protein_neighbours(
    driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int
) -> tuple[list[str], list[str], list[str], list[int]]:
    """Fetch STRING association edges between the given proteins.

    :param driver: Neo4j driver used to execute the Cypher query.
    :param protein_ids: ENSEMBL ids of the proteins of interest.
    :param threshold: minimum combined association score (0-1000 scale).
    :param species_id: NCBI taxonomy id (10090 = mouse, 9606 = human).
    :returns: proteins, source_ids, target_ids, scores
    :raises ValueError: if ``species_id`` is not a supported organism.
    """
    if species_id == 10090:
        species = "Mus_Musculus"
    elif species_id == 9606:
        species = "Homo_Sapiens"
    else:
        raise ValueError(f"Unsupported species_id: {species_id}")

    # unsafe parameters because otherwise this query takes 10s with neo4j for unknown reasons
    # NOTE(review): this query filters/returns `association.combined` while
    # get_protein_associations uses `association.Score` — confirm which
    # property the new schema actually carries.
    query = f"""
        MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
        WHERE source.ENSEMBL IN {protein_ids}
            AND target.ENSEMBL IN {protein_ids}
            AND association.combined >= {threshold}
        RETURN source, target, association.combined AS score
    """

    with driver.session() as session:
        result = session.run(query)
        return _convert_to_connection_info_score(result=result, _int=True, protein=True)


def get_protein_associations(
    driver: neo4j.Driver, protein_ids: list[str], threshold: int, species_id: int
) -> tuple[list[str], list[str], list[str], list[int]]:
    """Fetch STRING association edges among the given set of proteins.

    :param driver: Neo4j driver used to execute the Cypher query.
    :param protein_ids: ENSEMBL ids of the proteins of interest.
    :param threshold: minimum association score (0-1000 scale).
    :param species_id: NCBI taxonomy id (10090 = mouse, 9606 = human).
    :returns: proteins (nodes), source_ids, target_ids, score
    :raises ValueError: if ``species_id`` is not a supported organism.
    """
    if species_id == 10090:
        species = "Mus_Musculus"
    elif species_id == 9606:
        species = "Homo_Sapiens"
    else:
        raise ValueError(f"Unsupported species_id: {species_id}")

    # unsafe parameters are needed because otherwise this query takes 10s with neo4j for unknown reasons
    query = f"""
        MATCH (source:Protein:{species})-[association:STRING]->(target:Protein:{species})
        WHERE source.ENSEMBL IN {protein_ids}
            AND target.ENSEMBL IN {protein_ids}
            AND association.Score >= {threshold}
        RETURN source, target, association.Score AS score
    """
    with driver.session() as session:
        result = session.run(query)
        return _convert_to_connection_info_score(result=result, _int=True, protein=True)


def get_enrichment_terms(driver: neo4j.Driver, species_id: int) -> list[dict[str, Any]]:
    """Fetch all functional terms of one species for enrichment analysis.

    :param driver: Neo4j driver used to execute the Cypher query.
    :param species_id: NCBI taxonomy id (10090 = mouse, 9606 = human).
    :returns: one dict per term with ``id``, ``name``, ``category`` and
        ``proteins`` keys.
    :raises ValueError: if ``species_id`` is not a supported organism.
    """
    if species_id == 10090:
        species = "Mus_Musculus"
    elif species_id == 9606:
        species = "Homo_Sapiens"
    else:
        raise ValueError(f"Unsupported species_id: {species_id}")

    query = f"""
        MATCH (term:FT:{species})
        RETURN term.Term AS id, term.Name AS name, term.Category AS category, term.Proteins AS proteins
    """

    with driver.session() as session:
        result = session.run(query)
        return result.data()


def get_number_of_proteins(driver: neo4j.Driver) -> int:
query = """
MATCH (n:Protein)
def get_number_of_proteins(driver: neo4j.Driver, species_id: int) -> int:
if species_id == 10090:
species = "Mus_Musculus"
elif species_id == 9606:
species = "Homo_Sapiens"

query = f"""
MATCH (n:Protein:{species})
RETURN count(n) AS num_proteins
"""
with driver.session() as session:
Expand All @@ -97,14 +125,28 @@ def get_number_of_proteins(driver: neo4j.Driver) -> int:
return int(num_proteins)


def _convert_to_connection_info_score(result: neo4j.Result, _int: bool) -> (list[str], list[str], list[str], list[int]):
def _convert_to_protein_id(result: neo4j.Result) -> (list, list[str]):
proteins, ids = list(), list()
for row in result:
proteins.append(row["protein"])
ids.append(row["id"])
return proteins, ids


def _convert_to_connection_info_score(
result: neo4j.Result, _int: bool, protein: bool
) -> (list[str], list[str], list[str], list[int]):
nodes, source, target, score = list(), list(), list(), list()

for row in result:
nodes.append(row["source"])
nodes.append(row["target"])
source.append(row["source"].get("external_id"))
target.append(row["target"].get("external_id"))
if protein:
source.append(row["source"].get("ENSEMBL"))
target.append(row["target"].get("ENSEMBL"))
else:
source.append(row["source"].get("Term"))
target.append(row["target"].get("Term"))
if _int:
score.append(int(row["score"]))
else:
Expand Down
1 change: 1 addition & 0 deletions frontend/src/components/enrichment/EnrichmentTool.vue
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@

var formData = new FormData()
formData.append('func-terms', JSON.stringify(com.terms))
formData.append('species_id', com.gephi_data.nodes[0].species)

this.axios
.post("/api/subgraph/terms", formData)
Expand Down
Loading