NN-617 pick Zenodo URLs
mominaatifdar committed Jan 9, 2025
1 parent 517b997 commit a4a6495
Showing 2 changed files with 161 additions and 143 deletions.
161 changes: 161 additions & 0 deletions scraping/scraping/extract_data_LLM.py
@@ -0,0 +1,161 @@
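"""Extract explicitly stated sequencing-data accession codes and GitHub/Zenodo
source-code URLs from a publication's 'Data Availability' section, using a
local Ollama model."""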

import ollama
import json
import re
import os

def send_to_llm(input_data):

    prompt = f"""
    You are a highly precise extraction tool. I will provide you with the DOI of a scientific publication and its 'Data Availability' section.
    Your task:
    1. Extract **only** explicitly stated sequencing data accession codes and their respective database names (e.g., GEO, ENA, SRA, etc.). Do **not** infer or guess accession codes if they are not explicitly mentioned in the text.
    2. Identify **all** source code URLs, including URLs from **GitHub** and **Zenodo**. For Zenodo, include any URL explicitly mentioned, regardless of whether it refers to sequencing data or source code. **Do not include URLs from any other sources**.
    Return the results **only** in valid JSON format, with no additional text or explanation. Do **not** include any introductory or closing statements.
    The returned JSON must strictly follow this format:
    {{
        "accession codes": {{
            "database_name_1": ["accession_code_1", "accession_code_2"],
            "database_name_2": ["accession_code_3"]
        }},
        "source code": ["GitHub_URL", "Zenodo_URL"]
    }}
    Example Input:
    DOI: <<DOI>> 10.1234/example.doi <<END DOI>>
    Data Availability: <<DATA>> Raw sequencing data were deposited in GEO under accession numbers GSE12345 and GSE67890. Sequencing data are available on Zenodo at https://zenodo.org/record/12345 and https://doi.org/10.1010/zenodo.1234567. Additional analysis scripts are hosted on Zenodo at https://zenodo.org/record/67890 and on GitHub at https://github.com/example/repo. <<END DATA>>
    Example Output:
    {{
        "accession codes": {{
            "GEO": ["GSE12345", "GSE67890"]
        }},
        "source code": [
            "https://github.com/example/repo",
            "https://zenodo.org/record/12345",
            "https://doi.org/10.1010/zenodo.1234567",
            "https://zenodo.org/record/67890"
        ]
    }}
    Strict Rules:
    - Only return the JSON structure as shown, with **no additional text or explanation**.
    - If no accession codes are available, leave the "accession codes" section empty: "accession codes": {{}}.
    - If no source code URLs are available, leave the "source code" section empty: "source code": [].
    - Accession codes must strictly follow common formats (e.g., GSE followed by numbers for GEO, PRJ followed by alphanumeric strings for SRA, etc.). **Only include accession codes that match valid formats**.
    - Include **all Zenodo URLs** explicitly mentioned in the text, regardless of their stated purpose, and ensure that **no other URLs** (except GitHub or Zenodo) are included in the "source code" section.
    Input Data:
    DOI: <<DOI>> {input_data['DOI']} <<END DOI>>
    Data Availability: <<DATA>> {input_data['Data Availability']} <<END DATA>>
    """


    response = ollama.chat(model="llama3.1:8b", messages=[
        {
            "role": "user",
            "content": prompt
        }
    ])

    try:
        result = {
            "accession codes": {},
            "source code": []
        }

        zenodo_urls_in_input = re.findall(r'https?://doi\.org/\d+\.\d+/zenodo\.\d+', input_data['Data Availability'])
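        # (These DOI-style Zenodo links are merged into the result below as a
        # safety net, in case the model's JSON omits them.)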

        if response and hasattr(response, 'message'):
            try:
                parsed = json.loads(response.message['content'])
                if isinstance(parsed, dict):
                    result.update(parsed)
            except json.JSONDecodeError:
                print(f"Could not parse response as JSON: {response.message['content']}")

        # print("RESULT BEFORE:", result)
        if zenodo_urls_in_input:
            existing_urls = set(result["source code"])
            for url in zenodo_urls_in_input:
                if url not in existing_urls:
                    result["source code"].append(url)

        return result

    except Exception as e:
        # print(f"Error processing response: {str(e)}")
        return {
            "accession codes": {},
            "source code": []
        }
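# Hypothetical example (values are illustrative, not from a real paper):
#   send_to_llm({"DOI": "10.1234/example",
#                "Data Availability": "Code at https://github.com/example/repo."})
#   would be expected to return:
#   {"accession codes": {}, "source code": ["https://github.com/example/repo"]}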

def process_file():
    files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('formatted.json')]
    # files = ["scrape_ds_test_formatted.json"]

    for file in files:
        print("Filename:", file)
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        results = []
        for entry in data:
            doi = entry.get("DOI", "")
            data_availability = entry.get("paragraph", "")
            input_data = {
                "DOI": doi,
                "Data Availability": data_availability
            }
            # print('Processing DOI:', doi)
            # print('Input data:', input_data['Data Availability'])

            try:
                result = send_to_llm(input_data)
                found_ac_1 = False
                found_ac_2 = False

                # to prevent hallucination in accession codes: the prompt's
                # example GEO codes appearing in the output means the model
                # copied the example rather than the input
                if 'GEO' in result.get('accession codes', {}):
                    found_ac_1 = any('GSE12345' in codes for codes in result['accession codes']['GEO'])
                    found_ac_2 = any('GSE67890' in codes for codes in result['accession codes']['GEO'])

                # to prevent hallucination in source code URLs: keep only URLs
                # whose host actually appears in the input text
                new_source_code = []
                for url in result.get('source code', []):
                    if 'github.com' in url.lower():
                        if 'github.com' in input_data['Data Availability'].lower():
                            new_source_code.append(url)
                    elif 'zenodo' in url.lower():
                        if 'zenodo' in input_data['Data Availability'].lower():
                            new_source_code.append(url)

                result['source code'] = new_source_code

                if found_ac_1 or found_ac_2:
                    result = {'accession codes': {}, 'source code': new_source_code}

                results.append({
                    "DOI": doi,
                    "results": result
                })

            except Exception as e:
                print(f"Error processing DOI {doi}: {str(e)}")

        # for i in results:
        #     print(i)
        output_file = file.removesuffix('.json')  # strip the ".json" suffix
        output_file = f"{output_file}_LLM.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)


if __name__ == "__main__":
process_file()
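For reference, a minimal sketch of an input file the script would pick up; the field names ("DOI", "paragraph") match what process_file reads, while the file name and values are hypothetical:

_example_formatted.json:
[
    {
        "DOI": "10.1234/example.doi",
        "paragraph": "Raw data are in GEO under GSE99999; analysis code is at https://github.com/example/repo."
    }
]

Running the script in that directory would write _example_formatted_LLM.json with one {"DOI": ..., "results": ...} entry per input record.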
143 changes: 0 additions & 143 deletions scraping/scraping/extract_data_section_LLM.py

This file was deleted.
