Commit

added method to remove HTML tags from scraped data section
mominaatifdar committed Jan 15, 2025
1 parent 43d9e35 commit e6fcf6d
Showing 1 changed file with 48 additions and 18 deletions.
66 changes: 48 additions & 18 deletions scraping/scraping/scrape_data_section_server.py
@@ -12,6 +12,7 @@
import json
import os
from ast import literal_eval
import re


def setup_chrome_driver(user_agent):
@@ -49,16 +50,15 @@ def scrape_data(data_keywords):
Performance will be optimized in the future."""

files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('.txt')]
# files = ['_Nature Microbiology_Nature Nanotechnology_Science Immunology.txt']
# files = ['_Bioactive materials_Molecular Cancer_Molecular Neurodegeneration.txt']
for filename in files:
# print("filename:", filename)
ua = UserAgent()

results = []
output_file = f"{filename.rstrip(".txt")}_udc.json"
output_file = "scrape_ds_test.json"
content = []
with open(filename, 'r') as f:
# content = f.readlines()
for line in f:
items = literal_eval(line)
content.append(items[0])
@@ -93,27 +93,23 @@
parent_div = heading.find_parent('div')
text = str(parent_div)
start_phrase = heading.get_text()
# start_phrase = heading
end_phrase = "</div>"

substring = text[text.find(start_phrase):text.find(end_phrase) + len(end_phrase)]
substring = substring.split('</b>', 1)[1].strip()
substring = substring.rstrip('</div>')
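# For example, if parent_div renders as "<div><b>Data availability</b> Data are in GEO.</div>"
# (an illustrative snippet, not taken from a real page), the slice keeps everything from the
# heading text through the first "</div>", split('</b>', 1)[1] drops the bolded heading, and
# rstrip('</div>') trims the trailing tag, leaving roughly "Data are in GEO.".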

# print(heading.get_text(), substring)
results.append({"DOI": url,
"header": heading.get_text(),
"paragraph": str(substring)})

elif '/nar' in url or '/neuonc' in url or '/nsr' in url:
paragraph = heading.find_next_siblings("p", limit=3)
# print(heading.get_text(), paragraph)
results.append({"DOI": url,
"header": heading.get_text(),
"paragraph": str(paragraph)})

else:
# print(heading.get_text(), paragraph)
results.append({"DOI": url,
"header": heading.get_text(),
"paragraph": str(paragraph)})
@@ -138,12 +134,11 @@ def scrape_data(data_keywords):
driver.quit()



def rerun_error_list(data_keywords):
"""Re-runs *_error_list.txt files,
Code will be optimized in future"""

files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('_error_list.txt')]
files = [os.path.join("files", f) for f in os.listdir("files") if f.startswith('_') and f.endswith('f_clean.json')]

for filename in files:
# print("Filename:", filename)
@@ -152,7 +147,7 @@ def rerun_error_list(data_keywords):

if filename.endswith('_error_list.txt'):
base_filename = filename[:-len('_error_list.txt')]
output_file = f"{base_filename}_udc.json"
output_file = f"files/{base_filename}_udc.json"

with open(output_file, 'r', encoding='utf-8') as f:
results = json.load(f)
@@ -218,16 +213,18 @@ def rerun_error_list(data_keywords):
results.append(new_entry)

if not data_found:
base_filename = base_filename.rstrip(".txt")
with open(f"{base_filename}_no_data.txt", "a", encoding="utf-8") as f2:
f2.write(doi)
no_data_file = os.path.splitext(os.path.basename(file))[0] + "_no_data.txt"
no_data_path = os.path.join("files", no_data_file)
with open(no_data_path, "a", encoding="utf-8") as f2:
f2.write(doi + "\n")

save_to_json(results, output_file)

else:
base_filename = base_filename.rstrip(".txt")
with open(f"{base_filename}_error_list_retry.txt", "a", encoding="utf-8") as f3:
f3.write(doi)
error_file = os.path.splitext(os.path.basename(file))[0] + "_error_list_retry.txt"
error_path = os.path.join("files", error_file)
with open(error_path, "a", encoding="utf-8") as f2:
f2.write(doi + "\n")

finally:
driver.quit()
@@ -249,7 +246,7 @@ def merge_data_code_section():
"""Publications having Data Availability and Code Availability separate are merged in a single entry"""

files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('udc.json')]

# files = ['scrape_ds_test.json']
for file in files:
with open(file,'r', encoding="utf-8") as f:
data = json.load(f)
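The remainder of merge_data_code_section is not visible in this hunk; as a rough sketch (the grouping logic below is an assumption, not code from this file), merging separate Data Availability and Code Availability entries that share a DOI could look like:

def merge_entries(entries):
    # Group entries by DOI; concatenate headers and paragraphs for duplicates.
    merged = {}
    for entry in entries:
        doi = entry["DOI"]
        if doi in merged:
            merged[doi]["header"] += " / " + entry["header"]
            merged[doi]["paragraph"] += " " + entry["paragraph"]
        else:
            merged[doi] = dict(entry)
    return list(merged.values())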
@@ -274,6 +271,39 @@ def merge_data_code_section():



def clean_html():
"""Remove unnecessary HTML elements from 'paragraph' """


files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('udc_formatted.json')]
# files = ['scrape_ds_test.json']
for file in files:
results = []
try:
with open(file,'r', encoding="utf-8") as f:
data = json.load(f)

for each in data:
para = each["paragraph"]
text_no_html = re.sub(r'<[^>]+>', '', para)
text_clean = re.sub(r'\s+', ' ', text_no_html).strip()

result_entry = {
"DOI": each["DOI"],
"header": each["header"],
"paragraph": text_clean
}
results.append(result_entry)

output_file = file.rstrip(".json")
with open(f"{output_file}_f_clean.json","w") as f:
json.dump(results, f, indent=4, ensure_ascii=False)

except Exception as e:
print(f"An error occurred: {str(e)}")


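For reference, a minimal sketch of what the two re.sub calls in clean_html do to a scraped paragraph (the sample string is made up for illustration):

import re

sample = '<p>Data are available at <a href="https://example.org">this repository</a>.\n  Code is on GitHub.</p>'
no_html = re.sub(r'<[^>]+>', '', sample)        # drop every <...> tag
clean = re.sub(r'\s+', ' ', no_html).strip()    # collapse runs of whitespace
# clean == 'Data are available at this repository. Code is on GitHub.'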

if __name__ == "__main__":

data_keywords = ["Data and code availability",
@@ -321,4 +351,4 @@ def merge_data_code_section():

# scrape_data(data_keywords)
# merge_data_code_section()
rerun_error_list(data_keywords)
# clean_html()
