From e6fcf6d04a3eb51b8ffdbe56b260160ae0397cb8 Mon Sep 17 00:00:00 2001
From: Momina Atif Dar
Date: Wed, 15 Jan 2025 11:16:01 +0100
Subject: [PATCH] added method to remove HTML tags from scraped data section

---
 .../scraping/scrape_data_section_server.py | 66 ++++++++++++++-----
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/scraping/scraping/scrape_data_section_server.py b/scraping/scraping/scrape_data_section_server.py
index e7c13b1f..991c774d 100644
--- a/scraping/scraping/scrape_data_section_server.py
+++ b/scraping/scraping/scrape_data_section_server.py
@@ -12,6 +12,7 @@
 import json
 import os
 from ast import literal_eval
+import re
 
 
 def setup_chrome_driver(user_agent):
@@ -49,16 +50,15 @@ def scrape_data(data_keywords):
     Performance will be optimized in future"""
 
     files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('.txt')]
-    # files = ['_Nature Microbiology_Nature Nanotechnology_Science Immunology.txt']
+    # files = ['_Bioactive materials_Molecular Cancer_Molecular Neurodegeneration.txt']
     for filename in files:
         # print("filename:", filename)
         ua = UserAgent()
         results = []
-        output_file = f"{filename.rstrip(".txt")}_udc.json"
+        output_file = "scrape_ds_test.json"
         content = []
 
         with open(filename, 'r') as f:
-            # content = f.readlines()
             for line in f:
                 items = literal_eval(line)
                 content.append(items[0])
@@ -93,27 +93,23 @@ def scrape_data(data_keywords):
                     parent_div = heading.find_parent('div')
                     text = str(parent_div)
                     start_phrase = heading.get_text()
-                    # start_phrase = heading
                     end_phrase = ""
                     substring = text[text.find(start_phrase):text.find(end_phrase) + len(end_phrase)]
                     substring = substring.split('', 1)[1].strip()
                     substring = substring.rstrip('')
-                    # print(heading.get_text(), substring)
                     results.append({"DOI": url,
                                     "header": heading.get_text(),
                                     "paragraph": str(substring)})
 
                 elif '/nar' in url or '/neuonc' in url or '/nsr' in url:
                     paragraph = heading.find_next_siblings("p", limit=3)
-                    # print(heading.get_text(), paragraph)
                     results.append({"DOI": url,
                                     "header": heading.get_text(),
                                     "paragraph": str(paragraph)})
 
                 else:
-                    # print(heading.get_text(), paragraph)
                     results.append({"DOI": url,
                                     "header": heading.get_text(),
                                     "paragraph": str(paragraph)})
 
 
@@ -138,12 +134,11 @@ def scrape_data(data_keywords):
             driver.quit()
 
 
-
 def rerun_error_list(data_keywords):
     """Re-runs *_error_list.txt files,
     Code will be optimized in future"""
 
-    files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('_error_list.txt')]
+    files = [os.path.join("files", f) for f in os.listdir("files") if f.startswith('_') and f.endswith('f_clean.json')]
 
     for filename in files:
         # print("Filename:", filename)
@@ -152,7 +147,7 @@ def rerun_error_list(data_keywords):
         if filename.endswith('_error_list.txt'):
             base_filename = filename[:-len('_error_list.txt')]
 
-            output_file = f"{base_filename}_udc.json"
+            output_file = f"files/{base_filename}_udc.json"
 
             with open(output_file, 'r', encoding='utf-8') as f:
                 results = json.load(f)
@@ -218,16 +213,18 @@ def rerun_error_list(data_keywords):
                         results.append(new_entry)
 
                 if not data_found:
-                    base_filename = base_filename.rstrip(".txt")
-                    with open(f"{base_filename}_no_data.txt", "a", encoding="utf-8") as f2:
-                        f2.write(doi)
+                    no_data_file = os.path.splitext(os.path.basename(file))[0] + "_no_data.txt"
+                    no_data_path = os.path.join("files", no_data_file)
+                    with open(no_data_path, "a", encoding="utf-8") as f2:
+                        f2.write(doi + "\n")
 
                 save_to_json(results, output_file)
 
             else:
-                base_filename = base_filename.rstrip(".txt")
-                with open(f"{base_filename}_error_list_retry.txt", "a", encoding="utf-8") as f3:
-                    f3.write(doi)
+                error_file = os.path.splitext(os.path.basename(file))[0] + "_error_list_retry.txt"
+                error_path = os.path.join("files", error_file)
+                with open(error_path, "a", encoding="utf-8") as f2:
+                    f2.write(doi + "\n")
 
             finally:
                 driver.quit()
@@ -249,7 +246,7 @@ def merge_data_code_section():
     """Publications having Data Availability and Code Availability separate
     are merged in a single entry"""
     files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('udc.json')]
-
+    # files = ['scrape_ds_test.json']
     for file in files:
         with open(file,'r', encoding="utf-8") as f:
             data = json.load(f)
@@ -274,6 +271,39 @@ def merge_data_code_section():
 
 
 
+def clean_html():
+    """Remove unnecessary HTML elements from 'paragraph' """
+
+
+    files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('udc_formatted.json')]
+    # files = ['scrape_ds_test.json']
+    for file in files:
+        results = []
+        try:
+            with open(file,'r', encoding="utf-8") as f:
+                data = json.load(f)
+
+            for each in data:
+                para = each["paragraph"]
+                text_no_html = re.sub(r'<[^>]+>', '', para)
+                text_clean = re.sub(r'\s+', ' ', text_no_html).strip()
+
+                result_entry = {
+                    "DOI": each["DOI"],
+                    "header": each["header"],
+                    "paragraph": text_clean
+                }
+                results.append(result_entry)
+
+            output_file = file.rstrip(".json")
+            with open(f"{output_file}_f_clean.json","w") as f:
+                json.dump(results, f, indent=4, ensure_ascii=False)
+
+        except Exception as e:
+            print(f"An error occurred: {str(e)}")
+
+
+
 if __name__ == "__main__":
 
     data_keywords = ["Data and code availability",
@@ -321,4 +351,4 @@ def merge_data_code_section():
 
 #    scrape_data(data_keywords)
 #    merge_data_code_section()
-    rerun_error_list(data_keywords)
\ No newline at end of file
+    # clean_html()