Adding new post

skynettoday · Jan 8, 2024 · bdf7822 · bdf7822
1 parent 484b4b1
commit bdf7822
Show file tree

Hide file tree

Showing 7 changed files with 287 additions and 74 deletions.
diff --git a/_posts/digests/2024-01-08-252.md b/_posts/digests/2024-01-08-252.md
diff --git a/assets/img/digests/252/00Times-lmkv-facebookJumbo.jpg b/assets/img/digests/252/00Times-lmkv-facebookJumbo.jpg
diff --git a/scripts/__pycache__/content_retrieval.cpython-311.pyc b/scripts/__pycache__/content_retrieval.cpython-311.pyc
diff --git a/scripts/__pycache__/parse_arxiv_html.cpython-311.pyc b/scripts/__pycache__/parse_arxiv_html.cpython-311.pyc
diff --git a/scripts/content_retrieval.py b/scripts/content_retrieval.py
@@ -0,0 +1,67 @@
+import requests
+from bs4 import BeautifulSoup
+from collections import OrderedDict
+
+def get_arxiv_paper_contents(url, timeout=30):
+    """
+    Fetches an arXiv HTML paper page and returns its title, abstract and introduction.
+
+    :param url: URL of the HTML rendering of the paper.
+    :param timeout: Seconds to wait for the server before giving up.
+    :return: Plain text string with "Title:", "Abstract:" and "Introduction:" parts;
+        a "... not found" placeholder is used for any part missing from the page.
+    :raises requests.RequestException: if the request fails, times out or returns
+        an error status code.
+    """
+    def section_key(heading):
+        """ Maps the text of an <h2> heading to the key its section is stored under. """
+        # Headings that already are a plain section name must be kept as-is;
+        # chopping a character off them would turn "Introduction" into "ntroduction".
+        if heading in ('Abstract', 'Introduction'):
+            return heading
+        # Other headings are assumed to carry a one-character section marker fused
+        # to the title (e.g. "1Introduction"), which is dropped.
+        return heading[1:]
+
+    def extract_text_by_section_ordered(soup):
+        """ Collects paragraph text under each <h2> heading, in document order. """
+        extracted_text = OrderedDict()
+        current_section = None
+
+        # find_all returns the matching tags in document order, so the tree is
+        # never modified while it is being walked.
+        for element in soup.find_all(['h2', 'p']):
+            if element.name == 'h2':
+                current_section = section_key(element.get_text(strip=True))
+                extracted_text[current_section] = ''
+            elif current_section:
+                # get_text() already flattens inline tags (cite, a, span) to their text.
+                extracted_text[current_section] += element.get_text(separator='', strip=False) + '\n'
+        return extracted_text
+
+    # Without a timeout a stalled server would block the caller indefinitely.
+    response = requests.get(url, timeout=timeout)
+    response.raise_for_status()  # An error page must not be parsed as if it were the paper
+
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # soup.title.string is None when <title> is empty or has more than one child node
+    title = soup.title.string if soup.title and soup.title.string else "Title not found"
+    section_texts = extract_text_by_section_ordered(soup)
+    abstract = section_texts.get('Abstract', 'Abstract not found')
+    introduction = section_texts.get('Introduction', 'Introduction not found')
+
+    return f"Title: {title}\n\nAbstract:\n{abstract}\n\nIntroduction:\n{introduction}"
+
+def get_reuters_article_content(url, timeout=30):
+    """
+    Extracts the content of a Reuters article from its URL.
+
+    :param url: URL of the Reuters article.
+    :param timeout: Seconds to wait for the server before giving up.
+    :return: Plain text string of the article contents, without HTML tags, or
+        "Article content not found." when the page has no article body container.
+    :raises requests.RequestException: if the request fails, times out or returns
+        an error status code.
+    """
+    # Fetch the HTML content from the URL; without a timeout a stalled server
+    # would block the caller indefinitely
+    response = requests.get(url, timeout=timeout)
+    response.raise_for_status()  # Raise an error for bad status codes
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find the div with a class that starts with "article-body__container"
+    article_body = soup.find('div', class_=lambda value: value and value.startswith('article-body__container'))
+
+    if not article_body:
+        return "Article content not found."
+
+    # Join the text fragments with a space: with strip=True and no separator the
+    # stripped fragments are glued together, fusing words across tag boundaries
+    return article_body.get_text(separator=' ', strip=True)
+
+
+if __name__ == '__main__':
+    # Manual smoke test: fetch one arXiv HTML paper and print the extracted text.
+    url = "https://browse.arxiv.org/html/2401.02117v1"
+    # The extraction function in this file is get_arxiv_paper_contents; the name
+    # previously called here is not defined and raised NameError.
+    result = get_arxiv_paper_contents(url)
+
+    print(result)
+
diff --git a/scripts/csv2md.py b/scripts/csv2md.py
@@ -18,8 +18,10 @@
 import re
 
 from tenacity import retry, stop_after_attempt, wait_random_exponential
+from content_retrieval import get_arxiv_paper_contents, get_reuters_article_content
 
 
+# a05825917ad14bc38d6d4152b5fae19a
 CATEGORIES = [
     'Top News',
     'Tools',
@@ -65,6 +67,8 @@ def get_article_category(row, excerpt):
         return row['Type']
 
     title, url = row['Name'], row['URL']
+    if 'arxiv' in url:
+        return 'Research'
 
     prompt = f'''
 Title: {title}
@@ -86,29 +90,36 @@ def get_article_category(row, excerpt):
 
 The user will provide the article title, link, and description. 
 After careful consideration, you will respond with ONLY the predicted article type, with no explanations, punctuation, formatting, or anything else.
-Please only respond with one of the above types (Business, Resesarch, Tools, Concerns, Policy, Analysis, Expert Opinions, Explainers, Fun).
+Only respond with one of the above types (Business, Research, Tools, Concerns, Policy, Analysis, Expert Opinions, Explainers, Fun).
 '''.strip()
     return query_openai([
         {'role': 'system', 'content': system_prompt},
-        {'role': 'user', 'content': prompt}
-    ])
+        {'role': 'user', 'content': prompt},
+    ], model='gpt-4')
 
 
 def get_news_article(url):
+    """
+    Fetches the text (and, for non-arXiv pages, the top image) of an article.
+
+    :param url: URL of the article; URLs containing 'arxiv' are converted with
+        arxiv_to_html and read with get_arxiv_paper_contents.
+    :return: Dict with 'text', 'top_image' and 'has_top_image' keys, or None
+        when the article could not be retrieved.
+    """
-    if 'arxiv' in url:
-        url = arxiv_to_huggingface(url)
-
     try:
-        article = Article(url)
-        article.download()
-        article.parse()
-        assert article.text
-        return {
-            'text': article.text,
-            'top_image': article.top_image,
-            'has_top_image': article.has_top_image()
-        }
-    except:
+        if 'arxiv' in url:
+            url = arxiv_to_html(url)
+            text = get_arxiv_paper_contents(url)
+            return {
+                'text': text,
+                'top_image': None,
+                'has_top_image': False
+            }
+        else:
+            article = Article(url)
+            article.download()
+            article.parse()
+            # Explicit check instead of assert, which is stripped under python -O
+            if not article.text:
+                raise ValueError('no text extracted')
+            return {
+                'text': article.text,
+                'top_image': article.top_image,
+                'has_top_image': article.has_top_image()
+            }
+    except Exception as e:
+        # Best effort: report the cause and let the caller skip this article.
+        # The f-string also copes with a non-string url, where '+' would raise again.
+        print(f'ERROR: not able to get text for URL {url}: {e}')
         return None
 
 
@@ -186,8 +197,8 @@ def get_output_file_name(n):
 def get_article_summary(title, news_article):
     system_prompt = '''
 You are an expert writer and commentator. 
-The user will give you an article, and you will write a short summary.
-The summary should be one paragraph long, contain key technical details, and be easy to understand. 
+The user will give you an article, and you will write a one paragraph summary.
+The summary should be one paragraph long, have at least four sentences, contain key technical details, and be easy to understand. 
 The summary should highlight key words and concepts from the article without abstracting them away. 
 The reader should clearly understand the key points from the article after reading your summary.
 '''.strip()
@@ -202,7 +213,7 @@ def get_article_summary(title, news_article):
         {'role': 'user', 'content': user_prompt}
     ]
 
-    return query_openai(messages, max_tokens=2000, model='gpt-3.5-turbo-1106')
+    return query_openai(messages, max_tokens=4000, model='gpt-4')
 
 
 def rank_articles(articles):
@@ -226,15 +237,6 @@ def rank_articles(articles):
 
     return json.loads(query_openai(messages, max_tokens=200))
 
-
-def arxiv_to_huggingface(url: str) -> str:
-    match = re.search(r"https://arxiv.org/abs/(\d+\.\d+)(?:v\d+)?", url)
-    if match:
-        return f"https://huggingface.co/papers/{match.group(1)}"
-    else:
-        return url
-
-
 def arxiv_to_html(url: str) -> str:
     paper_id = url[url.find('abs/') + 4:].strip('/').strip()
     if paper_id:
@@ -303,12 +305,16 @@ def get_newsletter_excerpt(top_news):
     csv = pd.read_csv(input_csv, encoding='utf-8')
     rows = []
     for row_num, row in csv.iterrows():
-        if 'arxiv' in row['URL']:
+        if 'arxiv' in row['URL'] and row['Name'].startswith('Title:'):
             # remove "Title:" from arxiv titles
             row['Name'] = row['Name'][6:]
 
         if 'youtube' in row['URL']:
             continue
+
+        if '?' in row['URL']:
+            row['URL'] = row['URL'].split("?")[0]
+
         rows.append(row)
 
     print('Getting news articles...')
@@ -376,7 +382,7 @@ def get_newsletter_excerpt(top_news):
                     article = articles[r]
                     summary = summaries[r]
                     title, url, news_article = article['title'], article['url'], article['news_article']
-                    
+
                     top_news += f'#### [{title}]({url})'
                     top_news += '\n'
 

diff --git a/scripts/parse_arxiv_html.py b/scripts/parse_arxiv_html.py