
Commit

Adding new post
andreykurenkov committed Jan 8, 2024
1 parent 484b4b1 commit bdf7822
Showing 7 changed files with 287 additions and 74 deletions.
184 changes: 184 additions & 0 deletions _posts/digests/2024-01-08-252.md

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
67 changes: 67 additions & 0 deletions scripts/content_retrieval.py
@@ -0,0 +1,67 @@
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict

def get_arxiv_paper_contents(url):
    def remove_unwanted_tags(content, tags_to_remove):
        """ Removes specified tags from the content but keeps their text. """
        for tag in tags_to_remove:
            for sub_tag in content.find_all(tag):
                sub_tag.replace_with(sub_tag.get_text())
        return content

    def extract_text_by_section_ordered(soup):
        extracted_text = OrderedDict()
        current_section = None

        for element in soup.descendants:
            if element.name == 'h2':
                current_section = element.get_text(strip=True)
                if current_section != "Abstract":
                    # Non-abstract h2 headings in arXiv HTML start with the section number
                    # (e.g. "1Introduction"), so drop the leading character.
                    current_section = current_section[1:]
                extracted_text[current_section] = ''
            elif element.name == 'p' and current_section:
                cleaned_element = remove_unwanted_tags(element, ['cite', 'a', 'span'])
                extracted_text[current_section] += cleaned_element.get_text(separator='', strip=False) + '\n'
        return extracted_text

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.title.string if soup.title else "Title not found"
    section_texts = extract_text_by_section_ordered(soup)
    abstract = section_texts.get('Abstract', 'Abstract not found')
    introduction = section_texts.get('Introduction', 'Introduction not found')

    return f"Title: {title}\n\nAbstract:\n{abstract}\n\nIntroduction:\n{introduction}"

def get_reuters_article_content(url):
    """
    Extracts the content of a Reuters article from its URL.
    :param url: URL of the Reuters article.
    :return: Plain text string of the article contents, without HTML tags.
    """
    # Fetch the HTML content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad status codes

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the div with a class that starts with "article-body__container"
    article_body = soup.find('div', class_=lambda value: value and value.startswith('article-body__container'))

    # Extract and return the text content, if the div is found
    if article_body:
        return article_body.get_text(strip=True)
    else:
        return "Article content not found."


if __name__ == '__main__':
    url = "https://browse.arxiv.org/html/2401.02117v1"
    result = get_arxiv_paper_contents(url)

    print(result)
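
For reference, a minimal usage sketch of the Reuters helper above; the URL below is hypothetical and only illustrates the expected input, assuming the script is importable as content_retrieval:

from content_retrieval import get_reuters_article_content

# Hypothetical article URL; any Reuters page with an "article-body__container" div should work.
article_url = "https://www.reuters.com/technology/example-ai-story-2024-01-08/"
body_text = get_reuters_article_content(article_url)
print(body_text[:500])  # preview the first 500 characters of the extracted text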

66 changes: 36 additions & 30 deletions scripts/csv2md.py
@@ -18,8 +18,10 @@
import re

from tenacity import retry, stop_after_attempt, wait_random_exponential
from content_retrieval import get_arxiv_paper_contents, get_reuters_article_content


# a05825917ad14bc38d6d4152b5fae19a
CATEGORIES = [
'Top News',
'Tools',
@@ -65,6 +67,8 @@ def get_article_category(row, excerpt):
return row['Type']

title, url = row['Name'], row['URL']
if 'arxiv' in url:
return 'Research'

prompt = f'''
Title: {title}
@@ -86,29 +90,36 @@ def get_article_category(row, excerpt):
The user will provide the article title, link, and description.
After careful consideration, you will respond with ONLY the predicted article type, with no explanations, punctuation, formatting, or anything else.
Please only respond with one of the above types (Business, Resesarch, Tools, Concerns, Policy, Analysis, Expert Opinions, Explainers, Fun).
Only respond with one of the above types (Business, Research, Tools, Concerns, Policy, Analysis, Expert Opinions, Explainers, Fun).
'''.strip()
return query_openai([
{'role': 'system', 'content': system_prompt},
{'role': 'user', 'content': prompt}
])
{'role': 'user', 'content': prompt},
], model='gpt-4')
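
# Note: query_openai is defined elsewhere in csv2md.py and is not part of this diff.
# Given the tenacity import above, a plausible shape (purely a sketch; the decorator
# arguments, defaults, and the OpenAI client call are assumptions, not the commit's code) is:
#
#     @retry(stop=stop_after_attempt(3), wait=wait_random_exponential(multiplier=1, max=60))
#     def query_openai(messages, max_tokens=500, model='gpt-3.5-turbo-1106'):
#         response = openai.ChatCompletion.create(
#             model=model, messages=messages, max_tokens=max_tokens)
#         return response['choices'][0]['message']['content']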


def get_news_article(url):
if 'arxiv' in url:
url = arxiv_to_huggingface(url)

try:
article = Article(url)
article.download()
article.parse()
assert article.text
return {
'text': article.text,
'top_image': article.top_image,
'has_top_image': article.has_top_image()
}
except:
if 'arxiv' in url:
url = arxiv_to_html(url)
text = get_arxiv_paper_contents(url)
return {
'text': text,
'top_image': None,
'has_top_image': False
}
else:
article = Article(url)
article.download()
article.parse()
assert article.text
return {
'text': article.text,
'top_image': article.top_image,
'has_top_image': article.has_top_image()
}
except Exception as e:
print('ERROR: not able to get text for URL '+url)
return None


@@ -186,8 +197,8 @@ def get_output_file_name(n):
def get_article_summary(title, news_article):
system_prompt = '''
You are an expert writer and commentator.
The user will give you an article, and you will write a short summary.
The summary should be one paragraph long, contain key technical details, and be easy to understand.
The user will give you an article, and you will write a one paragraph summary.
The summary should be one paragraph long, have at least four sentences, contain key technical details, and be easy to understand.
The summary should highlight key words and concepts from the article without abstracting them away.
The reader should clearly understand the key points from the article after reading your summary.
'''.strip()
@@ -202,7 +213,7 @@ def get_article_summary(title, news_article):
{'role': 'user', 'content': user_prompt}
]

return query_openai(messages, max_tokens=2000, model='gpt-3.5-turbo-1106')
return query_openai(messages, max_tokens=4000, model='gpt-4')


def rank_articles(articles):
@@ -226,15 +237,6 @@ def rank_articles(articles):

return json.loads(query_openai(messages, max_tokens=200))


def arxiv_to_huggingface(url: str) -> str:
match = re.search(r"https://arxiv.org/abs/(\d+\.\d+)(?:v\d+)?", url)
if match:
return f"https://huggingface.co/papers/{match.group(1)}"
else:
return url


def arxiv_to_html(url: str) -> str:
paper_id = url[url.find('abs/') + 4:].strip('/').strip()
if paper_id:
@@ -303,12 +305,16 @@ def get_newsletter_excerpt(top_news):
csv = pd.read_csv(input_csv, encoding='utf-8')
rows = []
for row_num, row in csv.iterrows():
if 'arxiv' in row['URL']:
if 'arxiv' in row['URL'] and row['Name'].startswith('Title:'):
# remove "Title:" from arxiv titles
row['Name'] = row['Name'][6:]

if 'youtube' in row['URL']:
continue

if '?' in row['URL']:
row['URL'] = row['URL'].split("?")[0]

rows.append(row)

print('Getting news articles...')
@@ -376,7 +382,7 @@ def get_newsletter_excerpt(top_news):
article = articles[r]
summary = summaries[r]
title, url, news_article = article['title'], article['url'], article['news_article']

top_news += f'#### [{title}]({url})'
top_news += '\n'

44 changes: 0 additions & 44 deletions scripts/parse_arxiv_html.py

This file was deleted.
