-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsummarize.py
69 lines (61 loc) · 2.05 KB
/
summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Wikipedia summarizer
import bs4 as bs
import urllib.request
import re
from gensim.summarization import summarize as su_gs
from gensim.summarization import keywords
from gensim.summarization import mz_keywords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sys import argv
import os
def print_usage():
    """Print the command-line usage text for this script to stdout.

    Takes no arguments and returns None; the text describes the two
    positional parameters the script expects.
    """
    usage_text = '''
Usage:
summarize.py <wiki-url> <summary length>
Explanation:
Parameter 1: Wikipedia URL to pull
Parameter 2: the number of words for the summary to contain
'''
    print(usage_text)
def summarize(url_topull, num_of_words):
    """Download a page, summarize its paragraph text, and print the result.

    The text of every ``<p>`` element on the page is concatenated, keywords
    are extracted from it with ``mz_keywords``, a summary is produced with
    gensim's ``summarize`` (imported as ``su_gs``), and both are handed to
    ``print_pretty``.

    Args:
        url_topull: URL of the page to fetch; passed straight to
            ``urllib.request.urlopen``.
        num_of_words: Forwarded to the summarizer as ``word_count``.

    Returns:
        None. The summary and keywords are printed, not returned.

    Raises:
        Whatever ``urlopen`` raises when the URL cannot be fetched; no
        network error is handled here.
    """
    # Obtain text. The context manager closes the HTTP response as soon as
    # the body has been read, even if read() raises.
    with urllib.request.urlopen(url_topull) as response:
        raw_html = response.read()
    parsed_article = bs.BeautifulSoup(raw_html, 'lxml')
    article_text = "".join(p.text for p in parsed_article.find_all('p'))

    # Extract keywords. Element 0 of each scored entry is the keyword text;
    # English stop words and keywords of one or two characters are dropped.
    # The result is named scored_keywords so it does not shadow the
    # module-level `keywords` import.
    stop_words = set(stopwords.words('english'))
    scored_keywords = mz_keywords(article_text, scores=True, threshold=0.003)
    keywords_names = [
        entry[0]
        for entry in scored_keywords
        if entry[0] not in stop_words and len(entry[0]) > 2
    ]

    # Create summary, then strip "(...)" / "[...]" spans. The pattern is the
    # same as before, written as a raw string so "\(" and "\[" are not
    # treated as (invalid) string escape sequences. "." does not match a
    # newline, so only spans that open and close on one line are removed.
    pre_summary = su_gs(article_text, word_count=num_of_words)
    summary = re.sub(r"[\(\[].*?[\)\]]", "", pre_summary)

    # Print
    print_pretty(summary, keywords_names)
def print_pretty(summary, keywords_names, columns=None):
    """Print the summary and keywords inside a terminal-width banner.

    Output is seven lines: a "=" rule, the centered title, a "-" rule, the
    summary, a "-" rule, the keywords, and a closing "=" rule.

    Args:
        summary: Summary text to display.
        keywords_names: Keywords to display; rendered with ``str()`` and with
            the surrounding square brackets stripped.
        columns: Width of the banner in characters. When None (the default)
            the width of the terminal attached to stdout is used, falling
            back to 80 if stdout is not a terminal.

    Returns:
        None.
    """
    if columns is None:
        try:
            columns = os.get_terminal_size().columns
        except OSError:
            # stdout is piped or redirected, so there is no terminal to
            # query; use a conventional width instead of crashing.
            columns = 80

    heavy_rule = "=" * columns
    light_rule = "-" * columns

    print(heavy_rule)
    print("wiki-summarizer-----written-by-@brucewlee(github)".center(columns))
    print(light_rule)
    # NOTE: center() pads the string as a whole, so a summary that is longer
    # than `columns` is printed unchanged rather than centered line by line.
    print(summary.center(columns))
    print(light_rule)
    print(str(keywords_names).strip('[]').center(columns))
    print(heavy_rule)
if __name__ == '__main__':
    # Expect exactly two arguments: the URL and a non-negative integer word
    # count. str.isdecimal() is used instead of str.isdigit() because
    # isdigit() also accepts characters such as superscript digits ("²")
    # that int() rejects with ValueError.
    if len(argv) == 3 and argv[2].isdecimal():
        summarize(argv[1], int(argv[2]))
    else:
        print_usage()