-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.py
98 lines (77 loc) · 2.8 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import os,re,sys
import requests
import urllib.parse
# Word-cloud generator for Project Gutenberg books.
#
# Usage: python script.py [book name words ...]
# The book's plain text is saved as book.txt and the rendered cloud as
# image.png, both in the current working directory.

# Title searched for when no book name is given on the command line.
DEFAULT_BOOK_NAME = "as a man thinketh"
# File the downloaded plain text is written to and later read from.
BOOK_PATH = "book.txt"
# File name of the rendered word cloud (created in the working directory).
IMAGE_NAME = "image.png"
GUTENBERG_URL = "https://www.gutenberg.org"
# Seconds to wait on each HTTP request so a stalled server cannot hang the script.
REQUEST_TIMEOUT = 30

# First hit on the search results page; captures the numeric ebook id.
# NOTE(review): depends on Gutenberg's current HTML markup - confirm if the
# search stops finding books.
_SEARCH_RESULT_RE = re.compile(
    r'<li class="booklink">\s*<a class="link" href="/ebooks/(\d+)'
)
# A "[TXT]" row of the files directory listing; captures the .txt file name.
# [^"]+ keeps the capture inside a single href attribute.
_TXT_LINK_RE = re.compile(r'alt="\[TXT\]"></td><td><a href="([^"]+\.txt)">')
# Marker lines that delimit the book body inside a Gutenberg text file.
# " *" also accepts the variant written without a space after the asterisks.
_BODY_START_RE = re.compile(
    r"\*\*\* *START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK"
)
_BODY_END_RE = re.compile(
    r"\*\*\* *END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK"
)


def resolve_book_name(args):
    """Return the book title to search for.

    Args:
        args: Command-line words (``sys.argv[1:]``); they are joined with
            single spaces to form the title.

    Returns:
        The joined title, or ``DEFAULT_BOOK_NAME`` when ``args`` is empty
        or contains only whitespace.
    """
    return " ".join(args).strip() or DEFAULT_BOOK_NAME


def download_book(book_name, destination=BOOK_PATH):
    """Search Project Gutenberg for ``book_name`` and save its plain text.

    The first search hit is used. Its files directory listing is scanned
    for a ``.txt`` link, and that file is written to ``destination`` as
    raw bytes.

    Args:
        book_name: Free-text search query.
        destination: Path of the file to (over)write with the book text.

    Returns:
        True when the text was downloaded and written; False when the
        search had no usable hit, the listing had no ``.txt`` link, or the
        final download returned an error status. ``destination`` is left
        untouched in every False case.

    Exceptions raised by ``requests`` (connection errors, timeouts)
    propagate to the caller.
    """
    search_url = (
        GUTENBERG_URL
        + "/ebooks/search/?"
        + urllib.parse.urlencode({"query": book_name})
    )
    search_page = requests.get(search_url, timeout=REQUEST_TIMEOUT)
    hit = _SEARCH_RESULT_RE.search(search_page.text)
    if not hit:
        print("No ebook found for '" + book_name + "'", file=sys.stderr)
        return False

    book_number = hit.group(1)
    print("FOUND!!! Ebook number " + book_number)

    files_url = GUTENBERG_URL + "/files/" + book_number
    listing = requests.get(files_url, timeout=REQUEST_TIMEOUT)
    txt_entry = _TXT_LINK_RE.search(listing.text)
    if not txt_entry:
        print("No .txt file listed at " + files_url, file=sys.stderr)
        return False

    txt_link = files_url + "/" + txt_entry.group(1)
    print("Link: " + txt_link)

    book = requests.get(txt_link, timeout=REQUEST_TIMEOUT)
    if not book.ok:
        # Do not replace a usable book.txt with an HTTP error page.
        print(
            "Download failed with HTTP status " + str(book.status_code),
            file=sys.stderr,
        )
        return False

    # "wb" truncates any previous file, so no separate delete is needed.
    with open(destination, "wb") as out_file:
        out_file.write(book.content)
    print("Downloaded")
    return True


def strip_gutenberg_boilerplate(text):
    """Return ``text`` without the Project Gutenberg header and footer.

    Everything up to and including the line holding the "*** START OF ..."
    marker is dropped, then everything from the line holding the
    "*** END OF ..." marker onwards (together with the newline before that
    line). A marker that is absent leaves the corresponding side untouched.
    """
    start = _BODY_START_RE.search(text)
    if start:
        line_end = text.find("\n", start.end())
        # A marker on the final line means there is no body after it.
        text = text[line_end + 1:] if line_end != -1 else ""

    end = _BODY_END_RE.search(text)
    if end:
        line_start = text.rfind("\n", 0, end.start())
        # A marker on the first line means there is no body before it.
        text = text[:line_start] if line_start != -1 else ""
    return text


def lowercase_words(text):
    """Return the whitespace-separated words of ``text``, lower-cased and
    joined by single spaces."""
    return " ".join(word.lower() for word in text.split())


def load_book_words(path=BOOK_PATH):
    """Read the book at ``path`` and return its body as lower-cased words.

    The file is decoded as UTF-8 (a leading byte-order mark is skipped);
    undecodable bytes are replaced rather than aborting, so texts in other
    encodings still yield a word cloud.
    """
    with open(path, "r", encoding="utf-8-sig", errors="replace") as book_file:
        content = book_file.read()
    return lowercase_words(strip_gutenberg_boilerplate(content))


def save_wordcloud(words, image_path):
    """Render ``words`` as an 800x800 word cloud and save it to ``image_path``.

    The word cloud package's default stop words are excluded.
    """
    cloud = WordCloud(
        width=800,
        height=800,
        background_color="white",
        stopwords=set(STOPWORDS),
        min_font_size=10,
    ).generate(words)

    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig(image_path)


def main(argv=None):
    """Download the requested book and write its word cloud image.

    Args:
        argv: Command-line words naming the book; defaults to
            ``sys.argv[1:]``.
    """
    book_name = resolve_book_name(sys.argv[1:] if argv is None else argv)
    print("bookname: '" + book_name + "'")

    if not download_book(book_name):
        if not os.path.exists(BOOK_PATH):
            sys.exit("No book text available: download failed and "
                     + BOOK_PATH + " does not exist")
        # A book.txt from an earlier run is still on disk; use it, but say so.
        print("Using existing " + BOOK_PATH, file=sys.stderr)

    save_wordcloud(load_book_words(), os.path.join(os.getcwd(), IMAGE_NAME))


if __name__ == "__main__":
    main()