-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathreddit.py
77 lines (73 loc) · 2 KB
/
reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#reddit
import requests
import json
import time
class Reddit:
    """Crawl subreddits and collect cleaned comment text for lexengine processing.

    Results accumulate on the instance: ``rawText`` receives the cleaned
    comment text and ``usedReddits`` the names of the subreddits that were
    crawled.
    """

    # User-Agent header sent with every request.
    USER_AGENT = 'WordHound'
    # Seconds to wait before each per-thread request.
    REQUEST_DELAY = 1.75

    def __init__(self, listSubreddits):
        """Remember the subreddits to crawl and start with empty results.

        Args:
            listSubreddits: iterable of subreddit names; each one is
                substituted into a "/r/<name>/" URL.
        """
        self.subreddits = listSubreddits
        # Cleaned comment text gathered so far, space separated.
        self.rawText = ""
        # Names of the subreddits crawled so far, each followed by ", ".
        self.usedReddits = ""

    def crawl(self):
        """Fetch every configured subreddit in order."""
        for name in self.subreddits:
            self.fetchsubreddit(name)

    def _fetch_json(self, url):
        """Return the decoded JSON body served at ``url``, or None on failure.

        Network errors and undecodable bodies are treated as "no data" so a
        single bad response does not abort the crawl.
        """
        try:
            response = requests.get(url, headers={'User-Agent': self.USER_AGENT})
            return json.loads(response.content)
        except (requests.RequestException, ValueError):
            return None

    def fetchsubreddit(self, subreddit):
        """Download a subreddit's listing and append its comment text to rawText.

        Every thread in the listing is requested in turn (pausing
        ``REQUEST_DELAY`` seconds before each request) and the body of each
        comment found is cleaned and appended to ``self.rawText``. Threads
        that cannot be fetched or parsed are skipped. If the listing itself
        cannot be fetched the method returns without recording the subreddit.

        Args:
            subreddit: name of the subreddit to crawl.
        """
        try:
            listing_url = "http://www.reddit.com/r/{0}/.json".format(subreddit)
            print(listing_url)
            # _fetch_json returns None on failure, which raises TypeError here.
            threads = self._fetch_json(listing_url)['data']['children']
        except (UnicodeError, KeyError, TypeError):
            return

        permalinks = []
        for thread in threads:
            try:
                permalinks.append(thread['data']['permalink'])
            except (KeyError, TypeError):
                continue

        for permalink in permalinks:
            time.sleep(self.REQUEST_DELAY)
            try:
                parts = permalink.split("/")
                print("[-] Fetching thread {1} from: {0}".format(parts[2], parts[5]))
                # Permalinks normally end in "/"; strip it so the URL does not
                # contain "//" before ".json".
                thread_url = "http://www.reddit.com{0}/.json".format(permalink.rstrip("/"))
            except (UnicodeError, IndexError, AttributeError):
                # Unprintable or malformed permalink: skip this thread.
                continue

            thread_json = self._fetch_json(thread_url)
            if not isinstance(thread_json, list):
                # Error payloads are not lists; nothing to read from them.
                continue

            pieces = []
            # The first element is skipped, as before - presumably it is the
            # submission itself rather than its comments (TODO confirm).
            for listing in thread_json[1:]:
                try:
                    children = listing['data']['children']
                except (KeyError, TypeError):
                    continue
                for child in children:
                    try:
                        body = child['data']['body']
                    except (KeyError, TypeError):
                        continue
                    if body:
                        pieces.append(self.clean(body) + " ")
            self.rawText += "".join(pieces)

        self.usedReddits += subreddit + ", "
        # split() without arguments ignores the empty strings that repeated
        # spaces would otherwise add to the word count.
        print("[+] Fetched {0} words from {1}\n".format(len(self.rawText.split()), self.usedReddits))

    def clean(self, sentence):
        """Return ``sentence`` lowercased and stripped of links and most symbols.

        Words are separated on any whitespace, so words on adjacent lines are
        not glued together. Words containing "http" (case-insensitive) are
        dropped. From every remaining word only characters with code points
        96-126 are kept (backtick, a-z and ``{|}~``), and each word is
        emitted with a single leading space - even when nothing of it is left.

        Args:
            sentence: text of one comment.

        Returns:
            The cleaned text; an empty string when ``sentence`` has no words.
        """
        cleaned = []
        for word in sentence.split():
            word = word.lower()
            if "http" in word:
                continue
            # NOTE(review): the 96-126 range also keeps ` { | } ~ ; it may
            # have been meant to be a-z only - confirm before narrowing.
            cleaned.append(" " + "".join(c for c in word if 95 < ord(c) < 127))
        return "".join(cleaned)
#Now we need to iterate through each of the threads and pull comments