build_subreddits.py
#!/usr/bin/env python
# Python 2 script; requires lxml 2.0.3 and httplib2. Public domain.
# Builds the subreddits.pickle file that subreddits.py reads.
# Intended to be rerun every month or so to refresh subreddit stats.
# Derived from the subreddit scraper at http://pastie.org/pastes/804537
# NSFW subreddits are filtered automatically, because anonymous scraping
# of /reddits/ does not include them.
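#
# For the monthly refresh, a cron entry along these lines would work
# (the path is illustrative, not part of this repo):
#
#   0 4 1 * * python /path/to/build_subreddits.py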
import os
import urlparse
import cPickle as pickle

import httplib2
import lxml.html.soupparser

first_uri = 'http://www.reddit.com/reddits/'
def get_page(uri):
    """Fetch a URI and parse the HTML into an lxml element tree."""
    print 'Processing %s' % uri
    http = httplib2.Http()
    response, content = http.request(uri)
    return lxml.html.soupparser.fromstring(content)
def fetch_reddits():
    """Walk the paginated /reddits/ listing and collect each subreddit's stats."""
    reddit_list = []
    current_uri = first_uri
    while True:
        page = get_page(current_uri)
        reddits = page.xpath('//div[contains(@class, \'subreddit\')]')
        for reddit in reddits:
            info = reddit.xpath('descendant::a[@class=\'title\']')[0]
            name = info.text or info.attrib['href']
            uri = urlparse.urljoin(current_uri, info.attrib['href'])
            try:
                description = reddit.xpath('descendant::p[@class=\'description\']/text()')[0]
            except IndexError:
                description = None
            try:
                subscribers = reddit.xpath('.//span[@class=\'score unvoted\']/span[@class=\'number\']/text()')[0].split()[0].replace(',', '')
            except IndexError:
                # Some listings omit a subscriber count; mark those with -1.
                subscribers = -1
            reddit_list.append(
                {
                    "name": name,
                    "uri": uri,
                    "description": description,
                    "subscribers": int(subscribers)
                }
            )
        print "Processed %s reddits." % len(reddit_list)
        try:
            # Follow the "next" pagination link; stop when there isn't one.
            next_link = page.xpath('//p[@class=\'nextprev\']/a[contains(text(),\'next\')]')[0]
            current_uri = urlparse.urljoin(current_uri, next_link.attrib['href'])
        except IndexError:
            break
    return reddit_list
if __name__ == '__main__':
    reddits = fetch_reddits()
    # Sort by subscriber count, most popular first.
    reddits.sort(key=lambda reddit: reddit["subscribers"], reverse=True)
    # Write the pickle next to this script; open in binary mode for pickling.
    cwd = os.path.realpath(os.path.dirname(__file__))
    with open(os.path.join(cwd, 'subreddits.pickle'), 'wb') as f:
        pickle.dump(reddits, f)
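
# For reference, a minimal sketch (not part of the original script) of how a
# consumer such as subreddits.py might load the pickle back; load_subreddits
# and its default path are assumptions for illustration:
#
#   import cPickle as pickle
#
#   def load_subreddits(path='subreddits.pickle'):
#       # Returns the list of subreddit dicts, most-subscribed first.
#       with open(path, 'rb') as f:
#           return pickle.load(f)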