-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
37 lines (30 loc) · 1.28 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import scrapy
from datetime import datetime
class BrickSetScraper(scrapy.Spider):
    """Spider that scrapes LEGO set listings from brickset.com.

    Starts from the per-year listing pages for the current year and the
    five years before it, follows each listing's "next" pagination link,
    and yields one dict per set found on every page visited.
    """

    name = 'brickset_spider'

    # Inclusive year window to crawl. Evaluated once, when the class body
    # runs, so a long-lived process keeps the window it started with.
    CURRENT_YEAR = datetime.now().year
    START_YEAR = CURRENT_YEAR - 5
    END_YEAR = CURRENT_YEAR

    # One listing URL per year in [START_YEAR, END_YEAR], oldest first.
    start_urls = [
        "https://brickset.com/sets/year-" + str(year)
        for year in range(START_YEAR, END_YEAR + 1)
    ]

    def parse(self, response):
        """Yield an item per set on a listing page, then follow pagination.

        Args:
            response: Scrapy response for a brickset.com listing page.

        Yields:
            dict: One per element matching ``.set``, with the keys
                ``'name'``, ``'image'``, ``'pieces'`` and ``'minifigs'``.
                Each value is the first match of its selector, or ``None``
                when the selector matches nothing.
            scrapy.Request: A request for the next listing page (handled
                by this same method) when a "next" link is present.
        """
        SET_SELECTOR = '.set'
        NAME_SELECTOR = 'h1 ::text'
        IMG_SELECTOR = 'img ::attr(src)'
        # The XPaths are relative (".//") so they only search inside the
        # current set element: find the <dl> that has a <dt> labelled
        # "Pieces"/"Minifigs" and read the link text of a <dd> under it.
        PIECES_SELECTOR = './/dl[dt/text() = "Pieces"]/dd/a/text()'
        # NOTE(review): dd[2] presumably depends on the order of the
        # <dt>/<dd> pairs in the page markup -- verify against the live site.
        MINIFIGS_SELECTOR = './/dl[dt/text() = "Minifigs"]/dd[2]/a/text()'

        for brickset in response.css(SET_SELECTOR):
            yield {
                'name': brickset.css(NAME_SELECTOR).extract_first(),
                'image': brickset.css(IMG_SELECTOR).extract_first(),
                'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
                'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
            }

        # Pagination is handled once per page, after all sets are emitted.
        NEXT_PAGE_SELECTOR = '.next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        if next_page:
            # The href may be relative; resolve it against the page URL.
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )