-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapeit.js
46 lines (35 loc) · 1011 Bytes
/
scrapeit.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// Using https://github.com/sylvinus/node-crawler
var Crawler = require('Crawler'),
url = require('url'),
$ = require('cheerio');
// Running count of pages that were fetched and parsed successfully.
var pagesHit = 0;
// Local buffer for collected data (it could instead live in the Crawler's
// cache, be handed off to another consumer, etc.).
var DATA = [];
// Absolute URLs that have already been crawled or queued. Pages commonly link
// back to each other, so without this the same URL would be queued repeatedly.
var seenUrls = Object.create(null);

/**
 * Turn the raw href of an anchor into an absolute, crawlable URL.
 *
 * @param {string} pageUrl - Absolute URL of the page the anchor was found on.
 * @param {*} href - Raw value of the anchor's href attribute (may be undefined).
 * @returns {?string} Absolute http(s) URL with any #fragment removed, or null
 *   when the href is missing, blank, or not an http(s) link (mailto:,
 *   javascript:, ...).
 */
function resolveLink(pageUrl, href) {
  if (typeof href !== 'string') return null;
  var trimmed = href.trim();
  if (trimmed === '') return null;
  // Relative links are resolved against the current page; the fragment is
  // dropped because it addresses a position inside the same document.
  var absolute = url.resolve(pageUrl, trimmed).split('#')[0];
  return /^https?:\/\//i.test(absolute) ? absolute : null;
}

/**
 * The crawler instance. Its per-page callback counts the page, then queues
 * every not-yet-seen http(s) link found in the page's anchors and records it
 * in DATA.
 *
 * The callback deliberately does not declare a third `$` parameter: doing so
 * would shadow the module-level cheerio `$` that `$.load` relies on.
 */
var c = new Crawler({
  maxConnections : 1,
  callback : function (error, result) {
    if (error) {
      // Failed request: report it and move on instead of dereferencing `result`.
      console.error(error);
      return;
    }
    if (!result || !result.body) return; // disregard odd results

    var html; // cheerio handle for the parsed page
    try {
      html = $.load(result.body);
    } catch (e) {
      console.error(e);
      return; // exit this page without spreading
    }

    var pageUrl = result.request.uri.href;
    seenUrls[pageUrl] = true;
    pagesHit++;
    console.log('Crawled ' + pagesHit + ': ' + pageUrl);

    // Keep crawling
    html('a').each(function (index, a) {
      try {
        var queueUrl = resolveLink(pageUrl, html(a).attr('href'));
        if (queueUrl === null || seenUrls[queueUrl]) return;
        seenUrls[queueUrl] = true;
        // The original pushed to an undeclared `links`; DATA is the buffer
        // this file declares for collected data.
        DATA.push(queueUrl);
        c.queue(queueUrl);
      } catch (e) {
        // One bad link must not stop the remaining links on the page.
        console.error(e);
      }
    });
  }
});
// Fire it up.
// The start URL is taken from the first command-line argument, e.g.
//   node scrapeit.js http://example.com/
// (or hard-code it by replacing the '' fallback below).
var SITE_TO_VISIT = process.argv[2] || '';
if (SITE_TO_VISIT) {
  c.queue(SITE_TO_VISIT);
} else {
  // An empty string is not a fetchable URL, so refuse to queue it.
  console.error('Usage: node scrapeit.js <start-url>');
  process.exitCode = 1;
}