Crawler.py
import logging
import urllib
import Queue
import urlparse
import re
import operator
from time import sleep
from Scraper import ScrapeThread
from Worker import WorkThread
import sys

class Crawler:
    def __init__(self, base, proxy=False, proxy_port=False, robots=False):
        # Work queues shared between the scraper and worker threads.
        self.url_queue = Queue.Queue()
        self.html_queue = Queue.Queue()
        self.sqli_queue = Queue.Queue()
        self.visited_queue = Queue.Queue()
        self.forms_queue = Queue.Queue()
        self.base = base
        self.convert_base()
        self.robots = robots
        self.proxy = proxy
        self.proxy_port = proxy_port

    def check_proxy(self):
        # Compare the proxied and direct egress IPs so the user can confirm
        # the proxy is actually in use before crawling starts.
        proxy = {}
        proxy['http'] = 'http://' + str(self.proxy) + ":" + str(self.proxy_port)
        html = urllib.urlopen('http://icanhazip.com', proxies=proxy).read()
        html_no_proxy = urllib.urlopen('http://icanhazip.com').read()
        print "####Checking Proxy####"
        print "Using IP: " + html.strip()
        print "Original IP: " + html_no_proxy.strip()
        sleep(3)

    def convert_base(self):
        # Normalize the target into a ParseResult, defaulting to http://.
        if not self.base.startswith('http'):
            self.base = "http://" + self.base
        self.base = urlparse.urlparse(self.base)

    def update_status(self):
        sys.stdout.write("\rVisited: %d | Pending: %d" % (self.visited_queue.qsize(), self.url_queue.qsize()))
        sys.stdout.flush()

    def spawn_threads(self):
        # One worker thread parses fetched pages; five scraper threads fetch URLs.
        worker = WorkThread(self.html_queue, self.url_queue, self.base, self.sqli_queue, self.forms_queue)
        worker.setDaemon(True)
        worker.start()
        scrapers = []
        for i in range(5):
            t = ScrapeThread(self.url_queue, self.html_queue, self.visited_queue, self.proxy, self.proxy_port, worker)
            t.setDaemon(True)
            t.start()
            scrapers.append(t)
        while worker.isAlive():
            self.update_status()
            sleep(0.1)
        sys.stdout.write("\rKilling Scrapers..........")
        sys.stdout.flush()
        for thread in scrapers:
            thread.join()
    def start(self):
        # Seed the crawl with the base URL (and robots.txt if requested),
        # then run the threads and print a summary when the worker finishes.
        self.url_queue.put(self.base)
        if self.proxy:
            self.check_proxy()
        if self.robots:
            self.url_queue.put(urlparse.urlparse(self.base.geturl() + "/robots.txt"))
        self.spawn_threads()
        sys.stdout.write("\r")
        sys.stdout.flush()
        self.status()
    def status(self):
        # Drain the result queues and print everything collected during the crawl.
        print "##################################"
        print "###########Drone Status###########"
        print "##################################"
        print "urls visited: " + str(self.visited_queue.qsize())
        print "possible sqli found: " + str(self.sqli_queue.qsize())
        print
        print "##URLs with GET Params##"
        while not self.sqli_queue.empty():
            i = self.sqli_queue.get()
            print "get: " + i.geturl()
            self.sqli_queue.task_done()
        print
        print "###Forms Collected###"
        while not self.forms_queue.empty():
            print "form:", self.forms_queue.get()
            self.forms_queue.task_done()
        print
        print "###Pages Visited###"
        while not self.visited_queue.empty():
            print self.visited_queue.get().geturl()
            self.visited_queue.task_done()
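
# Example usage (a minimal sketch, not part of the original file; the target
# host below is a placeholder, and proxy/proxy_port can be passed the same way):
if __name__ == '__main__':
    crawler = Crawler('example.com', robots=True)
    crawler.start()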