forked from Cl0udG0d/SZhe_Scan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpiderGetUrl.py
110 lines (95 loc) · 3.59 KB
/
SpiderGetUrl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from lxml import etree
import threading
import time
import core
import urllib3
import ImportToRedis
import redis
from Init import redispool
'''
因为每深入一层,链接数增大很多,所以截止层数暂定为2,添加多线程之后将层数提高
爬取截止条件为:层数为2,或者队列中无新的链接
返回链接列表
参考链接:
https://www.hss5.com/2018/11/28/python%E7%88%AC%E5%8F%96%E7%BD%91%E7%AB%99%E5%85%A8%E9%83%A8url%E9%93%BE%E6%8E%A5/
https://ask.hellobi.com/blog/bixtcexs/11983
https://lskreno.vip/2019/09/15/%E7%88%AC%E8%99%AB%E4%B9%8B%E6%88%98/
https://github.com/sml2h3/python_collect_domain/blob/master/collect.py
'''
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# domain为传入网址网址
def SortOut(domain, redispool):
redispool.delete("lists")
for url in redispool.smembers("new_lists"):
url = str(url).strip()
if type(url) == list:
continue
url = url.strip()
if not url.startswith("http://") and not url.startswith("https://") and ("javascript:" not in url) and ("#" not in url) \
and not url.endswith("css") and not url.endswith("js"):
if not domain.endswith("/") and not url.startswith("/"):
url = domain + "/" + url
else:
url = domain + url
if not url.startswith("http://") and not url.startswith("https://"):
url = "http://" + url
if domain not in url:
continue
redispool.sadd("lists", url)
redispool.delete("new_lists")
for url in redispool.smembers("lists"):
redispool.sadd(domain, url)
redispool.delete("lists")
def Spider(domain, redispool):
url = redispool.spop(domain)
try:
rep = core.gethtml(url, timeout=1)
rep = etree.HTML(rep)
url_list = rep.xpath('//*[@href]/@href')
for new_url in url_list:
redispool.sadd("new_lists", new_url)
except Exception as e:
pass
'''
利用三个列表进行有层次地广度遍历url:all_lists储存所有获取到的url,new_lists储存这一层遍历时获取到的所有新的url,old_lists储存上一层的所有url
用于下层的遍历
'''
class Spyder(threading.Thread):
def __init__(self, func, domain, args):
threading.Thread.__init__(self)
self.args = args
self.func = func
self.domain = domain
self.result = self.func(self.domain, self.args)
def run(self):
self.func(self.domain, self.args)
def get_result(self):
return self.result
def depth_get(domain):
redispool.delete(domain)
redispool.sadd(domain, domain)
redispool.delete("new_lists")
threads = []
count = 0
while count < 2:
redispool.delete("new_url_list")
count += 1
print("第%d层" % count + 20 * "=")
try:
if count == 1:
Spider(domain, redispool)
else:
while redispool.scard(domain) != 0:
for i in range(1, 31):
t = Spyder(Spider, domain, redispool)
threads.append(t)
t.start()
for t in threads:
t.join()
time.sleep(0.2)
except Exception:
pass
SortOut(domain, redispool)
if __name__ == '__main__':
redispool = redis.Redis(connection_pool=ImportToRedis.redisPool)
depth_get("testphp.vulnweb.com", redispool)