crawler.py
#!/usr/bin/env python
"""
Sets up a WebDriver session using a local copy of Tor Browser and Selenium via
geckodriver, then crawls the first search result for each term in searches.txt.
"""
import time
import pprint
import re
import csv
import json
import urllib.parse

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

import torDriver
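
# torDriver is the project-local helper module: the calls below assume it
# exposes a TorDriver class (downloadGeckodriver, setupTor, setupWebdriver)
# and a module-level isVisible(driver, selector) wait helper.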

timeoutInSeconds = 5
finalTimeoutInSeconds = 5
listOfSearches = []
termsToSearch = []
# For each term, a suffix can be appended to narrow the search.
suffix = ""

# Set up the Tor Browser and geckodriver
torDriverInstance = torDriver.TorDriver()
torDriverInstance.downloadGeckodriver()
torDriverInstance.setupTor()


def crawlFromSearch(searchTerm):
    """Crawl from a search term.

    :param searchTerm: the term to search for
    :return: a collection of dicts with the data and links
    """
    # Set up the webdriver
    driver = torDriverInstance.setupWebdriver()
    try:
        driver.get("https://searx.thegpm.org/")
        torDriver.isVisible(driver, "#q")
        elem = driver.find_element(By.CSS_SELECTOR, "#q")
        elem.clear()
        elem.send_keys(searchTerm)
        elem.send_keys(Keys.RETURN)
    except Exception as e:
        print(e)
        driver.quit()
        raise SystemError("Literally cannot even search")
    # Wait a few seconds for the results page to render
    time.sleep(timeoutInSeconds)
    try:
        torDriver.isVisible(driver, "#main_results > div.result.result-default > h4 > a")
        firstPageResults = driver.find_elements(By.CSS_SELECTOR, "#main_results > div.result.result-default > h4 > a")
    except Exception as e:
        print(e)
        driver.quit()
        raise SystemError("Issue browsing from search")
    if not firstPageResults:
        driver.quit()
        raise SystemError("Search returned no results")
    # Use just the first result for now.
    # Could start separate threads for each result here, but that can get aggressive.
    pprint.pprint(firstPageResults)
    thisUrl = firstPageResults[0].get_attribute('href')
    firstPageResults[0].click()
    # Wait for the page to load
    time.sleep(timeoutInSeconds)
    torDriver.isVisible(driver, "a")
    _url = urllib.parse.urlparse(thisUrl)
    print("Parsed url")
    pprint.pprint(_url)
    allUrls = driver.find_elements(By.XPATH, "//a")
    parsedCollection = []
    onPageUrls = []   # Reserved for splitting out on-page links (currently unused)
    offPageUrls = []  # Reserved for splitting out off-page links (currently unused)
    for url in allUrls:
        pprint.pprint(url.text)
        urlHref = url.get_attribute('href')
        pprint.pprint(urlHref)
        parsedObject = {
            "data": url.text,  # Any data as plain text or base64.
            "link": urlHref,   # Link to the data
            "parent": thisUrl  # Link to the parent page
        }
        parsedCollection.append(parsedObject)
    driver.quit()
    return parsedCollection
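

# torDriver.isVisible itself is not shown in this file. Given the WebDriverWait
# and expected_conditions imports above, a minimal sketch of such a helper (an
# assumption about its behavior, not the project's actual implementation):
def isVisibleSketch(driver, selector, timeout=timeoutInSeconds):
    """Illustrative only: block until `selector` is visible on the page."""
    return WebDriverWait(driver, timeout).until(
        expected_conditions.visibility_of_element_located(
            (By.CSS_SELECTOR, selector)))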


# Read in the search terms
with open("searches.txt", "r") as fileHandler:
    listOfSearches = fileHandler.readlines()

for searchTerm in listOfSearches:
    if searchTerm.strip():  # Skip blank lines (readlines keeps the "\n")
        # This regex is inspired by a discussion about substituting
        # characters that are not letters or numbers:
        # https://stackoverflow.com/a/5843547/682915
        strippedWithSpaces = re.sub(r'([^\s\w]|_)+', '', searchTerm).strip()
        strippedWithUnderbar = re.sub(r' ', '_', strippedWithSpaces)
        strippedLower = strippedWithUnderbar.lower().strip()
        termsToSearch.append({
            "name": strippedWithSpaces,
            "file": "./results/" + strippedLower + ".json",
            "search": ' '.join([strippedWithSpaces, suffix]).strip(),
            "orig": searchTerm,
            "num": "",
            "artifacts": None
        })
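
# Example of the normalization above: a searches.txt line of "Hello, World!"
# becomes name "Hello World", search "Hello World" (plus any suffix), and the
# output path "./results/hello_world.json".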

# For each term, search and then crawl
for term in termsToSearch:
    try:
        print("Starting search...")
        print(term["name"])
        # Crawl
        artifactsCollection = crawlFromSearch(term["search"])
        term["artifacts"] = artifactsCollection
    except Exception as e:
        print(f"Something bad happened looking for {term}")
        print(e)
        continue  # Skip to the next term
    print("Writing file...", "\n", term["file"])
    # Write to file
    with open(term["file"], "w") as txtFile:
        txtFile.write(json.dumps(term))
    print("Done writing file...", "\n", term["file"])