# multi_scraper_template.py

# IMPORTS
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import os

# Define the starting URL (replace with the page you want to scrape)
url = 'Your_URL'
# Output file for scraped results, shared by find_things and the main loop
path = 'c:/Users/godfreyantomarlin/Downloads/Python_projects/Web_Scraper'
file_name = os.path.join(path, 'things.txt')

# Gets HTML data from a web page and turns it into soup
def get_soup(url):
    # webdriver_manager downloads a chromedriver matching the installed Chrome
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)
    html_text = driver.page_source
    # Quit the browser once the page source is captured, so repeated calls
    # do not leave Chrome windows running
    driver.quit()
    soup = BeautifulSoup(html_text, 'html.parser')
    return soup
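
# If the scraper should run unattended, Chrome can be started headless.
# A minimal sketch, assuming Selenium 4's Options API (not part of the
# original template):
#
#   from selenium.webdriver.chrome.options import Options
#   options = Options()
#   options.add_argument('--headless=new')
#   driver = webdriver.Chrome(
#       service=Service(ChromeDriverManager().install()), options=options)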

# Gets the URL for the next page, or None when there is no next page
def get_next_page(soup):
    page = soup.find('tag', {'class': 'class name'})
    if page and not page.find('tag', {'something to find it': 'value'}):
        url = 'website starting url' + str(page.find('tag', {'identifier': 'value'})['href'])
        return url
    return None
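
# A hypothetical filled-in version, assuming a site whose pagination block is
# <nav class="pagination"> with an <a class="next"> link (names here are
# illustrative, not from the original template):
#
#   def get_next_page(soup):
#       nav = soup.find('nav', {'class': 'pagination'})
#       next_link = nav.find('a', {'class': 'next'}) if nav else None
#       if next_link:
#           return 'https://example.com' + next_link['href']
#       return None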

# Takes in soup and writes out all the info wanted from it
def find_things(soup):
    # Inspect the required web page and put the HTML tags and classes wanted inside quotes
    things = soup.find_all('div', class_='dkr2t82')
    # Loop over each thing in the list of things
    for index, thing in enumerate(things):
        # A list of requirements can be made here; each ".a" is a tag name and
        # can be changed or chained to go deeper into specific tags if needed
        requirement1 = thing.find('tag', class_='value').a.a.text
        if 'condition1' in requirement1:
            # This is where you pull out the values that you want to save
            address = thing.find('tag', class_='value').text.replace(' ', '')
            # Now find the links to those websites
            link = thing.find('tag', class_='value').a['href']
            # Save outputs with f.write instead of the usual print
            with open(file_name, 'a') as f:
                f.write(f"{address}: website.co.uk{link} \n \n")
            print(f"file saved: {index}")

# Brings the other functions together to check all pages of the website
def find_all_things(url):
    while url:
        soup = get_soup(url)
        find_things(soup)
        url = get_next_page(soup)
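
# When crawling many pages it is polite to pause between requests. A minimal
# sketch of the same loop with a randomised delay (the delay range is an
# assumption; tune it for the target site):
#
#   import random
#   while url:
#       soup = get_soup(url)
#       find_things(soup)
#       url = get_next_page(soup)
#       time.sleep(random.uniform(2, 5))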

# MAIN PROGRAM
if __name__ == '__main__':
    counter = 1
    # Loop indefinitely
    while True:
        # Write a header for this search into the shared output file
        with open(file_name, 'a') as f:
            f.write(f"\n \n Search Number {counter} \n")
        # Run the scraping program
        find_all_things(url)
        # Sleep between searches so the loop repeats forever at a gentle pace
        time_wait_min = 10
        print(f"waiting {time_wait_min} minutes...")
        time.sleep(time_wait_min * 60)
        counter += 1
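
# To stop the endless loop cleanly with Ctrl+C instead of a traceback, the
# while True block can be wrapped in a try/except (a sketch, not part of the
# original template):
#
#   try:
#       while True:
#           ...
#   except KeyboardInterrupt:
#       print('stopping scraper')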