import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

URL = "https://www.indeed.com.sg/jobs?q=data+scientist&l=Singapore"

# Conducting a request of the stated URL above:
page = requests.get(URL)

# Specifying a desired format of "page" using the html parser - this allows
# Python to read the various components of the page, rather than treating it
# as one long string.
soup = BeautifulSoup(page.text, "html.parser")

# Printing soup in a more structured tree format that makes for easier reading:
# print(soup.prettify())
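
# Note (an assumption beyond the original script): Indeed may return an empty
# or blocked page to the default requests User-Agent. If the extractors below
# print empty lists, retrying with a browser-like header can help, e.g.:
#
#   headers = {"User-Agent": "Mozilla/5.0"}
#   page = requests.get(URL, headers=headers)
#   soup = BeautifulSoup(page.text, "html.parser")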
def extract_job_title_from_result(soup):
    jobs = []
    # Each search result sits in a div with class "row"; the job title is the
    # "title" attribute of its jobTitle anchor.
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
            jobs.append(a["title"])
    return jobs

print(extract_job_title_from_result(soup))

def extract_company_from_result(soup):
    companies = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        company = div.find_all(name="span", attrs={"class": "company"})
        if len(company) > 0:
            for b in company:
                companies.append(b.text.strip())
        else:
            # Some results name the source site instead of a company span.
            sec_try = div.find_all(name="span", attrs={"class": "result-link-source"})
            for span in sec_try:
                companies.append(span.text.strip())
    return companies

print(extract_company_from_result(soup))

def extract_location_from_result(soup):
    locations = []
    spans = soup.find_all("span", attrs={"class": "location"})
    for span in spans:
        locations.append(span.text)
    return locations

print(extract_location_from_result(soup))
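
# A minimal sketch (an addition; it assumes the three lists line up
# one-to-one, which only holds when every result row carries all three
# fields) of combining the extractors for a quick preview:
#
#   preview = pd.DataFrame({
#       "job_title": extract_job_title_from_result(soup),
#       "company_name": extract_company_from_result(soup),
#       "location": extract_location_from_result(soup),
#   })
#   print(preview.head())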
max_results_per_city = 100
city = "Singapore"
# Intended column order for each scraped row.
columns = ["city", "job_title", "company_name", "location", "summary", "salary"]
job_posts = []

# Indeed paginates results ten at a time via the "start" query parameter, so
# step through start = 0, 10, 20, ... up to max_results_per_city.
for start in range(0, max_results_per_city, 10):
    # get the url
    page = requests.get("https://www.indeed.com.sg/jobs?q=data+scientist&l="
                        + str(city) + "&start=" + str(start))
    time.sleep(1)  # pause between page requests to ease load on the server
    # page.text is already decoded, so BeautifulSoup needs no from_encoding.
    soup = BeautifulSoup(page.text, "lxml")
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        # Creating an empty list to hold the data for each posting.
        job_post = []
        # Appending city name.
        job_post.append(city)
        # Grabbing job title.
        for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
            job_post.append(a["title"])
        # Grabbing company name.
        company = div.find_all(name="span", attrs={"class": "company"})
        if len(company) > 0:
            for b in company:
                job_post.append(b.text.strip())
        else:
            sec_try = div.find_all(name="span", attrs={"class": "result-link-source"})
            for span in sec_try:
                job_post.append(span.text.strip())
        # Grabbing location name.
        c = div.find_all("span", attrs={"class": "location"})
        for span in c:
            job_post.append(span.text)
        # Grabbing summary text.
        d = div.find_all("span", attrs={"class": "summary"})
        for span in d:
            job_post.append(span.text.strip())
        # Grabbing salary: most postings put it in a <nobr> tag; otherwise
        # fall back to the first div inside the "sjcl" block.
        try:
            job_post.append(div.find("nobr").text)
        except AttributeError:
            try:
                div_two = div.find(name="div", attrs={"class": "sjcl"})
                div_three = div_two.find("div")
                job_post.append(div_three.text.strip())
            except AttributeError:
                job_post.append("Nothing_found")
        # Collecting the posting as a Series; rows are assembled into a
        # DataFrame after the loop.
        series_1 = pd.Series(job_post)
        job_posts.append(series_1)
        print(series_1)
# Assembling all postings into a single DataFrame.
job_posts = pd.DataFrame(job_posts)
# Label the columns when every row came back with all six fields; rows with
# missing or repeated fields change the frame's width, in which case the
# default integer labels stay.
if job_posts.shape[1] == len(columns):
    job_posts.columns = columns

# Saving the results as a local csv file - define your own local path to save contents.
job_posts.to_csv("/Users/shirley/Desktop/indeed_job.csv", encoding="utf-8")
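
# Usage note (an addition): the saved file can be reloaded later with pandas:
#
#   df = pd.read_csv("/Users/shirley/Desktop/indeed_job.csv", index_col=0)
#   print(df.head())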