-
Notifications
You must be signed in to change notification settings - Fork 0
/
backend.py
132 lines (111 loc) · 5.26 KB
/
backend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import requests
from bs4 import BeautifulSoup as BS
class Backend:
    """Scraper for Indeed job-search results.

    Builds a search URL from the constructor arguments, downloads one or
    more result pages, and parses each job card into a dict collected in
    ``self.joblist``.
    """

    def __init__(self, title, location, pages, country, distance, date):
        self.title = title          # job title / search keywords
        self.location = location    # e.g. "Toronto, ON"
        self.pages = int(pages)     # number of result pages to fetch
        self.country = country      # Indeed subdomain, e.g. "ca" -> ca.indeed.com
        # The next two hold UI placeholder strings when the filter is unset:
        self.distance = distance    # km radius, or placeholder "Distance in KM"
        self.date = date            # max posting age in days, or placeholder 'D'
        self.joblist = []           # accumulated job dicts

    def scrape(self):
        """Fetch every requested results page and return the list of job dicts."""
        combUrl = (
            f'https://{self.country}.indeed.com/jobs?q='
            + self.title.replace(" ", "+")
            + "&l=" + self.location.replace(", ", "%2C%20")
        )
        # Append each optional filter only when the user actually selected it
        # (the placeholder strings come straight from the UI widgets).  This
        # replaces the old four-way if/elif ladder that repeated the loop.
        if self.distance != "Distance in KM":
            combUrl = combUrl + "&radius=" + str(self.distance)
        if self.date != 'D':
            combUrl = combUrl + "&fromage=" + str(self.date)
        # Indeed paginates with &start=0,10,20,... (10 results per page).
        for start in range(0, self.pages * 10, 10):
            soup = self.extract(start, combUrl)
            self.transform(soup)
        return self.joblist

    def extract(self, page, combUrl):
        """GET one results page at offset *page* and return its parsed soup."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
        url = "{}&start={}".format(combUrl, page)
        print(url)
        # BUG FIX: headers must be passed as a keyword argument.  Passing the
        # dict positionally sent it as `params` (query string), so the
        # User-Agent header was never set and Indeed could serve a blocked page.
        r = requests.get(url, headers=headers)
        return BS(r.content, "html.parser")

    def transform(self, soup):
        """Parse every job card in *soup* into a dict appended to self.joblist."""
        ul = soup.find('ul', class_='jobsearch-ResultsList')
        if ul is None:
            # Layout changed or the request was blocked -- nothing to parse.
            return
        key = 1
        for item in ul.find_all('li'):  # one <li> per job-card shell
            try:
                # Title lives in <h2 class="jobTitle"><a>...</a></h2>.  Non-job
                # <li> shells (ads, spacers) lack these nodes; find() returns
                # None and the attribute access raises AttributeError.
                title = item.find('h2', class_="jobTitle").find('a').text
                company = item.find("span", class_="companyName").text.strip()
                location = item.find("div", class_="companyLocation").text.strip()
            except AttributeError:
                continue  # not a real job card -- skip it
            # Optional fields: each falls back to "" when absent.
            try:
                snippets = item.find_all("div", class_="attribute_snippet")
                # When two snippets are present the second one is the job type
                # (the first is typically the salary); otherwise take the first.
                jobType = snippets[1 if len(snippets) > 1 else 0].text.strip()
            except (AttributeError, IndexError):
                jobType = ""
            try:
                salary = item.find("div", class_="salary-snippet-container").text.strip()
            except AttributeError:
                salary = ""
            try:
                # data-jk is Indeed's job key; it rebuilds the canonical link.
                href = item.find('a').attrs["data-jk"]
                link = f'https://{self.country}.indeed.com/viewjob?jk={href}'
            except (AttributeError, KeyError):
                link = ""
            try:
                summary = item.find("div", class_="job-snippet").text.strip()
            except AttributeError:
                summary = ""
            try:
                # NOTE: strip("Posted") removes those *characters* from both
                # ends, which also eats the trailing 'o' of "ago"; the
                # replace() calls below undo that and drop other boilerplate.
                date = item.find("span", class_="date").text.strip("Posted")
                date = date.replace("Hiring ongoing", "")
                date = date.replace("ag", "ago")
                date = date.replace("EmployerActive ", "")
            except AttributeError:
                date = ""
            self.joblist.append({
                'key': key,
                'title': title,
                'company': company,
                'location': location,
                'type': jobType,
                'salary': salary,
                'jobLink': link,
                'summary': summary,
                "date": date,
            })
            key += 1