HostadviceCrawler.py
# BeautifulSoup and Selenium work well together: Selenium drives the browser,
# and BeautifulSoup parses the fetched HTML.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import requests
from bs4 import BeautifulSoup
import pandas as pd

# If another browser is preferred, download that browser's web driver, add it to PATH,
# and change .Chrome() to the browser that will be used.
driver = webdriver.Chrome()
driver.get("https://hostadvice.com/")
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
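# A desktop User-Agent is sent with the requests calls below so the plain HTTP
# fetches receive roughly the same markup the browser sees and are less likely
# to be served a blocked or stripped-down page.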
# Close the alert (advertisement) pop-up window.
alertwindow1 = WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//button[@id='onesignal-popover-cancel-button']")))
alertwindow1.click()
# Find the country input box.
elem = driver.find_element(By.XPATH, "//input[@id='filter-servers-country']")
# Enter the country whose hosting companies will be collected.
# Selecting from an autocompletion input box is tricky: type the country,
# wait briefly for the suggestions, then pick the first one with the arrow keys.
country = input("Input the country you want to search: ")
elem.send_keys(country)
time.sleep(1)
elem.send_keys(Keys.ARROW_DOWN)
elem.send_keys(Keys.RETURN)
time.sleep(5)
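# A fixed sleep is the simplest way to let the suggestion list and the results
# page load. A more robust (hypothetical) alternative is an explicit wait, e.g.:
#     WebDriverWait(driver, 10).until(
#         ec.visibility_of_element_located((By.CLASS_NAME, "ui-autocomplete")))
# The "ui-autocomplete" class name is an assumption and may need adjusting to
# match the actual markup of the suggestion list on hostadvice.com.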
# Find the total number of result pages (the last pagination link).
# Note: this absolute XPath is brittle and may break if the page layout changes.
pagenum = driver.find_element(By.XPATH, "/html[1]/body[1]/div[2]/div[1]/div[1]/div[1]/div[1]/div[5]/div[2]/a[2]").text
countryurl = driver.current_url
pagenum = int(pagenum)
#print(pagenum)
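# Hypothetical, more layout-tolerant alternative: read every pagination link and
# take the largest page number. The ".pagination" selector is an assumption and
# would have to be checked against the real page:
#     soup = BeautifulSoup(driver.page_source, 'html.parser')
#     nums = [int(a.get_text()) for a in soup.select('.pagination a') if a.get_text().isdigit()]
#     pagenum = max(nums) if nums else 1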
# Create two empty lists to store the names and domain links of the companies.
names = []
links = []

# Crawler that collects company names and domain links from one results page.
# The lists above are module-level, so repeated calls accumulate results.
def hostadvice(url):
    req = requests.get(url=url, headers=headers)
    html = req.text
    div_bf = BeautifulSoup(html, 'html.parser')
    div = div_bf.find_all('div', class_='text-links')
    a_bf = BeautifulSoup(str(div), 'html.parser')
    a_no = a_bf.find_all('a', rel='nofollow')
    for each in a_no:
        names.append(each.get('title'))
    #print(names)
    # Get the text inside each <a> tag, i.e. the company's domain link.
    for each in a_no:
        links.append(each.get_text())
    #print(links)
    total = {"Name": names, "Domain": links}
    df = pd.DataFrame(total)
    return df
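# Example usage (hypothetical, single page): countryurl captured above is the
# first results page, so one call builds a DataFrame for just that page:
#     first_page = hostadvice(countryurl)
#     print(first_page.head())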
# Collect names and domains from the first page to the last page.
pagemax = pagenum + 1
for i in range(1, pagemax):
    pageurl = countryurl + 'page/%i' % i
    time.sleep(5)
    # names/links are cumulative, so after the last page df holds every company.
    df = hostadvice(pageurl)
    #print(pageurl)
# Save the results to a .csv file named after the country that was entered.
df.to_csv('hostadvice%s.csv' % country)
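# Close the browser once the crawl is finished.
driver.quit()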