-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl_by_url_hotel.js
178 lines (162 loc) · 6.22 KB
/
crawl_by_url_hotel.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
//version 2.0 release date 200509
//(hotel特別版本) 根據關鍵字爬谷歌地圖的商家資料輸出成csv表
// Rows collected so far; each row is an array of cell values and row[0] is the store name.
let rows = [];
// Zero-based index of the next store to crawl (note: counting starts at 0).
let store_num = 0;

/**
 * Runs a single step of the crawl. Depending on what the page currently shows it
 * either finishes/paginates, opens the next store from the result list, or
 * scrapes the store detail page that is open.
 */
function crawl_tick() {
    const STORE_TITLE_XPATH = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[1]/h1';
    const RANGE_END_XPATH = '//*[@id="pane"]/div/div[1]/div/div/div[5]/div[2]/div/div[1]/span/span[2]';

    // Presumably the number of the last result shown in the list; stays null
    // whenever that element is not on the page.
    const range_end_node = getElementByXpath(RANGE_END_XPATH);
    const last_row_num = range_end_node ? range_end_node.innerHTML - 0 : null;

    // Reached the last item of the list: a multiple of 20 triggers the next
    // page, anything else ends the crawl.
    if (store_num === last_row_num) {
        if (last_row_num % 20 === 0) {
            change_page();
        } else {
            the_end();
        }
        return;
    }

    // Not on a store detail page: open the next store from the list.
    const title_node = getElementByXpath(STORE_TITLE_XPATH);
    if (!title_node) {
        choose_which_store_to_crawl(store_num % 20);
        return;
    }

    // On a store detail page. If it still shows the store recorded last,
    // do nothing this tick so the same store is not added twice.
    if (rows.length !== 0 && rows[rows.length - 1][0] === title_node.innerHTML) {
        return;
    }

    const row = get_store_info_row();
    // -1 marks a duplicate store; it is skipped but still counted.
    if (row !== -1) {
        rows.push(row);
    }
    store_num++;
}

// Handle of the polling timer (one step every 2 seconds); killInterval() clears it.
let Interval = setInterval(crawl_tick, 2000);
/**
 * Evaluates an XPath expression against the current document and returns the
 * first matching node.
 *
 * @param {string} path - XPath expression to evaluate.
 * @returns {*} The `singleNodeValue` of the evaluation result (falsy when
 *     nothing matched, which callers rely on for existence checks).
 */
function getElementByXpath(path) {
    const firstMatch = document.evaluate(
        path,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
    );
    return firstMatch.singleNodeValue;
}
/**
 * Scrapes the store detail page that is currently open into one row, then
 * clicks the back button to return to the result list.
 *
 * Row layout: [0] name, [1] rating, [2] type, [3] price, [4] star class
 * (presumably the hotel class), [5] mobile phone, [6] website, followed by the
 * remaining detail lines found in pane sections 14-20. Missing values are
 * filled with the placeholder '(空)'.
 *
 * @returns {Array<string>|number} The scraped row, or -1 when a store with the
 *     same name is already present in `rows`.
 */
function get_store_info_row() {
    const PANE = '//*[@id="pane"]/div/div[1]/div/div';
    const EMPTY = '(空)';

    // innerHTML of the node at `xpath`, or the placeholder when it is absent.
    const html_or_empty = (xpath) => {
        const node = getElementByXpath(xpath);
        return node ? node.innerHTML : EMPTY;
    };
    // Leaves the detail page so the next tick sees the result list again.
    const go_back = () => getElementByXpath(PANE + '/button/span').click();

    // Name (column 0)
    const name = getElementByXpath(PANE + '/div[2]/div[1]/div[1]/h1').innerHTML;

    // Compare against the stored names by value. (The `in` operator would test
    // array indices, not contents, so it can never detect a duplicate name.)
    if (rows.some((existing) => existing[0] === name)) {
        // Still navigate back; otherwise the crawler would stay on this page
        // and keep skipping stores.
        go_back();
        return -1;
    }

    const row = [
        name,
        // Rating (column 1)
        html_or_empty(PANE + '/div[2]/div[1]/div[2]/div/div[1]/span[1]/span/span'),
        // Type (column 2)
        html_or_empty(PANE + '/div[2]/div[1]/div[2]/div/div[2]/span[1]/span[1]/button'),
        // Price (column 3)
        html_or_empty(PANE + '/div[1]/div[4]/div/button/div/jsl[2]/div[2]/span'),
        // Star class (column 4)
        html_or_empty(PANE + '/div[2]/div[1]/div[2]/div/div[1]/span[2]/span/span[2]/span[2]/span[1]/span'),
        // Mobile phone (column 5) - default until a phone-like detail line is found
        '沒有手機',
        // Website (column 6) - default until a URL-like detail line is found
        '沒有網址',
    ];

    // Pane sections 14-20 hold free-form detail lines. Lines recognised as a
    // Facebook link, phone number or URL are routed into columns 5/6; anything
    // else is appended as an extra column.
    for (let section = 14; section <= 20; section++) {
        const node = getElementByXpath(PANE + '/div[' + section + ']/div/div[1]/span[3]/span[3]');
        if (!node) {
            row.push(EMPTY);
            continue;
        }
        const text = node.innerHTML;
        if (isFB(text)) {
            row[6] = 'https://www.facebook.com/search/pages/?q=' + name;
        } else if (isPhoneNumber(text)) {
            row[5] = text;
        } else if (isUrl(text)) {
            row[6] = text;
        } else {
            row.push(text);
        }
    }

    go_back();
    return row;
}
/**
 * Clicks an entry of the result list to open its detail page.
 *
 * @param {number} row_num - Zero-based position of the entry on the current
 *     page; entry k is addressed as child `div[2k + 1]` of the list container.
 */
function choose_which_store_to_crawl(row_num) {
    const child_position = 2 * row_num + 1;
    const entry_xpath = `//*[@id="pane"]/div/div[1]/div/div/div[5]/div[1]/div[${child_position}]/div[1]`;
    const entry = getElementByXpath(entry_xpath);
    entry.click();
}
/**
 * Clicks the pagination "next" button, looked up by its element id.
 */
function change_page() {
    const next_button = getElementByXpath('//*[@id="n7lv7yjyC35__section-pagination-button-next"]');
    next_button.click();
}
/**
 * Serialises `rows` as CSV and offers the result to the user as a download.
 *
 * Each row becomes one line terminated by '\n'. `null` and `undefined` cells
 * are written as empty fields, `Date` cells via `toLocaleString()`, and
 * everything else via `toString()`. Fields containing a double quote, comma,
 * CR or LF are wrapped in double quotes, with embedded quotes doubled.
 *
 * @param {string} filename - Suggested name for the downloaded file.
 * @param {Array<Array<*>>} rows - Rows of cell values.
 */
function exportToCsv(filename, rows) {
    // Converts one cell value into an escaped CSV field.
    const to_csv_field = (value) => {
        let text;
        if (value === null || value === undefined) {
            // Guard undefined as well as null; calling toString() on it would throw.
            text = '';
        } else if (value instanceof Date) {
            text = value.toLocaleString();
        } else {
            text = value.toString();
        }
        const escaped = text.replace(/"/g, '""');
        return /[",\r\n]/.test(escaped) ? '"' + escaped + '"' : escaped;
    };

    const csvFile = Array.from(
        rows,
        (row) => Array.from(row, (cell) => to_csv_field(cell)).join(',') + '\n'
    ).join('');

    const blob = new Blob([csvFile], { type: 'text/csv;charset=utf-8;' });
    if (navigator.msSaveBlob) { // IE 10+
        navigator.msSaveBlob(blob, filename);
        return;
    }

    const link = document.createElement('a');
    if (link.download !== undefined) { // feature detection
        // Browsers that support the HTML5 download attribute: trigger the
        // download through a temporary hidden link.
        const url = URL.createObjectURL(blob);
        link.setAttribute('href', url);
        link.setAttribute('download', filename);
        link.style.visibility = 'hidden';
        document.body.appendChild(link);
        link.click();
        document.body.removeChild(link);
    }
}
/**
 * Finishes the crawl: shows two alerts (a completion notice and the console
 * command for downloading the data), then stops the polling timer.
 */
function the_end() {
    const notices = [
        '結束爬取此業20個',
        '執行指令:"exportToCsv(\'尚未命名\',rows)"下載爬取資料',
    ];
    for (const notice of notices) {
        alert(notice);
    }
    killInterval();
}
/**
 * Stops the crawl by clearing the timer whose handle is stored in `Interval`.
 */
function killInterval() {
    const timer_handle = Interval;
    clearInterval(timer_handle);
}
/**
 * Tells whether a detail line looks like a mobile phone number, i.e. its first
 * two characters are '0' and '9'.
 *
 * @param {string} text - Detail line to inspect.
 * @returns {boolean} true when `text` starts with "09".
 */
function isPhoneNumber(text) {
    const first = text[0];
    const second = text[1];
    return first === '0' && second === '9';
}
/**
 * Tells whether a detail line looks like a web address, i.e. contains the
 * literal substring ".com" or ".net".
 *
 * Uses a literal substring test: `String.prototype.search` would compile the
 * argument as a regular expression, where "." matches any character, so words
 * such as "welcome" or "internet" would be misclassified as URLs.
 *
 * @param {string} text - Detail line to inspect.
 * @returns {boolean} true when `text` contains ".com" or ".net".
 */
function isUrl(text) {
    return text.includes('.com') || text.includes('.net');
}
/**
 * Tells whether a detail line refers to Facebook, i.e. contains the literal
 * substring "facebook.com".
 *
 * Uses a literal substring test: `String.prototype.search` would treat the "."
 * as a regex wildcard and also match text such as "facebook-community".
 *
 * @param {string} text - Detail line to inspect.
 * @returns {boolean} true when `text` contains "facebook.com".
 */
function isFB(text) {
    return text.includes('facebook.com');
}