-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper_test.js
129 lines (113 loc) · 5.14 KB
/
scraper_test.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
const puppeteer = require('puppeteer');
// URL of the first schedule page to scrape.
//
// It is taken from the first command-line argument when one is given, e.g.
//   https://www2.gov.bc.ca/gov/content/governments/services-for-government/information-management-technology/records-management/information-schedules/arcs/administrative-records/general
// and otherwise falls back to the "financial-records/fees" page below.
const url = process.argv[2] || 'https://www2.gov.bc.ca/gov/content/governments/services-for-government/information-management-technology/records-management/information-schedules/arcs/financial-records/fees';
if (!url) {
  // Defensive guard (cannot trigger while a default URL is configured).
  // Throw a real Error rather than a bare string so a stack trace is available.
  throw new Error("Please provide URL as a first argument");
}
/**
 * Scrape the page at `url` and every page reachable by following its "Next"
 * links, then save the collected records to `arcsdata.json`.
 *
 * For each page the result contains:
 *   - title:    the document title
 *   - text:     innerText of every direct child of `div#body` that is neither a
 *               TABLE nor the container of a "Next"/"Previous" link, joined by "\n"
 *   - next:     CSS selector of the "Next" link, or undefined on the last page
 *   - previous: CSS selector of the "Previous" link, or undefined if there is none
 *   - data:     one entry per table row with at least five cells, exposing the
 *               first five cells as `series`, `text`, `a`, `sa`, `fd`
 *               (presumably the Active / Semi-active / Final-disposition
 *               retention columns -- confirm against the site)
 *
 * @returns {Promise<Array<Object>>} resolves with one entry per scraped page once
 *   the JSON file has been written; rejects if launching the browser,
 *   navigating, scraping, or writing the file fails.
 */
async function run () {
  const browser = await puppeteer.launch();
  const data = [];
  try {
    const page = await browser.newPage();
    await page.goto(url);

    while (true) {
      // NOTE: this callback executes inside the browser page, so it must not
      // reference anything from the surrounding Node.js scope.
      const newData = await page.evaluate(() => {
        const body = document.querySelector('div#body');
        if (!body) {
          throw new Error('div#body not found on ' + document.location.href);
        }

        // Build a selector that walks from div#body down to `element` using
        // each node's real sibling position, e.g.
        //   div#body > p:nth-child(5) > a:nth-child(2)
        // :nth-child is 1-based and counts all element siblings, which is why
        // the position is taken from parentElement.children.
        const selectorFor = (element) => {
          const steps = [];
          for (let node = element; node !== body; node = node.parentElement) {
            const position = Array.from(node.parentElement.children).indexOf(node) + 1;
            steps.unshift(node.tagName.toLowerCase() + ':nth-child(' + position + ')');
          }
          return ['div#body'].concat(steps).join(' > ');
        };

        let pageNext;
        let pagePrev;
        const textParts = [];

        Array.from(body.children).forEach((child) => {
          // A child that holds a Next/Previous link is page navigation, not content.
          let isNavigation = false;
          child.querySelectorAll('a').forEach((link) => {
            const label = (link.innerText || '').trim();
            if (label === 'Next') {
              pageNext = selectorFor(link);
              isNavigation = true;
            } else if (label === 'Previous') {
              pagePrev = selectorFor(link);
              isNavigation = true;
            }
          });
          if (!isNavigation && child.tagName !== 'TABLE') {
            textParts.push(child.innerText);
          }
        });

        // Extract the records from the table rows; rows with fewer than five
        // <td> cells (e.g. header rows) are skipped.
        const pageData = [];
        document.querySelectorAll('div#body > table > tbody > tr').forEach((row) => {
          const cells = row.querySelectorAll('td');
          if (cells.length >= 5) {
            pageData.push({
              series: cells[0].innerText,
              text: cells[1].innerText,
              a: cells[2].innerText,
              sa: cells[3].innerText,
              fd: cells[4].innerText,
            });
          }
        });

        return {
          title: document.title,
          text: textParts.join('\n'),
          next: pageNext,
          previous: pagePrev,
          data: pageData,
        };
      });

      data.push(newData);
      console.log(newData.title);

      if (!newData.next) {
        console.log('Finished....');
        break;
      }

      // Start waiting for the navigation BEFORE clicking; otherwise the next
      // evaluate() can race the page load and run against the old document.
      await Promise.all([
        page.waitForNavigation(),
        page.click(newData.next),
      ]);
      await page.waitForSelector('div#body');
    }
  } finally {
    // Always release the browser process, including when scraping fails.
    await browser.close();
  }

  // Await the write so that a failure rejects run() instead of being thrown
  // from a detached callback after the promise has already resolved.
  const fs = require('fs');
  await fs.promises.writeFile('arcsdata.json', JSON.stringify(data, null, 2), 'utf8');
  console.log('Saved!');

  return data;
}
// Run the scraper and print the collected data. On failure, log the error and
// mark the process as failed so shells and CI jobs see a non-zero exit status.
run()
  .then(console.log)
  .catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });