-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.js
72 lines (62 loc) · 2.19 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
(function(host) {
function Crawler() {
console.log('contructor Crawler');
this.visitedURLs = {};
};
Crawler.webpage = require('webpage');
Crawler.prototype.crawl = function (url, depth, onSuccess, onFailure) {
console.log(url);
if (0 == depth || this.visitedURLs[url]) {
return;
};
var self = this;
var page = Crawler.webpage.create();
page.open(url, function (status) {
if ('fail' === status) {
onFailure({
url: url,
status: status
});
} else {
var documentHTML = page.evaluate(function () {
return document.body && document.body.innerHTML ? document.body.innerHTML : "";
});
self.crawlURLs(self.getAllURLs(page), depth - 1, onSuccess, onFailure);
self.visitedURLs[url] = true;
onSuccess({
url: url,
status: status,
content: documentHTML
});
};
});
};
Crawler.prototype.getAllURLs = function(page) {
return page.evaluate(function () {
return Array.prototype.slice.call(document.querySelectorAll("a"), 0)
.map(function (link) {
return link.getAttribute("href");
});
});
};
Crawler.prototype.crawlURLs = function(urls, depth, onSuccess, onFailure) {
var self = this;
urls.filter(function (url) {
return Crawler.isTopLevelURL(url);
}).forEach(function (url) {
self.crawl(url, depth, onSuccess, onFailure);
});
};
Crawler.isTopLevelURL = function(url) {
return 0 == url.indexOf("http");
};
host.Crawler = Crawler;
})(phantom);
new phantom.Crawler().crawl("http://dn.se", 1,
function onSuccess(page) {
console.log("Loaded page. URL = " + page.url + " content length = " + page.content.head + " status = " + page.status);
},
function onFailure(page) {
console.log("Could not load page. URL = " + page.url + " status = " + page.status);
}
);