-
Notifications
You must be signed in to change notification settings - Fork 1
/
console-crawler.js
executable file
·64 lines (61 loc) · 1.58 KB
/
console-crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env node
var Tarantula = require('tarantula');
var console = require('better-console');
// Command-line handling: at least one positional argument (a start URL) is
// required; `conn` defaults to 2 and `proxy` to the empty string.
var usageText = [
  'Traverse a site.',
  'Usage: $0 http://example.com',
  'Usage: $0 http://example.com --legs=10',
  'Usage: $0 http://example.com --legs=10 --proxy=myproxy.com:7070 --web --phantom'
].join('\n');

var cli = require('yargs')
  .usage(usageText)
  .demand(1)
  .default('conn', 2)
  .default('proxy', '');

// Parsed flags; the positional arguments live under `_`.
var argv = cli.argv;

// Start URLs handed to the crawler.
var SITES = argv._;
// Matches URIs whose path ends in a binary/non-HTML extension this crawler
// skips (.jpg, .jpeg, .png, .gif, .pdf), optionally followed by a query
// string or fragment. The extension is anchored to the end of the path so
// that host names or directories merely containing ".png", ".gif", etc.
// (for example "www.pngtree.com") are not mistaken for files.
var isImage = /^[^?#]*\.(?:jpe?g|png|gif|pdf)(?:[?#]|$)/i;
// Build the crawler from the parsed flags.
// `leg` is 'PhantomJS' when --phantom was given, otherwise the empty string.
var legKind = '';
if (argv.phantom) {
  legKind = 'PhantomJS';
}

var crawlerOptions = {
  leg: legKind,
  legs: argv.legs,
  // --web turns stayInRange off (presumably letting the crawl leave the
  // start sites -- confirm against the Tarantula documentation).
  stayInRange: !argv.web,
  proxy: argv.proxy
};

var tarantula = new Tarantula(crawlerOptions);
/**
 * Visit filter installed on the crawler instance.
 *
 * @param {string} pageUri - candidate URI.
 * @returns {boolean} false when the URI matches the `isImage` pattern,
 *   true otherwise.
 */
tarantula.shouldVisit = function (pageUri) {
  return !isImage.test(pageUri);
};
// Logs the URI about to be fetched and the page it was found on.
function logRequest(task) {
  console.log('GET', task.uri);
  console.log('REFER', task.parent);
}

// Logs a fixed '200' label with the URI whenever a 'data' event is emitted.
function logData(task) {
  console.info('200', task.uri);
}

tarantula.on('request', logRequest);
tarantula.on('data', logData);
// Prints a one-line progress summary each time a 'uris' event is emitted:
//   V = tarantula.visited, T = tarantula.uris.length, Q = T - V,
//   A = tarantula.legs.active, + = the newCount passed with the event.
function reportProgress(task, newCount) {
  var visitedCount = tarantula.visited;
  var totalCount = tarantula.uris.length;
  var queuedCount = totalCount - visitedCount;
  var activeLegs = tarantula.legs.active;

  console.log(
    'V:' + visitedCount,
    'T:' + totalCount,
    'Q:' + queuedCount,
    'A:' + activeLegs,
    '+' + newCount
  );
}

tarantula.on('uris', reportProgress);
// Error reporting.
// - A string message containing 'Not HTML' is downgraded to a short warning.
// - Anything else is logged with its code, URI and referring page; when the
//   code is 'ERR' the message itself is logged as well.
function reportError(task, errorCode, errorMessage) {
  var isNotHtml =
    typeof errorMessage === 'string' &&
    errorMessage.indexOf('Not HTML') !== -1;

  if (isNotHtml) {
    console.warn('Not HTML');
    return;
  }

  console.error(errorCode, task.uri, 'from', task.parent);
  // Loose comparison kept on purpose to match the existing behavior.
  if (errorCode == 'ERR') {
    console.error(errorMessage);
  }
}

tarantula.on('error', reportError);
// Announce completion when the crawler emits 'done'.
function announceDone() {
  console.info('done');
}
tarantula.on('done', announceDone);

// Kick off the crawl with the start URLs taken from the command line.
console.log('Crawling… ', SITES);
tarantula.start(SITES);