Skip to content

Commit

Permalink
Add delay between requests (#21)
Browse files Browse the repository at this point in the history
Some websites won't let you request that many URLs so quickly, so the crawler now waits a configurable delay between requests.

* Added delay between crawl and added configurable http headers e2d28ec
* Added the documentation in the README.md f5d5e13
* Fixed config headers 70a4b6a
* Put the setTimeout inside the processOne function 719b0be
  • Loading branch information
fhamon authored and nitriques committed Aug 25, 2017
1 parent bf44409 commit 6d19724
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 79 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ Your Algolia App ID.

Your generated Algolia API key.

#### delayBetweenRequest: Integer

Delay, in milliseconds, between each request made to the website.

#### oldentries: Integer

The maximum number of seconds an entry can live without being updated. After
Expand Down
6 changes: 5 additions & 1 deletion app.js
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,11 @@ sitemap(config, function (sitemap, urls) {

var results = _.map(urls, function (url, index) {
console.log('Registered ' + url.url);
var processResults = processOne(config, url, function (error, record) {
var processResults = processOne({
config: config,
url: url,
index: index
}, function (error, record) {
if (!!error || !record) {
console.error('Error! ' + error.message);
if (!!error.pageNotFound && !!record) {
Expand Down
6 changes: 5 additions & 1 deletion config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@
{"url": "http://example.com/en/sitemap.xml", "lang": "en"}
],
"http": {
"auth": ""
"auth": "",
"headers": {

}
},
"delayBetweenRequest": 100,
"selectors": {
"title": "title",
"image": "meta[property=\"og:image\"]",
Expand Down
159 changes: 83 additions & 76 deletions lib/process.js
Original file line number Diff line number Diff line change
Expand Up @@ -131,92 +131,99 @@ var parse = function (record, data, config) {
});
};

module.exports = function (config, url, cb) {
var parsedUrl = URL.parse(url.url);
var client = parsedUrl.protocol === 'https:' ? https : http;
var httpOptions = {
hostname: parsedUrl.hostname,
port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80),
path: parsedUrl.pathname || '/',
method: 'GET',
auth: config.http && config.http.auth
};

if (!httpOptions.hostname) {
return {
url: url,
ok: false,
error: 'No hostname found'
module.exports = function (data, cb) {
var config = data.config;
var url = data.url;
var index = data.index;

setTimeout(function () {
var parsedUrl = URL.parse(url.url);
var client = parsedUrl.protocol === 'https:' ? https : http;
var httpOptions = {
hostname: parsedUrl.hostname,
port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80),
path: parsedUrl.pathname || '/',
method: 'GET',
auth: config.http && config.http.auth,
headers: config.http && config.http.headers
};
}

var callback = function (err, data) {
isFetching = false;
cb(err, data);
};

var fetch = function () {
var shasum = crypto.createHash('sha1');
shasum.update(url.url, 'utf8');

isFetching = true;

var req = client.request(httpOptions, function (res) {
var data = '';
var now = new Date();
var meta = {
date: now,
timestamp: now.getTime(),
url: url.url,
objectID: shasum.digest('base64'),
lang: url.lang
if (!httpOptions.hostname) {
return {
url: url,
ok: false,
error: 'No hostname found'
};
var record = _.clone(meta);

res.setEncoding('utf8');

if (res.statusCode === 404) {
callback({
message: 'Page not found ' + url.url,
pageNotFound: true
}, record);
return;
} else if (res.statusCode !== 200) {
callback({
message: 'HTTP error ' + res.statusCode + ' ' + url.url
});
return;
}
}

var callback = function (err, data) {
isFetching = false;
cb(err, data);
};

var fetch = function () {
var shasum = crypto.createHash('sha1');
shasum.update(url.url, 'utf8');

res.on('data', function (chunk) {
data += chunk;
});
isFetching = true;

res.on('end', function (chunk, encoding) {
if (!!chunk) {
data += chunk;
}
var req = client.request(httpOptions, function (res) {
var data = '';
var now = new Date();
var meta = {
date: now,
timestamp: now.getTime(),
url: url.url,
objectID: shasum.digest('base64'),
lang: url.lang
};
var record = _.clone(meta);

var error = null;
res.setEncoding('utf8');

try {
parse(record, data, config);
} catch (ex) {
error = ex;
} finally {
callback(error, record);
if (res.statusCode === 404) {
callback({
message: 'Page not found ' + url.url,
pageNotFound: true
}, record);
return;
} else if (res.statusCode !== 200) {
callback({
message: 'HTTP error ' + res.statusCode + ' ' + url.url
});
return;
}

res.on('data', function (chunk) {
data += chunk;
});

res.on('end', function (chunk, encoding) {
if (!!chunk) {
data += chunk;
}

var error = null;

try {
parse(record, data, config);
} catch (ex) {
error = ex;
} finally {
callback(error, record);
}
});
});
});

req.on('error', function (e) {
callback(e);
});

req.on('error', function (e) {
callback(e);
});

req.end();
};

req.end();
};

queue.push(fetch);
queue.push(fetch);
}, config.delayBetweenRequest * index);

return {
url: url,
Expand Down
3 changes: 2 additions & 1 deletion lib/sitemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ module.exports = function (config, cb) {
port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80),
path: parsedUrl.path,
method: 'GET',
auth: config.http && config.http.auth
auth: config.http && config.http.auth,
headers: config.http && config.http.headers
};
var client = parsedUrl.protocol === 'https:' ? https : http;

Expand Down

0 comments on commit 6d19724

Please sign in to comment.