From 6d19724c3a94d823664175054bff81a11a9064c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9rick=20Hamon?= Date: Fri, 25 Aug 2017 14:01:03 -0400 Subject: [PATCH] Add delay between requests (#21) Some websites won't let you request that many urls so fast, so the crawler now waits a configurable delay between requests. * Added delay between crawl and added configurable http headers e2d28ec5450b59d4a93c8131e1696936941b191d * Added the documentation in the README.md f5d5e1338aa3f6ff86ec1cd0246c5d8f56b6adb1 * Fixed config headers 70a4b6ac597cb6daa61930d5ed112096a167621f * Put the setTimeout inside the processOne function 719b0bedf18dedb6be67998509bed4d7bd6df86e --- README.md | 4 ++ app.js | 6 +- config.json | 6 +- lib/process.js | 159 ++++++++++++++++++++++++++----------------------- lib/sitemap.js | 3 +- 5 files changed, 99 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index f9a1918..fa18c60 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,10 @@ Your Algolia App ID. Your generated Algolia API key. +#### delayBetweenRequest: Integer + +Simple delay between each request made to the website in milliseconds. + #### oldentries: Integer The maximum number of seconds an entry can live without being updated. After diff --git a/app.js b/app.js index b96f002..865747c 100644 --- a/app.js +++ b/app.js @@ -90,7 +90,11 @@ sitemap(config, function (sitemap, urls) { var results = _.map(urls, function (url, index) { console.log('Registered ' + url.url); - var processResults = processOne(config, url, function (error, record) { + var processResults = processOne({ + config: config, + url: url, + index: index + }, function (error, record) { if (!!error || !record) { console.error('Error! 
' + error.message); if (!!error.pageNotFound && !!record) { diff --git a/config.json b/config.json index c4dc170..99aa148 100644 --- a/config.json +++ b/config.json @@ -17,8 +17,12 @@ {"url": "http://example.com/en/sitemap.xml", "lang": "en"} ], "http": { - "auth": "" + "auth": "", + "headers": { + + } }, + "delayBetweenRequest": 100, "selectors": { "title": "title", "image": "meta[property=\"og:image\"]", diff --git a/lib/process.js b/lib/process.js index 19501e6..4130306 100644 --- a/lib/process.js +++ b/lib/process.js @@ -131,92 +131,99 @@ var parse = function (record, data, config) { }); }; -module.exports = function (config, url, cb) { - var parsedUrl = URL.parse(url.url); - var client = parsedUrl.protocol === 'https:' ? https : http; - var httpOptions = { - hostname: parsedUrl.hostname, - port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80), - path: parsedUrl.pathname || '/', - method: 'GET', - auth: config.http && config.http.auth - }; - - if (!httpOptions.hostname) { - return { - url: url, - ok: false, - error: 'No hostname found' +module.exports = function (data, cb) { + var config = data.config; + var url = data.url; + var index = data.index; + + setTimeout(function () { + var parsedUrl = URL.parse(url.url); + var client = parsedUrl.protocol === 'https:' ? https : http; + var httpOptions = { + hostname: parsedUrl.hostname, + port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 
443 : 80), + path: parsedUrl.pathname || '/', + method: 'GET', + auth: config.http && config.http.auth, + headers: config.http && config.http.headers }; - } - - var callback = function (err, data) { - isFetching = false; - cb(err, data); - }; - - var fetch = function () { - var shasum = crypto.createHash('sha1'); - shasum.update(url.url, 'utf8'); - isFetching = true; - - var req = client.request(httpOptions, function (res) { - var data = ''; - var now = new Date(); - var meta = { - date: now, - timestamp: now.getTime(), - url: url.url, - objectID: shasum.digest('base64'), - lang: url.lang + if (!httpOptions.hostname) { + return { + url: url, + ok: false, + error: 'No hostname found' }; - var record = _.clone(meta); - - res.setEncoding('utf8'); - - if (res.statusCode === 404) { - callback({ - message: 'Page not found ' + url.url, - pageNotFound: true - }, record); - return; - } else if (res.statusCode !== 200) { - callback({ - message: 'HTTP error ' + res.statusCode + ' ' + url.url - }); - return; - } + } + + var callback = function (err, data) { + isFetching = false; + cb(err, data); + }; + + var fetch = function () { + var shasum = crypto.createHash('sha1'); + shasum.update(url.url, 'utf8'); - res.on('data', function (chunk) { - data += chunk; - }); + isFetching = true; - res.on('end', function (chunk, encoding) { - if (!!chunk) { - data += chunk; - } + var req = client.request(httpOptions, function (res) { + var data = ''; + var now = new Date(); + var meta = { + date: now, + timestamp: now.getTime(), + url: url.url, + objectID: shasum.digest('base64'), + lang: url.lang + }; + var record = _.clone(meta); - var error = null; + res.setEncoding('utf8'); - try { - parse(record, data, config); - } catch (ex) { - error = ex; - } finally { - callback(error, record); + if (res.statusCode === 404) { + callback({ + message: 'Page not found ' + url.url, + pageNotFound: true + }, record); + return; + } else if (res.statusCode !== 200) { + callback({ + message: 'HTTP error ' 
+ res.statusCode + ' ' + url.url + }); + return; } + + res.on('data', function (chunk) { + data += chunk; + }); + + res.on('end', function (chunk, encoding) { + if (!!chunk) { + data += chunk; + } + + var error = null; + + try { + parse(record, data, config); + } catch (ex) { + error = ex; + } finally { + callback(error, record); + } + }); }); - }); - - req.on('error', function (e) { - callback(e); - }); + + req.on('error', function (e) { + callback(e); + }); + + req.end(); + }; - req.end(); - }; - - queue.push(fetch); + queue.push(fetch); + }, config.delayBetweenRequest * index); return { url: url, diff --git a/lib/sitemap.js b/lib/sitemap.js index 4e1e3c1..ab81636 100644 --- a/lib/sitemap.js +++ b/lib/sitemap.js @@ -19,7 +19,8 @@ module.exports = function (config, cb) { port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80), path: parsedUrl.path, method: 'GET', - auth: config.http && config.http.auth + auth: config.http && config.http.auth, + headers: config.http && config.http.headers }; var client = parsedUrl.protocol === 'https:' ? https : http;