Skip to content

Commit

Permalink
feat!: add crawling and auditing multiple domains (#51)
Browse files Browse the repository at this point in the history
* chore: apply WIP stash

* feat: add 'ignoreWWWDomain' as default option

* fix: fix file extension checking logic for queue

* feat: allow multiple domains to be crawled and audited

* fix: fix url parsing for generating report files

* fix: prevent redundant crawling when using multiple starting URLs

* fix: fix initial domain not being crawled when using one starting URL

* refactor: update default URL arguments

* fix: prevent multiples of the same URL being queued on startup

* feat: add graceful shutdown logic when no starting URL

BREAKING CHANGE: Removes the dependency on the crawler config

* chore: update package-lock

* refactor: update tests due to breaking change

* chore: fix typo in test

* test: add tests for differently sized URL arrays
  • Loading branch information
TGiles authored Dec 13, 2019
1 parent fa7c13c commit c8e0a12
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 32 deletions.
5 changes: 4 additions & 1 deletion cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ const package = require('./package.json');
const program = new commander.Command();
program.version(package.version);
program
.option('-u, --url <url>', 'starting valid url for auto-lighthouse', 'https://blank.org')
.option('-u, --url <urls>', 'starting valid url for auto-lighthouse',
[
'https://tgiles.github.io'
])
.option('-e, --express <open>', 'flag for auto opening reports in local express server')
.option('-p, --port <port>', 'port for local express server', 9000);

Expand Down
2 changes: 1 addition & 1 deletion config/simpleCrawler.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@
"respectRobotsTxt": true,
"parseHTMLComments": false,
"parseScriptTags": false,
"host": "https://blank.org/"
"ignoreWWWDomain": true
}
65 changes: 51 additions & 14 deletions lighthouse_runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@ const parallelLimit = async (funcList, limit = 4) => {
* @param {*} queueItem a URL that has been picked up by the crawler
*/
const queueAdd = (queueItem, urlList) => {
const regex = /\.(css|jpg|pdf|docx|js|png|ico|gif|svg|psd|ai|zip|gz|zx|src|cassette|mini-profiler|axd|woff|woff2|)/i;
if (!queueItem.uriPath.match(regex)) {
let fileExtension = queueItem.uriPath.split('/');
const regex = /\.(css|jpg|jpeg|pdf|docx|js|png|ico|gif|svg|psd|ai|zip|gz|zx|src|cassette|mini-profiler|axd|woff|woff2|eot|ttf)/i;
if (!fileExtension[fileExtension.length - 1].match(regex)) {
urlList.push(queueItem.url);
console.log("Pushed: ", queueItem.url);
}
Expand Down Expand Up @@ -229,31 +230,67 @@ const openReportsWithoutServer = (tempFilePath) => {
*/
function main(program) {
let domainRoot;
let simpleCrawler;
if (program.express === undefined) {
autoOpen = runnerConfig.autoOpenReports;
} else {
autoOpen = program.express;
}
if (program.url === undefined) {
domainRoot = new URL(simpleCrawlerConfig.host);
throw new Error('No URL given, quitting!');
} else {
domainRoot = new URL(program.url);
if (Array.isArray(program.url)) {
domainRoot = [];
program.url.forEach(_url => {
domainRoot.push(new URL(_url));
});
} else {
domainRoot = new URL(program.url)
}
}
let isDomainRootAnArray = Array.isArray(domainRoot);
port = program.port;
let urlList = [domainRoot.href];
console.log('Pushed: ', domainRoot.href);
let simpleCrawler = new Crawler(domainRoot.href)
.on('queueadd', (queueItem) => {
queueAdd(queueItem, urlList)
})
.on('complete', () => {
complete(urlList, autoOpen);
});
if (isDomainRootAnArray) {
simpleCrawler = Crawler(domainRoot[0].href)
.on('queueadd', (queueItem) => {
queueAdd(queueItem, urlList)
})
.on('complete', () => {
complete(urlList, autoOpen);
});

} else {
simpleCrawler = Crawler(domainRoot.href)
.on('queueadd', (queueItem) => {
queueAdd(queueItem, urlList)
})
.on('complete', () => {
complete(urlList, autoOpen);
});
}

for (let key in simpleCrawlerConfig) {
simpleCrawler[key] = simpleCrawlerConfig[key];
}
simpleCrawler.host = domainRoot.hostname;
simpleCrawler.ignoreWWWDomain = true;
let urlList = [];
if (isDomainRootAnArray) {
if (domainRoot.length > 1) {
domainRoot.forEach(root => {
if (!simpleCrawler.queue.includes(root)) {
simpleCrawler.domainWhitelist.push(root.hostname);
simpleCrawler.queueURL(root.href);
}
});
} else {
urlList.push(domainRoot[0].href);
}
} else {
urlList.push(domainRoot.href);
}

// simpleCrawler.host = domainRoot.hostname;

if (autoOpen) {
console.log('Automatically opening reports when done!');
} else {
Expand Down
20 changes: 10 additions & 10 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

56 changes: 50 additions & 6 deletions spec/main/lighthouse_runner.spec.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const EventEmitter = require('events');
const path = require('path');
const fs = require('fs');
describe("main", () => {
it('was called once', () => {
let lighthouse_runner = require("../../lighthouse_runner");
Expand All @@ -26,7 +27,8 @@ describe("main", () => {
it('should run without errors and not open reports', () => {
let runner = require('../../lighthouse_runner');
let mockProgram = {
port: 8000
port: 8000,
url: 'https://tgiles.github.io'
};
expect(runner.openReports).toBeDefined();
expect(runner.openReportsWithoutServer).toBeDefined();
Expand All @@ -40,7 +42,7 @@ describe("main", () => {
it('should use the passed parameters over defaults', () => {
let runner = require('../../lighthouse_runner');
let mockProgram = {
open: true,
express: true,
url: 'https://tgiles.github.io',
port: 8000
};
Expand All @@ -50,14 +52,36 @@ describe("main", () => {
expect(runner.main).toHaveBeenCalledWith(mockProgram);
expect(result).toBeTruthy();
});
it('should use defaults if optional params are not present', () => {
// Verifies that main() throws when the program object carries no starting URL.
it('should fail if required params are not present', () => {
let runner = require('../../lighthouse_runner');
// Message the thrown Error is expected to carry.
let errorMessage = 'No URL given, quitting!'
// Spy on main while still running the real implementation.
spyOn(runner, "main").and.callThrough();
// `url` is explicitly undefined to exercise the missing-URL path.
let mockProgram = {
open: undefined,
url: undefined,
port: 9000
};
// Wrap the call in a function so the matcher can catch the throw.
expect(() => { runner.main(mockProgram);}).toThrowError(errorMessage);
});
// Verifies that main() accepts `url` supplied as a single-element array.
it('should not throw an error when the URL is an array of one', () => {
let runner = require('../../lighthouse_runner');
// Only `url` and `port` are provided; `express` is left unset.
let mockProgram = {
url: ['https://tgiles.github.io'],
port: 8001
}
// Spy on main while still running the real implementation.
spyOn(runner, "main").and.callThrough();

let result = runner.main(mockProgram);
expect(runner.main).toHaveBeenCalledWith(mockProgram);
// The test only requires a truthy return value from main().
expect(result).toBeTruthy();
});
it('should not throw an error when the URL is an array of two or more', () => {
let runner = require('../../lighthouse_runner');
let mockProgram = {
url: ['https://tgiles.github.io', 'https://blankslate.io'],
port: 8001
}
spyOn(runner, "main").and.callThrough();

let result = runner.main(mockProgram);
expect(runner.main).toHaveBeenCalledWith(mockProgram);
expect(result).toBeTruthy();
Expand Down Expand Up @@ -85,18 +109,38 @@ describe("openReportsWithoutServer", () => {
expect(result).toBeFalsy();
});
it('returns true if the file path exists', () => {
spyOn(runner, "openReportsWithoutServer").and.callThrough();
spyOn(runner, "openReportsWithoutServer").and.callFake((filePath) => {
if (fs.existsSync(filePath)) {
return true;
} else {
return false;
}
});
let someActualPath = path.join(__dirname, '../../', 'spec', 'helpers', 'lighthouse');
let result = runner.openReportsWithoutServer(someActualPath);
expect(runner.openReportsWithoutServer).toHaveBeenCalledWith(someActualPath);
expect(result).toBeTruthy();
let someFakePath = path.join(__dirname, 'spec', 'helpers', 'non');
result = runner.openReportsWithoutServer(someFakePath);
expect(runner.openReportsWithoutServer).toHaveBeenCalledWith(someFakePath);
expect(result).toBeFalsy();
});
it('returns false if the file path does not exist', () => {
spyOn(runner, "openReportsWithoutServer").and.callThrough();
spyOn(runner, "openReportsWithoutServer").and.callFake((filePath) => {
if (fs.existsSync(filePath)) {
return true;
} else {
return false;
}
});
let someFakePath = path.join(__dirname, 'spec', 'helpers', 'non');
let result = runner.openReportsWithoutServer(someFakePath);
expect(runner.openReportsWithoutServer).toHaveBeenCalledWith(someFakePath);
expect(result).toBeFalsy();
let someActualPath = path.join(__dirname, '../../', 'spec', 'helpers', 'lighthouse');
result = runner.openReportsWithoutServer(someActualPath);
expect(runner.openReportsWithoutServer).toHaveBeenCalledWith(someActualPath);
expect(result).toBeTruthy();
});
});

Expand Down

0 comments on commit c8e0a12

Please sign in to comment.