Cleanup (#10)
* Add driverOptions to be passed per-request

* Update README.md

* Add example

* Run integration tests during test phase

* Fix working directory

* Emit typescript declarations

* Add test for root exports

* Better document example file

* Swap driver to a simple function

* More cleanup

* Switch to minimist to simplify CLI

* Enhance CLI testing and structuring

* Move CLI code into index of cli directory

* Move isCrawlerRequest
rbayliss authored Mar 25, 2020
1 parent 2ed9bb1 commit 6b9ad88
Showing 37 changed files with 714 additions and 496 deletions.
2 changes: 1 addition & 1 deletion .babelrc.json
@@ -3,7 +3,7 @@
     "@babel/preset-typescript",
     ["@babel/preset-env", {
       "targets": {
-        "node": "10"
+        "node": "current"
       }
     }]
   ]
10 changes: 9 additions & 1 deletion .circleci/config.yml
@@ -18,9 +18,17 @@ jobs:
           command: npm run test -- --ci --reporters="default" --reporters="jest-junit"
           environment:
             JEST_JUNIT_OUTPUT: "/tmp/junit/jest.xml"
-      - run: npm run build
+      - run:
+          name: "Prepare Integration Tests"
+          command: npm install ../../
+          working_directory: docs/example
+      - run:
+          name: "Run integration tests"
+          command: node_modules/.bin/nightcrawler --config nightcrawler.js crawl
+          working_directory: docs/example
       - store_test_results:
           path: /tmp/junit
+      - run: npm run build
       - persist_to_workspace:
           root: /srv
           paths:
3 changes: 3 additions & 0 deletions .eslintrc.js
@@ -1,5 +1,8 @@
 module.exports = {
   root: true,
+  env: {
+    "node": true
+  },
   parser: '@typescript-eslint/parser',
   plugins: ['@typescript-eslint'],
   extends: [
11 changes: 0 additions & 11 deletions .flowconfig

This file was deleted.

4 changes: 2 additions & 2 deletions .gitignore
@@ -1,5 +1,5 @@
 node_modules/
 yarn.lock
 *.log
-nightcrawler.js
+/nightcrawler.js
 dist/
183 changes: 78 additions & 105 deletions README.md
@@ -12,141 +12,114 @@ yarn add lastcall-nightcrawler
 Define your crawler by creating a `nightcrawler.js` file, like this:
 ```js
-# nightcrawler.js
-const Crawler = require('lastcall-nightcrawler');
-const Number = Crawler.metrics.Number;
-
-const myCrawler = new Crawler('My Crawler');
-
-myCrawler.on('setup', function(crawler) {
-  // On setup, give the crawler a list of URLs to crawl.
-  crawler.enqueue('http://localhost/');
-  crawler.enqueue('http://localhost/foo');
+const {crawl, test} = require('lastcall-nightcrawler');
+const expect = require('expect');
+
+module.exports = crawl('Response code validation', function() {
+  test('Should return 2xx', function(unit) {
+    expect(unit.response.statusCode).toBeGreaterThanOrEqual(200);
+    expect(unit.response.statusCode).toBeLessThan(300);
+  });
+
+  return [
+    {url: 'https://example.com'},
+    {url: 'https://example.com?q=1'},
+    {url: 'https://example.com?q=2'}
+  ];
 });
-
-myCrawler.on('analyze', function(crawlReport, analysis) {
-  // On analysis, derive the metrics you need from the
-  // array of collected data.
-  analysis.addMetric('count', new Number('Total Requests', 0, crawlReport.data.length));
-});
-
-module.exports = myCrawler;
 ```
 Run your crawler:
 ```bash
 # Run the crawler.
 node_modules/.bin/nightcrawler crawl
 ```
 
-Queueing Requests
------------------
-Requests can be queued during the `setup` event. You can queue a new request by calling the `enqueue()` method, using either a string (representing the URL) or an object containing a `url` property. If you pass an object, you will have access to that object's properties later on during analysis.
+Specifying what URLs to crawl
+-----------------------------
+
+The `crawl` function expects a return value of an iterable (or async iterable) containing "requests". The simplest version of this is just an array of objects that have a `url` property. E.g.:
 ```js
-myCrawler.on('setup', function(crawler) {
-  // This works
-  crawler.enqueue('http://localhost/');
-  // So does this:
-  crawler.enqueue({
-    url: 'http://localhost/foo',
-    group: 'awesome'
-  });
-});
+module.exports = crawl('Crawl a static list of URLs', function() {
 
-myCrawler.on('analyze', function(crawlReport, analysis) {
-  var awesomeRequests = crawlReport.data.filter(function(point) {
-    // *group property is only available if you added it during queuing.
-    return point.group === 'awesome';
-  });
-  // Do additional analysis only on pages in the awesome group.
-  analysis.addMetric('awesome.count', new Number('Awesome Requests', 0, awesomeRequests.length));
-})
+  return [
+    {url: 'https://example.com'}
+  ]
+});
 ```
 
-Collecting data
----------------
-By default, only the following information is collected for each response:
-* `url` (string) : The URL that was crawled.
-* `error` (bool) : Whether the response was determined to be an error response.
-* `status` (int): The HTTP status code received.
-* `backendResponseTime` (int): The duration of the HTTP server response (see the [request module's documentation](https://github.com/request/request) on `timingPhases.firstByte`).
+For more advanced use cases, you may want to use async generators to fetch a list of URLs from somewhere else (e.g. a database):
 
-If there is other data you're interested in knowing, you can collect it like this:
 ```js
-// Collect the `Expires` header for each request.
-myCrawler.on('response', function(response, data) {
-  data.expires = response.headers['expires'];
-});
+async function* getURLs() {
+  const result = await queryDB();
+  for (const url of result) {
+    yield {url: url};
+  }
+}
+
+module.exports = crawl('Crawl a dynamic list of URLs', function() {
+
+  return getURLs();
+})
 ```
-The response event is triggered on request success or error, as long as the server sends a response. Anything put into the `data` object will end up in the final JSON report.
 
+Performing assertions on responses
+----------------------------------
+
-Dynamic Crawling
-----------------
-You may wish to be able to crawl a list of URLs that isn't static (it's determined at runtime). For example, you may want to query a remote API or a database and enqueue a list of URLs based on that data. To support this, the `setup` event allows you to return a promise.
+One of the primary goals of Nightcrawler is to detect URLs that don't meet your expectations. To achieve this, you can use the `test` function within a `crawl` to make assertions about the response received.
 
 ```js
-// Fetch a list of URLs from a remote API, then enqueue them all.
-myCrawler.on('setup', function(crawler) {
-  return fetchData().then(function(myData) {
-    myData.forEach(function(url) {
-      crawler.enqueue(url);
-    })
-  })
-})
-```
+const {crawl, test} = require('lastcall-nightcrawler');
+// Use the expect module from NPM for assertions.
+// You can use any assertion library, including the built-in assert module.
+const expect = require('expect');
+
-Analysis
---------
-Once the crawl has been completed, you will probably want to analyze the data in some way. Data analysis in Nightcrawler is intentionally loose - the crawler fires an `analyze` event with an array of collected data, and you are responsible for analyzing your own data. Here are some examples of things you might do during analysis:
-
-```js
-const Crawler = require('lastcall-nightcrawler');
-const Number = Crawler.metrics.Number;
-const Milliseconds = Crawler.metrics.Milliseconds;
-const Percent = Crawler.metrics.Percent;
-
-myCrawler.on('analyze', function(crawlReport, analysis) {
-  var data = crawlReport.data;
+module.exports = crawl('Check that the homepage is cacheable', function() {
 
-  // Calculate the number of requests that were made:
-  analysis.addMetric('count', new Number('Total Requests', 0, data.length));
-
-  // Calculate the average response time:
-  var avgTime = data.reduce(function(sum, dataPoint) {
-    return sum + dataPoint.backendTime
-  }, 0) / data.length;
-  analysis.addMetric('time', new Milliseconds('Avg Response Time', 0, avgTime));
-
-  // Calculate the percent of requests that were marked failed:
-  var failRatio = data.filter(function(dataPoint) {
-    return dataPoint.fail === true;
-  }).length / data.length;
-  var level = failRatio > 0 ? 2 : 0;
-  analysis.addMetric('fail', new Percent('% Failed', level, failRatio));
-
-  // Calculate the percent of requests that resulted in a 500 response.
-  var serverErrorRatio = data.filter(function(dataPoint) {
-    return dataPoint.statusCode >= 500;
-  }).length / data.length;
-  var level = serverErrorRatio > 0 ? 2 : 0;
-  analysis.add('500', new Percent('% 500', level, serverErrorRatio));
+  test('Should have cache-control header', function(unit) {
+    expect(unit.response.headers).toHaveProperty('cache-control');
+    expect(unit.response.headers['cache-control']).toBe('public, max-age=1800');
+  })
+
+  return [{url: 'https://example.com/'}]
 });
 ```
-The [`analysis`](./src/analysis.js) object can consist of many metrics, added through the `add` method. See [`src/metrics.js`](./src/metrics.js) for more information about metrics.
 
-Analysis can also be performed on individual requests to mark them passed or failed.
+The `test` function will receive a `unit` of crawler work, which includes the following properties:
+
+* `request`: The request, as you passed it into the Crawler. This will include any additional properties you passed in, and you can use those properties to do conditional checking of units of work.
+* `response`: The response object, as returned by the Driver. The default `NativeDriver` will produce a response in the shape of a Node [`http.IncomingMessage`](https://nodejs.org/api/http.html#http_class_http_incomingmessage) object. All `response` objects are guaranteed to have both a `statusCode` and a `time` property.
+
+Performing assertions about the overall status of the crawl
+-----------------------------------------------------------
+
+For some use cases, you will want to make assertions about many requests. For example, checking the average response time of all requests. To do this, you may use the `after` function to perform assertions after all the URLs have been requested. Just use the `test` function to collect the data you need from each request, then perform the final assertion in `after`:
 
 ```js
-myCrawler.on('analyze', function(crawlReport, analysis) {
-  var data = crawlReport.data;
+const {crawl, test, after} = require('lastcall-nightcrawler');
+const expect = require('expect');
 
-  data.forEach(function(request) {
-    var level = request.statusCode > 499 ? 2 : 0
-    analysis.addResult(request.url, level)
-  });
-})
+module.exports = crawl('Check that pages load quickly', function() {
+  const times = [];
+
+  test('Collect response time', function(unit) {
+    times.push(unit.response.time);
+  })
+
+  after('Response time should be less than 500ms', function() {
+    const sum = times.reduce((total, value) => total + value, 0);
+    expect(sum / times.length).toBeLessThan(500);
+  })
+
+  return [{url: 'https://example.com/'}]
+});
 ```
 
+Drivers
+-------
+
+Right now, there is only one "Driver" available for making requests. It uses Node's built-in `http` and `https` modules to issue HTTP requests to the target URL. In the future, we may have additional drivers available.
+
 CI Setup
 --------
 To add Nightcrawler to CircleCI, make sure the following steps are done:
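The rewritten README above notes that extra properties on a request object travel through to `unit.request` and can drive conditional checks. A minimal sketch of that pattern, assuming only the documented `unit` shape; the `group` property and the URLs are hypothetical:

```js
const {crawl, test} = require('lastcall-nightcrawler');
const expect = require('expect');

module.exports = crawl('Conditional checks by group', function() {
  test('Pages in the "fast" group respond within 200ms', function(unit) {
    // `group` is a hypothetical property attached to the requests below;
    // it is passed through to `unit.request` untouched.
    if (unit.request.group !== 'fast') {
      return; // Skip the assertion for requests in other groups.
    }
    expect(unit.response.time).toBeLessThan(200);
  });

  return [
    {url: 'https://example.com/', group: 'fast'},
    {url: 'https://example.com/search', group: 'slow'}
  ];
});
```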
10 changes: 9 additions & 1 deletion bin/nightcrawler
@@ -1,3 +1,11 @@
 #!/usr/bin/env node
 
-require('../dist/cli');
+const cli = require('../dist/cli').default;
+const {FailedAnalysisError} = require('../dist/cli/errors')
+cli(process.argv, process.stdout, process.cwd()).then(
+  () => process.exit(0),
+  (err) => {
+    console.error(err instanceof FailedAnalysisError ? err.message : err);
+    process.exit(1);
+  }
+)
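The new bin stub doubles as documentation for the CLI's programmatic surface: a default export invoked as `cli(argv, stdout, cwd)` that returns a promise. A sketch of driving it from another script, assuming the published package ships its `dist/` directory and that `argv` follows the `process.argv` layout the stub passes through:

```js
const cli = require('lastcall-nightcrawler/dist/cli').default;

// Mirrors process.argv: [node path, script path, ...arguments].
cli(['node', 'nightcrawler', '--config', 'nightcrawler.js', 'crawl'], process.stdout, process.cwd())
  .then(() => {
    console.log('Crawl passed.');
  })
  .catch((err) => {
    console.error(err.message || err);
    process.exitCode = 1;
  });
```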
2 changes: 2 additions & 0 deletions docs/example/.gitignore
@@ -0,0 +1,2 @@
+package-lock.json
+node_modules/
34 changes: 34 additions & 0 deletions docs/example/nightcrawler.js
@@ -0,0 +1,34 @@
+
+const {crawl, test, after} = require('lastcall-nightcrawler');
+const expect = require('expect');
+
+module.exports = crawl('Homepage', function() {
+  const times = [];
+
+  // Tests run for every request/response cycle.
+  test('Status code is 2xx', function(unit) {
+    expect(unit.response.statusCode).toBeGreaterThanOrEqual(200);
+    expect(unit.response.statusCode).toBeLessThan(300);
+  });
+
+  test('Has a long cache lifetime', function(unit) {
+    expect(unit.response.headers).toHaveProperty('cache-control', 'max-age=604800');
+  })
+
+  test('Collect response time', function(unit) {
+    times.push(unit.response.time);
+  })
+
+  // After functions run after all responses have been received.
+  after('Average response time should be < 200ms', function() {
+    const sum = times.reduce((total, value) => total + value, 0);
+    expect(sum / times.length).toBeLessThan(200);
+  })
+
+  // Return any iterable/async iterable filled with request-shaped objects.
+  return [
+    // options can be used to pass options to the driver.
+    // For example, passing {auth: 'foo:bar'} will enable basic auth.
+    {url: 'https://www.example.com/', options: {}}
+  ]
+})
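The closing comment in the example above ties into this commit's "Add driverOptions to be passed per-request" change: whatever is set in `options` is forwarded to the driver for that request. A sketch of using it for basic auth, following the `{auth: 'foo:bar'}` note in the example; the URL and credentials are placeholders:

```js
const {crawl, test} = require('lastcall-nightcrawler');
const expect = require('expect');

module.exports = crawl('Authenticated pages', function() {
  test('Protected page responds with 200', function(unit) {
    expect(unit.response.statusCode).toBe(200);
  });

  return [
    // `auth` is forwarded to the driver; with the default (Node http-based)
    // driver it behaves like http.request()'s basic-auth option.
    {url: 'https://www.example.com/private/', options: {auth: 'user:secret'}}
  ];
});
```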
7 changes: 7 additions & 0 deletions docs/example/package.json
@@ -0,0 +1,7 @@
+{
+  "name": "lastcall-nightcrawler-examples",
+  "dependencies": {
+    "expect": "^25.1.0",
+    "lastcall-nightcrawler": "^2.0.0"
+  }
+}
9 changes: 5 additions & 4 deletions package.json
@@ -7,10 +7,10 @@
     "chalk": "^2.0.0",
     "debug": "^3.0.0",
     "indent-string": "^4.0.0",
+    "minimist": "^1.2.5",
     "strip-ansi": "^6.0.0",
     "wrap-ansi": "^6.0.0",
-    "xml": "^1.0.0",
-    "yargs": "^15.0.0"
+    "xml": "^1.0.0"
   },
   "bin": {
     "nightcrawler": "bin/nightcrawler"
@@ -24,6 +24,7 @@
     "@babel/preset-typescript": "^7.8.3",
     "@types/debug": "^4.1.5",
     "@types/jest": "^25.1.2",
+    "@types/minimist": "^1.2.0",
     "@types/node": "^13.7.2",
     "@types/tmp": "^0.1.0",
     "@types/wrap-ansi": "^3.0.0",
@@ -43,8 +44,8 @@
   "scripts": {
     "prettier": "prettier --single-quote --write './src/**/*.ts'",
     "test": "jest",
-    "check-types": "tsc",
-    "build": "babel src/ --out-dir=dist --extensions '.ts' --ignore 'src/**/__tests__/**'",
+    "check-types": "tsc --noEmit",
+    "build": "babel src/ --out-dir=dist --extensions '.ts' --ignore 'src/**/__tests__/**' --ignore 'src/**/__mocks__/**' --ignore 'src/**/__stubs__/**' && tsc --emitDeclarationOnly",
     "lint": "eslint ./src/ --ext .js,.jsx,.ts,.tsx"
   },
   "files": [