Skip to content

Commit

Permalink
Add blockNavigation option (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
s0ph1e authored May 1, 2020
1 parent 686aebb commit 22b2b14
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 28 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ scrape({
```
Puppeteer plugin constructor accepts the following params:
* `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/GoogleChrome/puppeteer/blob/v1.20.0/docs/api.md#puppeteerlaunchoptions)
* `scrollToBottom` - *(optional)* - in some cases, the page needs to be scrolled down to render its assets (lazyloading). Because some pages can be really endless, the scrolldown process can be interrupted before reaching the bottom when one or both of the bellow limitations are reached :
* `scrollToBottom` - *(optional)* - in some cases, the page needs to be scrolled down to render its assets (lazy loading). Because some pages can be effectively endless, the scroll-down process can be interrupted before reaching the bottom when one or both of the limits below are reached:
* `timeout` - in milliseconds
* `viewportN` - viewport height multiplier
* `blockNavigation` - *(optional)* - defines whether navigation away from the page is permitted or not. If it is set to true, then the page is locked to the current url and redirects with `location.replace(anotherPage)` will not pass. Defaults to `false`

## How it works
It starts Chromium in headless mode, which simply opens the page and waits until the page is loaded.
Expand Down
33 changes: 30 additions & 3 deletions index.js → lib/index.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
const puppeteer = require('puppeteer');
const logger = require('./logger.js');

class PuppeteerPlugin {
constructor({
launchOptions = {},
scrollToBottom = null
scrollToBottom = null,
blockNavigation = false
} = {}) {
this.launchOptions = launchOptions;
this.scrollToBottom = scrollToBottom;
this.blockNavigation = blockNavigation;
this.browser = null;
this.headers = {};

logger.info('init plugin', { launchOptions, scrollToBottom, blockNavigation });
}

apply(registerAction) {
Expand All @@ -28,19 +33,26 @@ class PuppeteerPlugin {
const isHtml = contentType && contentType.split(';')[0] === 'text/html';
if (isHtml) {
const url = response.request.href;

const page = await this.browser.newPage();

if (hasValues(this.headers)) {
logger.info('set headers to puppeteer page', this.headers);
await page.setExtraHTTPHeaders(this.headers);
}

if (this.blockNavigation) {
await blockNavigation(page, url);
}

await page.goto(url);

if(this.scrollToBottom) {
if (this.scrollToBottom) {
await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN);
}

const content = await page.content();
await page.close();

// convert utf-8 -> binary string because website-scraper needs binary
return Buffer.from(content).toString('binary');
} else {
Expand All @@ -58,6 +70,8 @@ function hasValues(obj) {


async function scrollToBottom(page, timeout, viewportN) {
logger.info(`scroll puppeteer page to bottom ${viewportN} times with timeout = ${timeout}`);

await page.evaluate(async (timeout, viewportN) => {
await new Promise((resolve, reject) => {
let totalHeight = 0, distance = 200, duration = 0, maxHeight = window.innerHeight * viewportN;
Expand All @@ -74,4 +88,17 @@ async function scrollToBottom(page, timeout, viewportN) {
}, timeout, viewportN);
}

/**
 * Locks a puppeteer page to a single url: every main-frame navigation request
 * whose target differs from that url is aborted, all other requests continue.
 *
 * The 'request' handler is attached first and request interception is enabled
 * afterwards, so no intercepted request is ever left without a handler.
 *
 * @param {Object} page - puppeteer page; must provide `on`, `mainFrame` and `setRequestInterception`
 * @param {string} url - the only url the main frame is allowed to navigate to
 * @returns {Promise<void>} resolves once request interception has been enabled
 */
async function blockNavigation(page, url) {
	logger.info(`block navigation for puppeteer page from url ${url}`);

	// The browser reports request urls in serialized form (lower-cased host,
	// explicit "/" path, no fragment). Accept that form as well as the raw string,
	// otherwise the page's own initial navigation could be aborted.
	const allowedUrl = normalizeNavigationUrl(url);

	page.on('request', req => {
		const requestedUrl = req.url();
		const leavesAllowedUrl = requestedUrl !== url && requestedUrl !== allowedUrl;

		if (req.isNavigationRequest() && req.frame() === page.mainFrame() && leavesAllowedUrl) {
			req.abort('aborted');
		} else {
			req.continue();
		}
	});
	await page.setRequestInterception(true);
}

/**
 * Returns the serialized form of a url with its fragment removed.
 * Falls back to the input unchanged when it cannot be parsed (or when no global
 * `URL` constructor is available), which keeps the plain string comparison.
 *
 * @param {string} url - url to normalize
 * @returns {string} normalized url, or the original value if it cannot be parsed
 */
function normalizeNavigationUrl(url) {
	try {
		const parsed = new URL(url);
		parsed.hash = '';
		return parsed.href;
	} catch (err) {
		return url;
	}
}

module.exports = PuppeteerPlugin;
11 changes: 11 additions & 0 deletions lib/logger.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
const debug = require('debug');

// Prefix shared by every namespace passed to `debug` in this module.
const appName = 'website-scraper-puppeteer';
// Each level becomes one method on the exported logger object.
const logLevels = ['error', 'warn', 'info', 'debug', 'log'];

// logger.<level> holds whatever `debug` returns for the "<appName>:<level>" namespace.
const logger = {};
for (const level of logLevels) {
	const namespace = [appName, level].join(':');
	logger[level] = debug(namespace);
}

module.exports = logger;
6 changes: 3 additions & 3 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"version": "0.1.4",
"description": "Plugin for website-scraper which returns html for dynamic websites using puppeteer",
"readmeFilename": "README.md",
"main": "index.js",
"main": "lib/index.js",
"keywords": [
"website-scraper",
"puppeteer",
Expand All @@ -13,6 +13,7 @@
"html"
],
"dependencies": {
"debug": "^4.1.1",
"puppeteer": "^2.0.0"
},
"peerDependencies": {
Expand Down Expand Up @@ -40,6 +41,6 @@
},
"homepage": "https://github.com/website-scraper/website-scraper-puppeteer#readme",
"files": [
"index.js"
"lib"
]
}
19 changes: 19 additions & 0 deletions test/mock/navigation.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test</title>
</head>
<body>

<div id="root"></div>

<script>
window.onload = function() {
window.location.replace('http://example.com');
document.getElementById('root').innerText = 'Navigation blocked!';
};
</script>

</body>
</html>
65 changes: 46 additions & 19 deletions test/puppeteer-plugin.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,38 +4,65 @@ const finalhandler = require('finalhandler');
const serveStatic = require('serve-static');
const fs = require('fs-extra');
const scrape = require('website-scraper');
const PuppeteerPlugin = require('../index');
const PuppeteerPlugin = require('../lib');

const directory = __dirname + '/tmp';
const SERVE_WEBSITE_PORT = 4567;

describe('Puppeteer plugin test', () => {
let result, content;

before('serve website', () => serveWebsite(4567));
before('scrape website', async () => {
result = await scrape({
urls: ['http://localhost:4567'],
directory: directory,
plugins: [ new PuppeteerPlugin() ]
before('serve website', () => serveWebsite(SERVE_WEBSITE_PORT));

describe('Dynamic content', () => {
before('scrape website', async () => {
result = await scrape({
urls: [`http://localhost:${SERVE_WEBSITE_PORT}`],
directory: directory,
plugins: [ new PuppeteerPlugin() ]
});
});
});
before('get content from file', () => {
content = fs.readFileSync(`${directory}/${result[0].filename}`).toString();
});
before('get content from file', () => {
content = fs.readFileSync(`${directory}/${result[0].filename}`).toString();
});
after('delete dir', () => fs.removeSync(directory));

after('delete dir', () => fs.removeSync(directory));
it('should have 1 item in result array', () => {
expect(result.length).eql(1);
});

it('should have 1 item in result array', () => {
expect(result.length).eql(1);
});
it('should render dymanic website', async () => {
expect(content).to.contain('<div id="root">Hello world from JS!</div>');
});

it('should render dymanic website', async () => {
expect(content).to.contain('<div id="root">Hello world from JS!</div>');
it('should render special characters correctly', async () => {
expect(content).to.contain('<div id="special-characters-test">저는 7년 동안 한국에서 살았어요. Слава Україні!</div>');
});
});

it('should render special characters correctly', async () => {
expect(content).to.contain('<div id="special-characters-test">저는 7년 동안 한국에서 살았어요. Слава Україні!</div>');
describe('Block navigation', () => {
before('scrape website', async () => {
result = await scrape({
urls: [`http://localhost:${SERVE_WEBSITE_PORT}/navigation.html`],
directory: directory,
plugins: [
new PuppeteerPlugin({
blockNavigation: true
})
]
});
});
before('get content from file', () => {
content = fs.readFileSync(`${directory}/${result[0].filename}`).toString();
});
after('delete dir', () => fs.removeSync(directory));

it('should render content (and not be redirected)', async () => {
expect(content).to.contain('<div id="root">Navigation blocked!</div>');
});
});


});

function serveWebsite(port = 3000) {
Expand Down

0 comments on commit 22b2b14

Please sign in to comment.