From e56cc8cd0986087095d9160e79326526963ccb21 Mon Sep 17 00:00:00 2001 From: mei23 Date: Sun, 17 Mar 2024 23:29:36 +0900 Subject: [PATCH] =?UTF-8?q?Use=20range=20=E3=81=AA=E3=81=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- built/general.d.ts | 2 +- built/general.js | 4 ++-- built/index.d.ts | 4 ++-- built/index.js | 3 +-- built/server/index.js | 10 ++-------- built/server/load-config.d.ts | 1 + built/utils/got.d.ts | 1 + built/utils/got.js | 33 +++++++++++++++++++++++++++------ server_config.example.yml | 3 +++ src/general.ts | 4 ++-- src/index.ts | 7 +++---- src/server/index.ts | 8 ++------ src/server/load-config.ts | 1 + src/utils/got.ts | 35 +++++++++++++++++++++++++++++------ 14 files changed, 77 insertions(+), 39 deletions(-) diff --git a/built/general.d.ts b/built/general.d.ts index 978fcf36..19bc4f8d 100644 --- a/built/general.d.ts +++ b/built/general.d.ts @@ -1,3 +1,3 @@ import { SummalyEx } from './summaly'; -declare const _default: (url: URL, lang?: string | null) => Promise; +declare const _default: (url: URL, lang?: string | null, useRange?: boolean) => Promise; export default _default; diff --git a/built/general.js b/built/general.js index 18c9c092..ba1f4220 100644 --- a/built/general.js +++ b/built/general.js @@ -13,11 +13,11 @@ const cleanup_title_1 = require("./utils/cleanup-title"); const decode_entities_1 = require("./utils/decode-entities"); const got_1 = require("./utils/got"); const cleanup_url_1 = require("./utils/cleanup-url"); -exports.default = (url, lang = null) => __awaiter(void 0, void 0, void 0, function* () { +exports.default = (url, lang = null, useRange = false) => __awaiter(void 0, void 0, void 0, function* () { var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o, _p, _q, _r, _s, _t, _u, _v, _w, _x, _y, _z, _0, _1, _2, _3, _4, _5, _6; if (lang && !lang.match(/^[\w-]+(\s*,\s*[\w-]+)*$/)) lang = null; - const res = yield (0, got_1.scpaping)(url.href, { lang: lang || undefined }); + const res = yield (0, got_1.scpaping)(url.href, { lang: lang || undefined, useRange }); const $ = res.$; const landingUrl = new URL(res.response.url); const twitterCard = (_a = $('meta[name="twitter:card"]').attr('content')) !== null && _a !== void 0 ? _a : $('meta[property="twitter:card"]').attr('content'); diff --git a/built/index.d.ts b/built/index.d.ts index 3ab10933..ca50eb83 100644 --- a/built/index.d.ts +++ b/built/index.d.ts @@ -8,9 +8,9 @@ type RequestOptions = { */ lang?: string | null; /** - * Whether follow redirects + * Use range for the request */ - followRedirects?: boolean; + useRange?: boolean; }; export declare class Summary { private plugins; diff --git a/built/index.js b/built/index.js index 3ea7e905..8f835fba 100644 --- a/built/index.js +++ b/built/index.js @@ -31,7 +31,6 @@ class Summary { return __awaiter(this, void 0, void 0, function* () { const opts = Object.assign({ lang: null, - followRedirects: true, }, requestOptions); const _url = new URL(url); // pre @@ -50,7 +49,7 @@ class Summary { return summary; } else { - let summary = yield (0, general_1.default)(_url, opts.lang); + let summary = yield (0, general_1.default)(_url, opts.lang, opts.useRange); if (summary == null) throw 'failed summarize'; const landingUrl = summary.url; diff --git a/built/server/index.js b/built/server/index.js index c09a12d4..99b4ae67 100644 --- a/built/server/index.js +++ b/built/server/index.js @@ -14,7 +14,6 @@ const load_config_1 = require("./load-config"); const h3 = require("h3"); const http_1 = require("http"); const h3_typebox_1 = require("h3-typebox"); -const status_error_1 = require("../utils/status-error"); const config = (0, load_config_1.default)(); const summaryInstance = new __1.Summary({ allowedPlugins: config.allowedPlugins @@ -30,19 +29,14 @@ router.get('/url', h3.eventHandler((event) => __awaiter(void 0, void 0, void 0, try { const summary = yield summaryInstance.summary(query.url, { lang: query.lang, - followRedirects: false, + useRange: config.useRange, }); h3.setResponseHeader(event, 'Cache-Control', 'public, max-age=604800'); return summary; } catch (e) { console.log(`summaly error: ${e} ${query.url}`); - if (e instanceof status_error_1.StatusError && e.isPermanentError) { - h3.setResponseStatus(event, 400); - } - else { - h3.setResponseStatus(event, 500); - } + h3.setResponseStatus(event, 422); h3.setResponseHeader(event, 'Content-Type', 'text/plain'); h3.setResponseHeader(event, 'Cache-Control', 'public, max-age=3600'); return 'error'; diff --git a/built/server/load-config.d.ts b/built/server/load-config.d.ts index 889cd1e9..73f81f2f 100644 --- a/built/server/load-config.d.ts +++ b/built/server/load-config.d.ts @@ -1,5 +1,6 @@ type Config = { allowedPlugins?: string[]; + useRange?: boolean; }; export default function (): Config; export {}; diff --git a/built/utils/got.d.ts b/built/utils/got.d.ts index 623c999b..3ecf5b99 100644 --- a/built/utils/got.d.ts +++ b/built/utils/got.d.ts @@ -2,6 +2,7 @@ import * as Got from 'got'; import * as cheerio from 'cheerio'; export declare function scpaping(url: string, opts?: { lang?: string; + useRange?: boolean; }): Promise<{ body: string; $: cheerio.CheerioAPI; diff --git a/built/utils/got.js b/built/utils/got.js index 327aacd6..4f0b8236 100644 --- a/built/utils/got.js +++ b/built/utils/got.js @@ -39,6 +39,8 @@ function scpaping(url, opts) { }; if (opts === null || opts === void 0 ? void 0 : opts.lang) headers['accept-language'] = opts.lang; + if (opts === null || opts === void 0 ? void 0 : opts.useRange) + headers['range'] = `bytes=0-${MAX_RESPONSE_SIZE - 1}`; const response = yield getResponse({ url, method: 'GET', @@ -100,7 +102,8 @@ function getResponse(args) { }); req.on('redirect', (res, opts) => { if (!(0, check_allowed_url_1.checkAllowedUrl)(opts.url)) { - req.cancel(`Invalid url: ${opts.url}`); + console.warn(`Invalid url: ${opts.url}`); + req.cancel(); } }); return yield receiveResponce({ req, typeFilter: args.typeFilter }); @@ -111,10 +114,24 @@ function receiveResponce(args) { const req = args.req; const maxSize = MAX_RESPONSE_SIZE; req.on('response', (res) => { - var _a; + var _a, _b; + if (res.statusCode === 206) { + const m = ((_a = res.headers['content-range']) !== null && _a !== void 0 ? _a : '').match(new RegExp(/^bytes\s+0-(\d+)\/(\d+)$/, 'i')); // bytes 0-47254/47255 + if (m == null) { + console.warn(`Invalid content-range '${res.headers['content-range']}'`); + req.cancel(); + return; + } + if (Number(m[1]) + 1 !== Number(m[2])) { + console.warn(`maxSize exceeded by content-range (${m[2]} > ${maxSize}) on response`); + req.cancel(); + return; + } + } // Check html - if (args.typeFilter && !((_a = res.headers['content-type']) === null || _a === void 0 ? void 0 : _a.match(args.typeFilter))) { - req.cancel(`Rejected by type filter ${res.headers['content-type']}`); + if (args.typeFilter && !((_b = res.headers['content-type']) === null || _b === void 0 ? void 0 : _b.match(args.typeFilter))) { + console.warn(`Rejected by type filter ${res.headers['content-type']}`); + req.cancel(); return; } // 応答ヘッダでサイズチェック @@ -122,14 +139,18 @@ function receiveResponce(args) { if (contentLength != null) { const size = Number(contentLength); if (size > maxSize) { - req.cancel(`maxSize exceeded (${size} > ${maxSize}) on response`); + console.warn(`maxSize exceeded by content-length (${size} > ${maxSize}) on response`); + req.cancel(); + return; } } }); // 受信中のデータでサイズチェック req.on('downloadProgress', (progress) => { if (progress.transferred > maxSize && progress.percent !== 1) { - req.cancel(`maxSize exceeded (${progress.transferred} > ${maxSize}) on response`); + console.warn(`maxSize exceeded in transfer (${progress.transferred} > ${maxSize}) on response`); + req.cancel(); + return; } }); // 応答取得 with ステータスコードエラーの整形 diff --git a/server_config.example.yml b/server_config.example.yml index 30532b1a..9f899419 100644 --- a/server_config.example.yml +++ b/server_config.example.yml @@ -7,3 +7,6 @@ allowedPlugins: # - iwara # - komiflo # - dlsite + +# Range付きリクエストを出すか +useRange: false diff --git a/src/general.ts b/src/general.ts index c50bdcc1..a2a3f375 100644 --- a/src/general.ts +++ b/src/general.ts @@ -4,10 +4,10 @@ import { SummalyEx } from './summaly'; import { scpaping } from './utils/got'; import { cleanupUrl } from './utils/cleanup-url'; -export default async (url: URL, lang: string | null = null): Promise => { +export default async (url: URL, lang: string | null = null, useRange = false): Promise => { if (lang && !lang.match(/^[\w-]+(\s*,\s*[\w-]+)*$/)) lang = null; - const res = await scpaping(url.href, { lang: lang || undefined }); + const res = await scpaping(url.href, { lang: lang || undefined, useRange }); const $ = res.$; const landingUrl = new URL(res.response.url); diff --git a/src/index.ts b/src/index.ts index f110e184..e616bea0 100644 --- a/src/index.ts +++ b/src/index.ts @@ -15,9 +15,9 @@ type RequestOptions = { lang?: string | null; /** - * Whether follow redirects + * Use range for the request */ - followRedirects?: boolean; + useRange?: boolean; }; export class Summary { @@ -37,7 +37,6 @@ export class Summary { public async summary(url: string, requestOptions?: RequestOptions): Promise { const opts = Object.assign({ lang: null, - followRedirects: true, }, requestOptions); const _url = new URL(url); @@ -56,7 +55,7 @@ export class Summary { return summary; } else { - let summary = await general(_url, opts.lang); + let summary = await general(_url, opts.lang, opts.useRange); if (summary == null) throw 'failed summarize'; const landingUrl = summary.url; diff --git a/src/server/index.ts b/src/server/index.ts index ebfa436e..efe49c8d 100644 --- a/src/server/index.ts +++ b/src/server/index.ts @@ -25,18 +25,14 @@ router.get('/url', h3.eventHandler(async event => { try { const summary = await summaryInstance.summary(query.url, { lang: query.lang, - followRedirects: false, + useRange: config.useRange, }); h3.setResponseHeader(event, 'Cache-Control', 'public, max-age=604800'); return summary; } catch (e) { console.log(`summaly error: ${e} ${query.url}`); - if (e instanceof StatusError && e.isPermanentError) { - h3.setResponseStatus(event, 400); - } else { - h3.setResponseStatus(event, 500); - } + h3.setResponseStatus(event, 422); h3.setResponseHeader(event, 'Content-Type', 'text/plain'); h3.setResponseHeader(event, 'Cache-Control', 'public, max-age=3600'); return 'error'; diff --git a/src/server/load-config.ts b/src/server/load-config.ts index 31c7736f..07bab5e8 100644 --- a/src/server/load-config.ts +++ b/src/server/load-config.ts @@ -3,6 +3,7 @@ import * as yaml from 'js-yaml'; type Config = { allowedPlugins?: string[]; + useRange?: boolean; }; export default function () { diff --git a/src/utils/got.ts b/src/utils/got.ts index a4e12dee..a36a5a83 100644 --- a/src/utils/got.ts +++ b/src/utils/got.ts @@ -22,15 +22,16 @@ const NOT_BOT_UA = [ 'www.sankei.com', ]; -export async function scpaping(url: string, opts?: { lang?: string; }) { +export async function scpaping(url: string, opts?: { lang?: string; useRange?: boolean }) { const u = new URL(url); const headers = { 'accept': 'text/html, application/xhtml+xml', 'user-agent': NOT_BOT_UA.includes(u.hostname) ? browserUA : BOT_UA, - }; + } as Record; if (opts?.lang) headers['accept-language'] = opts.lang; + if (opts?.useRange) headers['range'] = `bytes=0-${MAX_RESPONSE_SIZE - 1}`; const response = await getResponse({ url, @@ -96,7 +97,8 @@ async function getResponse(args: { url: string, method: 'GET' | 'POST', body?: s req.on('redirect', (res, opts) => { if (!checkAllowedUrl(opts.url)) { - req.cancel(`Invalid url: ${opts.url}`); + console.warn(`Invalid url: ${opts.url}`); + req.cancel(); } }); @@ -108,9 +110,26 @@ async function receiveResponce(args: { req: Got.CancelableRequest { + if (res.statusCode === 206) { + const m = (res.headers['content-range'] ?? '').match(new RegExp(/^bytes\s+0-(\d+)\/(\d+)$/, 'i')); // bytes 0-47254/47255 + + if (m == null) { + console.warn(`Invalid content-range '${res.headers['content-range']}'`); + req.cancel(); + return; + } + + if (Number(m[1]) + 1 !== Number(m[2])) { + console.warn(`maxSize exceeded by content-range (${m[2]} > ${maxSize}) on response`); + req.cancel(); + return; + } + } + // Check html if (args.typeFilter && !res.headers['content-type']?.match(args.typeFilter)) { - req.cancel(`Rejected by type filter ${res.headers['content-type']}`); + console.warn(`Rejected by type filter ${res.headers['content-type']}`); + req.cancel(); return; } @@ -119,7 +138,9 @@ async function receiveResponce(args: { req: Got.CancelableRequest maxSize) { - req.cancel(`maxSize exceeded (${size} > ${maxSize}) on response`); + console.warn(`maxSize exceeded by content-length (${size} > ${maxSize}) on response`); + req.cancel(); + return; } } }); @@ -127,7 +148,9 @@ async function receiveResponce(args: { req: Got.CancelableRequest { if (progress.transferred > maxSize && progress.percent !== 1) { - req.cancel(`maxSize exceeded (${progress.transferred} > ${maxSize}) on response`); + console.warn(`maxSize exceeded in transfer (${progress.transferred} > ${maxSize}) on response`); + req.cancel(); + return; } });