Skip to content

Commit

Permalink
Replace jsdom with faster parser
Browse files: browse the repository at this point in the history
  • Loading branch information
TimDaub committed Oct 24, 2024
1 parent 45146eb commit 6d3efbb
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 139 deletions.
188 changes: 55 additions & 133 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,14 @@
"it-all": "2.0.0",
"it-length-prefixed": "8.0.4",
"it-map": "2.0.0",
"jsdom": "24.0.0",
"libp2p": "0.40.0",
"linkify-string": "4.1.3",
"lmdb": "2.7.9",
"make-asynchronous": "1.0.1",
"morgan": "1.10.0",
"node-cache": "5.1.2",
"node-fetch-cache": "3.1.2",
"node-html-parser": "6.1.13",
"nodemon": "2.0.21",
"normalize-url": "8.0.0",
"open-graph-scraper-lite": "2.0.0",
Expand Down
10 changes: 5 additions & 5 deletions src/parser.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import DOMPurify from "isomorphic-dompurify";
import ogs from "open-graph-scraper-lite";
import htm from "htm";
import vhtml from "vhtml";
import { JSDOM } from "jsdom";
import { parse as parser } from "node-html-parser";
import { fetchBuilder, FileSystemCache } from "node-fetch-cache";
import { useAgent } from "request-filtering-agent";

Expand All @@ -31,15 +31,15 @@ const filtered = [
];

async function extractCanonicalLink(html) {
const dom = new JSDOM(html);
const node = dom.window.document.querySelector('link[rel="canonical"]');
const dom = parser(html);
const node = dom.querySelector('link[rel="canonical"]');
if (!node) return;

let response;
try {
const signal = AbortSignal.timeout(5000);
response = await fetch(node.href, {
agent: useAgent(node.href),
response = await fetch(node._attrs.href, {
agent: useAgent(node._attrs.href),
signal,
});
} catch (err) {
Expand Down

0 comments on commit 6d3efbb

Please sign in to comment.