Skip to content

Commit

Permalink
feat: crawl URLs in <meta> tags (#9900)
Browse files Browse the repository at this point in the history
* Crawl social-image urls during prerender

* Formatting & Linting

* Format changeset & added exhaustive list of crawlable urls

* Changed severity to minor as described in #5228

* Added support for `property` attribute & limited valid names to just social tags

* More tests

* Better changeset message - I'm indecisive

* Update .changeset/thirty-garlics-tan.md

Co-authored-by: Ben McCann <322311+benmccann@users.noreply.github.com>

* simplify

* simplify

* Removed redundant data sanitization

* DRY out

---------

Co-authored-by: Ben McCann <322311+benmccann@users.noreply.github.com>
Co-authored-by: Rich Harris <git@rich-harris.dev>
  • Loading branch information
3 people authored May 17, 2023
1 parent 348029b commit ab9f577
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 43 deletions.
5 changes: 5 additions & 0 deletions .changeset/thirty-garlics-tan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@sveltejs/kit': minor
---

feat: crawl URLs in `<meta>` tags
111 changes: 68 additions & 43 deletions packages/kit/src/core/postbuild/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@ const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/;

const WHITESPACE = /[\s\n\r]/;

/**
 * Values of a `<meta>` tag's `name` (or, when `name` is absent, `property`)
 * attribute for which the tag's `content` is resolved against the current
 * base and added to the crawled hrefs.
 *
 * Covers `og:url`, the plain / `:url` / `:secure_url` variants of the
 * `og:image`, `og:video` and `og:audio` families, and `twitter:image`.
 */
const CRAWLABLE_META_NAME_ATTRS = new Set([
	'og:url',
	// each media kind contributes its three URL-bearing variants, in order
	...['image', 'video', 'audio'].flatMap((kind) => [
		`og:${kind}`,
		`og:${kind}:url`,
		`og:${kind}:secure_url`
	]),
	'twitter:image'
]);

/**
* @param {string} html
* @param {string} base
Expand Down Expand Up @@ -81,6 +95,9 @@ export function crawl(html, base) {

const tag = html.slice(start, i).toUpperCase();

/** @type {Record<string, string>} */
const attributes = {};

if (tag === 'SCRIPT' || tag === 'STYLE') {
while (i < html.length) {
if (
Expand All @@ -95,9 +112,6 @@ export function crawl(html, base) {
}
}

let href = '';
let rel = '';

while (i < html.length) {
const start = i;

Expand Down Expand Up @@ -159,44 +173,7 @@ export function crawl(html, base) {
}

value = decode(value);

if (name === 'href') {
if (tag === 'BASE') {
base = resolve(base, value);
} else {
href = resolve(base, value);
}
} else if (name === 'id') {
ids.push(value);
} else if (name === 'name') {
if (tag === 'A') ids.push(value);
} else if (name === 'rel') {
rel = value;
} else if (name === 'src') {
if (value) hrefs.push(resolve(base, value));
} else if (name === 'srcset') {
const candidates = [];
let insideURL = true;
value = value.trim();
for (let i = 0; i < value.length; i++) {
if (
value[i] === ',' &&
(!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))
) {
candidates.push(value.slice(0, i));
value = value.substring(i + 1).trim();
i = 0;
insideURL = true;
} else if (WHITESPACE.test(value[i])) {
insideURL = false;
}
}
candidates.push(value);
for (const candidate of candidates) {
const src = candidate.split(WHITESPACE)[0];
if (src) hrefs.push(resolve(base, src));
}
}
attributes[name] = value;
} else {
i -= 1;
}
Expand All @@ -205,8 +182,56 @@ export function crawl(html, base) {
i += 1;
}

if (href && !/\bexternal\b/i.test(rel)) {
hrefs.push(resolve(base, href));
const { href, id, name, property, rel, src, srcset, content } = attributes;

if (href) {
if (tag === 'BASE') {
base = resolve(base, href);
} else if (!rel || !/\bexternal\b/i.test(rel)) {
hrefs.push(resolve(base, href));
}
}

if (id) {
ids.push(id);
}

if (name && tag === 'A') {
ids.push(name);
}

if (src) {
hrefs.push(resolve(base, src));
}

if (srcset) {
let value = srcset;
const candidates = [];
let insideURL = true;
value = value.trim();
for (let i = 0; i < value.length; i++) {
if (value[i] === ',' && (!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))) {
candidates.push(value.slice(0, i));
value = value.substring(i + 1).trim();
i = 0;
insideURL = true;
} else if (WHITESPACE.test(value[i])) {
insideURL = false;
}
}
candidates.push(value);
for (const candidate of candidates) {
const src = candidate.split(WHITESPACE)[0];
if (src) hrefs.push(resolve(base, src));
}
}

if (tag === 'META' && content) {
const attr = name ?? property;

if (attr && CRAWLABLE_META_NAME_ATTRS.has(attr)) {
hrefs.push(resolve(base, content));
}
}
}
}
Expand Down
14 changes: 14 additions & 0 deletions packages/kit/src/core/postbuild/fixtures/meta/input.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="description" content="This is a description" />

<!--Only these should get crawled-->
<meta content="https://external.com" name="twitter:image" />
<meta name="og:image" content="/og-image.jpg" />
<meta property="og:audio" content="https://example.com/audio.mp3" />
<meta content="/video.mp4" property="og:video"/>
</head>
<body></body>
</html>
4 changes: 4 additions & 0 deletions packages/kit/src/core/postbuild/fixtures/meta/output.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"hrefs": ["https://external.com", "/og-image.jpg", "https://example.com/audio.mp3", "/video.mp4"],
"ids": []
}

0 comments on commit ab9f577

Please sign in to comment.