-
Notifications
You must be signed in to change notification settings - Fork 508
/
broken-links.js
364 lines (343 loc) · 12.1 KB
/
broken-links.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
const fs = require("fs");
const path = require("path");
const fromMarkdown = require("mdast-util-from-markdown");
const visit = require("unist-util-visit");
const { Document, Redirect, Image } = require("../../content");
const { FLAW_LEVELS } = require("../../libs/constants");
const { findMatchesInText } = require("../matches-in-text");
const { DEFAULT_LOCALE, VALID_LOCALES } = require("../../libs/constants");
// Directory of this module; `safe-to-https-domains.json` is read from it.
const dirname = __dirname;
/**
 * Locate every Markdown link whose URL is exactly `href`.
 *
 * The raw Markdown is parsed with `fromMarkdown()` and every `link` node of
 * the resulting tree is compared against `href`.
 *
 * @param {string} rawContent - Markdown source to search.
 * @param {string} href - Link target to look for; compared as-is, without
 *   any normalization.
 * @returns {Array<{line: *, column: *}>} The `position.start` line/column of
 *   each matching link, in the order the visitor reports them. Empty when
 *   nothing matches.
 */
function findMatchesInMarkdown(rawContent, href) {
  const matches = [];
  visit(fromMarkdown(rawContent), "link", (node) => {
    // Strict equality so that a non-string `href` can never match a link
    // URL through type coercion.
    if (node.url === href) {
      const { line, column } = node.position.start;
      matches.push({ line, column });
    }
  });
  return matches;
}
// Lazily filled cache: domain name -> the value stored for that domain in
// `safe-to-https-domains.json`.
const _safeToHttpsDomains = new Map();

/**
 * Return the domain map backed by `safe-to-https-domains.json`.
 *
 * While the cache is empty, the JSON file next to this module is read
 * synchronously and each of its top-level entries is copied into the cache.
 * Once the cache holds at least one entry, later calls return it without
 * touching the disk.
 *
 * @returns {Map<string, *>} The shared (module-level) cache instance.
 */
function getSafeToHttpDomains() {
  if (_safeToHttpsDomains.size === 0) {
    const jsonPath = path.join(dirname, "safe-to-https-domains.json");
    const parsed = JSON.parse(fs.readFileSync(jsonPath, "utf-8"));
    for (const [domain, flag] of Object.entries(parsed)) {
      _safeToHttpsDomains.set(domain, flag);
    }
  }
  return _safeToHttpsDomains;
}
/**
 * Tell whether `url` points at a homepage: `/` itself, or a single locale
 * segment such as `/en-US` or `/fr/`.
 *
 * The locale segment is looked up in `VALID_LOCALES` in lowercase, so the
 * check is case-insensitive. Only root-relative paths qualify: a relative
 * path like `docs/fr` is NOT a homepage even though its second segment is
 * a locale.
 *
 * @param {string} url - Path portion of a link (no hash).
 * @returns {boolean} `true` when `url` is a homepage URL.
 */
function isHomepageURL(url) {
  if (url === "/") {
    return true;
  }
  // Normalize to a trailing slash so `/fr` and `/fr/` split the same way.
  const withTrailingSlash = url.endsWith("/") ? url : `${url}/`;
  const segments = withTrailingSlash.split("/");
  // Expect exactly ["", "<locale>", ""]. The empty first segment proves the
  // path started with "/"; without that check `foo/fr` would be accepted.
  return (
    segments.length === 3 &&
    segments[0] === "" &&
    VALID_LOCALES.has(segments[1].toLowerCase())
  );
}
/**
 * Rewrite a link element in place according to what is known about its
 * target. Exactly one of these applies, in priority order:
 *
 *  1. `isSelfLink`   - mark the link with `aria-current="page"`.
 *  2. `suggestion`   - point `href` at the suggested URL.
 *  3. `enUSFallback` - point `href` at the fallback URL, append the default
 *                      locale to the link text, and flag it as English-only.
 *  4. none of them   - flag the link as pointing to an unwritten page.
 *
 * @param {object} $element - Element wrapper exposing `attr()`, `text()`
 *   and `addClass()`.
 * @param {object} [options]
 * @param {?string} [options.suggestion] - Replacement `href`, if any.
 * @param {?string} [options.enUSFallback] - Default-locale URL to fall back to.
 * @param {boolean} [options.isSelfLink] - Whether the link targets its own page.
 */
function mutateLink(
  $element,
  { suggestion, enUSFallback, isSelfLink } = {
    suggestion: null,
    enUSFallback: null,
    isSelfLink: false,
  }
) {
  if (isSelfLink) {
    $element.attr("aria-current", "page");
    return;
  }

  const replacementHref = suggestion || enUSFallback;
  if (!replacementHref) {
    $element.addClass("page-not-created");
    $element.attr("title", "This is a link to an unwritten page");
    return;
  }

  $element.attr("href", replacementHref);
  if (suggestion) {
    return;
  }

  // Fallback to the default locale. This should stay in line with what
  // the `web.smartLink()` function does in kumascript rendering.
  const labelled = `${$element.text()} (${DEFAULT_LOCALE})`;
  $element.text(labelled);
  $element.addClass("only-in-en-us");
  $element.attr("title", "Currently only available in English (US)");
}
/**
 * The 'broken_links' flaw check.
 *
 * Walks every `a[href]` element and looks for internal links that point to a
 * document that's going to fail with a 404 Not Found. On the way it also
 * flags a few related problems:
 *
 *  - `http://` links whose domain is marked as safe to switch to `https://`
 *  - absolute `https://developer.mozilla.org/...` links
 *  - misspelled homepage links (locale case, missing trailing slash)
 *  - links to the page the link is on
 *  - links that resolve only via a redirect or with different letter case
 *  - anchors (`#...`) that aren't lowercase
 *
 * Every flagged link element is mutated through `mutateLink()`, even when
 * `level` is `FLAW_LEVELS.IGNORE`; in that case no flaw objects are collected.
 *
 * @param {object} doc - The document being checked. `doc.isMarkdown`,
 *   `doc.mdn_url` and `doc.locale` are read.
 * @param {Function} $ - Selector function over the rendered HTML; called as
 *   `$("a[href]")` and `$(element)`. Presumably a cheerio instance - confirm
 *   against the caller.
 * @param {object} source
 * @param {string} source.rawContent - The original source of the document,
 *   searched to find where each flagged `href` was written.
 * @param {*} level - Flaw level; compared against `FLAW_LEVELS.IGNORE`.
 * @returns {Array<object>} One object per recorded flaw, made of
 *   `{ explanation, id, href, suggestion, fixable }` merged with the matching
 *   source position. Always empty when `level` is `FLAW_LEVELS.IGNORE`.
 */
function getBrokenLinksFlaws(doc, $, { rawContent }, level) {
const flaws = [];
// This is needed because the same href can occur multiple times.
// For example:
// <a href="/foo/bar">
// <a href="/foo/other">
// <a href="/foo/bar"> (again!)
// In this case, when we call `addBrokenLink()` that third time, we know
// this refers to the second time it appears. That's important for the
// sake of finding which match, in the original source (rawContent),
// it belongs to.
// Maps each (decoded) href to the zero-based count of how many times it
// has been seen before the current link.
const checked = new Map();
// Our cache for looking things up by `href`. This basically protects
// us from searching `rawContent` more than once per distinct `href`.
const matches = new Map();
// A closure function to make it easier to append flaws.
// `index` is the zero-based occurrence number of `href` within the document.
function addBrokenLink(
$element,
index,
href,
suggestion = null,
explanation = null,
enUSFallback = null,
isSelfLink = false
) {
mutateLink($element, { suggestion, enUSFallback, isSelfLink });
if (level === FLAW_LEVELS.IGNORE) {
// Note: even when we're not interested in flaws, the link has already
// been mutated above. For example, production builds don't care about
// logging flaws, but not all `broken_links` flaws have been manually
// fixed at the source, so the suggestion still has to be applied.
return;
}
explanation = explanation || `Can't resolve ${href}`;
if (!matches.has(href)) {
// findMatchesInText() is a generator function, so `Array.from()` is used
// to turn its result into an array that can be cached and indexed.
matches.set(
href,
doc.isMarkdown
? findMatchesInMarkdown(rawContent, href)
: Array.from(
findMatchesInText(href, rawContent, {
attribute: "href",
})
)
);
}
// Use `.forEach()` because it gives us an `i` for every loop; only the
// match whose position equals `index` produces a flaw. If the source
// search found fewer matches than `index + 1`, no flaw is recorded (the
// element has still been mutated above).
matches.get(href).forEach((match, i) => {
if (i !== index) {
return;
}
const id = `link${flaws.length + 1}`;
// A flaw is only fixable when there's a concrete suggestion to apply.
const fixable = !!suggestion;
$element.attr("data-flaw", id);
flaws.push(
Object.assign({ explanation, id, href, suggestion, fixable }, match)
);
});
}
$("a[href]").each((i, element) => {
const a = $(element);
let href = a.attr("href");
try {
// When Markdown turns into HTML it will encode the `href` values in the
// links. To be able to treat it as if it was from its raw value,
// we first decode it. That way we can find out how it was originally
// written in the `index.md` file, for example.
// But not all URLs can be applied with `decodeURI`. For example:
// https://www.ecma-international.org/ecma-262/6.0/#sec-get-%typedarray%.prototype.buffer
// can't be decoded in Node.
// So that's why we do this decoding very defensively.
href = decodeURI(href);
} catch (error) {
console.warn(`Unable to decodeURI '${href}'. Will proceed without.`);
}
// This gives us insight into how many times this exact `href`
// has been encountered in the doc.
// Then, when we call addBrokenLink() we can include an index so that
// that function knows which match it's referring to.
checked.set(href, checked.has(href) ? checked.get(href) + 1 : 0);
// Note, a lot of links are like this:
// <a href="/docs/Learn/Front-end_web_developer">
// which means the author wanted the link to work in any language.
// When checking it against disk, we'll have to assume a locale.
const hrefSplit = href.split("#");
// `hrefNormalized` is the href without its hash and, for `/docs/...` links,
// prefixed with the locale taken from this document's own URL.
let hrefNormalized = hrefSplit[0];
if (hrefNormalized.startsWith("/docs/")) {
const thisDocumentLocale = doc.mdn_url.split("/")[1];
hrefNormalized = `/${thisDocumentLocale}${hrefNormalized}`;
}
if (
hrefNormalized.endsWith("/contributors.txt") &&
hrefNormalized.startsWith("/") &&
!href.startsWith("//")
) {
// Do nothing. The /contributors.txt URLs are special Yari URLs.
return;
}
if (href.startsWith("http://")) {
let domain = null;
try {
domain = new URL(href).hostname;
} catch (err) {
// The URL constructor rejected the href, so it can't be a usable link.
return addBrokenLink(
a,
checked.get(href),
href,
null,
"Not a valid link URL"
);
}
// If a URL's domain is in the list that getSafeToHttpDomains() provides,
// that means we've tested that you can turn that into a HTTPS link
// simply by replacing the `http://` for `https://`.
// Using `.get(domain)` is smart because if the domain isn't known you
// get `undefined` otherwise you get `true` or `false`. And we're only
// interested in the `true`.
if (getSafeToHttpDomains().get(domain)) {
addBrokenLink(
a,
checked.get(href),
href,
href.replace("http://", "https://"),
"Is currently http:// but can become https://"
);
}
// Note! If it's not known that the URL's domain can be turned into https://
// we do nothing here. No flaw. It's unfortunate that we still have http://
// links in our content but that's a reality of MDN being 15+ years old.
} else if (href.startsWith("https://developer.mozilla.org/")) {
// It might be a working 200 OK link but the link just shouldn't
// have the full absolute URL part in it.
const absoluteURL = new URL(href);
addBrokenLink(
a,
checked.get(href),
href,
absoluteURL.pathname + absoluteURL.search + absoluteURL.hash
);
} else if (isHomepageURL(hrefNormalized)) {
// But did you spell it perfectly?
// A flaw is raised when the locale's letter case differs from the value
// stored in VALID_LOCALES, or when the trailing slash is missing.
const homepageLocale = hrefNormalized.split("/")[1];
if (
hrefNormalized !== "/" &&
(VALID_LOCALES.get(homepageLocale.toLowerCase()) !== homepageLocale ||
!hrefNormalized.endsWith("/"))
) {
addBrokenLink(
a,
checked.get(href),
href,
`/${VALID_LOCALES.get(homepageLocale.toLowerCase())}/`
);
}
} else if (hrefNormalized.toLowerCase() === doc.mdn_url.toLowerCase()) {
// The link points (case-insensitively) at the very page it is on.
if (hrefSplit.length > 1) {
addBrokenLink(
a,
checked.get(href),
href,
`#${hrefSplit[1]}`,
"No need for the pathname in anchor links if it's the same page",
null,
true
);
} else {
addBrokenLink(
a,
checked.get(href),
href,
null,
"Link points to the page it's already on",
null,
true
);
}
} else if (href.startsWith("/") && !href.startsWith("//")) {
// Got to fake the domain to sensibly extract the .search and .hash
const absoluteURL = new URL(href, "http://www.example.com");
const found = Document.findByURL(hrefNormalized);
if (!found) {
// Before we give up, check if it's an image.
if (!Image.findByURLWithFallback(hrefNormalized)) {
// Even if it's a redirect, it's still a flaw, but it'll be nice to
// know what it *should* be.
const resolved = Redirect.resolve(hrefNormalized);
if (resolved !== hrefNormalized) {
addBrokenLink(
a,
checked.get(href),
href,
resolved + absoluteURL.search + absoluteURL.hash.toLowerCase()
);
} else {
let enUSFallbackURL = null;
// Test if the document is a translated document and the link isn't
// to an en-US URL. We know the link is broken (in this locale!)
// but it might be "salvageable" if we link the en-US equivalent.
// This is, by the way, the same trick the `web.smartLink()` utility
// function does in kumascript rendering.
if (
doc.locale !== DEFAULT_LOCALE &&
href.startsWith(`/${doc.locale}/`)
) {
// What if you switch to the English link; would the link work
// better then?
const enUSHrefNormalized = hrefNormalized.replace(
`/${doc.locale}/`,
`/${DEFAULT_LOCALE}/`
);
let enUSFound = Document.findByURL(enUSHrefNormalized);
if (enUSFound) {
enUSFallbackURL = enUSFound.url;
} else {
// No en-US document at that URL; see if a redirect leads somewhere.
const enUSResolved = Redirect.resolve(enUSHrefNormalized);
if (enUSResolved !== enUSHrefNormalized) {
enUSFallbackURL =
enUSResolved +
absoluteURL.search +
absoluteURL.hash.toLowerCase();
}
}
}
// No suggestion here: the link is either rewritten to the en-US
// fallback or marked as pointing to an unwritten page.
addBrokenLink(
a,
checked.get(href),
href,
null,
enUSFallbackURL
? "Can use the English (en-US) link as a fallback"
: null,
enUSFallbackURL
);
}
}
// But does it have the correct case?!
} else if (found.url !== href.split("#")[0]) {
// Inconsistent case.
addBrokenLink(
a,
checked.get(href),
href,
found.url + absoluteURL.search + absoluteURL.hash.toLowerCase()
);
} else if (
hrefSplit.length > 1 &&
hrefSplit[1] !== hrefSplit[1].toLowerCase()
) {
// The document exists and the path matches; only the anchor has
// uppercase characters in it.
const hash = hrefSplit[1];
addBrokenLink(
a,
checked.get(href),
href,
href.replace(`#${hash}`, `#${hash.toLowerCase()}`),
"Anchor not lowercase"
);
}
} else if (href.startsWith("#")) {
// Same-page anchor link: only its letter case is checked.
const hash = href.split("#")[1];
if (hash !== hash.toLowerCase()) {
addBrokenLink(
a,
checked.get(href),
href,
href.replace(`#${hash}`, `#${hash.toLowerCase()}`),
"Anchor not lowercase"
);
}
}
});
return flaws;
}
// Only the flaw check is exported; the other functions in this file are
// module-private.
module.exports = { getBrokenLinksFlaws };