Skip to content

Commit

Permalink
fix(build): check for bad_src flaws in markdown files (#8133)
Browse files Browse the repository at this point in the history
Co-authored-by: Claas Augner <caugner@mozilla.com>
  • Loading branch information
yin1999 and caugner authored Mar 1, 2024
1 parent a940648 commit a76cc0e
Show file tree
Hide file tree
Showing 12 changed files with 208 additions and 98 deletions.
89 changes: 47 additions & 42 deletions build/check-images.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import imagesize from "image-size";

import { Document, FileAttachment } from "../content/index.js";
import { FLAW_LEVELS } from "../libs/constants/index.js";
import { findMatchesInText } from "./matches-in-text.js";
import { findMatchesInText, findMatchesInMarkdown } from "./matches.js";
import * as cheerio from "cheerio";
import { Doc } from "../libs/types/document.js";

Expand All @@ -30,50 +30,57 @@ export function checkImageReferences(

const checkImages = options.flawLevels.get("images") !== FLAW_LEVELS.IGNORE;

const checked = new Map();
const checked = new Map<string, number>();

function addImageFlaw(
$img,
src,
{ explanation, externalImage = false, suggestion = null }
$img: cheerio.Cheerio<cheerio.Element>,
src: string,
{
explanation,
externalImage = false,
suggestion = null,
}: {
explanation: string;
externalImage?: boolean;
suggestion?: string;
}
) {
// If the document has *two* `<img src="XXX">` tags, this function
// (addImageFlaw) is called two times. We can then assume the
// findMatchesInText() will find it two times too. For each call,
// we need to match the call based in counting matches from findMatchesInText().
const matches = [
...findMatchesInText(src, rawContent, {
attribute: "src",
}),
];
const checkedBefore = checked.get(src) || 0;
matches.forEach((match, i) => {
if (i !== checkedBefore) {
return;
}
if (!("images" in doc.flaws)) {
doc.flaws.images = [];
}
let fixable = false;
if (suggestion) {
$img.attr("src", suggestion);
fixable = true;
}
const id = `image${doc.flaws.images.length + 1}`;
$img.attr("data-flaw", id);
doc.flaws.images.push({
id,
src,
fixable,
suggestion,
explanation,
externalImage,
...match,
});

// Use this to remember which in the list of matches we've dealt with.
checked.set(src, checkedBefore + 1);
const matches = doc.isMarkdown
? findMatchesInMarkdown(src, rawContent, { type: "image" })
: findMatchesInText(src, rawContent, { attribute: "src" });
const checkedBefore = checked.get(src) ?? 0;
if (matches.length <= checkedBefore) {
console.warn(
`Could not find enough matches for src: ${src}, index ${checkedBefore} out of bounds`
);
return;
}
const match = matches[checkedBefore];
if (!("images" in doc.flaws)) {
doc.flaws.images = [];
}
const fixable = Boolean(suggestion);
if (suggestion) {
$img.attr("src", suggestion);
}
const id = `image${doc.flaws.images.length + 1}`;
$img.attr("data-flaw", id);
doc.flaws.images.push({
id,
src,
fixable,
suggestion,
explanation,
externalImage,
...match,
});

// Use this to remember which in the list of matches we've dealt with.
checked.set(src, checkedBefore + 1);
}

$("img[src]").each((i, element) => {
Expand Down Expand Up @@ -238,11 +245,9 @@ export function checkImageWidths(
}
const id = `image_widths${doc.flaws.image_widths.length + 1}`;
$img.attr("data-flaw", id);
const matches = [
...findMatchesInText(style, rawContent, {
attribute: "style",
}),
];
const matches = findMatchesInText(style, rawContent, {
attribute: "style",
});
const checkedBefore = checked.get(style) || 0;
matches.forEach((match, i) => {
if (i !== checkedBefore) {
Expand Down
30 changes: 6 additions & 24 deletions build/flaws/broken-links.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import fs from "node:fs";

import { fromMarkdown } from "mdast-util-from-markdown";
import { visit } from "unist-util-visit";

import { Document, Redirect, FileAttachment } from "../../content/index.js";
import { findMatchesInText } from "../matches-in-text.js";
import { findMatchesInText, findMatchesInMarkdown } from "../matches.js";
import {
DEFAULT_LOCALE,
FLAW_LEVELS,
Expand All @@ -15,17 +12,6 @@ import * as cheerio from "cheerio";
import { Doc } from "../../libs/types/document.js";
import { Flaw } from "./index.js";

function findMatchesInMarkdown(rawContent: string, href: string) {
const matches = [];
visit(fromMarkdown(rawContent), "link", (node: any) => {
if (node.url == href) {
const { line, column } = node.position.start;
matches.push({ line, column });
}
});
return matches;
}

const _safeToHttpsDomains = new Map();

function getSafeToHttpDomains() {
Expand Down Expand Up @@ -127,17 +113,13 @@ export function getBrokenLinksFlaws(
href,

doc.isMarkdown
? findMatchesInMarkdown(rawContent, href)
: Array.from(
findMatchesInText(href, rawContent, {
attribute: "href",
})
)
? findMatchesInMarkdown(href, rawContent, { type: "link" })
: findMatchesInText(href, rawContent, {
attribute: "href",
})
);
}
// findMatchesInText() is a generator function so use `Array.from()`
// to turn it into an array so we can use `.forEach()` because that
// gives us an `i` for every loop.

matches.get(href).forEach((match, i) => {
if (i !== index) {
return;
Expand Down
2 changes: 1 addition & 1 deletion build/flaws/heading-links.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { findMatchesInText } from "../matches-in-text.js";
import { findMatchesInText } from "../matches.js";
import * as cheerio from "cheerio";
import { Doc } from "../../libs/types/document.js";
import { Flaw } from "./index.js";
Expand Down
2 changes: 1 addition & 1 deletion build/flaws/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import {
import {
replaceMatchesInText,
replaceMatchingLinksInMarkdown,
} from "../matches-in-text.js";
} from "../matches.js";
import { forceExternalURL, downloadAndResizeImage } from "../utils.js";
import { getBadBCDQueriesFlaws } from "./bad-bcd-queries.js";
import { getBrokenLinksFlaws } from "./broken-links.js";
Expand Down
2 changes: 1 addition & 1 deletion build/flaws/pre-tags.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { Flaw } from "./index.js";

import { getFirstMatchInText } from "../matches-in-text.js";
import { getFirstMatchInText } from "../matches.js";
import * as cheerio from "cheerio";
import { Doc } from "../../libs/types/document.js";
const escapeHTML = (s: string) =>
Expand Down
2 changes: 1 addition & 1 deletion build/flaws/unsafe-html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import {
INTERACTIVE_EXAMPLES_BASE_URL,
LIVE_SAMPLES_BASE_URL,
} from "../../libs/env/index.js";
import { findMatchesInText } from "../matches-in-text.js";
import { findMatchesInText } from "../matches.js";
import * as cheerio from "cheerio";
import { Doc } from "../../libs/types/document.js";

Expand Down
63 changes: 51 additions & 12 deletions build/matches-in-text.ts → build/matches.ts
Original file line number Diff line number Diff line change
@@ -1,30 +1,69 @@
import "mdast-util-directive";
import { fromMarkdown } from "mdast-util-from-markdown";
import { visit } from "unist-util-visit";

const ESCAPE_CHARS_RE = /[.*+?^${}()|[\]\\]/g;

export function* findMatchesInText(
needle,
haystack,
type Match = { line: number; column: number };

export function findMatchesInText(
needle: string,
haystack: string,
{ attribute = null } = {}
) {
): Match[] {
// Need to remove any characters that can affect a regex if we're going
// use the string in a manually constructed regex.
const escaped = needle.replace(ESCAPE_CHARS_RE, "\\$&");
let rex;
if (attribute) {
rex = new RegExp(`${attribute}=['"](${escaped})['"]`, "g");
} else {
rex = new RegExp(`(${escaped})`, "g");
}
const rex = attribute
? new RegExp(`${attribute}=['"](${escaped})['"]`, "g")
: new RegExp(`(${escaped})`, "g");
const matches: Match[] = [];
for (const match of haystack.matchAll(rex)) {
const left = haystack.slice(0, match.index);
const line = (left.match(/\n/g) || []).length + 1;
const lastIndexOf = left.lastIndexOf("\n") + 1;
const column =
match.index - lastIndexOf + 1 + (attribute ? attribute.length + 2 : 0);
yield { line, column };
matches.push({ line, column });
}
return matches;
}

// find links or images in markdown content that match the given URL
export function findMatchesInMarkdown(
url: string,
rawContent: string,
{ type }: { type: "link" | "image" }
): Match[] {
const matches: Match[] = [];
const attributeType = type === "link" ? "href" : "src";
const tree = fromMarkdown(rawContent);
// Find all the links and images in the markdown
// we should also find any HTML elements that contain links or images
visit(tree, [type, "html"], (node) => {
if (node.type === "html") {
const matchesInHtml = findMatchesInText(url, node.value, {
attribute: attributeType,
});
const correctedMatches = matchesInHtml.map(({ line, column }): Match => {
if (line === 1) {
// if it's the first line, we need to add the column offset
column += node.position.start.column - 1;
}
line += node.position.start.line - 1;
return { line, column };
});
matches.push(...correctedMatches);
} else if (node.type == type && node.url === url) {
// else this would be a markdown link or image
const { line, column } = node.position.start;
matches.push({ line, column });
}
});
return matches;
}

export function getFirstMatchInText(needle, haystack) {
export function getFirstMatchInText(needle, haystack): Match {
const index = haystack.indexOf(needle);
const left = haystack.substring(0, index);
const line = left.split("\n").length;
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@
"jest-watch-typeahead": "^2.2.2",
"jsdom": "^24.0.0",
"lint-staged": "^13.2.3",
"mdast-util-directive": "^3.0.0",
"mdast-util-to-hast": "^13.1.0",
"mini-css-extract-plugin": "^2.8.1",
"node-dev": "^8.0.0",
Expand Down
16 changes: 0 additions & 16 deletions testing/content/files/en-us/web/images/bad_src/index.html

This file was deleted.

28 changes: 28 additions & 0 deletions testing/content/files/en-us/web/images/bad_src/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
title: <img> tag with src that isn't a valid image
slug: Web/Images/Bad_src
---

<!-- resolves, intially, to a directory -->
![](/en-us/web)

<!-- actually isn't a valid PNG -->
![](actuallynota.png)

<!--
is a zero byte image, and as a check for html in markdown file
add a space at the start of the <img> as a check for image finder function
-->
<img src="empty.gif" alt=""/>

<!--
multi-line html with broken image,
exists on disk but isn't actually a real SVG file.
-->
<table>
<tr>
<td>
<img src="actuallynota.svg" alt="broken image" />
</td>
</tr>
</table>
15 changes: 15 additions & 0 deletions testing/tests/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,21 @@ test("image flaws with bad images", () => {
"File not present on disk, an empty file, or not an image"
).length
).toBe(4);
// Check the line and column numbers for html <img> tags in markdown
expect({
line: flaws.images[2].line,
column: flaws.images[2].column,
}).toEqual({
line: 16,
column: 12,
});
expect({
line: flaws.images[3].line,
column: flaws.images[3].column,
}).toEqual({
line: 25,
column: 17,
});
});

test("linked to local files", () => {
Expand Down
Loading

0 comments on commit a76cc0e

Please sign in to comment.