fix(build): check for bad_src flaws in markdown files (#8133)

Co-authored-by: Claas Augner <caugner@mozilla.com>
mdn · Mar 1, 2024 · a76cc0e · a76cc0e
1 parent a940648
commit a76cc0e
Show file tree

Hide file tree

Showing 12 changed files with 208 additions and 98 deletions.
diff --git a/build/check-images.ts b/build/check-images.ts
@@ -8,7 +8,7 @@ import imagesize from "image-size";
 
 import { Document, FileAttachment } from "../content/index.js";
 import { FLAW_LEVELS } from "../libs/constants/index.js";
-import { findMatchesInText } from "./matches-in-text.js";
+import { findMatchesInText, findMatchesInMarkdown } from "./matches.js";
 import * as cheerio from "cheerio";
 import { Doc } from "../libs/types/document.js";
 
@@ -30,50 +30,57 @@ export function checkImageReferences(
 
   const checkImages = options.flawLevels.get("images") !== FLAW_LEVELS.IGNORE;
 
-  const checked = new Map();
+  const checked = new Map<string, number>();
 
   function addImageFlaw(
-    $img,
-    src,
-    { explanation, externalImage = false, suggestion = null }
+    $img: cheerio.Cheerio<cheerio.Element>,
+    src: string,
+    {
+      explanation,
+      externalImage = false,
+      suggestion = null,
+    }: {
+      explanation: string;
+      externalImage?: boolean;
+      suggestion?: string;
+    }
   ) {
     // If the document has *two* `<img src="XXX">` tags, this function
     // (addImageFlaw) is called two times. We can then assume the
     // findMatchesInText() will find it two times too. For each call,
     // we need to match the call based in counting matches from findMatchesInText().
-    const matches = [
-      ...findMatchesInText(src, rawContent, {
-        attribute: "src",
-      }),
-    ];
-    const checkedBefore = checked.get(src) || 0;
-    matches.forEach((match, i) => {
-      if (i !== checkedBefore) {
-        return;
-      }
-      if (!("images" in doc.flaws)) {
-        doc.flaws.images = [];
-      }
-      let fixable = false;
-      if (suggestion) {
-        $img.attr("src", suggestion);
-        fixable = true;
-      }
-      const id = `image${doc.flaws.images.length + 1}`;
-      $img.attr("data-flaw", id);
-      doc.flaws.images.push({
-        id,
-        src,
-        fixable,
-        suggestion,
-        explanation,
-        externalImage,
-        ...match,
-      });
-
-      // Use this to remember which in the list of matches we've dealt with.
-      checked.set(src, checkedBefore + 1);
+    const matches = doc.isMarkdown
+      ? findMatchesInMarkdown(src, rawContent, { type: "image" })
+      : findMatchesInText(src, rawContent, { attribute: "src" });
+    const checkedBefore = checked.get(src) ?? 0;
+    if (matches.length <= checkedBefore) {
+      console.warn(
+        `Could not find enough matches for src: ${src}, index ${checkedBefore} out of bounds`
+      );
+      return;
+    }
+    const match = matches[checkedBefore];
+    if (!("images" in doc.flaws)) {
+      doc.flaws.images = [];
+    }
+    const fixable = Boolean(suggestion);
+    if (suggestion) {
+      $img.attr("src", suggestion);
+    }
+    const id = `image${doc.flaws.images.length + 1}`;
+    $img.attr("data-flaw", id);
+    doc.flaws.images.push({
+      id,
+      src,
+      fixable,
+      suggestion,
+      explanation,
+      externalImage,
+      ...match,
     });
+
+    // Use this to remember which in the list of matches we've dealt with.
+    checked.set(src, checkedBefore + 1);
   }
 
   $("img[src]").each((i, element) => {
@@ -238,11 +245,9 @@ export function checkImageWidths(
     }
     const id = `image_widths${doc.flaws.image_widths.length + 1}`;
     $img.attr("data-flaw", id);
-    const matches = [
-      ...findMatchesInText(style, rawContent, {
-        attribute: "style",
-      }),
-    ];
+    const matches = findMatchesInText(style, rawContent, {
+      attribute: "style",
+    });
     const checkedBefore = checked.get(style) || 0;
     matches.forEach((match, i) => {
       if (i !== checkedBefore) {

diff --git a/build/flaws/broken-links.ts b/build/flaws/broken-links.ts
@@ -1,10 +1,7 @@
 import fs from "node:fs";
 
-import { fromMarkdown } from "mdast-util-from-markdown";
-import { visit } from "unist-util-visit";
-
 import { Document, Redirect, FileAttachment } from "../../content/index.js";
-import { findMatchesInText } from "../matches-in-text.js";
+import { findMatchesInText, findMatchesInMarkdown } from "../matches.js";
 import {
   DEFAULT_LOCALE,
   FLAW_LEVELS,
@@ -15,17 +12,6 @@ import * as cheerio from "cheerio";
 import { Doc } from "../../libs/types/document.js";
 import { Flaw } from "./index.js";
 
-function findMatchesInMarkdown(rawContent: string, href: string) {
-  const matches = [];
-  visit(fromMarkdown(rawContent), "link", (node: any) => {
-    if (node.url == href) {
-      const { line, column } = node.position.start;
-      matches.push({ line, column });
-    }
-  });
-  return matches;
-}
-
 const _safeToHttpsDomains = new Map();
 
 function getSafeToHttpDomains() {
@@ -127,17 +113,13 @@ export function getBrokenLinksFlaws(
         href,
 
         doc.isMarkdown
-          ? findMatchesInMarkdown(rawContent, href)
-          : Array.from(
-              findMatchesInText(href, rawContent, {
-                attribute: "href",
-              })
-            )
+          ? findMatchesInMarkdown(href, rawContent, { type: "link" })
+          : findMatchesInText(href, rawContent, {
+              attribute: "href",
+            })
       );
     }
-    // findMatchesInText() is a generator function so use `Array.from()`
-    // to turn it into an array so we can use `.forEach()` because that
-    // gives us an `i` for every loop.
+
     matches.get(href).forEach((match, i) => {
       if (i !== index) {
         return;

diff --git a/build/flaws/heading-links.ts b/build/flaws/heading-links.ts
@@ -1,4 +1,4 @@
-import { findMatchesInText } from "../matches-in-text.js";
+import { findMatchesInText } from "../matches.js";
 import * as cheerio from "cheerio";
 import { Doc } from "../../libs/types/document.js";
 import { Flaw } from "./index.js";

diff --git a/build/flaws/index.ts b/build/flaws/index.ts
@@ -11,7 +11,7 @@ import {
 import {
   replaceMatchesInText,
   replaceMatchingLinksInMarkdown,
-} from "../matches-in-text.js";
+} from "../matches.js";
 import { forceExternalURL, downloadAndResizeImage } from "../utils.js";
 import { getBadBCDQueriesFlaws } from "./bad-bcd-queries.js";
 import { getBrokenLinksFlaws } from "./broken-links.js";

diff --git a/build/flaws/pre-tags.ts b/build/flaws/pre-tags.ts
@@ -1,6 +1,6 @@
 import { Flaw } from "./index.js";
 
-import { getFirstMatchInText } from "../matches-in-text.js";
+import { getFirstMatchInText } from "../matches.js";
 import * as cheerio from "cheerio";
 import { Doc } from "../../libs/types/document.js";
 const escapeHTML = (s: string) =>

diff --git a/build/flaws/unsafe-html.ts b/build/flaws/unsafe-html.ts
@@ -4,7 +4,7 @@ import {
   INTERACTIVE_EXAMPLES_BASE_URL,
   LIVE_SAMPLES_BASE_URL,
 } from "../../libs/env/index.js";
-import { findMatchesInText } from "../matches-in-text.js";
+import { findMatchesInText } from "../matches.js";
 import * as cheerio from "cheerio";
 import { Doc } from "../../libs/types/document.js";
 

diff --git a/build/matches-in-text.ts → build/matches.ts b/build/matches-in-text.ts → build/matches.ts
@@ -1,30 +1,69 @@
+import "mdast-util-directive";
+import { fromMarkdown } from "mdast-util-from-markdown";
+import { visit } from "unist-util-visit";
+
 const ESCAPE_CHARS_RE = /[.*+?^${}()|[\]\\]/g;
 
-export function* findMatchesInText(
-  needle,
-  haystack,
+type Match = { line: number; column: number };
+
+export function findMatchesInText(
+  needle: string,
+  haystack: string,
   { attribute = null } = {}
-) {
+): Match[] {
   // Need to remove any characters that can affect a regex if we're going
   // use the string in a manually constructed regex.
   const escaped = needle.replace(ESCAPE_CHARS_RE, "\\$&");
-  let rex;
-  if (attribute) {
-    rex = new RegExp(`${attribute}=['"](${escaped})['"]`, "g");
-  } else {
-    rex = new RegExp(`(${escaped})`, "g");
-  }
+  const rex = attribute
+    ? new RegExp(`${attribute}=['"](${escaped})['"]`, "g")
+    : new RegExp(`(${escaped})`, "g");
+  const matches: Match[] = [];
   for (const match of haystack.matchAll(rex)) {
     const left = haystack.slice(0, match.index);
     const line = (left.match(/\n/g) || []).length + 1;
     const lastIndexOf = left.lastIndexOf("\n") + 1;
     const column =
       match.index - lastIndexOf + 1 + (attribute ? attribute.length + 2 : 0);
-    yield { line, column };
+    matches.push({ line, column });
   }
+  return matches;
+}
+
+// find links or images in markdown content that match the given URL
+export function findMatchesInMarkdown(
+  url: string,
+  rawContent: string,
+  { type }: { type: "link" | "image" }
+): Match[] {
+  const matches: Match[] = [];
+  const attributeType = type === "link" ? "href" : "src";
+  const tree = fromMarkdown(rawContent);
+  // Find all the links and images in the markdown
+  // we should also find any HTML elements that contain links or images
+  visit(tree, [type, "html"], (node) => {
+    if (node.type === "html") {
+      const matchesInHtml = findMatchesInText(url, node.value, {
+        attribute: attributeType,
+      });
+      const correctedMatches = matchesInHtml.map(({ line, column }): Match => {
+        if (line === 1) {
+          // if it's the first line, we need to add the column offset
+          column += node.position.start.column - 1;
+        }
+        line += node.position.start.line - 1;
+        return { line, column };
+      });
+      matches.push(...correctedMatches);
+    } else if (node.type == type && node.url === url) {
+      // else this would be a markdown link or image
+      const { line, column } = node.position.start;
+      matches.push({ line, column });
+    }
+  });
+  return matches;
 }
 
-export function getFirstMatchInText(needle, haystack) {
+export function getFirstMatchInText(needle, haystack): Match {
   const index = haystack.indexOf(needle);
   const left = haystack.substring(0, index);
   const line = left.split("\n").length;

diff --git a/package.json b/package.json
@@ -211,6 +211,7 @@
     "jest-watch-typeahead": "^2.2.2",
     "jsdom": "^24.0.0",
     "lint-staged": "^13.2.3",
+    "mdast-util-directive": "^3.0.0",
     "mdast-util-to-hast": "^13.1.0",
     "mini-css-extract-plugin": "^2.8.1",
     "node-dev": "^8.0.0",

diff --git a/testing/content/files/en-us/web/images/bad_src/index.html b/testing/content/files/en-us/web/images/bad_src/index.html
diff --git a/testing/content/files/en-us/web/images/bad_src/index.md b/testing/content/files/en-us/web/images/bad_src/index.md
@@ -0,0 +1,28 @@
+---
+title: <img> tag with src that isn't a valid image
+slug: Web/Images/Bad_src
+---
+
+<!-- resolves, intially, to a directory -->
+![](/en-us/web)
+
+<!-- actually isn't a valid PNG -->
+![](actuallynota.png)
+
+<!--
+is a zero byte image, and as a check for html in markdown file
+add a space at the start of the <img> as a check for image finder function
+-->
+ <img src="empty.gif" alt=""/>
+
+<!--
+multi-line html with broken image,
+exists on disk but isn't actually a real SVG file.
+-->
+<table>
+  <tr>
+    <td>
+      <img src="actuallynota.svg" alt="broken image" />
+    </td>
+  </tr>
+</table>
diff --git a/testing/tests/index.test.ts b/testing/tests/index.test.ts
@@ -1085,6 +1085,21 @@ test("image flaws with bad images", () => {
         "File not present on disk, an empty file, or not an image"
     ).length
   ).toBe(4);
+  // Check the line and column numbers for html <img> tags in markdown
+  expect({
+    line: flaws.images[2].line,
+    column: flaws.images[2].column,
+  }).toEqual({
+    line: 16,
+    column: 12,
+  });
+  expect({
+    line: flaws.images[3].line,
+    column: flaws.images[3].column,
+  }).toEqual({
+    line: 25,
+    column: 17,
+  });
 });
 
 test("linked to local files", () => {