Merge pull request #425 from markw65/merge-character-classes

Add a pass to simplify single-character choices
peggyjs · Sep 7, 2023 · e25036f · e25036f
2 parents 4ce5530 + 8527532
commit e25036f
Show file tree

Hide file tree

Showing 13 changed files with 791 additions and 622 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ Released: TBD
 
 ### Minor Changes
 
+- [#425](https://github.com/peggyjs/peggy/pull/425) Add a pass to simplify single-character choices
 - [#420](https://github.com/peggyjs/peggy/pull/420) Updated dependencies to
   avoid audit warnings.  From @hildjj.
 - [#404](https://github.com/peggyjs/peggy/issues/404) Add support for -w/--watch

diff --git a/docs/js/benchmark-bundle.min.js b/docs/js/benchmark-bundle.min.js
diff --git a/docs/js/examples.js b/docs/js/examples.js
@@ -188,6 +188,7 @@ function peg$parse(input, options) {
   var peg$r0 = /^[a-z]/;
   var peg$r1 = /^[^a-z]/i;
   var peg$r2 = /^[0-9]/;
+  var peg$r3 = /^[a-c]/;
 
   var peg$e0 = peg$literalExpectation("foo", false);
   var peg$e1 = peg$literalExpectation("foo", true);
@@ -202,7 +203,8 @@ function peg$parse(input, options) {
   var peg$e10 = peg$literalExpectation("bar", true);
   var peg$e11 = peg$literalExpectation(" ", false);
   var peg$e12 = peg$literalExpectation("c", false);
-  var peg$e13 = peg$otherExpectation("The rest of the input");
+  var peg$e13 = peg$classExpectation([["a", "c"]], false, false);
+  var peg$e14 = peg$otherExpectation("The rest of the input");
 
   var peg$f0 = function(match, rest) { return {match, rest}; };
   var peg$f1 = function(match, rest) { return {match, rest}; };
@@ -1440,30 +1442,12 @@ function peg$parse(input, options) {
     var s0, s1, s2;
 
     s0 = peg$currPos;
-    if (input.charCodeAt(peg$currPos) === 97) {
-      s1 = peg$c3;
+    if (peg$r3.test(input.charAt(peg$currPos))) {
+      s1 = input.charAt(peg$currPos);
       peg$currPos++;
     } else {
       s1 = peg$FAILED;
-      if (peg$silentFails === 0) { peg$fail(peg$e8); }
-    }
-    if (s1 === peg$FAILED) {
-      if (input.charCodeAt(peg$currPos) === 98) {
-        s1 = peg$c4;
-        peg$currPos++;
-      } else {
-        s1 = peg$FAILED;
-        if (peg$silentFails === 0) { peg$fail(peg$e9); }
-      }
-      if (s1 === peg$FAILED) {
-        if (input.charCodeAt(peg$currPos) === 99) {
-          s1 = peg$c7;
-          peg$currPos++;
-        } else {
-          s1 = peg$FAILED;
-          if (peg$silentFails === 0) { peg$fail(peg$e12); }
-        }
-      }
+      if (peg$silentFails === 0) { peg$fail(peg$e13); }
     }
     if (s1 !== peg$FAILED) {
       s2 = peg$parserest();
@@ -1503,7 +1487,7 @@ function peg$parse(input, options) {
     s0 = input.substring(s0, peg$currPos);
     peg$silentFails--;
     s1 = peg$FAILED;
-    if (peg$silentFails === 0) { peg$fail(peg$e13); }
+    if (peg$silentFails === 0) { peg$fail(peg$e14); }
 
     return s0;
   }

diff --git a/docs/js/test-bundle.min.js b/docs/js/test-bundle.min.js
diff --git a/docs/vendor/peggy/peggy.min.js b/docs/vendor/peggy/peggy.min.js
diff --git a/lib/compiler/index.js b/lib/compiler/index.js
@@ -4,6 +4,7 @@ const generateBytecode = require("./passes/generate-bytecode");
 const generateJS = require("./passes/generate-js");
 const inferenceMatchResult = require("./passes/inference-match-result");
 const removeProxyRules = require("./passes/remove-proxy-rules");
+const mergeCharacterClasses = require("./passes/merge-character-classes");
 const reportDuplicateLabels = require("./passes/report-duplicate-labels");
 const reportDuplicateRules = require("./passes/report-duplicate-rules");
 const reportInfiniteRecursion = require("./passes/report-infinite-recursion");
@@ -58,6 +59,7 @@ const compiler = {
     ],
     transform: [
       removeProxyRules,
+      mergeCharacterClasses,
       inferenceMatchResult,
     ],
     generate: [

diff --git a/lib/compiler/passes/merge-character-classes.js b/lib/compiler/passes/merge-character-classes.js
@@ -0,0 +1,191 @@
+// @ts-check
+"use strict";
+
+/**
+ * @typedef {import("../../peg")} PEG
+ */
+
+/** @type {PEG.compiler.visitor} */
+const visitor = require("../visitor");
+
+/**
+ * @param {unknown} target
+ * @param {unknown} source
+ */
+function cloneOver(target, source) {
+  const t = /** @type {Record<string,unknown>} */ (target);
+  const s = /** @type {Record<string,unknown>} */ (source);
+  Object.keys(t).forEach(key => delete t[key]);
+  Object.keys(s).forEach(key => { t[key] = s[key]; });
+}
+
+/**
+ * Clean up the parts array of a `class` node, by sorting,
+ * then removing "contained" ranges, and merging overlapping
+ * or adjacent ranges.
+ *
+ * @param {PEG.ast.CharacterClass["parts"]} parts
+ */
+function cleanParts(parts) {
+  // Sort parts on increasing start, and then decreasing end.
+  parts.sort((a, b) => {
+    const [aStart, aEnd] = Array.isArray(a) ? a : [a, a];
+    const [bStart, bEnd] = Array.isArray(b) ? b : [b, b];
+    if (aStart !== bStart) {
+      return aStart < bStart ? -1 : 1;
+    }
+    if (aEnd !== bEnd) {
+      return aEnd > bEnd ? -1 : 1;
+    }
+    return 0;
+  });
+
+  let prevStart = "";
+  let prevEnd = "";
+  for (let i = 0; i < parts.length; i++) {
+    const part = parts[i];
+    const [curStart, curEnd] = Array.isArray(part) ? part : [part, part];
+    if (curEnd <= prevEnd) {
+      // Current range is contained in previous range,
+      // so drop it.
+      parts.splice(i--, 1);
+      continue;
+    }
+    if (prevEnd.charCodeAt(0) + 1 >= curStart.charCodeAt(0)) {
+      // Current and previous ranges overlap, or are adjacent.
+      // Drop the current, and extend the previous range.
+      parts.splice(i--, 1);
+      parts[i] = [prevStart, prevEnd = curEnd];
+      continue;
+    }
+    prevStart = curStart;
+    prevEnd = curEnd;
+  }
+  return parts;
+}
+
+/**
+ * Merges a choice character classes into a character class
+ * @param {PEG.ast.Grammar} ast
+ */
+function mergeCharacterClasses(ast) {
+  // Build a map from rule names to rules for quick lookup of
+  // ref_rules.
+  const rules = Object.create(null);
+  ast.rules.forEach(rule => (rules[rule.name] = rule.expression));
+  // Keep a map of which rules have been processed, so that when
+  // we find a ref_rule, we can make sure its processed, before we
+  // try to use it.
+  const processedRules = Object.create(null);
+  const [asClass, merge] = [
+    /**
+     * Determine whether a node can be represented as a simple character class,
+     * and return that class if so.
+     *
+     * @param {PEG.ast.Expression} node - the node to inspect
+     * @param {boolean} [clone] - if true, always return a new node that
+     *                            can be modified by the caller
+     * @returns {PEG.ast.CharacterClass | null}
+     */
+    (node, clone) => {
+      if (node.type === "class" && !node.inverted) {
+        if (clone) {
+          node = { ...node };
+          node.parts = [...node.parts];
+        }
+        return node;
+      }
+      if (node.type === "literal" && node.value.length === 1) {
+        return {
+          type: "class",
+          parts: [node.value],
+          inverted: false,
+          ignoreCase: node.ignoreCase,
+          location: node.location,
+        };
+      }
+      if (node.type === "rule_ref") {
+        const ref = rules[node.name];
+        if (ref) {
+          if (!processedRules[node.name]) {
+            processedRules[node.name] = true;
+            merge(ref);
+          }
+          const cls = asClass(ref, true);
+          if (cls) {
+            cls.location = node.location;
+          }
+          return cls;
+        }
+      }
+      return null;
+    },
+    visitor.build({
+      choice(node) {
+        /** @type {PEG.ast.CharacterClass | null} */
+        let prev = null;
+        let changed = false;
+        node.alternatives.forEach((alt, i) => {
+          merge(alt);
+          const cls = asClass(alt);
+          if (!cls) {
+            prev = null;
+            return;
+          }
+          if (prev && prev.ignoreCase === cls.ignoreCase) {
+            prev.parts.push(...cls.parts);
+            node.alternatives[i - 1] = prev;
+            node.alternatives[i] = prev;
+            prev.location = {
+              source: prev.location.source,
+              start: prev.location.start,
+              end: cls.location.end,
+            };
+            changed = true;
+          } else {
+            prev = cls;
+          }
+        });
+        if (changed) {
+          node.alternatives = node.alternatives.filter(
+            (alt, i, arr) => !i || alt !== arr[i - 1]
+          );
+          node.alternatives.forEach((alt, i) => {
+            if (alt.type === "class") {
+              alt.parts = cleanParts(alt.parts);
+              if (alt.parts.length === 1
+                  && !Array.isArray(alt.parts[0])
+                  && !alt.inverted) {
+                node.alternatives[i] = {
+                  type: "literal",
+                  value: alt.parts[0],
+                  ignoreCase: alt.ignoreCase,
+                  location: alt.location,
+                };
+              }
+            }
+          });
+          if (node.alternatives.length === 1) {
+            cloneOver(node, node.alternatives[0]);
+          }
+        }
+      },
+      text(node) {
+        merge(node.expression);
+        if (node.expression.type === "class"
+            || node.expression.type === "literal") {
+          const location = node.location;
+          cloneOver(node, node.expression);
+          node.location = location;
+        }
+      },
+    }),
+  ];
+
+  ast.rules.forEach(rule => {
+    processedRules[rule.name] = true;
+    merge(rule.expression);
+  });
+}
+
+module.exports = mergeCharacterClasses;