-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #425 from markw65/merge-character-classes
Add a pass to simplify single-character choices
- Loading branch information
Showing
13 changed files
with
791 additions
and
622 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
// @ts-check | ||
"use strict"; | ||
|
||
/** | ||
* @typedef {import("../../peg")} PEG | ||
*/ | ||
|
||
/** @type {PEG.compiler.visitor} */ | ||
const visitor = require("../visitor"); | ||
|
||
/**
 * Replace the contents of `target`, in place, with a shallow copy of the
 * own enumerable properties of `source`.  The identity of `target` is
 * preserved, so anything already holding a reference to it sees the new
 * contents.
 *
 * @param {unknown} target - object to overwrite; every own enumerable key
 *   it currently has is deleted
 * @param {unknown} source - object whose own enumerable keys are copied
 *   (property values are shared, not cloned)
 */
function cloneOver(target, source) {
  const t = /** @type {Record<string,unknown>} */ (target);
  const s = /** @type {Record<string,unknown>} */ (source);
  // Snapshot the source before clearing the target: if both arguments are
  // the same object, clearing first would leave nothing to copy back.
  const entries = Object.entries(s);
  for (const key of Object.keys(t)) {
    delete t[key];
  }
  for (const [key, value] of entries) {
    t[key] = value;
  }
}
|
||
/**
 * Clean up the parts array of a `class` node, by sorting,
 * then removing "contained" ranges, and merging overlapping
 * or adjacent ranges.
 *
 * Each part is either a single character or a `[start, end]` pair.  The
 * array is modified in place and the same array is returned.  Whenever two
 * parts are merged the result is stored as a `[start, end]` pair, even if
 * both inputs were single characters.
 *
 * @param {PEG.ast.CharacterClass["parts"]} parts
 * @returns {PEG.ast.CharacterClass["parts"]} the same `parts` array
 */
function cleanParts(parts) {
  /**
   * View a part as a `[start, end]` pair; a single character is treated
   * as a range of length one.
   *
   * @param {PEG.ast.CharacterClass["parts"][number]} part
   */
  const bounds = part => (Array.isArray(part) ? part : [part, part]);

  // Sort parts on increasing start, and then decreasing end, so that any
  // range containing another one with the same start comes first.
  parts.sort((a, b) => {
    const [aStart, aEnd] = bounds(a);
    const [bStart, bEnd] = bounds(b);
    if (aStart !== bStart) {
      return aStart < bStart ? -1 : 1;
    }
    if (aEnd !== bEnd) {
      return aEnd > bEnd ? -1 : 1;
    }
    return 0;
  });

  // Compact the array in a single pass: `kept` is the number of slots at
  // the front of `parts` that hold the cleaned result so far.  It never
  // exceeds the read index, so writes cannot clobber unread parts.
  let kept = 0;
  let prevStart = "";
  let prevEnd = "";
  for (let i = 0; i < parts.length; i++) {
    const part = parts[i];
    const [curStart, curEnd] = bounds(part);
    if (curEnd <= prevEnd) {
      // Current range is contained in previous range,
      // so drop it.
      continue;
    }
    // While nothing has been kept, `prevEnd` is "" and its char code is
    // NaN, so this comparison is false and the first part is kept as is.
    if (prevEnd.charCodeAt(0) + 1 >= curStart.charCodeAt(0)) {
      // Current and previous ranges overlap, or are adjacent.
      // Drop the current, and extend the previous range.
      prevEnd = curEnd;
      parts[kept - 1] = [prevStart, prevEnd];
      continue;
    }
    parts[kept++] = part;
    prevStart = curStart;
    prevEnd = curEnd;
  }
  // Discard the leftover tail beyond the compacted result.
  parts.length = kept;
  return parts;
}
|
||
/**
 * Merges runs of consecutive `choice` alternatives that each match a
 * single character (non-inverted classes, one-character literals, or
 * references to rules that reduce to one of those) into a single
 * character class.  Also replaces a `text` node by its expression when
 * that expression is a class or a literal.  The grammar is modified in
 * place.
 *
 * @param {PEG.ast.Grammar} ast
 */
function mergeCharacterClasses(ast) {
  // Build a map from rule names to rule expressions for quick lookup of
  // the targets of rule_refs.
  const rules = Object.create(null);
  ast.rules.forEach(rule => (rules[rule.name] = rule.expression));
  // Keep a map of which rules have been processed, so that when
  // we find a rule_ref, we can make sure it's processed before we
  // try to use it.
  const processedRules = Object.create(null);

  /**
   * Determine whether a node can be represented as a simple character class,
   * and return that class if so.
   *
   * @param {PEG.ast.Expression} node - the node to inspect
   * @param {boolean} [clone] - if true, always return a new node that
   *   can be modified by the caller
   * @returns {PEG.ast.CharacterClass | null}
   */
  function asClass(node, clone) {
    if (node.type === "class" && !node.inverted) {
      // Without `clone` the caller receives the node itself, and any
      // change it makes is visible in the tree.
      return clone ? { ...node, parts: [...node.parts] } : node;
    }
    if (node.type === "literal" && node.value.length === 1) {
      return {
        type: "class",
        parts: [node.value],
        inverted: false,
        ignoreCase: node.ignoreCase,
        location: node.location,
      };
    }
    if (node.type === "rule_ref") {
      const ref = rules[node.name];
      if (ref) {
        if (!processedRules[node.name]) {
          // Mark before merging, so a reference cycle cannot trigger
          // the merge of the same rule again.
          processedRules[node.name] = true;
          merge(ref);
        }
        // Always clone: the referenced rule's own expression must not be
        // altered by merges performed at the reference site.
        const cls = asClass(ref, true);
        if (cls) {
          cls.location = node.location;
        }
        return cls;
      }
    }
    return null;
  }

  // `asClass` and `merge` call each other; `merge` is only invoked after
  // this initialisation has completed.
  const merge = visitor.build({
    choice(node) {
      /** @type {PEG.ast.CharacterClass | null} */
      let prev = null;
      let changed = false;
      node.alternatives.forEach((alt, i) => {
        merge(alt);
        const cls = asClass(alt);
        if (!cls) {
          // Only directly consecutive alternatives may be merged.
          prev = null;
          return;
        }
        if (prev && prev.ignoreCase === cls.ignoreCase) {
          prev.parts.push(...cls.parts);
          // Store the merged class in both slots; the duplicate entries
          // are removed by the identity filter below.
          node.alternatives[i - 1] = prev;
          node.alternatives[i] = prev;
          prev.location = {
            source: prev.location.source,
            start: prev.location.start,
            end: cls.location.end,
          };
          changed = true;
        } else {
          prev = cls;
        }
      });
      if (changed) {
        // Collapse each run of identical (merged) entries to one entry.
        node.alternatives = node.alternatives.filter(
          (alt, i, arr) => !i || alt !== arr[i - 1]
        );
        node.alternatives.forEach((alt, i) => {
          if (alt.type === "class") {
            alt.parts = cleanParts(alt.parts);
            // A class that ends up with one single-character part is
            // turned back into a literal.
            if (alt.parts.length === 1
                && !Array.isArray(alt.parts[0])
                && !alt.inverted) {
              node.alternatives[i] = {
                type: "literal",
                value: alt.parts[0],
                ignoreCase: alt.ignoreCase,
                location: alt.location,
              };
            }
          }
        });
        if (node.alternatives.length === 1) {
          // Overwrite the choice node in place with its only alternative,
          // so existing references to the node (including the `rules`
          // map above) stay valid.
          cloneOver(node, node.alternatives[0]);
        }
      }
    },
    text(node) {
      merge(node.expression);
      if (node.expression.type === "class"
          || node.expression.type === "literal") {
        // Replace the text node by its expression, but keep the
        // location of the text node itself.
        const location = node.location;
        cloneOver(node, node.expression);
        node.location = location;
      }
    },
  });

  ast.rules.forEach(rule => {
    // Rules reached earlier through a rule_ref have already been merged;
    // do not traverse them a second time.
    if (processedRules[rule.name]) {
      return;
    }
    processedRules[rule.name] = true;
    merge(rule.expression);
  });
}
|
||
module.exports = mergeCharacterClasses; |
Oops, something went wrong.