Skip to content

Commit

Permalink
WIP: safen the text regex via linear-time scans
Browse files Browse the repository at this point in the history
Sketch implementing text regex as a linear-time RegExp imitator.
- A few nits here and there
- I haven't tested all of the offsetOfX routines, so 'npm run test' hangs on some bug
  • Loading branch information
davisjam committed Apr 28, 2018
1 parent dd26af8 commit 24d4a5e
Showing 1 changed file with 150 additions and 14 deletions.
164 changes: 150 additions & 14 deletions lib/marked.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* https://github.com/markedjs/marked
*/

var NEW_TEXT = false;
var NEW_TEXT = true;

var doLog = false;
function log(msg) {
Expand Down Expand Up @@ -526,15 +526,146 @@ var inline = {
code: /^(`+)\s*([\s\S]*?[^`]?)\s*\1(?!`)/,
br: /^ {2,}\n(?!\s*$)/,
del: noop,
text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/
text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/ // TODO Vulnerable
};

// Shared helper for the offsetOfX routines.
// Runs regex against str and reports where the first match begins.
//   str:   text to search
//   regex: pattern to look for; its exec() and source are used
// Returns the index of the match, or -1 when regex does not match.
// A successful match is also reported through log().
function offsetOfRegex(str, regex) {
  var match = regex.exec(str);
  if (!match) {
    return -1;
  }
  log(`offsetOfRegex: str ${str} matches regex ${regex.source}`);
  return match.index;
}

// Returns the offset of the first "special character" in str,
// i.e. any one of: backslash, <, !, [, backtick or asterisk.
// Returns -1 if str contains none of them.
function offsetOfSpecialChars(str) {
  var specialChar = /[\\<!\[`*]/;
  return offsetOfRegex(str, specialChar);
}

// Returns the offset of the first italics marker in str: an underscore that
// sits on a word boundary. Because '_' is itself a word character, that means
// an underscore at the start of str or right after a non-word character.
// Returns -1 if there is no such underscore.
function offsetOfItalics (str) {
  var italicsMarker = /\b_/;
  return offsetOfRegex(str, italicsMarker);
}

// Returns the offset at which the first run of two or more spaces that is
// directly followed by a newline begins, or -1 if str has no such run.
// Single left-to-right scan standing in for the regex / {2,}\n/.
function offsetOfSpacesThenNewline(str) {
  var runStart = -1; // index where the current run of spaces began
  var runLength = 0; // number of consecutive spaces seen so far
  for (var pos = 0; pos < str.length; pos++) {
    var ch = str.charAt(pos);
    if (ch === ' ') {
      if (runLength === 0) {
        runStart = pos;
      }
      runLength++;
      continue;
    }
    if (ch === '\n' && runLength >= 2) {
      return runStart;
    }
    // Any other character (or a newline after fewer than 2 spaces) ends the run.
    runLength = 0;
  }
  return -1;
}

// Returns the offset of the first "http://" or "https://" in str,
// or -1 if neither occurs.
function offsetOfHTTP(str) {
  var httpProtocol = /https?:\/\//;
  return offsetOfRegex(str, httpProtocol);
}

// Returns the offset of the first "ftp://" in str, or -1 if it does not occur.
function offsetOfFTP(str) {
  var ftpProtocol = /ftp:\/\//;
  return offsetOfRegex(str, ftpProtocol);
}

// Returns the offset of the first "www." in str, or -1 if it does not occur.
function offsetOfWWW(str) {
  var wwwPrefix = /www\./;
  return offsetOfRegex(str, wwwPrefix);
}

// Returns earliest offset of an email (username + @): the start of the first
// run of one or more username characters that is immediately followed by '@'.
// Returns -1 if str contains no such run.
//
// Scan-based imitation of /[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@/. '@' is not a
// username character, so the backwards walk from one '@' never crosses an
// earlier '@'; each character is therefore inspected at most once.
function offsetOfEmail(str) {
  var emailUsernameChar = /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{\|}~-]$/;

  // Look for email-like things at every '@', including one at index 0.
  var atSymbolIx = str.indexOf('@');
  while (atSymbolIx !== -1) {
    // Walk backwards over username chars until they, or the string, run out.
    var i = atSymbolIx;
    while (0 < i && emailUsernameChar.test(str.charAt(i - 1))) {
      i--;
    }
    // If we found any, this looks like an email.
    if (i < atSymbolIx) {
      return i;
    }
    // Resume the search *after* this '@'; restarting at the same index would
    // find the same '@' again and never terminate.
    atSymbolIx = str.indexOf('@', atSymbolIx + 1);
  }

  return -1;
}

// Returns the earliest text break in str, based on an array of textBreakFinders functions.
// textBreakFinders should be a subset of the offsetOfX functions: each is called with a
// string and returns the offset of its kind of break, or a negative number if there is none.
// Imitates RegExp.exec:
//   - returns null when str is empty (there is no text to match);
//   - otherwise returns an object whose [0] is the text preceding the earliest break
//     (the whole of str when no finder reports a break) and whose index is the offset
//     of that break.
// NOTE(review): index is the break offset (i.e. result[0].length), not the position where
// the match starts as it would be for a real RegExp.exec result - confirm callers expect this.
// NOTE(review): finders only see str without its first character, so a finder whose pattern
// depends on the preceding character (e.g. a \b assertion) treats offset 1 of str as the
// start of the string - confirm this matches the regex being imitated.
function offsetOfTextBreak(str, textBreakFinders) {
  // Clean code means doing several O(n) operations.
  // A more complex state machine (like a linear-time regex) might test all options
  // in parallel more efficiently, but I don't know how to write one.
  log(`Looking for tb in \'${str}\'`);
  if (str.length === 0) {
    return null;
  }
  var strToSearch = str.slice(1); // Must be at least one character of text before the break.

  // Find the earliest instance of each kind of text break.
  var textBreaks = textBreakFinders.map(function (f) {
    return f(strToSearch);
  });
  log(`textBreaks: ${textBreaks}`);

  // Discard the finders that found nothing (negative offsets).
  var validTextBreaks = textBreaks.filter(function (brk) {
    return 0 <= brk;
  });

  var earliestBreakOffset;
  if (validTextBreaks.length) {
    // Math.min takes its operands as separate arguments, hence apply().
    // +1 because strToSearch is missing 1st char of str
    earliestBreakOffset = Math.min.apply(null, validTextBreaks) + 1;
  } else {
    // No text breaks? Then the whole string is text.
    earliestBreakOffset = str.length;
  }

  // Mimic RegExp 'exec' for compatibility.
  var result = {};
  result[0] = str.slice(0, earliestBreakOffset);
  result.index = earliestBreakOffset;
  log(`Returning: earliestBreakOffset ${earliestBreakOffset} result ${JSON.stringify(result)}`);
  return result;
}

// Locates the earliest text break under the Inline Lexer's rules:
// special characters, italics markers, and 2+ spaces followed by a newline.
// Imitates RegExp.exec (see offsetOfTextBreak for the shape of the result).
function offsetOfTextBreakInline(str) {
  var inlineBreakFinders = [
    offsetOfSpecialChars,
    offsetOfItalics,
    offsetOfSpacesThenNewline
  ];
  return offsetOfTextBreak(str, inlineBreakFinders);
}

// Locates the earliest text break under the Inline GFM Lexer's rules: the
// Inline Lexer's breaks plus http(s)://, ftp://, www. and email-like text.
// Imitates RegExp.exec (see offsetOfTextBreak for the shape of the result).
function offsetOfTextBreakInlineGFM(str) {
  var gfmBreakFinders = [
    offsetOfSpecialChars,
    offsetOfItalics,
    offsetOfSpacesThenNewline,
    offsetOfHTTP,
    offsetOfFTP,
    offsetOfWWW,
    offsetOfEmail
  ];
  return offsetOfTextBreak(str, gfmBreakFinders);
}

// Override vulnerable but readable regex
// When NEW_TEXT is set, inline.text is replaced below; the final value is an
// object that exposes only exec(), backed by offsetOfTextBreakInline.
if (NEW_TEXT) {
// TODO: If we replace ' {2,}\n' with ' \n' and address trailing whitespace,
// we break the definition of GFM inline.breaks further down (affects the gfm_break test).
// Furthermore, we still have trouble with the email pattern substituted in: /|[...]+@/, which
// is vulnerable to REDOS just like /| {2,}\n/ was
// NOTE(review): the regex assigned on the next line is overwritten by the statement
// right after it, so it never takes effect - confirm whether it is a leftover.
inline.text = /[\s\S](?:[\\<!\[`*]|\b_| {2}\n|$)/;
inline.text = { exec: offsetOfTextBreakInline };
}

inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;
Expand Down Expand Up @@ -599,10 +730,7 @@ inline.gfm = merge({}, inline.normal, {
.getRegex(),
_backpedal: /(?:[^?!.,:;*_~()&]+|\([^)]*\)|&(?![a-zA-Z0-9]+;$)|[?!.,:;*_~)]+(?!$))+/,
del: /^~~(?=\S)([\s\S]*?\S)~~/,
text: edit(inline.text)
.replace(']|', '~]|')
.replace('|', '|https?://|ftp://|www\\.|[a-zA-Z0-9.!#$%&\'*+/=?^_`{\\|}~-]+@|')
.getRegex()
text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: .replace(']|', '~]|')
});

/**
Expand All @@ -611,7 +739,7 @@ inline.gfm = merge({}, inline.normal, {

inline.breaks = merge({}, inline.gfm, {
br: edit(inline.br).replace('{2,}', '*').getRegex(),
text: edit(inline.gfm.text).replace('{2,}', '*').getRegex()
text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: inline.gfm.text.replace('{2,}', '*')
});

/**
Expand Down Expand Up @@ -803,16 +931,22 @@ InlineLexer.prototype.output = function(src) {
}

// text
log(`lexer: Matching text: ${this.rules.text.source}\n <${src}>`);
//log(`lexer: Matching text: ${this.rules.text.source}\n <${src}>`);
if (cap = this.rules.text.exec(src)) {
if (NEW_TEXT) {
log(`lexer: Match: ${cap} ${cap.index}`);
log(`lexer: Match: ${JSON.stringify(cap)} ${cap.index}`);
var textLen = cap.index + 1;
// text is not in cap[0], so extract text before advancing src.
out += this.renderer.text(escape(this.smartypants(src.substr(0, textLen))));
src = src.substring(textLen);
continue;
} else {
var offInline = offsetOfTextBreakInline(src);
var offInlineGFM = offsetOfTextBreakInlineGFM(src);
console.log(`cap ${JSON.stringify(cap)}`);
console.log(`offInline ${JSON.stringify(offInline)}`);
console.log(`offInlineGFM ${JSON.stringify(offInlineGFM)}`);
console.log(`regex ${cap[0].length} offInline ${offInline[0].length} offInlineGFM ${offInlineGFM[0].length}`);
src = src.substring(cap[0].length);
out += this.renderer.text(escape(this.smartypants(cap[0])));
continue;
Expand Down Expand Up @@ -1530,6 +1664,8 @@ marked.defaults = marked.getDefaults();
* Expose
*/

// NOTE(review): this calls marked() on a fixed sample string every time this code is
// evaluated and discards the result; it looks like a debugging leftover - confirm and
// remove before merging.
marked(' # # ####A');

// Attach the Parser constructor and its parse function to the marked function object.
marked.Parser = Parser;
marked.parser = Parser.parse;

Expand Down

0 comments on commit 24d4a5e

Please sign in to comment.