From 512cd7ab9e3e33be925d88074678b51afaa6b7f2 Mon Sep 17 00:00:00 2001 From: Titus Date: Sun, 29 Mar 2020 18:30:20 +0200 Subject: [PATCH] parse: fix support for literal URLs Related-to GH-479. Closes GH-478. Closes GH-481. --- packages/remark-parse/lib/locate/url.js | 15 +- packages/remark-parse/lib/tokenize/url.js | 247 ++- packages/remark-parse/package.json | 1 + test/fixtures/input/literal-url.text | 57 + test/fixtures/tree/auto-link-url.json | 56 +- .../tree/entities-advanced.commonmark.json | 78 +- ...entities-advanced.commonmark.pedantic.json | 78 +- test/fixtures/tree/entities-advanced.json | 78 +- .../tree/entities-advanced.pedantic.json | 78 +- .../fixtures/tree/literal-url.commonmark.json | 1912 +++++++++++++++++ test/fixtures/tree/literal-url.json | 1912 +++++++++++++++++ test/fixtures/tree/literal-url.nogfm.json | 1005 +++++++++ 12 files changed, 5097 insertions(+), 420 deletions(-) create mode 100644 test/fixtures/input/literal-url.text create mode 100644 test/fixtures/tree/literal-url.commonmark.json create mode 100644 test/fixtures/tree/literal-url.json create mode 100644 test/fixtures/tree/literal-url.nogfm.json diff --git a/packages/remark-parse/lib/locate/url.js b/packages/remark-parse/lib/locate/url.js index e5bf5bfa4..c2cc1beba 100644 --- a/packages/remark-parse/lib/locate/url.js +++ b/packages/remark-parse/lib/locate/url.js @@ -2,22 +2,25 @@ module.exports = locate -var protocols = ['https://', 'http://', 'mailto:'] +var values = ['www.', 'http://', 'https://'] function locate(value, fromIndex) { - var length = protocols.length - var index = -1 var min = -1 + var index + var length var position if (!this.options.gfm) { - return -1 + return min } + length = values.length + index = -1 + while (++index < length) { - position = value.indexOf(protocols[index], fromIndex) + position = value.indexOf(values[index], fromIndex) - if (position !== -1 && (position < min || min === -1)) { + if (position !== -1 && (min === -1 || position < min)) { min = 
position } } diff --git a/packages/remark-parse/lib/tokenize/url.js b/packages/remark-parse/lib/tokenize/url.js index 92d1c6229..30077cace 100644 --- a/packages/remark-parse/lib/tokenize/url.js +++ b/packages/remark-parse/lib/tokenize/url.js @@ -1,6 +1,9 @@ 'use strict' +var ccount = require('ccount') var decode = require('parse-entities') +var decimal = require('is-decimal') +var alphabetical = require('is-alphabetical') var whitespace = require('is-whitespace-character') var locate = require('../locate/url') @@ -8,146 +11,200 @@ module.exports = url url.locator = locate url.notInLink = true -var quotationMark = '"' -var apostrophe = "'" -var leftParenthesis = '(' -var rightParenthesis = ')' -var comma = ',' -var dot = '.' -var colon = ':' -var semicolon = ';' -var lessThan = '<' -var atSign = '@' -var leftSquareBracket = '[' -var rightSquareBracket = ']' - -var http = 'http://' -var https = 'https://' -var mailto = 'mailto:' - -var protocols = [http, https, mailto] - -var protocolsLength = protocols.length +var exclamationMark = 33 // '!' +var ampersand = 38 // '&' +var rightParenthesis = 41 // ')' +var asterisk = 42 // '*' +var comma = 44 // ',' +var dash = 45 // '-' +var dot = 46 // '.' +var colon = 58 // ':' +var semicolon = 59 // ';' +var questionMark = 63 // '?' 
+var lessThan = 60 // '<' +var underscore = 95 // '_' +var tilde = 126 // '~' + +var leftParenthesisCharacter = '(' +var rightParenthesisCharacter = ')' function url(eat, value, silent) { var self = this - var subvalue - var content - var character + var gfm = self.options.gfm + var tokenizers = self.inlineTokenizers + var length = value.length + var previousDot = -1 + var protocolless = false + var dots + var lastTwoPartsStart + var start var index - var position - var protocol - var match - var length - var queue - var parenCount - var nextCharacter - var tokenizers + var pathStart + var path + var code + var end + var leftCount + var rightCount + var content + var children + var url var exit - if (!self.options.gfm) { + if (!gfm) { + return + } + + // `WWW.` doesn’t work. + if (value.slice(0, 4) === 'www.') { + protocolless = true + index = 4 + } else if (value.slice(0, 7).toLowerCase() === 'http://') { + index = 7 + } else if (value.slice(0, 8).toLowerCase() === 'https://') { + index = 8 + } else { return } - subvalue = '' - index = -1 + // Act as if the starting boundary is a dot. + previousDot = index - 1 - while (++index < protocolsLength) { - protocol = protocols[index] - match = value.slice(0, protocol.length) + // Parse a valid domain. + start = index + dots = [] - if (match.toLowerCase() === protocol) { - subvalue = match - break + while (index < length) { + code = value.charCodeAt(index) + + if (code === dot) { + // Dots may not appear after each other. + if (previousDot === index - 1) { + break + } + + dots.push(index) + previousDot = index + index++ + continue } + + if ( + decimal(code) || + alphabetical(code) || + code === dash || + code === underscore + ) { + index++ + continue + } + + break + } + + // Ignore a final dot: + if (code === dot) { + dots.pop() + index-- } - if (!subvalue) { + // If there are no dots, exit. 
+ if (dots[0] === undefined) { return } - index = subvalue.length - length = value.length - queue = '' - parenCount = 0 + // If there is an underscore in the last two domain parts, exit: + // `www.example.c_m` and `www.ex_ample.com` are not OK, but + // `www.sub_domain.example.com` is. + lastTwoPartsStart = dots.length < 2 ? start : dots[dots.length - 2] + 1 + if (value.slice(lastTwoPartsStart, index).indexOf('_') !== -1) { + return + } + + /* istanbul ignore if - never used (yet) */ + if (silent) { + return true + } + + end = index + pathStart = index + + // Parse a path. while (index < length) { - character = value.charAt(index) + code = value.charCodeAt(index) - if (whitespace(character) || character === lessThan) { + if (whitespace(code) || code === lessThan) { break } + index++ + if ( - character === dot || - character === comma || - character === colon || - character === semicolon || - character === quotationMark || - character === apostrophe || - character === rightParenthesis || - character === rightSquareBracket + code === exclamationMark || + code === asterisk || + code === comma || + code === dot || + code === colon || + code === questionMark || + code === underscore || + code === tilde ) { - nextCharacter = value.charAt(index + 1) - - if (!nextCharacter || whitespace(nextCharacter)) { - break - } + // Empty + } else { + end = index } + } - if (character === leftParenthesis || character === leftSquareBracket) { - parenCount++ - } + index = end - if (character === rightParenthesis || character === rightSquareBracket) { - parenCount-- + // If the path ends in a closing paren, and the count of closing parens is + // higher than the opening count, then remove the superfluous closing parens. 
+ if (value.charCodeAt(index - 1) === rightParenthesis) { + path = value.slice(pathStart, index) + leftCount = ccount(path, leftParenthesisCharacter) + rightCount = ccount(path, rightParenthesisCharacter) - if (parenCount < 0) { - break - } + while (rightCount > leftCount) { + index = pathStart + path.lastIndexOf(rightParenthesisCharacter) + path = value.slice(pathStart, index) + rightCount-- } - - queue += character - index++ } - if (!queue) { - return - } + if (value.charCodeAt(index - 1) === semicolon) { + // GitHub doesn’t document this, but final semicolons aren’t part of the + // URL either. + index-- - subvalue += queue - content = subvalue + // If the path ends in what looks like an entity, it’s not part of the path. + if (alphabetical(value.charCodeAt(index - 1))) { + end = index - 2 - if (protocol === mailto) { - position = queue.indexOf(atSign) + while (alphabetical(value.charCodeAt(end))) { + end-- + } - if (position === -1 || position === length - 1) { - return + if (value.charCodeAt(end) === ampersand) { + index = end + } } - - content = content.slice(mailto.length) } - /* istanbul ignore if - never used (yet) */ - if (silent) { - return true + content = value.slice(0, index) + url = decode(content, {nonTerminated: false}) + + if (protocolless) { + url = 'http://' + url } exit = self.enterLink() // Temporarily remove all tokenizers except text in url. 
- tokenizers = self.inlineTokenizers self.inlineTokenizers = {text: tokenizers.text} - - content = self.tokenizeInline(content, eat.now()) - + children = self.tokenizeInline(content, eat.now()) self.inlineTokenizers = tokenizers + exit() - return eat(subvalue)({ - type: 'link', - title: null, - url: decode(subvalue, {nonTerminated: false}), - children: content - }) + return eat(content)({type: 'link', title: null, url: url, children: children}) } diff --git a/packages/remark-parse/package.json b/packages/remark-parse/package.json index 854bb84b0..e3eb9f207 100644 --- a/packages/remark-parse/package.json +++ b/packages/remark-parse/package.json @@ -37,6 +37,7 @@ "types/index.d.ts" ], "dependencies": { + "ccount": "^1.0.0", "collapse-white-space": "^1.0.2", "is-alphabetical": "^1.0.0", "is-decimal": "^1.0.0", diff --git a/test/fixtures/input/literal-url.text b/test/fixtures/input/literal-url.text new file mode 100644 index 000000000..03ccfde62 --- /dev/null +++ b/test/fixtures/input/literal-url.text @@ -0,0 +1,57 @@ +# Literal URLs + +## Extended www “autolinks” + +Here’s a URL: www.alpha.org. + +Visit www.bravo.org/help for more information. + +Dots cannot appear together: www..one.com and www.two..com. + +And www. is not a URL, neither are www.a nor www.b. + +Underscores cannot be used in the last two domain parts, so www.three.c_m, +and www.fo_ur.com are not URLs, but www.fi_ve.six.com is. + +Valid, and the dot is not part of the link: Visit www.charlie.org. + +Valid, and this dot isn’t part of the link either: Visit www.delta.org/a.b. + +Valid, but two dots are both not part of the URL: www.example.com.. 
+ +Here are parens: www.echo.com/search?q=Markup+(business) + +The last two aren’t part of the +link: www.foxtrot.com/search?q=golf+(hotel))) + +These first and last ones aren’t +either: (www.india.com/search?q=juliett+(kilo)) + +This last one is: (www.lima.com/search?q=mike+(november) + +Paren counting is only done if the last character is a closing +paren: www.google.com/search?q=(business))+ok + +If it “looks” like an entity at the end, it isn’t included. + +This is a whole URL: www.entity.com/search?q=alpha&hl=en + +This is a without the semicolon: www.entity.com/search?q=bravo&; + +This one is without the “entity”: www.entity.com/search?q=charlie&hl; + +This one one too: www.entity.com/search?q=delta© + +Only “named” ones work, numericals don’t, so this one is only without the +semicolon: www.entity.com/search?q=delta∊ + +`<` immediately ends an autolink: www.alpha.org/he