Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parse: fix support for literal URLs #481

Merged
merged 1 commit into from
Mar 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions packages/remark-parse/lib/locate/url.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,25 @@

module.exports = locate

var protocols = ['https://', 'http://', 'mailto:']
var values = ['www.', 'http://', 'https://']

function locate(value, fromIndex) {
var length = protocols.length
var index = -1
var min = -1
var index
var length
var position

if (!this.options.gfm) {
return -1
return min
}

length = values.length
index = -1

while (++index < length) {
position = value.indexOf(protocols[index], fromIndex)
position = value.indexOf(values[index], fromIndex)

if (position !== -1 && (position < min || min === -1)) {
if (position !== -1 && (min === -1 || position < min)) {
min = position
}
}
Expand Down
247 changes: 152 additions & 95 deletions packages/remark-parse/lib/tokenize/url.js
Original file line number Diff line number Diff line change
@@ -1,153 +1,210 @@
'use strict'

var ccount = require('ccount')
var decode = require('parse-entities')
var decimal = require('is-decimal')
var alphabetical = require('is-alphabetical')
var whitespace = require('is-whitespace-character')
var locate = require('../locate/url')

module.exports = url
url.locator = locate
url.notInLink = true

var quotationMark = '"'
var apostrophe = "'"
var leftParenthesis = '('
var rightParenthesis = ')'
var comma = ','
var dot = '.'
var colon = ':'
var semicolon = ';'
var lessThan = '<'
var atSign = '@'
var leftSquareBracket = '['
var rightSquareBracket = ']'

var http = 'http://'
var https = 'https://'
var mailto = 'mailto:'

var protocols = [http, https, mailto]

var protocolsLength = protocols.length
var exclamationMark = 33 // '!'
var ampersand = 38 // '&'
var rightParenthesis = 41 // ')'
var asterisk = 42 // '*'
var comma = 44 // ','
var dash = 45 // '-'
var dot = 46 // '.'
var colon = 58 // ':'
var semicolon = 59 // ';'
var questionMark = 63 // '?'
var lessThan = 60 // '<'
var underscore = 95 // '_'
var tilde = 126 // '~'

var leftParenthesisCharacter = '('
var rightParenthesisCharacter = ')'

function url(eat, value, silent) {
var self = this
var subvalue
var content
var character
var gfm = self.options.gfm
var tokenizers = self.inlineTokenizers
var length = value.length
var previousDot = -1
var protocolless = false
var dots
var lastTwoPartsStart
var start
var index
var position
var protocol
var match
var length
var queue
var parenCount
var nextCharacter
var tokenizers
var pathStart
var path
var code
var end
var leftCount
var rightCount
var content
var children
var url
var exit

if (!self.options.gfm) {
if (!gfm) {
return
}

// `WWW.` doesn’t work.
if (value.slice(0, 4) === 'www.') {
protocolless = true
index = 4
} else if (value.slice(0, 7).toLowerCase() === 'http://') {
index = 7
} else if (value.slice(0, 8).toLowerCase() === 'https://') {
index = 8
} else {
return
}

subvalue = ''
index = -1
// Act as if the starting boundary is a dot.
previousDot = index - 1

while (++index < protocolsLength) {
protocol = protocols[index]
match = value.slice(0, protocol.length)
// Parse a valid domain.
start = index
dots = []

if (match.toLowerCase() === protocol) {
subvalue = match
break
while (index < length) {
code = value.charCodeAt(index)

if (code === dot) {
// Dots may not appear after each other.
if (previousDot === index - 1) {
break
}

dots.push(index)
previousDot = index
index++
continue
}

if (
decimal(code) ||
alphabetical(code) ||
code === dash ||
code === underscore
) {
index++
continue
}

break
}

// Ignore a final dot:
if (code === dot) {
dots.pop()
index--
}

if (!subvalue) {
// If there are no dots, exit.
if (dots[0] === undefined) {
return
}

index = subvalue.length
length = value.length
queue = ''
parenCount = 0
// If there is an underscore in the last two domain parts, exit:
// `www.example.c_m` and `www.ex_ample.com` are not OK, but
// `www.sub_domain.example.com` is.
lastTwoPartsStart = dots.length < 2 ? start : dots[dots.length - 2] + 1

if (value.slice(lastTwoPartsStart, index).indexOf('_') !== -1) {
return
}

/* istanbul ignore if - never used (yet) */
if (silent) {
return true
}

end = index
pathStart = index

// Parse a path.
while (index < length) {
character = value.charAt(index)
code = value.charCodeAt(index)

if (whitespace(character) || character === lessThan) {
if (whitespace(code) || code === lessThan) {
break
}

index++

if (
character === dot ||
character === comma ||
character === colon ||
character === semicolon ||
character === quotationMark ||
character === apostrophe ||
character === rightParenthesis ||
character === rightSquareBracket
code === exclamationMark ||
code === asterisk ||
code === comma ||
code === dot ||
code === colon ||
code === questionMark ||
code === underscore ||
code === tilde
) {
nextCharacter = value.charAt(index + 1)

if (!nextCharacter || whitespace(nextCharacter)) {
break
}
// Empty
} else {
end = index
}
}

if (character === leftParenthesis || character === leftSquareBracket) {
parenCount++
}
index = end

if (character === rightParenthesis || character === rightSquareBracket) {
parenCount--
// If the path ends in a closing paren, and the count of closing parens is
// higher than the opening count, then remove the superfluous closing parens.
if (value.charCodeAt(index - 1) === rightParenthesis) {
path = value.slice(pathStart, index)
leftCount = ccount(path, leftParenthesisCharacter)
rightCount = ccount(path, rightParenthesisCharacter)

if (parenCount < 0) {
break
}
while (rightCount > leftCount) {
index = pathStart + path.lastIndexOf(rightParenthesisCharacter)
path = value.slice(pathStart, index)
rightCount--
}

queue += character
index++
}

if (!queue) {
return
}
if (value.charCodeAt(index - 1) === semicolon) {
// GitHub doesn’t document this, but final semicolons aren’t part of the
// URL either.
index--

subvalue += queue
content = subvalue
// If the path ends in what looks like an entity, it’s not part of the path.
if (alphabetical(value.charCodeAt(index - 1))) {
end = index - 2

if (protocol === mailto) {
position = queue.indexOf(atSign)
while (alphabetical(value.charCodeAt(end))) {
end--
}

if (position === -1 || position === length - 1) {
return
if (value.charCodeAt(end) === ampersand) {
index = end
}
}

content = content.slice(mailto.length)
}

/* istanbul ignore if - never used (yet) */
if (silent) {
return true
content = value.slice(0, index)
url = decode(content, {nonTerminated: false})

if (protocolless) {
url = 'http://' + url
}

exit = self.enterLink()

// Temporarily remove all tokenizers except text in url.
tokenizers = self.inlineTokenizers
self.inlineTokenizers = {text: tokenizers.text}

content = self.tokenizeInline(content, eat.now())

children = self.tokenizeInline(content, eat.now())
self.inlineTokenizers = tokenizers

exit()

return eat(subvalue)({
type: 'link',
title: null,
url: decode(subvalue, {nonTerminated: false}),
children: content
})
return eat(content)({type: 'link', title: null, url: url, children: children})
}
1 change: 1 addition & 0 deletions packages/remark-parse/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"types/index.d.ts"
],
"dependencies": {
"ccount": "^1.0.0",
"collapse-white-space": "^1.0.2",
"is-alphabetical": "^1.0.0",
"is-decimal": "^1.0.0",
Expand Down
Loading