Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[superseded] Adding support for user-defined number suffixes #17020

Closed
wants to merge 11 commits into from
3 changes: 3 additions & 0 deletions compiler/docgen.nim
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,9 @@ proc nodeToHighlightedHtml(d: PDoc; n: PNode; result: var Rope; renderFlags: TRe
of tkStrLit..tkTripleStrLit:
dispA(d.conf, result, "<span class=\"StringLit\">$1</span>",
"\\spanStringLit{$1}", [escLit])
of tkStrNumLit:
dispA(d.conf, result, "<span class=\"StringLit\">$1</span>",
"\\spanStringLit{$1}", [escLit])
of tkCharLit:
dispA(d.conf, result, "<span class=\"CharLit\">$1</span>", "\\spanCharLit{$1}",
[escLit])
Expand Down
225 changes: 150 additions & 75 deletions compiler/lexer.nim
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ type
tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
tkStrNumLit = "tkStrNumLit",

tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
Expand All @@ -74,11 +75,17 @@ type

TokTypes* = set[TokType]

when defined(nimsuggest):
timotheecour marked this conversation as resolved.
Show resolved Hide resolved
# tokens that should not be considered for previousToken
const weakTokens = {tkComma, tkSemiColon, tkColon,
tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
tkCurlyRi}

const
weakTokens = {tkComma, tkSemiColon, tkColon,
tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
tkCurlyRi} # \
# tokens that should not be considered for previousToken
# when a minus (-) is found in front of a numeric literal, if the previous
# token is one of these then it is a negative numeric literal
negationPrefixes = {tkComma, tkColon, tkParLe, tkBracketLe, tkSemiColon,
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
tkBracketDotLe, tkCurlyDotLe, tkParDotLe}
tokKeywordLow* = succ(tkSymbol)
tokKeywordHigh* = pred(tkIntLit)

Expand Down Expand Up @@ -119,6 +126,7 @@ type
cache*: IdentCache
when defined(nimsuggest):
previousToken: TLineInfo
previousTokType*: TokType
config*: ConfigRef

proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
Expand Down Expand Up @@ -296,8 +304,8 @@ proc getNumber(L: var Lexer, result: var Token) =
if L.buf[pos] == '_':
if L.buf[pos+1] notin chars:
lexMessage(L, errGenerated,
"only single underscores may occur in a token and token may not " &
"end with an underscore: e.g. '1__1' and '1_' are invalid")
"only single underscores may occur in a number and a number may " &
"not end with an underscore: e.g. '1__1' and '1_' are invalid")
break
tok.literal.add('_')
inc(pos)
Expand Down Expand Up @@ -332,22 +340,45 @@ proc getNumber(L: var Lexer, result: var Token) =
L.bufpos = msgPos
lexMessage(L, msgKind, msg % t.literal)

proc cmpSuffix(L: Lexer, prev: int, endOfSuffix: int, chs: openArray[char]): bool =
  ## Reports whether the characters following `L.buf[prev]` up to (but not
  ## including) `L.buf[endOfSuffix]` are exactly the characters in `chs`.
  ##
  ## `prev` is the index of the suffix's first character (which is not
  ## inspected here) and `endOfSuffix` is the index one past its last
  ## character. Accepting an `openArray` lets a single routine serve
  ## suffixes of any length; array literals such as `['8']`, `['6', '4']`
  ## or `['1', '2', '8']` bind to it unchanged.
  # The length has to match exactly so that a longer suffix that merely
  # starts with `chs` is rejected.
  if endOfSuffix - prev != chs.len + 1:
    return false
  for i in 0 ..< chs.len:
    if L.buf[prev + 1 + i] != chs[i]:
      return false
  return true
JohnAD marked this conversation as resolved.
Show resolved Hide resolved

var
startpos, endpos: int
xi: BiggestInt
isBase10 = true
numDigits = 0
isNegative = false
const
# 'c', 'C' is deprecated
baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'}
literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''}
floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit}
signedIntTypes = {tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit}

result.tokType = tkIntLit # int literal until we know better
result.literal = ""
result.base = base10
startpos = L.bufpos
tokenBegin(result, startpos)

# check for leading minus sign
if L.buf[L.bufpos] == '-':
eatChar(L, result, '-')
isNegative = true

# First stage: find out base, make verifications, build token literal string
# {'c', 'C'} is added for deprecation reasons to provide a clear error message
if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
Expand Down Expand Up @@ -377,7 +408,7 @@ proc getNumber(L: var Lexer, result: var Token) =
else:
internalError(L.config, getLineInfo(L), "getNumber")
if numDigits == 0:
lexMessageLitNum(L, "invalid number: '$1'", startpos)
lexMessageLitNum(L, "invalid number (empty): '$1'", startpos)
else:
discard matchUnderscoreChars(L, result, {'0'..'9'})
if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
Expand All @@ -386,82 +417,96 @@ proc getNumber(L: var Lexer, result: var Token) =
discard matchUnderscoreChars(L, result, {'0'..'9'})
if L.buf[L.bufpos] in {'e', 'E'}:
result.tokType = tkFloatLit
eatChar(L, result, 'e')
eatChar(L, result, L.buf[L.bufpos])
if L.buf[L.bufpos] in {'+', '-'}:
eatChar(L, result)
discard matchUnderscoreChars(L, result, {'0'..'9'})
endpos = L.bufpos

# Second stage, find out if there's a datatype suffix and handle it
var postPos = endpos
if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
if L.buf[postPos] == '\'':
inc(postPos)

case L.buf[postPos]
of 'f', 'F':
inc(postPos)
if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
result.tokType = tkFloat32Lit
inc(postPos, 2)
elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
result.tokType = tkFloat64Lit
inc(postPos, 2)
elif (L.buf[postPos] == '1') and
(L.buf[postPos + 1] == '2') and
(L.buf[postPos + 2] == '8'):
result.tokType = tkFloat128Lit
inc(postPos, 3)
else: # "f" alone defaults to float32
result.tokType = tkFloat32Lit
of 'd', 'D': # ad hoc convenience shortcut for f64
let suffixMarker = (L.buf[postPos] == '\'')
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
if suffixMarker:
inc(postPos)
if (L.buf[postPos] notin SymStartChars):
timotheecour marked this conversation as resolved.
Show resolved Hide resolved
lexMessageLitNum(L, "invalid number suffix: '$1'", startpos)

# 2A: handle the builtin literal versions
var internalSuffix = false
var endOfSuffix = postPos
while L.buf[endOfSuffix] in SymStartChars + {'0'..'9'}:
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
inc(endOfSuffix)
if L.buf[postPos] in {'f', 'F'}:
internalSuffix = true # tentatively found
if postPos == endOfSuffix - 1: # stand-alone 'f'
result.tokType = tkFloat32Lit
inc(postPos)
elif cmpSuffix(L, postPos, endOfSuffix, ['3', '2']):
result.tokType = tkFloat32Lit
inc(postPos, 3)
elif cmpSuffix(L, postPos, endOfSuffix, ['6', '4']):
result.tokType = tkFloat64Lit
inc(postPos, 3)
elif cmpSuffix(L, postPos, endOfSuffix, ['1', '2', '8']):
result.tokType = tkFloat128Lit
inc(postPos, 4)
else:
internalSuffix = false # not found after all
elif L.buf[postPos] in {'d', 'D'}:
if postPos == endOfSuffix - 1: # stand-alone 'd'
result.tokType = tkFloat64Lit
of 'i', 'I':
inc(postPos)
if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
result.tokType = tkInt64Lit
inc(postPos, 2)
elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
result.tokType = tkInt32Lit
inc(postPos, 2)
elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
result.tokType = tkInt16Lit
inc(postPos, 2)
elif (L.buf[postPos] == '8'):
result.tokType = tkInt8Lit
inc(postPos)
else:
lexMessageLitNum(L, "invalid number: '$1'", startpos)
of 'u', 'U':
internalSuffix = true
elif L.buf[postPos] in {'i', 'I'}:
internalSuffix = true # tentatively found
if cmpSuffix(L, postPos, endOfSuffix, ['8']):
result.tokType = tkInt8Lit
inc(postPos, 2)
elif cmpSuffix(L, postPos, endOfSuffix, ['1', '6']):
result.tokType = tkInt16Lit
inc(postPos, 3)
elif cmpSuffix(L, postPos, endOfSuffix, ['3', '2']):
result.tokType = tkInt32Lit
inc(postPos, 3)
elif cmpSuffix(L, postPos, endOfSuffix, ['6', '4']):
result.tokType = tkInt64Lit
inc(postPos, 3)
else:
internalSuffix = false # not found after all
elif L.buf[postPos] in {'u', 'U'}:
internalSuffix = true # tentatively found
if postPos == endOfSuffix - 1: # stand-alone 'u'
result.tokType = tkUIntLit
inc(postPos)
if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
result.tokType = tkUInt64Lit
inc(postPos, 2)
elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
result.tokType = tkUInt32Lit
inc(postPos, 2)
elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
result.tokType = tkUInt16Lit
inc(postPos, 2)
elif (L.buf[postPos] == '8'):
result.tokType = tkUInt8Lit
inc(postPos)
else:
result.tokType = tkUIntLit
elif cmpSuffix(L, postPos, endOfSuffix, ['8']):
result.tokType = tkUInt8Lit
inc(postPos, 2)
elif cmpSuffix(L, postPos, endOfSuffix, ['1', '6']):
result.tokType = tkUInt16Lit
inc(postPos, 3)
elif cmpSuffix(L, postPos, endOfSuffix, ['3', '2']):
result.tokType = tkUInt32Lit
inc(postPos, 3)
elif cmpSuffix(L, postPos, endOfSuffix, ['6', '4']):
result.tokType = tkUInt64Lit
inc(postPos, 3)
else:
lexMessageLitNum(L, "invalid number: '$1'", startpos)
internalSuffix = false # not found after all

# Is there still a literalish char awaiting? Then it's an error!
if L.buf[postPos] in literalishChars or
(L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}):
lexMessageLitNum(L, "invalid number: '$1'", startpos)
# 2B: else look for user-defined types that are adjacent (no spaces)
if suffixMarker and not internalSuffix:
result.tokType = tkStrNumLit
L.bufpos = endpos # do NOT trim off the suffix
return

# Third stage, extract actual number
# Third stage, extract actual number as a fitting literal
L.bufpos = startpos # restore position
var pos: int = startpos
if L.buf[pos] == '-':
inc(pos)
try:
if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
# place the non-base10 number into result.iNumber or result.fNumber
inc(pos, 2)
xi = 0 # it is a base prefix

Expand Down Expand Up @@ -527,12 +572,22 @@ proc getNumber(L: var Lexer, result: var Token) =
of tkInt16Lit: (xi > BiggestInt(uint16.high))
of tkInt32Lit: (xi > BiggestInt(uint32.high))
else: false

if outOfRange:
#echo "out of range num: ", result.iNumber, " vs ", xi
lexMessageLitNum(L, "number out of range: '$1'", startpos)

# make negative when a sign starts the literal

if isNegative:
case result.tokType:
of floatTypes:
result.fNumber = -result.fNumber
of signedIntTypes:
result.iNumber = -result.iNumber
else:
lexMessageLitNum(L, "cannot assign a negative value to an unsigned type: '$1'", startpos)
else:
# place the base10 number into result.iNumber or result.fNumber
# the parsing routines already handle the isNegative case
case result.tokType
of floatTypes:
result.fNumber = parseFloat(result.literal)
Expand Down Expand Up @@ -571,6 +626,7 @@ proc getNumber(L: var Lexer, result: var Token) =

if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos)


# Promote int literal to int64? Not always necessary, but more consistent
if result.tokType == tkIntLit:
if result.iNumber > high(int32):
Expand Down Expand Up @@ -733,9 +789,8 @@ proc getEscapedChar(L: var Lexer, tok: var Token) =

proc handleCRLF(L: var Lexer, pos: int): int =
template registerLine =
let col = L.getColNumber(pos)

when not defined(nimpretty):
let col = L.getColNumber(pos)
if col > MaxLineLength:
lexMessagePos(L, hintLineTooLong, pos)

Expand Down Expand Up @@ -859,6 +914,12 @@ proc getSymbol(L: var Lexer, tok: var Token) =
break
inc(pos)
suspicious = true
of '\'':
if pos==L.bufpos: # leading single quote only allowed at start
h = h !& ord(c)
inc(pos)
else:
break
else: break
tokenEnd(tok, pos-1)
h = !$h
Expand Down Expand Up @@ -1072,6 +1133,7 @@ proc scanComment(L: var Lexer, tok: var Token) =
tok.commentOffsetB = L.offsetBase + pos - 1

proc skip(L: var Lexer, tok: var Token) =
# advance the lexer past whitespace and comments while accounting for indents
var pos = L.bufpos
tokenBegin(tok, pos)
tok.strongSpaceA = 0
Expand Down Expand Up @@ -1150,6 +1212,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
if tok.tokType notin weakTokens:
L.previousToken.line = tok.line.uint16
L.previousToken.col = tok.col.int16
L.previousTokType = tok.tokType

fillToken(tok)
if L.indentAhead >= 0:
Expand Down Expand Up @@ -1270,14 +1333,26 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
# tkTripleStrLit -> tkGTripleStrLit
inc(tok.tokType, 2)
of '\'':
tok.tokType = tkCharLit
getCharacter(L, tok)
tok.tokType = tkCharLit
if (L.previousTokType == tkStrNumLit) and (tok.strongSpaceA == 0):
# if the previous token (with no prior whitespace) is a tkStrNumLit, then this is a numeric suffix
getSymbol(L, tok) # example: -12'big
elif L.previousTokType == tkAccent:
getSymbol(L, tok) # example: `'big`
else:
tok.tokType = tkCharLit
getCharacter(L, tok)
tok.tokType = tkCharLit
of '0'..'9':
getNumber(L, tok)
let c = L.buf[L.bufpos]
if c in SymChars+{'_'}:
if L.buf[L.bufpos] in SymChars+{'_'}:
lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
of '-':
if ((tok.strongSpaceA > 0) or (L.previousTokType in negationPrefixes)) and (L.buf[L.bufpos + 1] in '0'..'9'):
getNumber(L, tok)
if L.buf[L.bufpos] in SymChars+{'_'}:
lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
else:
getOperator(L, tok)
else:
if c in OpChars:
getOperator(L, tok)
Expand Down
Loading