Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[superseded] Adding support for user-defined number suffixes #17020

Closed
wants to merge 11 commits into from
3 changes: 3 additions & 0 deletions compiler/docgen.nim
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,9 @@ proc nodeToHighlightedHtml(d: PDoc; n: PNode; result: var Rope; renderFlags: TRe
of tkStrLit..tkTripleStrLit:
dispA(d.conf, result, "<span class=\"StringLit\">$1</span>",
"\\spanStringLit{$1}", [escLit])
of tkStrNumLit:
dispA(d.conf, result, "<span class=\"StringLit\">$1</span>",
"\\spanStringLit{$1}", [escLit])
of tkCharLit:
dispA(d.conf, result, "<span class=\"CharLit\">$1</span>", "\\spanCharLit{$1}",
[escLit])
Expand Down
184 changes: 124 additions & 60 deletions compiler/lexer.nim
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ type
tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
tkStrNumLit = "tkStrNumLit",

tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
Expand All @@ -74,11 +75,14 @@ type

TokTypes* = set[TokType]

when defined(nimsuggest):
timotheecour marked this conversation as resolved.
Show resolved Hide resolved
# tokens that should not be considered for previousToken
const weakTokens = {tkComma, tkSemiColon, tkColon,
tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
tkCurlyRi}

const
weakTokens = {tkComma, tkSemiColon, tkColon,
tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
tkCurlyRi} # \
# tokens that should not be considered for previousToken
negationPrefixes = {tkComma, tkColon, tkParLe, tkBracketLe, tkCurlyLe}
tokKeywordLow* = succ(tkSymbol)
tokKeywordHigh* = pred(tkIntLit)

Expand Down Expand Up @@ -119,6 +123,7 @@ type
cache*: IdentCache
when defined(nimsuggest):
previousToken: TLineInfo
previousTokType*: TokType
config*: ConfigRef

proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
Expand Down Expand Up @@ -296,7 +301,7 @@ proc getNumber(L: var Lexer, result: var Token) =
if L.buf[pos] == '_':
if L.buf[pos+1] notin chars:
lexMessage(L, errGenerated,
"only single underscores may occur in a token and token may not " &
"only single underscores may occur in a number and number may not " &
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
"end with an underscore: e.g. '1__1' and '1_' are invalid")
break
tok.literal.add('_')
Expand Down Expand Up @@ -337,17 +342,25 @@ proc getNumber(L: var Lexer, result: var Token) =
xi: BiggestInt
isBase10 = true
numDigits = 0
isNegative = false
const
# 'c', 'C' is deprecated
baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'}
literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''}
floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit}
signedIntTypes = {tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit}

result.tokType = tkIntLit # int literal until we know better
result.literal = ""
result.base = base10
startpos = L.bufpos
tokenBegin(result, startpos)

# check for leading minus sign
if L.buf[L.bufpos] == '-':
eatChar(L, result, '-')
isNegative = true

# First stage: find out base, make verifications, build token literal string
# {'c', 'C'} is added for deprecation reasons to provide a clear error message
if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
Expand Down Expand Up @@ -377,7 +390,7 @@ proc getNumber(L: var Lexer, result: var Token) =
else:
internalError(L.config, getLineInfo(L), "getNumber")
if numDigits == 0:
lexMessageLitNum(L, "invalid number: '$1'", startpos)
lexMessageLitNum(L, "invalid number (empty): '$1'", startpos)
else:
discard matchUnderscoreChars(L, result, {'0'..'9'})
if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
Expand All @@ -394,74 +407,94 @@ proc getNumber(L: var Lexer, result: var Token) =

# Second stage, find out if there's a datatype suffix and handle it
var postPos = endpos
if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
if L.buf[postPos] == '\'':
inc(postPos)

var hasSuffix = false
var internalSuffix = false
if L.buf[postPos] == '\'':
hasSuffix = true
inc(postPos)
# 2A: handle the internal literal versions
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
if L.buf[postPos] in {'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
case L.buf[postPos]
of 'f', 'F':
inc(postPos)
if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
if (L.buf[postPos + 1] == '3') and (L.buf[postPos + 2] == '2'):
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
result.tokType = tkFloat32Lit
inc(postPos, 2)
elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
inc(postPos, 3)
internalSuffix = true
elif (L.buf[postPos + 1] == '6') and (L.buf[postPos + 2] == '4'):
result.tokType = tkFloat64Lit
inc(postPos, 2)
elif (L.buf[postPos] == '1') and
(L.buf[postPos + 1] == '2') and
(L.buf[postPos + 2] == '8'):
result.tokType = tkFloat128Lit
inc(postPos, 3)
else: # "f" alone defaults to float32
internalSuffix = true
elif (L.buf[postPos + 1] == '1') and
(L.buf[postPos + 2] == '2') and
(L.buf[postPos + 3] == '8'):
result.tokType = tkFloat128Lit
inc(postPos, 4)
internalSuffix = true
elif not (L.buf[postPos + 1] in SymChars): # standalone 'f'
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
result.tokType = tkFloat32Lit
inc(postPos)
internalSuffix = true
of 'd', 'D': # ad hoc convenience shortcut for f64
inc(postPos)
result.tokType = tkFloat64Lit
if not (L.buf[postPos + 1] in SymChars): # standalone 'd'
result.tokType = tkFloat64Lit
inc(postPos)
internalSuffix = true
of 'i', 'I':
inc(postPos)
if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
if (L.buf[postPos + 1] == '6') and (L.buf[postPos + 2] == '4'):
result.tokType = tkInt64Lit
inc(postPos, 2)
elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
inc(postPos, 3)
internalSuffix = true
elif (L.buf[postPos + 1] == '3') and (L.buf[postPos + 2] == '2'):
result.tokType = tkInt32Lit
inc(postPos, 2)
elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
inc(postPos, 3)
internalSuffix = true
elif (L.buf[postPos + 1] == '1') and (L.buf[postPos + 2] == '6'):
result.tokType = tkInt16Lit
inc(postPos, 2)
elif (L.buf[postPos] == '8'):
inc(postPos, 3)
internalSuffix = true
elif (L.buf[postPos + 1] == '8'):
result.tokType = tkInt8Lit
inc(postPos)
else:
lexMessageLitNum(L, "invalid number: '$1'", startpos)
inc(postPos, 2)
internalSuffix = true
of 'u', 'U':
inc(postPos)
if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
if (L.buf[postPos + 1] == '6') and (L.buf[postPos + 2] == '4'):
result.tokType = tkUInt64Lit
inc(postPos, 2)
elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
inc(postPos, 3)
internalSuffix = true
elif (L.buf[postPos + 1] == '3') and (L.buf[postPos + 2] == '2'):
result.tokType = tkUInt32Lit
inc(postPos, 2)
elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
inc(postPos, 3)
internalSuffix = true
elif (L.buf[postPos + 1] == '1') and (L.buf[postPos + 2] == '6'):
result.tokType = tkUInt16Lit
inc(postPos, 2)
elif (L.buf[postPos] == '8'):
inc(postPos, 3)
internalSuffix = true
elif (L.buf[postPos + 1] == '8'):
result.tokType = tkUInt8Lit
inc(postPos)
else:
inc(postPos, 2)
internalSuffix = true
elif not (L.buf[postPos + 1] in SymChars): # standalone 'u'
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
result.tokType = tkUIntLit
inc(postPos)
internalSuffix = true
else:
lexMessageLitNum(L, "invalid number: '$1'", startpos)

# Is there still a literalish char awaiting? Then it's an error!
if L.buf[postPos] in literalishChars or
(L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}):
lexMessageLitNum(L, "invalid number: '$1'", startpos)
discard
# 2B: else look for user-defined types that are adjacent (no spaces)
timotheecour marked this conversation as resolved.
Show resolved Hide resolved
if hasSuffix and not internalSuffix:
if L.buf[postPos] in SymStartChars:
result.tokType = tkStrNumLit
L.bufpos = endpos # do NOT trim off the suffix
return
lexMessageLitNum(L, "invalid number suffix: '$1'", startpos)

# Third stage, extract actual number
# Third stage, extract actual number as a fitting literal
L.bufpos = startpos # restore position
var pos: int = startpos
if L.buf[pos] == '-':
inc(pos)
try:
if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
# place the non-base10 number into result.iNumber or result.fNumber
inc(pos, 2)
xi = 0 # it is a base prefix

Expand Down Expand Up @@ -527,12 +560,22 @@ proc getNumber(L: var Lexer, result: var Token) =
of tkInt16Lit: (xi > BiggestInt(uint16.high))
of tkInt32Lit: (xi > BiggestInt(uint32.high))
else: false

if outOfRange:
#echo "out of range num: ", result.iNumber, " vs ", xi
lexMessageLitNum(L, "number out of range: '$1'", startpos)

# make negative when a sign starts the literal

if isNegative:
case result.tokType:
of floatTypes:
result.fNumber = -result.fNumber
of signedIntTypes:
result.iNumber = -result.iNumber
else:
lexMessageLitNum(L, "cannot assign a negative value to an unsigned type: '$1'", startpos)
else:
# place the base10 number into result.iNumber or result.fNumber
# the parsing routines already handle the isNegative case
case result.tokType
of floatTypes:
result.fNumber = parseFloat(result.literal)
Expand Down Expand Up @@ -571,6 +614,7 @@ proc getNumber(L: var Lexer, result: var Token) =

if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos)


# Promote int literal to int64? Not always necessary, but more consistent
if result.tokType == tkIntLit:
if result.iNumber > high(int32):
Expand Down Expand Up @@ -733,11 +777,11 @@ proc getEscapedChar(L: var Lexer, tok: var Token) =

proc handleCRLF(L: var Lexer, pos: int): int =
template registerLine =
let col = L.getColNumber(pos)

when not defined(nimpretty):
let col = L.getColNumber(pos)
if col > MaxLineLength:
lexMessagePos(L, hintLineTooLong, pos)
discard
JohnAD marked this conversation as resolved.
Show resolved Hide resolved

case L.buf[pos]
of CR:
Expand Down Expand Up @@ -859,6 +903,12 @@ proc getSymbol(L: var Lexer, tok: var Token) =
break
inc(pos)
suspicious = true
of '\'':
if pos==L.bufpos: # leading single quote only allowed at start
h = h !& ord(c)
inc(pos)
else:
break
else: break
tokenEnd(tok, pos-1)
h = !$h
Expand Down Expand Up @@ -1072,6 +1122,7 @@ proc scanComment(L: var Lexer, tok: var Token) =
tok.commentOffsetB = L.offsetBase + pos - 1

proc skip(L: var Lexer, tok: var Token) =
# advance the lexer past whitespace and comments while accounting for indents
var pos = L.bufpos
tokenBegin(tok, pos)
tok.strongSpaceA = 0
Expand Down Expand Up @@ -1150,6 +1201,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
if tok.tokType notin weakTokens:
L.previousToken.line = tok.line.uint16
L.previousToken.col = tok.col.int16
L.previousTokType = tok.tokType

fillToken(tok)
if L.indentAhead >= 0:
Expand Down Expand Up @@ -1270,14 +1322,26 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
# tkTripleStrLit -> tkGTripleStrLit
inc(tok.tokType, 2)
of '\'':
tok.tokType = tkCharLit
getCharacter(L, tok)
tok.tokType = tkCharLit
if (L.previousTokType == tkStrNumLit) and (tok.strongSpaceA == 0):
# if the previous token (with no prior whitespace) is a tkStrNumLit, then this is a numeric suffix
getSymbol(L, tok)
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
elif L.previousTokType == tkAccent:
getSymbol(L, tok)
JohnAD marked this conversation as resolved.
Show resolved Hide resolved
else:
tok.tokType = tkCharLit
getCharacter(L, tok)
tok.tokType = tkCharLit
of '0'..'9':
getNumber(L, tok)
let c = L.buf[L.bufpos]
if c in SymChars+{'_'}:
if L.buf[L.bufpos] in SymChars+{'_'}:
lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
of '-':
if ((tok.strongSpaceA > 0) or (L.previousTokType in negationPrefixes)) and (L.buf[L.bufpos + 1] in '0'..'9'):
getNumber(L, tok)
if L.buf[L.bufpos] in SymChars+{'_'}:
lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
else:
getOperator(L, tok)
else:
if c in OpChars:
getOperator(L, tok)
Expand Down
Loading