nim-lang · JohnAD · Feb 12, 2021 · Feb 28, 2021 · Mar 5, 2021 · Mar 12, 2021
diff --git a/compiler/docgen.nim b/compiler/docgen.nim
@@ -407,6 +407,9 @@ proc nodeToHighlightedHtml(d: PDoc; n: PNode; result: var Rope; renderFlags: TRe
     of tkStrLit..tkTripleStrLit:
       dispA(d.conf, result, "<span class=\"StringLit\">$1</span>",
             "\\spanStringLit{$1}", [escLit])
+    of tkStrNumLit:
+      dispA(d.conf, result, "<span class=\"StringLit\">$1</span>",
+            "\\spanStringLit{$1}", [escLit])
     of tkCharLit:
       dispA(d.conf, result, "<span class=\"CharLit\">$1</span>", "\\spanCharLit{$1}",
             [escLit])

diff --git a/compiler/lexer.nim b/compiler/lexer.nim
@@ -58,7 +58,8 @@ type
     tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
     tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
     tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
-    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit", 
+    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
+    tkStrNumLit = "tkStrNumLit",
 
     tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
     tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
@@ -74,11 +75,17 @@ type
 
   TokTypes* = set[TokType]
 
+when defined(nimsuggest):
+  # tokens that should not be considered for previousToken
+  const weakTokens = {tkComma, tkSemiColon, tkColon,
+                      tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
+                      tkCurlyRi}
+
 const
-  weakTokens = {tkComma, tkSemiColon, tkColon,
-                tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
-                tkCurlyRi} # \
-    # tokens that should not be considered for previousToken
+  # when a minus (-) is found in front of a numeric literal, if the previous
+  # token is one of these then it is a negative numeric literal
+  negationPrefixes = {tkComma, tkColon, tkParLe, tkBracketLe, tkSemiColon,
+                      tkBracketDotLe, tkCurlyDotLe, tkParDotLe}
   tokKeywordLow* = succ(tkSymbol)
   tokKeywordHigh* = pred(tkIntLit)
 
@@ -119,6 +126,7 @@ type
     cache*: IdentCache
     when defined(nimsuggest):
       previousToken: TLineInfo
+    previousTokType*: TokType
     config*: ConfigRef
 
 proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
@@ -296,8 +304,8 @@ proc getNumber(L: var Lexer, result: var Token) =
       if L.buf[pos] == '_':
         if L.buf[pos+1] notin chars:
           lexMessage(L, errGenerated,
-            "only single underscores may occur in a token and token may not " &
-            "end with an underscore: e.g. '1__1' and '1_' are invalid")
+            "only single underscores may occur in a number and a number may " &
+            "not end with an underscore: e.g. '1__1' and '1_' are invalid")
           break
         tok.literal.add('_')
         inc(pos)
@@ -332,22 +340,45 @@ proc getNumber(L: var Lexer, result: var Token) =
     L.bufpos = msgPos
     lexMessage(L, msgKind, msg % t.literal)
 
+  proc cmpSuffix(L: Lexer, prev: int, endOfSuffix: int, chs: array[1, char]): bool =
+    ## Two-char suffix check: the span `prev ..< endOfSuffix` has length 2 and
+    ## the char right after `prev` equals `chs[0]`.
+    (endOfSuffix - prev) == 2 and L.buf[prev + 1] == chs[0]
+
+  proc cmpSuffix(L: Lexer, prev: int, endOfSuffix: int, chs: array[2, char]): bool =
+    ## Three-char variant: the two chars after `prev` must equal `chs`.
+    let width = endOfSuffix - prev
+    width == 3 and L.buf[prev + 1] == chs[0] and L.buf[prev + 2] == chs[1]
+
+  proc cmpSuffix(L: Lexer, prev: int, endOfSuffix: int, chs: array[3, char]): bool =
+    ## Four-char variant: the three chars after `prev` must equal `chs`.
+    (endOfSuffix - prev) == 4 and L.buf[prev + 1] == chs[0] and
+      L.buf[prev + 2] == chs[1] and L.buf[prev + 3] == chs[2]
+
   var
     startpos, endpos: int
     xi: BiggestInt
     isBase10 = true
     numDigits = 0
+    isNegative = false
   const
     # 'c', 'C' is deprecated
     baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'}
     literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''}
     floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit}
+    signedIntTypes = {tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit}
+
   result.tokType = tkIntLit   # int literal until we know better
   result.literal = ""
   result.base = base10
   startpos = L.bufpos
   tokenBegin(result, startpos)
 
+  # check for leading minus sign
+  if L.buf[L.bufpos] == '-':
+    eatChar(L, result, '-')
+    isNegative = true
+
   # First stage: find out base, make verifications, build token literal string
   # {'c', 'C'} is added for deprecation reasons to provide a clear error message
   if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
@@ -377,7 +408,7 @@ proc getNumber(L: var Lexer, result: var Token) =
     else:
       internalError(L.config, getLineInfo(L), "getNumber")
     if numDigits == 0:
-      lexMessageLitNum(L, "invalid number: '$1'", startpos)
+      lexMessageLitNum(L, "invalid number (empty): '$1'", startpos)
   else:
     discard matchUnderscoreChars(L, result, {'0'..'9'})
     if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
@@ -386,82 +417,96 @@ proc getNumber(L: var Lexer, result: var Token) =
       discard matchUnderscoreChars(L, result, {'0'..'9'})
     if L.buf[L.bufpos] in {'e', 'E'}:
       result.tokType = tkFloatLit
-      eatChar(L, result, 'e')
+      eatChar(L, result, L.buf[L.bufpos])
       if L.buf[L.bufpos] in {'+', '-'}:
         eatChar(L, result)
       discard matchUnderscoreChars(L, result, {'0'..'9'})
   endpos = L.bufpos
 
   # Second stage, find out if there's a datatype suffix and handle it
   var postPos = endpos
-  if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
-    if L.buf[postPos] == '\'':
-      inc(postPos)
-
-    case L.buf[postPos]
-    of 'f', 'F':
-      inc(postPos)
-      if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
-        result.tokType = tkFloat32Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
-        result.tokType = tkFloat64Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '1') and
-           (L.buf[postPos + 1] == '2') and
-           (L.buf[postPos + 2] == '8'):
-        result.tokType = tkFloat128Lit
-        inc(postPos, 3)
-      else:   # "f" alone defaults to float32
-        result.tokType = tkFloat32Lit
-    of 'd', 'D':  # ad hoc convenience shortcut for f64
+  let suffixMarker = (L.buf[postPos] == '\'')
+  if suffixMarker:
+    inc(postPos)
+    if (L.buf[postPos] notin SymStartChars):
+      lexMessageLitNum(L, "invalid number suffix: '$1'", startpos)
+
+  # 2A: handle the builtin literal versions
+  var internalSuffix = false
+  var endOfSuffix = postPos
+  while L.buf[endOfSuffix] in SymStartChars + {'0'..'9'}:
+    inc(endOfSuffix)
+  if L.buf[postPos] in {'f', 'F'}:
+    internalSuffix = true  # tentatively found
+    if postPos == endOfSuffix - 1:  # stand-alone 'f'
+      result.tokType = tkFloat32Lit
       inc(postPos)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['3', '2']):
+      result.tokType = tkFloat32Lit
+      inc(postPos, 3)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['6', '4']):
+      result.tokType = tkFloat64Lit
+      inc(postPos, 3)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['1', '2', '8']):
+      result.tokType = tkFloat128Lit
+      inc(postPos, 4)
+    else:
+      internalSuffix = false  # not found after all
+  elif L.buf[postPos] in {'d', 'D'}:
+    if postPos == endOfSuffix - 1:  # stand-alone 'd'
       result.tokType = tkFloat64Lit
-    of 'i', 'I':
       inc(postPos)
-      if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
-        result.tokType = tkInt64Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
-        result.tokType = tkInt32Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
-        result.tokType = tkInt16Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '8'):
-        result.tokType = tkInt8Lit
-        inc(postPos)
-      else:
-        lexMessageLitNum(L, "invalid number: '$1'", startpos)
-    of 'u', 'U':
+      internalSuffix = true
+  elif L.buf[postPos] in {'i', 'I'}:
+    internalSuffix = true  # tentatively found
+    if cmpSuffix(L, postPos, endOfSuffix, ['8']):
+      result.tokType = tkInt8Lit
+      inc(postPos, 2)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['1', '6']):
+      result.tokType = tkInt16Lit
+      inc(postPos, 3)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['3', '2']):
+      result.tokType = tkInt32Lit
+      inc(postPos, 3)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['6', '4']):
+      result.tokType = tkInt64Lit
+      inc(postPos, 3)
+    else:
+      internalSuffix = false # not found after all
+  elif L.buf[postPos] in {'u', 'U'}:
+    internalSuffix = true  # tentatively found
+    if postPos == endOfSuffix - 1:  # stand-alone 'u'
+      result.tokType = tkUIntLit
       inc(postPos)
-      if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
-        result.tokType = tkUInt64Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
-        result.tokType = tkUInt32Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
-        result.tokType = tkUInt16Lit
-        inc(postPos, 2)
-      elif (L.buf[postPos] == '8'):
-        result.tokType = tkUInt8Lit
-        inc(postPos)
-      else:
-        result.tokType = tkUIntLit
+    elif cmpSuffix(L, postPos, endOfSuffix, ['8']):
+      result.tokType = tkUInt8Lit
+      inc(postPos, 2)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['1', '6']):
+      result.tokType = tkUInt16Lit
+      inc(postPos, 3)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['3', '2']):
+      result.tokType = tkUInt32Lit
+      inc(postPos, 3)
+    elif cmpSuffix(L, postPos, endOfSuffix, ['6', '4']):
+      result.tokType = tkUInt64Lit
+      inc(postPos, 3)
     else:
-      lexMessageLitNum(L, "invalid number: '$1'", startpos)
+      internalSuffix = false  # not found after all
 
-  # Is there still a literalish char awaiting? Then it's an error!
-  if  L.buf[postPos] in literalishChars or
-     (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}):
-    lexMessageLitNum(L, "invalid number: '$1'", startpos)
+  # 2B: otherwise an apostrophe suffix that is not builtin marks a user-defined literal
+  if suffixMarker and not internalSuffix:
+    result.tokType = tkStrNumLit
+    L.bufpos = endpos # do NOT trim off the suffix
+    return
 
-  # Third stage, extract actual number
+  # Third stage, extract actual number as a fitting literal
   L.bufpos = startpos            # restore position
   var pos: int = startpos
+  if L.buf[pos] == '-':
+    inc(pos)
   try:
     if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
+      # place the non-base10 number into result.iNumber or result.fNumber
       inc(pos, 2)
       xi = 0                  # it is a base prefix
 
@@ -527,12 +572,22 @@ proc getNumber(L: var Lexer, result: var Token) =
         of tkInt16Lit: (xi > BiggestInt(uint16.high))
         of tkInt32Lit: (xi > BiggestInt(uint32.high))
         else: false
-
         if outOfRange:
           #echo "out of range num: ", result.iNumber, " vs ", xi
           lexMessageLitNum(L, "number out of range: '$1'", startpos)
-
+
+      # make negative when a sign starts the literal
+      if isNegative:
+        case result.tokType:
+        of floatTypes:
+          result.fNumber = -result.fNumber
+        of signedIntTypes: 
+          result.iNumber = -result.iNumber
+        else:
+          lexMessageLitNum(L, "cannot assign a negative value to an unsigned type: '$1'", startpos)
     else:
+      # place the base10 number into result.iNumber or result.fNumber
+      # the parsing routines already handle the isNegative case
       case result.tokType
       of floatTypes:
         result.fNumber = parseFloat(result.literal)
@@ -571,6 +626,7 @@ proc getNumber(L: var Lexer, result: var Token) =
 
       if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos)
 
+
     # Promote int literal to int64? Not always necessary, but more consistent
     if result.tokType == tkIntLit:
       if result.iNumber > high(int32):
@@ -733,9 +789,8 @@ proc getEscapedChar(L: var Lexer, tok: var Token) =
 
 proc handleCRLF(L: var Lexer, pos: int): int =
   template registerLine =
-    let col = L.getColNumber(pos)
-
     when not defined(nimpretty):
+      let col = L.getColNumber(pos)
       if col > MaxLineLength:
         lexMessagePos(L, hintLineTooLong, pos)
 
@@ -859,6 +914,12 @@ proc getSymbol(L: var Lexer, tok: var Token) =
         break
       inc(pos)
       suspicious = true
+    of '\'':
+      if pos==L.bufpos:  # leading single quote only allowed at start
+        h = h !& ord(c)
+        inc(pos)
+      else:
+        break
     else: break
   tokenEnd(tok, pos-1)
   h = !$h
@@ -1072,6 +1133,7 @@ proc scanComment(L: var Lexer, tok: var Token) =
     tok.commentOffsetB = L.offsetBase + pos - 1
 
 proc skip(L: var Lexer, tok: var Token) =
+  # advance the lexer past whitespace and comments while accounting for indents
   var pos = L.bufpos
   tokenBegin(tok, pos)
   tok.strongSpaceA = 0
@@ -1150,6 +1212,7 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
       if tok.tokType notin weakTokens:
         L.previousToken.line = tok.line.uint16
         L.previousToken.col = tok.col.int16
+    L.previousTokType = tok.tokType
 
   fillToken(tok)
   if L.indentAhead >= 0:
@@ -1270,14 +1333,26 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
         # tkTripleStrLit -> tkGTripleStrLit
         inc(tok.tokType, 2)
     of '\'':
-      tok.tokType = tkCharLit
-      getCharacter(L, tok)
-      tok.tokType = tkCharLit
+      if (L.previousTokType == tkStrNumLit) and (tok.strongSpaceA == 0):
+        # a quote right after a tkStrNumLit, with no whitespace in between, starts its numeric suffix
+        getSymbol(L, tok) # example: -12'big
+      elif L.previousTokType == tkAccent:
+        getSymbol(L, tok) # example: `'big`
+      else:
+        tok.tokType = tkCharLit
+        getCharacter(L, tok)
+        tok.tokType = tkCharLit
     of '0'..'9':
       getNumber(L, tok)
-      let c = L.buf[L.bufpos]
-      if c in SymChars+{'_'}:
+      if L.buf[L.bufpos] in SymChars+{'_'}:
         lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
+    of '-':
+      if ((tok.strongSpaceA > 0) or (L.previousTokType in negationPrefixes)) and (L.buf[L.bufpos + 1] in '0'..'9'):
+        getNumber(L, tok)
+        if L.buf[L.bufpos] in SymChars+{'_'}:
+          lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
+      else:
+        getOperator(L, tok)
     else:
       if c in OpChars:
         getOperator(L, tok)