make unary minus part of number literals, refs #17020 (#17488)

* make unary minus part of number literals, refs #17020 * fixes regression
2026-02-12 06:18:51 +00:00 · 2021-03-24 09:47:11 +01:00
parent 465a41c308
commit 86af2f7b50
5 changed files with 173 additions and 39 deletions
--- a/changelog.md
+++ b/changelog.md
@@ -212,7 +212,7 @@

 - `std/options` changed `$some(3)` to `"some(3)"` instead of `"Some(3)"`
  and `$none(int)` to `"none(int)"` instead of `"None[int]"`.
-  
+
 - Added `algorithm.merge`.


@@ -263,6 +263,8 @@

 - `typedesc[Foo]` now renders as such instead of `type Foo` in compiler messages.

+- The unary minus in `-1` is now part of the integer literal; `-1` is parsed as a single token.
+  This means that edge cases like `-128'i8` finally work correctly.


 ## Compiler changes
--- a/compiler/lexer.nim
+++ b/compiler/lexer.nim
@@ -26,6 +26,7 @@ const
  SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'}
  OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
    '|', '=', '%', '&', '$', '@', '~', ':'}
+  UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'}

 # don't forget to update the 'highlite' module if these charsets should change

@@ -51,22 +52,22 @@ type
    tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor",
    tkYield = "yield", # end of keywords

-    tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit", 
+    tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit",
    tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit",
-    tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit", 
+    tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit",
    tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit",
    tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
    tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
    tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
-    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit", 
-    
+    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
+
    tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
    tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
    tkBracketDotLe = "[.", tkBracketDotRi = ".]",
    tkCurlyDotLe = "{.", tkCurlyDotRi = ".}",
    tkParDotLe = "(.", tkParDotRi = ".)",
    tkComma = ",", tkSemiColon = ";",
-    tkColon = ":", tkColonColon = "::", tkEquals = "=", 
+    tkColon = ":", tkColonColon = "::", tkEquals = "=",
    tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:",
    tkOpr, tkComment, tkAccent = "`",
    # these are fake tokens used by renderer.nim
@@ -348,6 +349,14 @@ proc getNumber(L: var Lexer, result: var Token) =
  startpos = L.bufpos
  tokenBegin(result, startpos)

+  var isPositive = true  # cleared below when the literal starts with '-'
+  if L.buf[L.bufpos] == '-':
+    eatChar(L, result)
+    isPositive = false  # sign consumed: setNumber must negate the parsed magnitude
+
+  template setNumber(field, value) =
+    field = (if isPositive: value else: -value)
+
  # First stage: find out base, make verifications, build token literal string
  # {'c', 'C'} is added for deprecation reasons to provide a clear error message
  if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
@@ -459,7 +468,7 @@ proc getNumber(L: var Lexer, result: var Token) =

  # Third stage, extract actual number
  L.bufpos = startpos            # restore position
-  var pos: int = startpos
+  var pos = startpos
  try:
    if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
      inc(pos, 2)
@@ -500,20 +509,20 @@ proc getNumber(L: var Lexer, result: var Token) =
        internalError(L.config, getLineInfo(L), "getNumber")

      case result.tokType
-      of tkIntLit, tkInt64Lit: result.iNumber = xi
-      of tkInt8Lit: result.iNumber = ashr(xi shl 56, 56)
-      of tkInt16Lit: result.iNumber = ashr(xi shl 48, 48)
-      of tkInt32Lit: result.iNumber = ashr(xi shl 32, 32)
-      of tkUIntLit, tkUInt64Lit: result.iNumber = xi
-      of tkUInt8Lit: result.iNumber = xi and 0xff
-      of tkUInt16Lit: result.iNumber = xi and 0xffff
-      of tkUInt32Lit: result.iNumber = xi and 0xffffffff
+      of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi
+      of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56)
+      of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48)
+      of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32)
+      of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi
+      of tkUInt8Lit: setNumber result.iNumber, xi and 0xff
+      of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff
+      of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff
      of tkFloat32Lit:
-        result.fNumber = (cast[PFloat32](addr(xi)))[]
+        setNumber result.fNumber, (cast[PFloat32](addr(xi)))[]
        # note: this code is endian neutral!
        # XXX: Test this on big endian machine!
      of tkFloat64Lit, tkFloatLit:
-        result.fNumber = (cast[PFloat64](addr(xi)))[]
+        setNumber result.fNumber, (cast[PFloat64](addr(xi)))[]
      else: internalError(L.config, getLineInfo(L), "getNumber")

      # Bounds checks. Non decimal literals are allowed to overflow the range of
@@ -521,12 +530,13 @@ proc getNumber(L: var Lexer, result: var Token) =
      # below checks of signed sizes against uint*.high is deliberate:
      # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK)
      if result.tokType notin floatTypes:
-        let outOfRange = case result.tokType:
-        of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi
-        of tkInt8Lit: (xi > BiggestInt(uint8.high))
-        of tkInt16Lit: (xi > BiggestInt(uint16.high))
-        of tkInt32Lit: (xi > BiggestInt(uint32.high))
-        else: false
+        let outOfRange =
+          case result.tokType
+          of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi
+          of tkInt8Lit:  (xi > BiggestInt(uint8.high))
+          of tkInt16Lit: (xi > BiggestInt(uint16.high))
+          of tkInt32Lit: (xi > BiggestInt(uint32.high))
+          else: false

        if outOfRange:
          #echo "out of range num: ", result.iNumber, " vs ", xi
@@ -557,23 +567,23 @@ proc getNumber(L: var Lexer, result: var Token) =
          raise newException(ValueError, "invalid integer: " & $result.literal)
        result.iNumber = iNumber

-      # Explicit bounds checks. Only T.high needs to be considered
-      # since result.iNumber can't be negative.
+      # Explicit bounds checks.
      let outOfRange =
        case result.tokType
-        of tkInt8Lit: result.iNumber > int8.high
-        of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high)
-        of tkInt16Lit: result.iNumber > int16.high
-        of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high)
-        of tkInt32Lit: result.iNumber > int32.high
-        of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high)
+        of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low
+        of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0
+        of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low
+        of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0
+        of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low
+        of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0
        else: false

-      if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos)
+      if outOfRange:
+        lexMessageLitNum(L, "number out of range: '$1'", startpos)

    # Promote int literal to int64? Not always necessary, but more consistent
    if result.tokType == tkIntLit:
-      if result.iNumber > high(int32):
+      if result.iNumber > high(int32) or result.iNumber < low(int32):
        result.tokType = tkInt64Lit

  except ValueError:
@@ -1278,6 +1288,19 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
      let c = L.buf[L.bufpos]
      if c in SymChars+{'_'}:
        lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
+    of '-':
+      if L.buf[L.bufpos+1] in {'0'..'9'} and
+          (L.bufpos == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist): # buffer start: no previous char to inspect
+        # x)-23 # binary minus
+        # ,-23  # unary minus
+        # \n-78 # unary minus? Yes.
+        # =-3   # parsed as `=-` anyway
+        getNumber(L, tok)
+        let c = L.buf[L.bufpos]
+        if c in SymChars+{'_'}:
+          lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
+      else:
+        getOperator(L, tok)
    else:
      if c in OpChars:
        getOperator(L, tok)
--- a/compiler/semtypes.nim
+++ b/compiler/semtypes.nim
@@ -299,7 +299,7 @@ proc semArrayIndex(c: PContext, n: PNode): PType =
      result = makeRangeWithStaticExpr(c, e.typ.n)
    elif e.kind in {nkIntLit..nkUInt64Lit}:
      if e.intVal < 0:
-        localError(c.config, n[1].info,
+        localError(c.config, n.info,
          "Array length can't be negative, but was " & $e.intVal)
      result = makeRangeType(c, 0, e.intVal-1, n.info, e.typ)
    elif e.kind == nkSym and e.typ.kind == tyStatic:
--- a/doc/manual.rst
+++ b/doc/manual.rst
@@ -499,10 +499,11 @@ Numerical constants are of a single type and have the form::
  hexdigit = digit | 'A'..'F' | 'a'..'f'
  octdigit = '0'..'7'
  bindigit = '0'..'1'
-  HEX_LIT = '0' ('x' | 'X' ) hexdigit ( ['_'] hexdigit )*
-  DEC_LIT = digit ( ['_'] digit )*
-  OCT_LIT = '0' 'o' octdigit ( ['_'] octdigit )*
-  BIN_LIT = '0' ('b' | 'B' ) bindigit ( ['_'] bindigit )*
+  unary_minus = '-' # See the section about unary minus
+  HEX_LIT = unary_minus? '0' ('x' | 'X' ) hexdigit ( ['_'] hexdigit )*
+  DEC_LIT = unary_minus? digit ( ['_'] digit )*
+  OCT_LIT = unary_minus? '0' 'o' octdigit ( ['_'] octdigit )*
+  BIN_LIT = unary_minus? '0' ('b' | 'B' ) bindigit ( ['_'] bindigit )*

  INT_LIT = HEX_LIT
          | DEC_LIT
@@ -521,7 +522,7 @@ Numerical constants are of a single type and have the form::
  UINT64_LIT = INT_LIT ['\''] ('u' | 'U') '64'

  exponent = ('e' | 'E' ) ['+' | '-'] digit ( ['_'] digit )*
-  FLOAT_LIT = digit (['_'] digit)* (('.' digit (['_'] digit)* [exponent]) |exponent)
+  FLOAT_LIT = unary_minus? digit (['_'] digit)* (('.' digit (['_'] digit)* [exponent]) |exponent)
  FLOAT32_SUFFIX = ('f' | 'F') ['32']
  FLOAT32_LIT = HEX_LIT '\'' FLOAT32_SUFFIX
              | (FLOAT_LIT | DEC_LIT | OCT_LIT | BIN_LIT) ['\''] FLOAT32_SUFFIX
@@ -535,6 +536,38 @@ for readability. Integer and floating-point literals may be given in decimal (no
 prefix), binary (prefix `0b`), octal (prefix `0o`), and hexadecimal
 (prefix `0x`) notation.

+The fact that the unary minus `-` in a number literal like `-1` is considered
+to be part of the literal is a late addition to the language. The rationale is that
+an expression `-128'i8` should be valid and without this special case, this would
+be impossible -- `128` is not a valid `int8` value, only `-128` is.
+
+For the `unary_minus` rule there are further restrictions that are not covered
+in the formal grammar. For `-` to be part of the number literal its immediately
+preceding character has to be in the
+set `{' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'}`. This set was designed to
+cover most cases in a natural manner.
+
+In the following examples, `-1` is a single token:
+
+.. code-block:: nim
+
+  echo -1
+  echo(-1)
+  echo [-1]
+  echo 3,-1
+
+  "abc";-1
+
+In the following examples, `-1` is parsed as two separate tokens (as `- 1`):
+
+.. code-block:: nim
+
+  echo x-1
+  echo (int)-1
+  echo [a]-1
+  "abc"-1
+
+
 There exists a literal for each numerical type that is
 defined. The suffix starting with an apostrophe ('\'') is called a
 `type suffix`:idx:. Literals without a type suffix are of an integer type
--- a/tests/lexer/tunary_minus.nim
+++ b/tests/lexer/tunary_minus.nim
@@ -0,0 +1,76 @@
+discard """
+  targets: "c cpp js"
+"""
+
+# Test numeric literals and handling of minus symbol
+
+import std/[macros, strutils]
+
+macro lispReprStr*(a: untyped): untyped = newLit(a.lispRepr)
+
+macro assertAST*(expected: string, struct: untyped): untyped =
+  var ast = newLit(struct.treeRepr)
+  result = quote do:
+    if `ast` != `expected`:
+      doAssert false, "\nGot:\n" & `ast`.indent(2) & "\nExpected:\n" & `expected`.indent(2)
+
+const one = 1
+const minusOne = `-`(one)
+
+# border cases that *should* generate compiler errors:
+assertAST dedent """
+  StmtList
+    Asgn
+      Ident "x"
+      Command
+        IntLit 4
+        IntLit -1""":
+  x = 4 -1
+assertAST dedent """
+  StmtList
+    VarSection
+      IdentDefs
+        Ident "x"
+        Ident "uint"
+        IntLit -1""":
+  var x: uint = -1
+template bad() =
+  x = 4 -1
+doAssert not compiles(bad())
+
+template main =
+  block: # check when a minus (-) is a negative sign for a literal
+    doAssert -1 == minusOne:
+      "unable to parse a spaced-prefixed negative int"
+    doAssert lispReprStr(-1) == """(IntLit -1)"""
+    doAssert -1.0'f64 == minusOne.float64
+    doAssert lispReprStr(-1.000'f64) == """(Float64Lit -1.0)"""
+    doAssert lispReprStr( -1.000'f64) == """(Float64Lit -1.0)"""
+    doAssert [-1].contains(minusOne):
+      "unable to handle negatives after square bracket"
+    doAssert lispReprStr([-1]) == """(Bracket (IntLit -1))"""
+    doAssert (-1, 2)[0] == minusOne:
+      "unable to handle negatives after parenthesis"
+    doAssert lispReprStr((-1, 2)) == """(Par (IntLit -1) (IntLit 2))"""
+    proc x(): int =
+      var a = 1;-1  # the -1 should act as the return value
+    doAssert x() == minusOne:
+      "unable to handle negatives after semi-colon"
+
+  block: # check when a minus (-) is a unary op
+    doAssert -one == minusOne:
+      "unable to handle a negative prior to identifier"
+
+  block: # check when a minus (-) is a subtraction op
+    doAssert 4-1 == 3:
+      "unable to handle subtraction sans surrounding spaces with a numeric literal"
+    doAssert 4-one == 3:
+      "unable to handle subtraction sans surrounding spaces with an identifier"
+    doAssert 4 - 1 == 3:
+      "unable to handle subtraction with surrounding spaces with a numeric literal"
+    doAssert 4 - one == 3:
+      "unable to handle subtraction with surrounding spaces with an identifier"
+
+
+static: main()
+main()