diff --git a/changelog.md b/changelog.md index 78817cf85f..c0cd970247 100644 --- a/changelog.md +++ b/changelog.md @@ -212,7 +212,7 @@ - `std/options` changed `$some(3)` to `"some(3)"` instead of `"Some(3)"` and `$none(int)` to `"none(int)"` instead of `"None[int]"`. - + - Added `algorithm.merge`. @@ -263,6 +263,8 @@ - `typedesc[Foo]` now renders as such instead of `type Foo` in compiler messages. +- The unary minus in `-1` is now part of the integer literal; it is now parsed as a single token. + This implies that edge cases like `-128'i8` finally work correctly. ## Compiler changes diff --git a/compiler/lexer.nim b/compiler/lexer.nim index 729ba34352..bcd3f00767 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -26,6 +26,7 @@ const SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.', '|', '=', '%', '&', '$', '@', '~', ':'} + UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'} # don't forget to update the 'highlite' module if these charsets should change @@ -51,22 +52,22 @@ type tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor", tkYield = "yield", # end of keywords - tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit", + tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit", tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit", - tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit", + tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit", tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit", tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit", tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit", tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit", - tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit", - + tkGStrLit = "tkGStrLit", tkGTripleStrLit = 
"tkGTripleStrLit", tkCharLit = "tkCharLit", + tkParLe = "(", tkParRi = ")", tkBracketLe = "[", tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}", tkBracketDotLe = "[.", tkBracketDotRi = ".]", tkCurlyDotLe = "{.", tkCurlyDotRi = ".}", tkParDotLe = "(.", tkParDotRi = ".)", tkComma = ",", tkSemiColon = ";", - tkColon = ":", tkColonColon = "::", tkEquals = "=", + tkColon = ":", tkColonColon = "::", tkEquals = "=", tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:", tkOpr, tkComment, tkAccent = "`", # these are fake tokens used by renderer.nim @@ -348,6 +349,14 @@ proc getNumber(L: var Lexer, result: var Token) = startpos = L.bufpos tokenBegin(result, startpos) + var isPositive = true + if L.buf[L.bufpos] == '-': + eatChar(L, result) + isPositive = false # literal carries a leading minus + + template setNumber(field, value) = + field = (if isPositive: value else: -value) + # First stage: find out base, make verifications, build token literal string # {'c', 'C'} is added for deprecation reasons to provide a clear error message if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}: @@ -459,7 +468,7 @@ proc getNumber(L: var Lexer, result: var Token) = # Third stage, extract actual number L.bufpos = startpos # restore position - var pos: int = startpos + var pos = startpos try: if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars): inc(pos, 2) @@ -500,20 +509,20 @@ proc getNumber(L: var Lexer, result: var Token) = internalError(L.config, getLineInfo(L), "getNumber") case result.tokType - of tkIntLit, tkInt64Lit: result.iNumber = xi - of tkInt8Lit: result.iNumber = ashr(xi shl 56, 56) - of tkInt16Lit: result.iNumber = ashr(xi shl 48, 48) - of tkInt32Lit: result.iNumber = ashr(xi shl 32, 32) - of tkUIntLit, tkUInt64Lit: result.iNumber = xi - of tkUInt8Lit: result.iNumber = xi and 0xff - of tkUInt16Lit: result.iNumber = xi and 0xffff - of tkUInt32Lit: result.iNumber = xi and 0xffffffff + of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi + of tkInt8Lit: setNumber 
result.iNumber, ashr(xi shl 56, 56) + of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48) + of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32) + of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi + of tkUInt8Lit: setNumber result.iNumber, xi and 0xff + of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff + of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff of tkFloat32Lit: - result.fNumber = (cast[PFloat32](addr(xi)))[] + setNumber result.fNumber, (cast[PFloat32](addr(xi)))[] # note: this code is endian neutral! # XXX: Test this on big endian machine! of tkFloat64Lit, tkFloatLit: - result.fNumber = (cast[PFloat64](addr(xi)))[] + setNumber result.fNumber, (cast[PFloat64](addr(xi)))[] else: internalError(L.config, getLineInfo(L), "getNumber") # Bounds checks. Non decimal literals are allowed to overflow the range of @@ -521,12 +530,13 @@ proc getNumber(L: var Lexer, result: var Token) = # below checks of signed sizes against uint*.high is deliberate: # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK) if result.tokType notin floatTypes: - let outOfRange = case result.tokType: - of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi - of tkInt8Lit: (xi > BiggestInt(uint8.high)) - of tkInt16Lit: (xi > BiggestInt(uint16.high)) - of tkInt32Lit: (xi > BiggestInt(uint32.high)) - else: false + let outOfRange = + case result.tokType + of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi + of tkInt8Lit: (xi > BiggestInt(uint8.high)) + of tkInt16Lit: (xi > BiggestInt(uint16.high)) + of tkInt32Lit: (xi > BiggestInt(uint32.high)) + else: false if outOfRange: #echo "out of range num: ", result.iNumber, " vs ", xi @@ -557,23 +567,23 @@ proc getNumber(L: var Lexer, result: var Token) = raise newException(ValueError, "invalid integer: " & $result.literal) result.iNumber = iNumber - # Explicit bounds checks. Only T.high needs to be considered - # since result.iNumber can't be negative. + # Explicit bounds checks. 
let outOfRange = case result.tokType - of tkInt8Lit: result.iNumber > int8.high - of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) - of tkInt16Lit: result.iNumber > int16.high - of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) - of tkInt32Lit: result.iNumber > int32.high - of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) + of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low + of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0 + of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low + of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0 + of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low + of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0 else: false - if outOfRange: lexMessageLitNum(L, "number out of range: '$1'", startpos) + if outOfRange: + lexMessageLitNum(L, "number out of range: '$1'", startpos) # Promote int literal to int64? Not always necessary, but more consistent if result.tokType == tkIntLit: - if result.iNumber > high(int32): + if result.iNumber > high(int32) or result.iNumber < low(int32): result.tokType = tkInt64Lit except ValueError: @@ -1278,6 +1288,19 @@ proc rawGetTok*(L: var Lexer, tok: var Token) = let c = L.buf[L.bufpos] if c in SymChars+{'_'}: lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") + of '-': + if L.buf[L.bufpos+1] in {'0'..'9'} and + (L.bufpos == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist): + # x)-23 # binary minus + # ,-23 # unary minus + # \n-78 # unary minus? Yes. 
+ # =-3 # parsed as `=-` anyway + getNumber(L, tok) + let c = L.buf[L.bufpos] + if c in SymChars+{'_'}: + lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier") + else: + getOperator(L, tok) else: if c in OpChars: getOperator(L, tok) diff --git a/compiler/semtypes.nim b/compiler/semtypes.nim index dcab9a8842..0fce7b4171 100644 --- a/compiler/semtypes.nim +++ b/compiler/semtypes.nim @@ -299,7 +299,7 @@ proc semArrayIndex(c: PContext, n: PNode): PType = result = makeRangeWithStaticExpr(c, e.typ.n) elif e.kind in {nkIntLit..nkUInt64Lit}: if e.intVal < 0: - localError(c.config, n[1].info, + localError(c.config, n.info, "Array length can't be negative, but was " & $e.intVal) result = makeRangeType(c, 0, e.intVal-1, n.info, e.typ) elif e.kind == nkSym and e.typ.kind == tyStatic: diff --git a/doc/manual.rst b/doc/manual.rst index 1bb47f28b7..db12516304 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -499,10 +499,11 @@ Numerical constants are of a single type and have the form:: hexdigit = digit | 'A'..'F' | 'a'..'f' octdigit = '0'..'7' bindigit = '0'..'1' - HEX_LIT = '0' ('x' | 'X' ) hexdigit ( ['_'] hexdigit )* - DEC_LIT = digit ( ['_'] digit )* - OCT_LIT = '0' 'o' octdigit ( ['_'] octdigit )* - BIN_LIT = '0' ('b' | 'B' ) bindigit ( ['_'] bindigit )* + unary_minus = '-' # See the section about unary minus + HEX_LIT = unary_minus? '0' ('x' | 'X' ) hexdigit ( ['_'] hexdigit )* + DEC_LIT = unary_minus? digit ( ['_'] digit )* + OCT_LIT = unary_minus? '0' 'o' octdigit ( ['_'] octdigit )* + BIN_LIT = unary_minus? '0' ('b' | 'B' ) bindigit ( ['_'] bindigit )* INT_LIT = HEX_LIT | DEC_LIT @@ -521,7 +522,7 @@ Numerical constants are of a single type and have the form:: UINT64_LIT = INT_LIT ['\''] ('u' | 'U') '64' exponent = ('e' | 'E' ) ['+' | '-'] digit ( ['_'] digit )* - FLOAT_LIT = digit (['_'] digit)* (('.' digit (['_'] digit)* [exponent]) |exponent) + FLOAT_LIT = unary_minus? digit (['_'] digit)* (('.' 
digit (['_'] digit)* [exponent]) |exponent) FLOAT32_SUFFIX = ('f' | 'F') ['32'] FLOAT32_LIT = HEX_LIT '\'' FLOAT32_SUFFIX | (FLOAT_LIT | DEC_LIT | OCT_LIT | BIN_LIT) ['\''] FLOAT32_SUFFIX @@ -535,6 +536,38 @@ for readability. Integer and floating-point literals may be given in decimal (no prefix), binary (prefix `0b`), octal (prefix `0o`), and hexadecimal (prefix `0x`) notation. +The fact that the unary minus `-` in a number literal like `-1` is considered +to be part of the literal is a late addition to the language. The rationale is that +an expression `-128'i8` should be valid and without this special case, this would +be impossible -- `128` is not a valid `int8` value, only `-128` is. + +For the `unary_minus` rule there are further restrictions that are not covered +in the formal grammar. For `-` to be part of the number literal its immediately +preceding character has to be in the +set `{' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'}`. This set was designed to +cover most cases in a natural manner. + +In the following examples, `-1` is a single token: + +.. code-block:: nim + + echo -1 + echo(-1) + echo [-1] + echo 3,-1 + + "abc";-1 + +In the following examples, `-1` is parsed as two separate tokens (as `- 1`): + +.. code-block:: nim + + echo x-1 + echo (int)-1 + echo [a]-1 + "abc"-1 + + There exists a literal for each numerical type that is defined. The suffix starting with an apostrophe ('\'') is called a `type suffix`:idx:. 
Literals without a type suffix are of an integer type diff --git a/tests/lexer/tunary_minus.nim b/tests/lexer/tunary_minus.nim new file mode 100644 index 0000000000..89f1b79ef7 --- /dev/null +++ b/tests/lexer/tunary_minus.nim @@ -0,0 +1,76 @@ +discard """ + targets: "c cpp js" +""" + +# Test numeric literals and handling of minus symbol + +import std/[macros, strutils] + +macro lispReprStr*(a: untyped): untyped = newLit(a.lispRepr) + +macro assertAST*(expected: string, struct: untyped): untyped = + var ast = newLit(struct.treeRepr) + result = quote do: + if `ast` != `expected`: + doAssert false, "\nGot:\n" & `ast`.indent(2) & "\nExpected:\n" & `expected`.indent(2) + +const one = 1 +const minusOne = `-`(one) + +# border cases that *should* generate compiler errors: +assertAST dedent """ + StmtList + Asgn + Ident "x" + Command + IntLit 4 + IntLit -1""": + x = 4 -1 +assertAST dedent """ + StmtList + VarSection + IdentDefs + Ident "x" + Ident "uint" + IntLit -1""": + var x: uint = -1 +template bad() = + x = 4 -1 +doAssert not compiles(bad()) + +template main = + block: # check when a minus (-) is a negative sign for a literal + doAssert -1 == minusOne: + "unable to parse a spaced-prefixed negative int" + doAssert lispReprStr(-1) == """(IntLit -1)""" + doAssert -1.0'f64 == minusOne.float64 + doAssert lispReprStr(-1.000'f64) == """(Float64Lit -1.0)""" + doAssert lispReprStr( -1.000'f64) == """(Float64Lit -1.0)""" + doAssert [-1].contains(minusOne): + "unable to handle negatives after square bracket" + doAssert lispReprStr([-1]) == """(Bracket (IntLit -1))""" + doAssert (-1, 2)[0] == minusOne: + "unable to handle negatives after parenthesis" + doAssert lispReprStr((-1, 2)) == """(Par (IntLit -1) (IntLit 2))""" + proc x(): int = + var a = 1;-1 # the -1 should act as the return value + doAssert x() == minusOne: + "unable to handle negatives after semi-colon" + + block: # check when a minus (-) is an unary op + doAssert -one == minusOne: + "unable to a negative prior to 
identifier" + + block: # check when a minus (-) is a subtraction op + doAssert 4-1 == 3: + "unable to handle subtraction sans surrounding spaces with a numeric literal" + doAssert 4-one == 3: + "unable to handle subtraction sans surrounding spaces with an identifier" + doAssert 4 - 1 == 3: + "unable to handle subtraction with surrounding spaces with a numeric literal" + doAssert 4 - one == 3: + "unable to handle subtraction with surrounding spaces with an identifier" + + +static: main() +main()