Merge pull request #2848 from ozra/bugfix-2523-number-literal-lexing

Bugfix #2523 number literal lexing
This commit is contained in:
Andreas Rumpf
2015-06-04 11:40:00 +02:00
2 changed files with 164 additions and 120 deletions

View File

@@ -231,23 +231,6 @@ proc lexMessagePos(L: var TLexer, msg: TMsgKind, pos: int, arg = "") =
var info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart)
L.dispMessage(info, msg, arg)
proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: set[char]) =
  ## Copies a run of characters belonging to `chars` from the lexer buffer
  ## into `tok.literal`, starting at `L.bufpos`.
  ##
  ## A '_' directly after an accepted character is copied too, provided the
  ## character following the underscore is again in `chars`. Otherwise
  ## `errInvalidToken` is reported for "_" and scanning stops with the
  ## underscore left unconsumed. `L.bufpos` is written back only at the end.
  var cursor = L.bufpos
  var buf = L.buf
  while buf[cursor] in chars:
    add(tok.literal, buf[cursor])
    inc(cursor)
    if buf[cursor] == '_':
      if buf[cursor+1] in chars:
        add(tok.literal, '_')
        inc(cursor)
      else:
        # a separator must be followed by another accepted character
        lexMessage(L, errInvalidToken, "_")
        break
  L.bufpos = cursor
proc matchTwoChars(L: TLexer, first: char, second: set[char]): bool =
  ## Returns true when the character at `L.bufpos` equals `first` and the
  ## character immediately after it is a member of `second`.
  if L.buf[L.bufpos] == first:
    result = L.buf[L.bufpos + 1] in second
  else:
    result = false
@@ -270,136 +253,195 @@ proc unsafeParseUInt(s: string, b: var BiggestInt, start = 0): int =
result = i - start
{.pop.} # overflowChecks
template eatChar(L: var TLexer, t: var TToken, replacementChar: char) =
  ## Advances the lexer by one buffer position and appends `replacementChar`
  ## (not the character actually in the buffer) to `t.literal`.
  t.literal.add(replacementChar)
  inc L.bufpos
template eatChar(L: var TLexer, t: var TToken) =
  ## Appends the character at the current buffer position to `t.literal`
  ## and advances the lexer by one position.
  t.literal.add(L.buf[L.bufpos])
  inc L.bufpos
proc getNumber(L: var TLexer): TToken =
var
pos, endpos: int
startpos, endpos: int
xi: BiggestInt
# get the base:
const literalishChars = { 'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'c',
'C', 'b', 'B', '_', '.', '\''}
const literalishCharsNoDot = literalishChars - {'.'}
proc matchUnderscoreChars(L: var TLexer, tok: var TToken, chars: set[char]) =
  ## Copies a run of characters belonging to `chars` from the lexer buffer
  ## into `tok.literal`, starting at `L.bufpos`.
  ##
  ## A '_' directly after an accepted character is copied too, provided the
  ## character following the underscore is again in `chars`. Otherwise
  ## `errInvalidToken` is reported for "_" and scanning stops with the
  ## underscore left unconsumed. `L.bufpos` is written back only at the end.
  var cursor = L.bufpos
  var buf = L.buf
  while buf[cursor] in chars:
    add(tok.literal, buf[cursor])
    inc(cursor)
    if buf[cursor] == '_':
      if buf[cursor+1] in chars:
        add(tok.literal, '_')
        inc(cursor)
      else:
        # a separator must be followed by another accepted character
        lexMessage(L, errInvalidToken, "_")
        break
  L.bufpos = cursor
proc matchChars(L: var TLexer, tok: var TToken, chars: set[char]) =
  ## Appends to `tok.literal` every consecutive buffer character, starting
  ## at `L.bufpos`, that is a member of `chars`; leaves `L.bufpos` on the
  ## first character that is not.
  var cursor = L.bufpos
  var buf = L.buf
  while true:
    let ch = buf[cursor]
    if ch notin chars: break
    tok.literal.add(ch)
    inc(cursor)
  L.bufpos = cursor
proc lexMessageLitNum(L: var TLexer, msg: TMsgKind, startpos: int) =
  ## Reports `msg` with the complete text of the offending numeric literal
  ## as its argument, to get slightly human friendlier error messages.
  ##
  ## The literal is re-scanned from `startpos` using a deliberately generous
  ## character set, so that invalid characters still show up in the message.
  ## `L.bufpos` is used as the scanning cursor (because `matchChars` works on
  ## it) and is restored to its value on entry before the message is emitted.
  # Note: the erroneous 'O' char in the character set is intentional
  const literalishChars = {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'O',
    'c', 'C', 'b', 'B', '_', '.', '\'', 'd', 'i', 'u'}
  let msgPos = L.bufpos
  var t: TToken
  t.literal = ""
  L.bufpos = startpos # Use L.bufpos as pos because of matchChars
  matchChars(L, t, literalishChars)
  # We must verify +/- specifically so that we're not past the literal
  if L.buf[L.bufpos] in {'+', '-'} and
      L.buf[L.bufpos - 1] in {'e', 'E'}:
    add(t.literal, L.buf[L.bufpos])
    inc(L.bufpos)
    matchChars(L, t, literalishChars)
  if L.buf[L.bufpos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
    # Append the suffix character itself *before* stepping past it.
    # Advancing first would drop the suffix and record the following
    # character instead, which the digit scan below would then append a
    # second time (e.g. "1I9" would be reported as "199").
    add(t.literal, L.buf[L.bufpos])
    inc(L.bufpos)
    matchChars(L, t, {'0'..'9'})
  L.bufpos = msgPos
  lexMessage(L, msg, t.literal)
result.tokType = tkIntLit # int literal until we know better
result.literal = ""
result.base = base10 # BUGFIX
pos = L.bufpos # make sure the literal is correct for error messages:
var eallowed = false
if L.buf[pos] == '0' and L.buf[pos+1] in {'X', 'x'}:
matchUnderscoreChars(L, result, {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x'})
result.base = base10
startpos = L.bufpos
var isAFloatLiteral = false
# First stage: find out base, make verifications, build token literal string
if L.buf[L.bufpos] == '0' and
L.buf[L.bufpos + 1] in {'X', 'x', 'o', 'O', 'c', 'C', 'b', 'B'}:
eatChar(L, result, '0')
case L.buf[L.bufpos]
of 'O':
lexMessageLitNum(L, errInvalidNumberOctalCode, startpos)
of 'x', 'X':
eatChar(L, result, 'x')
matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'})
of 'o', 'c', 'C':
eatChar(L, result, 'c')
matchUnderscoreChars(L, result, {'0'..'7'})
of 'b', 'B':
eatChar(L, result, 'b')
matchUnderscoreChars(L, result, {'0'..'1'})
else:
internalError(getLineInfo(L), "getNumber")
else:
matchUnderscoreChars(L, result, {'0'..'9', 'b', 'B', 'o', 'c', 'C'})
eallowed = true
if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
add(result.literal, '.')
inc(L.bufpos)
matchUnderscoreChars(L, result, {'0'..'9'})
eallowed = true
if eallowed and L.buf[L.bufpos] in {'e', 'E'}:
add(result.literal, 'e')
inc(L.bufpos)
if L.buf[L.bufpos] in {'+', '-'}:
add(result.literal, L.buf[L.bufpos])
inc(L.bufpos)
matchUnderscoreChars(L, result, {'0'..'9'})
if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
isAFloatLiteral = true
eatChar(L, result, '.')
matchUnderscoreChars(L, result, {'0'..'9'})
if L.buf[L.bufpos] in {'e', 'E'}:
isAFloatLiteral = true
eatChar(L, result, 'e')
if L.buf[L.bufpos] in {'+', '-'}:
eatChar(L, result)
matchUnderscoreChars(L, result, {'0'..'9'})
endpos = L.bufpos
if L.buf[endpos] in {'\'', 'f', 'F', 'i', 'I', 'u', 'U'}:
if L.buf[endpos] == '\'': inc(endpos)
L.bufpos = pos # restore position
case L.buf[endpos]
# Second stage, find out if there's a datatype postfix and handle it
var postPos = endpos
if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
if L.buf[postPos] == '\'':
inc(postPos)
case L.buf[postPos]
of 'f', 'F':
inc(endpos)
if (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'):
inc(postPos)
if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
result.tokType = tkFloat32Lit
inc(endpos, 2)
elif (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'):
inc(postPos, 2)
elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
result.tokType = tkFloat64Lit
inc(endpos, 2)
elif (L.buf[endpos] == '1') and
(L.buf[endpos + 1] == '2') and
(L.buf[endpos + 2] == '8'):
inc(postPos, 2)
elif (L.buf[postPos] == '1') and
(L.buf[postPos + 1] == '2') and
(L.buf[postPos + 2] == '8'):
result.tokType = tkFloat128Lit
inc(endpos, 3)
else:
lexMessage(L, errInvalidNumber, result.literal & "'f" & L.buf[endpos])
inc(postPos, 3)
else: # "f" alone defaults to float32
result.tokType = tkFloat32Lit
of 'd', 'D': # ad hoc convenience shortcut for f64
inc(postPos)
result.tokType = tkFloat64Lit
of 'i', 'I':
inc(endpos)
if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'):
inc(postPos)
if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
result.tokType = tkInt64Lit
inc(endpos, 2)
elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'):
inc(postPos, 2)
elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
result.tokType = tkInt32Lit
inc(endpos, 2)
elif (L.buf[endpos] == '1') and (L.buf[endpos + 1] == '6'):
inc(postPos, 2)
elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
result.tokType = tkInt16Lit
inc(endpos, 2)
elif (L.buf[endpos] == '8'):
inc(postPos, 2)
elif (L.buf[postPos] == '8'):
result.tokType = tkInt8Lit
inc(endpos)
inc(postPos)
else:
lexMessage(L, errInvalidNumber, result.literal & "'i" & L.buf[endpos])
lexMessageLitNum(L, errInvalidNumber, startpos)
of 'u', 'U':
inc(endpos)
if (L.buf[endpos] == '6') and (L.buf[endpos + 1] == '4'):
inc(postPos)
if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'):
result.tokType = tkUInt64Lit
inc(endpos, 2)
elif (L.buf[endpos] == '3') and (L.buf[endpos + 1] == '2'):
inc(postPos, 2)
elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'):
result.tokType = tkUInt32Lit
inc(endpos, 2)
elif (L.buf[endpos] == '1') and (L.buf[endpos + 1] == '6'):
inc(postPos, 2)
elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'):
result.tokType = tkUInt16Lit
inc(endpos, 2)
elif (L.buf[endpos] == '8'):
inc(postPos, 2)
elif (L.buf[postPos] == '8'):
result.tokType = tkUInt8Lit
inc(endpos)
inc(postPos)
else:
result.tokType = tkUIntLit
else: lexMessage(L, errInvalidNumber, result.literal & "'" & L.buf[endpos])
else:
L.bufpos = pos # restore position
else:
lexMessageLitNum(L, errInvalidNumber, startpos)
# Is there still a literalish char awaiting? Then it's an error!
if L.buf[postPos] in literalishCharsNoDot or
(L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}):
lexMessageLitNum(L, errInvalidNumber, startpos)
# Third stage, extract actual number
L.bufpos = startpos # restore position
var pos: int = startpos
try:
if (L.buf[pos] == '0') and
(L.buf[pos + 1] in {'x', 'X', 'b', 'B', 'o', 'O', 'c', 'C'}):
inc(pos, 2)
xi = 0 # it may be a base prefix
xi = 0 # it is a base prefix
case L.buf[pos - 1] # now look at the optional type suffix:
of 'b', 'B':
result.base = base2
while true:
case L.buf[pos]
of '2'..'9', '.':
lexMessage(L, errInvalidNumber, result.literal)
inc(pos)
of '_':
if L.buf[pos+1] notin {'0'..'1'}:
lexMessage(L, errInvalidToken, "_")
break
inc(pos)
of '0', '1':
while pos < endpos:
if L.buf[pos] != '_':
xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
inc(pos)
else: break
inc(pos)
of 'o', 'c', 'C':
result.base = base8
while true:
case L.buf[pos]
of '8'..'9', '.':
lexMessage(L, errInvalidNumber, result.literal)
inc(pos)
of '_':
if L.buf[pos+1] notin {'0'..'7'}:
lexMessage(L, errInvalidToken, "_")
break
inc(pos)
of '0'..'7':
while pos < endpos:
if L.buf[pos] != '_':
xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
inc(pos)
else: break
of 'O':
lexMessage(L, errInvalidNumber, result.literal)
inc(pos)
of 'x', 'X':
result.base = base16
while true:
while pos < endpos:
case L.buf[pos]
of '_':
if L.buf[pos+1] notin {'0'..'9', 'a'..'f', 'A'..'F'}:
lexMessage(L, errInvalidToken, "_")
break
inc(pos)
of '0'..'9':
xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
@@ -410,8 +452,10 @@ proc getNumber(L: var TLexer): TToken =
of 'A'..'F':
xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
inc(pos)
else: break
else: internalError(getLineInfo(L), "getNumber")
else:
break
else:
internalError(getLineInfo(L), "getNumber")
case result.tokType
of tkIntLit, tkInt64Lit: result.iNumber = xi
of tkInt8Lit: result.iNumber = BiggestInt(int8(toU8(int(xi))))
@@ -427,7 +471,7 @@ proc getNumber(L: var TLexer): TToken =
# XXX: Test this on big endian machine!
of tkFloat64Lit: result.fNumber = (cast[PFloat64](addr(xi)))[]
else: internalError(getLineInfo(L), "getNumber")
elif isFloatLiteral(result.literal) or (result.tokType == tkFloat32Lit) or
elif isAFloatLiteral or (result.tokType == tkFloat32Lit) or
(result.tokType == tkFloat64Lit):
result.fNumber = parseFloat(result.literal)
if result.tokType == tkIntLit: result.tokType = tkFloatLit
@@ -443,18 +487,18 @@ proc getNumber(L: var TLexer): TToken =
if result.tokType == tkIntLit:
result.tokType = tkInt64Lit
elif result.tokType in {tkInt8Lit, tkInt16Lit, tkInt32Lit}:
lexMessage(L, errNumberOutOfRange, result.literal)
lexMessageLitNum(L, errNumberOutOfRange, startpos)
elif result.tokType == tkInt8Lit and
(result.iNumber < int8.low or result.iNumber > int8.high):
lexMessage(L, errNumberOutOfRange, result.literal)
lexMessageLitNum(L, errNumberOutOfRange, startpos)
elif result.tokType == tkInt16Lit and
(result.iNumber < int16.low or result.iNumber > int16.high):
lexMessage(L, errNumberOutOfRange, result.literal)
lexMessageLitNum(L, errNumberOutOfRange, startpos)
except ValueError:
lexMessage(L, errInvalidNumber, result.literal)
lexMessageLitNum(L, errInvalidNumber, startpos)
except OverflowError, RangeError:
lexMessage(L, errNumberOutOfRange, result.literal)
L.bufpos = endpos
lexMessageLitNum(L, errNumberOutOfRange, startpos)
L.bufpos = postPos
proc handleHexChar(L: var TLexer, xi: var int) =
case L.buf[L.bufpos]

View File

@@ -17,10 +17,9 @@ type
errIntLiteralExpected, errInvalidCharacterConstant,
errClosingTripleQuoteExpected, errClosingQuoteExpected,
errTabulatorsAreNotAllowed, errInvalidToken, errLineTooLong,
errInvalidNumber, errNumberOutOfRange, errNnotAllowedInCharacter,
errClosingBracketExpected, errMissingFinalQuote, errIdentifierExpected,
errNewlineExpected,
errInvalidModuleName,
errInvalidNumber, errInvalidNumberOctalCode, errNumberOutOfRange,
errNnotAllowedInCharacter, errClosingBracketExpected, errMissingFinalQuote,
errIdentifierExpected, errNewlineExpected, errInvalidModuleName,
errOperatorExpected, errTokenExpected, errStringAfterIncludeExpected,
errRecursiveDependencyX, errOnOrOffExpected, errNoneSpeedOrSizeExpected,
errInvalidPragma, errUnknownPragma, errInvalidDirectiveX,
@@ -143,6 +142,7 @@ const
errInvalidToken: "invalid token: $1",
errLineTooLong: "line too long",
errInvalidNumber: "$1 is not a valid number",
errInvalidNumberOctalCode: "$1 is not a valid number; did you mean octal? Then use one of '0o', '0c' or '0C'.",
errNumberOutOfRange: "number $1 out of valid range",
errNnotAllowedInCharacter: "\\n not allowed in character literal",
errClosingBracketExpected: "closing ']' expected, but end of file reached",