Files
Nim/lib/pure/parseutils.nim
Reimer Behrends 8b8a21cb59 Fixed parsing of float literals.
Float literals were not parsed properly when their fractional part
exceeded 53 significant bits. This affected in particular math.PI
and math.E. Rather than reinventing the wheel, this patch reuses
C's strtod() implementation, which already does the heavy lifting
with respect to correctness, though some caution is necessary to
keep float parsing locale-independent.
2014-05-23 11:27:39 +02:00

419 lines
13 KiB
Nim

#
#
# Nimrod's Runtime Library
# (c) Copyright 2012 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module contains helpers for parsing tokens, numbers, identifiers, etc.
{.deadCodeElim: on.}
{.push debugger:off .} # the user does not want to trace a part
# of the standard library!
include "system/inclrtl"
const
Whitespace = {' ', '\t', '\v', '\r', '\l', '\f'}
IdentChars = {'a'..'z', 'A'..'Z', '0'..'9', '_'}
IdentStartChars = {'a'..'z', 'A'..'Z', '_'}
## copied from strutils
proc toLower(c: char): char {.inline.} =
result = if c in {'A'..'Z'}: chr(ord(c)-ord('A')+ord('a')) else: c
proc parseHex*(s: string, number: var int, start = 0): int {.
rtl, extern: "npuParseHex", noSideEffect.} =
## Parses a hexadecimal number and stores its value in ``number``.
##
## Returns the number of the parsed characters or 0 in case of an error. This
## proc is sensitive to the already existing value of ``number`` and will
## likely not do what you want unless you make sure ``number`` is zero. You
## can use this feature to *chain* calls, though the result int will quickly
## overflow. Example:
##
## .. code-block:: nimrod
## var value = 0
## discard parseHex("0x38", value)
## assert value == 56
## discard parseHex("0x34", value)
## assert value == 56 * 256 + 52
## value = -1
## discard parseHex("0x38", value)
## assert value == -200
##
var i = start
var foundDigit = false
if s[i] == '0' and (s[i+1] == 'x' or s[i+1] == 'X'): inc(i, 2)
elif s[i] == '#': inc(i)
while true:
case s[i]
of '_': discard
of '0'..'9':
number = number shl 4 or (ord(s[i]) - ord('0'))
foundDigit = true
of 'a'..'f':
number = number shl 4 or (ord(s[i]) - ord('a') + 10)
foundDigit = true
of 'A'..'F':
number = number shl 4 or (ord(s[i]) - ord('A') + 10)
foundDigit = true
else: break
inc(i)
if foundDigit: result = i-start
proc parseOct*(s: string, number: var int, start = 0): int {.
rtl, extern: "npuParseOct", noSideEffect.} =
## parses an octal number and stores its value in ``number``. Returns
## the number of the parsed characters or 0 in case of an error.
var i = start
var foundDigit = false
if s[i] == '0' and (s[i+1] == 'o' or s[i+1] == 'O'): inc(i, 2)
while true:
case s[i]
of '_': discard
of '0'..'7':
number = number shl 3 or (ord(s[i]) - ord('0'))
foundDigit = true
else: break
inc(i)
if foundDigit: result = i-start
proc parseIdent*(s: string, ident: var string, start = 0): int =
## parses an identifier and stores it in ``ident``. Returns
## the number of the parsed characters or 0 in case of an error.
var i = start
if s[i] in IdentStartChars:
inc(i)
while s[i] in IdentChars: inc(i)
ident = substr(s, start, i-1)
result = i-start
proc parseIdent*(s: string, start = 0): string =
## parses an identifier and stores it in ``ident``.
## Returns the parsed identifier or an empty string in case of an error.
result = ""
var i = start
if s[i] in IdentStartChars:
inc(i)
while s[i] in IdentChars: inc(i)
result = substr(s, start, i-1)
proc parseToken*(s: string, token: var string, validChars: set[char],
start = 0): int {.inline, deprecated.} =
## parses a token and stores it in ``token``. Returns
## the number of the parsed characters or 0 in case of an error. A token
## consists of the characters in `validChars`.
##
## **Deprecated since version 0.8.12**: Use ``parseWhile`` instead.
var i = start
while s[i] in validChars: inc(i)
result = i-start
token = substr(s, start, i-1)
proc skipWhitespace*(s: string, start = 0): int {.inline.} =
## skips the whitespace starting at ``s[start]``. Returns the number of
## skipped characters.
while s[start+result] in Whitespace: inc(result)
proc skip*(s, token: string, start = 0): int {.inline.} =
## skips the `token` starting at ``s[start]``. Returns the length of `token`
## or 0 if there was no `token` at ``s[start]``.
while result < token.len and s[result+start] == token[result]: inc(result)
if result != token.len: result = 0
proc skipIgnoreCase*(s, token: string, start = 0): int =
## same as `skip` but case is ignored for token matching.
while result < token.len and
toLower(s[result+start]) == toLower(token[result]): inc(result)
if result != token.len: result = 0
proc skipUntil*(s: string, until: set[char], start = 0): int {.inline.} =
## Skips all characters until one char from the set `until` is found
## or the end is reached.
## Returns number of characters skipped.
while s[result+start] notin until and s[result+start] != '\0': inc(result)
proc skipUntil*(s: string, until: char, start = 0): int {.inline.} =
## Skips all characters until the char `until` is found
## or the end is reached.
## Returns number of characters skipped.
while s[result+start] != until and s[result+start] != '\0': inc(result)
proc skipWhile*(s: string, toSkip: set[char], start = 0): int {.inline.} =
## Skips all characters while one char from the set `token` is found.
## Returns number of characters skipped.
while s[result+start] in toSkip and s[result+start] != '\0': inc(result)
proc parseUntil*(s: string, token: var string, until: set[char],
start = 0): int {.inline.} =
## parses a token and stores it in ``token``. Returns
## the number of the parsed characters or 0 in case of an error. A token
## consists of the characters notin `until`.
var i = start
while i < s.len and s[i] notin until: inc(i)
result = i-start
token = substr(s, start, i-1)
proc parseUntil*(s: string, token: var string, until: char,
start = 0): int {.inline.} =
## parses a token and stores it in ``token``. Returns
## the number of the parsed characters or 0 in case of an error. A token
## consists of any character that is not the `until` character.
var i = start
while i < s.len and s[i] != until: inc(i)
result = i-start
token = substr(s, start, i-1)
proc parseWhile*(s: string, token: var string, validChars: set[char],
start = 0): int {.inline.} =
## parses a token and stores it in ``token``. Returns
## the number of the parsed characters or 0 in case of an error. A token
## consists of the characters in `validChars`.
var i = start
while s[i] in validChars: inc(i)
result = i-start
token = substr(s, start, i-1)
proc captureBetween*(s: string, first: char, second = '\0', start = 0): string =
## Finds the first occurence of ``first``, then returns everything from there
## up to ``second``(if ``second`` is '\0', then ``first`` is used).
var i = skipUntil(s, first, start)+1+start
result = ""
discard s.parseUntil(result, if second == '\0': first else: second, i)
{.push overflowChecks: on.}
# this must be compiled with overflow checking turned on:
proc rawParseInt(s: string, b: var BiggestInt, start = 0): int =
var
sign: BiggestInt = -1
i = start
if s[i] == '+': inc(i)
elif s[i] == '-':
inc(i)
sign = 1
if s[i] in {'0'..'9'}:
b = 0
while s[i] in {'0'..'9'}:
b = b * 10 - (ord(s[i]) - ord('0'))
inc(i)
while s[i] == '_': inc(i) # underscores are allowed and ignored
b = b * sign
result = i - start
{.pop.} # overflowChecks
proc parseBiggestInt*(s: string, number: var BiggestInt, start = 0): int {.
rtl, extern: "npuParseBiggestInt", noSideEffect.} =
## parses an integer starting at `start` and stores the value into `number`.
## Result is the number of processed chars or 0 if there is no integer.
## `EOverflow` is raised if an overflow occurs.
var res: BiggestInt
# use 'res' for exception safety (don't write to 'number' in case of an
# overflow exception:
result = rawParseInt(s, res, start)
number = res
proc parseInt*(s: string, number: var int, start = 0): int {.
rtl, extern: "npuParseInt", noSideEffect.} =
## parses an integer starting at `start` and stores the value into `number`.
## Result is the number of processed chars or 0 if there is no integer.
## `EOverflow` is raised if an overflow occurs.
var res: BiggestInt
result = parseBiggestInt(s, res, start)
if (sizeof(int) <= 4) and
((res < low(int)) or (res > high(int))):
raise newException(EOverflow, "overflow")
else:
number = int(res)
proc parseBiggestFloat*(s: string, number: var BiggestFloat, start = 0): int {.
rtl, extern: "npuParseBiggestFloat", noSideEffect.} =
## parses a float starting at `start` and stores the value into `number`.
## Result is the number of processed chars or 0 if a parsing error
## occurred.
type struct_lconv {.importc: "struct lconv",header:"<locale.h>".} =
object
# Unneeded fields have been omitted.
decimal_point: cstring
proc localeconv(): ptr struct_lconv {.importc, header: "<locale.h>",
noSideEffect.}
proc strtod(buf: cstring, endptr: ptr cstring): float64 {.importc,
header: "<stdlib.h>", noSideEffect.}
# This routine leverages `strtod()` for the non-trivial task of
# parsing floating point numbers correctly. Because `strtod()` is
# locale-dependent with respect to the radix character, we create
# a copy where the decimal point is replaced with the locale's
# radix character.
var
i = start
sign = 1.0
t = ""
hasdigits = false
# Sign?
if s[i] == '+' or s[i] == '-':
if s[i] == '-':
sign = -1.0
add(t, s[i])
inc(i)
# NaN?
if s[i] == 'N' or s[i] == 'n':
if s[i+1] == 'A' or s[i+1] == 'a':
if s[i+2] == 'N' or s[i+2] == 'n':
if s[i+3] notin IdentChars:
number = NaN
return i+3 - start
return 0
# Inf?
if s[i] == 'I' or s[i] == 'i':
if s[i+1] == 'N' or s[i+1] == 'n':
if s[i+2] == 'F' or s[i+2] == 'f':
if s[i+3] notin IdentChars:
number = Inf*sign
return i+3 - start
return 0
# Integer part?
while s[i] in {'0'..'9'}:
hasdigits = true
add(t, s[i])
inc(i)
while s[i] == '_': inc(i)
# Fractional part?
if s[i] == '.':
add(t, localeconv().decimal_point)
inc(i)
while s[i] in {'0'..'9'}:
hasdigits = true
add(t, s[i])
inc(i)
while s[i] == '_': inc(i)
if not hasdigits:
return 0
# Exponent?
if s[i] in {'e', 'E'}:
add(t, s[i])
inc(i)
if s[i] in {'+', '-'}:
add(t, s[i])
inc(i)
if s[i] notin {'0'..'9'}:
return 0
while s[i] in {'0'..'9'}:
add(t, s[i])
inc(i)
while s[i] == '_': inc(i)
number = strtod(t, nil)
result = i - start
proc parseFloat*(s: string, number: var float, start = 0): int {.
rtl, extern: "npuParseFloat", noSideEffect.} =
## parses a float starting at `start` and stores the value into `number`.
## Result is the number of processed chars or 0 if there occured a parsing
## error.
var bf: BiggestFloat
result = parseBiggestFloat(s, bf, start)
number = bf
type
TInterpolatedKind* = enum ## describes for `interpolatedFragments`
## which part of the interpolated string is
## yielded; for example in "str$$$var${expr}"
ikStr, ## ``str`` part of the interpolated string
ikDollar, ## escaped ``$`` part of the interpolated string
ikVar, ## ``var`` part of the interpolated string
ikExpr ## ``expr`` part of the interpolated string
iterator interpolatedFragments*(s: string): tuple[kind: TInterpolatedKind,
value: string] =
## Tokenizes the string `s` into substrings for interpolation purposes.
##
## Example:
##
## .. code-block:: nimrod
## for k, v in interpolatedFragments(" $this is ${an example} $$"):
## echo "(", k, ", \"", v, "\")"
##
## Results in:
##
## .. code-block:: nimrod
## (ikString, " ")
## (ikExpr, "this")
## (ikString, " is ")
## (ikExpr, "an example")
## (ikString, " ")
## (ikDollar, "$")
var i = 0
var kind: TInterpolatedKind
while true:
var j = i
if s[j] == '$':
if s[j+1] == '{':
inc j, 2
var nesting = 0
while true:
case s[j]
of '{': inc nesting
of '}':
if nesting == 0:
inc j
break
dec nesting
of '\0':
raise newException(EInvalidValue,
"Expected closing '}': " & s[i..s.len])
else: discard
inc j
inc i, 2 # skip ${
kind = ikExpr
elif s[j+1] in IdentStartChars:
inc j, 2
while s[j] in IdentChars: inc(j)
inc i # skip $
kind = ikVar
elif s[j+1] == '$':
inc j, 2
inc i # skip $
kind = ikDollar
else:
raise newException(EInvalidValue,
"Unable to parse a varible name at " & s[i..s.len])
else:
while j < s.len and s[j] != '$': inc j
kind = ikStr
if j > i:
# do not copy the trailing } for ikExpr:
yield (kind, substr(s, i, j-1-ord(kind == ikExpr)))
else:
break
i = j
when isMainModule:
for k, v in interpolatedFragments("$test{} $this is ${an{ example}} "):
echo "(", k, ", \"", v, "\")"
var value = 0
discard parseHex("0x38", value)
assert value == 56
discard parseHex("0x34", value)
assert value == 56 * 256 + 52
value = -1
discard parseHex("0x38", value)
assert value == -200
{.pop.}