mirror of
https://github.com/nim-lang/Nim.git
synced 2026-01-10 15:04:59 +00:00
added strscans stdlib module
This commit is contained in:
@@ -87,7 +87,7 @@ Collections and algorithms
|
||||
* `sequtils <sequtils.html>`_
|
||||
This module implements operations for the built-in seq type
|
||||
which were inspired by functional programming languages.
|
||||
|
||||
|
||||
|
||||
String handling
|
||||
---------------
|
||||
@@ -100,6 +100,9 @@ String handling
|
||||
* `parseutils <parseutils.html>`_
|
||||
This module contains helpers for parsing tokens, numbers, identifiers, etc.
|
||||
|
||||
* `strscans <strscans.html>`_
|
||||
This module contains a ``scanf`` macro for convenient parsing of mini languages.
|
||||
|
||||
* `strtabs <strtabs.html>`_
|
||||
The ``strtabs`` module implements an efficient hash table that is a mapping
|
||||
from strings to strings. Supports a case-sensitive, case-insensitive and
|
||||
|
||||
@@ -173,6 +173,22 @@ proc parseUntil*(s: string, token: var string, until: char,
|
||||
result = i-start
|
||||
token = substr(s, start, i-1)
|
||||
|
||||
proc parseUntil*(s: string, token: var string, until: string,
                 start = 0): int {.inline.} =
  ## Parses a token and stores it in ``token``. Returns
  ## the number of the parsed characters or 0 in case of an error. A token
  ## consists of any character that comes before the `until` token.
  ##
  ## If ``until`` is the empty string nothing can terminate the token, so
  ## everything from ``start`` up to the end of ``s`` is consumed.
  var i = start
  while i < s.len:
    # ``until.len > 0`` guards the ``until[0]`` access: an empty delimiter
    # must never be indexed.
    if until.len > 0 and s[i] == until[0]:
      # First char matched; compare the remaining chars of ``until``
      # without running past the end of ``s``.
      var u = 1
      while i+u < s.len and u < until.len and s[i+u] == until[u]:
        inc u
      # The whole delimiter was found at position ``i``: the token ends here.
      if u >= until.len: break
    inc(i)
  result = i-start
  token = substr(s, start, i-1)
|
||||
|
||||
proc parseWhile*(s: string, token: var string, validChars: set[char],
|
||||
start = 0): int {.inline.} =
|
||||
## parses a token and stores it in ``token``. Returns
|
||||
|
||||
296
lib/pure/strscans.nim
Normal file
296
lib/pure/strscans.nim
Normal file
@@ -0,0 +1,296 @@
|
||||
#
|
||||
#
|
||||
# Nim's Runtime Library
|
||||
# (c) Copyright 2016 Andreas Rumpf
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
#
|
||||
|
||||
##[
|
||||
This module contains a `scanf`:idx: macro that can be used for extracting
|
||||
substrings from an input string. This is often easier than regular expressions.
|
||||
Some examples as an appetizer:
|
||||
|
||||
.. code-block:: nim
|
||||
# check if input string matches a triple of integers:
|
||||
const input = "(1,2,4)"
|
||||
var x, y, z: int
|
||||
if scanf("($i,$i,$i)", input, x, y, z):
|
||||
echo "matches and x is ", x, " y is ", y, " z is ", z
|
||||
|
||||
# check if input string matches an ISO date followed by an identifier followed
|
||||
# by whitespace and a floating point number:
|
||||
var year, month, day: int
|
||||
var identifier: string
|
||||
var myfloat: float
|
||||
if scanf("$i-$i-$i $w$s$f", input, year, month, day, identifier, myfloat):
|
||||
echo "yes, we have a match!"
|
||||
|
||||
As can be seen from the examples, strings are matched verbatim except for
|
||||
substrings starting with ``$``. These constructions are available:
|
||||
|
||||
================= ========================================================
|
||||
``$i`` Matches an integer. This uses ``parseutils.parseInt``.
|
||||
``$f`` Matches a floating point number. Uses ``parseFloat``.
|
||||
``$w`` Matches an ASCII identifier: ``[A-Za-z_][A-Za-z_0-9]*``.
|
||||
``$s`` Skips optional whitespace.
|
||||
``$$`` Matches a single dollar sign.
|
||||
``$.`` Matches if the end of the input string has been reached.
|
||||
``$*`` Matches until the token following the ``$*`` was found.
|
||||
The match is allowed to be of 0 length.
|
||||
``$+`` Matches until the token following the ``$+`` was found.
|
||||
The match must consist of at least one char.
|
||||
``${foo}`` User defined matcher. Uses the proc ``foo`` to perform
|
||||
the match. See below for more details.
|
||||
``$[foo]`` Call user defined proc ``foo`` to **skip** some optional
|
||||
parts in the input string. See below for more details.
|
||||
================= ========================================================
|
||||
|
||||
Even though ``$*`` and ``$+`` look similar to the regular expressions ``.*``
|
||||
and ``.+`` they work quite differently, there is no non-deterministic
|
||||
state machine involved and the matches are non-greedy. ``[$*]``
|
||||
matches ``[xyz]`` via ``parseutils.parseUntil``.
|
||||
|
||||
Furthermore no backtracking is performed, if parsing fails after a value
|
||||
has already been bound to a matched subexpression this value is not restored
|
||||
to its original value. This rarely causes problems in practice and if it does
|
||||
for you, it's easy enough to bind to a temporary variable first.
|
||||
|
||||
|
||||
Startswith vs full match
|
||||
========================
|
||||
|
||||
``scanf`` returns true if the input string **starts with** the specified
|
||||
pattern. If instead it should only return true if there is also nothing
|
||||
left in the input, append ``$.`` to your pattern.
|
||||
|
||||
|
||||
User definable matchers
|
||||
=======================
|
||||
|
||||
One very nice advantage over regular expressions is that ``scanf`` is
|
||||
extensible with ordinary Nim procs. The proc is either enclosed in ``${}``
|
||||
or in ``$[]``. ``${}`` matches and binds the result
|
||||
to a variable (that was passed to the ``scanf`` macro) while ``$[]`` merely skips
|
||||
optional tokens.
|
||||
|
||||
|
||||
In this example, we define a helper proc ``someSep`` that skips some separators
|
||||
which we then use in our scanf pattern to help us in the matching process:
|
||||
|
||||
.. code-block:: nim
|
||||
|
||||
proc someSep(input: string; start: int; seps: set[char] = {':','-','.'}): int =
|
||||
# Note: The parameters and return value must match to what ``scanf`` requires
|
||||
result = 0
|
||||
while start+result < input.len and input[start+result] in seps: inc result
|
||||
|
||||
if scanf("$w$[someSep]$w", input, key, value):
|
||||
...
|
||||
|
||||
It is also possible to pass arguments to a user definable matcher:
|
||||
|
||||
.. code-block:: nim
|
||||
|
||||
proc ndigits(input: string; intVal: var int; start: int; n: int): int =
|
||||
# matches exactly ``n`` digits. Matchers need to return 0 if nothing
|
||||
# matched or otherwise the number of processed chars.
|
||||
var x = 0
|
||||
var i = 0
|
||||
while i < n and i+start < input.len and input[i+start] in {'0'..'9'}:
|
||||
x = x * 10 + input[i+start].ord - '0'.ord
|
||||
inc i
|
||||
# only overwrite if we had a match
|
||||
if i == n:
|
||||
result = n
|
||||
intVal = x
|
||||
|
||||
# match an ISO date extracting year, month, day at the same time.
|
||||
# Also ensure the input ends after the ISO date:
|
||||
var year, month, day: int
|
||||
if scanf("${ndigits(4)}-${ndigits(2)}-${ndigits(2)}$.", "2013-01-03", year, month, day):
|
||||
...
|
||||
|
||||
]##
|
||||
|
||||
|
||||
import macros, parseutils
|
||||
|
||||
proc conditionsToIfChain(n, idx, res: NimNode; start: int): NimNode =
  ## Turns the flat statement list ``n`` into nested ``if`` statements.
  ##
  ## ``n`` is read in groups of three starting at ``start``: a statement
  ## that is always emitted, a condition, and the amount ``idx`` is
  ## incremented by when the condition holds. An empty condition node means
  ## the group is unconditional, so the chain simply continues with the
  ## next group. Once all groups are used up, ``res`` is assigned ``true``.
  assert n.kind == nnkStmtList
  if start >= n.len:
    return newAssignment(res, newLit true)
  let
    prelude = n[start]
    cond = n[start+1]
    rest = conditionsToIfChain(n, idx, res, start+3)
  let chained =
    if cond.kind == nnkEmpty:
      # Unconditional step: nothing to test, nothing to advance.
      rest
    else:
      # Only advance ``idx`` and keep matching when this step succeeded.
      let advance = newCall(bindSym"inc", idx, n[start+2])
      newIfStmt((cond, newTree(nnkStmtList, advance, rest)))
  result = newTree(nnkStmtList, prelude, chained)
|
||||
|
||||
proc notZero(x: NimNode): NimNode =
  ## Builds the AST of the comparison ``x != 0``.
  let zero = newLit 0
  result = newCall(bindSym"!=", x, zero)
|
||||
|
||||
proc buildUserCall(x: string; args: varargs[NimNode]): NimNode =
  ## Parses the expression text ``x`` and builds a call from it.
  ##
  ## ``args`` are placed directly after the callee. If ``x`` is itself a
  ## call (``foo(a, b)``) its own arguments follow ``args``; otherwise the
  ## whole parsed expression is used as the callee.
  let parsed = parseExpr(x)
  let isCall = parsed.kind in nnkCallKinds
  let callee = if isCall: parsed[0] else: parsed
  result = newTree(nnkCall, callee)
  for extra in args:
    result.add extra
  if isCall:
    # Arguments written in the pattern come after the implicit ones.
    for k in 1 ..< parsed.len:
      result.add parsed[k]
|
||||
|
||||
macro scanf*(pattern: static[string]; input: string; results: varargs[typed]): bool =
  ## See top level documentation of this module of how ``scanf`` works.
  ##
  ## ``pattern`` is walked at compile time. Every pattern element appends a
  ## group of three nodes to ``conds``: a statement, a success condition and
  ## the number of characters to advance by (the latter two are empty nodes
  ## for elements that cannot fail). ``conditionsToIfChain`` then nests these
  ## groups into ``if`` statements. The generated expression evaluates to
  ## ``true`` if all elements matched and, when the pattern ends in ``$.``,
  ## the whole input was consumed.
  template matchBind(parser) {.dirty.} =
    # Calls ``parser(input, results[i], idx)``; the match succeeds when the
    # parser reports a non-zero number of processed characters.
    var resLen = genSym(nskLet, "resLen")
    conds.add newLetStmt(resLen,
                         newCall(bindSym(parser), input, results[i], idx))
    conds.add resLen.notZero
    conds.add resLen

  template patternChar(pos: int): char =
    # Bounds-safe read of ``pattern``; yields '\0' past the end so that a
    # truncated pattern is reported as an error instead of indexing out of
    # range.
    (if pos < pattern.len: pattern[pos] else: '\0')

  var i = 0 # index of the next unused entry of ``results``
  var p = 0 # read position inside ``pattern``
  var idx = genSym(nskVar, "idx")
  var res = genSym(nskVar, "res")
  result = newTree(nnkStmtListExpr, newVarStmt(idx, newLit 0),
                   newVarStmt(res, newLit false))
  var conds = newTree(nnkStmtList)
  var fullMatch = false
  while p < pattern.len:
    if pattern[p] == '$':
      inc p
      case patternChar(p)
      of '$':
        # ``$$`` matches a literal dollar sign.
        var resLen = genSym(nskLet, "resLen")
        conds.add newLetStmt(resLen,
                             newCall(bindSym"skip", input, newLit($pattern[p]), idx))
        conds.add resLen.notZero
        conds.add resLen
      of 'w':
        # A result variable must exist *and* have the right type.
        if i < results.len and getType(results[i]).typeKind == ntyString:
          matchBind "parseIdent"
        else:
          error("no string var given for $w")
        inc i
      of 'i':
        if i < results.len and getType(results[i]).typeKind == ntyInt:
          matchBind "parseInt"
        else:
          error("no int var given for $i")
        inc i
      of 'f':
        if i < results.len and getType(results[i]).typeKind == ntyFloat:
          matchBind "parseFloat"
        else:
          error("no float var given for $f")
        inc i
      of 's':
        # Skipping whitespace cannot fail, hence no condition/advance nodes.
        conds.add newCall(bindSym"inc", idx,
                          newCall(bindSym"skipWhitespace", input, idx))
        conds.add newEmptyNode()
        conds.add newEmptyNode()
      of '.':
        # ``$.`` is only valid as the very last pattern element.
        if p == pattern.len-1:
          fullMatch = true
        else:
          error("invalid format string")
      of '*', '+':
        if i < results.len and getType(results[i]).typeKind == ntyString:
          # ``$+`` needs at least one char, ``$*`` accepts an empty match.
          let minLen = ord(pattern[p] == '+')
          # The literal text following the element is the terminating token.
          # ``p`` is not advanced over it: the token itself is matched by the
          # literal branch in the next iteration.
          var q = p+1
          var token = ""
          while q < pattern.len and pattern[q] != '$':
            token.add pattern[q]
            inc q
          var resLen = genSym(nskLet, "resLen")
          conds.add newLetStmt(resLen,
                               newCall(bindSym"parseUntil", input, results[i],
                                       newLit(token), idx))
          conds.add newCall(bindSym">=", resLen, newLit minLen)
          conds.add resLen
        else:
          error("no string var given for $" & pattern[p])
        inc i
      of '{':
        # ``${expr}``: user defined matcher that binds a result variable.
        inc p
        var nesting = 0
        let start = p
        while true:
          case patternChar(p)
          of '{': inc nesting
          of '}':
            if nesting == 0: break
            dec nesting
          of '\0': error("expected closing '}'")
          else: discard
          inc p
        let expr = pattern.substr(start, p-1)
        if i < results.len:
          var resLen = genSym(nskLet, "resLen")
          conds.add newLetStmt(resLen,
                               buildUserCall(expr, input, results[i], idx))
          conds.add newCall(bindSym"!=", resLen, newLit 0)
          conds.add resLen
        else:
          error("no var given for $" & expr)
        inc i
      of '[':
        # ``$[expr]``: user defined skipper; its return value is added to
        # ``idx`` unconditionally.
        inc p
        var nesting = 0
        let start = p
        while true:
          case patternChar(p)
          of '[': inc nesting
          of ']':
            if nesting == 0: break
            dec nesting
          of '\0': error("expected closing ']'")
          else: discard
          inc p
        let expr = pattern.substr(start, p-1)
        conds.add newCall(bindSym"inc", idx, buildUserCall(expr, input, idx))
        conds.add newEmptyNode()
        conds.add newEmptyNode()
      else: error("invalid format string")
      inc p
    else:
      # Literal text up to the next ``$`` has to match verbatim.
      var token = ""
      while p < pattern.len and pattern[p] != '$':
        token.add pattern[p]
        inc p
      var resLen = genSym(nskLet, "resLen")
      conds.add newLetStmt(resLen,
                           newCall(bindSym"skip", input, newLit(token), idx))
      conds.add resLen.notZero
      conds.add resLen
  result.add conditionsToIfChain(conds, idx, res, 0)
  if fullMatch:
    # All elements must have matched *and* the input must be exhausted;
    # checking the position alone would accept a failed match that happens
    # to stop at the end of the input.
    result.add newCall(bindSym"and", res,
                       newCall(bindSym">=", idx, newCall(bindSym"len", input)))
  else:
    result.add res
|
||||
|
||||
when isMainModule:
  # Self-test, only compiled when this module is the main module.

  proc twoDigits(input: string; x: var int; start: int): int =
    # User matcher for ``${twoDigits}``: accepts the two chars "00" at
    # ``start`` and then stores 13 in ``x``. The length check keeps the
    # lookahead inside ``input``.
    if start+1 < input.len and input[start] == '0' and input[start+1] == '0':
      result = 2
      x = 13
    else:
      result = 0

  proc someSep(input: string; start: int; seps: set[char] = {';',',','-','.'}): int =
    # User skipper for ``$[someSep]``: returns how many consecutive chars
    # from ``seps`` follow at ``start``, stopping at the end of ``input``.
    result = 0
    while start+result < input.len and input[start+result] in seps: inc result

  var key, val: string
  var intval: int
  var floatval: float
  doAssert scanf("$w$s::$s$w$s$i $f", "abc:: xyz 89 33.25", key, val, intval, floatVal)
  doAssert key == "abc"
  doAssert val == "xyz"
  doAssert intval == 89
  doAssert floatVal == 33.25

  let xx = scanf("$$$i", "$abc", intval)
  doAssert xx == false


  let xx2 = scanf("$$$i", "$1234", intval)
  doAssert xx2

  let yy = scanf("$[someSep]Breakpoint${twoDigits}$[someSep({';','.','-'})] [$+]$.", ";.--Breakpoint00 [output]", intVal, key)
  doAssert yy
  doAssert key == "output"
  doAssert intVal == 13
|
||||
@@ -45,6 +45,9 @@ Library Additions
|
||||
- The rlocks module has been added providing reentrant lock synchronization
|
||||
primitive.
|
||||
- A generic "sink operator" written as ``&=`` has been added to the ``system`` and the ``net`` modules.
|
||||
- Added ``strscans`` module that implements a ``scanf`` for easy input extraction.
|
||||
- Added a version of ``parseutils.parseUntil`` that can deal with a string ``until`` token. The other
|
||||
versions are for ``char`` and ``set[char]``.
|
||||
|
||||
|
||||
Compiler Additions
|
||||
@@ -62,6 +65,7 @@ Language Additions
|
||||
- Nim now supports ``partial`` object declarations to mitigate the problems
|
||||
that arise when types are mutually dependent and yet should be kept in
|
||||
different modules.
|
||||
- ``include`` statements are not restricted to top level statements anymore.
|
||||
|
||||
|
||||
2016-01-27 Nim in Action is now available!
|
||||
|
||||
@@ -40,7 +40,7 @@ srcdoc2: "pure/concurrency/threadpool.nim;pure/concurrency/cpuinfo.nim"
|
||||
srcdoc: "system/threads.nim;system/channels.nim;js/dom"
|
||||
srcdoc2: "pure/os;pure/strutils;pure/math;pure/matchers;pure/algorithm"
|
||||
srcdoc2: "pure/stats;impure/nre;windows/winlean"
|
||||
srcdoc2: "pure/complex;pure/times;pure/osproc;pure/pegs;pure/dynlib"
|
||||
srcdoc2: "pure/complex;pure/times;pure/osproc;pure/pegs;pure/dynlib;pure/strscans"
|
||||
srcdoc2: "pure/parseopt;pure/parseopt2;pure/hashes;pure/strtabs;pure/lexbase"
|
||||
srcdoc2: "pure/parsecfg;pure/parsexml;pure/parsecsv;pure/parsesql"
|
||||
srcdoc2: "pure/streams;pure/terminal;pure/cgi;pure/unicode"
|
||||
|
||||
Reference in New Issue
Block a user