added strscans stdlib module

2026-07-18 06:51:18 +00:00 · 2016-05-04 01:34:08 +02:00
parent 724cd631d8
commit b357e80833
5 changed files with 321 additions and 2 deletions
--- a/doc/lib.txt
+++ b/doc/lib.txt
@@ -87,7 +87,7 @@ Collections and algorithms
 * `sequtils <sequtils.html>`_
  This module implements operations for the built-in seq type
  which were inspired by functional programming languages.
-  
+

 String handling
 ---------------
@@ -100,6 +100,9 @@ String handling
 * `parseutils <parseutils.html>`_
  This module contains helpers for parsing tokens, numbers, identifiers, etc.

+* `strscans <strscans.html>`_
+  This module contains a ``scanf`` macro for convenient parsing of mini languages.
+
 * `strtabs <strtabs.html>`_
  The ``strtabs`` module implements an efficient hash table that is a mapping
  from strings to strings. Supports a case-sensitive, case-insensitive and
--- a/lib/pure/parseutils.nim
+++ b/lib/pure/parseutils.nim
@@ -173,6 +173,22 @@ proc parseUntil*(s: string, token: var string, until: char,
  result = i-start
  token = substr(s, start, i-1)

+proc parseUntil*(s: string, token: var string, until: string,
+                 start = 0): int {.inline.} =
+  ## Parses a token and stores it in ``token``. Returns the number of parsed
+  ## characters, which is 0 if the ``until`` string is found right at
+  ## ``start``. The token consists of every character from ``start`` up to,
+  ## but not including, the first occurrence of the ``until`` string. If
+  ## ``until`` does not occur, the token extends to the end of ``s``.
+  ##
+  ## An empty ``until`` can never be found, so the rest of ``s`` is consumed.
+  var i = start
+  if until.len == 0:
+    # Nothing to search for: do not index ``until[0]``, just take the rest.
+    i = max(start, s.len)
+  else:
+    while i < s.len:
+      if s[i] == until[0]:
+        # Candidate terminator: compare the remaining chars of ``until``.
+        var u = 1
+        while i+u < s.len and u < until.len and s[i+u] == until[u]:
+          inc u
+        if u >= until.len: break
+      inc(i)
+  result = i-start
+  token = substr(s, start, i-1)
+
 proc parseWhile*(s: string, token: var string, validChars: set[char],
                 start = 0): int {.inline.} =
  ## parses a token and stores it in ``token``. Returns
--- a/lib/pure/strscans.nim
+++ b/lib/pure/strscans.nim
@@ -0,0 +1,296 @@
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2016 Andreas Rumpf
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+##[
+This module contains a `scanf`:idx: macro that can be used for extracting
+substrings from an input string. This is often easier than regular expressions.
+Some examples as an appetizer:
+
+.. code-block:: nim
+  # check if input string matches a triple of integers:
+  const input = "(1,2,4)"
+  var x, y, z: int
+  if scanf("($i,$i,$i)", input, x, y, z):
+    echo "matches and x is ", x, " y is ", y, " z is ", z
+
+  # check if input string matches an ISO date followed by an identifier followed
+  # by whitespace and a floating point number:
+  var year, month, day: int
+  var identifier: string
+  var myfloat: float
+  if scanf("$i-$i-$i $w$s$f", input, year, month, day, identifier, myfloat):
+    echo "yes, we have a match!"
+
+As can be seen from the examples, strings are matched verbatim except for
+substrings starting with ``$``. These constructions are available:
+
+=================   ========================================================
+``$i``              Matches an integer. This uses ``parseutils.parseInt``.
+``$f``              Matches a floating point number. Uses ``parseFloat``.
+``$w``              Matches an ASCII identifier: ``[A-Za-z_][A-Za-z_0-9]*``.
+``$s``              Skips optional whitespace.
+``$$``              Matches a single dollar sign.
+``$.``              Matches if the end of the input string has been reached.
+``$*``              Matches until the token following the ``$*`` was found.
+                    The match is allowed to be of 0 length.
+``$+``              Matches until the token following the ``$+`` was found.
+                    The match must consist of at least one char.
+``${foo}``          User defined matcher. Uses the proc ``foo`` to perform
+                    the match. See below for more details.
+``$[foo]``          Call user defined proc ``foo`` to **skip** some optional
+                    parts in the input string. See below for more details.
+=================   ========================================================
+
+Even though ``$*`` and ``$+`` look similar to the regular expressions ``.*``
+and ``.+`` they work quite differently, there is no non-deterministic
+state machine involved and the matches are non-greedy. ``[$*]``
+matches ``[xyz]`` via ``parseutils.parseUntil``.
+
+Furthermore no backtracking is performed, if parsing fails after a value
+has already been bound to a matched subexpression this value is not restored
+to its original value. This rarely causes problems in practice and if it does
+for you, it's easy enough to bind to a temporary variable first.
+
+
+Startswith vs full match
+========================
+
+``scanf`` returns true if the input string **starts with** the specified
+pattern. If instead it should only return true if there is also nothing
+left in the input, append ``$.`` to your pattern.
+
+
+User definable matchers
+=======================
+
+One very nice advantage over regular expressions is that ``scanf`` is
+extensible with ordinary Nim procs. The proc is either enclosed in ``${}``
+or in ``$[]``. ``${}`` matches and binds the result
+to a variable (that was passed to the ``scanf`` macro) while ``$[]`` merely
+skips optional tokens.
+
+
+In this example, we define a helper proc ``someSep`` that skips some separators
+which we then use in our scanf pattern to help us in the matching process:
+
+.. code-block:: nim
+
+  proc someSep(input: string; start: int; seps: set[char] = {':','-','.'}): int =
+    # Note: The parameters and return value must match to what ``scanf`` requires
+    result = 0
+    while input[start+result] in seps: inc result
+
+  if scanf("$w$[someSep]$w", input, key, value):
+    ...
+
+It is also possible to pass arguments to a user definable matcher:
+
+.. code-block:: nim
+
+  proc ndigits(input: string; intVal: var int; start: int; n: int): int =
+    # matches exactly ``n`` digits. Matchers need to return 0 if nothing
+    # matched or otherwise the number of processed chars.
+    var x = 0
+    var i = 0
+    while i < n and i+start < input.len and input[i+start] in {'0'..'9'}:
+      x = x * 10 + input[i+start].ord - '0'.ord
+      inc i
+    # only overwrite if we had a match
+    if i == n:
+      result = n
+      intVal = x
+
+  # match an ISO date extracting year, month, day at the same time.
+  # Also ensure the input ends after the ISO date:
+  var year, month, day: int
+  if scanf("${ndigits(4)}-${ndigits(2)}-${ndigits(2)}$.", "2013-01-03", year, month, day):
+    ...
+
+]##
+
+
+import macros, parseutils
+
+proc conditionsToIfChain(n, idx, res: NimNode; start: int): NimNode =
+  ## Turns the flat list ``n`` of (statement, condition, advance) triples,
+  ## beginning at index ``start``, into nested ``if`` statements.
+  ##
+  ## Each level first runs the triple's statement. If the condition is an
+  ## empty node the remaining triples follow unconditionally; otherwise they
+  ## are placed inside ``if condition:`` after ``inc(idx, advance)``. Once all
+  ## triples are consumed the innermost statement is ``res = true``.
+  assert n.kind == nnkStmtList
+  if start >= n.len:
+    result = newAssignment(res, newLit true)
+  else:
+    let stmt = n[start]
+    let cond = n[start+1]
+    let remainder = conditionsToIfChain(n, idx, res, start+3)
+    result = newTree(nnkStmtList, stmt)
+    if cond.kind == nnkEmpty:
+      # Unconditional step (e.g. a skipper): just continue with the rest.
+      result.add remainder
+    else:
+      let advance = newCall(bindSym"inc", idx, n[start+2])
+      result.add newIfStmt((cond, newTree(nnkStmtList, advance, remainder)))
+
+proc notZero(x: NimNode): NimNode =
+  ## Builds the AST of the comparison ``x != 0``.
+  result = newCall(bindSym"!=", x, newLit 0)
+
+proc buildUserCall(x: string; args: varargs[NimNode]): NimNode =
+  ## Parses the user supplied matcher text ``x`` and builds a call to it.
+  ##
+  ## The implicit arguments ``args`` are placed first; if ``x`` is itself a
+  ## call such as ``foo(4)``, its own arguments are appended after them.
+  let parsed = parseExpr(x)
+  let isCall = parsed.kind in nnkCallKinds
+  let callee = if isCall: parsed[0] else: parsed
+  result = newTree(nnkCall, callee)
+  for arg in args:
+    result.add arg
+  if isCall:
+    for k in 1..<parsed.len:
+      result.add parsed[k]
+
+macro scanf*(pattern: static[string]; input: string; results: varargs[typed]): bool =
+  ## See top level documentation of this module of how ``scanf`` works.
+  ##
+  ## ``pattern`` is walked at compile time. Every literal run and every ``$``
+  ## directive appends a (statement, condition, advance) triple to ``conds``;
+  ## ``conditionsToIfChain`` then nests the triples so that matching stops at
+  ## the first step that fails. The generated expression is ``true`` if every
+  ## step matched and, when the pattern ends in ``$.``, the whole input was
+  ## consumed.
+  template matchBind(parser) {.dirty.} =
+    # Emits ``let resLen = parser(inp, results[i], idx)`` and requires the
+    # parser to have consumed at least one char.
+    var resLen = genSym(nskLet, "resLen")
+    conds.add newLetStmt(resLen, newCall(bindSym(parser), inp, results[i], idx))
+    conds.add resLen.notZero
+    conds.add resLen
+
+  var i = 0  # index of the next unused entry of ``results``
+  var p = 0  # read position inside ``pattern``
+  var idx = genSym(nskVar, "idx")
+  var res = genSym(nskVar, "res")
+  # Bind ``input`` to a ``let`` so that the argument expression is evaluated
+  # once instead of once per generated call.
+  let inp = genSym(nskLet, "inp")
+  result = newTree(nnkStmtListExpr, newLetStmt(inp, input),
+                   newVarStmt(idx, newLit 0), newVarStmt(res, newLit false))
+  var conds = newTree(nnkStmtList)
+  var fullMatch = false
+  while p < pattern.len:
+    if pattern[p] == '$':
+      inc p
+      case pattern[p]
+      of '$':
+        var resLen = genSym(nskLet, "resLen")
+        conds.add newLetStmt(resLen, newCall(bindSym"skip", inp, newLit($pattern[p]), idx))
+        conds.add resLen.notZero
+        conds.add resLen
+      of 'w':
+        # A result variable must exist *and* have the right type.
+        if i < results.len and getType(results[i]).typeKind == ntyString:
+          matchBind "parseIdent"
+        else:
+          error("no string var given for $w")
+        inc i
+      of 'i':
+        if i < results.len and getType(results[i]).typeKind == ntyInt:
+          matchBind "parseInt"
+        else:
+          error("no int var given for $i")
+        inc i
+      of 'f':
+        if i < results.len and getType(results[i]).typeKind == ntyFloat:
+          matchBind "parseFloat"
+        else:
+          error("no float var given for $f")
+        inc i
+      of 's':
+        conds.add newCall(bindSym"inc", idx, newCall(bindSym"skipWhitespace", inp, idx))
+        conds.add newEmptyNode()
+        conds.add newEmptyNode()
+      of '.':
+        if p == pattern.len-1:
+          fullMatch = true
+        else:
+          error("invalid format string")
+      of '*', '+':
+        if i < results.len and getType(results[i]).typeKind == ntyString:
+          # ``$*`` may match nothing, ``$+`` needs at least one char.
+          var min = ord(pattern[p] == '+')
+          # The terminator is the literal text up to the next directive; it is
+          # only peeked at here and gets matched by the following iteration.
+          var q=p+1
+          var token = ""
+          while q < pattern.len and pattern[q] != '$':
+            token.add pattern[q]
+            inc q
+          var resLen = genSym(nskLet, "resLen")
+          conds.add newLetStmt(resLen, newCall(bindSym"parseUntil", inp, results[i], newLit(token), idx))
+          conds.add newCall(bindSym">=", resLen, newLit min)
+          conds.add resLen
+        else:
+          error("no string var given for $" & pattern[p])
+        inc i
+      of '{':
+        inc p
+        var nesting = 0
+        let start = p
+        # Find the matching '}' while honouring nested braces.
+        while true:
+          case pattern[p]
+          of '{': inc nesting
+          of '}':
+            if nesting == 0: break
+            dec nesting
+          of '\0': error("expected closing '}'")
+          else: discard
+          inc p
+        let expr = pattern.substr(start, p-1)
+        if i < results.len:
+          var resLen = genSym(nskLet, "resLen")
+          conds.add newLetStmt(resLen, buildUserCall(expr, inp, results[i], idx))
+          conds.add newCall(bindSym"!=", resLen, newLit 0)
+          conds.add resLen
+        else:
+          error("no var given for $" & expr)
+        inc i
+      of '[':
+        inc p
+        var nesting = 0
+        let start = p
+        # Find the matching ']' while honouring nested brackets.
+        while true:
+          case pattern[p]
+          of '[': inc nesting
+          of ']':
+            if nesting == 0: break
+            dec nesting
+          of '\0': error("expected closing ']'")
+          else: discard
+          inc p
+        let expr = pattern.substr(start, p-1)
+        # Skippers are optional: advance by whatever they return, no condition.
+        conds.add newCall(bindSym"inc", idx, buildUserCall(expr, inp, idx))
+        conds.add newEmptyNode()
+        conds.add newEmptyNode()
+      else: error("invalid format string")
+      inc p
+    else:
+      # Literal text up to the next directive must be present verbatim.
+      var token = ""
+      while p < pattern.len and pattern[p] != '$':
+        token.add pattern[p]
+        inc p
+      var resLen = genSym(nskLet, "resLen")
+      conds.add newLetStmt(resLen, newCall(bindSym"skip", inp, newLit(token), idx))
+      conds.add resLen.notZero
+      conds.add resLen
+  result.add conditionsToIfChain(conds, idx, res, 0)
+  if fullMatch:
+    # ``$.``: every step must have matched *and* the input must be exhausted.
+    result.add newCall(bindSym"and", res,
+                       newCall(bindSym">=", idx, newCall(bindSym"len", inp)))
+  else:
+    result.add res
+
+when isMainModule:
+  # Self-test, only compiled when this module is the main module.
+
+  # User matcher for ``${twoDigits}``: matches the literal "00" and binds the
+  # fixed value 13 to ``x``; returns the number of consumed chars (2 or 0).
+  proc twoDigits(input: string; x: var int; start: int): int =
+    if input[start] == '0' and input[start+1] == '0':
+      result = 2
+      x = 13
+    else:
+      result = 0
+
+  # Skipper for ``$[someSep]``: returns how many leading chars from ``start``
+  # are in ``seps``.
+  # NOTE(review): ``input[start+result]`` is read without a length check;
+  # presumably this relies on the string terminator -- confirm.
+  proc someSep(input: string; start: int; seps: set[char] = {';',',','-','.'}): int =
+    result = 0
+    while input[start+result] in seps: inc result
+
+  var key, val: string
+  var intval: int
+  var floatval: float
+  # Identifiers, optional whitespace, literal text, an int and a float.
+  doAssert scanf("$w$s::$s$w$s$i  $f", "abc:: xyz 89  33.25", key, val, intval, floatVal)
+  doAssert key == "abc"
+  doAssert val == "xyz"
+  doAssert intval == 89
+  doAssert floatVal == 33.25
+
+  # ``$$`` matches the dollar sign, but ``$i`` cannot match "abc".
+  let xx = scanf("$$$i", "$abc", intval)
+  doAssert xx == false
+
+
+  let xx2 = scanf("$$$i", "$1234", intval)
+  doAssert xx2
+
+  # User skippers (with and without arguments), a user matcher, ``$+`` and ``$.``.
+  let yy = scanf("$[someSep]Breakpoint${twoDigits}$[someSep({';','.','-'})] [$+]$.", ";.--Breakpoint00 [output]", intVal, key)
+  doAssert yy
+  doAssert key == "output"
+  doAssert intVal == 13
--- a/web/news.txt
+++ b/web/news.txt
@@ -45,6 +45,9 @@ Library Additions
 - The rlocks module has been added providing reentrant lock synchronization
  primitive.
 - A generic "sink operator" written as ``&=`` has been added to the ``system`` and the ``net`` modules.
+- Added ``strscans`` module that implements a ``scanf`` for easy input extraction.
+- Added a version of ``parseutils.parseUntil`` that can deal with a string ``until`` token. The other
+  versions are for ``char`` and ``set[char]``.


 Compiler Additions
@@ -62,6 +65,7 @@ Language Additions
 - Nim now supports ``partial`` object declarations to mitigate the problems
  that arise when types are mutually dependent and yet should be kept in
  different modules.
+- ``include`` statements are not restricted to top level statements anymore.


 2016-01-27 Nim in Action is now available!
--- a/web/website.ini
+++ b/web/website.ini
@@ -40,7 +40,7 @@ srcdoc2: "pure/concurrency/threadpool.nim;pure/concurrency/cpuinfo.nim"
 srcdoc: "system/threads.nim;system/channels.nim;js/dom"
 srcdoc2: "pure/os;pure/strutils;pure/math;pure/matchers;pure/algorithm"
 srcdoc2: "pure/stats;impure/nre;windows/winlean"
-srcdoc2: "pure/complex;pure/times;pure/osproc;pure/pegs;pure/dynlib"
+srcdoc2: "pure/complex;pure/times;pure/osproc;pure/pegs;pure/dynlib;pure/strscans"
 srcdoc2: "pure/parseopt;pure/parseopt2;pure/hashes;pure/strtabs;pure/lexbase"
 srcdoc2: "pure/parsecfg;pure/parsexml;pure/parsecsv;pure/parsesql"
 srcdoc2: "pure/streams;pure/terminal;pure/cgi;pure/unicode"