unicode: add split procs; refs #6301

2026-06-04 02:44:44 +00:00 · 2018-10-14 10:47:54 +02:00
parent 2818661ffe
commit 48d3b26c21
3 changed files with 474 additions and 0 deletions
--- a/changelog.md
+++ b/changelog.md
@@ -112,6 +112,9 @@
 - Added ``macros.copyLineInfo`` to copy lineInfo from other node.
 - Added ``system.ashr`` an arithmetic right shift for integers.

+- Added `split`, `splitWhitespace`, `size` procs and iterators
+  to `unicode.nim`.
+
 ### Library changes

 - ``macros.astGenRepr``, ``macros.lispRepr`` and ``macros.treeRepr``
--- a/lib/pure/editdistance.nim
+++ b/lib/pure/editdistance.nim
@@ -0,0 +1,295 @@
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2018 Nim contributors
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## This module implements an algorithm to compute the
+## `edit distance`:idx: between two Unicode strings.
+
+import unicode
+
+proc editDistance*(a, b: string): int {.noSideEffect.} =
+  ## Returns the edit distance between ``a`` and ``b``, counted in runes.
+  ##
+  ## This uses the `Levenshtein`:idx: distance algorithm and keeps only a
+  ## single row of the distance matrix in memory (``row``).
+  if len(a) > len(b):
+    # make ``b`` the longer string (``len`` compares byte lengths, not runes)
+    return editDistance(b, a)
+  # strip common prefix
+  var
+    i_start = 0 ## Byte index, valid for both ``a`` and ``b``, where the common prefix ends.
+    i_next_a = 0
+    i_next_b = 0
+    rune_a, rune_b: Rune
+    len_runes_a = 0 ## The number of relevant runes in string ``a``.
+    len_runes_b = 0 ## The number of relevant runes in string ``b``.
+  block commonPrefix:
+    # ``a`` is the shorter string (in bytes)
+    while i_start < len(a):
+      i_next_a = i_start
+      a.fastRuneAt(i_next_a, rune_a, doInc = true)
+      i_next_b = i_start
+      b.fastRuneAt(i_next_b, rune_b, doInc = true)
+      if rune_a != rune_b:
+        inc(len_runes_a)
+        inc(len_runes_b)
+        break
+      i_start = i_next_a
+  var
+    # we know that we are either at the start of the strings
+    # or that the current value of rune_a is not equal to rune_b
+    # => start search for common suffix after the current rune (``i_next_*``)
+    i_end_a = i_next_a ## The exclusive upper index bound of string ``a``.
+    i_end_b = i_next_b ## The exclusive upper index bound of string ``b``.
+    i_current_a = i_next_a
+    i_current_b = i_next_b
+  block commonSuffix:
+    var
+      add_runes_a = 0
+      add_runes_b = 0
+    while i_current_a < len(a) and i_current_b < len(b):
+      i_next_a = i_current_a
+      a.fastRuneAt(i_next_a, rune_a)
+      i_next_b = i_current_b
+      b.fastRuneAt(i_next_b, rune_b)
+      inc(add_runes_a)
+      inc(add_runes_b)
+      if rune_a != rune_b:
+        i_end_a = i_next_a
+        i_end_b = i_next_b
+        inc(len_runes_a, add_runes_a)
+        inc(len_runes_b, add_runes_b)
+        add_runes_a = 0
+        add_runes_b = 0
+      i_current_a = i_next_a
+      i_current_b = i_next_b
+    if i_current_a >= len(a): # ``a`` exhausted
+      if i_current_b < len(b): # ``b`` not exhausted
+        i_end_a = i_current_a
+        i_end_b = i_current_b
+        inc(len_runes_a, add_runes_a)
+        inc(len_runes_b, add_runes_b)
+        while true:
+          b.fastRuneAt(i_end_b, rune_b)
+          inc(len_runes_b)
+          if i_end_b >= len(b): break
+    elif i_current_b >= len(b): # ``b`` exhausted and ``a`` not exhausted
+      i_end_a = i_current_a
+      i_end_b = i_current_b
+      inc(len_runes_a, add_runes_a)
+      inc(len_runes_b, add_runes_b)
+      while true:
+        a.fastRuneAt(i_end_a, rune_a)
+        inc(len_runes_a)
+        if i_end_a >= len(a): break
+  block specialCases:
+    # trivial cases: one side has no relevant runes left
+    if len_runes_a == 0: return len_runes_b
+    if len_runes_b == 0: return len_runes_a
+    # another special case: a single relevant rune in ``a``
+    if len_runes_a == 1:
+      a.fastRuneAt(i_start, rune_a, doInc = false)
+      var i_current_b = i_start
+      while i_current_b < i_end_b:
+        b.fastRuneAt(i_current_b, rune_b, doInc = true)
+        if rune_a == rune_b: return len_runes_b - 1
+      return len_runes_b
+  # common case: fill the distance matrix row by row, reusing ``row``
+  var
+    len1 = len_runes_a + 1
+    len2 = len_runes_b + 1
+    row: seq[int]
+  let half = len_runes_a div 2
+  newSeq(row, len2)
+  var e = i_start + len2 - 1 # end marker
+  # initialize first row:
+  for i in 1 .. (len2 - half - 1): row[i] = i
+  row[0] = len1 - half - 1
+  i_current_a = i_start
+  var
+    char2p_i = -1
+    char2p_prev: int
+  for i in 1 .. (len1 - 1):
+    i_next_a = i_current_a
+    a.fastRuneAt(i_next_a, rune_a)
+    var
+      char2p: int
+      D, x: int
+      p: int
+    if i >= (len1 - half):
+      # skip the upper triangle:
+      let offset = i + half - len1
+      if char2p_i == i:
+        b.fastRuneAt(char2p_prev, rune_b)
+        char2p = char2p_prev
+        char2p_i = i + 1
+      else:
+        char2p = i_start
+        for j in 0 ..< offset:
+          rune_b = b.runeAt(char2p)
+          inc(char2p, rune_b.size)
+        char2p_i = i + 1
+        char2p_prev = char2p
+      p = offset
+      rune_b = b.runeAt(char2p)
+      var c3 = row[p] + (if rune_a != rune_b: 1 else: 0)
+      inc(char2p, rune_b.size)
+      inc(p)
+      x = row[p] + 1
+      D = x
+      if x > c3: x = c3
+      row[p] = x
+      inc(p)
+    else:
+      p = 1
+      char2p = i_start
+      D = i
+      x = i
+    if i <= (half + 1):
+      # skip the lower triangle:
+      e = len2 + i - half - 2
+    # main:
+    while p <= e:
+      dec(D)
+      rune_b = b.runeAt(char2p)
+      var c3 = D + (if rune_a != rune_b: 1 else: 0)
+      inc(char2p, rune_b.size)
+      inc(x)
+      if x > c3: x = c3
+      D = row[p] + 1
+      if x > D: x = D
+      row[p] = x
+      inc(p)
+    # lower triangle sentinel:
+    if i <= half:
+      dec(D)
+      rune_b = b.runeAt(char2p)
+      var c3 = D + (if rune_a != rune_b: 1 else: 0)
+      inc(x)
+      if x > c3: x = c3
+      row[p] = x
+    i_current_a = i_next_a
+  result = row[e]
+
+proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
+  ## Returns the edit distance between `a` and `b`, compared ``char`` by ``char``.
+  ##
+  ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
+  ## memory overhead.
+  var len1 = a.len
+  var len2 = b.len
+  if len1 > len2:
+    # make `b` the longer string
+    return editDistanceAscii(b, a)
+
+  # strip common prefix:
+  var s = 0
+  while s < len1 and a[s] == b[s]:
+    inc(s)
+    dec(len1)
+    dec(len2)
+  # strip common suffix:
+  while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
+    dec(len1)
+    dec(len2)
+  # trivial cases: one side has nothing left after stripping
+  if len1 == 0: return len2
+  if len2 == 0: return len1
+
+  # another special case: a single remaining char in `a`
+  if len1 == 1:
+    for j in s..s+len2-1:
+      if a[s] == b[j]: return len2 - 1
+    return len2
+
+  inc(len1)
+  inc(len2)
+  var half = len1 shr 1
+  # initialize first row:
+  #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int)))
+  var row: seq[int]
+  newSeq(row, len2)
+  var e = s + len2 - 1 # end marker
+  for i in 1..len2 - half - 1: row[i] = i
+  row[0] = len1 - half - 1
+  for i in 1 .. len1 - 1:
+    var char1 = a[i + s - 1]
+    var char2p: int
+    var D, x: int
+    var p: int
+    if i >= len1 - half:
+      # skip the upper triangle:
+      var offset = i - len1 + half
+      char2p = offset
+      p = offset
+      var c3 = row[p] + ord(char1 != b[s + char2p])
+      inc(p)
+      inc(char2p)
+      x = row[p] + 1
+      D = x
+      if x > c3: x = c3
+      row[p] = x
+      inc(p)
+    else:
+      p = 1
+      char2p = 0
+      D = i
+      x = i
+    if i <= half + 1:
+      # skip the lower triangle:
+      e = len2 + i - half - 2
+    # main:
+    while p <= e:
+      dec(D)
+      var c3 = D + ord(char1 != b[char2p + s])
+      inc(char2p)
+      inc(x)
+      if x > c3: x = c3
+      D = row[p] + 1
+      if x > D: x = D
+      row[p] = x
+      inc(p)
+    # lower triangle sentinel:
+    if i <= half:
+      dec(D)
+      var c3 = D + ord(char1 != b[char2p + s])
+      inc(x)
+      if x > c3: x = c3
+      row[p] = x
+  result = row[e]
+
+
+when isMainModule:
+  doAssert editDistance("", "") == 0
+  doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia
+  doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia
+
+  doAssert editDistance("привет", "превет") == 1
+  doAssert editDistance("Åge", "Age") == 1
+  # editDistance, one string is longer in bytes, but shorter in rune length
+  # first string: 4 bytes, second: 6 bytes, but only 3 runes
+  doAssert editDistance("aaaa", "×××") == 4
+
+  block veryLongStringEditDistanceTest:
+    const cap = 256
+    var
+      s1 = newStringOfCap(cap)
+      s2 = newStringOfCap(cap)
+    while len(s1) < cap:
+      s1.add 'a'
+    while len(s2) < cap:
+      s2.add 'b'
+    doAssert editDistance(s1, s2) == cap
+
+  block combiningCodePointsEditDistanceTest:
+    const s = "A\xCC\x8Age"
+    doAssert editDistance(s, "Age") == 1
+
+  doAssert editDistanceAscii("", "") == 0
+  doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia
+  doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -524,6 +524,22 @@ const
    0x3000,  0x3000,  # ideographic space
    0xfeff,  0xfeff]  #

+  unicodeSpaces = [
+    Rune 0x0009, # tab
+    Rune 0x000a, # LF
+    Rune 0x000d, # CR
+    Rune 0x0020, # space
+    Rune 0x0085, # next line
+    Rune 0x00a0, # no-break space
+    Rune 0x1680, # Ogham space mark
+    Rune 0x2000, # en quad; NOTE(review): U+2001..U+200B are not listed - confirm
+    Rune 0x200e, Rune 0x200f,  # LTR mark .. RTL mark (pattern whitespace)
+    Rune 0x2028, Rune 0x2029,  # line separator .. paragraph separator
+    Rune 0x202f, # narrow no-break space
+    Rune 0x205f, # medium mathematical space
+    Rune 0x3000, # ideographic space
+    Rune 0xfeff] # zero width no-break space (byte order mark)
+
  toupperRanges = [
    0x0061,  0x007a, 468,  # a-z A-Z
    0x00e0,  0x00f6, 468,  # - -
@@ -1733,7 +1749,157 @@ proc lastRune*(s: string; last: int): (Rune, int) =
    fastRuneAt(s, last-L, r, false)
    result = (r, L+1)

+proc size*(r: Rune): int {.noSideEffect.} =
+  ## Yields how many bytes the rune ``r`` occupies.
+  let code = uint32(r)
+  if code <= 0x007F: 1
+  elif code <= 0x07FF: 2
+  elif code <= 0xFFFF: 3
+  elif code <= 0x1FFFFF: 4
+  elif code <= 0x3FFFFFF: 5
+  elif code <= 0x7FFFFFFF: 6
+  else: 1 # values above 0x7FFFFFFF are reported as one byte
+
+# --------- Private helpers for the different split separators -----------
+proc stringHasSep(s: string, index: int, seps: openarray[Rune]): bool =
+  var decoded: Rune # the rune of `s` that starts at byte `index`
+  fastRuneAt(s, index, decoded, false)
+  result = decoded in seps
+
+proc stringHasSep(s: string, index: int, sep: Rune): bool =
+  var decoded: Rune # the rune of `s` that starts at byte `index`
+  fastRuneAt(s, index, decoded, false)
+  result = decoded == sep
+
+template splitCommon(s, sep, maxsplit: untyped, sepLen: int = -1) =
+  ## Common code for split procedures: yields the pieces of `s` delimited by
+  ## `sep`, performing at most `maxsplit` splits (negative means no limit).
+  ## `sepLen` is only used to step over a single-`Rune` separator once matched.
+  var
+    last = 0
+    splits = maxsplit
+  if len(s) > 0:
+    while last <= len(s):
+      var first = last
+      # Step by the size of the rune under the cursor, not by `sepLen`.
+      while last < len(s) and not stringHasSep(s, last, sep):
+        inc(last, runeLenAt(s, last))
+      if splits == 0: last = len(s)
+      yield s[first .. (last - 1)]
+      if splits == 0: break
+      dec(splits)
+      when sep is Rune:
+        inc(last, sepLen)
+      else:
+        inc(last, if last < len(s): runeLenAt(s, last) else: 1)
+
+iterator split*(s: string, seps: openarray[Rune] = unicodeSpaces,
+  maxsplit: int = -1): string =
+  ## Splits the unicode string `s` into substrings using a group of separators.
+  ##
+  ## Every rune contained in `seps` acts as a separator on its own.
+  ##
+  ## .. code-block:: nim
+  ##   for word in split("this\lis an\texample"):
+  ##     writeLine(stdout, word)
+  ##
+  ## ...generates this output:
+  ##
+  ## .. code-block::
+  ##   "this"
+  ##   "is"
+  ##   "an"
+  ##   "example"
+  ##
+  ## And the following code:
+  ##
+  ## .. code-block:: nim
+  ##   for word in split("this:is;an$example", [';'.Rune, ':'.Rune, '$'.Rune]):
+  ##     writeLine(stdout, word)
+  ##
+  ## ...produces the same output as the first example. The code:
+  ##
+  ## .. code-block:: nim
+  ##   let date = "2012-11-20T22:08:08.398990"
+  ##   let separators = [' '.Rune, '-'.Rune, ':'.Rune, 'T'.Rune]
+  ##   for number in split(date, separators):
+  ##     writeLine(stdout, number)
+  ##
+  ## ...results in:
+  ##
+  ## .. code-block::
+  ##   "2012"
+  ##   "11"
+  ##   "20"
+  ##   "22"
+  ##   "08"
+  ##   "08.398990"
+  ##
+  splitCommon(s, seps, maxsplit)
+
+iterator splitWhitespace*(s: string): string =
+  ## Splits the unicode string `s` at every rune listed in `unicodeSpaces`.
+  splitCommon(s, unicodeSpaces, -1)
+
+template accResult(iter: untyped) =
+  result = @[] # `result` is the implicit result of the calling proc
+  for piece in iter: result.add(piece)
+
+proc splitWhitespace*(s: string): seq[string] {.noSideEffect,
+  rtl, extern: "ncuSplitWhitespace".} =
+  ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_
+  ## iterator, but is a proc that returns a sequence of substrings.
+  accResult(splitWhitespace(s))
+
+iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
+  ## Splits the unicode string `s` into substrings using a single separator.
+  ##
+  ## Substrings are separated by the rune `sep`.
+  ## The code:
+  ##
+  ## .. code-block:: nim
+  ##   for word in split(";;this;is;an;;example;;;", ';'.Rune):
+  ##     writeLine(stdout, word)
+  ##
+  ## Results in:
+  ##
+  ## .. code-block::
+  ##   ""
+  ##   ""
+  ##   "this"
+  ##   "is"
+  ##   "an"
+  ##   ""
+  ##   "example"
+  ##   ""
+  ##   ""
+  ##   ""
+  ##
+  splitCommon(s, sep, maxsplit, sep.size)
+
+proc split*(s: string, seps: openarray[Rune] = unicodeSpaces, maxsplit: int = -1): seq[string] {.
+  noSideEffect, rtl, extern: "nucSplitRunes".} =
+  ## The same as the `split iterator <#split.i,string,openarray[Rune]>`_, but is a
+  ## proc that returns a sequence of substrings.
+  accResult(split(s, seps, maxsplit))
+
+proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
+  rtl, extern: "nucSplitRune".} =
+  ## The same as the `split iterator <#split.i,string,Rune>`_, but is a proc
+  ## that returns a sequence of substrings.
+  accResult(split(s, sep, maxsplit))
+
 when isMainModule:
+
+  proc asRune(s: static[string]): Rune =
+    ## Compile-time conversion proc for converting string literals to a Rune
+    ## value. Returns the first Rune of the specified string.
+    ##
+    ## Shortcuts code like ``"å".runeAt(0)`` to ``"å".asRune`` and returns a
+    ## compile-time constant.
+    if s.len == 0: Rune(0)
+    else: s.runeAt(0)
+
  let
    someString = "öÑ"
    someRunes = @[runeAt(someString, 0), runeAt(someString, 2)]
@@ -1898,3 +2064,13 @@ when isMainModule:
  doAssert(runeSubStr(s, -100, 100) == "Hänsel  ««: 10,00€")
  doAssert(runeSubStr(s, 0, -100) == "")
  doAssert(runeSubStr(s, 100, -100) == "")
+
+  block splitTests:
+    let s = " this is an example  "
+    let s2 = ":this;is;an:example;;"
+    let s3 = ":this×is×an:example××"
+    doAssert s.split() == @["", "this", "is", "an", "example", "", ""]
+    doAssert s2.split(seps = [':'.Rune, ';'.Rune]) == @["", "this", "is", "an", "example", "", ""]
+    doAssert s3.split(seps = [':'.Rune, "×".asRune]) == @["", "this", "is", "an", "example", "", ""]
+    doAssert s.split(maxsplit = 4) == @["", "this", "is", "an", "example  "]
+    doAssert s.split(' '.Rune, maxsplit = 1) == @["", "this is an example  "]