From 48d3b26c218c3a6b041e347782d993c8fb0fc27d Mon Sep 17 00:00:00 2001 From: Andreas Rumpf Date: Sun, 14 Oct 2018 10:47:54 +0200 Subject: [PATCH] unicode: add split procs; refs #6301 --- changelog.md | 3 + lib/pure/editdistance.nim | 295 ++++++++++++++++++++++++++++++++++++++ lib/pure/unicode.nim | 176 +++++++++++++++++++++++ 3 files changed, 474 insertions(+) create mode 100644 lib/pure/editdistance.nim diff --git a/changelog.md b/changelog.md index 7464f50b6b..3204244ab7 100644 --- a/changelog.md +++ b/changelog.md @@ -112,6 +112,9 @@ - Added ``macros.copyLineInfo`` to copy lineInfo from other node. - Added ``system.ashr`` an arithmetic right shift for integers. +- Added `split`, `splitWhitespace`, `size` procs and iterators + to `unicode.nim`. + ### Library changes - ``macros.astGenRepr``, ``macros.lispRepr`` and ``macros.treeRepr`` diff --git a/lib/pure/editdistance.nim b/lib/pure/editdistance.nim new file mode 100644 index 0000000000..40beb7d93b --- /dev/null +++ b/lib/pure/editdistance.nim @@ -0,0 +1,295 @@ +# +# +# Nim's Runtime Library +# (c) Copyright 2018 Nim contributors +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module implements an algorithm to compute the +## `edit distance`:idx: between two Unicode strings. + +import unicode + +proc editDistance*(a, b: string): int {.noSideEffect.} = + ## Returns the unicode-rune edit distance between ``a`` and ``b``. + ## + ## This uses the `Levenshtein`:idx: distance algorithm with only a linear + ## memory overhead. + if len(a) > len(b): + # make ``b`` the longer string (``len`` compares byte lengths, not rune counts) + return editDistance(b, a) + # strip common prefix + var + i_start = 0 ## The character starting index of the first rune in both strings ``a`` and ``b`` + i_next_a = 0 + i_next_b = 0 + rune_a, rune_b: Rune + len_runes_a = 0 ## The number of relevant runes in string ``a``. + len_runes_b = 0 ## The number of relevant runes in string ``b``.
+ block commonPrefix: + # ``a`` is the shorter string (in bytes) + while i_start < len(a): + i_next_a = i_start + a.fastRuneAt(i_next_a, rune_a, doInc = true) + i_next_b = i_start + b.fastRuneAt(i_next_b, rune_b, doInc = true) + if rune_a != rune_b: + inc(len_runes_a) + inc(len_runes_b) + break + i_start = i_next_a + var + # we know that we are either at the start of the strings + # or that the current value of rune_a is not equal to rune_b + # => start search for common suffix after the current rune (``i_next_*``) + i_end_a = i_next_a ## The exclusive upper index bound of string ``a``. + i_end_b = i_next_b ## The exclusive upper index bound of string ``b``. + i_current_a = i_next_a + i_current_b = i_next_b + block commonSuffix: + var + add_runes_a = 0 + add_runes_b = 0 + while i_current_a < len(a) and i_current_b < len(b): + i_next_a = i_current_a + a.fastRuneAt(i_next_a, rune_a) + i_next_b = i_current_b + b.fastRuneAt(i_next_b, rune_b) + inc(add_runes_a) + inc(add_runes_b) + if rune_a != rune_b: + i_end_a = i_next_a + i_end_b = i_next_b + inc(len_runes_a, add_runes_a) + inc(len_runes_b, add_runes_b) + add_runes_a = 0 + add_runes_b = 0 + i_current_a = i_next_a + i_current_b = i_next_b + if i_current_a >= len(a): # ``a`` exhausted + if i_current_b < len(b): # ``b`` not exhausted + i_end_a = i_current_a + i_end_b = i_current_b + inc(len_runes_a, add_runes_a) + inc(len_runes_b, add_runes_b) + while true: + b.fastRuneAt(i_end_b, rune_b) + inc(len_runes_b) + if i_end_b >= len(b): break + elif i_current_b >= len(b): # ``b`` exhausted and ``a`` not exhausted + i_end_a = i_current_a + i_end_b = i_current_b + inc(len_runes_a, add_runes_a) + inc(len_runes_b, add_runes_b) + while true: + a.fastRuneAt(i_end_a, rune_a) + inc(len_runes_a) + if i_end_a >= len(a): break + block specialCases: + # trivial cases: + if len_runes_a == 0: return len_runes_b + if len_runes_b == 0: return len_runes_a + # another special case: ``a`` has exactly one relevant rune + if len_runes_a == 1: + a.fastRuneAt(i_start, rune_a, doInc = false) +
+ var i_current_b = i_start + while i_current_b < i_end_b: + b.fastRuneAt(i_current_b, rune_b, doInc = true) + if rune_a == rune_b: return len_runes_b - 1 + return len_runes_b + # common case: + var + len1 = len_runes_a + 1 + len2 = len_runes_b + 1 + row: seq[int] + let half = len_runes_a div 2 + newSeq(row, len2) + var e = i_start + len2 - 1 # end marker + # initialize first row: + for i in 1 .. (len2 - half - 1): row[i] = i + row[0] = len1 - half - 1 + i_current_a = i_start + var + char2p_i = -1 + char2p_prev: int + for i in 1 .. (len1 - 1): + i_next_a = i_current_a + a.fastRuneAt(i_next_a, rune_a) + var + char2p: int + D, x: int + p: int + if i >= (len1 - half): + # skip the upper triangle: + let offset = i + half - len1 + if char2p_i == i: + b.fastRuneAt(char2p_prev, rune_b) + char2p = char2p_prev + char2p_i = i + 1 + else: + char2p = i_start + for j in 0 ..< offset: + rune_b = b.runeAt(char2p) + inc(char2p, rune_b.size) + char2p_i = i + 1 + char2p_prev = char2p + p = offset + rune_b = b.runeAt(char2p) + var c3 = row[p] + (if rune_a != rune_b: 1 else: 0) + inc(char2p, rune_b.size) + inc(p) + x = row[p] + 1 + D = x + if x > c3: x = c3 + row[p] = x + inc(p) + else: + p = 1 + char2p = i_start + D = i + x = i + if i <= (half + 1): + # skip the lower triangle: + e = len2 + i - half - 2 + # main: + while p <= e: + dec(D) + rune_b = b.runeAt(char2p) + var c3 = D + (if rune_a != rune_b: 1 else: 0) + inc(char2p, rune_b.size) + inc(x) + if x > c3: x = c3 + D = row[p] + 1 + if x > D: x = D + row[p] = x + inc(p) + # lower triangle sentinel: + if i <= half: + dec(D) + rune_b = b.runeAt(char2p) + var c3 = D + (if rune_a != rune_b: 1 else: 0) + inc(x) + if x > c3: x = c3 + row[p] = x + i_current_a = i_next_a + result = row[e] + +proc editDistanceAscii*(a, b: string): int {.noSideEffect.} = + ## Returns the edit distance between `a` and `b`, comparing them byte by byte. + ## + ## This uses the `Levenshtein`:idx: distance algorithm with only a linear + ## memory overhead.
+ var len1 = a.len + var len2 = b.len + if len1 > len2: + # make `b` the longer string + return editDistanceAscii(b, a) + + # strip common prefix: + var s = 0 + while s < len1 and a[s] == b[s]: + inc(s) + dec(len1) + dec(len2) + # strip common suffix: + while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]: + dec(len1) + dec(len2) + # trivial cases: + if len1 == 0: return len2 + if len2 == 0: return len1 + + # another special case: only one byte of `a` is left to compare + if len1 == 1: + for j in s..s+len2-1: + if a[s] == b[j]: return len2 - 1 + return len2 + + inc(len1) + inc(len2) + var half = len1 shr 1 + # initialize first row: + #var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int))) + var row: seq[int] + newSeq(row, len2) + var e = s + len2 - 1 # end marker + for i in 1..len2 - half - 1: row[i] = i + row[0] = len1 - half - 1 + for i in 1 .. len1 - 1: + var char1 = a[i + s - 1] + var char2p: int + var D, x: int + var p: int + if i >= len1 - half: + # skip the upper triangle: + var offset = i - len1 + half + char2p = offset + p = offset + var c3 = row[p] + ord(char1 != b[s + char2p]) + inc(p) + inc(char2p) + x = row[p] + 1 + D = x + if x > c3: x = c3 + row[p] = x + inc(p) + else: + p = 1 + char2p = 0 + D = i + x = i + if i <= half + 1: + # skip the lower triangle: + e = len2 + i - half - 2 + # main: + while p <= e: + dec(D) + var c3 = D + ord(char1 != b[char2p + s]) + inc(char2p) + inc(x) + if x > c3: x = c3 + D = row[p] + 1 + if x > D: x = D + row[p] = x + inc(p) + # lower triangle sentinel: + if i <= half: + dec(D) + var c3 = D + ord(char1 != b[char2p + s]) + inc(x) + if x > c3: x = c3 + row[p] = x + result = row[e] + + +when isMainModule: + doAssert editDistance("", "") == 0 + doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia + doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia + + doAssert editDistance("привет", "превет") == 1 + doAssert editDistance("Åge", "Age") == 1 + # editDistance, one string is longer in bytes, but shorter in rune length + #
first string: 4 bytes, second: 6 bytes, but only 3 runes + doAssert editDistance("aaaa", "×××") == 4 + + block veryLongStringEditDistanceTest: + const cap = 256 + var + s1 = newStringOfCap(cap) + s2 = newStringOfCap(cap) + while len(s1) < cap: + s1.add 'a' + while len(s2) < cap: + s2.add 'b' + doAssert editDistance(s1, s2) == cap + + block combiningCodePointsEditDistanceTest: + const s = "A\xCC\x8Age" + doAssert editDistance(s, "Age") == 1 + + doAssert editDistanceAscii("", "") == 0 + doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia + doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index 978f569ac6..58855c165d 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -524,6 +524,22 @@ const 0x3000, 0x3000, # ideographic space 0xfeff, 0xfeff] # + unicodeSpaces = [ + Rune 0x0009, # tab + Rune 0x000a, # LF + Rune 0x000d, # CR + Rune 0x0020, # space + Rune 0x0085, # next line + Rune 0x00a0, # no-break space + Rune 0x1680, # Ogham space mark + Rune 0x2000, # en quad (0x2001 .. 0x200b are not listed here) + Rune 0x200e, Rune 0x200f, # LTR mark .. RTL mark (pattern whitespace) + Rune 0x2028, Rune 0x2029, # line separator, paragraph separator + Rune 0x202f, # narrow no-break space + Rune 0x205f, # medium mathematical space + Rune 0x3000, # ideographic space + Rune 0xfeff] # zero-width no-break space (byte order mark) + toupperRanges = [ 0x0061, 0x007a, 468, # a-z A-Z 0x00e0, 0x00f6, 468, # - - @@ -1733,7 +1749,157 @@ proc lastRune*(s: string; last: int): (Rune, int) = fastRuneAt(s, last-L, r, false) result = (r, L+1) +proc size*(r: Rune): int {.noSideEffect.} = + ## Returns the number of bytes the rune ``r`` takes in its UTF-8 encoding.
+ let v = r.uint32 + if v <= 0x007F: result = 1 + elif v <= 0x07FF: result = 2 + elif v <= 0xFFFF: result = 3 + elif v <= 0x1FFFFF: result = 4 + elif v <= 0x3FFFFFF: result = 5 + elif v <= 0x7FFFFFFF: result = 6 + else: result = 1 + +# --------- Private helpers for different split separators ----------- +proc stringHasSep(s: string, index: int, seps: openarray[Rune]): bool = + var rune: Rune + fastRuneAt(s, index, rune, false) + return seps.contains(rune) + +proc stringHasSep(s: string, index: int, sep: Rune): bool = + var rune: Rune + fastRuneAt(s, index, rune, false) + return sep == rune + +template splitCommon(s, sep, maxsplit: untyped, sepLen: int = -1) = + ## Common code for the split iterators: yields the pieces of `s` between separators, performing at most `maxsplit` splits (no limit when negative). `sepLen` is the byte length of `sep` when `sep` is a single Rune. + var + last = 0 + splits = maxsplit + if len(s) > 0: + while last <= len(s): + var first = last + while last < len(s) and not stringHasSep(s, last, sep): + # Step over the rune actually found at `last`: a non-separator rune + # may be wider or narrower than `sep`, so advancing by `sepLen` here + # could stop in the middle of a multi-byte rune and miss separators. + inc(last, runeLenAt(s, last)) + if splits == 0: last = len(s) + yield s[first .. (last - 1)] + if splits == 0: break + dec(splits) + when sep is Rune: + inc(last, sepLen) + else: + inc(last, if last < len(s): runeLenAt(s, last) else: 1) + +iterator split*(s: string, seps: openarray[Rune] = unicodeSpaces, + maxsplit: int = -1): string = + ## Splits the unicode string `s` into substrings using a group of separators. + ## + ## Each rune in `seps` is a separator on its own; adjacent separators yield empty substrings. + ## + ## .. code-block:: nim + ## for word in split("this\lis an\texample"): + ## writeLine(stdout, word) + ## + ## ...generates this output: + ## + ## .. code-block:: + ## "this" + ## "is" + ## "an" + ## "example" + ## + ## And the following code: + ## + ## .. code-block:: nim + ## for word in split("this:is;an$example", [';'.Rune, ':'.Rune, '$'.Rune]): + ## writeLine(stdout, word) + ## + ## ...produces the same output as the first example. The code: + ## + ## ..
code-block:: nim + ## let date = "2012-11-20T22:08:08.398990" + ## let separators = [' '.Rune, '-'.Rune, ':'.Rune, 'T'.Rune] + ## for number in split(date, separators): + ## writeLine(stdout, number) + ## + ## ...results in: + ## + ## .. code-block:: + ## "2012" + ## "11" + ## "20" + ## "22" + ## "08" + ## "08.398990" + ## + splitCommon(s, seps, maxsplit) + +iterator splitWhitespace*(s: string): string = + ## Splits a unicode string at every whitespace rune; runs of whitespace yield empty substrings. + splitCommon(s, unicodeSpaces, -1) + +template accResult(iter: untyped) = + result = @[] + for x in iter: add(result, x) + +proc splitWhitespace*(s: string): seq[string] {.noSideEffect, + rtl, extern: "ncuSplitWhitespace".} = + ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_ + ## iterator, but is a proc that returns a sequence of substrings. + accResult(splitWhitespace(s)) + +iterator split*(s: string, sep: Rune, maxsplit: int = -1): string = + ## Splits the unicode string `s` into substrings using a single separator. + ## + ## Substrings are separated by the rune `sep`. + ## The code: + ## + ## .. code-block:: nim + ## for word in split(";;this;is;an;;example;;;", ';'.Rune): + ## writeLine(stdout, word) + ## + ## Results in: + ## + ## .. code-block:: + ## "" + ## "" + ## "this" + ## "is" + ## "an" + ## "" + ## "example" + ## "" + ## "" + ## "" + ## + splitCommon(s, sep, maxsplit, sep.size) + +proc split*(s: string, seps: openarray[Rune] = unicodeSpaces, maxsplit: int = -1): seq[string] {. + noSideEffect, rtl, extern: "nucSplitRunes".} = + ## The same as the `split iterator <#split.i,string,openarray[Rune]>`_, but is a + ## proc that returns a sequence of substrings. + accResult(split(s, seps, maxsplit)) + +proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, + rtl, extern: "nucSplitRune".} = + ## The same as the `split iterator <#split.i,string,Rune>`_, but is a proc + ## that returns a sequence of substrings.
+ accResult(split(s, sep, maxsplit)) + when isMainModule: + + proc asRune(s: static[string]): Rune = + ## Compile-time conversion proc for converting string literals to a Rune + ## value. Returns the first Rune of the specified string, or ``Rune(0)`` if it is empty. + ## + ## Shortcuts code like ``"å".runeAt(0)`` to ``"å".asRune`` and returns a + ## compile-time constant. + if s.len == 0: Rune(0) + else: s.runeAt(0) + let someString = "öÑ" someRunes = @[runeAt(someString, 0), runeAt(someString, 2)] @@ -1898,3 +2064,13 @@ when isMainModule: doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€") doAssert(runeSubStr(s, 0, -100) == "") doAssert(runeSubStr(s, 100, -100) == "") + + block splitTests: + let s = " this is an example " + let s2 = ":this;is;an:example;;" + let s3 = ":this×is×an:example××" + doAssert s.split() == @["", "this", "is", "an", "example", "", ""] + doAssert s2.split(seps = [':'.Rune, ';'.Rune]) == @["", "this", "is", "an", "example", "", ""] + doAssert s3.split(seps = [':'.Rune, "×".asRune]) == @["", "this", "is", "an", "example", "", ""] + doAssert s.split(maxsplit = 4) == @["", "this", "is", "an", "example "] + doAssert s.split(' '.Rune, maxsplit = 1) == @["", "this is an example "]