unicode: add split procs; refs #6301

This commit is contained in:
Andreas Rumpf
2018-10-14 10:47:54 +02:00
committed by narimiran
parent 2818661ffe
commit 48d3b26c21
3 changed files with 474 additions and 0 deletions

View File

@@ -112,6 +112,9 @@
- Added ``macros.copyLineInfo`` to copy lineInfo from other node.
- Added ``system.ashr`` an arithmetic right shift for integers.
- Added `split`, `splitWhitespace`, `size` procs and iterators
to `unicode.nim`.
### Library changes
- ``macros.astGenRepr``, ``macros.lispRepr`` and ``macros.treeRepr``

295
lib/pure/editdistance.nim Normal file
View File

@@ -0,0 +1,295 @@
#
#
# Nim's Runtime Library
# (c) Copyright 2018 Nim contributors
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## This module implements an algorithm to compute the
## `edit distance`:idx: between two Unicode strings.
import unicode
proc editDistance*(a, b: string): int {.noSideEffect.} =
  ## Returns the unicode-rune edit distance between ``a`` and ``b``.
  ##
  ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
  ## memory overhead. Both strings are decoded rune by rune, so a multi-byte
  ## rune counts as a single edit.
  if runeLen(a) > runeLen(b):
    # Make ``b`` the string with more runes. The banded row update below
    # indexes ``row`` (sized by the runes of ``b``) on the assumption that
    # ``a`` has at most as many runes as ``b``. Byte lengths are not a
    # substitute: "aaaaaa" and "€€" have the same byte length but 6 vs. 2
    # runes.
    return editDistance(b, a)
  # strip common prefix
  var
    iStart = 0 ## Byte index at which the first relevant rune starts in
               ## both ``a`` and ``b`` (the shared prefix has equal bytes).
    iNextA = 0
    iNextB = 0
    runeA, runeB: Rune
    lenRunesA = 0 ## The number of relevant runes in string ``a``.
    lenRunesB = 0 ## The number of relevant runes in string ``b``.
  block commonPrefix:
    # ``a`` has at most as many runes as ``b`` but may be longer in bytes,
    # hence both byte lengths bound the scan.
    while iStart < len(a) and iStart < len(b):
      iNextA = iStart
      a.fastRuneAt(iNextA, runeA, doInc = true)
      iNextB = iStart
      b.fastRuneAt(iNextB, runeB, doInc = true)
      if runeA != runeB:
        # the differing rune itself is part of the relevant range
        inc(lenRunesA)
        inc(lenRunesB)
        break
      iStart = iNextA
  var
    # we know that we are either at the start of the strings
    # or that the current value of runeA is not equal to runeB
    # => start search for common suffix after the current rune (``iNext*``)
    iEndA = iNextA ## The exclusive upper index bound of string ``a``.
    iEndB = iNextB ## The exclusive upper index bound of string ``b``.
    iCurrentA = iNextA
    iCurrentB = iNextB
  block commonSuffix:
    # Walk both strings in lock step. Runes seen since the last mismatch are
    # only pending (``addRunes*``); they become relevant once another
    # mismatch follows, otherwise they form a shared tail and are dropped.
    var
      addRunesA = 0
      addRunesB = 0
    while iCurrentA < len(a) and iCurrentB < len(b):
      iNextA = iCurrentA
      a.fastRuneAt(iNextA, runeA)
      iNextB = iCurrentB
      b.fastRuneAt(iNextB, runeB)
      inc(addRunesA)
      inc(addRunesB)
      if runeA != runeB:
        iEndA = iNextA
        iEndB = iNextB
        inc(lenRunesA, addRunesA)
        inc(lenRunesB, addRunesB)
        addRunesA = 0
        addRunesB = 0
      iCurrentA = iNextA
      iCurrentB = iNextB
    if iCurrentA >= len(a): # ``a`` exhausted
      if iCurrentB < len(b): # ``b`` not exhausted
        # the pending runes are not a shared tail after all; count them and
        # every remaining rune of ``b``
        iEndA = iCurrentA
        iEndB = iCurrentB
        inc(lenRunesA, addRunesA)
        inc(lenRunesB, addRunesB)
        while true:
          b.fastRuneAt(iEndB, runeB)
          inc(lenRunesB)
          if iEndB >= len(b): break
    elif iCurrentB >= len(b): # ``b`` exhausted and ``a`` not exhausted
      iEndA = iCurrentA
      iEndB = iCurrentB
      inc(lenRunesA, addRunesA)
      inc(lenRunesB, addRunesB)
      while true:
        a.fastRuneAt(iEndA, runeA)
        inc(lenRunesA)
        if iEndA >= len(a): break
  block specialCases:
    # trivial cases:
    if lenRunesA == 0: return lenRunesB
    if lenRunesB == 0: return lenRunesA
    # another special case: a single relevant rune in ``a`` either occurs
    # somewhere in the relevant part of ``b`` or has to be substituted
    if lenRunesA == 1:
      a.fastRuneAt(iStart, runeA, doInc = false)
      var iScanB = iStart
      while iScanB < iEndB:
        b.fastRuneAt(iScanB, runeB, doInc = true)
        if runeA == runeB: return lenRunesB - 1
      return lenRunesB
  # common case:
  var
    len1 = lenRunesA + 1
    len2 = lenRunesB + 1
    row: seq[int]
  let half = lenRunesA div 2
  newSeq(row, len2)
  # End marker (index into ``row``). It is reassigned in the first loop
  # iteration because ``1 <= half + 1`` always holds.
  var e = len2 - 1
  # initialize first row:
  for i in 1 .. (len2 - half - 1): row[i] = i
  row[0] = len1 - half - 1
  iCurrentA = iStart
  var
    char2pI = -1      # row index for which ``char2pPrev`` is valid
    char2pPrev: int   # byte position in ``b`` where the previous band began
  for i in 1 .. (len1 - 1):
    iNextA = iCurrentA
    a.fastRuneAt(iNextA, runeA)
    var
      char2p: int     # byte position of the current rune in ``b``
      D, x: int
      p: int          # index into ``row``
    if i >= (len1 - half):
      # skip the upper triangle:
      let offset = i + half - len1
      if char2pI == i:
        # the band moved one rune to the right since the previous row
        b.fastRuneAt(char2pPrev, runeB)
        char2p = char2pPrev
        char2pI = i + 1
      else:
        # first banded row: skip ``offset`` runes of ``b`` from the start
        char2p = iStart
        for _ in 0 ..< offset:
          runeB = b.runeAt(char2p)
          inc(char2p, runeB.size)
        char2pI = i + 1
        char2pPrev = char2p
      p = offset
      runeB = b.runeAt(char2p)
      var c3 = row[p] + (if runeA != runeB: 1 else: 0)
      inc(char2p, runeB.size)
      inc(p)
      x = row[p] + 1
      D = x
      if x > c3: x = c3
      row[p] = x
      inc(p)
    else:
      p = 1
      char2p = iStart
      D = i
      x = i
    if i <= (half + 1):
      # skip the lower triangle:
      e = len2 + i - half - 2
    # main:
    while p <= e:
      dec(D)
      runeB = b.runeAt(char2p)
      var c3 = D + (if runeA != runeB: 1 else: 0)
      inc(char2p, runeB.size)
      inc(x)
      if x > c3: x = c3
      D = row[p] + 1
      if x > D: x = D
      row[p] = x
      inc(p)
    # lower triangle sentinel:
    if i <= half:
      dec(D)
      runeB = b.runeAt(char2p)
      var c3 = D + (if runeA != runeB: 1 else: 0)
      inc(x)
      if x > c3: x = c3
      row[p] = x
    iCurrentA = iNextA
  result = row[e]
proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
  ## Returns the edit distance between `a` and `b`, comparing the strings
  ## byte by byte (no Unicode decoding).
  ##
  ## This uses the `Levenshtein`:idx: distance algorithm with only a linear
  ## memory overhead.
  var len1 = a.len
  var len2 = b.len
  if len1 > len2:
    # make `b` the longer string
    return editDistanceAscii(b, a)

  # strip common prefix:
  # The bound is `a.len`, not `len1`: `len1` shrinks while `s` grows, so
  # testing `s < len1` would stop after roughly half of a shared prefix.
  var s = 0
  while s < a.len and a[s] == b[s]:
    inc(s)
    dec(len1)
    dec(len2)
  # strip common suffix:
  while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
    dec(len1)
    dec(len2)
  # trivial cases:
  if len1 == 0: return len2
  if len2 == 0: return len1

  # another special case: the single remaining char of `a` either occurs in
  # the remaining part of `b` or has to be substituted
  if len1 == 1:
    for j in s..s+len2-1:
      if a[s] == b[j]: return len2 - 1
    return len2

  inc(len1)
  inc(len2)
  let half = len1 shr 1
  # initialize first row:
  var row: seq[int]
  newSeq(row, len2)
  # End marker (index into `row`). It is reassigned in the first loop
  # iteration because `1 <= half + 1` always holds.
  var e = len2 - 1
  for i in 1..len2 - half - 1: row[i] = i
  row[0] = len1 - half - 1
  for i in 1 .. len1 - 1:
    let char1 = a[i + s - 1]
    var char2p: int   # offset (relative to `s`) of the current char in `b`
    var D, x: int
    var p: int        # index into `row`
    if i >= len1 - half:
      # skip the upper triangle:
      let offset = i - len1 + half
      char2p = offset
      p = offset
      var c3 = row[p] + ord(char1 != b[s + char2p])
      inc(p)
      inc(char2p)
      x = row[p] + 1
      D = x
      if x > c3: x = c3
      row[p] = x
      inc(p)
    else:
      p = 1
      char2p = 0
      D = i
      x = i
    if i <= half + 1:
      # skip the lower triangle:
      e = len2 + i - half - 2
    # main:
    while p <= e:
      dec(D)
      var c3 = D + ord(char1 != b[char2p + s])
      inc(char2p)
      inc(x)
      if x > c3: x = c3
      D = row[p] + 1
      if x > D: x = D
      row[p] = x
      inc(p)
    # lower triangle sentinel:
    if i <= half:
      dec(D)
      var c3 = D + ord(char1 != b[char2p + s])
      inc(x)
      if x > c3: x = c3
      row[p] = x
  result = row[e]
when isMainModule:
  # --- rune-aware variant ---
  doAssert editDistance("", "") == 0
  doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia
  doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia

  doAssert editDistance("привет", "превет") == 1
  doAssert editDistance("Åge", "Age") == 1
  # one string is longer in bytes but shorter in rune length:
  # the first string has 4 bytes, the second 6 bytes but only 3 runes
  doAssert editDistance("aaaa", "×××") == 4

  block veryLongStringEditDistanceTest:
    # two strings of ``cap`` chars each that differ at every position
    const cap = 256
    var
      s1 = newStringOfCap(cap)
      s2 = newStringOfCap(cap)
    for _ in 1 .. cap:
      s1.add 'a'
    for _ in 1 .. cap:
      s2.add 'b'
    doAssert editDistance(s1, s2) == cap

  block combiningCodePointsEditDistanceTest:
    # "A" followed by a combining code point counts as two runes
    const s = "A\xCC\x8Age"
    doAssert editDistance(s, "Age") == 1

  # --- byte-wise variant ---
  doAssert editDistanceAscii("", "") == 0
  doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia
  doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia

View File

@@ -524,6 +524,22 @@ const
0x3000, 0x3000, # ideographic space
0xfeff, 0xfeff] #
# Runes used as the default separators of `split` and as the separators of
# `splitWhitespace`.
# NOTE(review): the 0x2000 entry used to carry the comment
# "en dash .. zero-width space", i.e. it described a range, but only the
# single rune 0x2000 is listed; 0x000b and 0x000c are absent as well.
# Confirm whether this list is meant to mirror the whitespace ranges table.
unicodeSpaces = [
  Rune 0x0009, # tab
  Rune 0x000a, # LF
  Rune 0x000d, # CR
  Rune 0x0020, # space
  Rune 0x0085, # next line
  Rune 0x00a0, # no-break space
  Rune 0x1680, # Ogham space mark
  Rune 0x2000, # en quad
  Rune 0x200e, Rune 0x200f, # LTR mark .. RTL mark (pattern whitespace)
  Rune 0x2028, Rune 0x2029, # line separator, paragraph separator
  Rune 0x202f, # narrow no-break space
  Rune 0x205f, # medium mathematical space
  Rune 0x3000, # ideographic space
  Rune 0xfeff] # zero width no-break space (byte order mark)
toupperRanges = [
0x0061, 0x007a, 468, # a-z A-Z
0x00e0, 0x00f6, 468, # - -
@@ -1733,7 +1749,157 @@ proc lastRune*(s: string; last: int): (Rune, int) =
fastRuneAt(s, last-L, r, false)
result = (r, L+1)
proc size*(r: Rune): int {.noSideEffect.} =
  ## Returns the number of bytes the rune ``r`` takes.
  ##
  ## The thresholds are checked in ascending order; a value above
  ## ``0x7FFFFFFF`` falls through to 1.
  let code = r.uint32
  result =
    if code <= 0x007F: 1
    elif code <= 0x07FF: 2
    elif code <= 0xFFFF: 3
    elif code <= 0x1FFFFF: 4
    elif code <= 0x3FFFFFF: 5
    elif code <= 0x7FFFFFFF: 6
    else: 1
# --------- Private helpers for different split separators -----------
proc stringHasSep(s: string, index: int, seps: openarray[Rune]): bool =
  ## Decodes the rune that starts at byte ``index`` of ``s`` and reports
  ## whether it is one of ``seps``.
  var current: Rune
  fastRuneAt(s, index, current, false) # decode only, do not advance
  result = current in seps
proc stringHasSep(s: string, index: int, sep: Rune): bool =
  ## Decodes the rune that starts at byte ``index`` of ``s`` and reports
  ## whether it equals ``sep``.
  var current: Rune
  fastRuneAt(s, index, current, false) # decode only, do not advance
  result = current == sep
template splitCommon(s, sep, maxsplit: untyped, sepLen: int = -1) =
  ## Common code for split procedures.
  ##
  ## Yields the substrings of ``s`` delimited by ``sep``, which is either a
  ## single ``Rune`` or a collection of runes (whatever ``stringHasSep``
  ## accepts). Once ``maxsplit`` splits have been made, the rest of ``s`` is
  ## yielded as the final substring; a negative ``maxsplit`` never reaches
  ## zero and therefore does not limit the number of splits. An empty ``s``
  ## yields nothing.
  ##
  ## ``sepLen`` is accepted for backward compatibility with existing call
  ## sites and is not used: the scan always advances by the byte length of
  ## the rune it is currently looking at.
  var
    last = 0
    splits = maxsplit
  if len(s) > 0:
    while last <= len(s):
      let first = last
      # Walk rune by rune up to the next separator. Stepping by the
      # separator's byte length instead would land in the middle of runes
      # whose length differs from it and could miss or invent separators.
      while last < len(s) and not stringHasSep(s, last, sep):
        inc(last, runeLenAt(s, last))
      if splits == 0: last = len(s)
      yield s[first .. (last - 1)]
      if splits == 0: break
      dec(splits)
      # Skip the separator; past the end a step of 1 terminates the loop.
      inc(last, if last < len(s): runeLenAt(s, last) else: 1)
iterator split*(s: string, seps: openarray[Rune] = unicodeSpaces,
  maxsplit: int = -1): string =
  ## Splits the unicode string `s` into substrings using a group of separators.
  ##
  ## Every rune of `s` that is contained in `seps` ends the current
  ## substring, so adjacent separators produce empty substrings. After
  ## `maxsplit` splits the remainder of `s` is yielded as one substring;
  ## the default of -1 does not limit the number of splits.
  ##
  ## .. code-block:: nim
  ##   for word in split("this\lis an\texample"):
  ##     writeLine(stdout, word)
  ##
  ## ...generates this output:
  ##
  ## .. code-block::
  ##   this
  ##   is
  ##   an
  ##   example
  ##
  ## And the following code:
  ##
  ## .. code-block:: nim
  ##   for word in split("this:is;an$example", [';'.Rune, ':'.Rune, '$'.Rune]):
  ##     writeLine(stdout, word)
  ##
  ## ...produces the same output as the first example. The code:
  ##
  ## .. code-block:: nim
  ##   let date = "2012-11-20T22:08:08.398990"
  ##   let separators = [' '.Rune, '-'.Rune, ':'.Rune, 'T'.Rune]
  ##   for number in split(date, separators):
  ##     writeLine(stdout, number)
  ##
  ## ...results in:
  ##
  ## .. code-block::
  ##   2012
  ##   11
  ##   20
  ##   22
  ##   08
  ##   08.398990
  ##
  splitCommon(s, seps, maxsplit)
iterator splitWhitespace*(s: string): string =
  ## Splits a unicode string at whitespace runes.
  ##
  ## The separators are the runes listed in `unicodeSpaces`; the number of
  ## splits is not limited.
  splitCommon(s, unicodeSpaces, -1)
template accResult(iter: untyped) =
  ## Resets the enclosing routine's ``result`` to an empty seq and appends
  ## every item produced by ``iter`` to it.
  result = @[]
  for item in iter:
    result.add(item)
proc splitWhitespace*(s: string): seq[string] {.noSideEffect,
  rtl, extern: "ncuSplitWhitespace".} =
  ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_
  ## iterator, but is a proc that returns a sequence of substrings.
  ##
  ## The substrings appear in the order the iterator yields them.
  accResult(splitWhitespace(s))
iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
  ## Splits the unicode string `s` into substrings using a single separator.
  ##
  ## Substrings are separated by the rune `sep`; adjacent separators produce
  ## empty substrings. After `maxsplit` splits the remainder of `s` is
  ## yielded as one substring; the default of -1 does not limit the number
  ## of splits.
  ## The code:
  ##
  ## .. code-block:: nim
  ##   for word in split(";;this;is;an;;example;;;", ';'.Rune):
  ##     writeLine(stdout, word)
  ##
  ## Yields these substrings, in order:
  ##
  ## .. code-block::
  ##   ""
  ##   ""
  ##   "this"
  ##   "is"
  ##   "an"
  ##   ""
  ##   "example"
  ##   ""
  ##   ""
  ##   ""
  ##
  splitCommon(s, sep, maxsplit, sep.size)
proc split*(s: string, seps: openarray[Rune] = unicodeSpaces, maxsplit: int = -1): seq[string] {.
  noSideEffect, rtl, extern: "nucSplitRunes".} =
  ## The same as the `split iterator <#split.i,string,openarray[Rune]>`_, but is a
  ## proc that returns a sequence of substrings.
  ##
  ## The substrings appear in the order the iterator yields them.
  accResult(split(s, seps, maxsplit))
proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
  rtl, extern: "nucSplitRune".} =
  ## The same as the `split iterator <#split.i,string,Rune>`_, but is a proc
  ## that returns a sequence of substrings.
  ##
  ## The substrings appear in the order the iterator yields them.
  accResult(split(s, sep, maxsplit))
when isMainModule:
proc asRune(s: static[string]): Rune =
  ## Test helper that turns a string literal into a Rune: the result is the
  ## first rune of ``s``, or ``Rune(0)`` when ``s`` is empty.
  ##
  ## Lets the tests write ``"å".asRune`` instead of ``"å".runeAt(0)``.
  result =
    if s.len > 0: s.runeAt(0)
    else: Rune(0)
let
someString = "öÑ"
someRunes = @[runeAt(someString, 0), runeAt(someString, 2)]
@@ -1898,3 +2064,13 @@ when isMainModule:
doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€")
doAssert(runeSubStr(s, 0, -100) == "")
doAssert(runeSubStr(s, 100, -100) == "")
block splitTests:
  # Every input starts with one separator and ends with two, so a full split
  # starts with one empty string and ends with two. ``s`` therefore needs
  # two trailing spaces, matching the ";;" and "××" tails of ``s2``/``s3``.
  let s = " this is an example  "
  let s2 = ":this;is;an:example;;"
  let s3 = ":this×is×an:example××"
  doAssert s.split() == @["", "this", "is", "an", "example", "", ""]
  doAssert s2.split(seps = [':'.Rune, ';'.Rune]) == @["", "this", "is", "an", "example", "", ""]
  doAssert s3.split(seps = [':'.Rune, "×".asRune]) == @["", "this", "is", "an", "example", "", ""]
  # with ``maxsplit`` the unsplit remainder keeps its trailing separators
  doAssert s.split(maxsplit = 4) == @["", "this", "is", "an", "example  "]
  doAssert s.split(' '.Rune, maxsplit = 1) == @["", "this is an example  "]