mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-29 01:14:41 +00:00
296 lines
7.9 KiB
Nim
296 lines
7.9 KiB
Nim
#
|
||
#
|
||
# Nim's Runtime Library
|
||
# (c) Copyright 2018 Nim contributors
|
||
#
|
||
# See the file "copying.txt", included in this
|
||
# distribution, for details about the copyright.
|
||
#
|
||
|
||
## This module implements an algorithm to compute the
|
||
## `edit distance`:idx: between two Unicode strings.
|
||
|
||
import unicode
|
||
|
||
proc editDistance*(a, b: string): int {.noSideEffect.} =
|
||
## Returns the unicode-rune edit distance between ``a`` and ``b``.
|
||
##
|
||
## This uses the `Levenshtein`:idx: distance algorithm with only a linear
|
||
## memory overhead.
|
||
if len(a) > len(b):
|
||
# make ``b`` the longer string
|
||
return editDistance(b, a)
|
||
# strip common prefix
|
||
var
|
||
iStart = 0 ## The character starting index of the first rune in both strings ``a`` and ``b``
|
||
iNextA = 0
|
||
iNextB = 0
|
||
runeA, runeB: Rune
|
||
lenRunesA = 0 ## The number of relevant runes in string ``a``.
|
||
lenRunesB = 0 ## The number of relevant runes in string ``b``.
|
||
block commonPrefix:
|
||
# ``a`` is the shorter string
|
||
while iStart < len(a):
|
||
iNextA = iStart
|
||
a.fastRuneAt(iNextA, runeA, doInc = true)
|
||
iNextB = iStart
|
||
b.fastRuneAt(iNextB, runeB, doInc = true)
|
||
if runeA != runeB:
|
||
inc(lenRunesA)
|
||
inc(lenRunesB)
|
||
break
|
||
iStart = iNextA
|
||
var
|
||
# we know that we are either at the start of the strings
|
||
# or that the current value of runeA is not equal to runeB
|
||
# => start search for common suffix after the current rune (``i_next_*``)
|
||
iEndA = iNextA ## The exclusive upper index bound of string ``a``.
|
||
iEndB = iNextB ## The exclusive upper index bound of string ``b``.
|
||
iCurrentA = iNextA
|
||
iCurrentB = iNextB
|
||
block commonSuffix:
|
||
var
|
||
addRunesA = 0
|
||
addRunesB = 0
|
||
while iCurrentA < len(a) and iCurrentB < len(b):
|
||
iNextA = iCurrentA
|
||
a.fastRuneAt(iNextA, runeA)
|
||
iNextB = iCurrentB
|
||
b.fastRuneAt(iNextB, runeB)
|
||
inc(addRunesA)
|
||
inc(addRunesB)
|
||
if runeA != runeB:
|
||
iEndA = iNextA
|
||
iEndB = iNextB
|
||
inc(lenRunesA, addRunesA)
|
||
inc(lenRunesB, addRunesB)
|
||
addRunesA = 0
|
||
addRunesB = 0
|
||
iCurrentA = iNextA
|
||
iCurrentB = iNextB
|
||
if iCurrentA >= len(a): # ``a`` exhausted
|
||
if iCurrentB < len(b): # ``b`` not exhausted
|
||
iEndA = iCurrentA
|
||
iEndB = iCurrentB
|
||
inc(lenRunesA, addRunesA)
|
||
inc(lenRunesB, addRunesB)
|
||
while true:
|
||
b.fastRuneAt(iEndB, runeB)
|
||
inc(lenRunesB)
|
||
if iEndB >= len(b): break
|
||
elif iCurrentB >= len(b): # ``b`` exhausted and ``a`` not exhausted
|
||
iEndA = iCurrentA
|
||
iEndB = iCurrentB
|
||
inc(lenRunesA, addRunesA)
|
||
inc(lenRunesB, addRunesB)
|
||
while true:
|
||
a.fastRuneAt(iEndA, runeA)
|
||
inc(lenRunesA)
|
||
if iEndA >= len(a): break
|
||
block specialCases:
|
||
# trivial cases:
|
||
if lenRunesA == 0: return lenRunesB
|
||
if lenRunesB == 0: return lenRunesA
|
||
# another special case:
|
||
if lenRunesA == 1:
|
||
a.fastRuneAt(iStart, runeA, doInc = false)
|
||
var iCurrentB = iStart
|
||
while iCurrentB < iEndB:
|
||
b.fastRuneAt(iCurrentB, runeB, doInc = true)
|
||
if runeA == runeB: return lenRunesB - 1
|
||
return lenRunesB
|
||
# common case:
|
||
var
|
||
len1 = lenRunesA + 1
|
||
len2 = lenRunesB + 1
|
||
row: seq[int]
|
||
let half = lenRunesA div 2
|
||
newSeq(row, len2)
|
||
var e = iStart + len2 - 1 # end marker
|
||
# initialize first row:
|
||
for i in 1 .. (len2 - half - 1): row[i] = i
|
||
row[0] = len1 - half - 1
|
||
iCurrentA = iStart
|
||
var
|
||
char2pI = -1
|
||
char2pPrev: int
|
||
for i in 1 .. (len1 - 1):
|
||
iNextA = iCurrentA
|
||
a.fastRuneAt(iNextA, runeA)
|
||
var
|
||
char2p: int
|
||
diff, x: int
|
||
p: int
|
||
if i >= (len1 - half):
|
||
# skip the upper triangle:
|
||
let offset = i + half - len1
|
||
if char2pI == i:
|
||
b.fastRuneAt(char2pPrev, runeB)
|
||
char2p = char2pPrev
|
||
char2pI = i + 1
|
||
else:
|
||
char2p = iStart
|
||
for j in 0 ..< offset:
|
||
runeB = b.runeAt(char2p)
|
||
inc(char2p, runeB.size)
|
||
char2pI = i + 1
|
||
char2pPrev = char2p
|
||
p = offset
|
||
runeB = b.runeAt(char2p)
|
||
var c3 = row[p] + (if runeA != runeB: 1 else: 0)
|
||
inc(char2p, runeB.size)
|
||
inc(p)
|
||
x = row[p] + 1
|
||
diff = x
|
||
if x > c3: x = c3
|
||
row[p] = x
|
||
inc(p)
|
||
else:
|
||
p = 1
|
||
char2p = iStart
|
||
diff = i
|
||
x = i
|
||
if i <= (half + 1):
|
||
# skip the lower triangle:
|
||
e = len2 + i - half - 2
|
||
# main:
|
||
while p <= e:
|
||
dec(diff)
|
||
runeB = b.runeAt(char2p)
|
||
var c3 = diff + (if runeA != runeB: 1 else: 0)
|
||
inc(char2p, runeB.size)
|
||
inc(x)
|
||
if x > c3: x = c3
|
||
diff = row[p] + 1
|
||
if x > diff: x = diff
|
||
row[p] = x
|
||
inc(p)
|
||
# lower triangle sentinel:
|
||
if i <= half:
|
||
dec(diff)
|
||
runeB = b.runeAt(char2p)
|
||
var c3 = diff + (if runeA != runeB: 1 else: 0)
|
||
inc(x)
|
||
if x > c3: x = c3
|
||
row[p] = x
|
||
iCurrentA = iNextA
|
||
result = row[e]
|
||
|
||
proc editDistanceAscii*(a, b: string): int {.noSideEffect.} =
|
||
## Returns the edit distance between `a` and `b`.
|
||
##
|
||
## This uses the `Levenshtein`:idx: distance algorithm with only a linear
|
||
## memory overhead.
|
||
var len1 = a.len
|
||
var len2 = b.len
|
||
if len1 > len2:
|
||
# make `b` the longer string
|
||
return editDistanceAscii(b, a)
|
||
|
||
# strip common prefix:
|
||
var s = 0
|
||
while s < len1 and a[s] == b[s]:
|
||
inc(s)
|
||
dec(len1)
|
||
dec(len2)
|
||
# strip common suffix:
|
||
while len1 > 0 and len2 > 0 and a[s+len1-1] == b[s+len2-1]:
|
||
dec(len1)
|
||
dec(len2)
|
||
# trivial cases:
|
||
if len1 == 0: return len2
|
||
if len2 == 0: return len1
|
||
|
||
# another special case:
|
||
if len1 == 1:
|
||
for j in s..s+len2-1:
|
||
if a[s] == b[j]: return len2 - 1
|
||
return len2
|
||
|
||
inc(len1)
|
||
inc(len2)
|
||
var half = len1 shr 1
|
||
# initalize first row:
|
||
#var row = cast[ptr array[0..high(int) div 8, int]](alloc(len2*sizeof(int)))
|
||
var row: seq[int]
|
||
newSeq(row, len2)
|
||
var e = s + len2 - 1 # end marker
|
||
for i in 1..len2 - half - 1: row[i] = i
|
||
row[0] = len1 - half - 1
|
||
for i in 1 .. len1 - 1:
|
||
var char1 = a[i + s - 1]
|
||
var char2p: int
|
||
var diff, x: int
|
||
var p: int
|
||
if i >= len1 - half:
|
||
# skip the upper triangle:
|
||
var offset = i - len1 + half
|
||
char2p = offset
|
||
p = offset
|
||
var c3 = row[p] + ord(char1 != b[s + char2p])
|
||
inc(p)
|
||
inc(char2p)
|
||
x = row[p] + 1
|
||
diff = x
|
||
if x > c3: x = c3
|
||
row[p] = x
|
||
inc(p)
|
||
else:
|
||
p = 1
|
||
char2p = 0
|
||
diff = i
|
||
x = i
|
||
if i <= half + 1:
|
||
# skip the lower triangle:
|
||
e = len2 + i - half - 2
|
||
# main:
|
||
while p <= e:
|
||
dec(diff)
|
||
var c3 = diff + ord(char1 != b[char2p + s])
|
||
inc(char2p)
|
||
inc(x)
|
||
if x > c3: x = c3
|
||
diff = row[p] + 1
|
||
if x > diff: x = diff
|
||
row[p] = x
|
||
inc(p)
|
||
# lower triangle sentinel:
|
||
if i <= half:
|
||
dec(diff)
|
||
var c3 = diff + ord(char1 != b[char2p + s])
|
||
inc(x)
|
||
if x > c3: x = c3
|
||
row[p] = x
|
||
result = row[e]
|
||
|
||
|
||
when isMainModule:
|
||
doAssert editDistance("", "") == 0
|
||
doAssert editDistance("kitten", "sitting") == 3 # from Wikipedia
|
||
doAssert editDistance("flaw", "lawn") == 2 # from Wikipedia
|
||
|
||
doAssert editDistance("привет", "превет") == 1
|
||
doAssert editDistance("Åge", "Age") == 1
|
||
# editDistance, one string is longer in bytes, but shorter in rune length
|
||
# first string: 4 bytes, second: 6 bytes, but only 3 runes
|
||
doAssert editDistance("aaaa", "×××") == 4
|
||
|
||
block veryLongStringEditDistanceTest:
|
||
const cap = 256
|
||
var
|
||
s1 = newStringOfCap(cap)
|
||
s2 = newStringOfCap(cap)
|
||
while len(s1) < cap:
|
||
s1.add 'a'
|
||
while len(s2) < cap:
|
||
s2.add 'b'
|
||
doAssert editDistance(s1, s2) == cap
|
||
|
||
block combiningCodePointsEditDistanceTest:
|
||
const s = "A\xCC\x8Age"
|
||
doAssert editDistance(s, "Age") == 1
|
||
|
||
doAssert editDistanceAscii("", "") == 0
|
||
doAssert editDistanceAscii("kitten", "sitting") == 3 # from Wikipedia
|
||
doAssert editDistanceAscii("flaw", "lawn") == 2 # from Wikipedia
|