mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-28 17:04:41 +00:00
414 lines
14 KiB
Nim
414 lines
14 KiB
Nim
#
|
|
#
|
|
# Nim's Runtime Library
|
|
# (c) Copyright 2018 Nim contributors
|
|
#
|
|
# See the file "copying.txt", included in this
|
|
# distribution, for details about the copyright.
|
|
#
|
|
|
|
## This module implements an algorithm to compute the
|
|
## `diff`:idx: between two sequences of lines.
|
|
##
|
|
## A basic example of ``diffInt`` on 2 arrays of integers:
|
|
##
|
|
## .. code::nim
|
|
##
|
|
## import experimental/diff
|
|
## echo diffInt([0, 1, 2, 3, 4, 5, 6, 7, 8], [-1, 1, 2, 3, 4, 5, 666, 7, 42])
|
|
##
|
|
## Another short example of ``diffText`` to diff strings:
|
|
##
|
|
## .. code::nim
|
|
##
|
|
## import experimental/diff
|
|
## # 2 samples of text for testing (from "The Call of Cthulhu" by Lovecraft)
|
|
## let txt0 = """I have looked upon all the universe has to hold of horror,
|
|
## even skies of spring and flowers of summer must ever be poison to me."""
|
|
## let txt1 = """I have looked upon all your code has to hold of bugs,
|
|
## even skies of spring and flowers of summer must ever be poison to me."""
|
|
##
|
|
## echo diffText(txt0, txt1)
|
|
##
|
|
## - To learn more see `Diff on Wikipedia. <http://wikipedia.org/wiki/Diff>`_
|
|
|
|
# code owner: Arne Döring
|
|
#
|
|
# This is based on C# code written by Matthias Hertel, http://www.mathertel.de
|
|
#
|
|
# This Class implements the Difference Algorithm published in
|
|
# "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
|
|
# Algorithmica Vol. 1 No. 2, 1986, p 251.
|
|
|
|
import tables, strutils
|
|
|
|
type
|
|
Item* = object ## An Item in the list of differences.
|
|
startA*: int ## Start Line number in Data A.
|
|
startB*: int ## Start Line number in Data B.
|
|
deletedA*: int ## Number of changes in Data A.
|
|
insertedB*: int ## Number of changes in Data B.
|
|
|
|
DiffData = object ## Data on one input file being compared.
|
|
data: seq[int] ## Buffer of numbers that will be compared.
|
|
modified: seq[bool] ## Array of booleans that flag for modified
|
|
## data. This is the result of the diff.
|
|
## This means deletedA in the first Data or
|
|
## inserted in the second Data.
|
|
|
|
Smsrd = object
|
|
x, y: int
|
|
|
|
# template to avoid a seq copy. Required until ``sink`` parameters are ready.
|
|
template newDiffData(initData: seq[int]; L: int): DiffData =
|
|
DiffData(
|
|
data: initData,
|
|
modified: newSeq[bool](L + 2)
|
|
)
|
|
|
|
proc len(d: DiffData): int {.inline.} = d.data.len
|
|
|
|
proc diffCodes(aText: string; h: var Table[string, int]): DiffData =
|
|
## This function converts all textlines of the text into unique numbers for every unique textline
|
|
## so further work can work only with simple numbers.
|
|
## ``aText`` the input text
|
|
## ``h`` This extern initialized hashtable is used for storing all ever used textlines.
|
|
## ``trimSpace`` ignore leading and trailing space characters
|
|
## Returns a array of integers.
|
|
var lastUsedCode = h.len
|
|
result.data = newSeq[int]()
|
|
for s in aText.splitLines:
|
|
if h.contains s:
|
|
result.data.add h[s]
|
|
else:
|
|
inc lastUsedCode
|
|
h[s] = lastUsedCode
|
|
result.data.add lastUsedCode
|
|
result.modified = newSeq[bool](result.data.len + 2)
|
|
|
|
proc optimize(data: var DiffData) =
|
|
## If a sequence of modified lines starts with a line that contains the same content
|
|
## as the line that appends the changes, the difference sequence is modified so that the
|
|
## appended line and not the starting line is marked as modified.
|
|
## This leads to more readable diff sequences when comparing text files.
|
|
var startPos = 0
|
|
while startPos < data.len:
|
|
while startPos < data.len and not data.modified[startPos]:
|
|
inc startPos
|
|
var endPos = startPos
|
|
while endPos < data.len and data.modified[endPos]:
|
|
inc endPos
|
|
|
|
if endPos < data.len and data.data[startPos] == data.data[endPos]:
|
|
data.modified[startPos] = false
|
|
data.modified[endPos] = true
|
|
else:
|
|
startPos = endPos
|
|
|
|
proc sms(dataA: var DiffData; lowerA, upperA: int; dataB: DiffData; lowerB, upperB: int;
|
|
downVector, upVector: var openArray[int]): Smsrd =
|
|
## This is the algorithm to find the Shortest Middle Snake (sms).
|
|
## ``dataA`` sequence A
|
|
## ``lowerA`` lower bound of the actual range in dataA
|
|
## ``upperA`` upper bound of the actual range in dataA (exclusive)
|
|
## ``dataB`` sequence B
|
|
## ``lowerB`` lower bound of the actual range in dataB
|
|
## ``upperB`` upper bound of the actual range in dataB (exclusive)
|
|
## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
|
|
## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
|
|
## Returns a MiddleSnakeData record containing x,y and u,v.
|
|
|
|
let max = dataA.len + dataB.len + 1
|
|
|
|
let downK = lowerA - lowerB # the k-line to start the forward search
|
|
let upK = upperA - upperB # the k-line to start the reverse search
|
|
|
|
let delta = (upperA - lowerA) - (upperB - lowerB)
|
|
let oddDelta = (delta and 1) != 0
|
|
|
|
# The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
|
|
# and are access using a specific offset: upOffset upVector and downOffset for downVector
|
|
let downOffset = max - downK
|
|
let upOffset = max - upK
|
|
|
|
let maxD = ((upperA - lowerA + upperB - lowerB) div 2) + 1
|
|
|
|
downVector[downOffset + downK + 1] = lowerA
|
|
upVector[upOffset + upK - 1] = upperA
|
|
|
|
for D in 0 .. maxD:
|
|
# Extend the forward path.
|
|
for k in countUp(downK - D, downK + D, 2):
|
|
# find the only or better starting point
|
|
var x: int
|
|
if k == downK - D:
|
|
x = downVector[downOffset + k + 1] # down
|
|
else:
|
|
x = downVector[downOffset + k - 1] + 1 # a step to the right
|
|
if k < downK + D and downVector[downOffset + k + 1] >= x:
|
|
x = downVector[downOffset + k + 1] # down
|
|
|
|
var y = x - k
|
|
|
|
# find the end of the furthest reaching forward D-path in diagonal k.
|
|
while x < upperA and y < upperB and dataA.data[x] == dataB.data[y]:
|
|
inc x
|
|
inc y
|
|
|
|
downVector[downOffset + k] = x
|
|
|
|
# overlap ?
|
|
if oddDelta and upK - D < k and k < upK + D:
|
|
if upVector[upOffset + k] <= downVector[downOffset + k]:
|
|
return Smsrd(x: downVector[downOffset + k],
|
|
y: downVector[downOffset + k] - k)
|
|
|
|
# Extend the reverse path.
|
|
for k in countUp(upK - D, upK + D, 2):
|
|
# find the only or better starting point
|
|
var x: int
|
|
if k == upK + D:
|
|
x = upVector[upOffset + k - 1] # up
|
|
else:
|
|
x = upVector[upOffset + k + 1] - 1 # left
|
|
if k > upK - D and upVector[upOffset + k - 1] < x:
|
|
x = upVector[upOffset + k - 1] # up
|
|
|
|
var y = x - k
|
|
while x > lowerA and y > lowerB and dataA.data[x - 1] == dataB.data[y - 1]:
|
|
dec x
|
|
dec y
|
|
|
|
upVector[upOffset + k] = x
|
|
|
|
# overlap ?
|
|
if not oddDelta and downK-D <= k and k <= downK+D:
|
|
if upVector[upOffset + k] <= downVector[downOffset + k]:
|
|
return Smsrd(x: downVector[downOffset + k],
|
|
y: downVector[downOffset + k] - k)
|
|
|
|
assert false, "the algorithm should never come here."
|
|
|
|
proc lcs(dataA: var DiffData; lowerA, upperA: int; dataB: var DiffData; lowerB, upperB: int;
|
|
downVector, upVector: var openArray[int]) =
|
|
## This is the divide-and-conquer implementation of the longes common-subsequence (lcs)
|
|
## algorithm.
|
|
## The published algorithm passes recursively parts of the A and B sequences.
|
|
## To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
|
|
## ``dataA`` sequence A
|
|
## ``lowerA`` lower bound of the actual range in dataA
|
|
## ``upperA`` upper bound of the actual range in dataA (exclusive)
|
|
## ``dataB`` sequence B
|
|
## ``lowerB`` lower bound of the actual range in dataB
|
|
## ``upperB`` upper bound of the actual range in dataB (exclusive)
|
|
## ``downVector`` a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.
|
|
## ``upVector`` a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.
|
|
|
|
# make mutable copy:
|
|
var lowerA = lowerA
|
|
var lowerB = lowerB
|
|
var upperA = upperA
|
|
var upperB = upperB
|
|
|
|
# Fast walkthrough equal lines at the start
|
|
while lowerA < upperA and lowerB < upperB and dataA.data[lowerA] == dataB.data[lowerB]:
|
|
inc lowerA
|
|
inc lowerB
|
|
|
|
# Fast walkthrough equal lines at the end
|
|
while lowerA < upperA and lowerB < upperB and dataA.data[upperA - 1] == dataB.data[upperB - 1]:
|
|
dec upperA
|
|
dec upperB
|
|
|
|
if lowerA == upperA:
|
|
# mark as inserted lines.
|
|
while lowerB < upperB:
|
|
dataB.modified[lowerB] = true
|
|
inc lowerB
|
|
|
|
elif lowerB == upperB:
|
|
# mark as deleted lines.
|
|
while lowerA < upperA:
|
|
dataA.modified[lowerA] = true
|
|
inc lowerA
|
|
|
|
else:
|
|
# Find the middle snakea and length of an optimal path for A and B
|
|
let smsrd = sms(dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector)
|
|
# Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y))
|
|
|
|
# The path is from LowerX to (x,y) and (x,y) to UpperX
|
|
lcs(dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector)
|
|
lcs(dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector) # 2002.09.20: no need for 2 points
|
|
|
|
proc createDiffs(dataA, dataB: DiffData): seq[Item] =
|
|
## Scan the tables of which lines are inserted and deleted,
|
|
## producing an edit script in forward order.
|
|
var startA = 0
|
|
var startB = 0
|
|
var lineA = 0
|
|
var lineB = 0
|
|
while lineA < dataA.len or lineB < dataB.len:
|
|
if lineA < dataA.len and not dataA.modified[lineA] and
|
|
lineB < dataB.len and not dataB.modified[lineB]:
|
|
# equal lines
|
|
inc lineA
|
|
inc lineB
|
|
else:
|
|
# maybe deleted and/or inserted lines
|
|
startA = lineA
|
|
startB = lineB
|
|
|
|
while lineA < dataA.len and (lineB >= dataB.len or dataA.modified[lineA]):
|
|
inc lineA
|
|
|
|
while lineB < dataB.len and (lineA >= dataA.len or dataB.modified[lineB]):
|
|
inc lineB
|
|
|
|
if (startA < lineA) or (startB < lineB):
|
|
result.add Item(startA: startA,
|
|
startB: startB,
|
|
deletedA: lineA - startA,
|
|
insertedB: lineB - startB)
|
|
|
|
|
|
proc diffInt*(arrayA, arrayB: openArray[int]): seq[Item] =
|
|
## Find the difference in 2 arrays of integers.
|
|
##
|
|
## ``arrayA`` A-version of the numbers (usualy the old one)
|
|
##
|
|
## ``arrayB`` B-version of the numbers (usualy the new one)
|
|
##
|
|
## Returns a sequence of Items that describe the differences.
|
|
|
|
# The A-Version of the data (original data) to be compared.
|
|
var dataA = newDiffData(@arrayA, arrayA.len)
|
|
|
|
# The B-Version of the data (modified data) to be compared.
|
|
var dataB = newDiffData(@arrayB, arrayB.len)
|
|
|
|
let max = dataA.len + dataB.len + 1
|
|
## vector for the (0,0) to (x,y) search
|
|
var downVector = newSeq[int](2 * max + 2)
|
|
## vector for the (u,v) to (N,M) search
|
|
var upVector = newSeq[int](2 * max + 2)
|
|
|
|
lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
|
|
result = createDiffs(dataA, dataB)
|
|
|
|
proc diffText*(textA, textB: string): seq[Item] =
|
|
## Find the difference in 2 text documents, comparing by textlines.
|
|
##
|
|
## The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
|
|
## each line is converted into a (hash) number. This hash-value is computed by storing all
|
|
## textlines into a common hashtable so i can find dublicates in there, and generating a
|
|
## new number each time a new textline is inserted.
|
|
##
|
|
## ``textA`` A-version of the text (usually the old one)
|
|
##
|
|
## ``textB`` B-version of the text (usually the new one)
|
|
##
|
|
## Returns a seq of Items that describe the differences.
|
|
|
|
# prepare the input-text and convert to comparable numbers.
|
|
var h = initTable[string, int]() # TextA.len + TextB.len <- probably wrong initial size
|
|
# The A-Version of the data (original data) to be compared.
|
|
var dataA = diffCodes(textA, h)
|
|
|
|
# The B-Version of the data (modified data) to be compared.
|
|
var dataB = diffCodes(textB, h)
|
|
|
|
h.clear # free up hashtable memory (maybe)
|
|
|
|
let max = dataA.len + dataB.len + 1
|
|
## vector for the (0,0) to (x,y) search
|
|
var downVector = newSeq[int](2 * max + 2)
|
|
## vector for the (u,v) to (N,M) search
|
|
var upVector = newSeq[int](2 * max + 2)
|
|
|
|
lcs(dataA, 0, dataA.len, dataB, 0, dataB.len, downVector, upVector)
|
|
|
|
optimize(dataA)
|
|
optimize(dataB)
|
|
result = createDiffs(dataA, dataB)
|
|
|
|
when isMainModule:
|
|
|
|
proc testHelper(f: seq[Item]): string =
|
|
for it in f:
|
|
result.add(
|
|
$it.deletedA & "." & $it.insertedB & "." & $it.startA & "." & $it.startB & "*"
|
|
)
|
|
|
|
proc main() =
|
|
var a, b: string
|
|
|
|
stdout.writeLine("Diff Self Test...")
|
|
|
|
# test all changes
|
|
a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
|
|
b = "0,1,2,3,4,5,6,7,8,9".replace(',', '\n')
|
|
assert(testHelper(diffText(a, b)) ==
|
|
"12.10.0.0*",
|
|
"all-changes test failed.")
|
|
stdout.writeLine("all-changes test passed.")
|
|
# test all same
|
|
a = "a,b,c,d,e,f,g,h,i,j,k,l".replace(',', '\n')
|
|
b = a
|
|
assert(testHelper(diffText(a, b)) ==
|
|
"",
|
|
"all-same test failed.")
|
|
stdout.writeLine("all-same test passed.")
|
|
|
|
# test snake
|
|
a = "a,b,c,d,e,f".replace(',', '\n')
|
|
b = "b,c,d,e,f,x".replace(',', '\n')
|
|
assert(testHelper(diffText(a, b)) ==
|
|
"1.0.0.0*0.1.6.5*",
|
|
"snake test failed.")
|
|
stdout.writeLine("snake test passed.")
|
|
|
|
# 2002.09.20 - repro
|
|
a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".replace(',', '\n')
|
|
b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".replace(',', '\n')
|
|
assert(testHelper(diffText(a, b)) ==
|
|
"1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*",
|
|
"repro20020920 test failed.")
|
|
stdout.writeLine("repro20020920 test passed.")
|
|
|
|
# 2003.02.07 - repro
|
|
a = "F".replace(',', '\n')
|
|
b = "0,F,1,2,3,4,5,6,7".replace(',', '\n')
|
|
assert(testHelper(diffText(a, b)) ==
|
|
"0.1.0.0*0.7.1.2*",
|
|
"repro20030207 test failed.")
|
|
stdout.writeLine("repro20030207 test passed.")
|
|
|
|
# Muegel - repro
|
|
a = "HELLO\nWORLD"
|
|
b = "\n\nhello\n\n\n\nworld\n"
|
|
assert(testHelper(diffText(a, b)) ==
|
|
"2.8.0.0*",
|
|
"repro20030409 test failed.")
|
|
stdout.writeLine("repro20030409 test passed.")
|
|
|
|
# test some differences
|
|
a = "a,b,-,c,d,e,f,f".replace(',', '\n')
|
|
b = "a,b,x,c,e,f".replace(',', '\n')
|
|
assert(testHelper(diffText(a, b)) ==
|
|
"1.1.2.2*1.0.4.4*1.0.7.6*",
|
|
"some-changes test failed.")
|
|
stdout.writeLine("some-changes test passed.")
|
|
|
|
# test one change within long chain of repeats
|
|
a = "a,a,a,a,a,a,a,a,a,a".replace(',', '\n')
|
|
b = "a,a,a,a,-,a,a,a,a,a".replace(',', '\n')
|
|
assert(testHelper(diffText(a, b)) ==
|
|
"0.1.4.4*1.0.9.10*",
|
|
"long chain of repeats test failed.")
|
|
|
|
stdout.writeLine("End.")
|
|
stdout.flushFile
|
|
|
|
main()
|