mirror of
https://github.com/nim-lang/Nim.git
synced 2026-01-04 12:07:51 +00:00
[backport] pseudorandom probing for hash collision (#13418)
This commit is contained in:
@@ -18,34 +18,87 @@ when not defined(nimHasDefault):
|
||||
var v: T
|
||||
v
|
||||
|
||||
const freeMarker = 0
|
||||
const deletedMarker = -1
|
||||
|
||||
type UHash = uint
|
||||
|
||||
# hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These
|
||||
# two procs retain clarity of that encoding without the space cost of an enum.
|
||||
proc isEmpty(hcode: Hash): bool {.inline.} =
|
||||
result = hcode == 0
|
||||
proc isFilledAndValid(hcode: Hash): bool {.inline.} =
|
||||
result = hcode != 0 and hcode != deletedMarker
|
||||
# performance: we could use bit magic if needed
|
||||
|
||||
proc isFilled(hcode: Hash): bool {.inline.} =
|
||||
result = hcode != 0
|
||||
|
||||
proc nextTry(h, maxHash: Hash): Hash {.inline.} =
|
||||
result = (h + 1) and maxHash
|
||||
|
||||
proc mustRehash(length, counter: int): bool {.inline.} =
|
||||
assert(length > counter)
|
||||
result = (length * 2 < counter * 3) or (length - counter < 4)
|
||||
proc translateBits(a: UHash, numBitsMask: int): UHash {.inline.} =
|
||||
result = (a shr numBitsMask) or (a shl (UHash.sizeof * 8 - numBitsMask))
|
||||
|
||||
proc nextTry(h, maxHash: Hash, perturb: var UHash): Hash {.inline.} =
|
||||
# FACTOR between hashcommon.nextTry, intsets.nextTry
|
||||
# an optimization would be to use `(h + 1) and maxHash` for a few iterations
|
||||
# and then switch to the formula below, to get "best of both worlds": good
|
||||
# cache locality, except when a collision cluster is detected (ie, large number
|
||||
# of iterations).
|
||||
const PERTURB_SHIFT = 5 # consider tying this to `numBitsMask = fastLog2(t.dataLen)`
|
||||
result = cast[Hash]((5*cast[uint](h) + 1 + perturb) and cast[uint](maxHash))
|
||||
perturb = perturb shr PERTURB_SHIFT
|
||||
|
||||
proc mustRehash[T](t: T): bool {.inline.} =
|
||||
# FACTOR between hashcommon.mustRehash, intsets.mustRehash
|
||||
let counter2 = t.counter + t.countDeleted
|
||||
let length = t.dataLen
|
||||
assert(length > counter2)
|
||||
result = (length * 2 < counter2 * 3) or (length - counter2 < 4) # synchronize with `rightSize`
|
||||
|
||||
proc rightSize*(count: Natural): int {.inline.} =
|
||||
## Return the value of `initialSize` to support `count` items.
|
||||
##
|
||||
## If more items are expected to be added, simply add that
|
||||
## expected extra amount to the parameter before calling this.
|
||||
##
|
||||
## Internally, we want `mustRehash(t) == false` for t that was just resized.
|
||||
# Make sure to synchronize with `mustRehash`
|
||||
result = nextPowerOfTwo(count * 3 div 2 + 4)
|
||||
|
||||
template getPerturb(t: typed, hc: Hash): UHash =
|
||||
# we can't use `fastLog2(dataLen(t))` because importing `bitops` would cause codegen errors
|
||||
# so we use a practical value of half the bit width (eg 64 / 2 = 32 on 64bit machines)
|
||||
let numBitsMask = sizeof(Hash) * 4 # ie, sizeof(Hash) * 8 / 2
|
||||
# this makes a major difference for cases like #13393; it causes the bits
|
||||
# that were masked out in 1st position so they'll be masked in instead, and
|
||||
# influence the recursion in nextTry earlier rather than later.
|
||||
translateBits(cast[uint](hc), numBitsMask)
|
||||
|
||||
template rawGetKnownHCImpl() {.dirty.} =
|
||||
if t.dataLen == 0:
|
||||
return -1
|
||||
var h: Hash = hc and maxHash(t) # start with real hash value
|
||||
while isFilled(t.data[h].hcode):
|
||||
# Compare hc THEN key with boolean short circuit. This makes the common case
|
||||
# zero ==key's for missing (e.g.inserts) and exactly one ==key for present.
|
||||
# It does slow down succeeding lookups by one extra Hash cmp&and..usually
|
||||
# just a few clock cycles, generally worth it for any non-integer-like A.
|
||||
if t.data[h].hcode == hc and t.data[h].key == key:
|
||||
return h
|
||||
h = nextTry(h, maxHash(t))
|
||||
result = -1 - h # < 0 => MISSING; insert idx = -1 - result
|
||||
var perturb = t.getPerturb(hc)
|
||||
var deletedIndex = -1
|
||||
while true:
|
||||
if isFilledAndValid(t.data[h].hcode):
|
||||
# Compare hc THEN key with boolean short circuit. This makes the common case
|
||||
# zero ==key's for missing (e.g.inserts) and exactly one ==key for present.
|
||||
# It does slow down succeeding lookups by one extra Hash cmp&and..usually
|
||||
# just a few clock cycles, generally worth it for any non-integer-like A.
|
||||
# performance: we optimize this: depending on type(key), skip hc comparison
|
||||
if t.data[h].hcode == hc and t.data[h].key == key:
|
||||
return h
|
||||
h = nextTry(h, maxHash(t), perturb)
|
||||
elif t.data[h].hcode == deletedMarker:
|
||||
if deletedIndex == -1:
|
||||
deletedIndex = h
|
||||
h = nextTry(h, maxHash(t), perturb)
|
||||
else:
|
||||
break
|
||||
if deletedIndex == -1:
|
||||
result = -1 - h # < 0 => MISSING; insert idx = -1 - result
|
||||
else:
|
||||
# we prefer returning a (in fact the 1st found) deleted index
|
||||
result = -1 - deletedIndex
|
||||
|
||||
proc rawGetKnownHC[X, A](t: X, key: A, hc: Hash): int {.inline.} =
|
||||
rawGetKnownHCImpl()
|
||||
@@ -54,6 +107,8 @@ template genHashImpl(key, hc: typed) =
|
||||
hc = hash(key)
|
||||
if hc == 0: # This almost never taken branch should be very predictable.
|
||||
hc = 314159265 # Value doesn't matter; Any non-zero favorite is fine.
|
||||
elif hc == deletedMarker:
|
||||
hc = 214159261
|
||||
|
||||
template genHash(key: typed): Hash =
|
||||
var res: Hash
|
||||
|
||||
@@ -46,30 +46,40 @@ type
|
||||
IntSet* = object ## An efficient set of `int` implemented as a sparse bit set.
|
||||
elems: int # only valid for small numbers
|
||||
counter, max: int
|
||||
countDeleted: int
|
||||
head: PTrunk
|
||||
data: TrunkSeq
|
||||
a: array[0..33, int] # profiling shows that 34 elements are enough
|
||||
|
||||
proc mustRehash(length, counter: int): bool {.inline.} =
|
||||
assert(length > counter)
|
||||
result = (length * 2 < counter * 3) or (length - counter < 4)
|
||||
proc mustRehash[T](t: T): bool {.inline.} =
|
||||
# FACTOR between hashcommon.mustRehash, intsets.mustRehash
|
||||
let counter2 = t.counter + t.countDeleted
|
||||
let length = t.max + 1
|
||||
assert length > counter2
|
||||
result = (length * 2 < counter2 * 3) or (length - counter2 < 4)
|
||||
|
||||
proc nextTry(h, maxHash: Hash): Hash {.inline.} =
|
||||
result = ((5 * h) + 1) and maxHash
|
||||
proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} =
|
||||
# FACTOR between hashcommon.nextTry, intsets.nextTry
|
||||
const PERTURB_SHIFT = 5
|
||||
var perturb2 = cast[uint](perturb) shr PERTURB_SHIFT
|
||||
perturb = cast[Hash](perturb2)
|
||||
result = ((5*h) + 1 + perturb) and maxHash
|
||||
|
||||
proc intSetGet(t: IntSet, key: int): PTrunk =
|
||||
var h = key and t.max
|
||||
var perturb = key
|
||||
while t.data[h] != nil:
|
||||
if t.data[h].key == key:
|
||||
return t.data[h]
|
||||
h = nextTry(h, t.max)
|
||||
h = nextTry(h, t.max, perturb)
|
||||
result = nil
|
||||
|
||||
proc intSetRawInsert(t: IntSet, data: var TrunkSeq, desc: PTrunk) =
|
||||
var h = desc.key and t.max
|
||||
var perturb = desc.key
|
||||
while data[h] != nil:
|
||||
assert(data[h] != desc)
|
||||
h = nextTry(h, t.max)
|
||||
h = nextTry(h, t.max, perturb)
|
||||
assert(data[h] == nil)
|
||||
data[h] = desc
|
||||
|
||||
@@ -84,14 +94,16 @@ proc intSetEnlarge(t: var IntSet) =
|
||||
|
||||
proc intSetPut(t: var IntSet, key: int): PTrunk =
|
||||
var h = key and t.max
|
||||
var perturb = key
|
||||
while t.data[h] != nil:
|
||||
if t.data[h].key == key:
|
||||
return t.data[h]
|
||||
h = nextTry(h, t.max)
|
||||
if mustRehash(t.max + 1, t.counter): intSetEnlarge(t)
|
||||
h = nextTry(h, t.max, perturb)
|
||||
if mustRehash(t): intSetEnlarge(t)
|
||||
inc(t.counter)
|
||||
h = key and t.max
|
||||
while t.data[h] != nil: h = nextTry(h, t.max)
|
||||
perturb = key
|
||||
while t.data[h] != nil: h = nextTry(h, t.max, perturb)
|
||||
assert(t.data[h] == nil)
|
||||
new(result)
|
||||
result.next = t.head
|
||||
@@ -100,6 +112,7 @@ proc intSetPut(t: var IntSet, key: int): PTrunk =
|
||||
t.data[h] = result
|
||||
|
||||
proc bitincl(s: var IntSet, key: int) {.inline.} =
|
||||
var ret: PTrunk
|
||||
var t = intSetPut(s, `shr`(key, TrunkShift))
|
||||
var u = key and TrunkMask
|
||||
t.bits[u shr IntShift] = t.bits[u shr IntShift] or
|
||||
@@ -393,7 +406,8 @@ proc assign*(dest: var IntSet, src: IntSet) =
|
||||
var it = src.head
|
||||
while it != nil:
|
||||
var h = it.key and dest.max
|
||||
while dest.data[h] != nil: h = nextTry(h, dest.max)
|
||||
var perturb = it.key
|
||||
while dest.data[h] != nil: h = nextTry(h, dest.max, perturb)
|
||||
assert(dest.data[h] == nil)
|
||||
var n: PTrunk
|
||||
new(n)
|
||||
|
||||
@@ -48,7 +48,7 @@ template inclImpl() {.dirty.} =
|
||||
var hc: Hash
|
||||
var index = rawGet(s, key, hc)
|
||||
if index < 0:
|
||||
if mustRehash(len(s.data), s.counter):
|
||||
if mustRehash(s):
|
||||
enlarge(s)
|
||||
index = rawGetKnownHC(s, key, hc)
|
||||
rawInsert(s, s.data, key, hc, -1 - index)
|
||||
@@ -62,17 +62,12 @@ template containsOrInclImpl() {.dirty.} =
|
||||
if index >= 0:
|
||||
result = true
|
||||
else:
|
||||
if mustRehash(len(s.data), s.counter):
|
||||
if mustRehash(s):
|
||||
enlarge(s)
|
||||
index = rawGetKnownHC(s, key, hc)
|
||||
rawInsert(s, s.data, key, hc, -1 - index)
|
||||
inc(s.counter)
|
||||
|
||||
template doWhile(a, b) =
|
||||
while true:
|
||||
b
|
||||
if not a: break
|
||||
|
||||
proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} =
|
||||
var hc: Hash
|
||||
var i = rawGet(s, key, hc)
|
||||
@@ -82,17 +77,9 @@ proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} =
|
||||
if i >= 0:
|
||||
result = false
|
||||
dec(s.counter)
|
||||
while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1
|
||||
var j = i # The correctness of this depends on (h+1) in nextTry,
|
||||
var r = j # though may be adaptable to other simple sequences.
|
||||
s.data[i].hcode = 0 # mark current EMPTY
|
||||
s.data[i].key = default(type(s.data[i].key))
|
||||
doWhile((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)):
|
||||
i = (i + 1) and msk # increment mod table size
|
||||
if isEmpty(s.data[i].hcode): # end of collision cluster; So all done
|
||||
return
|
||||
r = s.data[i].hcode and msk # "home" location of key@i
|
||||
s.data[j] = move(s.data[i]) # data[i] will be marked EMPTY next loop
|
||||
inc(s.countDeleted)
|
||||
s.data[i].hcode = deletedMarker
|
||||
s.data[i].key = default(type(s.data[i].key))
|
||||
|
||||
template dollarImpl() {.dirty.} =
|
||||
result = "{"
|
||||
|
||||
@@ -68,6 +68,7 @@ type
|
||||
## before calling other procs on it.
|
||||
data: KeyValuePairSeq[A]
|
||||
counter: int
|
||||
countDeleted: int
|
||||
|
||||
type
|
||||
OrderedKeyValuePair[A] = tuple[
|
||||
@@ -80,15 +81,13 @@ type
|
||||
## <#initOrderedSet,int>`_ before calling other procs on it.
|
||||
data: OrderedKeyValuePairSeq[A]
|
||||
counter, first, last: int
|
||||
countDeleted: int
|
||||
|
||||
const
|
||||
defaultInitialSize* = 64
|
||||
|
||||
include setimpl
|
||||
|
||||
proc rightSize*(count: Natural): int {.inline.}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# ------------------------------ HashSet ------------------------------
|
||||
# ---------------------------------------------------------------------
|
||||
@@ -250,7 +249,7 @@ iterator items*[A](s: HashSet[A]): A =
|
||||
## echo b
|
||||
## # --> {(a: 1, b: 3), (a: 0, b: 4)}
|
||||
for h in 0 .. high(s.data):
|
||||
if isFilled(s.data[h].hcode): yield s.data[h].key
|
||||
if isFilledAndValid(s.data[h].hcode): yield s.data[h].key
|
||||
|
||||
proc containsOrIncl*[A](s: var HashSet[A], key: A): bool =
|
||||
## Includes `key` in the set `s` and tells if `key` was already in `s`.
|
||||
@@ -342,7 +341,7 @@ proc pop*[A](s: var HashSet[A]): A =
|
||||
doAssertRaises(KeyError, echo s.pop)
|
||||
|
||||
for h in 0 .. high(s.data):
|
||||
if isFilled(s.data[h].hcode):
|
||||
if isFilledAndValid(s.data[h].hcode):
|
||||
result = s.data[h].key
|
||||
excl(s, result)
|
||||
return result
|
||||
@@ -593,16 +592,6 @@ proc `$`*[A](s: HashSet[A]): string =
|
||||
## # --> {no, esc'aping, is " provided}
|
||||
dollarImpl()
|
||||
|
||||
proc rightSize*(count: Natural): int {.inline.} =
|
||||
## Return the value of `initialSize` to support `count` items.
|
||||
##
|
||||
## If more items are expected to be added, simply add that
|
||||
## expected extra amount to the parameter before calling this.
|
||||
##
|
||||
## Internally, we want `mustRehash(rightSize(x), x) == false`.
|
||||
result = nextPowerOfTwo(count * 3 div 2 + 4)
|
||||
|
||||
|
||||
proc initSet*[A](initialSize = defaultInitialSize): HashSet[A] {.deprecated:
|
||||
"Deprecated since v0.20, use 'initHashSet'".} = initHashSet[A](initialSize)
|
||||
|
||||
@@ -634,7 +623,7 @@ template forAllOrderedPairs(yieldStmt: untyped) {.dirty.} =
|
||||
var idx = 0
|
||||
while h >= 0:
|
||||
var nxt = s.data[h].next
|
||||
if isFilled(s.data[h].hcode):
|
||||
if isFilledAndValid(s.data[h].hcode):
|
||||
yieldStmt
|
||||
inc(idx)
|
||||
h = nxt
|
||||
@@ -868,7 +857,7 @@ proc `==`*[A](s, t: OrderedSet[A]): bool =
|
||||
while h >= 0 and g >= 0:
|
||||
var nxh = s.data[h].next
|
||||
var nxg = t.data[g].next
|
||||
if isFilled(s.data[h].hcode) and isFilled(t.data[g].hcode):
|
||||
if isFilledAndValid(s.data[h].hcode) and isFilledAndValid(t.data[g].hcode):
|
||||
if s.data[h].key == t.data[g].key:
|
||||
inc compared
|
||||
else:
|
||||
@@ -1146,10 +1135,19 @@ when isMainModule and not defined(release):
|
||||
b.incl(2)
|
||||
assert b.len == 1
|
||||
|
||||
for i in 0 .. 32:
|
||||
var s = rightSize(i)
|
||||
if s <= i or mustRehash(s, i):
|
||||
echo "performance issue: rightSize() will not elide enlarge() at ", i
|
||||
block:
|
||||
type FakeTable = object
|
||||
dataLen: int
|
||||
counter: int
|
||||
countDeleted: int
|
||||
|
||||
var t: FakeTable
|
||||
for i in 0 .. 32:
|
||||
var s = rightSize(i)
|
||||
t.dataLen = s
|
||||
t.counter = i
|
||||
doAssert s > i and not mustRehash(t),
|
||||
"performance issue: rightSize() will not elide enlarge() at: " & $i
|
||||
|
||||
block missingOrExcl:
|
||||
var s = toOrderedSet([2, 3, 6, 7])
|
||||
|
||||
@@ -25,6 +25,7 @@ type
|
||||
SharedTable*[A, B] = object ## generic hash SharedTable
|
||||
data: KeyValuePairSeq[A, B]
|
||||
counter, dataLen: int
|
||||
countDeleted: int
|
||||
lock: Lock
|
||||
|
||||
template maxHash(t): untyped = t.dataLen-1
|
||||
@@ -32,7 +33,7 @@ template maxHash(t): untyped = t.dataLen-1
|
||||
include tableimpl
|
||||
|
||||
template st_maybeRehashPutImpl(enlarge) {.dirty.} =
|
||||
if mustRehash(t.dataLen, t.counter):
|
||||
if mustRehash(t):
|
||||
enlarge(t)
|
||||
index = rawGetKnownHC(t, key, hc)
|
||||
index = -1 - index # important to transform for mgetOrPutImpl
|
||||
@@ -49,9 +50,10 @@ proc enlarge[A, B](t: var SharedTable[A, B]) =
|
||||
for i in 0..<oldSize:
|
||||
let eh = n[i].hcode
|
||||
if isFilled(eh):
|
||||
var perturb = t.getPerturb(eh)
|
||||
var j: Hash = eh and maxHash(t)
|
||||
while isFilled(t.data[j].hcode):
|
||||
j = nextTry(j, maxHash(t))
|
||||
j = nextTry(j, maxHash(t), perturb)
|
||||
rawInsert(t, t.data, n[i].key, n[i].val, eh, j)
|
||||
deallocShared(n)
|
||||
|
||||
|
||||
@@ -14,13 +14,20 @@ include hashcommon
|
||||
template rawGetDeepImpl() {.dirty.} = # Search algo for unconditional add
|
||||
genHashImpl(key, hc)
|
||||
var h: Hash = hc and maxHash(t)
|
||||
while isFilled(t.data[h].hcode):
|
||||
h = nextTry(h, maxHash(t))
|
||||
var perturb = t.getPerturb(hc)
|
||||
while true:
|
||||
let hcode = t.data[h].hcode
|
||||
if hcode == deletedMarker or hcode == freeMarker:
|
||||
break
|
||||
else:
|
||||
h = nextTry(h, maxHash(t), perturb)
|
||||
result = h
|
||||
|
||||
template rawInsertImpl() {.dirty.} =
|
||||
template rawInsertImpl(t) {.dirty.} =
|
||||
data[h].key = key
|
||||
data[h].val = val
|
||||
if data[h].hcode == deletedMarker:
|
||||
t.countDeleted.dec
|
||||
data[h].hcode = hc
|
||||
|
||||
proc rawGetDeep[X, A](t: X, key: A, hc: var Hash): int {.inline.} =
|
||||
@@ -28,7 +35,7 @@ proc rawGetDeep[X, A](t: X, key: A, hc: var Hash): int {.inline.} =
|
||||
|
||||
proc rawInsert[X, A, B](t: var X, data: var KeyValuePairSeq[A, B],
|
||||
key: A, val: B, hc: Hash, h: Hash) =
|
||||
rawInsertImpl()
|
||||
rawInsertImpl(t)
|
||||
|
||||
template checkIfInitialized() =
|
||||
when compiles(defaultInitialSize):
|
||||
@@ -37,15 +44,14 @@ template checkIfInitialized() =
|
||||
|
||||
template addImpl(enlarge) {.dirty.} =
|
||||
checkIfInitialized()
|
||||
if mustRehash(t.dataLen, t.counter): enlarge(t)
|
||||
if mustRehash(t): enlarge(t)
|
||||
var hc: Hash
|
||||
var j = rawGetDeep(t, key, hc)
|
||||
rawInsert(t, t.data, key, val, hc, j)
|
||||
inc(t.counter)
|
||||
|
||||
template maybeRehashPutImpl(enlarge) {.dirty.} =
|
||||
checkIfInitialized()
|
||||
if mustRehash(t.dataLen, t.counter):
|
||||
if mustRehash(t):
|
||||
enlarge(t)
|
||||
index = rawGetKnownHC(t, key, hc)
|
||||
index = -1 - index # important to transform for mgetOrPutImpl
|
||||
@@ -82,24 +88,11 @@ template delImplIdx(t, i) =
|
||||
let msk = maxHash(t)
|
||||
if i >= 0:
|
||||
dec(t.counter)
|
||||
block outer:
|
||||
while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1
|
||||
var j = i # The correctness of this depends on (h+1) in nextTry,
|
||||
var r = j # though may be adaptable to other simple sequences.
|
||||
t.data[i].hcode = 0 # mark current EMPTY
|
||||
t.data[i].key = default(type(t.data[i].key))
|
||||
t.data[i].val = default(type(t.data[i].val))
|
||||
while true:
|
||||
i = (i + 1) and msk # increment mod table size
|
||||
if isEmpty(t.data[i].hcode): # end of collision cluster; So all done
|
||||
break outer
|
||||
r = t.data[i].hcode and msk # "home" location of key@i
|
||||
if not ((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)):
|
||||
break
|
||||
when defined(js):
|
||||
t.data[j] = t.data[i]
|
||||
else:
|
||||
t.data[j] = move(t.data[i]) # data[j] will be marked EMPTY next loop
|
||||
inc(t.countDeleted)
|
||||
t.data[i].hcode = deletedMarker
|
||||
t.data[i].key = default(type(t.data[i].key))
|
||||
t.data[i].val = default(type(t.data[i].val))
|
||||
# mustRehash + enlarge not needed because counter+countDeleted doesn't change
|
||||
|
||||
template delImpl() {.dirty.} =
|
||||
var hc: Hash
|
||||
@@ -123,8 +116,8 @@ template initImpl(result: typed, size: int) =
|
||||
result.last = -1
|
||||
|
||||
template insertImpl() = # for CountTable
|
||||
if t.dataLen == 0: initImpl(t, defaultInitialSize)
|
||||
if mustRehash(len(t.data), t.counter): enlarge(t)
|
||||
checkIfInitialized()
|
||||
if mustRehash(t): enlarge(t)
|
||||
ctRawInsert(t, t.data, key, val)
|
||||
inc(t.counter)
|
||||
|
||||
|
||||
@@ -233,6 +233,7 @@ type
|
||||
## For creating an empty Table, use `initTable proc<#initTable,int>`_.
|
||||
data: KeyValuePairSeq[A, B]
|
||||
counter: int
|
||||
countDeleted: int
|
||||
TableRef*[A, B] = ref Table[A, B] ## Ref version of `Table<#Table>`_.
|
||||
##
|
||||
## For creating a new empty TableRef, use `newTable proc
|
||||
@@ -250,8 +251,6 @@ template dataLen(t): untyped = len(t.data)
|
||||
|
||||
include tableimpl
|
||||
|
||||
proc rightSize*(count: Natural): int {.inline.}
|
||||
|
||||
template get(t, key): untyped =
|
||||
## retrieves the value at ``t[key]``. The value can be modified.
|
||||
## If ``key`` is not in ``t``, the ``KeyError`` exception is raised.
|
||||
@@ -267,14 +266,16 @@ template get(t, key): untyped =
|
||||
|
||||
proc enlarge[A, B](t: var Table[A, B]) =
|
||||
var n: KeyValuePairSeq[A, B]
|
||||
newSeq(n, len(t.data) * growthFactor)
|
||||
newSeq(n, t.counter.rightSize)
|
||||
swap(t.data, n)
|
||||
t.countDeleted = 0
|
||||
for i in countup(0, high(n)):
|
||||
let eh = n[i].hcode
|
||||
if isFilled(eh):
|
||||
if isFilledAndValid(eh):
|
||||
var j: Hash = eh and maxHash(t)
|
||||
var perturb = t.getPerturb(eh)
|
||||
while isFilled(t.data[j].hcode):
|
||||
j = nextTry(j, maxHash(t))
|
||||
j = nextTry(j, maxHash(t), perturb)
|
||||
when defined(js):
|
||||
rawInsert(t, t.data, n[i].key, n[i].val, eh, j)
|
||||
else:
|
||||
@@ -579,15 +580,6 @@ proc `==`*[A, B](s, t: Table[A, B]): bool =
|
||||
|
||||
equalsImpl(s, t)
|
||||
|
||||
proc rightSize*(count: Natural): int {.inline.} =
|
||||
## Return the value of ``initialSize`` to support ``count`` items.
|
||||
##
|
||||
## If more items are expected to be added, simply add that
|
||||
## expected extra amount to the parameter before calling this.
|
||||
##
|
||||
## Internally, we want mustRehash(rightSize(x), x) == false.
|
||||
result = nextPowerOfTwo(count * 3 div 2 + 4)
|
||||
|
||||
proc indexBy*[A, B, C](collection: A, index: proc(x: B): C): Table[C, B] =
|
||||
## Index the collection with the proc provided.
|
||||
# TODO: As soon as supported, change collection: A to collection: A[B]
|
||||
@@ -670,7 +662,7 @@ iterator pairs*[A, B](t: Table[A, B]): (A, B) =
|
||||
## # value: [1, 5, 7, 9]
|
||||
let L = len(t)
|
||||
for h in 0 .. high(t.data):
|
||||
if isFilled(t.data[h].hcode):
|
||||
if isFilledAndValid(t.data[h].hcode):
|
||||
yield (t.data[h].key, t.data[h].val)
|
||||
assert(len(t) == L, "the length of the table changed while iterating over it")
|
||||
|
||||
@@ -692,7 +684,7 @@ iterator mpairs*[A, B](t: var Table[A, B]): (A, var B) =
|
||||
|
||||
let L = len(t)
|
||||
for h in 0 .. high(t.data):
|
||||
if isFilled(t.data[h].hcode):
|
||||
if isFilledAndValid(t.data[h].hcode):
|
||||
yield (t.data[h].key, t.data[h].val)
|
||||
assert(len(t) == L, "the length of the table changed while iterating over it")
|
||||
|
||||
@@ -713,7 +705,7 @@ iterator keys*[A, B](t: Table[A, B]): A =
|
||||
|
||||
let L = len(t)
|
||||
for h in 0 .. high(t.data):
|
||||
if isFilled(t.data[h].hcode):
|
||||
if isFilledAndValid(t.data[h].hcode):
|
||||
yield t.data[h].key
|
||||
assert(len(t) == L, "the length of the table changed while iterating over it")
|
||||
|
||||
@@ -734,7 +726,7 @@ iterator values*[A, B](t: Table[A, B]): B =
|
||||
|
||||
let L = len(t)
|
||||
for h in 0 .. high(t.data):
|
||||
if isFilled(t.data[h].hcode):
|
||||
if isFilledAndValid(t.data[h].hcode):
|
||||
yield t.data[h].val
|
||||
assert(len(t) == L, "the length of the table changed while iterating over it")
|
||||
|
||||
@@ -756,36 +748,53 @@ iterator mvalues*[A, B](t: var Table[A, B]): var B =
|
||||
|
||||
let L = len(t)
|
||||
for h in 0 .. high(t.data):
|
||||
if isFilled(t.data[h].hcode):
|
||||
if isFilledAndValid(t.data[h].hcode):
|
||||
yield t.data[h].val
|
||||
assert(len(t) == L, "the length of the table changed while iterating over it")
|
||||
|
||||
template hasKeyOrPutCache(cache, h): bool =
|
||||
# using `IntSet` would be an option worth considering to avoid quadratic
|
||||
# behavior in case user misuses Table with lots of duplicate keys; but it
|
||||
# has overhead in the common case of small number of duplicates.
|
||||
# However: when lots of duplicates are used, all operations would be slow
|
||||
# anyway because the full `hash(key)` is identical for these, which makes
|
||||
# `nextTry` follow the exact same path for each key, resulting in large
|
||||
# collision clusters. Alternatives could involve modifying the hash/retrieval
|
||||
# based on duplicate key count.
|
||||
var ret = false
|
||||
for hi in cache:
|
||||
if hi == h:
|
||||
ret = true
|
||||
break
|
||||
if not ret: cache.add h
|
||||
ret
|
||||
|
||||
iterator allValues*[A, B](t: Table[A, B]; key: A): B =
|
||||
## Iterates over any value in the table ``t`` that belongs to the given ``key``.
|
||||
##
|
||||
## Used if you have a table with duplicate keys (as a result of using
|
||||
## `add proc<#add,Table[A,B],A,B>`_).
|
||||
##
|
||||
## **Examples:**
|
||||
##
|
||||
## .. code-block::
|
||||
## var a = {'a': 3, 'b': 5}.toTable
|
||||
## for i in 1..3:
|
||||
## a.add('z', 10*i)
|
||||
## echo a # {'a': 3, 'b': 5, 'z': 10, 'z': 20, 'z': 30}
|
||||
##
|
||||
## for v in a.allValues('z'):
|
||||
## echo v
|
||||
## # 10
|
||||
## # 20
|
||||
## # 30
|
||||
var h: Hash = genHash(key) and high(t.data)
|
||||
runnableExamples:
|
||||
import testutils
|
||||
var a = {'a': 3, 'b': 5}.toTable
|
||||
for i in 1..3: a.add('z', 10*i)
|
||||
doAssert a.sortedPairs == @[('a', 3), ('b', 5), ('z', 10), ('z', 20), ('z', 30)]
|
||||
doAssert sortedItems(a.allValues('z')) == @[10, 20, 30]
|
||||
|
||||
let hc = genHash(key)
|
||||
var h: Hash = hc and high(t.data)
|
||||
let L = len(t)
|
||||
while isFilled(t.data[h].hcode):
|
||||
if t.data[h].key == key:
|
||||
yield t.data[h].val
|
||||
assert(len(t) == L, "the length of the table changed while iterating over it")
|
||||
h = nextTry(h, high(t.data))
|
||||
var perturb = t.getPerturb(hc)
|
||||
|
||||
var num = 0
|
||||
var cache: seq[Hash]
|
||||
while isFilled(t.data[h].hcode): # `isFilledAndValid` would be incorrect, see test for `allValues`
|
||||
if t.data[h].hcode == hc and t.data[h].key == key:
|
||||
if not hasKeyOrPutCache(cache, h):
|
||||
yield t.data[h].val
|
||||
assert(len(t) == L, "the length of the table changed while iterating over it")
|
||||
h = nextTry(h, high(t.data), perturb)
|
||||
|
||||
|
||||
|
||||
@@ -1219,6 +1228,8 @@ type
|
||||
## <#initOrderedTable,int>`_.
|
||||
data: OrderedKeyValuePairSeq[A, B]
|
||||
counter, first, last: int
|
||||
countDeleted: int
|
||||
|
||||
OrderedTableRef*[A, B] = ref OrderedTable[A, B] ## Ref version of
|
||||
## `OrderedTable<#OrderedTable>`_.
|
||||
##
|
||||
@@ -1240,7 +1251,7 @@ proc rawGet[A, B](t: OrderedTable[A, B], key: A, hc: var Hash): int =
|
||||
proc rawInsert[A, B](t: var OrderedTable[A, B],
|
||||
data: var OrderedKeyValuePairSeq[A, B],
|
||||
key: A, val: B, hc: Hash, h: Hash) =
|
||||
rawInsertImpl()
|
||||
rawInsertImpl(t)
|
||||
data[h].next = -1
|
||||
if t.first < 0: t.first = h
|
||||
if t.last >= 0: data[t.last].next = h
|
||||
@@ -1248,18 +1259,20 @@ proc rawInsert[A, B](t: var OrderedTable[A, B],
|
||||
|
||||
proc enlarge[A, B](t: var OrderedTable[A, B]) =
|
||||
var n: OrderedKeyValuePairSeq[A, B]
|
||||
newSeq(n, len(t.data) * growthFactor)
|
||||
newSeq(n, t.counter.rightSize)
|
||||
var h = t.first
|
||||
t.first = -1
|
||||
t.last = -1
|
||||
swap(t.data, n)
|
||||
t.countDeleted = 0
|
||||
while h >= 0:
|
||||
var nxt = n[h].next
|
||||
let eh = n[h].hcode
|
||||
if isFilled(eh):
|
||||
if isFilledAndValid(eh):
|
||||
var j: Hash = eh and maxHash(t)
|
||||
var perturb = t.getPerturb(eh)
|
||||
while isFilled(t.data[j].hcode):
|
||||
j = nextTry(j, maxHash(t))
|
||||
j = nextTry(j, maxHash(t), perturb)
|
||||
rawInsert(t, t.data, n[h].key, n[h].val, n[h].hcode, j)
|
||||
h = nxt
|
||||
|
||||
@@ -2211,6 +2224,7 @@ type
|
||||
## <#initCountTable,int>`_.
|
||||
data: seq[tuple[key: A, val: int]]
|
||||
counter: int
|
||||
countDeleted: int
|
||||
isSorted: bool
|
||||
CountTableRef*[A] = ref CountTable[A] ## Ref version of
|
||||
## `CountTable<#CountTable>`_.
|
||||
@@ -2223,8 +2237,10 @@ type
|
||||
|
||||
proc ctRawInsert[A](t: CountTable[A], data: var seq[tuple[key: A, val: int]],
|
||||
key: A, val: int) =
|
||||
var h: Hash = hash(key) and high(data)
|
||||
while data[h].val != 0: h = nextTry(h, high(data))
|
||||
let hc = hash(key)
|
||||
var perturb = t.getPerturb(hc)
|
||||
var h: Hash = hc and high(data)
|
||||
while data[h].val != 0: h = nextTry(h, high(data), perturb) # TODO: handle deletedMarker
|
||||
data[h].key = key
|
||||
data[h].val = val
|
||||
|
||||
@@ -2251,10 +2267,12 @@ proc remove[A](t: var CountTable[A], key: A) =
|
||||
proc rawGet[A](t: CountTable[A], key: A): int =
|
||||
if t.data.len == 0:
|
||||
return -1
|
||||
var h: Hash = hash(key) and high(t.data) # start with real hash value
|
||||
while t.data[h].val != 0:
|
||||
let hc = hash(key)
|
||||
var perturb = t.getPerturb(hc)
|
||||
var h: Hash = hc and high(t.data) # start with real hash value
|
||||
while t.data[h].val != 0: # TODO: may need to handle t.data[h].hcode == deletedMarker?
|
||||
if t.data[h].key == key: return h
|
||||
h = nextTry(h, high(t.data))
|
||||
h = nextTry(h, high(t.data), perturb)
|
||||
result = -1 - h # < 0 => MISSING; insert idx = -1 - result
|
||||
|
||||
template ctget(t, key, default: untyped): untyped =
|
||||
|
||||
@@ -112,38 +112,14 @@ proc hash*[T: proc](x: T): Hash {.inline.} =
|
||||
else:
|
||||
result = hash(pointer(x))
|
||||
|
||||
const
|
||||
prime = uint(11)
|
||||
|
||||
proc hash*(x: int): Hash {.inline.} =
|
||||
proc hash*(x: int|int64|uint|uint64|char|Ordinal): Hash {.inline.} =
|
||||
## Efficient hashing of integers.
|
||||
result = cast[Hash](cast[uint](x) * prime)
|
||||
|
||||
proc hash*(x: int64): Hash {.inline.} =
|
||||
## Efficient hashing of `int64` integers.
|
||||
result = cast[Hash](cast[uint](x) * prime)
|
||||
|
||||
proc hash*(x: uint): Hash {.inline.} =
|
||||
## Efficient hashing of unsigned integers.
|
||||
result = cast[Hash](x * prime)
|
||||
|
||||
proc hash*(x: uint64): Hash {.inline.} =
|
||||
## Efficient hashing of `uint64` integers.
|
||||
result = cast[Hash](cast[uint](x) * prime)
|
||||
|
||||
proc hash*(x: char): Hash {.inline.} =
|
||||
## Efficient hashing of characters.
|
||||
result = cast[Hash](cast[uint](ord(x)) * prime)
|
||||
|
||||
proc hash*[T: Ordinal](x: T): Hash {.inline.} =
|
||||
## Efficient hashing of other ordinal types (e.g. enums).
|
||||
result = cast[Hash](cast[uint](ord(x)) * prime)
|
||||
cast[Hash](ord(x))
|
||||
|
||||
proc hash*(x: float): Hash {.inline.} =
|
||||
## Efficient hashing of floats.
|
||||
var y = x + 1.0
|
||||
result = cast[ptr Hash](addr(y))[]
|
||||
|
||||
var y = x + 0.0 # for denormalization
|
||||
result = hash(cast[ptr Hash](addr(y))[])
|
||||
|
||||
# Forward declarations before methods that hash containers. This allows
|
||||
# containers to contain other containers
|
||||
|
||||
15
lib/pure/testutils.nim
Normal file
15
lib/pure/testutils.nim
Normal file
@@ -0,0 +1,15 @@
|
||||
## Utilities to help writing tests
|
||||
##
|
||||
## Unstable, experimental API.
|
||||
|
||||
import std/[sequtils, algorithm]
|
||||
|
||||
proc sortedPairs*[T](t: T): auto =
|
||||
## helps when writing tests involving tables in a way that's robust to
|
||||
## changes in hashing functions / table implementation.
|
||||
toSeq(t.pairs).sorted
|
||||
|
||||
template sortedItems*(t: untyped): untyped =
|
||||
## helps when writing tests involving tables in a way that's robust to
|
||||
## changes in hashing functions / table implementation.
|
||||
sorted(toSeq(t))
|
||||
@@ -1,7 +1,7 @@
|
||||
discard """
|
||||
cmd: "nim c --gc:arc $file"
|
||||
nimout: '''(a: true, n: doAssert)
|
||||
Table[system.string, trepr.MyType](data: @[], counter: 0)
|
||||
Table[system.string, trepr.MyType](data: @[], counter: 0, countDeleted: 0)
|
||||
nil
|
||||
'''
|
||||
"""
|
||||
|
||||
@@ -8,7 +8,12 @@ And we get here
|
||||
'''
|
||||
joinable: false
|
||||
"""
|
||||
import hashes, sequtils, tables, algorithm
|
||||
import hashes, sequtils, tables, algorithm, testutils
|
||||
|
||||
block tableDollar:
|
||||
# other tests should use `sortedPairs` to be robust to future table/hash
|
||||
# implementation changes
|
||||
doAssert ${1: 'a', 2: 'b'}.toTable in ["{1: 'a', 2: 'b'}", "{2: 'b', 1: 'a'}"]
|
||||
|
||||
# test should not be joined because it takes too long.
|
||||
block tableadds:
|
||||
@@ -188,6 +193,23 @@ block ttables2:
|
||||
run1()
|
||||
echo "2"
|
||||
|
||||
block allValues:
|
||||
var t: Table[int, string]
|
||||
var key = 0
|
||||
let n = 1000
|
||||
for i in 0..<n: t.add(i, $i)
|
||||
const keys = [0, -1, 12]
|
||||
for i in 0..1:
|
||||
for key in keys:
|
||||
t.add(key, $key & ":" & $i)
|
||||
for i in 0..<n:
|
||||
if i notin keys:
|
||||
t.del(i)
|
||||
doAssert t.sortedPairs == @[(-1, "-1:0"), (-1, "-1:1"), (0, "0"), (0, "0:0"), (0, "0:1"), (12, "12"), (12, "12:0"), (12, "12:1")]
|
||||
doAssert sortedItems(t.allValues(0)) == @["0", "0:0", "0:1"]
|
||||
doAssert sortedItems(t.allValues(-1)) == @["-1:0", "-1:1"]
|
||||
doAssert sortedItems(t.allValues(12)) == @["12", "12:0", "12:1"]
|
||||
doAssert sortedItems(t.allValues(1)) == @[]
|
||||
|
||||
block tablesref:
|
||||
const
|
||||
@@ -232,8 +254,8 @@ block tablesref:
|
||||
for x in 0..1:
|
||||
for y in 0..1:
|
||||
assert t[(x,y)] == $x & $y
|
||||
assert($t ==
|
||||
"{(x: 1, y: 1): \"11\", (x: 0, y: 0): \"00\", (x: 0, y: 1): \"01\", (x: 1, y: 0): \"10\"}")
|
||||
assert t.sortedPairs ==
|
||||
@[((x: 0, y: 0), "00"), ((x: 0, y: 1), "01"), ((x: 1, y: 0), "10"), ((x: 1, y: 1), "11")]
|
||||
|
||||
block tableTest2:
|
||||
var t = newTable[string, float]()
|
||||
@@ -340,7 +362,7 @@ block tablesref:
|
||||
block anonZipTest:
|
||||
let keys = @['a','b','c']
|
||||
let values = @[1, 2, 3]
|
||||
doAssert "{'c': 3, 'a': 1, 'b': 2}" == $ toTable zip(keys, values)
|
||||
doAssert zip(keys, values).toTable.sortedPairs == @[('a', 1), ('b', 2), ('c', 3)]
|
||||
|
||||
block clearTableTest:
|
||||
var t = newTable[string, float]()
|
||||
@@ -369,3 +391,4 @@ block tablesref:
|
||||
|
||||
orderedTableSortTest()
|
||||
echo "3"
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ discard """
|
||||
output: '''true'''
|
||||
"""
|
||||
|
||||
import hashes, tables, sharedtables
|
||||
import hashes, tables, sharedtables, testutils
|
||||
|
||||
const
|
||||
data = {
|
||||
@@ -47,8 +47,7 @@ block tableTest1:
|
||||
for x in 0..1:
|
||||
for y in 0..1:
|
||||
assert t[(x,y)] == $x & $y
|
||||
assert($t ==
|
||||
"{(x: 1, y: 1): \"11\", (x: 0, y: 0): \"00\", (x: 0, y: 1): \"01\", (x: 1, y: 0): \"10\"}")
|
||||
assert t.sortedPairs == @[((x: 0, y: 0), "00"), ((x: 0, y: 1), "01"), ((x: 1, y: 0), "10"), ((x: 1, y: 1), "11")]
|
||||
|
||||
block tableTest2:
|
||||
var t = initTable[string, float]()
|
||||
|
||||
Reference in New Issue
Block a user