[backport] pseudorandom probing for hash collision (#13418)

Author: Timothee Cour
Date: 2020-02-19 08:19:55 -08:00
Committed by: GitHub
Parent: 273a93581f
Commit: 8c22518d67
12 changed files with 258 additions and 178 deletions


@@ -18,34 +18,87 @@ when not defined(nimHasDefault):
var v: T
v
const freeMarker = 0
const deletedMarker = -1
type UHash = uint
# hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These
# procs retain clarity of that encoding without the space cost of an enum.
proc isEmpty(hcode: Hash): bool {.inline.} =
result = hcode == 0
proc isFilledAndValid(hcode: Hash): bool {.inline.} =
result = hcode != 0 and hcode != deletedMarker
# performance: we could use bit magic if needed
proc isFilled(hcode: Hash): bool {.inline.} =
result = hcode != 0
proc nextTry(h, maxHash: Hash): Hash {.inline.} =
result = (h + 1) and maxHash
proc mustRehash(length, counter: int): bool {.inline.} =
assert(length > counter)
result = (length * 2 < counter * 3) or (length - counter < 4)
proc translateBits(a: UHash, numBitsMask: int): UHash {.inline.} =
result = (a shr numBitsMask) or (a shl (UHash.sizeof * 8 - numBitsMask))
proc nextTry(h, maxHash: Hash, perturb: var UHash): Hash {.inline.} =
# FACTOR between hashcommon.nextTry, intsets.nextTry
# an optimization would be to use `(h + 1) and maxHash` for a few iterations
# and then switch to the formula below, to get the best of both worlds: good
# cache locality, except when a collision cluster is detected (i.e., a large
# number of iterations).
const PERTURB_SHIFT = 5 # consider tying this to `numBitsMask = fastLog2(t.dataLen)`
result = cast[Hash]((5*cast[uint](h) + 1 + perturb) and cast[uint](maxHash))
perturb = perturb shr PERTURB_SHIFT
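
To see how this probe sequence behaves, here is a minimal standalone sketch of the recurrence above (`nextTryDemo`, the start slot, and the initial `perturb` value are illustrative, not part of the diff). Once `perturb` decays to zero, the recurrence degenerates to `h = (5*h + 1) mod 2^k`, a full-period LCG, so every slot is eventually visited; individual slots may be revisited along the way, which is why `allValues` below has to deduplicate yielded slots:

const PERTURB_SHIFT = 5

proc nextTryDemo(h, maxHash: int, perturb: var uint): int =
  result = int((5'u * uint(h) + 1 + perturb) and uint(maxHash))
  perturb = perturb shr PERTURB_SHIFT

var
  h = 3                    # stand-in for hc and maxHash
  perturb = 0xdeadbeef'u   # stand-in for getPerturb(t, hc)
  seen: array[16, bool]    # table with maxHash = 15
  visited = 0
for _ in 0 ..< 64:
  if not seen[h]:
    seen[h] = true
    inc visited
  h = nextTryDemo(h, 15, perturb)
doAssert visited == 16     # every slot of the table is reachable
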
proc mustRehash[T](t: T): bool {.inline.} =
# FACTOR between hashcommon.mustRehash, intsets.mustRehash
let counter2 = t.counter + t.countDeleted
let length = t.dataLen
assert(length > counter2)
result = (length * 2 < counter2 * 3) or (length - counter2 < 4) # synchronize with `rightSize`
proc rightSize*(count: Natural): int {.inline.} =
## Return the value of `initialSize` to support `count` items.
##
## If more items are expected to be added, simply add that
## expected extra amount to the parameter before calling this.
##
## Internally, we want `mustRehash(t) == false` for t that was just resized.
# Make sure to synchronize with `mustRehash`
result = nextPowerOfTwo(count * 3 div 2 + 4)
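
A worked check of how the two stay synchronized, under the simplifying assumption of no tombstones (`countDeleted == 0`): `rightSize(10) = nextPowerOfTwo(10 * 3 div 2 + 4) = nextPowerOfTwo(19) = 32`, and a 32-slot table holding 10 items is safely below both rehash triggers. A standalone sketch:

import math   # nextPowerOfTwo

proc rightSizeDemo(count: Natural): int =
  nextPowerOfTwo(count * 3 div 2 + 4)

proc mustRehashDemo(length, counter: int): bool =
  (length * 2 < counter * 3) or (length - counter < 4)

doAssert rightSizeDemo(10) == 32
for count in 0 .. 64:
  # a freshly resized table never triggers an immediate enlarge
  doAssert not mustRehashDemo(rightSizeDemo(count), count)
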
template getPerturb(t: typed, hc: Hash): UHash =
# we can't use `fastLog2(dataLen(t))` because importing `bitops` would cause codegen errors
# so we use a practical value of half the bit width (e.g. 64 / 2 = 32 on 64-bit machines)
let numBitsMask = sizeof(Hash) * 4 # i.e., sizeof(Hash) * 8 / 2
# this makes a major difference for cases like #13393; it rotates the bits
# that `and maxHash` would mask out into the low positions, so they influence
# the probe sequence in nextTry earlier rather than later.
translateBits(cast[uint](hc), numBitsMask)
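
To see the effect, consider two hashes that differ only in their high bits: their home slot `hc and maxHash` is identical, but the rotated perturb differs from the very first probe. A standalone sketch, assuming 64-bit `uint` (names are illustrative):

proc translateBitsDemo(a: uint, numBitsMask: int): uint =
  (a shr numBitsMask) or (a shl (sizeof(uint) * 8 - numBitsMask))

when sizeof(uint) == 8:
  let
    hc1 = 0x0000_0001_0000_0003'u
    hc2 = 0x0000_0002_0000_0003'u
    maxHash = 0xFF'u
  doAssert (hc1 and maxHash) == (hc2 and maxHash)    # same home slot
  doAssert translateBitsDemo(hc1, 32) != translateBitsDemo(hc2, 32)
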
template rawGetKnownHCImpl() {.dirty.} =
if t.dataLen == 0:
return -1
var h: Hash = hc and maxHash(t) # start with real hash value
while isFilled(t.data[h].hcode):
# Compare hc THEN key with boolean short-circuit. This makes the common case
# cost zero `==key` comparisons for missing keys (e.g. inserts) and exactly
# one for present keys. It does slow down successful lookups by one extra
# Hash comparison, usually just a few clock cycles; generally worth it for
# any non-integer-like A.
if t.data[h].hcode == hc and t.data[h].key == key:
return h
h = nextTry(h, maxHash(t))
result = -1 - h # < 0 => MISSING; insert idx = -1 - result
var perturb = t.getPerturb(hc)
var deletedIndex = -1
while true:
if isFilledAndValid(t.data[h].hcode):
# Compare hc THEN key with boolean short-circuit. This makes the common case
# cost zero `==key` comparisons for missing keys (e.g. inserts) and exactly
# one for present keys. It does slow down successful lookups by one extra
# Hash comparison, usually just a few clock cycles; generally worth it for
# any non-integer-like A.
# performance: we could optimize this further by skipping the hc comparison,
# depending on type(key)
if t.data[h].hcode == hc and t.data[h].key == key:
return h
h = nextTry(h, maxHash(t), perturb)
elif t.data[h].hcode == deletedMarker:
if deletedIndex == -1:
deletedIndex = h
h = nextTry(h, maxHash(t), perturb)
else:
break
if deletedIndex == -1:
result = -1 - h # < 0 => MISSING; insert idx = -1 - result
else:
# we prefer returning a deleted index (in fact the first one found)
result = -1 - deletedIndex
proc rawGetKnownHC[X, A](t: X, key: A, hc: Hash): int {.inline.} =
rawGetKnownHCImpl()
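
Throughout these templates, a non-negative result means the key was found at that slot, while a negative result encodes the preferred insertion slot as `-1 - result`. A minimal decoding sketch with hypothetical names:

proc decodeRawGet(res: int): tuple[found: bool, slot: int] =
  if res >= 0: (found: true, slot: res)       # key present at res
  else: (found: false, slot: -1 - res)        # missing; insert at -1 - res

let r = decodeRawGet(-1 - 7)   # what rawGet returns when slot 7 is free
doAssert not r.found and r.slot == 7
doAssert decodeRawGet(5).found # non-negative: key present at slot 5
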
@@ -54,6 +107,8 @@ template genHashImpl(key, hc: typed) =
hc = hash(key)
if hc == 0: # This almost-never-taken branch should be very predictable.
hc = 314159265 # Value doesn't matter; any non-zero favorite is fine.
elif hc == deletedMarker:
hc = 214159261
template genHash(key: typed): Hash =
var res: Hash


@@ -46,30 +46,40 @@ type
IntSet* = object ## An efficient set of `int` implemented as a sparse bit set.
elems: int # only valid for small numbers
counter, max: int
countDeleted: int
head: PTrunk
data: TrunkSeq
a: array[0..33, int] # profiling shows that 34 elements are enough
proc mustRehash(length, counter: int): bool {.inline.} =
assert(length > counter)
result = (length * 2 < counter * 3) or (length - counter < 4)
proc mustRehash[T](t: T): bool {.inline.} =
# FACTOR between hashcommon.mustRehash, intsets.mustRehash
let counter2 = t.counter + t.countDeleted
let length = t.max + 1
assert length > counter2
result = (length * 2 < counter2 * 3) or (length - counter2 < 4)
proc nextTry(h, maxHash: Hash): Hash {.inline.} =
result = ((5 * h) + 1) and maxHash
proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} =
# FACTOR between hashcommon.nextTry, intsets.nextTry
const PERTURB_SHIFT = 5
var perturb2 = cast[uint](perturb) shr PERTURB_SHIFT
perturb = cast[Hash](perturb2)
result = ((5*h) + 1 + perturb) and maxHash
proc intSetGet(t: IntSet, key: int): PTrunk =
var h = key and t.max
var perturb = key
while t.data[h] != nil:
if t.data[h].key == key:
return t.data[h]
h = nextTry(h, t.max)
h = nextTry(h, t.max, perturb)
result = nil
proc intSetRawInsert(t: IntSet, data: var TrunkSeq, desc: PTrunk) =
var h = desc.key and t.max
var perturb = desc.key
while data[h] != nil:
assert(data[h] != desc)
h = nextTry(h, t.max)
h = nextTry(h, t.max, perturb)
assert(data[h] == nil)
data[h] = desc
@@ -84,14 +94,16 @@ proc intSetEnlarge(t: var IntSet) =
proc intSetPut(t: var IntSet, key: int): PTrunk =
var h = key and t.max
var perturb = key
while t.data[h] != nil:
if t.data[h].key == key:
return t.data[h]
h = nextTry(h, t.max)
if mustRehash(t.max + 1, t.counter): intSetEnlarge(t)
h = nextTry(h, t.max, perturb)
if mustRehash(t): intSetEnlarge(t)
inc(t.counter)
h = key and t.max
while t.data[h] != nil: h = nextTry(h, t.max)
perturb = key
while t.data[h] != nil: h = nextTry(h, t.max, perturb)
assert(t.data[h] == nil)
new(result)
result.next = t.head
@@ -100,6 +112,7 @@ proc intSetPut(t: var IntSet, key: int): PTrunk =
t.data[h] = result
proc bitincl(s: var IntSet, key: int) {.inline.} =
var ret: PTrunk
var t = intSetPut(s, `shr`(key, TrunkShift))
var u = key and TrunkMask
t.bits[u shr IntShift] = t.bits[u shr IntShift] or
@@ -393,7 +406,8 @@ proc assign*(dest: var IntSet, src: IntSet) =
var it = src.head
while it != nil:
var h = it.key and dest.max
while dest.data[h] != nil: h = nextTry(h, dest.max)
var perturb = it.key
while dest.data[h] != nil: h = nextTry(h, dest.max, perturb)
assert(dest.data[h] == nil)
var n: PTrunk
new(n)


@@ -48,7 +48,7 @@ template inclImpl() {.dirty.} =
var hc: Hash
var index = rawGet(s, key, hc)
if index < 0:
if mustRehash(len(s.data), s.counter):
if mustRehash(s):
enlarge(s)
index = rawGetKnownHC(s, key, hc)
rawInsert(s, s.data, key, hc, -1 - index)
@@ -62,17 +62,12 @@ template containsOrInclImpl() {.dirty.} =
if index >= 0:
result = true
else:
if mustRehash(len(s.data), s.counter):
if mustRehash(s):
enlarge(s)
index = rawGetKnownHC(s, key, hc)
rawInsert(s, s.data, key, hc, -1 - index)
inc(s.counter)
template doWhile(a, b) =
while true:
b
if not a: break
proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} =
var hc: Hash
var i = rawGet(s, key, hc)
@@ -82,17 +77,9 @@ proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} =
if i >= 0:
result = false
dec(s.counter)
while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1
var j = i # The correctness of this depends on (h+1) in nextTry,
var r = j # though may be adaptable to other simple sequences.
s.data[i].hcode = 0 # mark current EMPTY
s.data[i].key = default(type(s.data[i].key))
doWhile((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)):
i = (i + 1) and msk # increment mod table size
if isEmpty(s.data[i].hcode): # end of collision cluster; So all done
return
r = s.data[i].hcode and msk # "home" location of key@i
s.data[j] = move(s.data[i]) # data[i] will be marked EMPTY next loop
inc(s.countDeleted)
s.data[i].hcode = deletedMarker
s.data[i].key = default(type(s.data[i].key))
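
This hunk changes the deletion strategy: Knuth's backward-shift loop (removed above) is only correct for the plain `(h + 1) and maxHash` probe, where all colliding keys share one contiguous cluster; with per-key perturbed probing that invariant no longer holds, so the slot is instead marked with `deletedMarker` (a tombstone) that lookups probe past. A standalone sketch of why the tombstone is needed, using a toy 4-slot table of bare hcodes and a linear probe for brevity:

const
  freeMarker = 0
  deletedMarker = -1

# hcodes 1 and 5 both have home slot 1 (mod 4); 5 was placed by probing to 2
var slots = @[freeMarker, 1, 5, freeMarker]

proc find(slots: seq[int], hc: int): int =
  var h = hc and 3
  while slots[h] != freeMarker:    # a tombstone does NOT stop the probe
    if slots[h] == hc: return h
    h = (h + 1) and 3
  result = -1

slots[1] = deletedMarker           # delete hcode 1; 5 must stay reachable
doAssert find(slots, 5) == 2       # probing walks over the tombstone
# had deletion written freeMarker instead, find(slots, 5) would stop at
# slot 1 and wrongly report 5 as missing
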
template dollarImpl() {.dirty.} =
result = "{"


@@ -68,6 +68,7 @@ type
## before calling other procs on it.
data: KeyValuePairSeq[A]
counter: int
countDeleted: int
type
OrderedKeyValuePair[A] = tuple[
@@ -80,15 +81,13 @@ type
## <#initOrderedSet,int>`_ before calling other procs on it.
data: OrderedKeyValuePairSeq[A]
counter, first, last: int
countDeleted: int
const
defaultInitialSize* = 64
include setimpl
proc rightSize*(count: Natural): int {.inline.}
# ---------------------------------------------------------------------
# ------------------------------ HashSet ------------------------------
# ---------------------------------------------------------------------
@@ -250,7 +249,7 @@ iterator items*[A](s: HashSet[A]): A =
## echo b
## # --> {(a: 1, b: 3), (a: 0, b: 4)}
for h in 0 .. high(s.data):
if isFilled(s.data[h].hcode): yield s.data[h].key
if isFilledAndValid(s.data[h].hcode): yield s.data[h].key
proc containsOrIncl*[A](s: var HashSet[A], key: A): bool =
## Includes `key` in the set `s` and tells if `key` was already in `s`.
@@ -342,7 +341,7 @@ proc pop*[A](s: var HashSet[A]): A =
doAssertRaises(KeyError, echo s.pop)
for h in 0 .. high(s.data):
if isFilled(s.data[h].hcode):
if isFilledAndValid(s.data[h].hcode):
result = s.data[h].key
excl(s, result)
return result
@@ -593,16 +592,6 @@ proc `$`*[A](s: HashSet[A]): string =
## # --> {no, esc'aping, is " provided}
dollarImpl()
proc rightSize*(count: Natural): int {.inline.} =
## Return the value of `initialSize` to support `count` items.
##
## If more items are expected to be added, simply add that
## expected extra amount to the parameter before calling this.
##
## Internally, we want `mustRehash(rightSize(x), x) == false`.
result = nextPowerOfTwo(count * 3 div 2 + 4)
proc initSet*[A](initialSize = defaultInitialSize): HashSet[A] {.deprecated:
"Deprecated since v0.20, use 'initHashSet'".} = initHashSet[A](initialSize)
@@ -634,7 +623,7 @@ template forAllOrderedPairs(yieldStmt: untyped) {.dirty.} =
var idx = 0
while h >= 0:
var nxt = s.data[h].next
if isFilled(s.data[h].hcode):
if isFilledAndValid(s.data[h].hcode):
yieldStmt
inc(idx)
h = nxt
@@ -868,7 +857,7 @@ proc `==`*[A](s, t: OrderedSet[A]): bool =
while h >= 0 and g >= 0:
var nxh = s.data[h].next
var nxg = t.data[g].next
if isFilled(s.data[h].hcode) and isFilled(t.data[g].hcode):
if isFilledAndValid(s.data[h].hcode) and isFilledAndValid(t.data[g].hcode):
if s.data[h].key == t.data[g].key:
inc compared
else:
@@ -1146,10 +1135,19 @@ when isMainModule and not defined(release):
b.incl(2)
assert b.len == 1
for i in 0 .. 32:
var s = rightSize(i)
if s <= i or mustRehash(s, i):
echo "performance issue: rightSize() will not elide enlarge() at ", i
block:
type FakeTable = object
dataLen: int
counter: int
countDeleted: int
var t: FakeTable
for i in 0 .. 32:
var s = rightSize(i)
t.dataLen = s
t.counter = i
doAssert s > i and not mustRehash(t),
"performance issue: rightSize() will not elide enlarge() at: " & $i
block missingOrExcl:
var s = toOrderedSet([2, 3, 6, 7])


@@ -25,6 +25,7 @@ type
SharedTable*[A, B] = object ## generic hash SharedTable
data: KeyValuePairSeq[A, B]
counter, dataLen: int
countDeleted: int
lock: Lock
template maxHash(t): untyped = t.dataLen-1
@@ -32,7 +33,7 @@ template maxHash(t): untyped = t.dataLen-1
include tableimpl
template st_maybeRehashPutImpl(enlarge) {.dirty.} =
if mustRehash(t.dataLen, t.counter):
if mustRehash(t):
enlarge(t)
index = rawGetKnownHC(t, key, hc)
index = -1 - index # important to transform for mgetOrPutImpl
@@ -49,9 +50,10 @@ proc enlarge[A, B](t: var SharedTable[A, B]) =
for i in 0..<oldSize:
let eh = n[i].hcode
if isFilled(eh):
var perturb = t.getPerturb(eh)
var j: Hash = eh and maxHash(t)
while isFilled(t.data[j].hcode):
j = nextTry(j, maxHash(t))
j = nextTry(j, maxHash(t), perturb)
rawInsert(t, t.data, n[i].key, n[i].val, eh, j)
deallocShared(n)
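
Note that enlarging doubles as tombstone collection: only live entries are reinserted into the fresh array, so the deleted count effectively resets (the `Table` and `OrderedTable` versions below set `countDeleted = 0` explicitly). A simplified standalone sketch over bare hcodes, with a plain linear probe for brevity:

const deletedMarker = -1

proc rehash(old: seq[int], newLen: int): seq[int] =
  # reinsert only live hcodes; tombstones are dropped along the way
  result = newSeq[int](newLen)
  for hc in old:
    if hc != 0 and hc != deletedMarker:
      var h = hc and (newLen - 1)
      while result[h] != 0:
        h = (h + 1) and (newLen - 1)
      result[h] = hc

let grown = rehash(@[3, deletedMarker, 5, 0], 8)
var live = 0
for hc in grown:
  if hc != 0: inc live
doAssert live == 2   # the tombstone did not survive the resize
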


@@ -14,13 +14,20 @@ include hashcommon
template rawGetDeepImpl() {.dirty.} = # Search algo for unconditional add
genHashImpl(key, hc)
var h: Hash = hc and maxHash(t)
while isFilled(t.data[h].hcode):
h = nextTry(h, maxHash(t))
var perturb = t.getPerturb(hc)
while true:
let hcode = t.data[h].hcode
if hcode == deletedMarker or hcode == freeMarker:
break
else:
h = nextTry(h, maxHash(t), perturb)
result = h
template rawInsertImpl() {.dirty.} =
template rawInsertImpl(t) {.dirty.} =
data[h].key = key
data[h].val = val
if data[h].hcode == deletedMarker:
t.countDeleted.dec
data[h].hcode = hc
proc rawGetDeep[X, A](t: X, key: A, hc: var Hash): int {.inline.} =
@@ -28,7 +35,7 @@ proc rawGetDeep[X, A](t: X, key: A, hc: var Hash): int {.inline.} =
proc rawInsert[X, A, B](t: var X, data: var KeyValuePairSeq[A, B],
key: A, val: B, hc: Hash, h: Hash) =
rawInsertImpl()
rawInsertImpl(t)
template checkIfInitialized() =
when compiles(defaultInitialSize):
@@ -37,15 +44,14 @@ template checkIfInitialized() =
template addImpl(enlarge) {.dirty.} =
checkIfInitialized()
if mustRehash(t.dataLen, t.counter): enlarge(t)
if mustRehash(t): enlarge(t)
var hc: Hash
var j = rawGetDeep(t, key, hc)
rawInsert(t, t.data, key, val, hc, j)
inc(t.counter)
template maybeRehashPutImpl(enlarge) {.dirty.} =
checkIfInitialized()
if mustRehash(t.dataLen, t.counter):
if mustRehash(t):
enlarge(t)
index = rawGetKnownHC(t, key, hc)
index = -1 - index # important to transform for mgetOrPutImpl
@@ -82,24 +88,11 @@ template delImplIdx(t, i) =
let msk = maxHash(t)
if i >= 0:
dec(t.counter)
block outer:
while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1
var j = i # The correctness of this depends on (h+1) in nextTry,
var r = j # though may be adaptable to other simple sequences.
t.data[i].hcode = 0 # mark current EMPTY
t.data[i].key = default(type(t.data[i].key))
t.data[i].val = default(type(t.data[i].val))
while true:
i = (i + 1) and msk # increment mod table size
if isEmpty(t.data[i].hcode): # end of collision cluster; So all done
break outer
r = t.data[i].hcode and msk # "home" location of key@i
if not ((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)):
break
when defined(js):
t.data[j] = t.data[i]
else:
t.data[j] = move(t.data[i]) # data[j] will be marked EMPTY next loop
inc(t.countDeleted)
t.data[i].hcode = deletedMarker
t.data[i].key = default(type(t.data[i].key))
t.data[i].val = default(type(t.data[i].val))
# mustRehash + enlarge not needed because counter+countDeleted doesn't change
template delImpl() {.dirty.} =
var hc: Hash
@@ -123,8 +116,8 @@ template initImpl(result: typed, size: int) =
result.last = -1
template insertImpl() = # for CountTable
if t.dataLen == 0: initImpl(t, defaultInitialSize)
if mustRehash(len(t.data), t.counter): enlarge(t)
checkIfInitialized()
if mustRehash(t): enlarge(t)
ctRawInsert(t, t.data, key, val)
inc(t.counter)


@@ -233,6 +233,7 @@ type
## For creating an empty Table, use `initTable proc<#initTable,int>`_.
data: KeyValuePairSeq[A, B]
counter: int
countDeleted: int
TableRef*[A, B] = ref Table[A, B] ## Ref version of `Table<#Table>`_.
##
## For creating a new empty TableRef, use `newTable proc
@@ -250,8 +251,6 @@ template dataLen(t): untyped = len(t.data)
include tableimpl
proc rightSize*(count: Natural): int {.inline.}
template get(t, key): untyped =
## retrieves the value at ``t[key]``. The value can be modified.
## If ``key`` is not in ``t``, the ``KeyError`` exception is raised.
@@ -267,14 +266,16 @@ template get(t, key): untyped =
proc enlarge[A, B](t: var Table[A, B]) =
var n: KeyValuePairSeq[A, B]
newSeq(n, len(t.data) * growthFactor)
newSeq(n, t.counter.rightSize)
swap(t.data, n)
t.countDeleted = 0
for i in countup(0, high(n)):
let eh = n[i].hcode
if isFilled(eh):
if isFilledAndValid(eh):
var j: Hash = eh and maxHash(t)
var perturb = t.getPerturb(eh)
while isFilled(t.data[j].hcode):
j = nextTry(j, maxHash(t))
j = nextTry(j, maxHash(t), perturb)
when defined(js):
rawInsert(t, t.data, n[i].key, n[i].val, eh, j)
else:
@@ -579,15 +580,6 @@ proc `==`*[A, B](s, t: Table[A, B]): bool =
equalsImpl(s, t)
proc rightSize*(count: Natural): int {.inline.} =
## Return the value of ``initialSize`` to support ``count`` items.
##
## If more items are expected to be added, simply add that
## expected extra amount to the parameter before calling this.
##
## Internally, we want mustRehash(rightSize(x), x) == false.
result = nextPowerOfTwo(count * 3 div 2 + 4)
proc indexBy*[A, B, C](collection: A, index: proc(x: B): C): Table[C, B] =
## Index the collection with the proc provided.
# TODO: As soon as supported, change collection: A to collection: A[B]
@@ -670,7 +662,7 @@ iterator pairs*[A, B](t: Table[A, B]): (A, B) =
## # value: [1, 5, 7, 9]
let L = len(t)
for h in 0 .. high(t.data):
if isFilled(t.data[h].hcode):
if isFilledAndValid(t.data[h].hcode):
yield (t.data[h].key, t.data[h].val)
assert(len(t) == L, "the length of the table changed while iterating over it")
@@ -692,7 +684,7 @@ iterator mpairs*[A, B](t: var Table[A, B]): (A, var B) =
let L = len(t)
for h in 0 .. high(t.data):
if isFilled(t.data[h].hcode):
if isFilledAndValid(t.data[h].hcode):
yield (t.data[h].key, t.data[h].val)
assert(len(t) == L, "the length of the table changed while iterating over it")
@@ -713,7 +705,7 @@ iterator keys*[A, B](t: Table[A, B]): A =
let L = len(t)
for h in 0 .. high(t.data):
if isFilled(t.data[h].hcode):
if isFilledAndValid(t.data[h].hcode):
yield t.data[h].key
assert(len(t) == L, "the length of the table changed while iterating over it")
@@ -734,7 +726,7 @@ iterator values*[A, B](t: Table[A, B]): B =
let L = len(t)
for h in 0 .. high(t.data):
if isFilled(t.data[h].hcode):
if isFilledAndValid(t.data[h].hcode):
yield t.data[h].val
assert(len(t) == L, "the length of the table changed while iterating over it")
@@ -756,36 +748,53 @@ iterator mvalues*[A, B](t: var Table[A, B]): var B =
let L = len(t)
for h in 0 .. high(t.data):
if isFilled(t.data[h].hcode):
if isFilledAndValid(t.data[h].hcode):
yield t.data[h].val
assert(len(t) == L, "the length of the table changed while iterating over it")
template hasKeyOrPutCache(cache, h): bool =
# Using an `IntSet` here would be worth considering to avoid quadratic
# behavior if a user misuses Table with lots of duplicate keys; but it adds
# overhead in the common case of few duplicates. However, with many
# duplicates all operations would be slow anyway, because the full
# `hash(key)` is identical for them, which makes `nextTry` follow the exact
# same path for each key, resulting in large collision clusters. Alternatives
# could involve modifying the hash/retrieval based on the duplicate key count.
var ret = false
for hi in cache:
if hi == h:
ret = true
break
if not ret: cache.add h
ret
iterator allValues*[A, B](t: Table[A, B]; key: A): B =
## Iterates over any value in the table ``t`` that belongs to the given ``key``.
##
## Used if you have a table with duplicate keys (as a result of using
## `add proc<#add,Table[A,B],A,B>`_).
##
## **Examples:**
##
## .. code-block::
## var a = {'a': 3, 'b': 5}.toTable
## for i in 1..3:
## a.add('z', 10*i)
## echo a # {'a': 3, 'b': 5, 'z': 10, 'z': 20, 'z': 30}
##
## for v in a.allValues('z'):
## echo v
## # 10
## # 20
## # 30
var h: Hash = genHash(key) and high(t.data)
runnableExamples:
import testutils
var a = {'a': 3, 'b': 5}.toTable
for i in 1..3: a.add('z', 10*i)
doAssert a.sortedPairs == @[('a', 3), ('b', 5), ('z', 10), ('z', 20), ('z', 30)]
doAssert sortedItems(a.allValues('z')) == @[10, 20, 30]
let hc = genHash(key)
var h: Hash = hc and high(t.data)
let L = len(t)
while isFilled(t.data[h].hcode):
if t.data[h].key == key:
yield t.data[h].val
assert(len(t) == L, "the length of the table changed while iterating over it")
h = nextTry(h, high(t.data))
var perturb = t.getPerturb(hc)
var num = 0
var cache: seq[Hash]
while isFilled(t.data[h].hcode): # `isFilledAndValid` would be incorrect, see test for `allValues`
if t.data[h].hcode == hc and t.data[h].key == key:
if not hasKeyOrPutCache(cache, h):
yield t.data[h].val
assert(len(t) == L, "the length of the table changed while iterating over it")
h = nextTry(h, high(t.data), perturb)
@@ -1219,6 +1228,8 @@ type
## <#initOrderedTable,int>`_.
data: OrderedKeyValuePairSeq[A, B]
counter, first, last: int
countDeleted: int
OrderedTableRef*[A, B] = ref OrderedTable[A, B] ## Ref version of
## `OrderedTable<#OrderedTable>`_.
##
@@ -1240,7 +1251,7 @@ proc rawGet[A, B](t: OrderedTable[A, B], key: A, hc: var Hash): int =
proc rawInsert[A, B](t: var OrderedTable[A, B],
data: var OrderedKeyValuePairSeq[A, B],
key: A, val: B, hc: Hash, h: Hash) =
rawInsertImpl()
rawInsertImpl(t)
data[h].next = -1
if t.first < 0: t.first = h
if t.last >= 0: data[t.last].next = h
@@ -1248,18 +1259,20 @@ proc rawInsert[A, B](t: var OrderedTable[A, B],
proc enlarge[A, B](t: var OrderedTable[A, B]) =
var n: OrderedKeyValuePairSeq[A, B]
newSeq(n, len(t.data) * growthFactor)
newSeq(n, t.counter.rightSize)
var h = t.first
t.first = -1
t.last = -1
swap(t.data, n)
t.countDeleted = 0
while h >= 0:
var nxt = n[h].next
let eh = n[h].hcode
if isFilled(eh):
if isFilledAndValid(eh):
var j: Hash = eh and maxHash(t)
var perturb = t.getPerturb(eh)
while isFilled(t.data[j].hcode):
j = nextTry(j, maxHash(t))
j = nextTry(j, maxHash(t), perturb)
rawInsert(t, t.data, n[h].key, n[h].val, n[h].hcode, j)
h = nxt
@@ -2211,6 +2224,7 @@ type
## <#initCountTable,int>`_.
data: seq[tuple[key: A, val: int]]
counter: int
countDeleted: int
isSorted: bool
CountTableRef*[A] = ref CountTable[A] ## Ref version of
## `CountTable<#CountTable>`_.
@@ -2223,8 +2237,10 @@ type
proc ctRawInsert[A](t: CountTable[A], data: var seq[tuple[key: A, val: int]],
key: A, val: int) =
var h: Hash = hash(key) and high(data)
while data[h].val != 0: h = nextTry(h, high(data))
let hc = hash(key)
var perturb = t.getPerturb(hc)
var h: Hash = hc and high(data)
while data[h].val != 0: h = nextTry(h, high(data), perturb) # TODO: handle deletedMarker
data[h].key = key
data[h].val = val
@@ -2251,10 +2267,12 @@ proc remove[A](t: var CountTable[A], key: A) =
proc rawGet[A](t: CountTable[A], key: A): int =
if t.data.len == 0:
return -1
var h: Hash = hash(key) and high(t.data) # start with real hash value
while t.data[h].val != 0:
let hc = hash(key)
var perturb = t.getPerturb(hc)
var h: Hash = hc and high(t.data) # start with real hash value
while t.data[h].val != 0: # TODO: may need to handle t.data[h].hcode == deletedMarker?
if t.data[h].key == key: return h
h = nextTry(h, high(t.data))
h = nextTry(h, high(t.data), perturb)
result = -1 - h # < 0 => MISSING; insert idx = -1 - result
template ctget(t, key, default: untyped): untyped =


@@ -112,38 +112,14 @@ proc hash*[T: proc](x: T): Hash {.inline.} =
else:
result = hash(pointer(x))
const
prime = uint(11)
proc hash*(x: int): Hash {.inline.} =
proc hash*(x: int|int64|uint|uint64|char|Ordinal): Hash {.inline.} =
## Efficient hashing of integers.
result = cast[Hash](cast[uint](x) * prime)
proc hash*(x: int64): Hash {.inline.} =
## Efficient hashing of `int64` integers.
result = cast[Hash](cast[uint](x) * prime)
proc hash*(x: uint): Hash {.inline.} =
## Efficient hashing of unsigned integers.
result = cast[Hash](x * prime)
proc hash*(x: uint64): Hash {.inline.} =
## Efficient hashing of `uint64` integers.
result = cast[Hash](cast[uint](x) * prime)
proc hash*(x: char): Hash {.inline.} =
## Efficient hashing of characters.
result = cast[Hash](cast[uint](ord(x)) * prime)
proc hash*[T: Ordinal](x: T): Hash {.inline.} =
## Efficient hashing of other ordinal types (e.g. enums).
result = cast[Hash](cast[uint](ord(x)) * prime)
cast[Hash](ord(x))
proc hash*(x: float): Hash {.inline.} =
## Efficient hashing of floats.
var y = x + 1.0
result = cast[ptr Hash](addr(y))[]
var y = x + 0.0 # canonicalizes -0.0 to +0.0 so that equal floats hash equally
result = hash(cast[ptr Hash](addr(y))[])
# Forward declarations before methods that hash containers. This allows
# containers to contain other containers
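
Two observable consequences of this hunk, assuming `std/hashes` as of this commit: integers now hash to their own value (`cast[Hash](ord(x))`, the `prime` multiplication is gone; later Nim versions revisited integer hashing again), and `-0.0` and `+0.0`, which compare equal, now hash alike because `x + 0.0` canonicalizes the sign of zero before its bit pattern is hashed:

import hashes

let negZero = -0.0
doAssert negZero == 0.0               # IEEE 754: the two zeros compare equal
doAssert hash(negZero) == hash(0.0)   # -0.0 + 0.0 == +0.0, so same bits hashed
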

lib/pure/testutils.nim (new file, +15 lines)

@@ -0,0 +1,15 @@
## Utilities to help writing tests
##
## Unstable, experimental API.
import std/[sequtils, algorithm]
proc sortedPairs*[T](t: T): auto =
## helps when writing tests involving tables in a way that's robust to
## changes in hashing functions / table implementation.
toSeq(t.pairs).sorted
template sortedItems*(t: untyped): untyped =
## helps when writing tests involving tables in a way that's robust to
## changes in hashing functions / table implementation.
sorted(toSeq(t))
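
For example, assuming the module is on the import path (as in the test changes below):

import tables, testutils

var t = {"b": 2, "a": 1}.toTable
doAssert t.sortedPairs == @[("a", 1), ("b", 2)]
doAssert sortedItems(t.values) == @[1, 2]
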


@@ -1,7 +1,7 @@
discard """
cmd: "nim c --gc:arc $file"
nimout: '''(a: true, n: doAssert)
Table[system.string, trepr.MyType](data: @[], counter: 0)
Table[system.string, trepr.MyType](data: @[], counter: 0, countDeleted: 0)
nil
'''
"""


@@ -8,7 +8,12 @@ And we get here
'''
joinable: false
"""
import hashes, sequtils, tables, algorithm
import hashes, sequtils, tables, algorithm, testutils
block tableDollar:
# other tests should use `sortedPairs` to be robust to future table/hash
# implementation changes
doAssert ${1: 'a', 2: 'b'}.toTable in ["{1: 'a', 2: 'b'}", "{2: 'b', 1: 'a'}"]
# test should not be joined because it takes too long.
block tableadds:
@@ -188,6 +193,23 @@ block ttables2:
run1()
echo "2"
block allValues:
var t: Table[int, string]
var key = 0
let n = 1000
for i in 0..<n: t.add(i, $i)
const keys = [0, -1, 12]
for i in 0..1:
for key in keys:
t.add(key, $key & ":" & $i)
for i in 0..<n:
if i notin keys:
t.del(i)
doAssert t.sortedPairs == @[(-1, "-1:0"), (-1, "-1:1"), (0, "0"), (0, "0:0"), (0, "0:1"), (12, "12"), (12, "12:0"), (12, "12:1")]
doAssert sortedItems(t.allValues(0)) == @["0", "0:0", "0:1"]
doAssert sortedItems(t.allValues(-1)) == @["-1:0", "-1:1"]
doAssert sortedItems(t.allValues(12)) == @["12", "12:0", "12:1"]
doAssert sortedItems(t.allValues(1)) == @[]
block tablesref:
const
@@ -232,8 +254,8 @@ block tablesref:
for x in 0..1:
for y in 0..1:
assert t[(x,y)] == $x & $y
assert($t ==
"{(x: 1, y: 1): \"11\", (x: 0, y: 0): \"00\", (x: 0, y: 1): \"01\", (x: 1, y: 0): \"10\"}")
assert t.sortedPairs ==
@[((x: 0, y: 0), "00"), ((x: 0, y: 1), "01"), ((x: 1, y: 0), "10"), ((x: 1, y: 1), "11")]
block tableTest2:
var t = newTable[string, float]()
@@ -340,7 +362,7 @@ block tablesref:
block anonZipTest:
let keys = @['a','b','c']
let values = @[1, 2, 3]
doAssert "{'c': 3, 'a': 1, 'b': 2}" == $ toTable zip(keys, values)
doAssert zip(keys, values).toTable.sortedPairs == @[('a', 1), ('b', 2), ('c', 3)]
block clearTableTest:
var t = newTable[string, float]()
@@ -369,3 +391,4 @@ block tablesref:
orderedTableSortTest()
echo "3"


@@ -3,7 +3,7 @@ discard """
output: '''true'''
"""
import hashes, tables, sharedtables
import hashes, tables, sharedtables, testutils
const
data = {
@@ -47,8 +47,7 @@ block tableTest1:
for x in 0..1:
for y in 0..1:
assert t[(x,y)] == $x & $y
assert($t ==
"{(x: 1, y: 1): \"11\", (x: 0, y: 0): \"00\", (x: 0, y: 1): \"01\", (x: 1, y: 0): \"10\"}")
assert t.sortedPairs == @[((x: 0, y: 0), "00"), ((x: 0, y: 1), "01"), ((x: 1, y: 0), "10"), ((x: 1, y: 1), "11")]
block tableTest2:
var t = initTable[string, float]()