From 65ce08f38c6a8ae05df5529a5b2d51de7aaec2d6 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 6 Feb 2015 09:24:20 -0500 Subject: [PATCH 1/3] Add hcode. Re-factor rawGet. Fix infinite loop. Replace state enum with a cached hash code which has the same memory overhead and locality as the enum, but can really speed things up with non-integer-like keys (keys for which either hash() or == take more than couple cycles, or where the key data is "indirect" and might incur another cache miss). To function as both empty/filled state and a hash code cache, it only needs to be ensured that hash codes are non-zero for any real key. That is done at the one place in the whole file hash() is called. Keep convention clear via isFilled() & isEmpty(). An isDeleted state will no longer be necessary as per below excl/inf loop fix. Since some use sites know hc and some do not, re-factor rawGet into two forms - one with known hash code and one with an unknown HC that returns it. Both forms still return <0 on missing, but returns the much more informative "-1 - index". That return can be quickly inverted by -1 - result to recover the index where insert should happen, provided no modifications are made to the table in the meantime. This protocol retains the prior <0 interface and also makes it easy to avoid unnecessary duplicate search work in procs like containsOrInclImpl (which formerly searched in the initial get and AGAIN in rawInsert). Strip the searching part out of rawInsert to "make it even more raw". swap(s.data, n) a bit earlier so rawGet and rawGetKnownHC can have similar parameter lists and integrate well with rawInsert/code sharing between Set and OrderedSet impls. This PR also fixes infinite looping upon too many deletes. [ The deleted state (aka "tombstone") approach is vulnerable to the table filling up with deleted items which forces giant scans for missing keys which could be anywhere. In the version prior to this PR, table wraparound wasn't even detected yielding infinite loops. ] This PR changes excl() from marking slots as deleted to Knuth algo 6.4R, "local/incremental moveback rehashing" - adapted from Knuth's h->h-1 to the cache-friendlier h->h+1 probe sequence and adapted from "gotos" to a new doWhile template. This method restores the table to a state that would have resulted from pure inserts (in some order). Update nextTry accordingly. Since linear probing can degrade a little faster, 50% rather than 66% may be a better default growth threshold, but users should be able to adjust threshold anyway. Old unit tests all pass. More extensive testing in this module is probably warranted before taking similar enhancements over to collections.tables. --- lib/pure/collections/sets.nim | 135 +++++++++++++++++++++++----------- 1 file changed, 94 insertions(+), 41 deletions(-) diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index 4cc46149ed..72fd5cd811 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -24,9 +24,12 @@ import when not defined(nimhygiene): {.pragma: dirty.} +# For "integer-like A" that are too big for intsets/bit-vectors to be practical, +# it would be best to shrink hcode to the same size as the integer. Larger +# codes should never be needed, and this can pack more entries per cache-line. +# Losing hcode entirely is also possible - if some element value is forbidden. type - SlotEnum = enum seEmpty, seFilled, seDeleted - KeyValuePair[A] = tuple[slot: SlotEnum, key: A] + KeyValuePair[A] = tuple[hcode: THash, key: A] KeyValuePairSeq[A] = seq[KeyValuePair[A]] HashSet* {.myShallow.}[A] = object ## \ ## A generic hash set. @@ -38,6 +41,14 @@ type {.deprecated: [TSet: HashSet].} +# hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These +# two procs retain clarity of that encoding without the space cost of an enum. +proc isEmpty(hcode: THash): bool {.inline.} = + result = hcode == 0 + +proc isFilled(hcode: THash): bool {.inline.} = + result = hcode != 0 + proc isValid*[A](s: HashSet[A]): bool = ## Returns `true` if the set has been initialized with `initSet <#initSet>`_. ## @@ -94,7 +105,7 @@ iterator items*[A](s: HashSet[A]): A = ## # --> {(a: 1, b: 3), (a: 0, b: 4)} assert s.isValid, "The set needs to be initialized." for h in 0..high(s.data): - if s.data[h].slot == seFilled: yield s.data[h].key + if isFilled(s.data[h].hcode): yield s.data[h].key const growthFactor = 2 @@ -104,24 +115,34 @@ proc mustRehash(length, counter: int): bool {.inline.} = result = (length * 2 < counter * 3) or (length - counter < 4) proc nextTry(h, maxHash: THash): THash {.inline.} = - result = ((5 * h) + 1) and maxHash + result = (h + 1) and maxHash -template rawGetImpl() {.dirty.} = - var h: THash = hash(key) and high(s.data) # start with real hash value - while s.data[h].slot != seEmpty: - if s.data[h].key == key and s.data[h].slot == seFilled: +template rawGetKnownHCImpl() {.dirty.} = + var h: THash = hc and high(s.data) # start with real hash value + while isFilled(s.data[h].hcode): + # Compare hc THEN key with boolean short circuit. This makes the common case + # zero ==key's for missing (e.g.inserts) and exactly one ==key for present. + # It does slow down succeeding lookups by one extra THash cmp&and..usually + # just a few clock cycles, generally worth it for any non-integer-like A. + if s.data[h].hcode == hc and s.data[h].key == key: # compare hc THEN key return h h = nextTry(h, high(s.data)) - result = -1 + result = -1 - h # < 0 => MISSING; insert idx = -1 - result + +template rawGetImpl() {.dirty.} = + hc = hash(key) + if hc == 0: # This almost never taken branch should be very predictable. + hc = 314159265 # Value doesn't matter; Any non-zero favorite is fine. + rawGetKnownHCImpl() template rawInsertImpl() {.dirty.} = - var h: THash = hash(key) and high(data) - while data[h].slot == seFilled: - h = nextTry(h, high(data)) data[h].key = key - data[h].slot = seFilled + data[h].hcode = hc -proc rawGet[A](s: HashSet[A], key: A): int = +proc rawGetKnownHC[A](s: HashSet[A], key: A, hc: THash): int {.inline.} = + rawGetKnownHCImpl() + +proc rawGet[A](s: HashSet[A], key: A, hc: var THash): int {.inline.} = rawGetImpl() proc mget*[A](s: var HashSet[A], key: A): var A = @@ -130,7 +151,8 @@ proc mget*[A](s: var HashSet[A], key: A): var A = ## when one overloaded 'hash' and '==' but still needs reference semantics ## for sharing. assert s.isValid, "The set needs to be initialized." - var index = rawGet(s, key) + var hc: THash + var index = rawGet(s, key, hc) if index >= 0: result = s.data[index].key else: raise newException(KeyError, "key not found: " & $key) @@ -147,33 +169,43 @@ proc contains*[A](s: HashSet[A], key: A): bool = ## values.excl(2) ## assert(not values.contains(2)) assert s.isValid, "The set needs to be initialized." - var index = rawGet(s, key) + var hc: THash + var index = rawGet(s, key, hc) result = index >= 0 -proc rawInsert[A](s: var HashSet[A], data: var KeyValuePairSeq[A], key: A) = +proc rawInsert[A](s: var HashSet[A], data: var KeyValuePairSeq[A], key: A, + hc: THash, h: THash) = rawInsertImpl() proc enlarge[A](s: var HashSet[A]) = var n: KeyValuePairSeq[A] newSeq(n, len(s.data) * growthFactor) - for i in countup(0, high(s.data)): - if s.data[i].slot == seFilled: rawInsert(s, n, s.data[i].key) - swap(s.data, n) + swap(s.data, n) # n is now old seq + for i in countup(0, high(n)): + if isFilled(n[i].hcode): + var j = -1 - rawGetKnownHC(s, n[i].key, n[i].hcode) + rawInsert(s, s.data, n[i].key, n[i].hcode, j) template inclImpl() {.dirty.} = - var index = rawGet(s, key) + var hc: THash + var index = rawGet(s, key, hc) if index < 0: - if mustRehash(len(s.data), s.counter): enlarge(s) - rawInsert(s, s.data, key) + if mustRehash(len(s.data), s.counter): + enlarge(s) + index = rawGetKnownHC(s, key, hc) + rawInsert(s, s.data, key, hc, -1 - index) inc(s.counter) template containsOrInclImpl() {.dirty.} = - var index = rawGet(s, key) + var hc: THash + var index = rawGet(s, key, hc) if index >= 0: result = true else: - if mustRehash(len(s.data), s.counter): enlarge(s) - rawInsert(s, s.data, key) + if mustRehash(len(s.data), s.counter): + enlarge(s) + index = rawGetKnownHC(s, key, hc) + rawInsert(s, s.data, key, hc, -1 - index) inc(s.counter) proc incl*[A](s: var HashSet[A], key: A) = @@ -204,6 +236,10 @@ proc incl*[A](s: var HashSet[A], other: HashSet[A]) = assert other.isValid, "The set `other` needs to be initialized." for item in other: incl(s, item) +template doWhile(a: expr, b: stmt): stmt = + b + while a: b + proc excl*[A](s: var HashSet[A], key: A) = ## Excludes `key` from the set `s`. ## @@ -215,10 +251,22 @@ proc excl*[A](s: var HashSet[A], key: A) = ## s.excl(2) ## assert s.len == 3 assert s.isValid, "The set needs to be initialized." - var index = rawGet(s, key) - if index >= 0: - s.data[index].slot = seDeleted + var hc: THash + var i = rawGet(s, key, hc) + var msk = high(s.data) + if i >= 0: + s.data[i].hcode = 0 dec(s.counter) + while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1 + var j = i # The correctness of this depends on (h+1) in nextTry, + var r = j # though may be adaptable to other simple sequences. + s.data[i].hcode = 0 # mark current EMPTY + doWhile ((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)): + i = (i + 1) and msk # increment mod table size + if isEmpty(s.data[i].hcode): # end of collision cluster; So all done + return + r = s.data[i].hcode and msk # "home" location of key@i + s.data[j] = s.data[i] # data[j] will be marked EMPTY next loop proc excl*[A](s: var HashSet[A], other: HashSet[A]) = ## Excludes everything in `other` from `s`. @@ -494,7 +542,7 @@ proc map*[A, B](data: HashSet[A], op: proc (x: A): B {.closure.}): HashSet[B] = type OrderedKeyValuePair[A] = tuple[ - slot: SlotEnum, next: int, key: A] + hcode: THash, next: int, key: A] OrderedKeyValuePairSeq[A] = seq[OrderedKeyValuePair[A]] OrderedSet* {.myShallow.}[A] = object ## \ ## A generic hash set that remembers insertion order. @@ -546,7 +594,7 @@ template forAllOrderedPairs(yieldStmt: stmt) {.dirty, immediate.} = var h = s.first while h >= 0: var nxt = s.data[h].next - if s.data[h].slot == seFilled: yieldStmt + if isFilled(s.data[h].hcode): yieldStmt h = nxt iterator items*[A](s: OrderedSet[A]): A = @@ -571,7 +619,10 @@ iterator items*[A](s: OrderedSet[A]): A = forAllOrderedPairs: yield s.data[h].key -proc rawGet[A](s: OrderedSet[A], key: A): int = +proc rawGetKnownHC[A](s: OrderedSet[A], key: A, hc: THash): int {.inline.} = + rawGetKnownHCImpl() + +proc rawGet[A](s: OrderedSet[A], key: A, hc: var THash): int {.inline.} = rawGetImpl() proc contains*[A](s: OrderedSet[A], key: A): bool = @@ -585,11 +636,12 @@ proc contains*[A](s: OrderedSet[A], key: A): bool = ## values.incl(2) ## assert values.contains(2) assert s.isValid, "The set needs to be initialized." - var index = rawGet(s, key) + var hc: THash + var index = rawGet(s, key, hc) result = index >= 0 -proc rawInsert[A](s: var OrderedSet[A], - data: var OrderedKeyValuePairSeq[A], key: A) = +proc rawInsert[A](s: var OrderedSet[A], data: var OrderedKeyValuePairSeq[A], + key: A, hc: THash, h: THash) = rawInsertImpl() data[h].next = -1 if s.first < 0: s.first = h @@ -602,12 +654,13 @@ proc enlarge[A](s: var OrderedSet[A]) = var h = s.first s.first = -1 s.last = -1 - while h >= 0: - var nxt = s.data[h].next - if s.data[h].slot == seFilled: - rawInsert(s, n, s.data[h].key) - h = nxt swap(s.data, n) + while h >= 0: + var nxt = n[h].next + if isFilled(n[h].hcode): + var j = -1 - rawGetKnownHC(s, n[h].key, n[h].hcode) + rawInsert(s, s.data, n[h].key, n[h].hcode, j) + h = nxt proc incl*[A](s: var OrderedSet[A], key: A) = ## Includes an element `key` in `s`. @@ -726,7 +779,7 @@ proc `==`*[A](s, t: OrderedSet[A]): bool = while h >= 0 and g >= 0: var nxh = s.data[h].next var nxg = t.data[g].next - if s.data[h].slot == seFilled and s.data[g].slot == seFilled: + if isFilled(s.data[h].hcode) and isFilled(s.data[g].hcode): if s.data[h].key == s.data[g].key: inc compared else: From 42f8f1cd1fe491c19362a4b03f89952ea6e160bc Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Sat, 7 Feb 2015 09:37:17 -0500 Subject: [PATCH 2/3] Fix unnecessarily slow set building from openArray. The estimation of the initialSize as simply array len + 10 was too small for for all but the smallest sets. It would not elide/skip one final enlarge(). That last one is actually always the most expensive enlarge(). Indeed, in a series where one to start from tiny and build up the table..that last one is about 50% of all the enlarging time in general. So, this simple and reasonable optimization (compared to just starting at 64) was only helping about half as much as it could. Introduce a rightSize() proc to be the inverse to mustRehash(). Export it to clients since pre-sizing is externally useful in set construction and the current mustRehash rules are opaque and beyond the control of clients. Also add test module logic to check that rightSize() and mustRehash() are inverses in the appropriate sense..not really in a block/assertion throwing unit test since this is a peformance nice-to-have issue rather than about basic correctness. (Also, fix a too vs. two typo in doc comment.) --- lib/pure/collections/sets.nim | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index 72fd5cd811..3e462f73fa 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -114,6 +114,15 @@ proc mustRehash(length, counter: int): bool {.inline.} = assert(length > counter) result = (length * 2 < counter * 3) or (length - counter < 4) +proc rightSize*(count: int): int {.inline.} = + ## Return the value of `initialSize` to support `count` items. + ## + ## If more items are expected to be added, simply add that + ## expected extra amount to the parameter before calling this. + ## + ## Internally, we want mustRehash(rightSize(x), x) == false. + result = nextPowerOfTwo(count * 3 div 2 + 4) + proc nextTry(h, maxHash: THash): THash {.inline.} = result = (h + 1) and maxHash @@ -343,7 +352,7 @@ proc toSet*[A](keys: openArray[A]): HashSet[A] = ## var numbers = toSet([1, 2, 3, 4, 5]) ## assert numbers.contains(2) ## assert numbers.contains(4) - result = initSet[A](nextPowerOfTwo(keys.len+10)) + result = initSet[A](rightSize(keys.len)) for key in items(keys): result.incl(key) template dollarImpl(): stmt {.dirty.} = @@ -708,7 +717,7 @@ proc containsOrIncl*[A](s: var OrderedSet[A], key: A): bool = proc init*[A](s: var OrderedSet[A], initialSize=64) = ## Initializes an ordered hash set. ## - ## The `initialSize` parameter needs to be a power of too. You can use + ## The `initialSize` parameter needs to be a power of two. You can use ## `math.nextPowerOfTwo() `_ to guarantee that at ## runtime. All set variables have to be initialized before you can use them ## with other procs from this module with the exception of `isValid() @@ -751,7 +760,7 @@ proc toOrderedSet*[A](keys: openArray[A]): OrderedSet[A] = ## var numbers = toOrderedSet([1, 2, 3, 4, 5]) ## assert numbers.contains(2) ## assert numbers.contains(4) - result = initOrderedSet[A](nextPowerOfTwo(keys.len+10)) + result = initOrderedSet[A](rightSize(keys.len)) for key in items(keys): result.incl(key) proc `$`*[A](s: OrderedSet[A]): string = @@ -954,6 +963,11 @@ proc testModule() = b.incl(2) assert b.len == 1 + for i in 0 .. 32: + var s = rightSize(i) + if s <= i or mustRehash(s, i): + echo "performance issue: rightSize() will not elide enlarge() at ", i + echo "Micro tests run successfully." when isMainModule and not defined(release): testModule() From 1f3ce26421f73916aa978c67ab90cdf68218120d Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Sat, 7 Feb 2015 13:13:03 -0500 Subject: [PATCH 3/3] Address Andreas' complaint about code duplication. --- lib/pure/collections/sets.nim | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index 3e462f73fa..33fec1a186 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -246,8 +246,9 @@ proc incl*[A](s: var HashSet[A], other: HashSet[A]) = for item in other: incl(s, item) template doWhile(a: expr, b: stmt): stmt = - b - while a: b + while true: + b + if not a: break proc excl*[A](s: var HashSet[A], key: A) = ## Excludes `key` from the set `s`.