mirror of
https://github.com/nim-lang/Nim.git
synced 2026-01-09 06:23:25 +00:00
hashes: implement murmur3 (#12022)
* hashes: implement murmur3 * refactoring; there is only one murmurHash and it works at compile-time via VM hooks * fixes JS tests * makes toOpenArrayByte work with C++ * make it bootstrap in C++ mode for 0.20
This commit is contained in:
@@ -50,7 +50,7 @@ type
|
||||
|
||||
- Added `os.delEnv` and `nimscript.delEnv`. (#11466)
|
||||
|
||||
- Enable Oid usage in hashtables. (#11472)
|
||||
- Enabled Oid usage in hashtables. (#11472)
|
||||
|
||||
- Added `unsafeColumnAt` procs, that return unsafe cstring from InstantRow. (#11647)
|
||||
|
||||
|
||||
@@ -100,21 +100,23 @@ proc openArrayLoc(p: BProc, n: PNode): Rope =
|
||||
if optBoundsCheck in p.options:
|
||||
genBoundsCheck(p, a, b, c)
|
||||
let ty = skipTypes(a.t, abstractVar+{tyPtr})
|
||||
let dest = getTypeDesc(p.module, n.typ.sons[0])
|
||||
case ty.kind
|
||||
of tyArray:
|
||||
let first = toInt64(firstOrd(p.config, ty))
|
||||
if first == 0:
|
||||
result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)]
|
||||
result = "($4*)(($1)+($2)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dest]
|
||||
else:
|
||||
result = "($1)+(($2)-($4)), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first)]
|
||||
of tyOpenArray, tyVarargs, tyUncheckedArray:
|
||||
result = "($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c)]
|
||||
result = "($5*)($1)+(($2)-($4)), ($3)-($2)+1" %
|
||||
[rdLoc(a), rdLoc(b), rdLoc(c), intLiteral(first), dest]
|
||||
of tyOpenArray, tyVarargs, tyUncheckedArray, tyCString:
|
||||
result = "($4*)($1)+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dest]
|
||||
of tyString, tySequence:
|
||||
if skipTypes(n.typ, abstractInst).kind == tyVar and
|
||||
not compileToCpp(p.module):
|
||||
result = "(*$1)$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p)]
|
||||
result = "($5*)(*$1)$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p), dest]
|
||||
else:
|
||||
result = "$1$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p)]
|
||||
result = "($5*)$1$4+($2), ($3)-($2)+1" % [rdLoc(a), rdLoc(b), rdLoc(c), dataField(p), dest]
|
||||
else:
|
||||
internalError(p.config, "openArrayLoc: " & typeToString(a.t))
|
||||
else:
|
||||
|
||||
@@ -97,4 +97,5 @@ proc initDefines*(symbols: StringTableRef) =
|
||||
|
||||
defineSymbol("nimFixedOwned")
|
||||
defineSymbol("nimHasStyleChecks")
|
||||
defineSymbol("nimToOpenArrayCString")
|
||||
defineSymbol("nimHasUsed")
|
||||
|
||||
@@ -17,6 +17,8 @@ from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getApp
|
||||
from md5 import getMD5
|
||||
from sighashes import symBodyDigest
|
||||
|
||||
from hashes import hash
|
||||
|
||||
template mathop(op) {.dirty.} =
|
||||
registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
|
||||
|
||||
@@ -157,3 +159,30 @@ proc registerAdditionalOps*(c: PCtx) =
|
||||
stackTrace(c, PStackFrame(prc: c.prc.sym, comesFrom: 0, next: nil), c.exceptionInstr,
|
||||
"isExported() requires a symbol. '" & $n & "' is of kind '" & $n.kind & "'", n.info)
|
||||
setResult(a, sfExported in n.sym.flags)
|
||||
|
||||
proc hashVmImpl(a: VmArgs) =
|
||||
var res = hashes.hash(a.getString(0), a.getInt(1).int, a.getInt(2).int)
|
||||
if c.config.cmd == cmdCompileToJS:
|
||||
# emulate JS's terrible integers:
|
||||
res = cast[int32](res)
|
||||
setResult(a, res)
|
||||
|
||||
registerCallback c, "stdlib.hashes.hashVmImpl", hashVmImpl
|
||||
|
||||
proc hashVmImplByte(a: VmArgs) =
|
||||
# nkBracket[...]
|
||||
let sPos = a.getInt(1).int
|
||||
let ePos = a.getInt(2).int
|
||||
let arr = a.getNode(0)
|
||||
var bytes = newSeq[byte](arr.len)
|
||||
for i in 0 ..< arr.len:
|
||||
bytes[i] = byte(arr[i].intVal and 0xff)
|
||||
|
||||
var res = hashes.hash(bytes, sPos, ePos)
|
||||
if c.config.cmd == cmdCompileToJS:
|
||||
# emulate JS's terrible integers:
|
||||
res = cast[int32](res)
|
||||
setResult(a, res)
|
||||
|
||||
registerCallback c, "stdlib.hashes.hashVmImplByte", hashVmImplByte
|
||||
registerCallback c, "stdlib.hashes.hashVmImplChar", hashVmImplByte
|
||||
|
||||
@@ -49,9 +49,6 @@ type
|
||||
## always have a size of a power of two and can use the ``and``
|
||||
## operator instead of ``mod`` for truncation of the hash value.
|
||||
|
||||
const
|
||||
IntSize = sizeof(int)
|
||||
|
||||
proc `!&`*(h: Hash, val: int): Hash {.inline.} =
|
||||
## Mixes a hash value `h` with `val` to produce a new hash value.
|
||||
##
|
||||
@@ -108,13 +105,12 @@ proc hash*(x: pointer): Hash {.inline.} =
|
||||
else:
|
||||
result = cast[Hash](cast[uint](x) shr 3) # skip the alignment
|
||||
|
||||
when not defined(booting):
|
||||
proc hash*[T: proc](x: T): Hash {.inline.} =
|
||||
## Efficient hashing of proc vars. Closures are supported too.
|
||||
when T is "closure":
|
||||
result = hash(rawProc(x)) !& hash(rawEnv(x))
|
||||
else:
|
||||
result = hash(pointer(x))
|
||||
proc hash*[T: proc](x: T): Hash {.inline.} =
|
||||
## Efficient hashing of proc vars. Closures are supported too.
|
||||
when T is "closure":
|
||||
result = hash(rawProc(x)) !& hash(rawEnv(x))
|
||||
else:
|
||||
result = hash(pointer(x))
|
||||
|
||||
proc hash*(x: int): Hash {.inline.} =
|
||||
## Efficient hashing of integers.
|
||||
@@ -151,27 +147,87 @@ proc hash*(x: float): Hash {.inline.} =
|
||||
proc hash*[A](x: openArray[A]): Hash
|
||||
proc hash*[A](x: set[A]): Hash
|
||||
|
||||
template bytewiseHashing(result: Hash, x: typed, start, stop: int) =
|
||||
for i in start .. stop:
|
||||
result = result !& hash(x[i])
|
||||
result = !$result
|
||||
|
||||
template hashImpl(result: Hash, x: typed, start, stop: int) =
|
||||
when defined(JS):
|
||||
proc imul(a, b: uint32): uint32 =
|
||||
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/imul
|
||||
let mask = 0xffff'u32
|
||||
var
|
||||
aHi = (a shr 16) and mask
|
||||
aLo = a and mask
|
||||
bHi = (b shr 16) and mask
|
||||
bLo = b and mask
|
||||
result = (aLo * bLo) + (aHi * bLo + aLo * bHi) shl 16
|
||||
else:
|
||||
template imul(a, b: uint32): untyped = a * b
|
||||
|
||||
proc rotl32(x: uint32, r: int): uint32 {.inline.} =
|
||||
(x shl r) or (x shr (32 - r))
|
||||
|
||||
proc murmurHash(x: openArray[byte]): Hash =
|
||||
# https://github.com/PeterScott/murmur3/blob/master/murmur3.c
|
||||
const
|
||||
c1 = 0xcc9e2d51'u32
|
||||
c2 = 0x1b873593'u32
|
||||
n1 = 0xe6546b64'u32
|
||||
m1 = 0x85ebca6b'u32
|
||||
m2 = 0xc2b2ae35'u32
|
||||
let
|
||||
elementSize = sizeof(x[start])
|
||||
stepSize = IntSize div elementSize
|
||||
var i = start
|
||||
while i <= stop+1 - stepSize:
|
||||
var n = 0
|
||||
when nimvm:
|
||||
# we cannot cast in VM, so we do it manually
|
||||
for j in countdown(stepSize-1, 0):
|
||||
n = (n shl (8*elementSize)) or ord(x[i+j])
|
||||
size = len(x)
|
||||
stepSize = 4 # 32-bit
|
||||
n = size div stepSize
|
||||
var
|
||||
h1: uint32
|
||||
i = 0
|
||||
|
||||
# body
|
||||
while i < n * stepSize:
|
||||
var k1: uint32
|
||||
when defined(js):
|
||||
var j = stepSize
|
||||
while j > 0:
|
||||
dec j
|
||||
k1 = (k1 shl 8) or (ord(x[i+j])).uint32
|
||||
else:
|
||||
n = cast[ptr Hash](unsafeAddr x[i])[]
|
||||
result = result !& n
|
||||
i += stepSize
|
||||
bytewiseHashing(result, x, i, stop) # hash the remaining elements and finish
|
||||
k1 = cast[ptr uint32](unsafeAddr x[i])[]
|
||||
inc i, stepSize
|
||||
|
||||
k1 = imul(k1, c1)
|
||||
k1 = rotl32(k1, 15)
|
||||
k1 = imul(k1, c2)
|
||||
|
||||
h1 = h1 xor k1
|
||||
h1 = rotl32(h1, 13)
|
||||
h1 = h1*5 + n1
|
||||
|
||||
# tail
|
||||
var k1: uint32
|
||||
var rem = size mod stepSize
|
||||
while rem > 0:
|
||||
dec rem
|
||||
k1 = (k1 shl 8) or (ord(x[i+rem])).uint32
|
||||
k1 = imul(k1, c1)
|
||||
k1 = rotl32(k1, 15)
|
||||
k1 = imul(k1, c2)
|
||||
h1 = h1 xor k1
|
||||
|
||||
# finalization
|
||||
h1 = h1 xor size.uint32
|
||||
h1 = h1 xor (h1 shr 16)
|
||||
h1 = imul(h1, m1)
|
||||
h1 = h1 xor (h1 shr 13)
|
||||
h1 = imul(h1, m2)
|
||||
h1 = h1 xor (h1 shr 16)
|
||||
return cast[Hash](h1)
|
||||
|
||||
proc hashVmImpl(x: string, sPos, ePos: int): Hash =
|
||||
doAssert false, "implementation override in compiler/vmops.nim"
|
||||
|
||||
proc hashVmImplChar(x: openArray[char], sPos, ePos: int): Hash =
|
||||
doAssert false, "implementation override in compiler/vmops.nim"
|
||||
|
||||
proc hashVmImplByte(x: openArray[byte], sPos, ePos: int): Hash =
|
||||
doAssert false, "implementation override in compiler/vmops.nim"
|
||||
|
||||
proc hash*(x: string): Hash =
|
||||
## Efficient hashing of strings.
|
||||
@@ -182,7 +238,16 @@ proc hash*(x: string): Hash =
|
||||
runnableExamples:
|
||||
doAssert hash("abracadabra") != hash("AbracadabrA")
|
||||
|
||||
hashImpl(result, x, 0, high(x))
|
||||
when not defined(nimToOpenArrayCString):
|
||||
result = 0
|
||||
for c in x:
|
||||
result = result !& ord(c)
|
||||
result = !$result
|
||||
else:
|
||||
when nimvm:
|
||||
result = hashVmImpl(x, 0, high(x))
|
||||
else:
|
||||
result = murmurHash(toOpenArrayByte(x, 0, high(x)))
|
||||
|
||||
proc hash*(x: cstring): Hash =
|
||||
## Efficient hashing of null-terminated strings.
|
||||
@@ -191,7 +256,19 @@ proc hash*(x: cstring): Hash =
|
||||
doAssert hash(cstring"AbracadabrA") == hash("AbracadabrA")
|
||||
doAssert hash(cstring"abracadabra") != hash(cstring"AbracadabrA")
|
||||
|
||||
hashImpl(result, x, 0, high(x))
|
||||
when not defined(nimToOpenArrayCString):
|
||||
result = 0
|
||||
var i = 0
|
||||
while x[i] != '\0':
|
||||
result = result !& ord(x[i])
|
||||
inc i
|
||||
result = !$result
|
||||
else:
|
||||
when not defined(JS) and defined(nimToOpenArrayCString):
|
||||
murmurHash(toOpenArrayByte(x, 0, x.high))
|
||||
else:
|
||||
let xx = $x
|
||||
murmurHash(toOpenArrayByte(xx, 0, high(xx)))
|
||||
|
||||
proc hash*(sBuf: string, sPos, ePos: int): Hash =
|
||||
## Efficient hashing of a string buffer, from starting
|
||||
@@ -202,7 +279,13 @@ proc hash*(sBuf: string, sPos, ePos: int): Hash =
|
||||
var a = "abracadabra"
|
||||
doAssert hash(a, 0, 3) == hash(a, 7, 10)
|
||||
|
||||
hashImpl(result, sBuf, sPos, ePos)
|
||||
when not defined(nimToOpenArrayCString):
|
||||
result = 0
|
||||
for i in sPos..ePos:
|
||||
result = result !& ord(sBuf[i])
|
||||
result = !$result
|
||||
else:
|
||||
murmurHash(toOpenArrayByte(sBuf, sPos, ePos))
|
||||
|
||||
proc hashIgnoreStyle*(x: string): Hash =
|
||||
## Efficient hashing of strings; style is ignored.
|
||||
@@ -300,12 +383,20 @@ proc hash*[T: tuple](x: T): Hash =
|
||||
result = result !& hash(f)
|
||||
result = !$result
|
||||
|
||||
|
||||
proc hash*[A](x: openArray[A]): Hash =
|
||||
## Efficient hashing of arrays and sequences.
|
||||
when A is char|SomeInteger:
|
||||
hashImpl(result, x, 0, x.high)
|
||||
when A is byte:
|
||||
result = murmurHash(x)
|
||||
elif A is char:
|
||||
when nimvm:
|
||||
result = hashVmImplChar(x, 0, x.high)
|
||||
else:
|
||||
result = murmurHash(toOpenArrayByte(x, 0, x.high))
|
||||
else:
|
||||
bytewiseHashing(result, x, 0, x.high)
|
||||
for a in x:
|
||||
result = result !& hash(a)
|
||||
result = !$result
|
||||
|
||||
proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =
|
||||
## Efficient hashing of portions of arrays and sequences, from starting
|
||||
@@ -316,10 +407,20 @@ proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =
|
||||
let a = [1, 2, 5, 1, 2, 6]
|
||||
doAssert hash(a, 0, 1) == hash(a, 3, 4)
|
||||
|
||||
when A is char|SomeInteger:
|
||||
hashImpl(result, aBuf, sPos, ePos)
|
||||
when A is byte:
|
||||
when nimvm:
|
||||
result = hashVmImplByte(aBuf, sPos, ePos)
|
||||
else:
|
||||
result = murmurHash(toOpenArray(aBuf, sPos, ePos))
|
||||
elif A is char:
|
||||
when nimvm:
|
||||
result = hashVmImplChar(aBuf, sPos, ePos)
|
||||
else:
|
||||
result = murmurHash(toOpenArrayByte(aBuf, sPos, ePos))
|
||||
else:
|
||||
bytewiseHashing(result, aBuf, sPos, ePos)
|
||||
for i in sPos .. ePos:
|
||||
result = result !& hash(aBuf[i])
|
||||
result = !$result
|
||||
|
||||
proc hash*[A](x: set[A]): Hash =
|
||||
## Efficient hashing of sets.
|
||||
@@ -334,11 +435,15 @@ when isMainModule:
|
||||
a = ""
|
||||
b = newSeq[char]()
|
||||
c = newSeq[int]()
|
||||
d = cstring""
|
||||
e = "abcd"
|
||||
doAssert hash(a) == 0
|
||||
doAssert hash(b) == 0
|
||||
doAssert hash(c) == 0
|
||||
doAssert hash(d) == 0
|
||||
doAssert hashIgnoreCase(a) == 0
|
||||
doAssert hashIgnoreStyle(a) == 0
|
||||
doAssert hash(e, 3, 2) == 0
|
||||
block sameButDifferent:
|
||||
doAssert hash("aa bb aaaa1234") == hash("aa bb aaaa1234", 0, 13)
|
||||
doAssert hash("aa bb aaaa1234") == hash(cstring"aa bb aaaa1234")
|
||||
@@ -346,14 +451,14 @@ when isMainModule:
|
||||
doAssert hashIgnoreStyle("aa_bb_AAaa1234") == hashIgnoreCase("aaBBAAAa1234")
|
||||
block smallSize: # no multibyte hashing
|
||||
let
|
||||
xx = @['H','e','l','l','o']
|
||||
ii = @[72'i8, 101, 108, 108, 111]
|
||||
ss = "Hello"
|
||||
xx = @['H','i']
|
||||
ii = @[72'u8, 105]
|
||||
ss = "Hi"
|
||||
doAssert hash(xx) == hash(ii)
|
||||
doAssert hash(xx) == hash(ss)
|
||||
doAssert hash(xx) == hash(xx, 0, xx.high)
|
||||
doAssert hash(ss) == hash(ss, 0, ss.high)
|
||||
block largeSize: # longer than 8 characters, should trigger multibyte hashing
|
||||
block largeSize: # longer than 4 characters
|
||||
let
|
||||
xx = @['H','e','l','l','o']
|
||||
xxl = @['H','e','l','l','o','w','e','e','n','s']
|
||||
@@ -362,9 +467,6 @@ when isMainModule:
|
||||
doAssert hash(xxl) == hash(xxl, 0, xxl.high)
|
||||
doAssert hash(ssl) == hash(ssl, 0, ssl.high)
|
||||
doAssert hash(xx) == hash(xxl, 0, 4)
|
||||
block misc:
|
||||
let
|
||||
a = [1'u8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4]
|
||||
b = [1'i8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4]
|
||||
doAssert hash(a) == hash(b)
|
||||
doAssert hash(a, 2, 5) == hash(b, 2, 5)
|
||||
doAssert hash(xx) == hash(ssl, 0, 4)
|
||||
doAssert hash(xx, 0, 3) == hash(xxl, 0, 3)
|
||||
doAssert hash(xx, 0, 3) == hash(ssl, 0, 3)
|
||||
|
||||
@@ -4501,6 +4501,11 @@ when defined(nimconfig):
|
||||
when not defined(js):
|
||||
proc toOpenArray*[T](x: ptr UncheckedArray[T]; first, last: int): openArray[T] {.
|
||||
magic: "Slice".}
|
||||
when defined(nimToOpenArrayCString):
|
||||
proc toOpenArray*(x: cstring; first, last: int): openArray[char] {.
|
||||
magic: "Slice".}
|
||||
proc toOpenArrayByte*(x: cstring; first, last: int): openArray[byte] {.
|
||||
magic: "Slice".}
|
||||
|
||||
proc toOpenArray*[T](x: seq[T]; first, last: int): openArray[T] {.
|
||||
magic: "Slice".}
|
||||
@@ -4510,8 +4515,13 @@ proc toOpenArray*[I, T](x: array[I, T]; first, last: I): openArray[T] {.
|
||||
magic: "Slice".}
|
||||
proc toOpenArray*(x: string; first, last: int): openArray[char] {.
|
||||
magic: "Slice".}
|
||||
|
||||
proc toOpenArrayByte*(x: string; first, last: int): openArray[byte] {.
|
||||
magic: "Slice".}
|
||||
proc toOpenArrayByte*(x: openArray[char]; first, last: int): openArray[byte] {.
|
||||
magic: "Slice".}
|
||||
proc toOpenArrayByte*(x: seq[char]; first, last: int): openArray[byte] {.
|
||||
magic: "Slice".}
|
||||
|
||||
type
|
||||
ForLoopStmt* {.compilerproc.} = object ## \
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
discard """
|
||||
output: '''ob @[]
|
||||
ob3 @[]
|
||||
ob2 @[]
|
||||
3
|
||||
output: '''ob2 @[]
|
||||
ob @[]
|
||||
ob3 @[]
|
||||
ob2 @[]'''
|
||||
3
|
||||
ob2 @[]
|
||||
ob @[]
|
||||
ob3 @[]
|
||||
'''
|
||||
cmd: "nim c -r --threads:on $file"
|
||||
"""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user