mirror of
https://github.com/nim-lang/Nim.git
synced 2026-02-14 15:23:27 +00:00
Resolve things raised in https://github.com/nim-lang/Nim/issues/10081 ? (#10084)
* Resolve things raised in https://github.com/nim-lang/Nim/issues/10081 ? CDF is a standard ident in all things related to random numbers/sampling, and full words "cumulativeDistributionFunction" would be silly long, in this case, IMO. We use lowercase `cdf` to make it not look like a type, remove all looping from `sample` letting callers do it. Besides just side-stepping any `sampleSize` name choice, callers may want to filter out samples anyway which this makes slightly simpler. Also add two variants of `cumsum`, value return and in-place update distinguished by the var-ness of the first argument. Add tests for `int` and `float` for both `cumsum` and the new `sample`. (The sample tests exercise the value return mode of `cumsum`.) Functionality pre-this-PR `sample(a, w)` is now the almost as simple `for i in 0..<n: sample(a, w.cumsum)`, but this new code factoring is almost surely better. The statistical tests pass, as before. * Address Araq comment in https://github.com/nim-lang/Nim/pull/10084 We can always add in some `var` version later if desired to save memory, but this change now at least firms up the `sample` interface. * Rename `cumsum` -> `cumsummed` to honor NEP1 style. Re-instate `cumsum` as the in-place transformation. Test both in `tests/stdlib/tmath.nim` and use `cumsummed` in the example code for sample since that's a simpler example. * Fix requests from https://github.com/nim-lang/Nim/pull/10084 : example in lib/pure/math.nim and comment whitespace in lib/pure/random.nim
This commit is contained in:
@@ -163,6 +163,24 @@ proc prod*[T](x: openArray[T]): T {.noSideEffect.} =
|
||||
result = 1.T
|
||||
for i in items(x): result = result * i
|
||||
|
||||
proc cumsummed*[T](x: openArray[T]): seq[T] =
|
||||
## Return cumulative aka prefix summation of ``x``.
|
||||
##
|
||||
## .. code-block:: nim
|
||||
## var x = [1, 2, 3, 4]
|
||||
## echo x.cumsummed # [1, 3, 6, 10]
|
||||
result.setLen(x.len)
|
||||
result[0] = x[0]
|
||||
for i in 1 ..< x.len: result[i] = result[i-1] + x[i]
|
||||
|
||||
proc cumsum*[T](x: var openArray[T]) =
|
||||
## Transforms ``x`` in-place into its cumulative aka prefix summation.
|
||||
##
|
||||
## .. code-block:: nim
|
||||
## var x = [1, 2, 3, 4]
|
||||
## x.cumsum; echo x # [1, 3, 6, 10]
|
||||
for i in 1 ..< x.len: x[i] = x[i-1] + x[i]
|
||||
|
||||
{.push noSideEffect.}
|
||||
when not defined(JS): # C
|
||||
proc sqrt*(x: float32): float32 {.importc: "sqrtf", header: "<math.h>".}
|
||||
|
||||
@@ -175,27 +175,26 @@ proc sample*[T](a: openArray[T]): T =
|
||||
## returns a random element from openArray ``a`` using non-thread-safe state.
|
||||
result = a[rand(a.low..a.high)]
|
||||
|
||||
proc sample*[T, U](r: var Rand; a: openArray[T], w: openArray[U], n=1): seq[T] =
|
||||
## Return a sample (with replacement) of size ``n`` from elements of ``a``
|
||||
## according to convertible-to-``float``, not necessarily normalized, and
|
||||
## non-negative weights ``w``. Uses state in ``r``. Must have sum ``w > 0.0``.
|
||||
assert(w.len == a.len)
|
||||
var cdf = newSeq[float](a.len) # The *unnormalized* CDF
|
||||
var tot = 0.0 # Unnormalized is fine if we sample up to tot
|
||||
for i, w in w:
|
||||
assert(w >= 0)
|
||||
tot += float(w)
|
||||
cdf[i] = tot
|
||||
assert(tot > 0.0) # Need at least one non-zero weight
|
||||
for i in 0 ..< n:
|
||||
result.add(a[cdf.upperBound(r.rand(tot))])
|
||||
|
||||
proc sample*[T, U](a: openArray[T], w: openArray[U], n=1): seq[T] =
|
||||
## Return a sample (with replacement) of size ``n`` from elements of ``a``
|
||||
## according to convertible-to-``float``, not necessarily normalized, and
|
||||
## non-negative weights ``w``. Uses default non-thread-safe state.
|
||||
state.sample(a, w, n)
|
||||
proc sample*[T, U](r: var Rand; a: openArray[T], cdf: openArray[U]): T =
|
||||
## Sample one element from openArray ``a`` when it has cumulative distribution
|
||||
## function (CDF) ``cdf`` (not necessarily normalized, any type of elements
|
||||
## convertible to ``float``). Uses state in ``r``. E.g.:
|
||||
##
|
||||
## .. code-block:: nim
|
||||
## let val = [ "a", "b", "c", "d" ] # some values
|
||||
## var cnt = [1, 2, 3, 4] # histogram of counts
|
||||
## echo r.sample(val, cnt.cumsummed) # echo a sample
|
||||
assert(cdf.len == a.len) # Two basic sanity checks.
|
||||
assert(float(cdf[^1]) > 0.0)
|
||||
#While we could check cdf[i-1] <= cdf[i] for i in 1..cdf.len, that could get
|
||||
#awfully expensive even in debugging modes.
|
||||
let u = r.rand(float(cdf[^1]))
|
||||
a[cdf.upperBound(U(u))]
|
||||
|
||||
proc sample*[T, U](a: openArray[T], cdf: openArray[U]): T =
|
||||
## Like ``sample(var Rand; openArray[T], openArray[U])``, but uses default
|
||||
## non-thread-safe state.
|
||||
state.sample(a, cdf)
|
||||
|
||||
proc initRand*(seed: int64): Rand =
|
||||
## Creates a new ``Rand`` state from ``seed``.
|
||||
|
||||
@@ -4,6 +4,8 @@ discard """
|
||||
|
||||
[Suite] random float
|
||||
|
||||
[Suite] cumsum
|
||||
|
||||
[Suite] random sample
|
||||
|
||||
[Suite] ^
|
||||
@@ -74,29 +76,66 @@ suite "random float":
|
||||
var rand2:float = random(1000000.0)
|
||||
check rand1 != rand2
|
||||
|
||||
suite "cumsum":
|
||||
test "cumsum int seq return":
|
||||
let counts = [ 1, 2, 3, 4 ]
|
||||
check counts.cumsummed == [ 1, 3, 6, 10 ]
|
||||
|
||||
test "cumsum float seq return":
|
||||
let counts = [ 1.0, 2.0, 3.0, 4.0 ]
|
||||
check counts.cumsummed == [ 1.0, 3.0, 6.0, 10.0 ]
|
||||
|
||||
test "cumsum int in-place":
|
||||
var counts = [ 1, 2, 3, 4 ]
|
||||
counts.cumsum
|
||||
check counts == [ 1, 3, 6, 10 ]
|
||||
|
||||
test "cumsum float in-place":
|
||||
var counts = [ 1.0, 2.0, 3.0, 4.0 ]
|
||||
counts.cumsum
|
||||
check counts == [ 1.0, 3.0, 6.0, 10.0 ]
|
||||
|
||||
suite "random sample":
|
||||
test "non-uniform array sample":
|
||||
test "non-uniform array sample unnormalized int CDF":
|
||||
let values = [ 10, 20, 30, 40, 50 ] # values
|
||||
let weight = [ 4, 3, 2, 1, 0 ] # weights aka unnormalized probabilities
|
||||
let weightSum = 10.0 # sum of weights
|
||||
let counts = [ 4, 3, 2, 1, 0 ] # weights aka unnormalized probabilities
|
||||
var histo = initCountTable[int]()
|
||||
for v in sample(values, weight, 5000):
|
||||
histo.inc(v)
|
||||
check histo.len == 4 # number of non-zero in `weight`
|
||||
let cdf = counts.cumsummed # unnormalized CDF
|
||||
for i in 0 ..< 5000:
|
||||
histo.inc(sample(values, cdf))
|
||||
check histo.len == 4 # number of non-zero in `counts`
|
||||
# Any one bin is a binomial random var for n samples, each with prob p of
|
||||
# adding a count to k; E[k]=p*n, Var k=p*(1-p)*n, approximately Normal for
|
||||
# big n. So, P(abs(k - p*n)/sqrt(p*(1-p)*n))>3.0) =~ 0.0027, while
|
||||
# P(wholeTestFails) =~ 1 - P(binPasses)^4 =~ 1 - (1-0.0027)^4 =~ 0.01.
|
||||
for i, w in weight:
|
||||
if w == 0:
|
||||
for i, c in counts:
|
||||
if c == 0:
|
||||
check values[i] notin histo
|
||||
continue
|
||||
let p = float(w) / float(weightSum)
|
||||
let p = float(c) / float(cdf[^1])
|
||||
let n = 5000.0
|
||||
let expected = p * n
|
||||
let stdDev = sqrt(n * p * (1.0 - p))
|
||||
check abs(float(histo[values[i]]) - expected) <= 3.0 * stdDev
|
||||
|
||||
test "non-uniform array sample normalized float CDF":
|
||||
let values = [ 10, 20, 30, 40, 50 ] # values
|
||||
let counts = [ 0.4, 0.3, 0.2, 0.1, 0 ] # probabilities
|
||||
var histo = initCountTable[int]()
|
||||
let cdf = counts.cumsummed # normalized CDF
|
||||
for i in 0 ..< 5000:
|
||||
histo.inc(sample(values, cdf))
|
||||
check histo.len == 4 # number of non-zero in ``counts``
|
||||
for i, c in counts:
|
||||
if c == 0:
|
||||
check values[i] notin histo
|
||||
continue
|
||||
let p = float(c) / float(cdf[^1])
|
||||
let n = 5000.0
|
||||
let expected = p * n
|
||||
let stdDev = sqrt(n * p * (1.0 - p))
|
||||
# NOTE: like unnormalized int CDF test, P(wholeTestFails) =~ 0.01.
|
||||
check abs(float(histo[values[i]]) - expected) <= 3.0 * stdDev
|
||||
|
||||
suite "^":
|
||||
test "compiles for valid types":
|
||||
|
||||
Reference in New Issue
Block a user