From 2cfd6b702473093b167aaa3075e71a36f3bc4eef Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Sat, 28 Aug 2021 14:59:13 +0200 Subject: [PATCH 1/2] big: Add `_private_int_mul_high`. --- core/math/big/build.bat | 4 +- core/math/big/private.odin | 123 +++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 2 deletions(-) diff --git a/core/math/big/build.bat b/core/math/big/build.bat index 31480bcc8..e6049b2e1 100644 --- a/core/math/big/build.bat +++ b/core/math/big/build.bat @@ -1,9 +1,9 @@ @echo off -:odin run . -vet +odin run . -vet set TEST_ARGS=-fast-tests :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% -odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% +:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS% \ No newline at end of file diff --git a/core/math/big/private.odin b/core/math/big/private.odin index d4cced10a..5fda9f2b9 100644 --- a/core/math/big/private.odin +++ b/core/math/big/private.odin @@ -426,6 +426,129 @@ _private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int, allocator := conte return internal_clamp(dest); } +/* + Multiplies |a| * |b| and does not compute the lower digs digits + [meant to get the higher part of the product] +*/ +_private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) { + context.allocator = allocator; + + /* + Can we use the fast multiplier? + */ + if a.used + b.used + 1 < _WARRAY && min(a.used, b.used) < _MAX_COMBA { + return _private_int_mul_high_comba(dest, a, b, digits); + } + + internal_grow(dest, a.used + b.used + 1) or_return; + dest.used = a.used + b.used + 1; + + pa := a.used; + pb := b.used; + for ix := 0; ix < pa; ix += 1 { + carry := DIGIT(0); + + for iy := digits - ix; iy < pb; iy += 1 { + /* + Calculate the double precision result. + */ + r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry); + + /* + Get the lower part. + */ + dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK)); + + /* + Carry the carry. + */ + carry = DIGIT(r >> _WORD(_DIGIT_BITS)); + } + dest.digit[ix + pb] = carry; + } + return internal_clamp(dest); +} + +/* + This is a modified version of `_private_int_mul_comba` that only produces output digits *above* `digits`. + See the comments for `_private_int_mul_comba` to see how it works. + + This is used in the Barrett reduction since for one of the multiplications + only the higher digits were needed. This essentially halves the work. + + Based on Algorithm 14.12 on pp.595 of HAC. +*/ +_private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) { + context.allocator = allocator; + + W: [_WARRAY]DIGIT = ---; + _W: _WORD = 0; + + /* + Number of output digits to produce. Grow the destination as required. + */ + pa := a.used + b.used; + internal_grow(dest, pa) or_return; + + ix: int; + for ix = digits; ix < pa; ix += 1 { + /* + Get offsets into the two bignums. + */ + ty := min(b.used - 1, ix); + tx := ix - ty; + + /* + This is the number of times the loop will iterrate, essentially it's + while (tx++ < a->used && ty-- >= 0) { ... } + */ + iy := min(a.used - tx, ty + 1); + + /* + Execute loop. + */ + for iz := 0; iz < iy; iz += 1 { + _W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz]); + } + + /* + Store term. + */ + W[ix] = DIGIT(_W) & DIGIT(_MASK); + + /* + Make next carry. + */ + _W = _W >> _WORD(_DIGIT_BITS); + } + + /* + Setup dest + */ + old_used := dest.used; + dest.used = pa; + + for ix = digits; ix < pa; ix += 1 { + /* + Now extract the previous digit [below the carry]. + */ + dest.digit[ix] = W[ix]; + } + + /* + Zero remainder. + */ + internal_zero_unused(dest, old_used); + + /* + Adjust dest.used based on leading zeroes. + */ + return internal_clamp(dest); +} + + + + /* Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 Assumes `dest` and `src` to not be `nil`, and `src` to have been initialized. From 737b4fde1c8b3e450209716f8a182b8edfb4f59e Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Sat, 28 Aug 2021 18:17:57 +0200 Subject: [PATCH 2/2] big: Add `_private_int_mul_balance`. --- core/math/big/build.bat | 4 +- core/math/big/example.odin | 9 --- core/math/big/internal.odin | 11 ++-- core/math/big/private.odin | 107 +++++++++++++++++++++++++++++------- core/math/big/test.py | 17 ++++-- 5 files changed, 106 insertions(+), 42 deletions(-) diff --git a/core/math/big/build.bat b/core/math/big/build.bat index e6049b2e1..31480bcc8 100644 --- a/core/math/big/build.bat +++ b/core/math/big/build.bat @@ -1,9 +1,9 @@ @echo off -odin run . -vet +:odin run . -vet set TEST_ARGS=-fast-tests :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% -:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% +odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS% :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS% \ No newline at end of file diff --git a/core/math/big/example.odin b/core/math/big/example.odin index cddf3ffae..4542e9e15 100644 --- a/core/math/big/example.odin +++ b/core/math/big/example.odin @@ -205,15 +205,6 @@ int_to_byte_little :: proc(v: ^Int) { demo :: proc() { a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; defer destroy(a, b, c, d, e, f); - - foo := "92232459121502451677697058974826760244863271517919321608054113675118660929276431348516553336313179167211015633639725554914519355444316239500734169769447134357534241879421978647995614218985202290368055757891124109355450669008628757662409138767505519391883751112010824030579849970582074544353971308266211776494228299586414907715854328360867232691292422194412634523666770452490676515117702116926803826546868467146319938818238521874072436856528051486567230096290549225463582766830777324099589751817442141036031904145041055454639783559905920619197290800070679733841430619962318433709503256637256772215111521321630777950145713049902839937043785039344243357384899099910837463164007565230287809026956254332260375327814271845678201"; - - set(a, foo); - - print("a: ", a); - - is_sqr, _ := internal_int_is_square(a); - fmt.printf("is_square: %v\n", is_sqr); } main :: proc() { diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin index 4903dd009..9422067ae 100644 --- a/core/math/big/internal.odin +++ b/core/math/big/internal.odin @@ -659,8 +659,7 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc Can we use the balance method? Check sizes. * The smaller one needs to be larger than the Karatsuba cut-off. * The bigger one needs to be at least about one `_MUL_KARATSUBA_CUTOFF` bigger - * to make some sense, but it depends on architecture, OS, position of the - * stars... so YMMV. + * to make some sense, but it depends on architecture, OS, position of the stars... so YMMV. * Using it to cut the input into slices small enough for _mul_comba * was actually slower on the author's machine, but YMMV. */ @@ -669,13 +668,11 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc max_used := max(src.used, multiplier.used); digits := src.used + multiplier.used + 1; - if false && min_used >= MUL_KARATSUBA_CUTOFF && - max_used / 2 >= MUL_KARATSUBA_CUTOFF && + if min_used >= MUL_KARATSUBA_CUTOFF && (max_used / 2) >= MUL_KARATSUBA_CUTOFF && max_used >= (2 * min_used) { /* Not much effect was observed below a ratio of 1:2, but again: YMMV. */ - max_used >= 2 * min_used { - // err = s_mp_mul_balance(a,b,c); + err = _private_int_mul_balance(dest, src, multiplier); } else if min_used >= MUL_TOOM_CUTOFF { /* Toom path commented out until it no longer fails Factorial 10k or 100k, @@ -914,7 +911,7 @@ internal_int_factorial :: proc(res: ^Int, n: int, allocator := context.allocator context.allocator = allocator; if n >= FACTORIAL_BINARY_SPLIT_CUTOFF { - return #force_inline _private_int_factorial_binary_split(res, n); + return _private_int_factorial_binary_split(res, n); } i := len(_factorial_table); diff --git a/core/math/big/private.odin b/core/math/big/private.odin index 5fda9f2b9..d71946ce2 100644 --- a/core/math/big/private.odin +++ b/core/math/big/private.odin @@ -113,7 +113,7 @@ _private_int_mul_toom :: proc(dest, a, b: ^Int, allocator := context.allocator) context.allocator = allocator; S1, S2, T1, a0, a1, a2, b0, b1, b2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; - defer destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2); + defer internal_destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2); /* Init temps. @@ -258,7 +258,7 @@ _private_int_mul_karatsuba :: proc(dest, a, b: ^Int, allocator := context.alloca context.allocator = allocator; x0, x1, y0, y1, t1, x0y0, x1y1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; - defer destroy(x0, x1, y0, y1, t1, x0y0, x1y1); + defer internal_destroy(x0, x1, y0, y1, t1, x0y0, x1y1); /* min # of digits, divided by two. @@ -546,8 +546,74 @@ _private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := return internal_clamp(dest); } +/* + Single-digit multiplication with the smaller number as the single-digit. +*/ +_private_int_mul_balance :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) { + context.allocator = allocator; + a, b := a, b; + a0, tmp, r := &Int{}, &Int{}, &Int{}; + defer internal_destroy(a0, tmp, r); + b_size := min(a.used, b.used); + n_blocks := max(a.used, b.used) / b_size; + + internal_grow(a0, b_size + 2) or_return; + internal_init_multi(tmp, r) or_return; + + /* + Make sure that `a` is the larger one. + */ + if a.used < b.used { + a, b = b, a; + } + assert(a.used >= b.used); + + i, j := 0, 0; + for ; i < n_blocks; i += 1 { + /* + Cut a slice off of `a`. + */ + + a0.used = b_size; + internal_copy_digits(a0, a, a0.used, j); + j += a0.used; + internal_clamp(a0); + + /* + Multiply with `b`. + */ + internal_mul(tmp, a0, b) or_return; + + /* + Shift `tmp` to the correct position. + */ + internal_shl_digit(tmp, b_size * i) or_return; + + /* + Add to output. No carry needed. + */ + internal_add(r, r, tmp) or_return; + } + + /* + The left-overs; there are always left-overs. + */ + if j < a.used { + a0.used = a.used - j; + internal_copy_digits(a0, a, a0.used, j); + j += a0.used; + internal_clamp(a0); + + internal_mul(tmp, a0, b) or_return; + internal_shl_digit(tmp, b_size * i) or_return; + internal_add(r, r, tmp) or_return; + } + + internal_swap(dest, r); + return; +} /* Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 @@ -1311,7 +1377,7 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}; c: int; - defer destroy(ta, tb, tq, q); + defer internal_destroy(ta, tb, tq, q); for { internal_one(tq) or_return; @@ -1364,31 +1430,34 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html */ _private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := context.allocator) -> (err: Error) { + context.allocator = allocator; inner, outer, start, stop, temp := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; defer internal_destroy(inner, outer, start, stop, temp); - internal_one(inner, false, allocator) or_return; - internal_one(outer, false, allocator) or_return; + internal_one(inner, false) or_return; + internal_one(outer, false) or_return; bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n)); for i := bits_used; i >= 0; i -= 1 { start := (n >> (uint(i) + 1)) + 1 | 1; stop := (n >> uint(i)) + 1 | 1; - _private_int_recursive_product(temp, start, stop, 0, allocator) or_return; - internal_mul(inner, inner, temp, allocator) or_return; - internal_mul(outer, outer, inner, allocator) or_return; + _private_int_recursive_product(temp, start, stop, 0) or_return; + internal_mul(inner, inner, temp) or_return; + internal_mul(outer, outer, inner) or_return; } shift := n - intrinsics.count_ones(n); - return internal_shl(res, outer, int(shift), allocator); + return internal_shl(res, outer, int(shift)); } /* Recursive product used by binary split factorial algorithm. */ _private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0), allocator := context.allocator) -> (err: Error) { + context.allocator = allocator; + t1, t2 := &Int{}, &Int{}; defer internal_destroy(t1, t2); @@ -1398,28 +1467,28 @@ _private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int num_factors := (stop - start) >> 1; if num_factors == 2 { - internal_set(t1, start, false, allocator) or_return; + internal_set(t1, start, false) or_return; when true { - internal_grow(t2, t1.used + 1, false, allocator) or_return; - internal_add(t2, t1, 2, allocator) or_return; + internal_grow(t2, t1.used + 1, false) or_return; + internal_add(t2, t1, 2) or_return; } else { - add(t2, t1, 2) or_return; + internal_add(t2, t1, 2) or_return; } - return internal_mul(res, t1, t2, allocator); + return internal_mul(res, t1, t2); } if num_factors > 1 { mid := (start + num_factors) | 1; - _private_int_recursive_product(t1, start, mid, level + 1, allocator) or_return; - _private_int_recursive_product(t2, mid, stop, level + 1, allocator) or_return; - return internal_mul(res, t1, t2, allocator); + _private_int_recursive_product(t1, start, mid, level + 1) or_return; + _private_int_recursive_product(t2, mid, stop, level + 1) or_return; + return internal_mul(res, t1, t2); } if num_factors == 1 { - return #force_inline internal_set(res, start, true, allocator); + return #force_inline internal_set(res, start, true); } - return #force_inline internal_one(res, true, allocator); + return #force_inline internal_one(res, true); } /* diff --git a/core/math/big/test.py b/core/math/big/test.py index 79ee2c7ca..df59fa1c8 100644 --- a/core/math/big/test.py +++ b/core/math/big/test.py @@ -403,14 +403,21 @@ def test_shr_signed(a = 0, bits = 0, expected_error = Error.Okay): return test("test_shr_signed", res, [a, bits], expected_error, expected_result) -def test_factorial(n = 0, expected_error = Error.Okay): - args = [n] - res = int_factorial(*args) +def test_factorial(number = 0, expected_error = Error.Okay): + print("Factorial:", number) + args = [number] + try: + res = int_factorial(*args) + except OSError as e: + print("{} while trying to factorial {}.".format(e, number)) + if EXIT_ON_FAIL: exit(3) + return False + expected_result = None if expected_error == Error.Okay: - expected_result = math.factorial(n) + expected_result = math.factorial(number) - return test("test_factorial", res, [n], expected_error, expected_result) + return test("test_factorial", res, [number], expected_error, expected_result) def test_gcd(a = 0, b = 0, expected_error = Error.Okay): args = [arg_to_odin(a), arg_to_odin(b)]