From a27612ec6a69b6a0ee9bd039683551daf87e5cd3 Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Mon, 2 Aug 2021 21:01:46 +0200 Subject: [PATCH] Add `_mul_comba` path. --- core/math/big/basic.odin | 131 ++++++++++++++++++++++++++++++++----- core/math/big/build.bat | 6 +- core/math/big/example.odin | 6 +- 3 files changed, 120 insertions(+), 23 deletions(-) diff --git a/core/math/big/basic.odin b/core/math/big/basic.odin index f403d853f..226ddc068 100644 --- a/core/math/big/basic.odin +++ b/core/math/big/basic.odin @@ -499,8 +499,7 @@ mod_bits :: proc { int_mod_bits, }; Multiply by a DIGIT. */ int_mul_digit :: proc(dest, src: ^Int, multiplier: DIGIT) -> (err: Error) { - if err = clear_if_uninitialized(src ); err != .None { return err; } - if err = clear_if_uninitialized(dest); err != .None { return err; } + if err = clear_if_uninitialized(src, dest); err != .None { return err; } if multiplier == 0 { return zero(dest); @@ -576,9 +575,7 @@ int_mul_digit :: proc(dest, src: ^Int, multiplier: DIGIT) -> (err: Error) { High level multiplication (handles sign). */ int_mul :: proc(dest, src, multiplier: ^Int) -> (err: Error) { - if err = clear_if_uninitialized(src); err != .None { return err; } - if err = clear_if_uninitialized(dest); err != .None { return err; } - if err = clear_if_uninitialized(multiplier); err != .None { return err; } + if err = clear_if_uninitialized(dest, src, multiplier); err != .None { return err; } /* Early out for `multiplier` is zero; Set `dest` to zero. @@ -587,11 +584,6 @@ int_mul :: proc(dest, src, multiplier: ^Int) -> (err: Error) { return zero(dest); } - min_used := min(src.used, multiplier.used); - max_used := max(src.used, multiplier.used); - digits := src.used + multiplier.used + 1; - neg := src.sign != multiplier.sign; - if src == multiplier { /* Do we need to square? @@ -619,6 +611,11 @@ int_mul :: proc(dest, src, multiplier: ^Int) -> (err: Error) { * Using it to cut the input into slices small enough for _mul_comba * was actually slower on the author's machine, but YMMV. */ + + min_used := min(src.used, multiplier.used); + max_used := max(src.used, multiplier.used); + digits := src.used + multiplier.used + 1; + if false && min_used >= _MUL_KARATSUBA_CUTOFF && max_used / 2 >= _MUL_KARATSUBA_CUTOFF && /* @@ -630,18 +627,19 @@ int_mul :: proc(dest, src, multiplier: ^Int) -> (err: Error) { // err = s_mp_mul_toom(a, b, c); } else if false && min_used >= _MUL_KARATSUBA_CUTOFF { // err = s_mp_mul_karatsuba(a, b, c); - } else if false && digits < _WARRAY && min_used <= _MAX_COMBA { + } else if digits < _WARRAY && min_used <= _MAX_COMBA { /* Can we use the fast multiplier? * The fast multiplier can be used if the output will * have less than MP_WARRAY digits and the number of * digits won't affect carry propagation */ - // err = s_mp_mul_comba(a, b, c, digs); + err = _int_mul_comba(dest, src, multiplier, digits); } else { err = _int_mul(dest, src, multiplier, digits); } } + neg := src.sign != multiplier.sign; dest.sign = .Negative if dest.used > 0 && neg else .Zero_or_Positive; return err; } @@ -1033,14 +1031,11 @@ _int_sub :: proc(dest, number, decrease: ^Int) -> (err: Error) { many digits of output are created. */ _int_mul :: proc(dest, a, b: ^Int, digits: int) -> (err: Error) { - /* Can we use the fast multiplier? */ - when false { // Have Comba? - if digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA { - return _int_mul_comba(dest, a, b, digits); - } + if digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA { + return _int_mul_comba(dest, a, b, digits); } /* @@ -1095,6 +1090,108 @@ _int_mul :: proc(dest, a, b: ^Int, digits: int) -> (err: Error) { return clamp(dest); } +/* + Fast (comba) multiplier + + This is the fast column-array [comba] multiplier. It is + designed to compute the columns of the product first + then handle the carries afterwards. This has the effect + of making the nested loops that compute the columns very + simple and schedulable on super-scalar processors. + + This has been modified to produce a variable number of + digits of output so if say only a half-product is required + you don't have to compute the upper half (a feature + required for fast Barrett reduction). + + Based on Algorithm 14.12 on pp.595 of HAC. +*/ +_int_mul_comba :: proc(dest, a, b: ^Int, digits: int) -> (err: Error) { + /* + Set up array. + */ + W: [_WARRAY]DIGIT = ---; + + /* + Grow the destination as required. + */ + if err = grow(dest, digits); err != .None { return err; } + + /* + Number of output digits to produce. + */ + pa := min(digits, a.used + b.used); + + /* + Clear the carry + */ + _W := _WORD(0); + + ix: int; + for ix = 0; ix < pa; ix += 1 { + tx, ty, iy, iz: int; + + /* + Get offsets into the two bignums. + */ + ty = min(b.used - 1, ix); + tx = ix - ty; + + /* + This is the number of times the loop will iterate, essentially. + while (tx++ < a->used && ty-- >= 0) { ... } + */ + + iy = min(a.used - tx, ty + 1); + + /* + Execute loop. + */ + for iz = 0; iz < iy; iz += 1 { + _W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz]); + } + + /* + Store term. + */ + W[ix] = DIGIT(_W) & _MASK; + + /* + Make next carry. + */ + _W = _W >> _WORD(_DIGIT_BITS); + } + + /* + Setup dest. + */ + old_used := dest.used; + dest.used = pa; + + for ix = 0; ix < pa; ix += 1 { + /* + Now extract the previous digit [below the carry]. + */ + dest.digit[ix] = W[ix]; + } + + /* + Clear unused digits [that existed in the old copy of dest]. + */ + zero_count := old_used - dest.used; + /* + Zero remainder. + */ + if zero_count > 0 { + mem.zero_slice(dest.digit[dest.used:][:zero_count]); + } + /* + Adjust dest.used based on leading zeroes. + */ + + return clamp(dest); +} + /* Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */ diff --git a/core/math/big/build.bat b/core/math/big/build.bat index 2c1edfcec..d454fef4d 100644 --- a/core/math/big/build.bat +++ b/core/math/big/build.bat @@ -1,10 +1,10 @@ @echo off -odin run . -vet +:odin run . -vet : -o:size -no-bounds-check :odin build . -build-mode:shared -show-timings -o:minimal -use-separate-modules :odin build . -build-mode:shared -show-timings -o:size -use-separate-modules -no-bounds-check :odin build . -build-mode:shared -show-timings -o:size -use-separate-modules -:odin build . -build-mode:shared -show-timings -o:speed -use-separate-modules -no-bounds-check +odin build . -build-mode:shared -show-timings -o:speed -use-separate-modules -no-bounds-check :odin build . -build-mode:shared -show-timings -o:speed -use-separate-modules -:python test.py \ No newline at end of file +python test.py \ No newline at end of file diff --git a/core/math/big/example.odin b/core/math/big/example.odin index f7ba61034..10a14507f 100644 --- a/core/math/big/example.odin +++ b/core/math/big/example.odin @@ -87,7 +87,7 @@ Event :: struct { } Timings := [Category]Event{}; -print :: proc(name: string, a: ^Int, base := i8(10), print_extra_info := false, print_name := false, newline := true) { +print :: proc(name: string, a: ^Int, base := i8(10), print_name := false, newline := true, print_extra_info := false) { s := time.tick_now(); as, err := itoa(a, base); Timings[.itoa].t += time.tick_since(s); Timings[.itoa].c += 1; @@ -117,10 +117,10 @@ demo :: proc() { defer destroy(a, b, c, d, e, f); s := time.tick_now(); - err = choose(a, 1024, 255); + err = choose(a, 65535, 255); Timings[.choose].t += time.tick_since(s); Timings[.choose].c += 1; - print("1024 choose 255", a, 10, true, true, true); + print("65535 choose 255", a, 10, true, true, true); fmt.printf("Error: %v\n", err); }