diff --git a/core/math/big/basic.odin b/core/math/big/basic.odin index 755aa51cd..425a5488c 100644 --- a/core/math/big/basic.odin +++ b/core/math/big/basic.odin @@ -326,4 +326,81 @@ int_mod_bits :: proc(remainder, numerator: ^Int, bits: int) -> (err: Error) { return #force_inline internal_int_mod_bits(remainder, numerator, bits); } -mod_bits :: proc { int_mod_bits, }; \ No newline at end of file +mod_bits :: proc { int_mod_bits, }; + + +/* + Logs and roots and such. +*/ +int_log :: proc(a: ^Int, base: DIGIT) -> (res: int, err: Error) { + if a == nil { return 0, .Invalid_Pointer; } + if err = clear_if_uninitialized(a); err != nil { return 0, err; } + + return #force_inline internal_int_log(a, base); +} + +digit_log :: proc(a: DIGIT, base: DIGIT) -> (log: int, err: Error) { + return #force_inline internal_digit_log(a, base); +} +log :: proc { int_log, digit_log, }; + +/* + Calculate `dest = base^power` using a square-multiply algorithm. +*/ +int_pow :: proc(dest, base: ^Int, power: int) -> (err: Error) { + if dest == nil || base == nil { return .Invalid_Pointer; } + if err = clear_if_uninitialized(dest, base); err != nil { return err; } + + return #force_inline internal_int_pow(dest, base, power); +} + +/* + Calculate `dest = base^power` using a square-multiply algorithm. +*/ +int_pow_int :: proc(dest: ^Int, base, power: int) -> (err: Error) { + if dest == nil { return .Invalid_Pointer; } + + return #force_inline internal_pow(dest, base, power); +} + +pow :: proc { int_pow, int_pow_int, small_pow, }; +exp :: pow; + +small_pow :: proc(base: _WORD, exponent: _WORD) -> (result: _WORD) { + return #force_inline internal_small_pow(base, exponent); +} + +/* + This function is less generic than `root_n`, simpler and faster. +*/ +int_sqrt :: proc(dest, src: ^Int) -> (err: Error) { + if dest == nil || src == nil { return .Invalid_Pointer; } + if err = clear_if_uninitialized(dest, src); err != nil { return err; } + + return #force_inline internal_int_sqrt(dest, src); +} +sqrt :: proc { int_sqrt, }; + + +/* + Find the nth root of an Integer. + Result found such that `(dest)**n <= src` and `(dest+1)**n > src` + + This algorithm uses Newton's approximation `x[i+1] = x[i] - f(x[i])/f'(x[i])`, + which will find the root in `log(n)` time where each step involves a fair bit. +*/ +int_root_n :: proc(dest, src: ^Int, n: int) -> (err: Error) { + /* + Fast path for n == 2. + */ + if n == 2 { return sqrt(dest, src); } + + if dest == nil || src == nil { return .Invalid_Pointer; } + /* + Initialize dest + src if needed. + */ + if err = clear_if_uninitialized(dest, src); err != nil { return err; } + + return #force_inline internal_int_root_n(dest, src, n); +} +root_n :: proc { int_root_n, }; \ No newline at end of file diff --git a/core/math/big/build.bat b/core/math/big/build.bat index 1921803b5..e98ac7015 100644 --- a/core/math/big/build.bat +++ b/core/math/big/build.bat @@ -1,10 +1,8 @@ @echo off -odin run . -vet +:odin run . -vet : -o:size -:odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -:odin build . -build-mode:shared -show-timings -o:size -:odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -:odin build . -build-mode:shared -show-timings -o:speed - -:python test.py \ No newline at end of file +:odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check && python test.py +:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check && python test.py +:odin build . -build-mode:shared -show-timings -o:size && python test.py +odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check && python test.py +:odin build . -build-mode:shared -show-timings -o:speed && python test.py \ No newline at end of file diff --git a/core/math/big/common.odin b/core/math/big/common.odin index 837fc5789..562aa46fa 100644 --- a/core/math/big/common.odin +++ b/core/math/big/common.odin @@ -109,6 +109,7 @@ Flags :: bit_set[Flag; u8]; Errors are a strict superset of runtime.Allocation_Error. */ Error :: enum int { + Okay = 0, Out_Of_Memory = 1, Invalid_Pointer = 2, Invalid_Argument = 3, diff --git a/core/math/big/example.odin b/core/math/big/example.odin index 761fa97cb..02671a809 100644 --- a/core/math/big/example.odin +++ b/core/math/big/example.odin @@ -75,12 +75,8 @@ demo :: proc() { a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; defer destroy(a, b, c, d, e, f); - power_of_two(a, 3112); - fmt.printf("a is power of two: %v\n", internal_is_power_of_two(a)); - sub(a, a, 1); - fmt.printf("a is power of two: %v\n", internal_is_power_of_two(a)); - add(a, a, 1); - fmt.printf("a is power of two: %v\n", internal_is_power_of_two(a)); + err := set(a, 1); + fmt.printf("err: %v\n", err); } main :: proc() { diff --git a/core/math/big/exp_log.odin b/core/math/big/exp_log.odin deleted file mode 100644 index 15dfa0b5a..000000000 --- a/core/math/big/exp_log.odin +++ /dev/null @@ -1,496 +0,0 @@ -package big - -/* - Copyright 2021 Jeroen van Rijn . - Made available under Odin's BSD-2 license. - - An arbitrary precision mathematics implementation in Odin. - For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3. - The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks. -*/ - -int_log :: proc(a: ^Int, base: DIGIT) -> (res: int, err: Error) { - if base < 2 || DIGIT(base) > _DIGIT_MAX { - return -1, .Invalid_Argument; - } - if err = clear_if_uninitialized(a); err != nil { return -1, err; } - if n, _ := is_neg(a); n { return -1, .Math_Domain_Error; } - if z, _ := is_zero(a); z { return -1, .Math_Domain_Error; } - - /* - Fast path for bases that are a power of two. - */ - if is_power_of_two(int(base)) { return _log_power_of_two(a, base); } - - /* - Fast path for `Int`s that fit within a single `DIGIT`. - */ - if a.used == 1 { return log(a.digit[0], DIGIT(base)); } - - return _int_log(a, base); - -} - -log :: proc { int_log, int_log_digit, }; - -/* - Calculate c = a**b using a square-multiply algorithm. -*/ -int_pow :: proc(dest, base: ^Int, power: int) -> (err: Error) { - power := power; - if err = clear_if_uninitialized(base); err != nil { return err; } - if err = clear_if_uninitialized(dest); err != nil { return err; } - /* - Early outs. - */ - if z, _ := is_zero(base); z { - /* - A zero base is a special case. - */ - if power < 0 { - if err = zero(dest); err != nil { return err; } - return .Math_Domain_Error; - } - if power == 0 { return set(dest, 1); } - if power > 0 { return zero(dest); } - - } - if power < 0 { - /* - Fraction, so we'll return zero. - */ - return zero(dest); - } - switch(power) { - case 0: - /* - Any base to the power zero is one. - */ - return one(dest); - case 1: - /* - Any base to the power one is itself. - */ - return copy(dest, base); - case 2: - return sqr(dest, base); - } - - g := &Int{}; - if err = copy(g, base); err != nil { return err; } - - /* - Set initial result. - */ - if err = set(dest, 1); err != nil { return err; } - - loop: for power > 0 { - /* - If the bit is set, multiply. - */ - if power & 1 != 0 { - if err = mul(dest, g, dest); err != nil { - break loop; - } - } - /* - Square. - */ - if power > 1 { - if err = sqr(g, g); err != nil { - break loop; - } - } - - /* shift to next bit */ - power >>= 1; - } - - destroy(g); - return err; -} - -/* - Calculate c = a**b. -*/ -int_pow_int :: proc(dest: ^Int, base, power: int) -> (err: Error) { - base_t := &Int{}; - defer destroy(base_t); - - if err = set(base_t, base); err != nil { return err; } - - return int_pow(dest, base_t, power); -} - -pow :: proc { int_pow, int_pow_int, }; -exp :: pow; - -/* - Returns the log2 of an `Int`, provided `base` is a power of two. - Don't call it if it isn't. -*/ -_log_power_of_two :: proc(a: ^Int, base: DIGIT) -> (log: int, err: Error) { - base := base; - y: int; - for y = 0; base & 1 == 0; { - y += 1; - base >>= 1; - } - log, err = count_bits(a); - return (log - 1) / y, err; -} - -/* - -*/ -small_pow :: proc(base: _WORD, exponent: _WORD) -> (result: _WORD) { - exponent := exponent; base := base; - result = _WORD(1); - - for exponent != 0 { - if exponent & 1 == 1 { - result *= base; - } - exponent >>= 1; - base *= base; - } - return result; -} - -int_log_digit :: proc(a: DIGIT, base: DIGIT) -> (log: int, err: Error) { - /* - If the number is smaller than the base, it fits within a fraction. - Therefore, we return 0. - */ - if a < base { - return 0, nil; - } - - /* - If a number equals the base, the log is 1. - */ - if a == base { - return 1, nil; - } - - N := _WORD(a); - bracket_low := _WORD(1); - bracket_high := _WORD(base); - high := 1; - low := 0; - - for bracket_high < N { - low = high; - bracket_low = bracket_high; - high <<= 1; - bracket_high *= bracket_high; - } - - for high - low > 1 { - mid := (low + high) >> 1; - bracket_mid := bracket_low * small_pow(_WORD(base), _WORD(mid - low)); - - if N < bracket_mid { - high = mid; - bracket_high = bracket_mid; - } - if N > bracket_mid { - low = mid; - bracket_low = bracket_mid; - } - if N == bracket_mid { - return mid, nil; - } - } - - if bracket_high == N { - return high, nil; - } else { - return low, nil; - } -} - -/* - This function is less generic than `root_n`, simpler and faster. -*/ -int_sqrt :: proc(dest, src: ^Int) -> (err: Error) { - - when true { - if err = clear_if_uninitialized(dest); err != nil { return err; } - if err = clear_if_uninitialized(src); err != nil { return err; } - - /* Must be positive. */ - if src.sign == .Negative { return .Invalid_Argument; } - - /* Easy out. If src is zero, so is dest. */ - if z, _ := is_zero(src); z { return zero(dest); } - - /* Set up temporaries. */ - x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}; - defer destroy(x, y, t1, t2); - - count: int; - if count, err = count_bits(src); err != nil { return err; } - - a, b := count >> 1, count & 1; - if err = power_of_two(x, a+b); err != nil { return err; } - - for { - /* - y = (x + n//x)//2 - */ - div(t1, src, x); - add(t2, t1, x); - shr(y, t2, 1); - - if c, _ := cmp(y, x); c == 0 || c == 1 { - swap(dest, x); - return nil; - } - swap(x, y); - } - - swap(dest, x); - return err; - } else { - return root_n(dest, src, 2); - } -} -sqrt :: proc { int_sqrt, }; - - -/* - Find the nth root of an Integer. - Result found such that `(dest)**n <= src` and `(dest+1)**n > src` - - This algorithm uses Newton's approximation `x[i+1] = x[i] - f(x[i])/f'(x[i])`, - which will find the root in `log(n)` time where each step involves a fair bit. -*/ -int_root_n :: proc(dest, src: ^Int, n: int) -> (err: Error) { - /* Fast path for n == 2 */ - if n == 2 { return sqrt(dest, src); } - - /* Initialize dest + src if needed. */ - if err = clear_if_uninitialized(dest); err != nil { return err; } - if err = clear_if_uninitialized(src); err != nil { return err; } - - if n < 0 || n > int(_DIGIT_MAX) { - return .Invalid_Argument; - } - - neg: bool; - if n & 1 == 0 { - if neg, err = is_neg(src); neg || err != nil { return .Invalid_Argument; } - } - - /* Set up temporaries. */ - t1, t2, t3, a := &Int{}, &Int{}, &Int{}, &Int{}; - defer destroy(t1, t2, t3); - - /* If a is negative fudge the sign but keep track. */ - a.sign = .Zero_or_Positive; - a.used = src.used; - a.digit = src.digit; - - /* - If "n" is larger than INT_MAX it is also larger than - log_2(src) because the bit-length of the "src" is measured - with an int and hence the root is always < 2 (two). - */ - if n > max(int) / 2 { - err = set(dest, 1); - dest.sign = a.sign; - return err; - } - - /* Compute seed: 2^(log_2(src)/n + 2) */ - ilog2: int; - ilog2, err = count_bits(src); - - /* "src" is smaller than max(int), we can cast safely. */ - if ilog2 < n { - err = set(dest, 1); - dest.sign = a.sign; - return err; - } - - ilog2 /= n; - if ilog2 == 0 { - err = set(dest, 1); - dest.sign = a.sign; - return err; - } - - /* Start value must be larger than root. */ - ilog2 += 2; - if err = power_of_two(t2, ilog2); err != nil { return err; } - - c: int; - iterations := 0; - for { - /* t1 = t2 */ - if err = copy(t1, t2); err != nil { return err; } - - /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */ - - /* t3 = t1**(b-1) */ - if err = pow(t3, t1, n-1); err != nil { return err; } - - /* numerator */ - /* t2 = t1**b */ - if err = mul(t2, t1, t3); err != nil { return err; } - - /* t2 = t1**b - a */ - if err = sub(t2, t2, a); err != nil { return err; } - - /* denominator */ - /* t3 = t1**(b-1) * b */ - if err = mul(t3, t3, DIGIT(n)); err != nil { return err; } - - /* t3 = (t1**b - a)/(b * t1**(b-1)) */ - if err = div(t3, t2, t3); err != nil { return err; } - if err = sub(t2, t1, t3); err != nil { return err; } - - /* - Number of rounds is at most log_2(root). If it is more it - got stuck, so break out of the loop and do the rest manually. - */ - if ilog2 -= 1; ilog2 == 0 { - break; - } - if c, err = cmp(t1, t2); c == 0 { break; } - iterations += 1; - if iterations == MAX_ITERATIONS_ROOT_N { - return .Max_Iterations_Reached; - } - } - - /* Result can be off by a few so check. */ - /* Loop beneath can overshoot by one if found root is smaller than actual root. */ - - iterations = 0; - for { - if err = pow(t2, t1, n); err != nil { return err; } - - c, err = cmp(t2, a); - if c == 0 { - swap(dest, t1); - return nil; - } else if c == -1 { - if err = add(t1, t1, DIGIT(1)); err != nil { return err; } - } else { - break; - } - - iterations += 1; - if iterations == MAX_ITERATIONS_ROOT_N { - return .Max_Iterations_Reached; - } - } - - iterations = 0; - /* Correct overshoot from above or from recurrence. */ - for { - if err = pow(t2, t1, n); err != nil { return err; } - - c, err = cmp(t2, a); - if c == 1 { - if err = sub(t1, t1, DIGIT(1)); err != nil { return err; } - } else { - break; - } - - iterations += 1; - if iterations == MAX_ITERATIONS_ROOT_N { - return .Max_Iterations_Reached; - } - } - - /* Set the result. */ - swap(dest, t1); - - /* set the sign of the result */ - dest.sign = src.sign; - - return err; -} -root_n :: proc { int_root_n, }; - -/* - Internal implementation of log. -*/ -_int_log :: proc(a: ^Int, base: DIGIT) -> (res: int, err: Error) { - bracket_low, bracket_high, bracket_mid, t, bi_base := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; - - ic, _ := cmp(a, base); - if ic == -1 || ic == 0 { - return 1 if ic == 0 else 0, nil; - } - - if err = set(bi_base, base); err != nil { return -1, err; } - if err = init_multi(bracket_mid, t); err != nil { return -1, err; } - if err = set(bracket_low, 1); err != nil { return -1, err; } - if err = set(bracket_high, base); err != nil { return -1, err; } - - low := 0; high := 1; - - /* - A kind of Giant-step/baby-step algorithm. - Idea shamelessly stolen from https://programmingpraxis.com/2010/05/07/integer-logarithms/2/ - The effect is asymptotic, hence needs benchmarks to test if the Giant-step should be skipped - for small n. - */ - - for { - /* - Iterate until `a` is bracketed between low + high. - */ - if bc, _ := cmp(bracket_high, a); bc != -1 { - break; - } - - low = high; - if err = copy(bracket_low, bracket_high); err != nil { - destroy(bracket_low, bracket_high, bracket_mid, t, bi_base); - return -1, err; - } - high <<= 1; - if err = sqr(bracket_high, bracket_high); err != nil { - destroy(bracket_low, bracket_high, bracket_mid, t, bi_base); - return -1, err; - } - } - - for (high - low) > 1 { - mid := (high + low) >> 1; - - if err = pow(t, bi_base, mid - low); err != nil { - destroy(bracket_low, bracket_high, bracket_mid, t, bi_base); - return -1, err; - } - - if err = mul(bracket_mid, bracket_low, t); err != nil { - destroy(bracket_low, bracket_high, bracket_mid, t, bi_base); - return -1, err; - } - mc, _ := cmp(a, bracket_mid); - if mc == -1 { - high = mid; - swap(bracket_mid, bracket_high); - } - if mc == 1 { - low = mid; - swap(bracket_mid, bracket_low); - } - if mc == 0 { - destroy(bracket_low, bracket_high, bracket_mid, t, bi_base); - return mid, nil; - } - } - - fc, _ := cmp(bracket_high, a); - res = high if fc == 0 else low; - - destroy(bracket_low, bracket_high, bracket_mid, t, bi_base); - return; -} \ No newline at end of file diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin index 93fa04a8d..43d496215 100644 --- a/core/math/big/internal.odin +++ b/core/math/big/internal.odin @@ -612,7 +612,7 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc /* Fast comba? */ // err = s_mp_sqr_comba(a, c); } else { - err = _private_int_sqr(dest, src); + err = #force_inline _private_int_sqr(dest, src); } } else { /* @@ -647,9 +647,9 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc * have less than MP_WARRAY digits and the number of * digits won't affect carry propagation */ - err = _private_int_mul_comba(dest, src, multiplier, digits); + err = #force_inline _private_int_mul_comba(dest, src, multiplier, digits); } else { - err = _private_int_mul(dest, src, multiplier, digits); + err = #force_inline _private_int_mul(dest, src, multiplier, digits); } } neg := src.sign != multiplier.sign; @@ -659,6 +659,14 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc internal_mul :: proc { internal_int_mul, internal_int_mul_digit, }; +internal_sqr :: proc (dest, src: ^Int) -> (res: Error) { + /* + We call `internal_mul` and not e.g. `_private_int_sqr` because the former + will dispatch to the optimal implementation depending on the source. + */ + return #force_inline internal_mul(dest, src, src); +} + /* divmod. Both the quotient and remainder are optional and may be passed a nil. @@ -838,7 +846,7 @@ internal_mulmod :: proc { internal_int_mulmod, }; remainder = (number * number) % modulus. */ internal_int_sqrmod :: proc(remainder, number, modulus: ^Int) -> (err: Error) { - if err = #force_inline internal_mul(remainder, number, number); err != nil { return err; } + if err = #force_inline internal_sqr(remainder, number); err != nil { return err; } return #force_inline internal_mod(remainder, remainder, modulus); } internal_sqrmod :: proc { internal_int_sqrmod, }; @@ -1069,10 +1077,10 @@ internal_int_compare_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (comparis case a.digit[0] < b: return -1; case a.digit[0] == b: return 0; case a.digit[0] > b: return +1; - /* - Unreachable. - Just here because Odin complains about a missing return value at the bottom of the proc otherwise. - */ + /* + Unreachable. + Just here because Odin complains about a missing return value at the bottom of the proc otherwise. + */ case: return; } } @@ -1111,6 +1119,490 @@ internal_compare_magnitude :: proc { internal_int_compare_magnitude, }; internal_cmp_mag :: internal_compare_magnitude; +/* + ========================= Logs, powers and roots ============================ +*/ + +/* + Returns log_base(a). + Assumes `a` to not be `nil` and have been iniialized. +*/ +internal_int_log :: proc(a: ^Int, base: DIGIT) -> (res: int, err: Error) { + if base < 2 || DIGIT(base) > _DIGIT_MAX { return -1, .Invalid_Argument; } + + if internal_is_negative(a) { return -1, .Math_Domain_Error; } + if internal_is_zero(a) { return -1, .Math_Domain_Error; } + + /* + Fast path for bases that are a power of two. + */ + if platform_int_is_power_of_two(int(base)) { return private_log_power_of_two(a, base); } + + /* + Fast path for `Int`s that fit within a single `DIGIT`. + */ + if a.used == 1 { return internal_log(a.digit[0], DIGIT(base)); } + + return private_int_log(a, base); + +} + +/* + Returns log_base(a), where `a` is a DIGIT. +*/ +internal_digit_log :: proc(a: DIGIT, base: DIGIT) -> (log: int, err: Error) { + /* + If the number is smaller than the base, it fits within a fraction. + Therefore, we return 0. + */ + if a < base { return 0, nil; } + + /* + If a number equals the base, the log is 1. + */ + if a == base { return 1, nil; } + + N := _WORD(a); + bracket_low := _WORD(1); + bracket_high := _WORD(base); + high := 1; + low := 0; + + for bracket_high < N { + low = high; + bracket_low = bracket_high; + high <<= 1; + bracket_high *= bracket_high; + } + + for high - low > 1 { + mid := (low + high) >> 1; + bracket_mid := bracket_low * #force_inline internal_small_pow(_WORD(base), _WORD(mid - low)); + + if N < bracket_mid { + high = mid; + bracket_high = bracket_mid; + } + if N > bracket_mid { + low = mid; + bracket_low = bracket_mid; + } + if N == bracket_mid { + return mid, nil; + } + } + + if bracket_high == N { + return high, nil; + } else { + return low, nil; + } +} +internal_log :: proc { internal_int_log, internal_digit_log, }; + +/* + Calculate dest = base^power using a square-multiply algorithm. + Assumes `dest` and `base` not to be `nil` and to have been initialized. +*/ +internal_int_pow :: proc(dest, base: ^Int, power: int) -> (err: Error) { + power := power; + /* + Early outs. + */ + if #force_inline internal_is_zero(base) { + /* + A zero base is a special case. + */ + if power < 0 { + if err = zero(dest); err != nil { return err; } + return .Math_Domain_Error; + } + if power == 0 { return set(dest, 1); } + if power > 0 { return zero(dest); } + + } + if power < 0 { + /* + Fraction, so we'll return zero. + */ + return zero(dest); + } + switch(power) { + case 0: + /* + Any base to the power zero is one. + */ + return one(dest); + case 1: + /* + Any base to the power one is itself. + */ + return copy(dest, base); + case 2: + return #force_inline internal_sqr(dest, base); + } + + g := &Int{}; + if err = copy(g, base); err != nil { return err; } + + /* + Set initial result. + */ + if err = set(dest, 1); err != nil { return err; } + + loop: for power > 0 { + /* + If the bit is set, multiply. + */ + if power & 1 != 0 { + if err = mul(dest, g, dest); err != nil { + break loop; + } + } + /* + Square. + */ + if power > 1 { + if err = #force_inline internal_sqr(g, g); err != nil { + break loop; + } + } + + /* shift to next bit */ + power >>= 1; + } + + destroy(g); + return err; +} + +/* + Calculate `dest = base^power`. + Assumes `dest` not to be `nil` and to have been initialized. +*/ +internal_int_pow_int :: proc(dest: ^Int, base, power: int) -> (err: Error) { + base_t := &Int{}; + defer destroy(base_t); + + if err = set(base_t, base); err != nil { return err; } + + return #force_inline internal_int_pow(dest, base_t, power); +} + +internal_pow :: proc { internal_int_pow, internal_int_pow_int, }; +internal_exp :: pow; + +/* + Returns the log2 of an `Int`. + Assumes `a` not to be `nil` and to have been initialized. + Also assumes `base` is a power of two. +*/ +private_log_power_of_two :: proc(a: ^Int, base: DIGIT) -> (log: int, err: Error) { + base := base; + y: int; + for y = 0; base & 1 == 0; { + y += 1; + base >>= 1; + } + log, err = count_bits(a); + return (log - 1) / y, err; +} + +/* + +*/ +internal_small_pow :: proc(base: _WORD, exponent: _WORD) -> (result: _WORD) { + exponent := exponent; base := base; + result = _WORD(1); + + for exponent != 0 { + if exponent & 1 == 1 { + result *= base; + } + exponent >>= 1; + base *= base; + } + return result; +} + +/* + This function is less generic than `root_n`, simpler and faster. + Assumes `dest` and `src` not to be `nil` and to have been initialized. +*/ +internal_int_sqrt :: proc(dest, src: ^Int) -> (err: Error) { + /* + Must be positive. + */ + if #force_inline internal_is_negative(src) { return .Invalid_Argument; } + + /* + Easy out. If src is zero, so is dest. + */ + if #force_inline internal_is_zero(src) { return zero(dest); } + + /* + Set up temporaries. + */ + x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}; + defer destroy(x, y, t1, t2); + + count: int; + if count, err = count_bits(src); err != nil { return err; } + + a, b := count >> 1, count & 1; + if err = power_of_two(x, a+b); err != nil { return err; } + + for { + /* + y = (x + n // x) // 2 + */ + internal_div(t1, src, x); + internal_add(t2, t1, x); + shr(y, t2, 1); + + if c := internal_cmp(y, x); c == 0 || c == 1 { + swap(dest, x); + return nil; + } + swap(x, y); + } + + swap(dest, x); + return err; +} +internal_sqrt :: proc { internal_int_sqrt, }; + + +/* + Find the nth root of an Integer. + Result found such that `(dest)**n <= src` and `(dest+1)**n > src` + + This algorithm uses Newton's approximation `x[i+1] = x[i] - f(x[i])/f'(x[i])`, + which will find the root in `log(n)` time where each step involves a fair bit. + + Assumes `dest` and `src` not to be `nil` and have been initialized. +*/ +internal_int_root_n :: proc(dest, src: ^Int, n: int) -> (err: Error) { + /* + Fast path for n == 2 + */ + if n == 2 { return #force_inline internal_sqrt(dest, src); } + + if n < 0 || n > int(_DIGIT_MAX) { return .Invalid_Argument; } + + if n & 1 == 0 && #force_inline internal_is_negative(src) { return .Invalid_Argument; } + + /* + Set up temporaries. + */ + t1, t2, t3, a := &Int{}, &Int{}, &Int{}, &Int{}; + defer destroy(t1, t2, t3); + + /* + If `src` is negative fudge the sign but keep track. + */ + a.sign = .Zero_or_Positive; + a.used = src.used; + a.digit = src.digit; + + /* + If "n" is larger than INT_MAX it is also larger than + log_2(src) because the bit-length of the "src" is measured + with an int and hence the root is always < 2 (two). + */ + if n > max(int) / 2 { + err = set(dest, 1); + dest.sign = a.sign; + return err; + } + + /* + Compute seed: 2^(log_2(src)/n + 2) + */ + ilog2: int; + ilog2, err = count_bits(src); + + /* + "src" is smaller than max(int), we can cast safely. + */ + if ilog2 < n { + err = set(dest, 1); + dest.sign = a.sign; + return err; + } + + ilog2 /= n; + if ilog2 == 0 { + err = set(dest, 1); + dest.sign = a.sign; + return err; + } + + /* + Start value must be larger than root. + */ + ilog2 += 2; + if err = power_of_two(t2, ilog2); err != nil { return err; } + + c: int; + iterations := 0; + for { + /* t1 = t2 */ + if err = copy(t1, t2); err != nil { return err; } + + /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */ + + /* t3 = t1**(b-1) */ + if err = internal_pow(t3, t1, n-1); err != nil { return err; } + + /* numerator */ + /* t2 = t1**b */ + if err = internal_mul(t2, t1, t3); err != nil { return err; } + + /* t2 = t1**b - a */ + if err = internal_sub(t2, t2, a); err != nil { return err; } + + /* denominator */ + /* t3 = t1**(b-1) * b */ + if err = internal_mul(t3, t3, DIGIT(n)); err != nil { return err; } + + /* t3 = (t1**b - a)/(b * t1**(b-1)) */ + if err = internal_div(t3, t2, t3); err != nil { return err; } + if err = internal_sub(t2, t1, t3); err != nil { return err; } + + /* + Number of rounds is at most log_2(root). If it is more it + got stuck, so break out of the loop and do the rest manually. + */ + if ilog2 -= 1; ilog2 == 0 { + break; + } + if c := internal_cmp(t1, t2); c == 0 { break; } + iterations += 1; + if iterations == MAX_ITERATIONS_ROOT_N { + return .Max_Iterations_Reached; + } + } + + /* Result can be off by a few so check. */ + /* Loop beneath can overshoot by one if found root is smaller than actual root. */ + + iterations = 0; + for { + if err = internal_pow(t2, t1, n); err != nil { return err; } + + c := internal_cmp(t2, a); + if c == 0 { + swap(dest, t1); + return nil; + } else if c == -1 { + if err = internal_add(t1, t1, DIGIT(1)); err != nil { return err; } + } else { + break; + } + + iterations += 1; + if iterations == MAX_ITERATIONS_ROOT_N { + return .Max_Iterations_Reached; + } + } + + iterations = 0; + /* Correct overshoot from above or from recurrence. */ + for { + if err = internal_pow(t2, t1, n); err != nil { return err; } + + c := internal_cmp(t2, a); + if c == 1 { + if err = internal_sub(t1, t1, DIGIT(1)); err != nil { return err; } + } else { + break; + } + + iterations += 1; + if iterations == MAX_ITERATIONS_ROOT_N { + return .Max_Iterations_Reached; + } + } + + /* Set the result. */ + swap(dest, t1); + + /* set the sign of the result */ + dest.sign = src.sign; + + return err; +} +internal_root_n :: proc { internal_int_root_n, }; + +/* + Internal implementation of log. + Assumes `a` not to be `nil` and to have been initialized. +*/ +private_int_log :: proc(a: ^Int, base: DIGIT) -> (res: int, err: Error) { + bracket_low, bracket_high, bracket_mid, t, bi_base := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; + defer destroy(bracket_low, bracket_high, bracket_mid, t, bi_base); + + ic := #force_inline internal_cmp(a, base); + if ic == -1 || ic == 0 { + return 1 if ic == 0 else 0, nil; + } + + if err = set(bi_base, base); err != nil { return -1, err; } + if err = init_multi(bracket_mid, t); err != nil { return -1, err; } + if err = set(bracket_low, 1); err != nil { return -1, err; } + if err = set(bracket_high, base); err != nil { return -1, err; } + + low := 0; high := 1; + + /* + A kind of Giant-step/baby-step algorithm. + Idea shamelessly stolen from https://programmingpraxis.com/2010/05/07/integer-logarithms/2/ + The effect is asymptotic, hence needs benchmarks to test if the Giant-step should be skipped + for small n. + */ + + for { + /* + Iterate until `a` is bracketed between low + high. + */ + if bc := #force_inline internal_cmp(bracket_high, a); bc != -1 { break; } + + low = high; + if err = copy(bracket_low, bracket_high); err != nil { return -1, err; } + high <<= 1; + if err = #force_inline internal_sqr(bracket_high, bracket_high); err != nil { return -1, err; } + } + + for (high - low) > 1 { + mid := (high + low) >> 1; + + if err = #force_inline internal_pow(t, bi_base, mid - low); err != nil { return -1, err; } + + if err = #force_inline internal_mul(bracket_mid, bracket_low, t); err != nil { return -1, err; } + + mc := #force_inline internal_cmp(a, bracket_mid); + if mc == -1 { + high = mid; + swap(bracket_mid, bracket_high); + } + if mc == 1 { + low = mid; + swap(bracket_mid, bracket_low); + } + if mc == 0 { return mid, nil; } + } + + fc := #force_inline internal_cmp(bracket_high, a); + res = high if fc == 0 else low; + + return; +} + +/* + Other internal helpers +*/ + internal_int_zero_unused :: #force_inline proc(dest: ^Int, old_used := -1) { /* If we don't pass the number of previously used DIGITs, we zero all remaining ones. @@ -1129,864 +1621,8 @@ internal_int_zero_unused :: #force_inline proc(dest: ^Int, old_used := -1) { mem.zero_slice(dest.digit[dest.used:][:zero_count]); } } - internal_zero_unused :: proc { internal_int_zero_unused, }; - /* ========================== End of low-level routines ========================== - - ============================= Private procedures ============================= - - Private procedures used by the above low-level routines follow. - - Don't call these yourself unless you really know what you're doing. - They include implementations that are optimimal for certain ranges of input only. - - These aren't exported for the same reasons. -*/ - - -/* - Multiplies |a| * |b| and only computes upto digs digits of result. - HAC pp. 595, Algorithm 14.12 Modified so you can control how - many digits of output are created. -*/ -_private_int_mul :: proc(dest, a, b: ^Int, digits: int) -> (err: Error) { - /* - Can we use the fast multiplier? - */ - if digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA { - return _private_int_mul_comba(dest, a, b, digits); - } - - /* - Set up temporary output `Int`, which we'll swap for `dest` when done. - */ - - t := &Int{}; - - if err = grow(t, max(digits, _DEFAULT_DIGIT_COUNT)); err != nil { return err; } - t.used = digits; - - /* - Compute the digits of the product directly. - */ - pa := a.used; - for ix := 0; ix < pa; ix += 1 { - /* - Limit ourselves to `digits` DIGITs of output. - */ - pb := min(b.used, digits - ix); - carry := _WORD(0); - iy := 0; - - /* - Compute the column of the output and propagate the carry. - */ - #no_bounds_check for iy = 0; iy < pb; iy += 1 { - /* - Compute the column as a _WORD. - */ - column := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + carry; - - /* - The new column is the lower part of the result. - */ - t.digit[ix + iy] = DIGIT(column & _WORD(_MASK)); - - /* - Get the carry word from the result. - */ - carry = column >> _DIGIT_BITS; - } - /* - Set carry if it is placed below digits - */ - if ix + iy < digits { - t.digit[ix + pb] = DIGIT(carry); - } - } - - swap(dest, t); - destroy(t); - return clamp(dest); -} - -/* - Fast (comba) multiplier - - This is the fast column-array [comba] multiplier. It is - designed to compute the columns of the product first - then handle the carries afterwards. This has the effect - of making the nested loops that compute the columns very - simple and schedulable on super-scalar processors. - - This has been modified to produce a variable number of - digits of output so if say only a half-product is required - you don't have to compute the upper half (a feature - required for fast Barrett reduction). - - Based on Algorithm 14.12 on pp.595 of HAC. -*/ -_private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int) -> (err: Error) { - /* - Set up array. - */ - W: [_WARRAY]DIGIT = ---; - - /* - Grow the destination as required. - */ - if err = grow(dest, digits); err != nil { return err; } - - /* - Number of output digits to produce. - */ - pa := min(digits, a.used + b.used); - - /* - Clear the carry - */ - _W := _WORD(0); - - ix: int; - for ix = 0; ix < pa; ix += 1 { - tx, ty, iy, iz: int; - - /* - Get offsets into the two bignums. - */ - ty = min(b.used - 1, ix); - tx = ix - ty; - - /* - This is the number of times the loop will iterate, essentially. - while (tx++ < a->used && ty-- >= 0) { ... } - */ - - iy = min(a.used - tx, ty + 1); - - /* - Execute loop. - */ - #no_bounds_check for iz = 0; iz < iy; iz += 1 { - _W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz]); - } - - /* - Store term. - */ - W[ix] = DIGIT(_W) & _MASK; - - /* - Make next carry. - */ - _W = _W >> _WORD(_DIGIT_BITS); - } - - /* - Setup dest. - */ - old_used := dest.used; - dest.used = pa; - - /* - Now extract the previous digit [below the carry]. - */ - copy_slice(dest.digit[0:], W[:pa]); - - /* - Clear unused digits [that existed in the old copy of dest]. - */ - zero_unused(dest, old_used); - - /* - Adjust dest.used based on leading zeroes. - */ - - return clamp(dest); -} - -/* - Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 -*/ -_private_int_sqr :: proc(dest, src: ^Int) -> (err: Error) { - pa := src.used; - - t := &Int{}; ix, iy: int; - /* - Grow `t` to maximum needed size, or `_DEFAULT_DIGIT_COUNT`, whichever is bigger. - */ - if err = grow(t, max((2 * pa) + 1, _DEFAULT_DIGIT_COUNT)); err != nil { return err; } - t.used = (2 * pa) + 1; - - #no_bounds_check for ix = 0; ix < pa; ix += 1 { - carry := DIGIT(0); - /* - First calculate the digit at 2*ix; calculate double precision result. - */ - r := _WORD(t.digit[ix+ix]) + (_WORD(src.digit[ix]) * _WORD(src.digit[ix])); - - /* - Store lower part in result. - */ - t.digit[ix+ix] = DIGIT(r & _WORD(_MASK)); - /* - Get the carry. - */ - carry = DIGIT(r >> _DIGIT_BITS); - - #no_bounds_check for iy = ix + 1; iy < pa; iy += 1 { - /* - First calculate the product. - */ - r = _WORD(src.digit[ix]) * _WORD(src.digit[iy]); - - /* Now calculate the double precision result. Nóte we use - * addition instead of *2 since it's easier to optimize - */ - r = _WORD(t.digit[ix+iy]) + r + r + _WORD(carry); - - /* - Store lower part. - */ - t.digit[ix+iy] = DIGIT(r & _WORD(_MASK)); - - /* - Get carry. - */ - carry = DIGIT(r >> _DIGIT_BITS); - } - /* - Propagate upwards. - */ - #no_bounds_check for carry != 0 { - r = _WORD(t.digit[ix+iy]) + _WORD(carry); - t.digit[ix+iy] = DIGIT(r & _WORD(_MASK)); - carry = DIGIT(r >> _WORD(_DIGIT_BITS)); - iy += 1; - } - } - - err = clamp(t); - swap(dest, t); - destroy(t); - return err; -} - -/* - Divide by three (based on routine from MPI and the GMP manual). -*/ -_private_int_div_3 :: proc(quotient, numerator: ^Int) -> (remainder: DIGIT, err: Error) { - /* - b = 2^_DIGIT_BITS / 3 - */ - b := _WORD(1) << _WORD(_DIGIT_BITS) / _WORD(3); - - q := &Int{}; - if err = grow(q, numerator.used); err != nil { return 0, err; } - q.used = numerator.used; - q.sign = numerator.sign; - - w, t: _WORD; - #no_bounds_check for ix := numerator.used; ix >= 0; ix -= 1 { - w = (w << _WORD(_DIGIT_BITS)) | _WORD(numerator.digit[ix]); - if w >= 3 { - /* - Multiply w by [1/3]. - */ - t = (w * b) >> _WORD(_DIGIT_BITS); - - /* - Now subtract 3 * [w/3] from w, to get the remainder. - */ - w -= t+t+t; - - /* - Fixup the remainder as required since the optimization is not exact. - */ - for w >= 3 { - t += 1; - w -= 3; - } - } else { - t = 0; - } - q.digit[ix] = DIGIT(t); - } - remainder = DIGIT(w); - - /* - [optional] store the quotient. - */ - if quotient != nil { - err = clamp(q); - swap(q, quotient); - } - destroy(q); - return remainder, nil; -} - -/* - Signed Integer Division - - c*b + d == a [i.e. a/b, c=quotient, d=remainder], HAC pp.598 Algorithm 14.20 - - Note that the description in HAC is horribly incomplete. - For example, it doesn't consider the case where digits are removed from 'x' in - the inner loop. - - It also doesn't consider the case that y has fewer than three digits, etc. - The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases. -*/ -_private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) { - // if err = error_if_immutable(quotient, remainder); err != nil { return err; } - // if err = clear_if_uninitialized(quotient, numerator, denominator); err != nil { return err; } - - q, x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; - defer destroy(q, x, y, t1, t2); - - if err = grow(q, numerator.used + 2); err != nil { return err; } - q.used = numerator.used + 2; - - if err = init_multi(t1, t2); err != nil { return err; } - if err = copy(x, numerator); err != nil { return err; } - if err = copy(y, denominator); err != nil { return err; } - - /* - Fix the sign. - */ - neg := numerator.sign != denominator.sign; - x.sign = .Zero_or_Positive; - y.sign = .Zero_or_Positive; - - /* - Normalize both x and y, ensure that y >= b/2, [b == 2**MP_DIGIT_BIT] - */ - norm, _ := count_bits(y); - norm %= _DIGIT_BITS; - - if norm < _DIGIT_BITS - 1 { - norm = (_DIGIT_BITS - 1) - norm; - if err = shl(x, x, norm); err != nil { return err; } - if err = shl(y, y, norm); err != nil { return err; } - } else { - norm = 0; - } - - /* - Note: HAC does 0 based, so if used==5 then it's 0,1,2,3,4, i.e. use 4 - */ - n := x.used - 1; - t := y.used - 1; - - /* - while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } - y = y*b**{n-t} - */ - - if err = shl_digit(y, n - t); err != nil { return err; } - - c, _ := cmp(x, y); - for c != -1 { - q.digit[n - t] += 1; - if err = sub(x, x, y); err != nil { return err; } - c, _ = cmp(x, y); - } - - /* - Reset y by shifting it back down. - */ - shr_digit(y, n - t); - - /* - Step 3. for i from n down to (t + 1). - */ - #no_bounds_check for i := n; i >= (t + 1); i -= 1 { - if (i > x.used) { continue; } - - /* - step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt - */ - if x.digit[i] == y.digit[t] { - q.digit[(i - t) - 1] = 1 << (_DIGIT_BITS - 1); - } else { - - tmp := _WORD(x.digit[i]) << _DIGIT_BITS; - tmp |= _WORD(x.digit[i - 1]); - tmp /= _WORD(y.digit[t]); - if tmp > _WORD(_MASK) { - tmp = _WORD(_MASK); - } - q.digit[(i - t) - 1] = DIGIT(tmp & _WORD(_MASK)); - } - - /* while (q{i-t-1} * (yt * b + y{t-1})) > - xi * b**2 + xi-1 * b + xi-2 - - do q{i-t-1} -= 1; - */ - - iter := 0; - - q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] + 1) & _MASK; - #no_bounds_check for { - q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK; - - /* - Find left hand. - */ - zero(t1); - t1.digit[0] = ((t - 1) < 0) ? 0 : y.digit[t - 1]; - t1.digit[1] = y.digit[t]; - t1.used = 2; - if err = mul(t1, t1, q.digit[(i - t) - 1]); err != nil { return err; } - - /* - Find right hand. - */ - t2.digit[0] = ((i - 2) < 0) ? 0 : x.digit[i - 2]; - t2.digit[1] = x.digit[i - 1]; /* i >= 1 always holds */ - t2.digit[2] = x.digit[i]; - t2.used = 3; - - if t1_t2, _ := cmp_mag(t1, t2); t1_t2 != 1 { - break; - } - iter += 1; if iter > 100 { return .Max_Iterations_Reached; } - } - - /* - Step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} - */ - if err = int_mul_digit(t1, y, q.digit[(i - t) - 1]); err != nil { return err; } - if err = shl_digit(t1, (i - t) - 1); err != nil { return err; } - if err = sub(x, x, t1); err != nil { return err; } - - /* - if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } - */ - if x.sign == .Negative { - if err = copy(t1, y); err != nil { return err; } - if err = shl_digit(t1, (i - t) - 1); err != nil { return err; } - if err = add(x, x, t1); err != nil { return err; } - - q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK; - } - } - - /* - Now q is the quotient and x is the remainder, [which we have to normalize] - Get sign before writing to c. - */ - z, _ := is_zero(x); - x.sign = .Zero_or_Positive if z else numerator.sign; - - if quotient != nil { - clamp(q); - swap(q, quotient); - quotient.sign = .Negative if neg else .Zero_or_Positive; - } - - if remainder != nil { - if err = shr(x, x, norm); err != nil { return err; } - swap(x, remainder); - } - - return nil; -} - -/* - Slower bit-bang division... also smaller. -*/ -@(deprecated="Use `_int_div_school`, it's 3.5x faster.") -_private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) { - - ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}; - c: int; - - goto_end: for { - if err = one(tq); err != nil { break goto_end; } - - num_bits, _ := count_bits(numerator); - den_bits, _ := count_bits(denominator); - n := num_bits - den_bits; - - if err = abs(ta, numerator); err != nil { break goto_end; } - if err = abs(tb, denominator); err != nil { break goto_end; } - if err = shl(tb, tb, n); err != nil { break goto_end; } - if err = shl(tq, tq, n); err != nil { break goto_end; } - - for n >= 0 { - if c, _ = cmp_mag(ta, tb); c == 0 || c == 1 { - // ta -= tb - if err = sub(ta, ta, tb); err != nil { break goto_end; } - // q += tq - if err = add( q, q, tq); err != nil { break goto_end; } - } - if err = shr1(tb, tb); err != nil { break goto_end; } - if err = shr1(tq, tq); err != nil { break goto_end; } - - n -= 1; - } - - /* - Now q == quotient and ta == remainder. - */ - neg := numerator.sign != denominator.sign; - if quotient != nil { - swap(quotient, q); - z, _ := is_zero(quotient); - quotient.sign = .Negative if neg && !z else .Zero_or_Positive; - } - if remainder != nil { - swap(remainder, ta); - z, _ := is_zero(numerator); - remainder.sign = .Zero_or_Positive if z else numerator.sign; - } - - break goto_end; - } - destroy(ta, tb, tq, q); - return err; -} - - - -/* - Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html -*/ -_private_int_factorial_binary_split :: proc(res: ^Int, n: int) -> (err: Error) { - - inner, outer, start, stop, temp := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; - defer destroy(inner, outer, start, stop, temp); - - if err = set(inner, 1); err != nil { return err; } - if err = set(outer, 1); err != nil { return err; } - - bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n)); - - for i := bits_used; i >= 0; i -= 1 { - start := (n >> (uint(i) + 1)) + 1 | 1; - stop := (n >> uint(i)) + 1 | 1; - if err = _private_int_recursive_product(temp, start, stop); err != nil { return err; } - if err = internal_mul(inner, inner, temp); err != nil { return err; } - if err = internal_mul(outer, outer, inner); err != nil { return err; } - } - shift := n - intrinsics.count_ones(n); - - return shl(res, outer, int(shift)); -} - -/* - Recursive product used by binary split factorial algorithm. -*/ -_private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0)) -> (err: Error) { - t1, t2 := &Int{}, &Int{}; - defer destroy(t1, t2); - - if level > FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS { return .Max_Iterations_Reached; } - - num_factors := (stop - start) >> 1; - if num_factors == 2 { - if err = set(t1, start); err != nil { return err; } - when true { - if err = grow(t2, t1.used + 1); err != nil { return err; } - if err = internal_add(t2, t1, 2); err != nil { return err; } - } else { - if err = add(t2, t1, 2); err != nil { return err; } - } - return internal_mul(res, t1, t2); - } - - if num_factors > 1 { - mid := (start + num_factors) | 1; - if err = _private_int_recursive_product(t1, start, mid, level + 1); err != nil { return err; } - if err = _private_int_recursive_product(t2, mid, stop, level + 1); err != nil { return err; } - return internal_mul(res, t1, t2); - } - - if num_factors == 1 { return #force_inline set(res, start); } - - return #force_inline set(res, 1); -} - -/* - Internal function computing both GCD using the binary method, - and, if target isn't `nil`, also LCM. - - Expects the `a` and `b` to have been initialized - and one or both of `res_gcd` or `res_lcm` not to be `nil`. - - If both `a` and `b` are zero, return zero. - If either `a` or `b`, return the other one. - - The `gcd` and `lcm` wrappers have already done this test, - but `gcd_lcm` wouldn't have, so we still need to perform it. - - If neither result is wanted, we have nothing to do. -*/ -_private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int) -> (err: Error) { - if res_gcd == nil && res_lcm == nil { return nil; } - - /* - We need a temporary because `res_gcd` is allowed to be `nil`. - */ - if a.used == 0 && b.used == 0 { - /* - GCD(0, 0) and LCM(0, 0) are both 0. - */ - if res_gcd != nil { - if err = zero(res_gcd); err != nil { return err; } - } - if res_lcm != nil { - if err = zero(res_lcm); err != nil { return err; } - } - return nil; - } else if a.used == 0 { - /* - We can early out with GCD = B and LCM = 0 - */ - if res_gcd != nil { - if err = abs(res_gcd, b); err != nil { return err; } - } - if res_lcm != nil { - if err = zero(res_lcm); err != nil { return err; } - } - return nil; - } else if b.used == 0 { - /* - We can early out with GCD = A and LCM = 0 - */ - if res_gcd != nil { - if err = abs(res_gcd, a); err != nil { return err; } - } - if res_lcm != nil { - if err = zero(res_lcm); err != nil { return err; } - } - return nil; - } - - temp_gcd_res := &Int{}; - defer destroy(temp_gcd_res); - - /* - If neither `a` or `b` was zero, we need to compute `gcd`. - Get copies of `a` and `b` we can modify. - */ - u, v := &Int{}, &Int{}; - defer destroy(u, v); - if err = copy(u, a); err != nil { return err; } - if err = copy(v, b); err != nil { return err; } - - /* - Must be positive for the remainder of the algorithm. - */ - u.sign = .Zero_or_Positive; v.sign = .Zero_or_Positive; - - /* - B1. Find the common power of two for `u` and `v`. - */ - u_lsb, _ := count_lsb(u); - v_lsb, _ := count_lsb(v); - k := min(u_lsb, v_lsb); - - if k > 0 { - /* - Divide the power of two out. - */ - if err = shr(u, u, k); err != nil { return err; } - if err = shr(v, v, k); err != nil { return err; } - } - - /* - Divide any remaining factors of two out. - */ - if u_lsb != k { - if err = shr(u, u, u_lsb - k); err != nil { return err; } - } - if v_lsb != k { - if err = shr(v, v, v_lsb - k); err != nil { return err; } - } - - for v.used != 0 { - /* - Make sure `v` is the largest. - */ - if c, _ := cmp_mag(u, v); c == 1 { - /* - Swap `u` and `v` to make sure `v` is >= `u`. - */ - swap(u, v); - } - - /* - Subtract smallest from largest. - */ - if err = internal_sub(v, v, u); err != nil { return err; } - - /* - Divide out all factors of two. - */ - b, _ := count_lsb(v); - if err = shr(v, v, b); err != nil { return err; } - } - - /* - Multiply by 2**k which we divided out at the beginning. - */ - if err = shl(temp_gcd_res, u, k); err != nil { return err; } - temp_gcd_res.sign = .Zero_or_Positive; - - /* - We've computed `gcd`, either the long way, or because one of the inputs was zero. - If we don't want `lcm`, we're done. - */ - if res_lcm == nil { - swap(temp_gcd_res, res_gcd); - return nil; - } - - /* - Computes least common multiple as `|a*b|/gcd(a,b)` - Divide the smallest by the GCD. - */ - if c, _ := cmp_mag(a, b); c == -1 { - /* - Store quotient in `t2` such that `t2 * b` is the LCM. - */ - if err = internal_div(res_lcm, a, temp_gcd_res); err != nil { return err; } - err = internal_mul(res_lcm, res_lcm, b); - } else { - /* - Store quotient in `t2` such that `t2 * a` is the LCM. - */ - if err = internal_div(res_lcm, a, temp_gcd_res); err != nil { return err; } - err = internal_mul(res_lcm, res_lcm, b); - } - - if res_gcd != nil { - swap(temp_gcd_res, res_gcd); - } - - /* - Fix the sign to positive and return. - */ - res_lcm.sign = .Zero_or_Positive; - return err; -} - -/* - ======================== End of private procedures ======================= - - =============================== Private tables =============================== - - Tables used by `internal_*` and `_*`. -*/ - -_private_prime_table := []DIGIT{ - 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013, - 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035, - 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059, - 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083, - 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD, - 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF, - 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107, - 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137, - - 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167, - 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199, - 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9, - 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7, - 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239, - 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265, - 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293, - 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF, - - 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301, - 0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B, - 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371, - 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD, - 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5, - 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419, - 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449, - 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B, - - 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7, - 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503, - 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529, - 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F, - 0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3, - 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7, - 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623, - 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653, -}; - -when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) { - _factorial_table := [35]_WORD{ -/* f(00): */ 1, -/* f(01): */ 1, -/* f(02): */ 2, -/* f(03): */ 6, -/* f(04): */ 24, -/* f(05): */ 120, -/* f(06): */ 720, -/* f(07): */ 5_040, -/* f(08): */ 40_320, -/* f(09): */ 362_880, -/* f(10): */ 3_628_800, -/* f(11): */ 39_916_800, -/* f(12): */ 479_001_600, -/* f(13): */ 6_227_020_800, -/* f(14): */ 87_178_291_200, -/* f(15): */ 1_307_674_368_000, -/* f(16): */ 20_922_789_888_000, -/* f(17): */ 355_687_428_096_000, -/* f(18): */ 6_402_373_705_728_000, -/* f(19): */ 121_645_100_408_832_000, -/* f(20): */ 2_432_902_008_176_640_000, -/* f(21): */ 51_090_942_171_709_440_000, -/* f(22): */ 1_124_000_727_777_607_680_000, -/* f(23): */ 25_852_016_738_884_976_640_000, -/* f(24): */ 620_448_401_733_239_439_360_000, -/* f(25): */ 15_511_210_043_330_985_984_000_000, -/* f(26): */ 403_291_461_126_605_635_584_000_000, -/* f(27): */ 10_888_869_450_418_352_160_768_000_000, -/* f(28): */ 304_888_344_611_713_860_501_504_000_000, -/* f(29): */ 8_841_761_993_739_701_954_543_616_000_000, -/* f(30): */ 265_252_859_812_191_058_636_308_480_000_000, -/* f(31): */ 8_222_838_654_177_922_817_725_562_880_000_000, -/* f(32): */ 263_130_836_933_693_530_167_218_012_160_000_000, -/* f(33): */ 8_683_317_618_811_886_495_518_194_401_280_000_000, -/* f(34): */ 295_232_799_039_604_140_847_618_609_643_520_000_000, - }; -} else { - _factorial_table := [21]_WORD{ -/* f(00): */ 1, -/* f(01): */ 1, -/* f(02): */ 2, -/* f(03): */ 6, -/* f(04): */ 24, -/* f(05): */ 120, -/* f(06): */ 720, -/* f(07): */ 5_040, -/* f(08): */ 40_320, -/* f(09): */ 362_880, -/* f(10): */ 3_628_800, -/* f(11): */ 39_916_800, -/* f(12): */ 479_001_600, -/* f(13): */ 6_227_020_800, -/* f(14): */ 87_178_291_200, -/* f(15): */ 1_307_674_368_000, -/* f(16): */ 20_922_789_888_000, -/* f(17): */ 355_687_428_096_000, -/* f(18): */ 6_402_373_705_728_000, -/* f(19): */ 121_645_100_408_832_000, -/* f(20): */ 2_432_902_008_176_640_000, - }; -}; - -/* - ========================= End of private tables ======================== */ \ No newline at end of file diff --git a/core/math/big/private.odin b/core/math/big/private.odin new file mode 100644 index 000000000..e9db6e3ef --- /dev/null +++ b/core/math/big/private.odin @@ -0,0 +1,866 @@ +package big + +/* + Copyright 2021 Jeroen van Rijn . + Made available under Odin's BSD-2 license. + + An arbitrary precision mathematics implementation in Odin. + For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3. + The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks. + + ============================= Private procedures ============================= + + Private procedures used by the above low-level routines follow. + + Don't call these yourself unless you really know what you're doing. + They include implementations that are optimimal for certain ranges of input only. + + These aren't exported for the same reasons. +*/ + +import "core:intrinsics" + +/* + Multiplies |a| * |b| and only computes upto digs digits of result. + HAC pp. 595, Algorithm 14.12 Modified so you can control how + many digits of output are created. +*/ +_private_int_mul :: proc(dest, a, b: ^Int, digits: int) -> (err: Error) { + /* + Can we use the fast multiplier? + */ + if digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA { + return #force_inline _private_int_mul_comba(dest, a, b, digits); + } + + /* + Set up temporary output `Int`, which we'll swap for `dest` when done. + */ + + t := &Int{}; + + if err = grow(t, max(digits, _DEFAULT_DIGIT_COUNT)); err != nil { return err; } + t.used = digits; + + /* + Compute the digits of the product directly. + */ + pa := a.used; + for ix := 0; ix < pa; ix += 1 { + /* + Limit ourselves to `digits` DIGITs of output. + */ + pb := min(b.used, digits - ix); + carry := _WORD(0); + iy := 0; + + /* + Compute the column of the output and propagate the carry. + */ + #no_bounds_check for iy = 0; iy < pb; iy += 1 { + /* + Compute the column as a _WORD. + */ + column := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + carry; + + /* + The new column is the lower part of the result. + */ + t.digit[ix + iy] = DIGIT(column & _WORD(_MASK)); + + /* + Get the carry word from the result. + */ + carry = column >> _DIGIT_BITS; + } + /* + Set carry if it is placed below digits + */ + if ix + iy < digits { + t.digit[ix + pb] = DIGIT(carry); + } + } + + swap(dest, t); + destroy(t); + return clamp(dest); +} + +/* + Fast (comba) multiplier + + This is the fast column-array [comba] multiplier. It is + designed to compute the columns of the product first + then handle the carries afterwards. This has the effect + of making the nested loops that compute the columns very + simple and schedulable on super-scalar processors. + + This has been modified to produce a variable number of + digits of output so if say only a half-product is required + you don't have to compute the upper half (a feature + required for fast Barrett reduction). + + Based on Algorithm 14.12 on pp.595 of HAC. +*/ +_private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int) -> (err: Error) { + /* + Set up array. + */ + W: [_WARRAY]DIGIT = ---; + + /* + Grow the destination as required. + */ + if err = grow(dest, digits); err != nil { return err; } + + /* + Number of output digits to produce. + */ + pa := min(digits, a.used + b.used); + + /* + Clear the carry + */ + _W := _WORD(0); + + ix: int; + for ix = 0; ix < pa; ix += 1 { + tx, ty, iy, iz: int; + + /* + Get offsets into the two bignums. + */ + ty = min(b.used - 1, ix); + tx = ix - ty; + + /* + This is the number of times the loop will iterate, essentially. + while (tx++ < a->used && ty-- >= 0) { ... } + */ + + iy = min(a.used - tx, ty + 1); + + /* + Execute loop. + */ + #no_bounds_check for iz = 0; iz < iy; iz += 1 { + _W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz]); + } + + /* + Store term. + */ + W[ix] = DIGIT(_W) & _MASK; + + /* + Make next carry. + */ + _W = _W >> _WORD(_DIGIT_BITS); + } + + /* + Setup dest. + */ + old_used := dest.used; + dest.used = pa; + + /* + Now extract the previous digit [below the carry]. + */ + copy_slice(dest.digit[0:], W[:pa]); + + /* + Clear unused digits [that existed in the old copy of dest]. + */ + zero_unused(dest, old_used); + + /* + Adjust dest.used based on leading zeroes. + */ + + return clamp(dest); +} + +/* + Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 + Assumes `dest` and `src` to not be `nil`, and `src` to have been initialized. +*/ +_private_int_sqr :: proc(dest, src: ^Int) -> (err: Error) { + pa := src.used; + + t := &Int{}; ix, iy: int; + /* + Grow `t` to maximum needed size, or `_DEFAULT_DIGIT_COUNT`, whichever is bigger. + */ + if err = grow(t, max((2 * pa) + 1, _DEFAULT_DIGIT_COUNT)); err != nil { return err; } + t.used = (2 * pa) + 1; + + #no_bounds_check for ix = 0; ix < pa; ix += 1 { + carry := DIGIT(0); + /* + First calculate the digit at 2*ix; calculate double precision result. + */ + r := _WORD(t.digit[ix+ix]) + (_WORD(src.digit[ix]) * _WORD(src.digit[ix])); + + /* + Store lower part in result. + */ + t.digit[ix+ix] = DIGIT(r & _WORD(_MASK)); + /* + Get the carry. + */ + carry = DIGIT(r >> _DIGIT_BITS); + + #no_bounds_check for iy = ix + 1; iy < pa; iy += 1 { + /* + First calculate the product. + */ + r = _WORD(src.digit[ix]) * _WORD(src.digit[iy]); + + /* Now calculate the double precision result. Nóte we use + * addition instead of *2 since it's easier to optimize + */ + r = _WORD(t.digit[ix+iy]) + r + r + _WORD(carry); + + /* + Store lower part. + */ + t.digit[ix+iy] = DIGIT(r & _WORD(_MASK)); + + /* + Get carry. + */ + carry = DIGIT(r >> _DIGIT_BITS); + } + /* + Propagate upwards. + */ + #no_bounds_check for carry != 0 { + r = _WORD(t.digit[ix+iy]) + _WORD(carry); + t.digit[ix+iy] = DIGIT(r & _WORD(_MASK)); + carry = DIGIT(r >> _WORD(_DIGIT_BITS)); + iy += 1; + } + } + + err = clamp(t); + swap(dest, t); + destroy(t); + return err; +} + +/* + Divide by three (based on routine from MPI and the GMP manual). +*/ +_private_int_div_3 :: proc(quotient, numerator: ^Int) -> (remainder: DIGIT, err: Error) { + /* + b = 2^_DIGIT_BITS / 3 + */ + b := _WORD(1) << _WORD(_DIGIT_BITS) / _WORD(3); + + q := &Int{}; + if err = grow(q, numerator.used); err != nil { return 0, err; } + q.used = numerator.used; + q.sign = numerator.sign; + + w, t: _WORD; + #no_bounds_check for ix := numerator.used; ix >= 0; ix -= 1 { + w = (w << _WORD(_DIGIT_BITS)) | _WORD(numerator.digit[ix]); + if w >= 3 { + /* + Multiply w by [1/3]. + */ + t = (w * b) >> _WORD(_DIGIT_BITS); + + /* + Now subtract 3 * [w/3] from w, to get the remainder. + */ + w -= t+t+t; + + /* + Fixup the remainder as required since the optimization is not exact. + */ + for w >= 3 { + t += 1; + w -= 3; + } + } else { + t = 0; + } + q.digit[ix] = DIGIT(t); + } + remainder = DIGIT(w); + + /* + [optional] store the quotient. + */ + if quotient != nil { + err = clamp(q); + swap(q, quotient); + } + destroy(q); + return remainder, nil; +} + +/* + Signed Integer Division + + c*b + d == a [i.e. a/b, c=quotient, d=remainder], HAC pp.598 Algorithm 14.20 + + Note that the description in HAC is horribly incomplete. + For example, it doesn't consider the case where digits are removed from 'x' in + the inner loop. + + It also doesn't consider the case that y has fewer than three digits, etc. + The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases. +*/ +_private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) { + // if err = error_if_immutable(quotient, remainder); err != nil { return err; } + // if err = clear_if_uninitialized(quotient, numerator, denominator); err != nil { return err; } + + q, x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; + defer destroy(q, x, y, t1, t2); + + if err = grow(q, numerator.used + 2); err != nil { return err; } + q.used = numerator.used + 2; + + if err = init_multi(t1, t2); err != nil { return err; } + if err = copy(x, numerator); err != nil { return err; } + if err = copy(y, denominator); err != nil { return err; } + + /* + Fix the sign. + */ + neg := numerator.sign != denominator.sign; + x.sign = .Zero_or_Positive; + y.sign = .Zero_or_Positive; + + /* + Normalize both x and y, ensure that y >= b/2, [b == 2**MP_DIGIT_BIT] + */ + norm, _ := count_bits(y); + norm %= _DIGIT_BITS; + + if norm < _DIGIT_BITS - 1 { + norm = (_DIGIT_BITS - 1) - norm; + if err = shl(x, x, norm); err != nil { return err; } + if err = shl(y, y, norm); err != nil { return err; } + } else { + norm = 0; + } + + /* + Note: HAC does 0 based, so if used==5 then it's 0,1,2,3,4, i.e. use 4 + */ + n := x.used - 1; + t := y.used - 1; + + /* + while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } + y = y*b**{n-t} + */ + + if err = shl_digit(y, n - t); err != nil { return err; } + + c, _ := cmp(x, y); + for c != -1 { + q.digit[n - t] += 1; + if err = sub(x, x, y); err != nil { return err; } + c, _ = cmp(x, y); + } + + /* + Reset y by shifting it back down. + */ + shr_digit(y, n - t); + + /* + Step 3. for i from n down to (t + 1). + */ + #no_bounds_check for i := n; i >= (t + 1); i -= 1 { + if (i > x.used) { continue; } + + /* + step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt + */ + if x.digit[i] == y.digit[t] { + q.digit[(i - t) - 1] = 1 << (_DIGIT_BITS - 1); + } else { + + tmp := _WORD(x.digit[i]) << _DIGIT_BITS; + tmp |= _WORD(x.digit[i - 1]); + tmp /= _WORD(y.digit[t]); + if tmp > _WORD(_MASK) { + tmp = _WORD(_MASK); + } + q.digit[(i - t) - 1] = DIGIT(tmp & _WORD(_MASK)); + } + + /* while (q{i-t-1} * (yt * b + y{t-1})) > + xi * b**2 + xi-1 * b + xi-2 + + do q{i-t-1} -= 1; + */ + + iter := 0; + + q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] + 1) & _MASK; + #no_bounds_check for { + q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK; + + /* + Find left hand. + */ + zero(t1); + t1.digit[0] = ((t - 1) < 0) ? 0 : y.digit[t - 1]; + t1.digit[1] = y.digit[t]; + t1.used = 2; + if err = mul(t1, t1, q.digit[(i - t) - 1]); err != nil { return err; } + + /* + Find right hand. + */ + t2.digit[0] = ((i - 2) < 0) ? 0 : x.digit[i - 2]; + t2.digit[1] = x.digit[i - 1]; /* i >= 1 always holds */ + t2.digit[2] = x.digit[i]; + t2.used = 3; + + if t1_t2, _ := cmp_mag(t1, t2); t1_t2 != 1 { + break; + } + iter += 1; if iter > 100 { return .Max_Iterations_Reached; } + } + + /* + Step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} + */ + if err = int_mul_digit(t1, y, q.digit[(i - t) - 1]); err != nil { return err; } + if err = shl_digit(t1, (i - t) - 1); err != nil { return err; } + if err = sub(x, x, t1); err != nil { return err; } + + /* + if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } + */ + if x.sign == .Negative { + if err = copy(t1, y); err != nil { return err; } + if err = shl_digit(t1, (i - t) - 1); err != nil { return err; } + if err = add(x, x, t1); err != nil { return err; } + + q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK; + } + } + + /* + Now q is the quotient and x is the remainder, [which we have to normalize] + Get sign before writing to c. + */ + z, _ := is_zero(x); + x.sign = .Zero_or_Positive if z else numerator.sign; + + if quotient != nil { + clamp(q); + swap(q, quotient); + quotient.sign = .Negative if neg else .Zero_or_Positive; + } + + if remainder != nil { + if err = shr(x, x, norm); err != nil { return err; } + swap(x, remainder); + } + + return nil; +} + +/* + Slower bit-bang division... also smaller. +*/ +@(deprecated="Use `_int_div_school`, it's 3.5x faster.") +_private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) { + + ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}; + c: int; + + goto_end: for { + if err = one(tq); err != nil { break goto_end; } + + num_bits, _ := count_bits(numerator); + den_bits, _ := count_bits(denominator); + n := num_bits - den_bits; + + if err = abs(ta, numerator); err != nil { break goto_end; } + if err = abs(tb, denominator); err != nil { break goto_end; } + if err = shl(tb, tb, n); err != nil { break goto_end; } + if err = shl(tq, tq, n); err != nil { break goto_end; } + + for n >= 0 { + if c, _ = cmp_mag(ta, tb); c == 0 || c == 1 { + // ta -= tb + if err = sub(ta, ta, tb); err != nil { break goto_end; } + // q += tq + if err = add( q, q, tq); err != nil { break goto_end; } + } + if err = shr1(tb, tb); err != nil { break goto_end; } + if err = shr1(tq, tq); err != nil { break goto_end; } + + n -= 1; + } + + /* + Now q == quotient and ta == remainder. + */ + neg := numerator.sign != denominator.sign; + if quotient != nil { + swap(quotient, q); + z, _ := is_zero(quotient); + quotient.sign = .Negative if neg && !z else .Zero_or_Positive; + } + if remainder != nil { + swap(remainder, ta); + z, _ := is_zero(numerator); + remainder.sign = .Zero_or_Positive if z else numerator.sign; + } + + break goto_end; + } + destroy(ta, tb, tq, q); + return err; +} + + + +/* + Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html +*/ +_private_int_factorial_binary_split :: proc(res: ^Int, n: int) -> (err: Error) { + + inner, outer, start, stop, temp := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}; + defer destroy(inner, outer, start, stop, temp); + + if err = set(inner, 1); err != nil { return err; } + if err = set(outer, 1); err != nil { return err; } + + bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n)); + + for i := bits_used; i >= 0; i -= 1 { + start := (n >> (uint(i) + 1)) + 1 | 1; + stop := (n >> uint(i)) + 1 | 1; + if err = _private_int_recursive_product(temp, start, stop); err != nil { return err; } + if err = internal_mul(inner, inner, temp); err != nil { return err; } + if err = internal_mul(outer, outer, inner); err != nil { return err; } + } + shift := n - intrinsics.count_ones(n); + + return shl(res, outer, int(shift)); +} + +/* + Recursive product used by binary split factorial algorithm. +*/ +_private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0)) -> (err: Error) { + t1, t2 := &Int{}, &Int{}; + defer destroy(t1, t2); + + if level > FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS { return .Max_Iterations_Reached; } + + num_factors := (stop - start) >> 1; + if num_factors == 2 { + if err = set(t1, start); err != nil { return err; } + when true { + if err = grow(t2, t1.used + 1); err != nil { return err; } + if err = internal_add(t2, t1, 2); err != nil { return err; } + } else { + if err = add(t2, t1, 2); err != nil { return err; } + } + return internal_mul(res, t1, t2); + } + + if num_factors > 1 { + mid := (start + num_factors) | 1; + if err = _private_int_recursive_product(t1, start, mid, level + 1); err != nil { return err; } + if err = _private_int_recursive_product(t2, mid, stop, level + 1); err != nil { return err; } + return internal_mul(res, t1, t2); + } + + if num_factors == 1 { return #force_inline set(res, start); } + + return #force_inline set(res, 1); +} + +/* + Internal function computing both GCD using the binary method, + and, if target isn't `nil`, also LCM. + + Expects the `a` and `b` to have been initialized + and one or both of `res_gcd` or `res_lcm` not to be `nil`. + + If both `a` and `b` are zero, return zero. + If either `a` or `b`, return the other one. + + The `gcd` and `lcm` wrappers have already done this test, + but `gcd_lcm` wouldn't have, so we still need to perform it. + + If neither result is wanted, we have nothing to do. +*/ +_private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int) -> (err: Error) { + if res_gcd == nil && res_lcm == nil { return nil; } + + /* + We need a temporary because `res_gcd` is allowed to be `nil`. + */ + if a.used == 0 && b.used == 0 { + /* + GCD(0, 0) and LCM(0, 0) are both 0. + */ + if res_gcd != nil { + if err = zero(res_gcd); err != nil { return err; } + } + if res_lcm != nil { + if err = zero(res_lcm); err != nil { return err; } + } + return nil; + } else if a.used == 0 { + /* + We can early out with GCD = B and LCM = 0 + */ + if res_gcd != nil { + if err = abs(res_gcd, b); err != nil { return err; } + } + if res_lcm != nil { + if err = zero(res_lcm); err != nil { return err; } + } + return nil; + } else if b.used == 0 { + /* + We can early out with GCD = A and LCM = 0 + */ + if res_gcd != nil { + if err = abs(res_gcd, a); err != nil { return err; } + } + if res_lcm != nil { + if err = zero(res_lcm); err != nil { return err; } + } + return nil; + } + + temp_gcd_res := &Int{}; + defer destroy(temp_gcd_res); + + /* + If neither `a` or `b` was zero, we need to compute `gcd`. + Get copies of `a` and `b` we can modify. + */ + u, v := &Int{}, &Int{}; + defer destroy(u, v); + if err = copy(u, a); err != nil { return err; } + if err = copy(v, b); err != nil { return err; } + + /* + Must be positive for the remainder of the algorithm. + */ + u.sign = .Zero_or_Positive; v.sign = .Zero_or_Positive; + + /* + B1. Find the common power of two for `u` and `v`. + */ + u_lsb, _ := count_lsb(u); + v_lsb, _ := count_lsb(v); + k := min(u_lsb, v_lsb); + + if k > 0 { + /* + Divide the power of two out. + */ + if err = shr(u, u, k); err != nil { return err; } + if err = shr(v, v, k); err != nil { return err; } + } + + /* + Divide any remaining factors of two out. + */ + if u_lsb != k { + if err = shr(u, u, u_lsb - k); err != nil { return err; } + } + if v_lsb != k { + if err = shr(v, v, v_lsb - k); err != nil { return err; } + } + + for v.used != 0 { + /* + Make sure `v` is the largest. + */ + if c, _ := cmp_mag(u, v); c == 1 { + /* + Swap `u` and `v` to make sure `v` is >= `u`. + */ + swap(u, v); + } + + /* + Subtract smallest from largest. + */ + if err = internal_sub(v, v, u); err != nil { return err; } + + /* + Divide out all factors of two. + */ + b, _ := count_lsb(v); + if err = shr(v, v, b); err != nil { return err; } + } + + /* + Multiply by 2**k which we divided out at the beginning. + */ + if err = shl(temp_gcd_res, u, k); err != nil { return err; } + temp_gcd_res.sign = .Zero_or_Positive; + + /* + We've computed `gcd`, either the long way, or because one of the inputs was zero. + If we don't want `lcm`, we're done. + */ + if res_lcm == nil { + swap(temp_gcd_res, res_gcd); + return nil; + } + + /* + Computes least common multiple as `|a*b|/gcd(a,b)` + Divide the smallest by the GCD. + */ + if c, _ := cmp_mag(a, b); c == -1 { + /* + Store quotient in `t2` such that `t2 * b` is the LCM. + */ + if err = internal_div(res_lcm, a, temp_gcd_res); err != nil { return err; } + err = internal_mul(res_lcm, res_lcm, b); + } else { + /* + Store quotient in `t2` such that `t2 * a` is the LCM. + */ + if err = internal_div(res_lcm, a, temp_gcd_res); err != nil { return err; } + err = internal_mul(res_lcm, res_lcm, b); + } + + if res_gcd != nil { + swap(temp_gcd_res, res_gcd); + } + + /* + Fix the sign to positive and return. + */ + res_lcm.sign = .Zero_or_Positive; + return err; +} + +/* + ======================== End of private procedures ======================= + + =============================== Private tables =============================== + + Tables used by `internal_*` and `_*`. +*/ + +_private_prime_table := []DIGIT{ + 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013, + 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035, + 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059, + 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083, + 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD, + 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF, + 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107, + 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137, + + 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167, + 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199, + 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9, + 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7, + 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239, + 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265, + 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293, + 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF, + + 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301, + 0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B, + 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371, + 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD, + 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5, + 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419, + 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449, + 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B, + + 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7, + 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503, + 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529, + 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F, + 0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3, + 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7, + 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623, + 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653, +}; + +when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) { + _factorial_table := [35]_WORD{ +/* f(00): */ 1, +/* f(01): */ 1, +/* f(02): */ 2, +/* f(03): */ 6, +/* f(04): */ 24, +/* f(05): */ 120, +/* f(06): */ 720, +/* f(07): */ 5_040, +/* f(08): */ 40_320, +/* f(09): */ 362_880, +/* f(10): */ 3_628_800, +/* f(11): */ 39_916_800, +/* f(12): */ 479_001_600, +/* f(13): */ 6_227_020_800, +/* f(14): */ 87_178_291_200, +/* f(15): */ 1_307_674_368_000, +/* f(16): */ 20_922_789_888_000, +/* f(17): */ 355_687_428_096_000, +/* f(18): */ 6_402_373_705_728_000, +/* f(19): */ 121_645_100_408_832_000, +/* f(20): */ 2_432_902_008_176_640_000, +/* f(21): */ 51_090_942_171_709_440_000, +/* f(22): */ 1_124_000_727_777_607_680_000, +/* f(23): */ 25_852_016_738_884_976_640_000, +/* f(24): */ 620_448_401_733_239_439_360_000, +/* f(25): */ 15_511_210_043_330_985_984_000_000, +/* f(26): */ 403_291_461_126_605_635_584_000_000, +/* f(27): */ 10_888_869_450_418_352_160_768_000_000, +/* f(28): */ 304_888_344_611_713_860_501_504_000_000, +/* f(29): */ 8_841_761_993_739_701_954_543_616_000_000, +/* f(30): */ 265_252_859_812_191_058_636_308_480_000_000, +/* f(31): */ 8_222_838_654_177_922_817_725_562_880_000_000, +/* f(32): */ 263_130_836_933_693_530_167_218_012_160_000_000, +/* f(33): */ 8_683_317_618_811_886_495_518_194_401_280_000_000, +/* f(34): */ 295_232_799_039_604_140_847_618_609_643_520_000_000, + }; +} else { + _factorial_table := [21]_WORD{ +/* f(00): */ 1, +/* f(01): */ 1, +/* f(02): */ 2, +/* f(03): */ 6, +/* f(04): */ 24, +/* f(05): */ 120, +/* f(06): */ 720, +/* f(07): */ 5_040, +/* f(08): */ 40_320, +/* f(09): */ 362_880, +/* f(10): */ 3_628_800, +/* f(11): */ 39_916_800, +/* f(12): */ 479_001_600, +/* f(13): */ 6_227_020_800, +/* f(14): */ 87_178_291_200, +/* f(15): */ 1_307_674_368_000, +/* f(16): */ 20_922_789_888_000, +/* f(17): */ 355_687_428_096_000, +/* f(18): */ 6_402_373_705_728_000, +/* f(19): */ 121_645_100_408_832_000, +/* f(20): */ 2_432_902_008_176_640_000, + }; +}; + +/* + ========================= End of private tables ======================== +*/ \ No newline at end of file