From 5e520f4e08feb1324757b123b047b57ac38aadc6 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Mon, 30 Aug 2021 23:00:49 +0200
Subject: [PATCH 01/13] big: Add `reduce_2k`.

---
 core/math/big/build.bat    |   4 +-
 core/math/big/example.odin |   4 +-
 core/math/big/prime.odin   | 212 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 215 insertions(+), 5 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 31480bcc8..e6049b2e1 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,9 +1,9 @@
 @echo off
-:odin run . -vet
+odin run . -vet
 
 set TEST_ARGS=-fast-tests
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 4542e9e15..e2ed30680 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -203,8 +203,8 @@ int_to_byte_little :: proc(v: ^Int) {
 }
 
 demo :: proc() {
-	a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
-	defer destroy(a, b, c, d, e, f);
+	// a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	// defer destroy(a, b, c, d, e, f);
 }
 
 main :: proc() {
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index f35e02807..6a5bec6dc 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -15,7 +15,7 @@ package math_big
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
 */
-int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res: bool, err: Error) {
+internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res: bool, err: Error) {
 	assert_if_nil(a);
 	context.allocator = allocator;
 
@@ -207,6 +207,216 @@ int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: D
 	return #force_inline internal_int_montgomery_setup(n);
 }
 
+/*
+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
+	From HAC pp.604 Algorithm 14.42
+
+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
+*/
+internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+	um := m.used;
+
+	/*
+		q = x
+	*/
+	copy(q, x)                                                       or_return;
+
+	/*
+		q1 = x / b**(k-1)
+	*/
+	internal_shr_digit(q, um - 1);
+
+	/*
+		According to HAC this optimization is ok.
+	*/
+	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
+		mul(q, q, mu)                                                or_return;
+	} else {
+		_private_int_mul_high(q, q, mu, um)                          or_return;
+	}
+
+	/*
+		q3 = q2 / b**(k+1)
+	*/
+	internal_shr_digit(q, um + 1);
+
+	/*
+		x = x mod b**(k+1), quick (no division)
+	*/
+	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;
+
+	/*
+		q = q * m mod b**(k+1), quick (no division)
+	*/
+	_private_int_mul(q, q, m, um + 1)                                or_return;
+
+	/*
+		x = x - q
+	*/
+	internal_sub(x, x, q)                                            or_return;
+
+	/*
+		If x < 0, add b**(k+1) to it.
+	*/
+	if internal_cmp(x, 0) == -1 {
+		internal_set(q, 1)                                           or_return;
+		internal_shl_digit(q, um + 1)                                or_return;
+		internal_add(x, x, q)                                        or_return;
+	}
+
+	/*
+		Back off if it's too big.
+	*/
+	for internal_cmp(x, m) != -1 {
+		internal_sub(x, x, m)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
+*/
+internal_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		if d != 1 {
+			/*
+				q = q * d
+			*/
+			internal_mul(q, q, d)                                    or_return;
+		}
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_cmp_mag(a, n) == -1                              { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n` where `n` is of the form 2**p - d
+	This differs from reduce_2k since "d" can be larger than a single digit.
+*/
+internal_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		/*
+			q = q * d
+		*/
+		internal_mul(q, q, d)                                        or_return;
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_cmp_mag(a, n) == -1                              { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Determines if `internal_int_reduce_2k` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+internal_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		iy := internal_count_bits(a);
+		iw := 1;
+		iz := DIGIT(1);
+
+		/*
+			Test every bit from the second digit up, must be 1.
+		*/
+		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
+			if a.digit[iw] & iz == 0 {
+				return false, nil;
+			}
+
+			iz <<= 1;
+			if iz > _DIGIT_MAX {
+				iw += 1;
+				iz  = 1;
+			}
+		}
+		return true, nil;
+	} else {
+		return true, nil;
+	}
+}
+
+/*
+	Determines if `internal_int_reduce_2k_l` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+internal_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_int_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		/*
+			If more than half of the digits are -1 we're sold.
+		*/
+		ix := 0;
+		iy := 0;
+
+		for ; ix < a.used; ix += 1 {
+			if a.digit[ix] == _DIGIT_MAX {
+				iy += 1;
+			}
+		}
+		return iy >= (a.used / 2), nil;
+	} else {
+		return false, nil;
+	}
+}
+
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From c3a70ac277494b70e86578f1ce31923a0ca8d2c8 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 12:15:09 +0200
Subject: [PATCH 02/13] Big: Added Barrett reduction setup.

---
 core/math/big/prime.odin | 45 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 6a5bec6dc..6a3a098a4 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -416,6 +416,51 @@ internal_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
 	}
 }
 
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+internal_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
+	internal_sub(tmp, tmp, a)                                        or_return;
+
+	return tmp.digit[0], nil;
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+internal_int_reduce_2k_setup_l :: proc(a, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
+	internal_sub(d, tmp, a)                                          or_return;
+
+	return nil;
+}
+
+/*
+	Pre-calculate the value required for Barrett reduction.
+	For a given modulus "b" it calulates the value required in "a"
+*/
+internal_int_reduce_setup :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	internal_int_power_of_two(a, b.used * 2 * _DIGIT_BITS)           or_return;
+	return internal_int_div(a, a, b);
+}
+
 
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.

From 65a15e9c060d74bc3a7977c8c3329ec43dc810b2 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 16:43:07 +0200
Subject: [PATCH 03/13] big: Add `internal_int_exponent_mod`.

---
 core/math/big/api.odin      |   7 +-
 core/math/big/common.odin   |  28 ++--
 core/math/big/example.odin  |  24 +++-
 core/math/big/helpers.odin  |   3 +-
 core/math/big/internal.odin |   3 +-
 core/math/big/logical.odin  |   3 +-
 core/math/big/prime.odin    | 248 ++++++++++++++++++++++++++++++++++--
 core/math/big/private.odin  |   3 +-
 core/math/big/public.odin   |   3 +-
 core/math/big/radix.odin    |   3 +-
 core/math/big/test.odin     |   3 +-
 core/math/big/test.py       |   9 ++
 core/math/big/tune.odin     |   3 +-
 13 files changed, 294 insertions(+), 46 deletions(-)

diff --git a/core/math/big/api.odin b/core/math/big/api.odin
index 1f2eab8d7..e2761b425 100644
--- a/core/math/big/api.odin
+++ b/core/math/big/api.odin
@@ -1,14 +1,15 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-2 license.
+	Made available under Odin's BSD-3 license.
 
 	An arbitrary precision mathematics implementation in Odin.
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 
 	This file collects public proc maps and their aliases.
+*/
+package math_big
+/*
 
 	=== === === === === === === === === === === === === === === === === === === === === === === ===
 	                                    Basic arithmetic.
diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index ce1f7d77f..4171d25f3 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -8,6 +6,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:intrinsics"
 
@@ -57,10 +56,10 @@ when #config(MATH_BIG_EXE, true) {
 	debugged where necessary.
 */
 
-_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MUL_KARATSUBA_CUTOFF,  80);
-_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(SQR_KARATSUBA_CUTOFF, 120);
-_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MUL_TOOM_CUTOFF,      350);
-_DEFAULT_SQR_TOOM_CUTOFF      :: #config(SQR_TOOM_CUTOFF,      400);
+_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MATH_BIG_MUL_KARATSUBA_CUTOFF,  80);
+_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(MATH_BIG_SQR_KARATSUBA_CUTOFF, 120);
+_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MATH_BIG_MUL_TOOM_CUTOFF,      350);
+_DEFAULT_SQR_TOOM_CUTOFF      :: #config(MATH_BIG_SQR_TOOM_CUTOFF,      400);
 
 
 MAX_ITERATIONS_ROOT_N := 500;
@@ -85,15 +84,22 @@ FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS := 100;
 
 	2) Optimizations thanks to precomputed masks wouldn't work.
 */
-MATH_BIG_FORCE_64_BIT :: #config(MATH_BIG_FORCE_64_BIT, false);
-MATH_BIG_FORCE_32_BIT :: #config(MATH_BIG_FORCE_32_BIT, false);
+MATH_BIG_FORCE_64_BIT   :: #config(MATH_BIG_FORCE_64_BIT, false);
+MATH_BIG_FORCE_32_BIT   :: #config(MATH_BIG_FORCE_32_BIT, false);
 when (MATH_BIG_FORCE_32_BIT && MATH_BIG_FORCE_64_BIT) { #panic("Cannot force 32-bit and 64-bit big backend simultaneously."); };
 
-_LOW_MEMORY           :: #config(BIGINT_SMALL_MEMORY, false);
+/*
+	Trade a smaller memory footprint for more processing overhead?
+*/
+_LOW_MEMORY             :: #config(MATH_BIG_SMALL_MEMORY, false);
 when _LOW_MEMORY {
-	_DEFAULT_DIGIT_COUNT :: 8;
+	_DEFAULT_DIGIT_COUNT ::   8;
+	_TAB_SIZE            ::  32;
+	_MAX_WIN_SIZE        ::   5;
 } else {
-	_DEFAULT_DIGIT_COUNT :: 32;
+	_DEFAULT_DIGIT_COUNT ::  32;
+	_TAB_SIZE            :: 256;
+	_MAX_WIN_SIZE        ::   0;
 }
 
 /*
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index e2ed30680..18b6062d9 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -9,6 +7,8 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
+
 
 import "core:fmt"
 import "core:mem"
@@ -18,11 +18,14 @@ print_configation :: proc() {
 `
 Configuration:
 	_DIGIT_BITS                           %v
+	_SMALL_MEMORY                         %v
 	_MIN_DIGIT_COUNT                      %v
 	_MAX_DIGIT_COUNT                      %v
 	_DEFAULT_DIGIT_COUNT                  %v
 	_MAX_COMBA                            %v
 	_WARRAY                               %v
+	_TAB_SIZE                             %v
+	_MAX_WIN_SIZE                         %v
 Runtime tunable:
 	MUL_KARATSUBA_CUTOFF                  %v
 	SQR_KARATSUBA_CUTOFF                  %v
@@ -34,11 +37,14 @@ Runtime tunable:
 	FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS %v
 
 `, _DIGIT_BITS,
+_LOW_MEMORY,
 _MIN_DIGIT_COUNT,
 _MAX_DIGIT_COUNT,
 _DEFAULT_DIGIT_COUNT,
 _MAX_COMBA,
 _WARRAY,
+_TAB_SIZE,
+_MAX_WIN_SIZE,
 MUL_KARATSUBA_CUTOFF,
 SQR_KARATSUBA_CUTOFF,
 MUL_TOOM_CUTOFF,
@@ -203,8 +209,18 @@ int_to_byte_little :: proc(v: ^Int) {
 }
 
 demo :: proc() {
-	// a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
-	// defer destroy(a, b, c, d, e, f);
+	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	defer destroy(a, b, c, d, e, f, res);
+
+	set(a, 42);
+	set(b, 6);
+	set(c, 5);
+
+	if err := internal_int_exponent_mod(res, a, b, c, 0); err != nil {
+		fmt.printf("Error: %v\n", err);
+	}
+
+	print("res: ", res);
 }
 
 main :: proc() {
diff --git a/core/math/big/helpers.odin b/core/math/big/helpers.odin
index 8ce1b2811..ff654172c 100644
--- a/core/math/big/helpers.odin
+++ b/core/math/big/helpers.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -8,6 +6,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:intrinsics"
 import rnd "core:math/rand"
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 9422067ae..789163af2 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -31,6 +29,7 @@ package math_big
 
 	TODO: Handle +/- Infinity and NaN.
 */
+package math_big
 
 import "core:mem"
 import "core:intrinsics"
diff --git a/core/math/big/logical.odin b/core/math/big/logical.odin
index 64f3b0898..1e7f8e1b1 100644
--- a/core/math/big/logical.odin
+++ b/core/math/big/logical.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,7 @@ package math_big
 
 	This file contains logical operations like `and`, `or` and `xor`.
 */
+package math_big
 
 /*
 	The `and`, `or` and `xor` binops differ in two lines only.
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 6a3a098a4..1947ac634 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,7 @@ package math_big
 
 	This file contains prime finding operations.
 */
+package math_big
 
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
@@ -223,7 +222,7 @@ internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (
 	/*
 		q = x
 	*/
-	copy(q, x)                                                       or_return;
+	internal_copy(q, x)                                              or_return;
 
 	/*
 		q1 = x / b**(k-1)
@@ -234,7 +233,7 @@ internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (
 		According to HAC this optimization is ok.
 	*/
 	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
-		mul(q, q, mu)                                                or_return;
+		internal_mul(q, q, mu)                                       or_return;
 	} else {
 		_private_int_mul_high(q, q, mu, um)                          or_return;
 	}
@@ -435,32 +434,257 @@ internal_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) ->
 
 /*
 	Determines the setup value.
-	Assumes `a` is not `nil`.
+	Assumes `mu` and `P` are not `nil`.
+
+	d := (1 << a.bits) - a;
 */
-internal_int_reduce_2k_setup_l :: proc(a, d: ^Int, allocator := context.allocator) -> (err: Error) {
+internal_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
 	tmp := &Int{};
 	defer internal_destroy(tmp);
 	internal_zero(tmp)                                               or_return;
 
-	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
-	internal_sub(d, tmp, a)                                          or_return;
+	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;
+	internal_sub(mu, tmp, P)                                         or_return;
 
 	return nil;
 }
 
 /*
 	Pre-calculate the value required for Barrett reduction.
-	For a given modulus "b" it calulates the value required in "a"
+	For a given modulus "P" it calulates the value required in "mu"
+	Assumes `mu` and `P` are not `nil`.
 */
-internal_int_reduce_setup :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+internal_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
-	internal_int_power_of_two(a, b.used * 2 * _DIGIT_BITS)           or_return;
-	return internal_int_div(a, a, b);
+	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;
+	return internal_int_div(mu, mu, P);
 }
 
+/*
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array.
+		Init first cell.
+	*/
+	internal_zero(&M[1])                                             or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_zero(&M[x])                                         or_return;
+	}
+
+	/*
+		Create `mu`, used for Barrett reduction.
+	*/
+	mu := &Int{};
+	defer internal_destroy(mu);
+	internal_zero(mu)                                                or_return;
+
+	if redmode == 0 {
+		internal_int_reduce_setup(mu, P)                             or_return;
+		redux = internal_int_reduce;
+	} else {
+		internal_int_reduce_2k_setup_l(mu, P)                        or_return;
+		redux = internal_int_reduce_2k_l;
+	}
+
+	/*
+		Create M table.
+
+		The M table contains powers of the base, e.g. M[x] = G**x mod P.
+		The first half of the table is not computed, though, except for M[0] and M[1].
+	*/
+	internal_int_mod(&M[1], G, P)                                    or_return;
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+
+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
+		instead of repeated squaring.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		/*
+			Square it.
+		*/
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+
+		/*
+			Reduce modulo P
+		*/
+		redux(&M[slot], P, mu)                                       or_return;
+	}
+
+	/*
+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+	*/
+	for x = slot + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, mu)                                          or_return;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_one(res)                                                or_return;
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := uint(0);
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := buf >> (_DIGIT_BITS - 1) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit
+			in the exponent.  Technically this opt is not required but it
+			does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 {
+			continue;
+		}
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - bitcpy));
+		mode    = 2;
+
+		if (bitcpy == winsize) {
+			/*
+				Window is filled so square as required and multiply.
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, mu)                                        or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < int(bitcpy); x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+
+			bitbuf <<= 1;
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+		}
+	}
+	return err;
+}
 
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index d71946ce2..7e839337f 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -17,6 +15,7 @@ package math_big
 
 	These aren't exported for the same reasons.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index 542725289..d69b3ba22 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,7 @@ package math_big
 
 	This file contains basic arithmetic operations like `add`, `sub`, `mul`, `div`, ...
 */
+package math_big
 
 /*
 	===========================
diff --git a/core/math/big/radix.odin b/core/math/big/radix.odin
index acf0bacbd..8a7040158 100644
--- a/core/math/big/radix.odin
+++ b/core/math/big/radix.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -14,6 +12,7 @@ package math_big
 		- Use Barrett reduction for non-powers-of-two.
 		- Also look at extracting and splatting several digits at once.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
diff --git a/core/math/big/test.odin b/core/math/big/test.odin
index ea3c6be49..8d60fc5ee 100644
--- a/core/math/big/test.odin
+++ b/core/math/big/test.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -11,6 +9,7 @@ package math_big
 
 	This file exports procedures for use with the test.py test suite.
 */
+package math_big
 
 /*
 	TODO: Write tests for `internal_*` and test reusing parameters with the public implementations.
diff --git a/core/math/big/test.py b/core/math/big/test.py
index df59fa1c8..e095b061e 100644
--- a/core/math/big/test.py
+++ b/core/math/big/test.py
@@ -1,3 +1,12 @@
+#
+#	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+#	Made available under Odin's BSD-3 license.
+#
+#	A BigInt implementation in Odin.
+#	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
+#	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
+#
+
 from ctypes import *
 from random import *
 import math
diff --git a/core/math/big/tune.odin b/core/math/big/tune.odin
index 700a5e74a..3381065bb 100644
--- a/core/math/big/tune.odin
+++ b/core/math/big/tune.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -9,6 +7,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:fmt"
 import "core:time"

From 2110778040787234041ee977ec456e5098f2caf7 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 21:26:19 +0200
Subject: [PATCH 04/13] big: Add `internal_int_exponent_mod_fast`.

---
 core/math/big/example.odin  |   6 +-
 core/math/big/internal.odin |  25 ++--
 core/math/big/prime.odin    | 283 +++++++++++++++++++++++++++++++++++-
 core/math/big/private.odin  |   5 +-
 4 files changed, 302 insertions(+), 17 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 18b6062d9..4a64aaa01 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -208,15 +208,17 @@ int_to_byte_little :: proc(v: ^Int) {
 	}
 }
 
+// printf :: fmt.printf;
+
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
 	set(a, 42);
 	set(b, 6);
-	set(c, 5);
+	set(c, 131);
 
-	if err := internal_int_exponent_mod(res, a, b, c, 0); err != nil {
+	if err := internal_int_exponent_mod_fast(res, a, b, c, 0); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 789163af2..ca113275c 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -991,13 +991,21 @@ internal_int_mod_bits :: proc(remainder, numerator: ^Int, bits: int, allocator :
 	public ones that have already satisfied these constraints.
 */
 
+/*
+	This procedure returns the allocated capacity of an Int.
+	Assumes `a` not to be `nil`.
+*/
+internal_int_allocated_cap :: #force_inline proc(a: ^Int) -> (cap: int) {
+	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
+	return raw.cap;
+}
+
 /*
 	This procedure will return `true` if the `Int` is initialized, `false` if not.
 	Assumes `a` not to be `nil`.
 */
 internal_int_is_initialized :: #force_inline proc(a: ^Int) -> (initialized: bool) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-	return raw.cap >= _MIN_DIGIT_COUNT;
+	return internal_int_allocated_cap(a) >= _MIN_DIGIT_COUNT;
 }
 internal_is_initialized :: proc { internal_int_is_initialized, };
 
@@ -1650,8 +1658,7 @@ internal_int_destroy :: proc(integers: ..^Int) {
 	integers := integers;
 
 	for a in &integers {
-		raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-		if raw.cap > 0 {
+		if internal_int_allocated_cap(a) > 0 {
 			mem.zero_slice(a.digit[:]);
 			free(&a.digit[0]);
 		}
@@ -1913,23 +1920,23 @@ internal_int_shrink :: proc(a: ^Int) -> (err: Error) {
 internal_shrink :: proc { internal_int_shrink, };
 
 internal_int_grow :: proc(a: ^Int, digits: int, allow_shrink := false, allocator := context.allocator) -> (err: Error) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-
 	/*
 		We need at least _MIN_DIGIT_COUNT or a.used digits, whichever is bigger.
 		The caller is asking for `digits`. Let's be accomodating.
 	*/
+	cap := internal_int_allocated_cap(a);
+
 	needed := max(_MIN_DIGIT_COUNT, a.used, digits);
 	if !allow_shrink {
-		needed = max(needed, raw.cap);
+		needed = max(needed, cap);
 	}
 
 	/*
 		If not yet iniialized, initialize the `digit` backing with the allocator we were passed.
 	*/
-	if raw.cap == 0 {
+	if cap == 0 {
 		a.digit = make([dynamic]DIGIT, needed, allocator);
-	} else if raw.cap != needed {
+	} else if cap != needed {
 		/*
 			`[dynamic]DIGIT` already knows what allocator was used for it, so resize will do the right thing.
 		*/
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 1947ac634..9771ef077 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -144,7 +144,7 @@ internal_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := cont
 		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
 		internal_int_power_of_two(a, power)                          or_return;
 	} else {
-		internal_one(a);
+		internal_one(a)                                              or_return;
 		bits = 1;
 	}
 
@@ -187,7 +187,8 @@ internal_int_montgomery_setup :: proc(n: ^Int) -> (rho: DIGIT, err: Error) {
 	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
 	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
 	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
-	when _WORD_TYPE_BITS == 64 {
+
+	when _DIGIT_TYPE_BITS == 64 {
 		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
 		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
 	}
@@ -473,6 +474,10 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator :=
 	M := [_TAB_SIZE]Int{};
 	winsize: uint;
 
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
 	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
 
 	defer {
@@ -686,6 +691,280 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator :=
 	return err;
 }
 
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array
+		Init first cell.
+	*/
+	cap := internal_int_allocated_cap(P);
+	internal_grow(&M[1], cap)                                        or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return;
+	}
+
+	/*
+		Determine and setup reduction code.
+	*/
+	rho: DIGIT;
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = internal_int_montgomery_setup(P)                       or_return;
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba;
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = internal_int_montgomery_reduce;
+		}
+	} else if redmode == 1 {
+		/*
+		if (MP_HAS(MP_DR_SETUP) && MP_HAS(MP_DR_REDUCE)) {
+			/* setup DR reduction for moduli of the form B**k - b */
+			mp_dr_setup(P, &mp);
+			redux = mp_dr_reduce;
+		} else {
+			err = MP_VAL;
+			goto LBL_M;
+		}
+		*/
+		return .Unimplemented;
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = internal_int_reduce_2k_setup(P)                        or_return;
+		redux = internal_int_reduce_2k;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return;
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		internal_int_montgomery_calc_normalization(res, P)           or_return;
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return;
+	} else {
+		internal_one(res)                                            or_return;
+		internal_mod(&M[1], G, P)                                    or_return;
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+   		print("slot: ", &M[slot]);
+		redux(&M[slot], P, rho)                                      or_return;
+		print("slot redux: ", &M[slot]);
+	}
+
+	/*
+		Create upper table.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, rho)                                         or_return;
+	}
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := 0;
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue; }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - uint(bitcpy)));
+		mode    = 2;
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1;
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return;
+	}
+
+	return nil;
+}
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 7e839337f..552b100cf 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1730,9 +1730,6 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 	return;
 }
 
-
-
-
 /*
 	Computes xR**-1 == x (mod N) via Montgomery Reduction.
 	This is an optimized implementation of `internal_montgomery_reduce`
@@ -1753,7 +1750,7 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		Grow `x` as required.
 	*/
-	internal_grow(x, n.used + 1) or_return;
+	internal_grow(x, n.used + 1)                                     or_return;
 
 	/*
 		First we have to get the digits of the input into an array of double precision words W[...]

From ee04fb1ce1266ac82ee591e8ba0f9339f1038486 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 22:00:20 +0200
Subject: [PATCH 05/13] big: Remove temporary prints.

---
 core/math/big/prime.odin | 2 --
 1 file changed, 2 deletions(-)

diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 9771ef077..12d37269a 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -832,9 +832,7 @@ internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocat
 
 	for x = 0; x < int(winsize - 1); x += 1 {
 		internal_sqr(&M[slot], &M[slot])                             or_return;
-   		print("slot: ", &M[slot]);
 		redux(&M[slot], P, rho)                                      or_return;
-		print("slot redux: ", &M[slot]);
 	}
 
 	/*

From 7d0dedf951b615835cd84819460f3f29a89a5c9b Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 23:13:36 +0200
Subject: [PATCH 06/13] big: Add Diminished Radix reduction.

---
 core/math/big/example.odin |   2 +-
 core/math/big/prime.odin   | 114 ++++++++++++++++++++++++++++++++++---
 2 files changed, 106 insertions(+), 10 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 4a64aaa01..ea4c83328 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -218,7 +218,7 @@ demo :: proc() {
 	set(b, 6);
 	set(c, 131);
 
-	if err := internal_int_exponent_mod_fast(res, a, b, c, 0); err != nil {
+	if err := internal_int_exponent_mod_fast(res, a, b, c, 1); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 12d37269a..027e13462 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -10,6 +10,8 @@
 */
 package math_big
 
+import "core:mem"
+
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
@@ -781,16 +783,10 @@ internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocat
 		}
 	} else if redmode == 1 {
 		/*
-		if (MP_HAS(MP_DR_SETUP) && MP_HAS(MP_DR_REDUCE)) {
-			/* setup DR reduction for moduli of the form B**k - b */
-			mp_dr_setup(P, &mp);
-			redux = mp_dr_reduce;
-		} else {
-			err = MP_VAL;
-			goto LBL_M;
-		}
+			Setup DR reduction for moduli of the form B**k - b.
 		*/
-		return .Unimplemented;
+		rho = internal_int_dr_setup(P);
+		redux = internal_int_dr_reduce;
 	} else {
 		/*
 			Setup DR reduction for moduli of the form 2**k - b.
@@ -963,6 +959,106 @@ internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocat
 	return nil;
 }
 
+/*
+	Determines the setup value.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+internal_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
+	/*
+		The casts are required if _DIGIT_BITS is one less than
+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
+	*/
+	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);
+}
+
+/*
+	Determines if a number is a valid DR modulus.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+internal_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
+	/*
+		Must be at least two digits.
+	*/
+	if a.used < 2 { return false; }
+
+	/*
+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
+	*/
+	for ix := 1; ix < a.used; ix += 1 {
+		if a.digit[ix] != _MASK {
+			return false;
+		}
+	}
+	return true;
+}
+
+/*
+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+	Based on algorithm from the paper
+
+		"Generating Efficient Primes for Discrete Log Cryptosystems"
+					Chae Hoon Lim, Pil Joong Lee,
+			POSTECH Information Research Laboratories
+
+	The modulus must be of a special format [see manual].
+	Has been modified to use algorithm 7.10 from the LTM book instead
+
+	Input x must be in the range 0 <= x <= (n-1)**2
+	Assumes `x` and `n` to not be `nil` and to have been initialized.
+*/
+internal_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
+	/*
+		m = digits in modulus.
+	*/
+	m := n.used;
+
+	/*
+		Ensure that "x" has at least 2m digits.
+	*/
+	internal_grow(x, m + m)                                          or_return;
+
+	/*
+		Top of loop, this is where the code resumes if another reduction pass is required.
+	*/
+	for {
+		i: int;
+		mu := DIGIT(0);
+
+		/*
+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
+		*/
+		for i = 0; i < m; i += 1 {
+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);
+			x.digit[i] = DIGIT(r & _WORD(_MASK));
+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));
+		}
+
+		/*
+			Set final carry.
+		*/
+		x.digit[i] = mu;
+
+		/*
+			Zero words above m.
+		*/
+		mem.zero_slice(x.digit[m + 1:][:x.used - m]);
+
+		/*
+			Clamp, sub and return.
+		*/
+		internal_clamp(x)                                            or_return;
+
+		/*
+			If x >= n then subtract and reduce again.
+			Each successive "recursion" makes the input smaller and smaller.
+		*/
+		if internal_cmp_mag(x, n) == -1 { break; }
+
+		internal_sub(x, x, n)                                        or_return;
+	}
+	return nil;
+}
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From a056e1943435ed37330bc6d5c0eac241d323170b Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 00:04:55 +0200
Subject: [PATCH 07/13] big: Cue up `internal_int_exponent_mod` wrapper
 function.

---
 core/math/big/example.odin |    2 +-
 core/math/big/prime.odin   | 1061 ++----------------------------------
 core/math/big/private.odin | 1009 ++++++++++++++++++++++++++++++++++
 3 files changed, 1067 insertions(+), 1005 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index ea4c83328..de0aed468 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -218,7 +218,7 @@ demo :: proc() {
 	set(b, 6);
 	set(c, 131);
 
-	if err := internal_int_exponent_mod_fast(res, a, b, c, 1); err != nil {
+	if err := _private_int_exponent_mod(res, a, b, c, 1); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 027e13462..edd6d9f3d 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -10,8 +10,6 @@
 */
 package math_big
 
-import "core:mem"
-
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
@@ -35,1027 +33,82 @@ internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator)
 }
 
 /*
-	Computes xR**-1 == x (mod N) via Montgomery Reduction.
-*/
-internal_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-	/*
-		Can the fast reduction [comba] method be used?
-		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
-		since carries are fixed up in the inner loop.
-	*/
-	digs := (n.used * 2) + 1;
-	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
-		return _private_montgomery_reduce_comba(x, n, rho);
-	}
+	This is a shell function that calls either the normal or Montgomery exptmod functions.
+	Originally the call to the Montgomery code was embedded in the normal function but that
+	wasted alot of stack space for nothing (since 99% of the time the Montgomery code would be called).
 
-	/*
-		Grow the input as required
-	*/
-	internal_grow(x, digs)                                           or_return;
-	x.used = digs;
-
-	for ix := 0; ix < n.used; ix += 1 {
-		/*
-			`mu = ai * rho mod b`
-			The value of rho must be precalculated via `int_montgomery_setup()`,
-			such that it equals -1/n0 mod b this allows the following inner loop
-			to reduce the input one digit at a time.
-		*/
-
-		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK));
-
-		/*
-			a = a + mu * m * b**i
-			Multiply and add in place.
-		*/
-		u  := DIGIT(0);
-		iy := int(0);
-		for ; iy < n.used; iy += 1 {
-			/*
-				Compute product and sum.
-			*/
-			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]));
-
-			/*
-				Get carry.
-			*/
-			u = DIGIT(r >> _DIGIT_BITS);
-
-			/*
-				Fix digit.
-			*/
-			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
-		}
-
-		/*
-			At this point the ix'th digit of x should be zero.
-			Propagate carries upwards as required.
-		*/
-		for u != 0 {
-			x.digit[ix + iy] += u;
-			u = x.digit[ix + iy] >> _DIGIT_BITS;
-			x.digit[ix + iy] &= _MASK;
-			iy += 1;
-		}
-	}
-
-	/*
-		At this point the n.used'th least significant digits of x are all zero,
-		which means we can shift x to the right by n.used digits and the
-		residue is unchanged.
-
-		x = x/b**n.used.
-	*/
-	internal_clamp(x);
-	internal_shr_digit(x, n.used);
-
-	/*
-		if x >= n then x = x - n
-	*/
-	if internal_cmp_mag(x, n) != -1 {
-		return internal_sub(x, x, n);
-	}
-
-	return nil;
-}
-
-int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
-	assert_if_nil(x, n);
-	context.allocator = allocator;
-
-	internal_clear_if_uninitialized(x, n) or_return;
-
-	return #force_inline internal_int_montgomery_reduce(x, n, rho);
-}
-
-/*
-	Shifts with subtractions when the result is greater than b.
-
-	The method is slightly modified to shift B unconditionally upto just under
-	the leading bit of b.  This saves alot of multiple precision shifting.
-*/
-internal_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-	/*
-		How many bits of last digit does b use.
-	*/
-	bits := internal_count_bits(b) % _DIGIT_BITS;
-
-	if b.used > 1 {
-		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
-		internal_int_power_of_two(a, power)                          or_return;
-	} else {
-		internal_one(a)                                              or_return;
-		bits = 1;
-	}
-
-	/*
-		Now compute C = A * B mod b.
-	*/
-	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
-		internal_int_shl1(a, a)                                      or_return;
-		if internal_cmp_mag(a, b) != -1 {
-			internal_sub(a, a, b)                                    or_return;
-		}
-	}
-	return nil;
-}
-
-int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	assert_if_nil(a, b);
-	context.allocator = allocator;
-
-	internal_clear_if_uninitialized(a, b) or_return;
-
-	return #force_inline internal_int_montgomery_calc_normalization(a, b);
-}
-
-/*
-	Sets up the Montgomery reduction stuff.
-*/
-internal_int_montgomery_setup :: proc(n: ^Int) -> (rho: DIGIT, err: Error) {
-	/*
-		Fast inversion mod 2**k
-		Based on the fact that:
-
-		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
-		                  =>  2*X*A - X*X*A*A = 1
-		                  =>  2*(1) - (1)     = 1
-	*/
-	b := n.digit[0];
-	if b & 1 == 0 { return 0, .Invalid_Argument; }
-
-	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
-	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
-
-	when _DIGIT_TYPE_BITS == 64 {
-		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
-		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
-	}
-
-	/*
-		rho = -1/m mod b
-	*/
-	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK));
-	return rho, nil;
-}
-
-int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
-	assert_if_nil(n);
-	internal_clear_if_uninitialized(n, allocator) or_return;
-
-	return #force_inline internal_int_montgomery_setup(n);
-}
-
-/*
-	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
-	From HAC pp.604 Algorithm 14.42
-
-	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
-*/
-internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	q := &Int{};
-	defer internal_destroy(q);
-	um := m.used;
-
-	/*
-		q = x
-	*/
-	internal_copy(q, x)                                              or_return;
-
-	/*
-		q1 = x / b**(k-1)
-	*/
-	internal_shr_digit(q, um - 1);
-
-	/*
-		According to HAC this optimization is ok.
-	*/
-	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
-		internal_mul(q, q, mu)                                       or_return;
-	} else {
-		_private_int_mul_high(q, q, mu, um)                          or_return;
-	}
-
-	/*
-		q3 = q2 / b**(k+1)
-	*/
-	internal_shr_digit(q, um + 1);
-
-	/*
-		x = x mod b**(k+1), quick (no division)
-	*/
-	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;
-
-	/*
-		q = q * m mod b**(k+1), quick (no division)
-	*/
-	_private_int_mul(q, q, m, um + 1)                                or_return;
-
-	/*
-		x = x - q
-	*/
-	internal_sub(x, x, q)                                            or_return;
-
-	/*
-		If x < 0, add b**(k+1) to it.
-	*/
-	if internal_cmp(x, 0) == -1 {
-		internal_set(q, 1)                                           or_return;
-		internal_shl_digit(q, um + 1)                                or_return;
-		internal_add(x, x, q)                                        or_return;
-	}
-
-	/*
-		Back off if it's too big.
-	*/
-	for internal_cmp(x, m) != -1 {
-		internal_sub(x, x, m)                                        or_return;
-	}
-
-	return nil;
-}
-
-/*
-	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
-*/
-internal_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	q := &Int{};
-	defer internal_destroy(q);
-
-	internal_zero(q)                                                 or_return;
-
-	p := internal_count_bits(n);
-
-	for {
-		/*
-			q = a/2**p, a = a mod 2**p
-		*/
-		internal_shrmod(q, a, a, p)                                  or_return;
-
-		if d != 1 {
-			/*
-				q = q * d
-			*/
-			internal_mul(q, q, d)                                    or_return;
-		}
-
-		/*
-			a = a + q
-		*/
-		internal_add(a, a, q)                                        or_return;
-		if internal_cmp_mag(a, n) == -1                              { break; }
-		internal_sub(a, a, n)                                        or_return;
-	}
-
-	return nil;
-}
-
-/*
-	Reduces `a` modulo `n` where `n` is of the form 2**p - d
-	This differs from reduce_2k since "d" can be larger than a single digit.
-*/
-internal_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	q := &Int{};
-	defer internal_destroy(q);
-
-	internal_zero(q)                                                 or_return;
-
-	p := internal_count_bits(n);
-
-	for {
-		/*
-			q = a/2**p, a = a mod 2**p
-		*/
-		internal_shrmod(q, a, a, p)                                  or_return;
-
-		/*
-			q = q * d
-		*/
-		internal_mul(q, q, d)                                        or_return;
-
-		/*
-			a = a + q
-		*/
-		internal_add(a, a, q)                                        or_return;
-		if internal_cmp_mag(a, n) == -1                              { break; }
-		internal_sub(a, a, n)                                        or_return;
-	}
-
-	return nil;
-}
-
-/*
-	Determines if `internal_int_reduce_2k` can be used.
-	Asssumes `a` not to be `nil` and to have been initialized.
-*/
-internal_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
-	assert_if_nil(a);
-
-	if internal_is_zero(a) {
-		return false, nil;
-	} else if a.used == 1 {
-		return true, nil;
-	} else if a.used  > 1 {
-		iy := internal_count_bits(a);
-		iw := 1;
-		iz := DIGIT(1);
-
-		/*
-			Test every bit from the second digit up, must be 1.
-		*/
-		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
-			if a.digit[iw] & iz == 0 {
-				return false, nil;
-			}
-
-			iz <<= 1;
-			if iz > _DIGIT_MAX {
-				iw += 1;
-				iz  = 1;
-			}
-		}
-		return true, nil;
-	} else {
-		return true, nil;
-	}
-}
-
-/*
-	Determines if `internal_int_reduce_2k_l` can be used.
-	Asssumes `a` not to be `nil` and to have been initialized.
-*/
-internal_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
-	assert_if_nil(a);
-
-	if internal_int_is_zero(a) {
-		return false, nil;
-	} else if a.used == 1 {
-		return true, nil;
-	} else if a.used  > 1 {
-		/*
-			If more than half of the digits are -1 we're sold.
-		*/
-		ix := 0;
-		iy := 0;
-
-		for ; ix < a.used; ix += 1 {
-			if a.digit[ix] == _DIGIT_MAX {
-				iy += 1;
-			}
-		}
-		return iy >= (a.used / 2), nil;
-	} else {
-		return false, nil;
-	}
-}
-
-/*
-	Determines the setup value.
-	Assumes `a` is not `nil`.
-*/
-internal_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
-	context.allocator = allocator;
-
-	tmp := &Int{};
-	defer internal_destroy(tmp);
-	internal_zero(tmp)                                               or_return;
-
-	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
-	internal_sub(tmp, tmp, a)                                        or_return;
-
-	return tmp.digit[0], nil;
-}
-
-/*
-	Determines the setup value.
-	Assumes `mu` and `P` are not `nil`.
-
-	d := (1 << a.bits) - a;
-*/
-internal_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	tmp := &Int{};
-	defer internal_destroy(tmp);
-	internal_zero(tmp)                                               or_return;
-
-	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;
-	internal_sub(mu, tmp, P)                                         or_return;
-
-	return nil;
-}
-
-/*
-	Pre-calculate the value required for Barrett reduction.
-	For a given modulus "P" it calulates the value required in "mu"
-	Assumes `mu` and `P` are not `nil`.
-*/
-internal_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;
-	return internal_int_div(mu, mu, P);
-}
-
-/*
 	Computes res == G**X mod P.
 	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
 */
-internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
-	M := [_TAB_SIZE]Int{};
-	winsize: uint;
-
-	/*
-		Use a pointer to the reduction algorithm.
-		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
-	*/
-	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
-
-	defer {
-		internal_destroy(&M[1]);
-		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-			internal_destroy(&M[x]);
-		}
-	}
-
-	/*
-		Find window size.
-	*/
-	x := internal_count_bits(X);
-	switch {
-	case x <= 7:
-		winsize = 2;
-	case x <= 36:
-		winsize = 3;
-	case x <= 140:
-		winsize = 4;
-	case x <= 450:
-		winsize = 5;
-	case x <= 1303:
-		winsize = 6;
-	case x <= 3529:
-		winsize = 7;
-	case:
-		winsize = 8;
-	}
-
-	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
-
-	/*
-		Init M array.
-		Init first cell.
-	*/
-	internal_zero(&M[1])                                             or_return;
-
-	/*
-		Now init the second half of the array.
-	*/
-	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-		internal_zero(&M[x])                                         or_return;
-	}
-
-	/*
-		Create `mu`, used for Barrett reduction.
-	*/
-	mu := &Int{};
-	defer internal_destroy(mu);
-	internal_zero(mu)                                                or_return;
-
-	if redmode == 0 {
-		internal_int_reduce_setup(mu, P)                             or_return;
-		redux = internal_int_reduce;
-	} else {
-		internal_int_reduce_2k_setup_l(mu, P)                        or_return;
-		redux = internal_int_reduce_2k_l;
-	}
-
-	/*
-		Create M table.
-
-		The M table contains powers of the base, e.g. M[x] = G**x mod P.
-		The first half of the table is not computed, though, except for M[0] and M[1].
-	*/
-	internal_int_mod(&M[1], G, P)                                    or_return;
-
-	/*
-		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
-
-		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
-		instead of repeated squaring.
-	*/
-	slot := 1 << (winsize - 1);
-	internal_copy(&M[slot], &M[1])                                   or_return;
-
-	for x = 0; x < int(winsize - 1); x += 1 {
-		/*
-			Square it.
-		*/
-		internal_sqr(&M[slot], &M[slot])                             or_return;
-
-		/*
-			Reduce modulo P
-		*/
-		redux(&M[slot], P, mu)                                       or_return;
-	}
-
-	/*
-		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-	*/
-	for x = slot + 1; x < (1 << winsize); x += 1 {
-		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
-		redux(&M[x], P, mu)                                          or_return;
-	}
-
-	/*
-		Setup result.
-	*/
-	internal_one(res)                                                or_return;
-
-	/*
-		Set initial mode and bit cnt.
-	*/
-	mode   := 0;
-	bitcnt := 1;
-	buf    := DIGIT(0);
-	digidx := X.used - 1;
-	bitcpy := uint(0);
-	bitbuf := DIGIT(0);
-
-	for {
-		/*
-			Grab next digit as required.
-		*/
-		bitcnt -= 1;
-		if bitcnt == 0 {
-			/*
-				If digidx == -1 we are out of digits.
-			*/
-			if digidx == -1 { break; }
-
-			/*
-				Read next digit and reset the bitcnt.
-			*/
-			buf    = X.digit[digidx];
-			digidx -= 1;
-			bitcnt = _DIGIT_BITS;
-		}
-
-		/*
-			Grab the next msb from the exponent.
-		*/
-		y := buf >> (_DIGIT_BITS - 1) & 1;
-		buf <<= 1;
-
-		/*
-			If the bit is zero and mode == 0 then we ignore it.
-			These represent the leading zero bits before the first 1 bit
-			in the exponent.  Technically this opt is not required but it
-			does lower the # of trivial squaring/reductions used.
-		*/
-		if mode == 0 && y == 0 {
-			continue;
-		}
-
-		/*
-			If the bit is zero and mode == 1 then we square.
-		*/
-		if mode == 1 && y == 0 {
-			internal_sqr(res, res)                                   or_return;
-			redux(res, P, mu)                                        or_return;
-			continue;
-		}
-
-		/*
-			Else we add it to the window.
-		*/
-		bitcpy += 1;
-		bitbuf |= (y << (winsize - bitcpy));
-		mode    = 2;
-
-		if (bitcpy == winsize) {
-			/*
-				Window is filled so square as required and multiply.
-				Square first.
-			*/
-			for x = 0; x < int(winsize); x += 1 {
-				internal_sqr(res, res)                               or_return;
-				redux(res, P, mu)                                    or_return;
-			}
-
-			/*
-				Then multiply.
-			*/
-			internal_mul(res, res, &M[bitbuf])                       or_return;
-			redux(res, P, mu)                                        or_return;
-
-			/*
-				Empty window and reset.
-			*/
-			bitcpy = 0;
-			bitbuf = 0;
-			mode   = 1;
-		}
-	}
-
-	/*
-		If bits remain then square/multiply.
-	*/
-	if mode == 2 && bitcpy > 0 {
-		/*
-			Square then multiply if the bit is set.
-		*/
-		for x = 0; x < int(bitcpy); x += 1 {
-			internal_sqr(res, res)                                   or_return;
-			redux(res, P, mu)                                        or_return;
-
-			bitbuf <<= 1;
-			if ((bitbuf & (1 << winsize)) != 0) {
-				/*
-					Then multiply.
-				*/
-				internal_mul(res, res, &M[1])                        or_return;
-				redux(res, P, mu)                                    or_return;
-			}
-		}
-	}
-	return err;
-}
-
 /*
-	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+	int dr;
 
-	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
-	The value of `k` changes based on the size of the exponent.
-
-	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
-
-	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
-*/
-internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	M := [_TAB_SIZE]Int{};
-	winsize: uint;
-
-	/*
-		Use a pointer to the reduction algorithm.
-		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
-	*/
-	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
-
-	defer {
-		internal_destroy(&M[1]);
-		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-			internal_destroy(&M[x]);
-		}
+	/* modulus P must be positive */
+	if (mp_isneg(P)) {
+		return MP_VAL;
 	}
 
-	/*
-		Find window size.
-	*/
-	x := internal_count_bits(X);
-	switch {
-	case x <= 7:
-		winsize = 2;
-	case x <= 36:
-		winsize = 3;
-	case x <= 140:
-		winsize = 4;
-	case x <= 450:
-		winsize = 5;
-	case x <= 1303:
-		winsize = 6;
-	case x <= 3529:
-		winsize = 7;
-	case:
-		winsize = 8;
-	}
+	/* if exponent X is negative we have to recurse */
+	if (mp_isneg(X)) {
+		mp_int tmpG, tmpX;
+		mp_err err;
 
-	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
-
-	/*
-		Init M array
-		Init first cell.
-	*/
-	cap := internal_int_allocated_cap(P);
-	internal_grow(&M[1], cap)                                        or_return;
-
-	/*
-		Now init the second half of the array.
-	*/
-	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-		internal_grow(&M[x], cap)                                    or_return;
-	}
-
-	/*
-		Determine and setup reduction code.
-	*/
-	rho: DIGIT;
-
-	if redmode == 0 {
-		/*
-			Now setup Montgomery.
-		*/
-		rho = internal_int_montgomery_setup(P)                       or_return;
-
-		/*
-			Automatically pick the comba one if available (saves quite a few calls/ifs).
-		*/
-		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
-			redux = _private_montgomery_reduce_comba;
-		} else {
-			/*
-				Use slower baseline Montgomery method.
-			*/
-			redux = internal_int_montgomery_reduce;
-		}
-	} else if redmode == 1 {
-		/*
-			Setup DR reduction for moduli of the form B**k - b.
-		*/
-		rho = internal_int_dr_setup(P);
-		redux = internal_int_dr_reduce;
-	} else {
-		/*
-			Setup DR reduction for moduli of the form 2**k - b.
-		*/
-		rho = internal_int_reduce_2k_setup(P)                        or_return;
-		redux = internal_int_reduce_2k;
-	}
-
-	/*
-		Setup result.
-	*/
-	internal_grow(res, cap)                                          or_return;
-
-	/*
-		Create M table
-		The first half of the table is not computed, though, except for M[0] and M[1]
-	*/
-
-	if redmode == 0 {
-		/*
-			Now we need R mod m.
-		*/
-		internal_int_montgomery_calc_normalization(res, P)           or_return;
-
-		/*
-			Now set M[1] to G * R mod m.
-		*/
-		internal_mulmod(&M[1], G, res, P)                            or_return;
-	} else {
-		internal_one(res)                                            or_return;
-		internal_mod(&M[1], G, P)                                    or_return;
-	}
-
-	/*
-		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
-	*/
-	slot := 1 << (winsize - 1);
-	internal_copy(&M[slot], &M[1])                                   or_return;
-
-	for x = 0; x < int(winsize - 1); x += 1 {
-		internal_sqr(&M[slot], &M[slot])                             or_return;
-		redux(&M[slot], P, rho)                                      or_return;
-	}
-
-	/*
-		Create upper table.
-	*/
-	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
-		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
-		redux(&M[x], P, rho)                                         or_return;
-	}
-
-	/*
-		Set initial mode and bit cnt.
-	*/
-	mode   := 0;
-	bitcnt := 1;
-	buf    := DIGIT(0);
-	digidx := X.used - 1;
-	bitcpy := 0;
-	bitbuf := DIGIT(0);
-
-	for {
-		/*
-			Grab next digit as required.
-		*/
-		bitcnt -= 1;
-		if bitcnt == 0 {
-			/*
-				If digidx == -1 we are out of digits so break.
-			*/
-			if digidx == -1 { break; }
-
-			/*
-				Read next digit and reset the bitcnt.
-			*/
-			buf    = X.digit[digidx];
-			digidx -= 1;
-			bitcnt = _DIGIT_BITS;
+		if (!MP_HAS(MP_INVMOD)) {
+			return MP_VAL;
 		}
 
-		/*
-			Grab the next msb from the exponent.
-		*/
-		y := (buf >> (_DIGIT_BITS - 1)) & 1;
-		buf <<= 1;
-
-		/*
-			If the bit is zero and mode == 0 then we ignore it.
-			These represent the leading zero bits before the first 1 bit in the exponent.
-			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
-		*/
-		if mode == 0 && y == 0 { continue; }
-
-		/*
-			If the bit is zero and mode == 1 then we square.
-		*/
-		if mode == 1 && y == 0 {
-			internal_sqr(res, res)                                   or_return;
-			redux(res, P, rho)                                       or_return;
-			continue;
+		if ((err = mp_init_multi(&tmpG, &tmpX, NULL)) != MP_OKAY) {
+			return err;
 		}
 
-		/*
-			Else we add it to the window.
-		*/
-		bitcpy += 1;
-		bitbuf |= (y << (winsize - uint(bitcpy)));
-		mode    = 2;
-
-		if bitcpy == int(winsize) {
-			/*
-				Window is filled so square as required and multiply
-				Square first.
-			*/
-			for x = 0; x < int(winsize); x += 1 {
-				internal_sqr(res, res)                               or_return;
-				redux(res, P, rho)                                   or_return;
-			}
-
-			/*
-				Then multiply.
-			*/
-			internal_mul(res, res, &M[bitbuf])                       or_return;
-			redux(res, P, rho)                                       or_return;
-
-			/*
-				Empty window and reset.
-			*/
-			bitcpy = 0;
-			bitbuf = 0;
-			mode   = 1;
-		}
-	}
-
-	/*
-		If bits remain then square/multiply.
-	*/
-	if mode == 2 && bitcpy > 0 {
-		/*
-			Square then multiply if the bit is set.
-		*/
-		for x = 0; x < bitcpy; x += 1 {
-			internal_sqr(res, res)                                   or_return;
-			redux(res, P, rho)                                       or_return;
-
-			/*
-				Get next bit of the window.
-			*/
-			bitbuf <<= 1;
-			if bitbuf & (1 << winsize) != 0 {
-				/*
-					Then multiply.
-				*/
-				internal_mul(res, res, &M[1])                        or_return;
-				redux(res, P, rho)                                   or_return;
-			}
-		}
-	}
-
-	if redmode == 0 {
-		/*
-			Fixup result if Montgomery reduction is used.
-			Recall that any value in a Montgomery system is actually multiplied by R mod n.
-			So we have to reduce one more time to cancel out the factor of R.
-		*/
-		redux(res, P, rho)                                           or_return;
-	}
-
-	return nil;
-}
-
-/*
-	Determines the setup value.
-	Assumes `a` to not be `nil` and to have been initialized.
-*/
-internal_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
-	/*
-		The casts are required if _DIGIT_BITS is one less than
-		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
-	*/
-	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);
-}
-
-/*
-	Determines if a number is a valid DR modulus.
-	Assumes `a` to not be `nil` and to have been initialized.
-*/
-internal_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
-	/*
-		Must be at least two digits.
-	*/
-	if a.used < 2 { return false; }
-
-	/*
-		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
-	*/
-	for ix := 1; ix < a.used; ix += 1 {
-		if a.digit[ix] != _MASK {
-			return false;
-		}
-	}
-	return true;
-}
-
-/*
-	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
-	Based on algorithm from the paper
-
-		"Generating Efficient Primes for Discrete Log Cryptosystems"
-					Chae Hoon Lim, Pil Joong Lee,
-			POSTECH Information Research Laboratories
-
-	The modulus must be of a special format [see manual].
-	Has been modified to use algorithm 7.10 from the LTM book instead
-
-	Input x must be in the range 0 <= x <= (n-1)**2
-	Assumes `x` and `n` to not be `nil` and to have been initialized.
-*/
-internal_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
-	/*
-		m = digits in modulus.
-	*/
-	m := n.used;
-
-	/*
-		Ensure that "x" has at least 2m digits.
-	*/
-	internal_grow(x, m + m)                                          or_return;
-
-	/*
-		Top of loop, this is where the code resumes if another reduction pass is required.
-	*/
-	for {
-		i: int;
-		mu := DIGIT(0);
-
-		/*
-			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
-		*/
-		for i = 0; i < m; i += 1 {
-			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);
-			x.digit[i] = DIGIT(r & _WORD(_MASK));
-			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));
+		/* first compute 1/G mod P */
+		if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
+			goto LBL_ERR;
 		}
 
-		/*
-			Set final carry.
-		*/
-		x.digit[i] = mu;
+		/* now get |X| */
+		if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
+			goto LBL_ERR;
+		}
 
-		/*
-			Zero words above m.
-		*/
-		mem.zero_slice(x.digit[m + 1:][:x.used - m]);
-
-		/*
-			Clamp, sub and return.
-		*/
-		internal_clamp(x)                                            or_return;
-
-		/*
-			If x >= n then subtract and reduce again.
-			Each successive "recursion" makes the input smaller and smaller.
-		*/
-		if internal_cmp_mag(x, n) == -1 { break; }
-
-		internal_sub(x, x, n)                                        or_return;
+		/* and now compute (1/G)**|X| instead of G**X [X < 0] */
+		err = mp_exptmod(&tmpG, &tmpX, P, Y);
+LBL_ERR:
+		mp_clear_multi(&tmpG, &tmpX, NULL);
+		return err;
 	}
+
+	/* modified diminished radix reduction */
+	if (MP_HAS(MP_REDUCE_IS_2K_L) && MP_HAS(MP_REDUCE_2K_L) && MP_HAS(S_MP_EXPTMOD) &&
+		 mp_reduce_is_2k_l(P)) {
+		return s_mp_exptmod(G, X, P, Y, 1);
+	}
+
+	/* is it a DR modulus? default to no */
+	dr = (MP_HAS(MP_DR_IS_MODULUS) && mp_dr_is_modulus(P)) ? 1 : 0;
+
+	/* if not, is it a unrestricted DR modulus? */
+	if (MP_HAS(MP_REDUCE_IS_2K) && (dr == 0)) {
+		dr = (mp_reduce_is_2k(P)) ? 2 : 0;
+	}
+
+	/* if the modulus is odd or dr != 0 use the montgomery method */
+	if (MP_HAS(S_MP_EXPTMOD_FAST) && (mp_isodd(P) || (dr != 0))) {
+		return s_mp_exptmod_fast(G, X, P, Y, dr);
+	}
+
+	/* otherwise use the generic Barrett reduction technique */
+	if (MP_HAS(S_MP_EXPTMOD)) {
+		return s_mp_exptmod(G, X, P, Y, 0);
+	}
+
+	/* no exptmod for evens */
+	return MP_VAL;
+
+	*/
 	return nil;
 }
 
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 552b100cf..d2878fcc1 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1850,6 +1850,1015 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	return nil;
 }
 
+/*
+	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	Assumes `x` and `n` not to be nil.
+*/
+_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+	/*
+		Can the fast reduction [comba] method be used?
+		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
+		since carries are fixed up in the inner loop.
+	*/
+	internal_clear_if_uninitialized(x, n) or_return;
+
+	digs := (n.used * 2) + 1;
+	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
+		return _private_montgomery_reduce_comba(x, n, rho);
+	}
+
+	/*
+		Grow the input as required
+	*/
+	internal_grow(x, digs)                                           or_return;
+	x.used = digs;
+
+	for ix := 0; ix < n.used; ix += 1 {
+		/*
+			`mu = ai * rho mod b`
+			The value of rho must be precalculated via `int_montgomery_setup()`,
+			such that it equals -1/n0 mod b this allows the following inner loop
+			to reduce the input one digit at a time.
+		*/
+
+		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK));
+
+		/*
+			a = a + mu * m * b**i
+			Multiply and add in place.
+		*/
+		u  := DIGIT(0);
+		iy := int(0);
+		for ; iy < n.used; iy += 1 {
+			/*
+				Compute product and sum.
+			*/
+			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]));
+
+			/*
+				Get carry.
+			*/
+			u = DIGIT(r >> _DIGIT_BITS);
+
+			/*
+				Fix digit.
+			*/
+			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
+		}
+
+		/*
+			At this point the ix'th digit of x should be zero.
+			Propagate carries upwards as required.
+		*/
+		for u != 0 {
+			x.digit[ix + iy] += u;
+			u = x.digit[ix + iy] >> _DIGIT_BITS;
+			x.digit[ix + iy] &= _MASK;
+			iy += 1;
+		}
+	}
+
+	/*
+		At this point the n.used'th least significant digits of x are all zero,
+		which means we can shift x to the right by n.used digits and the
+		residue is unchanged.
+
+		x = x/b**n.used.
+	*/
+	internal_clamp(x);
+	internal_shr_digit(x, n.used);
+
+	/*
+		if x >= n then x = x - n
+	*/
+	if internal_cmp_mag(x, n) != -1 {
+		return internal_sub(x, x, n);
+	}
+
+	return nil;
+}
+
+/*
+	Shifts with subtractions when the result is greater than b.
+
+	The method is slightly modified to shift B unconditionally upto just under
+	the leading bit of b.  This saves alot of multiple precision shifting.
+
+	Assumes `a` and `b` not to be `nil`.
+*/
+_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+	/*
+		How many bits of last digit does b use.
+	*/
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	bits := internal_count_bits(b) % _DIGIT_BITS;
+
+	if b.used > 1 {
+		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
+		internal_int_power_of_two(a, power)                          or_return;
+	} else {
+		internal_one(a)                                              or_return;
+		bits = 1;
+	}
+
+	/*
+		Now compute C = A * B mod b.
+	*/
+	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
+		internal_int_shl1(a, a)                                      or_return;
+		if internal_cmp_mag(a, b) != -1 {
+			internal_sub(a, a, b)                                    or_return;
+		}
+	}
+	return nil;
+}
+
+/*
+	Sets up the Montgomery reduction stuff.
+*/
+_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
+	/*
+		Fast inversion mod 2**k
+		Based on the fact that:
+
+		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
+		                  =>  2*X*A - X*X*A*A = 1
+		                  =>  2*(1) - (1)     = 1
+	*/
+	internal_clear_if_uninitialized(n, allocator) or_return;
+
+	b := n.digit[0];
+	if b & 1 == 0 { return 0, .Invalid_Argument; }
+
+	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
+	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
+
+	when _DIGIT_TYPE_BITS == 64 {
+		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
+		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
+	}
+
+	/*
+		rho = -1/m mod b
+	*/
+	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK));
+	return rho, nil;
+}
+
+/*
+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
+	From HAC pp.604 Algorithm 14.42
+
+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
+*/
+_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+	um := m.used;
+
+	/*
+		q = x
+	*/
+	internal_copy(q, x)                                              or_return;
+
+	/*
+		q1 = x / b**(k-1)
+	*/
+	internal_shr_digit(q, um - 1);
+
+	/*
+		According to HAC this optimization is ok.
+	*/
+	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
+		internal_mul(q, q, mu)                                       or_return;
+	} else {
+		_private_int_mul_high(q, q, mu, um)                          or_return;
+	}
+
+	/*
+		q3 = q2 / b**(k+1)
+	*/
+	internal_shr_digit(q, um + 1);
+
+	/*
+		x = x mod b**(k+1), quick (no division)
+	*/
+	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;
+
+	/*
+		q = q * m mod b**(k+1), quick (no division)
+	*/
+	_private_int_mul(q, q, m, um + 1)                                or_return;
+
+	/*
+		x = x - q
+	*/
+	internal_sub(x, x, q)                                            or_return;
+
+	/*
+		If x < 0, add b**(k+1) to it.
+	*/
+	if internal_cmp(x, 0) == -1 {
+		internal_set(q, 1)                                           or_return;
+		internal_shl_digit(q, um + 1)                                or_return;
+		internal_add(x, x, q)                                        or_return;
+	}
+
+	/*
+		Back off if it's too big.
+	*/
+	for internal_cmp(x, m) != -1 {
+		internal_sub(x, x, m)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
+*/
+_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		if d != 1 {
+			/*
+				q = q * d
+			*/
+			internal_mul(q, q, d)                                    or_return;
+		}
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_cmp_mag(a, n) == -1                              { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n` where `n` is of the form 2**p - d
+	This differs from reduce_2k since "d" can be larger than a single digit.
+*/
+_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		/*
+			q = q * d
+		*/
+		internal_mul(q, q, d)                                        or_return;
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_cmp_mag(a, n) == -1                              { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Determines if `internal_int_reduce_2k` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		iy := internal_count_bits(a);
+		iw := 1;
+		iz := DIGIT(1);
+
+		/*
+			Test every bit from the second digit up, must be 1.
+		*/
+		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
+			if a.digit[iw] & iz == 0 {
+				return false, nil;
+			}
+
+			iz <<= 1;
+			if iz > _DIGIT_MAX {
+				iw += 1;
+				iz  = 1;
+			}
+		}
+		return true, nil;
+	} else {
+		return true, nil;
+	}
+}
+
+/*
+	Determines if `internal_int_reduce_2k_l` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_int_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		/*
+			If more than half of the digits are -1 we're sold.
+		*/
+		ix := 0;
+		iy := 0;
+
+		for ; ix < a.used; ix += 1 {
+			if a.digit[ix] == _DIGIT_MAX {
+				iy += 1;
+			}
+		}
+		return iy >= (a.used / 2), nil;
+	} else {
+		return false, nil;
+	}
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
+	internal_sub(tmp, tmp, a)                                        or_return;
+
+	return tmp.digit[0], nil;
+}
+
+/*
+	Determines the setup value.
+	Assumes `mu` and `P` are not `nil`.
+
+	d := (1 << a.bits) - a;
+*/
+_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;
+	internal_sub(mu, tmp, P)                                         or_return;
+
+	return nil;
+}
+
+/*
+	Pre-calculate the value required for Barrett reduction.
+	For a given modulus "P" it calulates the value required in "mu"
+	Assumes `mu` and `P` are not `nil`.
+*/
+_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;
+	return internal_int_div(mu, mu, P);
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
+	/*
+		The casts are required if _DIGIT_BITS is one less than
+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
+	*/
+	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);
+}
+
+/*
+	Determines if a number is a valid DR modulus.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
+	/*
+		Must be at least two digits.
+	*/
+	if a.used < 2 { return false; }
+
+	/*
+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
+	*/
+	for ix := 1; ix < a.used; ix += 1 {
+		if a.digit[ix] != _MASK {
+			return false;
+		}
+	}
+	return true;
+}
+
+/*
+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+	Based on algorithm from the paper
+
+		"Generating Efficient Primes for Discrete Log Cryptosystems"
+					Chae Hoon Lim, Pil Joong Lee,
+			POSTECH Information Research Laboratories
+
+	The modulus must be of a special format [see manual].
+	Has been modified to use algorithm 7.10 from the LTM book instead
+
+	Input x must be in the range 0 <= x <= (n-1)**2
+	Assumes `x` and `n` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
+	/*
+		m = digits in modulus.
+	*/
+	m := n.used;
+
+	/*
+		Ensure that "x" has at least 2m digits.
+	*/
+	internal_grow(x, m + m)                                          or_return;
+
+	/*
+		Top of loop, this is where the code resumes if another reduction pass is required.
+	*/
+	for {
+		i: int;
+		mu := DIGIT(0);
+
+		/*
+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
+		*/
+		for i = 0; i < m; i += 1 {
+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);
+			x.digit[i] = DIGIT(r & _WORD(_MASK));
+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));
+		}
+
+		/*
+			Set final carry.
+		*/
+		x.digit[i] = mu;
+
+		/*
+			Zero words above m.
+		*/
+		mem.zero_slice(x.digit[m + 1:][:x.used - m]);
+
+		/*
+			Clamp, sub and return.
+		*/
+		internal_clamp(x)                                            or_return;
+
+		/*
+			If x >= n then subtract and reduce again.
+			Each successive "recursion" makes the input smaller and smaller.
+		*/
+		if internal_cmp_mag(x, n) == -1 { break; }
+
+		internal_sub(x, x, n)                                        or_return;
+	}
+	return nil;
+}
+
+/*
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array.
+		Init first cell.
+	*/
+	internal_zero(&M[1])                                             or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_zero(&M[x])                                         or_return;
+	}
+
+	/*
+		Create `mu`, used for Barrett reduction.
+	*/
+	mu := &Int{};
+	defer internal_destroy(mu);
+	internal_zero(mu)                                                or_return;
+
+	if redmode == 0 {
+		_private_int_reduce_setup(mu, P)                             or_return;
+		redux = _private_int_reduce;
+	} else {
+		_private_int_reduce_2k_setup_l(mu, P)                        or_return;
+		redux = _private_int_reduce_2k_l;
+	}
+
+	/*
+		Create M table.
+
+		The M table contains powers of the base, e.g. M[x] = G**x mod P.
+		The first half of the table is not computed, though, except for M[0] and M[1].
+	*/
+	internal_int_mod(&M[1], G, P)                                    or_return;
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+
+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
+		instead of repeated squaring.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		/*
+			Square it.
+		*/
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+
+		/*
+			Reduce modulo P
+		*/
+		redux(&M[slot], P, mu)                                       or_return;
+	}
+
+	/*
+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+	*/
+	for x = slot + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, mu)                                          or_return;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_one(res)                                                or_return;
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := uint(0);
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := buf >> (_DIGIT_BITS - 1) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit
+			in the exponent.  Technically this opt is not required but it
+			does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 {
+			continue;
+		}
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - bitcpy));
+		mode    = 2;
+
+		if (bitcpy == winsize) {
+			/*
+				Window is filled so square as required and multiply.
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, mu)                                        or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < int(bitcpy); x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+
+			bitbuf <<= 1;
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+		}
+	}
+	return err;
+}
+
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array
+		Init first cell.
+	*/
+	cap := internal_int_allocated_cap(P);
+	internal_grow(&M[1], cap)                                        or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return;
+	}
+
+	/*
+		Determine and setup reduction code.
+	*/
+	rho: DIGIT;
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = _private_int_montgomery_setup(P)                       or_return;
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba;
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = _private_int_montgomery_reduce;
+		}
+	} else if redmode == 1 {
+		/*
+			Setup DR reduction for moduli of the form B**k - b.
+		*/
+		rho = _private_int_dr_setup(P);
+		redux = _private_int_dr_reduce;
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = _private_int_reduce_2k_setup(P)                        or_return;
+		redux = _private_int_reduce_2k;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return;
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		_private_int_montgomery_calc_normalization(res, P)           or_return;
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return;
+	} else {
+		internal_one(res)                                            or_return;
+		internal_mod(&M[1], G, P)                                    or_return;
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+		redux(&M[slot], P, rho)                                      or_return;
+	}
+
+	/*
+		Create upper table.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, rho)                                         or_return;
+	}
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := 0;
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue; }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - uint(bitcpy)));
+		mode    = 2;
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1;
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return;
+	}
+
+	return nil;
+}
+
 /*
 	hac 14.61, pp608
 */

From 7d7ed6b95f50ec36e6ed5ad6bded466ca75de26e Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 12:33:33 +0200
Subject: [PATCH 08/13] big: Add `internal_int_exponent_mod`.

---
 core/math/big/example.odin |   2 +-
 core/math/big/prime.odin   | 127 ++++++++++++++++++-------------------
 2 files changed, 63 insertions(+), 66 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index de0aed468..64ac30424 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -218,7 +218,7 @@ demo :: proc() {
 	set(b, 6);
 	set(c, 131);
 
-	if err := _private_int_exponent_mod(res, a, b, c, 1); err != nil {
+	if err := internal_int_exponent_mod(res, a, b, c); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index edd6d9f3d..85a7b931f 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -43,73 +43,70 @@ internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator)
 internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
-/*
-	int dr;
-
-	/* modulus P must be positive */
-	if (mp_isneg(P)) {
-		return MP_VAL;
-	}
-
-	/* if exponent X is negative we have to recurse */
-	if (mp_isneg(X)) {
-		mp_int tmpG, tmpX;
-		mp_err err;
-
-		if (!MP_HAS(MP_INVMOD)) {
-			return MP_VAL;
-		}
-
-		if ((err = mp_init_multi(&tmpG, &tmpX, NULL)) != MP_OKAY) {
-			return err;
-		}
-
-		/* first compute 1/G mod P */
-		if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
-			goto LBL_ERR;
-		}
-
-		/* now get |X| */
-		if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
-			goto LBL_ERR;
-		}
-
-		/* and now compute (1/G)**|X| instead of G**X [X < 0] */
-		err = mp_exptmod(&tmpG, &tmpX, P, Y);
-LBL_ERR:
-		mp_clear_multi(&tmpG, &tmpX, NULL);
-		return err;
-	}
-
-	/* modified diminished radix reduction */
-	if (MP_HAS(MP_REDUCE_IS_2K_L) && MP_HAS(MP_REDUCE_2K_L) && MP_HAS(S_MP_EXPTMOD) &&
-		 mp_reduce_is_2k_l(P)) {
-		return s_mp_exptmod(G, X, P, Y, 1);
-	}
-
-	/* is it a DR modulus? default to no */
-	dr = (MP_HAS(MP_DR_IS_MODULUS) && mp_dr_is_modulus(P)) ? 1 : 0;
-
-	/* if not, is it a unrestricted DR modulus? */
-	if (MP_HAS(MP_REDUCE_IS_2K) && (dr == 0)) {
-		dr = (mp_reduce_is_2k(P)) ? 2 : 0;
-	}
-
-	/* if the modulus is odd or dr != 0 use the montgomery method */
-	if (MP_HAS(S_MP_EXPTMOD_FAST) && (mp_isodd(P) || (dr != 0))) {
-		return s_mp_exptmod_fast(G, X, P, Y, dr);
-	}
-
-	/* otherwise use the generic Barrett reduction technique */
-	if (MP_HAS(S_MP_EXPTMOD)) {
-		return s_mp_exptmod(G, X, P, Y, 0);
-	}
-
-	/* no exptmod for evens */
-	return MP_VAL;
+	dr: int;
 
+	/*
+		Modulus P must be positive.
 	*/
-	return nil;
+	if internal_is_negative(P) { return .Invalid_Argument; }
+
+	/*
+		If exponent X is negative we have to recurse.
+	*/
+	if internal_is_negative(X) {
+		tmpG, tmpX := &Int{}, &Int{};
+		defer internal_destroy(tmpG, tmpX);
+
+		internal_init_multi(tmpG, tmpX) or_return;
+
+		/*
+			First compute 1/G mod P.
+		*/
+		internal_invmod(tmpG, G, P) or_return;
+
+		/*
+			now get |X|.
+		*/
+		internal_abs(tmpX, X) or_return;
+
+		/*
+			And now compute (1/G)**|X| instead of G**X [X < 0].
+		*/
+		return internal_int_exponent_mod(res, tmpG, tmpX, P);
+	}
+
+	/*
+		Modified diminished radix reduction.
+	*/
+	can_reduce_2k_l := _private_int_reduce_is_2k_l(P) or_return;
+	if can_reduce_2k_l {
+		return _private_int_exponent_mod(res, G, X, P, 1);
+	}
+
+	/*
+		Is it a DR modulus? default to no.
+	*/
+	dr = 1 if _private_dr_is_modulus(P) else 0;
+
+	/*
+		If not, is it a unrestricted DR modulus?
+	*/
+	if dr == 0 {
+		reduce_is_2k := _private_int_reduce_is_2k(P) or_return;
+		dr = 2 if reduce_is_2k else 0;
+	}
+
+	/*
+		If the modulus is odd or dr != 0 use the montgomery method.
+	*/
+	if internal_int_is_odd(P) || dr != 0 {
+		return _private_int_exponent_mod(res, G, X, P, dr);
+	}
+
+	/*
+		Otherwise use the generic Barrett reduction technique.
+	*/
+	return _private_int_exponent_mod(res, G, X, P, 0);
 }
 
 /*

From fd83cbf40bad98e7d50d75bee2cdf4a5d3d54921 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 14:36:15 +0200
Subject: [PATCH 09/13] big: Add `ilog2`.

---
 core/math/big/example.odin | 10 ----------
 core/math/big/private.odin |  2 +-
 core/math/big/public.odin  |  6 ++++++
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 64ac30424..449a851d1 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -213,16 +213,6 @@ int_to_byte_little :: proc(v: ^Int) {
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
-
-	set(a, 42);
-	set(b, 6);
-	set(c, 131);
-
-	if err := internal_int_exponent_mod(res, a, b, c); err != nil {
-		fmt.printf("Error: %v\n", err);
-	}
-
-	print("res: ", res);
 }
 
 main :: proc() {
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index d2878fcc1..72f5bb9e2 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1437,7 +1437,7 @@ _private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := cont
 	internal_one(inner, false)                                       or_return;
 	internal_one(outer, false)                                       or_return;
 
-	bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n));
+	bits_used := ilog2(n);
 
 	for i := bits_used; i >= 0; i -= 1 {
 		start := (n >> (uint(i) + 1)) + 1 | 1;
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index d69b3ba22..68c574905 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -10,6 +10,8 @@
 */
 package math_big
 
+import "core:intrinsics"
+
 /*
 	===========================
 		User-level routines    
@@ -383,6 +385,10 @@ digit_log :: proc(a: DIGIT, base: DIGIT) -> (log: int, err: Error) {
 }
 log :: proc { int_log, digit_log, };
 
+ilog2 :: proc(value: $T) -> (log2: T) {
+	return (size_of(T) * 8) - intrinsics.count_leading_zeros(value);
+}
+
 /*
 	Calculate `dest = base^power` using a square-multiply algorithm.
 */

From df29d1021058624664ea8ec8fac0d01d9d45b97a Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 15:57:08 +0200
Subject: [PATCH 10/13] big: Add `internal_int_kronecker`.

---
 core/math/big/example.odin | 129 -------------------------------------
 core/math/big/prime.odin   |  95 +++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 129 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 449a851d1..4da2ebbe9 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -79,135 +79,6 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-int_to_byte :: proc(v: ^Int) {
-	err: Error;
-	size: int;
-	print("v: ", v);
-	fmt.println();
-
-	t := &Int{};
-	defer destroy(t);
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b1 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big(v, b1);
-	int_from_bytes_big(t, b1);
-	fmt.printf("big: %v | err: %v\n", b1, err);
-
-	int_from_bytes_big(t, b1);
-	if internal_cmp_mag(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b2 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big_python(v, b2);
-	fmt.printf("big python: %v | err: %v\n", b2, err);
-
-	if err == nil {
-		int_from_bytes_big_python(t, b2);
-		if internal_cmp_mag(t, v) != 0 {
-			print("\tError parsing t: ", t);
-		}
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b3 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big(v, b3, true);
-	fmt.printf("big signed: %v | err: %v\n", b3, err);
-
-	int_from_bytes_big(t, b3, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b4 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big_python(v, b4, true);
-	fmt.printf("big signed python: %v | err: %v\n", b4, err);
-
-	int_from_bytes_big_python(t, b4, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-}
-
-int_to_byte_little :: proc(v: ^Int) {
-	err: Error;
-	size: int;
-	print("v: ", v);
-	fmt.println();
-
-	t := &Int{};
-	defer destroy(t);
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b1 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little(v, b1);
-	fmt.printf("little: %v | err: %v\n", b1, err);
-
-	int_from_bytes_little(t, b1);
-	if internal_cmp_mag(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b2 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little_python(v, b2);
-	fmt.printf("little python: %v | err: %v\n", b2, err);
-
-	if err == nil {
-		int_from_bytes_little_python(t, b2);
-		if internal_cmp_mag(t, v) != 0 {
-			print("\tError parsing t: ", t);
-		}
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b3 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little(v, b3, true);
-	fmt.printf("little signed: %v | err: %v\n", b3, err);
-
-	int_from_bytes_little(t, b3, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b4 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little_python(v, b4, true);
-	fmt.printf("little signed python: %v | err: %v\n", b4, err);
-
-	int_from_bytes_little_python(t, b4, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-}
-
 // printf :: fmt.printf;
 
 demo :: proc() {
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 85a7b931f..182e2bffa 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -109,6 +109,101 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.alloc
 	return _private_int_exponent_mod(res, G, X, P, 0);
 }
 
+/*
+	Kronecker symbol (a|p)
+	Straightforward implementation of algorithm 1.4.10 in
+	Henri Cohen: "A Course in Computational Algebraic Number Theory"
+
+	@book{cohen2013course,
+		title={A course in computational algebraic number theory},
+		author={Cohen, Henri},
+		volume={138},
+		year={2013},
+		publisher={Springer Science \& Business Media}
+	}
+
+	Assumes `a` and `p` to not be `nil` and to have been initialized.
+*/
+internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (kronecker: int, err: Error) {
+	context.allocator = allocator;
+
+	a1, p1, r := &Int{}, &Int{}, &Int{};
+	defer internal_destroy(a1, p1, r);
+
+	table := []int{0, 1, 0, -1, 0, -1, 0, 1};
+
+	if internal_int_is_zero(p) {
+		if a.used == 1 && a.digit[0] == 1 {
+			return 1, nil;
+		} else {
+			return 0, nil;
+		}
+	}
+
+	if internal_is_even(a) && internal_is_even(p) {
+		return 0, nil;
+	}
+
+	internal_copy(a1, a) or_return;
+	internal_copy(p1, p) or_return;
+
+	v := internal_count_lsb(p1) or_return;
+	internal_shr(p1, p1, v) or_return;
+
+	k := 1 if v & 1 == 0 else table[a.digit[0] & 7];
+
+	if internal_is_negative(p1) {
+		p1.sign = .Zero_or_Positive;
+		if internal_is_negative(a1) {
+			k = -k;
+		}
+	}
+
+	internal_zero(r) or_return;
+
+	for {
+		if internal_is_zero(a1) {
+			if internal_cmp(p1, 1) == 0 {
+				return k, nil;
+			} else {
+				return 0, nil;
+			}
+		}
+
+		v = internal_count_lsb(a1) or_return;
+		internal_shr(a1, a1, v) or_return;
+
+		if v & 1 == 1 {
+			k = k * table[p1.digit[0] & 7];
+		}
+
+		if internal_is_negative(a1) {
+			/*
+				Compute k = (-1)^((a1)*(p1-1)/4) * k.
+				a1.digit[0] + 1 cannot overflow because the MSB
+				of the DIGIT type is not set by definition.
+			 */
+			if a1.digit[0] + 1 & p1.digit[0] & 2 != 0 {
+				k = -k;
+			}
+		} else {
+			/*
+				Compute k = (-1)^((a1-1)*(p1-1)/4) * k.
+			*/
+			if a1.digit[0] & p1.digit[0] & 2 != 0 {
+				k = -k;
+			}
+		}
+
+		internal_copy(r, a1) or_return;
+		r.sign = .Zero_or_Positive;
+
+		internal_mod(a1, p1, r) or_return;
+		internal_copy(p1, r)    or_return;
+	}
+	return;
+}
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From 335d361fc6b878f283423b74babab3f5e13b847c Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 18:00:00 +0200
Subject: [PATCH 11/13] big: Add comparison helpers.

---
 core/math/big/internal.odin | 174 +++++++++++++++++++++
 core/math/big/public.odin   | 292 ++++++++++++++++++++++++++++++++++++
 2 files changed, 466 insertions(+)

diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index ca113275c..aa62569e7 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1098,6 +1098,7 @@ internal_is_power_of_two :: proc { internal_int_is_power_of_two, };
 	Expects `a` and `b` both to be valid `Int`s, i.e. initialized and not `nil`.
 */
 internal_int_compare :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
+	assert_if_nil(a, b);
 	a_is_negative := #force_inline internal_is_negative(a);
 
 	/*
@@ -1121,6 +1122,7 @@ internal_cmp :: internal_compare;
 	Expects: `a` and `b` both to be valid `Int`s, i.e. initialized and not `nil`.
 */
 internal_int_compare_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (comparison: int) {
+	assert_if_nil(a);
 	a_is_negative := #force_inline internal_is_negative(a);
 
 	switch {
@@ -1152,6 +1154,7 @@ internal_cmp_digit :: internal_compare_digit;
 	Compare the magnitude of two `Int`s, unsigned.
 */
 internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
+	assert_if_nil(a, b);
 	/*
 		Compare based on used digits.
 	*/
@@ -1179,6 +1182,177 @@ internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison:
 internal_compare_magnitude :: proc { internal_int_compare_magnitude, };
 internal_cmp_mag :: internal_compare_magnitude;
 
+
+/*
+	bool := a < b
+*/
+internal_int_less_than :: #force_inline proc(a, b: ^Int) -> (less_than: bool) {
+	return internal_cmp(a, b) == -1;
+}
+
+/*
+	bool := a < b
+*/
+internal_int_less_than_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (less_than: bool) {
+	return internal_cmp_digit(a, b) == -1;
+}
+
+/*
+	bool := |a| < |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_less_than_abs :: #force_inline proc(a, b: ^Int) -> (less_than: bool) {
+	return internal_cmp_mag(a, b) == -1;
+}
+
+internal_less_than :: proc {
+	internal_int_less_than,
+	internal_int_less_than_digit,
+}
+internal_lt :: internal_less_than;
+
+internal_less_than_abs :: proc {
+	internal_int_less_than_abs,
+}
+internal_lt_abs :: internal_less_than_abs;
+
+
+/*
+	bool := a <= b
+*/
+internal_int_less_than_or_equal :: #force_inline proc(a, b: ^Int) -> (less_than_or_equal: bool) {
+	return internal_cmp(a, b) <= 0;
+}
+
+/*
+	bool := a <= b
+*/
+internal_int_less_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (less_than_or_equal: bool) {
+	return internal_cmp_digit(a, b) <= 0;
+}
+
+/*
+	bool := |a| <= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (less_than_or_equal: bool) {
+	return internal_cmp_mag(a, b) <= 0;
+}
+
+internal_less_than_or_equal :: proc {
+	internal_int_less_than_or_equal,
+	internal_int_less_than_or_equal_digit,
+}
+internal_lteq :: internal_less_than_or_equal;
+
+internal_less_than_or_equal_abs :: proc {
+	internal_int_less_than_or_equal_abs,
+}
+internal_lteq_abs :: internal_less_than_or_equal_abs;
+
+
+/*
+	bool := a == b
+*/
+internal_int_equals :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
+	return internal_cmp(a, b) == 0;
+}
+
+/*
+	bool := a == b
+*/
+internal_int_equals_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (equals: bool) {
+	return internal_cmp_digit(a, b) == 0;
+}
+
+/*
+	bool := |a| == |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_equals_abs :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
+	return internal_cmp_mag(a, b) == 0;
+}
+
+internal_equals :: proc {
+	internal_int_equals,
+	internal_int_equals_digit,
+}
+internal_eq :: internal_equals;
+
+internal_equals_abs :: proc {
+	internal_int_equals_abs,
+}
+internal_eq_abs :: internal_equals_abs;
+
+
+/*
+	bool := a >= b
+*/
+internal_int_greater_than_or_equal :: #force_inline proc(a, b: ^Int) -> (greater_than_or_equal: bool) {
+	return internal_cmp(a, b) >= 0;
+}
+
+/*
+	bool := a >= b
+*/
+internal_int_greater_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (greater_than_or_equal: bool) {
+	return internal_cmp_digit(a, b) >= 0;
+}
+
+/*
+	bool := |a| >= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (greater_than_or_equal: bool) {
+	return internal_cmp_mag(a, b) >= 0;
+}
+
+internal_greater_than_or_equal :: proc {
+	internal_int_greater_than_or_equal,
+	internal_int_greater_than_or_equal_digit,
+}
+internal_gteq :: internal_greater_than_or_equal;
+
+internal_greater_than_or_equal_abs :: proc {
+	internal_int_greater_than_or_equal_abs,
+}
+internal_gteq_abs :: internal_greater_than_or_equal_abs;
+
+
+/*
+	bool := a > b
+*/
+internal_int_greater_than :: #force_inline proc(a, b: ^Int) -> (greater_than: bool) {
+	return internal_cmp(a, b) == 1;
+}
+
+/*
+	bool := a > b
+*/
+internal_int_greater_than_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (greater_than: bool) {
+	return internal_cmp_digit(a, b) == 1;
+}
+
+/*
+	bool := |a| > |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_greater_than_abs :: #force_inline proc(a, b: ^Int) -> (greater_than: bool) {
+	return internal_cmp_mag(a, b) == 1;
+}
+
+internal_greater_than :: proc {
+	internal_int_greater_than,
+	internal_int_greater_than_digit,
+}
+internal_gt :: internal_greater_than;
+
+internal_greater_than_abs :: proc {
+	internal_int_greater_than_abs,
+}
+internal_gt_abs :: internal_greater_than_abs;
+
+
 /*
 	Check if remainders are possible squares - fast exclude non-squares.
 
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index 68c574905..f701663a7 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -561,6 +561,298 @@ int_compare_magnitude :: proc(a, b: ^Int, allocator := context.allocator) -> (re
 
 	return #force_inline internal_cmp_mag(a, b), nil;
 }
+int_cmp_mag :: int_compare_magnitude;
+
+
+/*
+	bool := a < b
+*/
+int_less_than :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == -1, err;
+}
+
+/*
+	bool := a < b
+*/
+int_less_than_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == -1, err;
+}
+
+/*
+	bool := |a| < |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_less_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c == -1, err;
+}
+
+less_than :: proc {
+	int_less_than,
+	int_less_than_digit,
+}
+lt :: less_than;
+
+less_than_abs :: proc {
+	int_less_than_abs,
+}
+lt_abs :: less_than_abs;
+
+
+/*
+	bool := a <= b
+*/
+int_less_than_or_equal :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c <= 0, err;
+}
+
+/*
+	bool := a <= b
+*/
+int_less_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c <= 0, err;
+}
+
+/*
+	bool := |a| <= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c <= 0, err;
+}
+
+less_than_or_equal :: proc {
+	int_less_than_or_equal,
+	int_less_than_or_equal_digit,
+}
+lteq :: less_than_or_equal;
+
+less_than_or_equal_abs :: proc {
+	int_less_than_or_equal_abs,
+}
+lteq_abs :: less_than_or_equal_abs;
+
+
+/*
+	bool := a == b
+*/
+int_equals :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == 0, err;
+}
+
+/*
+	bool := a == b
+*/
+int_equals_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == 0, err;
+}
+
+/*
+	bool := |a| == |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_equals_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c == 0, err;
+}
+
+equals :: proc {
+	int_equals,
+	int_equals_digit,
+}
+eq :: equals;
+
+equals_abs :: proc {
+	int_equals_abs,
+}
+eq_abs :: equals_abs;
+
+
+/*
+	bool := a >= b
+*/
+int_greater_than_or_equal :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c >= 0, err;
+}
+
+/*
+	bool := a >= b
+*/
+int_greater_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c >= 0, err;
+}
+
+/*
+	bool := |a| >= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c >= 0, err;
+}
+
+greater_than_or_equal :: proc {
+	int_greater_than_or_equal,
+	int_greater_than_or_equal_digit,
+}
+gteq :: greater_than_or_equal;
+
+greater_than_or_equal_abs :: proc {
+	int_greater_than_or_equal_abs,
+}
+gteq_abs :: greater_than_or_equal_abs;
+
+
+/*
+	bool := a > b
+*/
+int_greater_than :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c > 0, err;
+}
+
+/*
+	bool := a > b
+*/
+int_greater_than_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c > 0, err;
+}
+
+/*
+	bool := |a| > |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_greater_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c > 0, err;
+}
+
+greater_than :: proc {
+	int_greater_than,
+	int_greater_than_digit,
+}
+gt :: greater_than;
+
+greater_than_abs :: proc {
+	int_greater_than_abs,
+}
+gt_abs :: greater_than_abs;
+
 
 /*
 	Check if remainders are possible squares - fast exclude non-squares.

From 671b413b15db1c0b4a5c04735b1b3bf047ce7fa9 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 19:12:32 +0200
Subject: [PATCH 12/13] big: Use new comparison helpers.

---
 core/math/big/build.bat     |  5 ++--
 core/math/big/internal.odin | 29 +++++++++++----------
 core/math/big/prime.odin    |  2 +-
 core/math/big/private.odin  | 50 +++++++++++++++++--------------------
 core/math/big/test.py       |  1 -
 5 files changed, 41 insertions(+), 46 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index e6049b2e1..6e7f3a166 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,9 +1,10 @@
 @echo off
-odin run . -vet
+:odin run . -vet
 
 set TEST_ARGS=-fast-tests
+:set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-:odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index aa62569e7..0f66fcdaf 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -136,7 +136,7 @@ internal_int_add_signed :: proc(dest, a, b: ^Int, allocator := context.allocator
 		Subtract the one with the greater magnitude from the other.
 		The result gets the sign of the one with the greater magnitude.
 	*/
-	if #force_inline internal_cmp_mag(a, b) == -1 {
+	if #force_inline internal_lt_abs(a, b) {
 		x, y = y, x;
 	}
 
@@ -358,7 +358,7 @@ internal_int_sub_signed :: proc(dest, number, decrease: ^Int, allocator := conte
 		Subtract a positive from a positive, OR negative from a negative.
 		First, take the difference between their magnitudes, then...
 	*/
-	if #force_inline internal_cmp_mag(number, decrease) == -1 {
+	if #force_inline internal_lt_abs(number, decrease) {
 		/*
 			The second has a larger magnitude.
 			The result has the *opposite* sign from the first number.
@@ -718,7 +718,7 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 	/*
 		If numerator < denominator then quotient = 0, remainder = numerator.
 	*/
-	if #force_inline internal_cmp_mag(numerator, denominator) == -1 {
+	if #force_inline internal_lt_abs(numerator, denominator) {
 		if remainder != nil {
 			internal_copy(remainder, numerator) or_return;
 		}
@@ -731,7 +731,6 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 	if (denominator.used > 2 * MUL_KARATSUBA_CUTOFF) && (denominator.used <= (numerator.used / 3) * 2) {
 		assert(denominator.used >= 160 && numerator.used >= 240, "MUL_KARATSUBA_CUTOFF global not properly set.");
 		err = _private_int_div_recursive(quotient, remainder, numerator, denominator);
-		// err = #force_inline _private_int_div_school(quotient, remainder, numerator, denominator);
 	} else {
 		when true {
 			err = #force_inline _private_int_div_school(quotient, remainder, numerator, denominator);
@@ -1243,12 +1242,12 @@ internal_less_than_or_equal :: proc {
 	internal_int_less_than_or_equal,
 	internal_int_less_than_or_equal_digit,
 }
-internal_lteq :: internal_less_than_or_equal;
+internal_lte :: internal_less_than_or_equal;
 
 internal_less_than_or_equal_abs :: proc {
 	internal_int_less_than_or_equal_abs,
 }
-internal_lteq_abs :: internal_less_than_or_equal_abs;
+internal_lte_abs :: internal_less_than_or_equal_abs;
 
 
 /*
@@ -1311,12 +1310,12 @@ internal_greater_than_or_equal :: proc {
 	internal_int_greater_than_or_equal,
 	internal_int_greater_than_or_equal_digit,
 }
-internal_gteq :: internal_greater_than_or_equal;
+internal_gte :: internal_greater_than_or_equal;
 
 internal_greater_than_or_equal_abs :: proc {
 	internal_int_greater_than_or_equal_abs,
 }
-internal_gteq_abs :: internal_greater_than_or_equal_abs;
+internal_gte_abs :: internal_greater_than_or_equal_abs;
 
 
 /*
@@ -1410,7 +1409,7 @@ internal_int_is_square :: proc(a: ^Int, allocator := context.allocator) -> (squa
 	sqrt(t, a) or_return;
 	sqr(t, t)  or_return;
 
-	square = internal_cmp_mag(t, a) == 0;
+	square = internal_eq_abs(t, a);
 
 	return;
 }
@@ -1642,7 +1641,7 @@ internal_int_sqrt :: proc(dest, src: ^Int, allocator := context.allocator) -> (e
 		internal_add(t2, t1, x)  or_return;
 		internal_shr(y, t2, 1)   or_return;
 
-		if c := internal_cmp(y, x); c == 0 || c == 1 {
+		if internal_gte(y, x) {
 			internal_swap(dest, x);
 			return nil;
 		}
@@ -1757,8 +1756,8 @@ internal_int_root_n :: proc(dest, src: ^Int, n: int, allocator := context.alloca
 			 Number of rounds is at most log_2(root). If it is more it
 			 got stuck, so break out of the loop and do the rest manually.
 		*/
-		if ilog2 -= 1;    ilog2 == 0 { break; }
-		if internal_cmp(t1, t2) == 0 { break; }
+		if ilog2 -= 1; ilog2 == 0 { break; }
+		if internal_eq(t1, t2)    { break; }
 
 		iterations += 1;
 		if iterations == MAX_ITERATIONS_ROOT_N {
@@ -1796,7 +1795,7 @@ internal_int_root_n :: proc(dest, src: ^Int, n: int, allocator := context.alloca
 	for {
 		internal_pow(t2, t1, n) or_return;
 	
-		if internal_cmp(t2, a) != 1 { break; }
+		if internal_lt(t2, a) { break; }
 		
 		internal_sub(t1, t1, DIGIT(1)) or_return;
 
@@ -2001,12 +2000,12 @@ internal_int_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		For all n in N and n > 0, n = 0 mod 1.
 	*/
-	if internal_is_positive(a) && internal_cmp(b, 1) == 0 { return internal_zero(dest);	}
+	if internal_is_positive(a) && internal_eq(b, 1) { return internal_zero(dest);	}
 
 	/*
 		`b` cannot be negative and has to be > 1
 	*/
-	if internal_is_negative(b) && internal_cmp(b, 1) != 1 { return .Invalid_Argument; }
+	if internal_is_negative(b) || internal_gt(b, 1) { return .Invalid_Argument; }
 
 	/*
 		If the modulus is odd we can use a faster routine instead.
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 182e2bffa..d6626ffbf 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -163,7 +163,7 @@ internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (k
 
 	for {
 		if internal_is_zero(a1) {
-			if internal_cmp(p1, 1) == 0 {
+			if internal_eq(p1, 1) {
 				return k, nil;
 			} else {
 				return 0, nil;
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 72f5bb9e2..fc2fe69e8 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1071,11 +1071,11 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 
 	internal_shl_digit(y, n - t) or_return;
 
-	c := internal_cmp(x, y);
-	for c != -1 {
+	gte := internal_gte(x, y);
+	for gte {
 		q.digit[n - t] += 1;
 		internal_sub(x, x, y) or_return;
-		c = internal_cmp(x, y);
+		gte = internal_gte(x, y);
 	}
 
 	/*
@@ -1134,7 +1134,7 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 			t2.digit[2] = x.digit[i];
 			t2.used = 3;
 
-			if t1_t2 := internal_cmp_mag(t1, t2); t1_t2 != 1 {
+			if internal_lte(t1, t2) {
 				break;
 			}
 			iter += 1; if iter > 100 {
@@ -1227,15 +1227,13 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
 	*/
-	if internal_cmp(A1, 0) == -1 {
+	if internal_lt(A1, 0) {
 		internal_shl(t, b, k * _DIGIT_BITS) or_return;
 
 		for {
 			internal_decr(Q1) or_return;
 			internal_add(A1, A1, t) or_return;
-			if internal_cmp(A1, 0) != -1 {
-				break;
-			}
+			if internal_gte(A1, 0) { break; }
 		}
 	}
 
@@ -1256,7 +1254,7 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
 	*/
-	for internal_cmp(A2, 0) == -1 {
+	for internal_is_negative(A2) { // internal_lt(A2, 0) {
 		internal_decr(Q0) or_return;
 		internal_add(A2, A2, b) or_return;
 	}
@@ -1391,7 +1389,7 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int
 		shl(tq, tq, n)       or_return;
 
 		for n >= 0 {
-			if c, _ = cmp_mag(ta, tb); c == 0 || c == 1 {
+			if internal_gte(ta, tb) {
 				// ta -= tb
 				sub(ta, ta, tb) or_return;
 				//  q += tq
@@ -1596,7 +1594,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		/*
 			Make sure `v` is the largest.
 		*/
-		if internal_cmp_mag(u, v) == 1 {
+		if internal_gt(u, v) {
 			/*
 				Swap `u` and `v` to make sure `v` is >= `u`.
 			*/
@@ -1634,7 +1632,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		Computes least common multiple as `|a*b|/gcd(a,b)`
 		Divide the smallest by the GCD.
 	*/
-	if internal_cmp_mag(a, b) == -1 {
+	if internal_lt_abs(a, b) {
 		/*
 			Store quotient in `t2` such that `t2 * b` is the LCM.
 		*/
@@ -1694,9 +1692,7 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 		/*
 			Iterate until `a` is bracketed between low + high.
 		*/
-		if #force_inline internal_cmp(bracket_high, a) != -1 {
-			break;
-		}
+		if #force_inline internal_gte(bracket_high, a) { break; }
 
 		low = high;
 		#force_inline internal_copy(bracket_low, bracket_high) or_return;
@@ -1844,7 +1840,7 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		if A >= m then A = A - m
 	*/
-	if internal_cmp_mag(x, n) != -1 {
+	if internal_gte_abs(x, n) {
 		return internal_sub(x, x, n);
 	}
 	return nil;
@@ -1932,7 +1928,7 @@ _private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := cont
 	/*
 		if x >= n then x = x - n
 	*/
-	if internal_cmp_mag(x, n) != -1 {
+	if internal_gte_abs(x, n) {
 		return internal_sub(x, x, n);
 	}
 
@@ -1969,7 +1965,7 @@ _private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := cont
 	*/
 	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
 		internal_int_shl1(a, a)                                      or_return;
-		if internal_cmp_mag(a, b) != -1 {
+		if internal_gte_abs(a, b) {
 			internal_sub(a, a, b)                                    or_return;
 		}
 	}
@@ -2064,7 +2060,7 @@ _private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (
 	/*
 		If x < 0, add b**(k+1) to it.
 	*/
-	if internal_cmp(x, 0) == -1 {
+	if internal_is_negative(x) {
 		internal_set(q, 1)                                           or_return;
 		internal_shl_digit(q, um + 1)                                or_return;
 		internal_add(x, x, q)                                        or_return;
@@ -2073,7 +2069,7 @@ _private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (
 	/*
 		Back off if it's too big.
 	*/
-	for internal_cmp(x, m) != -1 {
+	for internal_gte(x, m) {
 		internal_sub(x, x, m)                                        or_return;
 	}
 
@@ -2110,7 +2106,7 @@ _private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.alloca
 			a = a + q
 		*/
 		internal_add(a, a, q)                                        or_return;
-		if internal_cmp_mag(a, n) == -1                              { break; }
+		if internal_lt_abs(a, n)                                     { break; }
 		internal_sub(a, a, n)                                        or_return;
 	}
 
@@ -2146,7 +2142,7 @@ _private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator)
 			a = a + q
 		*/
 		internal_add(a, a, q)                                        or_return;
-		if internal_cmp_mag(a, n) == -1                              { break; }
+		if internal_lt_abs(a, n)                                     { break; }
 		internal_sub(a, a, n)                                        or_return;
 	}
 
@@ -2359,7 +2355,7 @@ _private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.alloca
 			If x >= n then subtract and reduce again.
 			Each successive "recursion" makes the input smaller and smaller.
 		*/
-		if internal_cmp_mag(x, n) == -1 { break; }
+		if internal_lt_abs(x, n) { break; }
 
 		internal_sub(x, x, n)                                        or_return;
 	}
@@ -2985,21 +2981,21 @@ _private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator
 	/*
 		If `v` != `1` then there is no inverse.
 	*/
-	if internal_cmp(v, 1) != 0 {
+	if !internal_eq(v, 1) {
 		return .Invalid_Argument;
 	}
 
 	/*
 		If its too low.
 	*/
-	if internal_cmp(C, 0) == -1 {
+	if internal_is_negative(C) {
 		internal_add(C, C, b) or_return;
 	}
 
 	/*
 		Too big.
 	*/
-	if internal_cmp(C, 0) != -1 {
+	if internal_gte(C, 0) {
 		internal_sub(C, C, b) or_return;
 	}
 
@@ -3152,7 +3148,7 @@ _private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		Too big.
 	*/
-	for internal_cmp_mag(D, b) != -1 {
+	for internal_gte_abs(D, b) {
 		internal_sub(D, D, b) or_return;
 	}
 
diff --git a/core/math/big/test.py b/core/math/big/test.py
index e095b061e..4d314401f 100644
--- a/core/math/big/test.py
+++ b/core/math/big/test.py
@@ -413,7 +413,6 @@ def test_shr_signed(a = 0, bits = 0, expected_error = Error.Okay):
 	return test("test_shr_signed", res, [a, bits], expected_error, expected_result)
 
 def test_factorial(number = 0, expected_error = Error.Okay):
-	print("Factorial:", number)
 	args  = [number]
 	try:
 		res = int_factorial(*args)

From ae354731edd1d408f1852c4fb19af2c840cbed46 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 19:18:13 +0200
Subject: [PATCH 13/13] big: Add ; after proc map.

---
 core/math/big/build.bat     |  4 ++--
 core/math/big/internal.odin | 20 ++++++++++----------
 core/math/big/public.odin   | 20 ++++++++++----------
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 6e7f3a166..4a6aeeb3e 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -4,7 +4,7 @@
 set TEST_ARGS=-fast-tests
 :set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 0f66fcdaf..72ff1fe76 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1207,12 +1207,12 @@ internal_int_less_than_abs :: #force_inline proc(a, b: ^Int) -> (less_than: bool
 internal_less_than :: proc {
 	internal_int_less_than,
 	internal_int_less_than_digit,
-}
+};
 internal_lt :: internal_less_than;
 
 internal_less_than_abs :: proc {
 	internal_int_less_than_abs,
-}
+};
 internal_lt_abs :: internal_less_than_abs;
 
 
@@ -1241,12 +1241,12 @@ internal_int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (less_t
 internal_less_than_or_equal :: proc {
 	internal_int_less_than_or_equal,
 	internal_int_less_than_or_equal_digit,
-}
+};
 internal_lte :: internal_less_than_or_equal;
 
 internal_less_than_or_equal_abs :: proc {
 	internal_int_less_than_or_equal_abs,
-}
+};
 internal_lte_abs :: internal_less_than_or_equal_abs;
 
 
@@ -1275,12 +1275,12 @@ internal_int_equals_abs :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
 internal_equals :: proc {
 	internal_int_equals,
 	internal_int_equals_digit,
-}
+};
 internal_eq :: internal_equals;
 
 internal_equals_abs :: proc {
 	internal_int_equals_abs,
-}
+};
 internal_eq_abs :: internal_equals_abs;
 
 
@@ -1309,12 +1309,12 @@ internal_int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (gre
 internal_greater_than_or_equal :: proc {
 	internal_int_greater_than_or_equal,
 	internal_int_greater_than_or_equal_digit,
-}
+};
 internal_gte :: internal_greater_than_or_equal;
 
 internal_greater_than_or_equal_abs :: proc {
 	internal_int_greater_than_or_equal_abs,
-}
+};
 internal_gte_abs :: internal_greater_than_or_equal_abs;
 
 
@@ -1343,12 +1343,12 @@ internal_int_greater_than_abs :: #force_inline proc(a, b: ^Int) -> (greater_than
 internal_greater_than :: proc {
 	internal_int_greater_than,
 	internal_int_greater_than_digit,
-}
+};
 internal_gt :: internal_greater_than;
 
 internal_greater_than_abs :: proc {
 	internal_int_greater_than_abs,
-}
+};
 internal_gt_abs :: internal_greater_than_abs;
 
 
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index f701663a7..e8ff1e28b 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -613,12 +613,12 @@ int_less_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocat
 less_than :: proc {
 	int_less_than,
 	int_less_than_digit,
-}
+};
 lt :: less_than;
 
 less_than_abs :: proc {
 	int_less_than_abs,
-}
+};
 lt_abs :: less_than_abs;
 
 
@@ -671,12 +671,12 @@ int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := contex
 less_than_or_equal :: proc {
 	int_less_than_or_equal,
 	int_less_than_or_equal_digit,
-}
+};
 lteq :: less_than_or_equal;
 
 less_than_or_equal_abs :: proc {
 	int_less_than_or_equal_abs,
-}
+};
 lteq_abs :: less_than_or_equal_abs;
 
 
@@ -729,12 +729,12 @@ int_equals_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator)
 equals :: proc {
 	int_equals,
 	int_equals_digit,
-}
+};
 eq :: equals;
 
 equals_abs :: proc {
 	int_equals_abs,
-}
+};
 eq_abs :: equals_abs;
 
 
@@ -787,12 +787,12 @@ int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := con
 greater_than_or_equal :: proc {
 	int_greater_than_or_equal,
 	int_greater_than_or_equal_digit,
-}
+};
 gteq :: greater_than_or_equal;
 
 greater_than_or_equal_abs :: proc {
 	int_greater_than_or_equal_abs,
-}
+};
 gteq_abs :: greater_than_or_equal_abs;
 
 
@@ -845,12 +845,12 @@ int_greater_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allo
 greater_than :: proc {
 	int_greater_than,
 	int_greater_than_digit,
-}
+};
 gt :: greater_than;
 
 greater_than_abs :: proc {
 	int_greater_than_abs,
-}
+};
 gt_abs :: greater_than_abs;