From 2cfd6b702473093b167aaa3075e71a36f3bc4eef Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Sat, 28 Aug 2021 14:59:13 +0200
Subject: [PATCH 1/2] big: Add `_private_int_mul_high`.

---
 core/math/big/build.bat    |   4 +-
 core/math/big/private.odin | 123 +++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 2 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 31480bcc8..e6049b2e1 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,9 +1,9 @@
 @echo off
-:odin run . -vet
+odin run . -vet
 
 set TEST_ARGS=-fast-tests
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index d4cced10a..5fda9f2b9 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -426,6 +426,129 @@ _private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int, allocator := conte
 	return internal_clamp(dest);
 }
 
+/*
+	Multiplies |a| * |b| and does not compute the lower digs digits
+	[meant to get the higher part of the product]
+*/
+_private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	/*
+		Can we use the fast multiplier?
+	*/
+	if a.used + b.used + 1 < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
+		return _private_int_mul_high_comba(dest, a, b, digits);
+	}
+
+	internal_grow(dest, a.used + b.used + 1) or_return;
+	dest.used = a.used + b.used + 1;
+
+	pa := a.used;
+	pb := b.used;
+	for ix := 0; ix < pa; ix += 1 {
+		carry := DIGIT(0);
+
+		for iy := digits - ix; iy < pb; iy += 1 {
+			/*
+				Calculate the double precision result.
+			*/
+			r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry);
+
+			/*
+				Get the lower part.
+			*/
+			dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
+
+			/*
+				Carry the carry.
+			*/
+			carry = DIGIT(r >> _WORD(_DIGIT_BITS));
+		}
+		dest.digit[ix + pb] = carry;
+	}
+	return internal_clamp(dest);
+}
+
+/*
+	This is a modified version of `_private_int_mul_comba` that only produces output digits *above* `digits`.
+	See the comments for `_private_int_mul_comba` to see how it works.
+
+	This is used in the Barrett reduction since for one of the multiplications
+	only the higher digits were needed.  This essentially halves the work.
+
+	Based on Algorithm 14.12 on pp.595 of HAC.
+*/
+_private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	W: [_WARRAY]DIGIT = ---;
+	_W: _WORD = 0;
+
+	/*
+		Number of output digits to produce. Grow the destination as required.
+	*/
+	pa := a.used + b.used;
+	internal_grow(dest, pa) or_return;
+
+	ix: int;
+	for ix = digits; ix < pa; ix += 1 {
+		/*
+			Get offsets into the two bignums.
+		*/
+		ty := min(b.used - 1, ix);
+		tx := ix - ty;
+
+		/*
+			This is the number of times the loop will iterrate, essentially it's
+			while (tx++ < a->used && ty-- >= 0) { ... }
+		*/
+		iy := min(a.used - tx, ty + 1);
+
+		/*
+			Execute loop.
+		*/
+		for iz := 0; iz < iy; iz += 1 {
+			_W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz]);
+		}
+
+		/*
+			Store term.
+		*/
+		W[ix] = DIGIT(_W) & DIGIT(_MASK);
+
+		/*
+			Make next carry.
+		*/
+		_W = _W >> _WORD(_DIGIT_BITS);
+	}
+
+	/*
+		Setup dest
+	*/
+	old_used := dest.used;
+	dest.used = pa;
+
+	for ix = digits; ix < pa; ix += 1 {
+		/*
+			Now extract the previous digit [below the carry].
+		*/
+		dest.digit[ix] = W[ix];
+	}
+
+	/*
+		Zero remainder.
+	*/
+	internal_zero_unused(dest, old_used);
+
+	/*
+		Adjust dest.used based on leading zeroes.
+	*/
+	return internal_clamp(dest);
+}
+
+
+
+
 /*
 	Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16
 	Assumes `dest` and `src` to not be `nil`, and `src` to have been initialized.

From 737b4fde1c8b3e450209716f8a182b8edfb4f59e Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Sat, 28 Aug 2021 18:17:57 +0200
Subject: [PATCH 2/2] big: Add `_private_int_mul_balance`.

---
 core/math/big/build.bat     |   4 +-
 core/math/big/example.odin  |   9 ---
 core/math/big/internal.odin |  11 ++--
 core/math/big/private.odin  | 107 +++++++++++++++++++++++++++++-------
 core/math/big/test.py       |  17 ++++--
 5 files changed, 106 insertions(+), 42 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index e6049b2e1..31480bcc8 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,9 +1,9 @@
 @echo off
-odin run . -vet
+:odin run . -vet
 
 set TEST_ARGS=-fast-tests
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index cddf3ffae..4542e9e15 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -205,15 +205,6 @@ int_to_byte_little :: proc(v: ^Int) {
 demo :: proc() {
 	a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f);
-
-	foo := "92232459121502451677697058974826760244863271517919321608054113675118660929276431348516553336313179167211015633639725554914519355444316239500734169769447134357534241879421978647995614218985202290368055757891124109355450669008628757662409138767505519391883751112010824030579849970582074544353971308266211776494228299586414907715854328360867232691292422194412634523666770452490676515117702116926803826546868467146319938818238521874072436856528051486567230096290549225463582766830777324099589751817442141036031904145041055454639783559905920619197290800070679733841430619962318433709503256637256772215111521321630777950145713049902839937043785039344243357384899099910837463164007565230287809026956254332260375327814271845678201";
-
-	set(a, foo);
-
-	print("a: ", a);
-
-	is_sqr, _ := internal_int_is_square(a);
-	fmt.printf("is_square: %v\n", is_sqr);
 }
 
 main :: proc() {
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 4903dd009..9422067ae 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -659,8 +659,7 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc
 			Can we use the balance method? Check sizes.
 			* The smaller one needs to be larger than the Karatsuba cut-off.
 			* The bigger one needs to be at least about one `_MUL_KARATSUBA_CUTOFF` bigger
-			* to make some sense, but it depends on architecture, OS, position of the
-			* stars... so YMMV.
+			* to make some sense, but it depends on architecture, OS, position of the stars... so YMMV.
 			* Using it to cut the input into slices small enough for _mul_comba
 			* was actually slower on the author's machine, but YMMV.
 		*/
@@ -669,13 +668,11 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc
 		max_used := max(src.used, multiplier.used);
 		digits   := src.used + multiplier.used + 1;
 
-		if        false &&  min_used     >= MUL_KARATSUBA_CUTOFF &&
-							max_used / 2 >= MUL_KARATSUBA_CUTOFF &&
+		if min_used >= MUL_KARATSUBA_CUTOFF && (max_used / 2) >= MUL_KARATSUBA_CUTOFF && max_used >= (2 * min_used) {
 			/*
 				Not much effect was observed below a ratio of 1:2, but again: YMMV.
 			*/
-							max_used     >= 2 * min_used {
-			// err = s_mp_mul_balance(a,b,c);
+			err = _private_int_mul_balance(dest, src, multiplier);
 		} else if min_used >= MUL_TOOM_CUTOFF {
 			/*
 				Toom path commented out until it no longer fails Factorial 10k or 100k,
@@ -914,7 +911,7 @@ internal_int_factorial :: proc(res: ^Int, n: int, allocator := context.allocator
 	context.allocator = allocator;
 
 	if n >= FACTORIAL_BINARY_SPLIT_CUTOFF {
-		return #force_inline _private_int_factorial_binary_split(res, n);
+		return _private_int_factorial_binary_split(res, n);
 	}
 
 	i := len(_factorial_table);
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 5fda9f2b9..d71946ce2 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -113,7 +113,7 @@ _private_int_mul_toom :: proc(dest, a, b: ^Int, allocator := context.allocator)
 	context.allocator = allocator;
 
 	S1, S2, T1, a0, a1, a2, b0, b1, b2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
-	defer destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2);
+	defer internal_destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2);
 
 	/*
 		Init temps.
@@ -258,7 +258,7 @@ _private_int_mul_karatsuba :: proc(dest, a, b: ^Int, allocator := context.alloca
 	context.allocator = allocator;
 
 	x0, x1, y0, y1, t1, x0y0, x1y1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
-	defer destroy(x0, x1, y0, y1, t1, x0y0, x1y1);
+	defer internal_destroy(x0, x1, y0, y1, t1, x0y0, x1y1);
 
 	/*
 		min # of digits, divided by two.
@@ -546,8 +546,74 @@ _private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator :=
 	return internal_clamp(dest);
 }
 
+/*
+	Single-digit multiplication with the smaller number as the single-digit.
+*/
+_private_int_mul_balance :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+	a, b := a, b;
 
+	a0, tmp, r := &Int{}, &Int{}, &Int{};
+	defer internal_destroy(a0, tmp, r);
 
+	b_size   := min(a.used, b.used);
+	n_blocks := max(a.used, b.used) / b_size;
+
+	internal_grow(a0, b_size + 2) or_return;
+	internal_init_multi(tmp, r)   or_return;
+
+	/*
+		Make sure that `a` is the larger one.
+	*/
+	if a.used < b.used {
+		a, b = b, a;
+	}
+	assert(a.used >= b.used);
+
+	i, j := 0, 0;
+	for ; i < n_blocks; i += 1 {
+		/*
+			Cut a slice off of `a`.
+		*/
+
+		a0.used = b_size;
+		internal_copy_digits(a0, a, a0.used, j);
+		j += a0.used;
+		internal_clamp(a0);
+
+		/*
+			Multiply with `b`.
+		*/
+		internal_mul(tmp, a0, b)                                     or_return;
+
+		/*
+			Shift `tmp` to the correct position.
+		*/
+		internal_shl_digit(tmp, b_size * i)                          or_return;
+
+		/*
+			Add to output. No carry needed.
+		*/
+		internal_add(r, r, tmp)                                      or_return;
+	}
+
+	/*
+		The left-overs; there are always left-overs.
+	*/
+	if j < a.used {
+		a0.used = a.used - j;
+		internal_copy_digits(a0, a, a0.used, j);
+		j += a0.used;
+		internal_clamp(a0);
+
+		internal_mul(tmp, a0, b)                                     or_return;
+		internal_shl_digit(tmp, b_size * i)                          or_return;
+		internal_add(r, r, tmp)                                      or_return;
+	}
+
+	internal_swap(dest, r);
+	return;
+}
 
 /*
 	Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16
@@ -1311,7 +1377,7 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int
 
 	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{};
 	c: int;
-	defer destroy(ta, tb, tq, q);
+	defer internal_destroy(ta, tb, tq, q);
 
 	for {
 		internal_one(tq) or_return;
@@ -1364,31 +1430,34 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int
 	Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html
 */
 _private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
 
 	inner, outer, start, stop, temp := &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer internal_destroy(inner, outer, start, stop, temp);
 
-	internal_one(inner, false, allocator) or_return;
-	internal_one(outer, false, allocator) or_return;
+	internal_one(inner, false)                                       or_return;
+	internal_one(outer, false)                                       or_return;
 
 	bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n));
 
 	for i := bits_used; i >= 0; i -= 1 {
 		start := (n >> (uint(i) + 1)) + 1 | 1;
 		stop  := (n >> uint(i)) + 1 | 1;
-		_private_int_recursive_product(temp, start, stop, 0, allocator) or_return;
-		internal_mul(inner, inner, temp, allocator) or_return;
-		internal_mul(outer, outer, inner, allocator) or_return;
+		_private_int_recursive_product(temp, start, stop, 0)         or_return;
+		internal_mul(inner, inner, temp)                             or_return;
+		internal_mul(outer, outer, inner)                            or_return;
 	}
 	shift := n - intrinsics.count_ones(n);
 
-	return internal_shl(res, outer, int(shift), allocator);
+	return internal_shl(res, outer, int(shift));
 }
 
 /*
 	Recursive product used by binary split factorial algorithm.
 */
 _private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0), allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
 	t1, t2 := &Int{}, &Int{};
 	defer internal_destroy(t1, t2);
 
@@ -1398,28 +1467,28 @@ _private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int
 
 	num_factors := (stop - start) >> 1;
 	if num_factors == 2 {
-		internal_set(t1, start, false, allocator) or_return;
+		internal_set(t1, start, false)                               or_return;
 		when true {
-			internal_grow(t2, t1.used + 1, false, allocator) or_return;
-			internal_add(t2, t1, 2, allocator) or_return;
+			internal_grow(t2, t1.used + 1, false)                    or_return;
+			internal_add(t2, t1, 2)                                  or_return;
 		} else {
-			add(t2, t1, 2) or_return;
+			internal_add(t2, t1, 2)                                  or_return;
 		}
-		return internal_mul(res, t1, t2, allocator);
+		return internal_mul(res, t1, t2);
 	}
 
 	if num_factors > 1 {
 		mid := (start + num_factors) | 1;
-		_private_int_recursive_product(t1, start,  mid, level + 1, allocator) or_return;
-		_private_int_recursive_product(t2,   mid, stop, level + 1, allocator) or_return;
-		return internal_mul(res, t1, t2, allocator);
+		_private_int_recursive_product(t1, start,  mid, level + 1)   or_return;
+		_private_int_recursive_product(t2,   mid, stop, level + 1)   or_return;
+		return internal_mul(res, t1, t2);
 	}
 
 	if num_factors == 1 {
-		return #force_inline internal_set(res, start, true, allocator);
+		return #force_inline internal_set(res, start, true);
 	}
 
-	return #force_inline internal_one(res, true, allocator);
+	return #force_inline internal_one(res, true);
 }
 
 /*
diff --git a/core/math/big/test.py b/core/math/big/test.py
index 79ee2c7ca..df59fa1c8 100644
--- a/core/math/big/test.py
+++ b/core/math/big/test.py
@@ -403,14 +403,21 @@ def test_shr_signed(a = 0, bits = 0, expected_error = Error.Okay):
 		
 	return test("test_shr_signed", res, [a, bits], expected_error, expected_result)
 
-def test_factorial(n = 0, expected_error = Error.Okay):
-	args  = [n]
-	res   = int_factorial(*args)
+def test_factorial(number = 0, expected_error = Error.Okay):
+	print("Factorial:", number)
+	args  = [number]
+	try:
+		res = int_factorial(*args)
+	except OSError as e:
+		print("{} while trying to factorial {}.".format(e, number))
+		if EXIT_ON_FAIL: exit(3)
+		return False
+
 	expected_result = None
 	if expected_error == Error.Okay:
-		expected_result = math.factorial(n)
+		expected_result = math.factorial(number)
 		
-	return test("test_factorial", res, [n], expected_error, expected_result)
+	return test("test_factorial", res, [number], expected_error, expected_result)
 
 def test_gcd(a = 0, b = 0, expected_error = Error.Okay):
 	args  = [arg_to_odin(a), arg_to_odin(b)]