From 2cfd6b702473093b167aaa3075e71a36f3bc4eef Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Sat, 28 Aug 2021 14:59:13 +0200
Subject: [PATCH] big: Add `_private_int_mul_high`.

---
 core/math/big/build.bat    |   4 +-
 core/math/big/private.odin | 123 +++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 2 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 31480bcc8..e6049b2e1 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,9 +1,9 @@
 @echo off
-:odin run . -vet
+odin run . -vet
 
 set TEST_ARGS=-fast-tests
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index d4cced10a..5fda9f2b9 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -426,6 +426,129 @@ _private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int, allocator := conte
 	return internal_clamp(dest);
 }
 
+/*
+	Multiplies |a| * |b| and does not compute the lower digs digits
+	[meant to get the higher part of the product]
+*/
+_private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	/*
+		Can we use the fast multiplier?
+	*/
+	if a.used + b.used + 1 < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
+		return _private_int_mul_high_comba(dest, a, b, digits);
+	}
+
+	internal_grow(dest, a.used + b.used + 1) or_return;
+	dest.used = a.used + b.used + 1;
+
+	pa := a.used;
+	pb := b.used;
+	for ix := 0; ix < pa; ix += 1 {
+		carry := DIGIT(0);
+
+		for iy := digits - ix; iy < pb; iy += 1 {
+			/*
+				Calculate the double precision result.
+			*/
+			r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry);
+
+			/*
+				Get the lower part.
+			*/
+			dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
+
+			/*
+				Carry the carry.
+			*/
+			carry = DIGIT(r >> _WORD(_DIGIT_BITS));
+		}
+		dest.digit[ix + pb] = carry;
+	}
+	return internal_clamp(dest);
+}
+
+/*
+	This is a modified version of `_private_int_mul_comba` that only produces output digits *above* `digits`.
+	See the comments for `_private_int_mul_comba` to see how it works.
+
+	This is used in the Barrett reduction since for one of the multiplications
+	only the higher digits were needed.  This essentially halves the work.
+
+	Based on Algorithm 14.12 on pp.595 of HAC.
+*/
+_private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	W: [_WARRAY]DIGIT = ---;
+	_W: _WORD = 0;
+
+	/*
+		Number of output digits to produce. Grow the destination as required.
+	*/
+	pa := a.used + b.used;
+	internal_grow(dest, pa) or_return;
+
+	ix: int;
+	for ix = digits; ix < pa; ix += 1 {
+		/*
+			Get offsets into the two bignums.
+		*/
+		ty := min(b.used - 1, ix);
+		tx := ix - ty;
+
+		/*
+			This is the number of times the loop will iterrate, essentially it's
+			while (tx++ < a->used && ty-- >= 0) { ... }
+		*/
+		iy := min(a.used - tx, ty + 1);
+
+		/*
+			Execute loop.
+		*/
+		for iz := 0; iz < iy; iz += 1 {
+			_W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz]);
+		}
+
+		/*
+			Store term.
+		*/
+		W[ix] = DIGIT(_W) & DIGIT(_MASK);
+
+		/*
+			Make next carry.
+		*/
+		_W = _W >> _WORD(_DIGIT_BITS);
+	}
+
+	/*
+		Setup dest
+	*/
+	old_used := dest.used;
+	dest.used = pa;
+
+	for ix = digits; ix < pa; ix += 1 {
+		/*
+			Now extract the previous digit [below the carry].
+		*/
+		dest.digit[ix] = W[ix];
+	}
+
+	/*
+		Zero remainder.
+	*/
+	internal_zero_unused(dest, old_used);
+
+	/*
+		Adjust dest.used based on leading zeroes.
+	*/
+	return internal_clamp(dest);
+}
+
+
+
+
 /*
 	Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16
 	Assumes `dest` and `src` to not be `nil`, and `src` to have been initialized.