From 980aa37bee30792bd8cd9cb082313f6633267840 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Thu, 24 Jun 2021 14:56:28 +0200
Subject: [PATCH] ZLIB: Another 10%+ faster.

---
 core/compress/common.odin    |  5 ++++-
 core/compress/zlib/zlib.odin | 24 ++++++++++++++++++++----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/core/compress/common.odin b/core/compress/common.odin
index e35365c81..df798e751 100644
--- a/core/compress/common.odin
+++ b/core/compress/common.odin
@@ -250,8 +250,11 @@ peek_back_byte :: #force_inline proc(cb: ^Code_Buffer, offset: i64) -> (res: u8,
 @(optimization_mode="speed")
 refill_lsb :: proc(z: ^Context, cb: ^Code_Buffer, width := i8(24)) {
 	when #config(TRACY_ENABLE, false) { tracy.ZoneN("Refill LSB"); }
+
+	refill := u64(width);
+
 	for {
-		if cb.num_bits > u64(width) {
+		if cb.num_bits > refill {
 			break;
 		}
 		if cb.code_buffer == 0 && cb.num_bits > 63 {
diff --git a/core/compress/zlib/zlib.odin b/core/compress/zlib/zlib.odin
index 41578e16b..ce15ea147 100644
--- a/core/compress/zlib/zlib.odin
+++ b/core/compress/zlib/zlib.odin
@@ -23,6 +23,16 @@ import "core:hash"
 	Returns: Error.
 */
 
+/*
+	Controls whether the Adler-32 checksum is updated inline as bytes are written to the output.
+	It used to be faster to do it inline; now it's faster to compute it once at the end of `inflate`.
+
+	We'll see which is faster after more optimization, and might end up removing
+	`Context.rolling_hash` if not inlining it is still faster.
+
+*/
+INLINE_ADLER :: false;
+
 Context     :: compress.Context;
 Code_Buffer :: compress.Code_Buffer;
 
@@ -135,7 +145,7 @@ write_byte :: #force_inline proc(z: ^Context, cb: ^Code_Buffer, c: u8) -> (err:
 	when #config(TRACY_ENABLE, false) { tracy.ZoneN("Write Byte"); }
 	c := c;
 	buf := transmute([]u8)mem.Raw_Slice{data=&c, len=1};
-	z.rolling_hash = hash.adler32(buf, z.rolling_hash);
+	when INLINE_ADLER { z.rolling_hash = hash.adler32(buf, z.rolling_hash); }
 
 	_, e := z.output->impl_write(buf);
 	if e != .None {
@@ -161,7 +171,7 @@ repl_byte :: proc(z: ^Context, cb: ^Code_Buffer, count: u16, c: u8) -> (err: io.
 		cb.last[z.bytes_written & cb.window_mask] = c;
 		z.bytes_written += 1;
 	}
-	z.rolling_hash = hash.adler32(buf, z.rolling_hash);
+	when INLINE_ADLER { z.rolling_hash = hash.adler32(buf, z.rolling_hash); }
 
 	_, e := z.output->impl_write(buf);
 	if e != .None {
@@ -188,7 +198,7 @@ repl_bytes :: proc(z: ^Context, cb: ^Code_Buffer, count: u16, distance: u16) ->
 		buf[i] = c;
 		z.bytes_written += 1; offset += 1;
 	}
-	z.rolling_hash = hash.adler32(buf, z.rolling_hash);
+	when INLINE_ADLER { z.rolling_hash = hash.adler32(buf, z.rolling_hash); }
 
 	_, e := z.output->impl_write(buf);
 	if e != .None {
@@ -458,8 +468,13 @@ inflate_from_stream :: proc(using ctx: ^Context, raw := false, allocator := cont
 
 	if !raw {
 		compress.discard_to_next_byte_lsb(cb);
-
 		adler32 := compress.read_bits_lsb(ctx, cb, 8) << 24 | compress.read_bits_lsb(ctx, cb, 8) << 16 | compress.read_bits_lsb(ctx, cb, 8) << 8 | compress.read_bits_lsb(ctx, cb, 8);
+
+		when !INLINE_ADLER {
+			buf := (^bytes.Buffer)(ctx.output.stream_data).buf[:];
+			ctx.rolling_hash = hash.adler32(buf);
+		}
+
 		if ctx.rolling_hash != u32(adler32) {
 			return E_General.Checksum_Failed;
 		}
@@ -643,6 +658,7 @@ inflate_from_stream_raw :: proc(z: ^Context, cb: ^Code_Buffer, allocator := cont
 			break;
 		}
 	}
+
 	return nil;
 }