From 1cfe226686369d3d36ab3d408997534c21b7f9f1 Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Wed, 23 Jun 2021 22:18:17 +0200 Subject: [PATCH] ZLIB: More faster. --- core/compress/common.odin | 38 +++++++++++++++++++++++++++++------- core/compress/zlib/zlib.odin | 12 ++++++++++-- core/hash/crc.odin | 9 ++++++--- core/hash/hash.odin | 13 ++++++++++-- 4 files changed, 58 insertions(+), 14 deletions(-) diff --git a/core/compress/common.odin b/core/compress/common.odin index 0cf899cb0..e35365c81 100644 --- a/core/compress/common.odin +++ b/core/compress/common.odin @@ -131,13 +131,16 @@ Code_Buffer :: struct #packed { This simplifies end-of-stream handling where bits may be left in the bit buffer. */ +@(optimization_mode="speed") read_slice :: #force_inline proc(z: ^Context, size: int) -> (res: []u8, err: io.Error) { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Read Slice"); } - if len(z.input_data) >= size { - res = z.input_data[:size]; - z.input_data = z.input_data[size:]; - return res, .None; + #no_bounds_check { + if len(z.input_data) >= size { + res = z.input_data[:size]; + z.input_data = z.input_data[size:]; + return res, .None; + } } if z.input_fully_in_memory { @@ -160,6 +163,7 @@ read_slice :: #force_inline proc(z: ^Context, size: int) -> (res: []u8, err: io. return []u8{}, e; } +@(optimization_mode="speed") read_data :: #force_inline proc(z: ^Context, $T: typeid) -> (res: T, err: io.Error) { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Read Data"); } @@ -171,9 +175,18 @@ read_data :: #force_inline proc(z: ^Context, $T: typeid) -> (res: T, err: io.Err return T{}, e; } +@(optimization_mode="speed") read_u8 :: #force_inline proc(z: ^Context) -> (res: u8, err: io.Error) { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Read u8"); } + #no_bounds_check { + if len(z.input_data) >= 1 { + res = z.input_data[0]; + z.input_data = z.input_data[1:]; + return res, .None; + } + } + b, e := read_slice(z, 1); if e == .None { return b[0], .None; @@ -182,14 +195,17 @@ read_u8 :: #force_inline proc(z: ^Context) -> (res: u8, err: io.Error) { return 0, e; } +@(optimization_mode="speed") peek_data :: #force_inline proc(z: ^Context, $T: typeid) -> (res: T, err: io.Error) { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Peek Data"); } size :: size_of(T); - if len(z.input_data) >= size { - buf := z.input_data[:size]; - return (^T)(&buf[0])^, .None; + #no_bounds_check { + if len(z.input_data) >= size { + buf := z.input_data[:size]; + return (^T)(&buf[0])^, .None; + } } if z.input_fully_in_memory { @@ -224,12 +240,14 @@ peek_data :: #force_inline proc(z: ^Context, $T: typeid) -> (res: T, err: io.Err } // Sliding window read back +@(optimization_mode="speed") peek_back_byte :: #force_inline proc(cb: ^Code_Buffer, offset: i64) -> (res: u8, err: io.Error) { // Look back into the sliding window. return cb.last[offset & cb.window_mask], .None; } // Generalized bit reader LSB +@(optimization_mode="speed") refill_lsb :: proc(z: ^Context, cb: ^Code_Buffer, width := i8(24)) { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Refill LSB"); } for { @@ -254,11 +272,13 @@ refill_lsb :: proc(z: ^Context, cb: ^Code_Buffer, width := i8(24)) { } } +@(optimization_mode="speed") consume_bits_lsb :: #force_inline proc(cb: ^Code_Buffer, width: u8) { cb.code_buffer >>= width; cb.num_bits -= u64(width); } +@(optimization_mode="speed") peek_bits_lsb :: #force_inline proc(z: ^Context, cb: ^Code_Buffer, width: u8) -> u32 { if cb.num_bits < u64(width) { refill_lsb(z, cb); @@ -267,23 +287,27 @@ peek_bits_lsb :: #force_inline proc(z: ^Context, cb: ^Code_Buffer, width: u8) -> return u32(cb.code_buffer & ~(~u64(0) << width)); } +@(optimization_mode="speed") peek_bits_no_refill_lsb :: #force_inline proc(z: ^Context, cb: ^Code_Buffer, width: u8) -> u32 { assert(cb.num_bits >= u64(width)); return u32(cb.code_buffer & ~(~u64(0) << width)); } +@(optimization_mode="speed") read_bits_lsb :: #force_inline proc(z: ^Context, cb: ^Code_Buffer, width: u8) -> u32 { k := peek_bits_lsb(z, cb, width); consume_bits_lsb(cb, width); return k; } +@(optimization_mode="speed") read_bits_no_refill_lsb :: #force_inline proc(z: ^Context, cb: ^Code_Buffer, width: u8) -> u32 { k := peek_bits_no_refill_lsb(z, cb, width); consume_bits_lsb(cb, width); return k; } +@(optimization_mode="speed") discard_to_next_byte_lsb :: proc(cb: ^Code_Buffer) { discard := u8(cb.num_bits & 7); consume_bits_lsb(cb, discard); diff --git a/core/compress/zlib/zlib.odin b/core/compress/zlib/zlib.odin index a57d202d5..41578e16b 100644 --- a/core/compress/zlib/zlib.odin +++ b/core/compress/zlib/zlib.odin @@ -115,7 +115,7 @@ Huffman_Table :: struct { }; // Implementation starts here - +@(optimization_mode="speed") z_bit_reverse :: #force_inline proc(n: u16, bits: u8) -> (r: u16) { assert(bits <= 16); // NOTE: Can optimize with llvm.bitreverse.i64 or some bit twiddling @@ -130,6 +130,7 @@ z_bit_reverse :: #force_inline proc(n: u16, bits: u8) -> (r: u16) { return; } +@(optimization_mode="speed") write_byte :: #force_inline proc(z: ^Context, cb: ^Code_Buffer, c: u8) -> (err: io.Error) #no_bounds_check { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Write Byte"); } c := c; @@ -146,6 +147,7 @@ write_byte :: #force_inline proc(z: ^Context, cb: ^Code_Buffer, c: u8) -> (err: return .None; } +@(optimization_mode="speed") repl_byte :: proc(z: ^Context, cb: ^Code_Buffer, count: u16, c: u8) -> (err: io.Error) { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Repl Byte"); } /* @@ -168,6 +170,7 @@ repl_byte :: proc(z: ^Context, cb: ^Code_Buffer, count: u16, c: u8) -> (err: io. return .None; } +@(optimization_mode="speed") repl_bytes :: proc(z: ^Context, cb: ^Code_Buffer, count: u16, distance: u16) -> (err: io.Error) { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Repl Bytes"); } /* @@ -199,6 +202,7 @@ allocate_huffman_table :: proc(allocator := context.allocator) -> (z: ^Huffman_T return new(Huffman_Table, allocator), nil; } +@(optimization_mode="speed") build_huffman :: proc(z: ^Huffman_Table, code_lengths: []u8) -> (err: Error) { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Build Huffman Table"); } sizes: [HUFFMAN_MAX_BITS+1]int; @@ -258,6 +262,7 @@ build_huffman :: proc(z: ^Huffman_Table, code_lengths: []u8) -> (err: Error) { return nil; } +@(optimization_mode="speed") decode_huffman_slowpath :: proc(z: ^Context, cb: ^Code_Buffer, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bounds_check { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Decode Huffman Slow"); } code := u16(compress.peek_bits_lsb(z, cb, 16)); @@ -289,6 +294,7 @@ decode_huffman_slowpath :: proc(z: ^Context, cb: ^Code_Buffer, t: ^Huffman_Table return r, nil; } +@(optimization_mode="speed") decode_huffman :: proc(z: ^Context, cb: ^Code_Buffer, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bounds_check { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Decode Huffman"); } if cb.num_bits < 16 { @@ -309,6 +315,7 @@ decode_huffman :: proc(z: ^Context, cb: ^Code_Buffer, t: ^Huffman_Table) -> (r: return decode_huffman_slowpath(z, cb, t); } +@(optimization_mode="speed") parse_huffman_block :: proc(z: ^Context, cb: ^Code_Buffer, z_repeat, z_offset: ^Huffman_Table) -> (err: Error) #no_bounds_check { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Parse Huffman Block"); } #no_bounds_check for { @@ -379,6 +386,7 @@ parse_huffman_block :: proc(z: ^Context, cb: ^Code_Buffer, z_repeat, z_offset: ^ } } +@(optimization_mode="speed") inflate_from_stream :: proc(using ctx: ^Context, raw := false, allocator := context.allocator) -> (err: Error) #no_bounds_check { /* ctx.input must be an io.Stream backed by an implementation that supports: @@ -459,7 +467,7 @@ inflate_from_stream :: proc(using ctx: ^Context, raw := false, allocator := cont return nil; } -// @(optimization_mode="speed") +@(optimization_mode="speed") inflate_from_stream_raw :: proc(z: ^Context, cb: ^Code_Buffer, allocator := context.allocator) -> (err: Error) #no_bounds_check { when #config(TRACY_ENABLE, false) { tracy.ZoneN("Inflate Raw"); } final := u32(0); diff --git a/core/hash/crc.odin b/core/hash/crc.odin index bb31669d0..e2c7c8e62 100644 --- a/core/hash/crc.odin +++ b/core/hash/crc.odin @@ -1,15 +1,18 @@ package hash -crc32 :: proc(data: []byte, seed := u32(0)) -> u32 #no_bounds_check { +@(optimization_mode="speed") +crc32 :: proc(data: []byte, seed := u32(0)) -> u32 { result := ~u32(seed); - for b in data { + #no_bounds_check for b in data { result = result>>8 ~ _crc32_table[(result ~ u32(b)) & 0xff]; } return ~result; } + +@(optimization_mode="speed") crc64 :: proc(data: []byte, seed := u32(0)) -> u64 #no_bounds_check { result := ~u64(seed); - for b in data { + #no_bounds_check for b in data { result = result>>8 ~ _crc64_table[(result ~ u64(b)) & 0xff]; } return ~result; diff --git a/core/hash/hash.odin b/core/hash/hash.odin index 5bd2f6e10..6740629be 100644 --- a/core/hash/hash.odin +++ b/core/hash/hash.odin @@ -2,16 +2,18 @@ package hash import "core:mem" +@(optimization_mode="speed") adler32 :: proc(data: []byte, seed := u32(1)) -> u32 { ADLER_CONST :: 65521; a, b: u32 = seed & 0xFFFF, seed >> 16; - for x in data { + #no_bounds_check for x in data { a = (a + u32(x)) % ADLER_CONST; b = (b + a) % ADLER_CONST; } return (b << 16) | a; } +@(optimization_mode="speed") djb2 :: proc(data: []byte) -> u32 { hash: u32 = 5381; for b in data { @@ -20,6 +22,7 @@ djb2 :: proc(data: []byte) -> u32 { return hash; } +@(optimization_mode="speed") fnv32 :: proc(data: []byte) -> u32 { h: u32 = 0x811c9dc5; for b in data { @@ -28,6 +31,7 @@ fnv32 :: proc(data: []byte) -> u32 { return h; } +@(optimization_mode="speed") fnv64 :: proc(data: []byte) -> u64 { h: u64 = 0xcbf29ce484222325; for b in data { @@ -36,6 +40,7 @@ fnv64 :: proc(data: []byte) -> u64 { return h; } +@(optimization_mode="speed") fnv32a :: proc(data: []byte) -> u32 { h: u32 = 0x811c9dc5; for b in data { @@ -44,6 +49,7 @@ fnv32a :: proc(data: []byte) -> u32 { return h; } +@(optimization_mode="speed") fnv64a :: proc(data: []byte) -> u64 { h: u64 = 0xcbf29ce484222325; for b in data { @@ -52,6 +58,7 @@ fnv64a :: proc(data: []byte) -> u64 { return h; } +@(optimization_mode="speed") jenkins :: proc(data: []byte) -> u32 { hash: u32 = 0; for b in data { @@ -65,6 +72,7 @@ jenkins :: proc(data: []byte) -> u32 { return hash; } +@(optimization_mode="speed") murmur32 :: proc(data: []byte) -> u32 { c1_32: u32 : 0xcc9e2d51; c2_32: u32 : 0x1b873593; @@ -114,6 +122,7 @@ murmur32 :: proc(data: []byte) -> u32 { return h1; } +@(optimization_mode="speed") murmur64 :: proc(data: []byte) -> u64 { SEED :: 0x9747b28c; @@ -219,7 +228,7 @@ murmur64 :: proc(data: []byte) -> u64 { } } - +@(optimization_mode="speed") sdbm :: proc(data: []byte) -> u32 { hash: u32 = 0; for b in data {