ZLIB: If output size is known, reserve that much.

This commit is contained in:
Jeroen van Rijn
2021-06-26 13:17:14 +02:00
parent ab12ca69af
commit 40a12cca53
5 changed files with 166 additions and 28 deletions

View File

@@ -11,6 +11,39 @@ package compress
import "core:io"
import "core:image"
/*
These settings bound how much compression algorithms will allocate for their output buffer.
If streaming their output, these are unnecessary and will be ignored.
*/
/*
When a decompression routine doesn't stream its output, but writes to a buffer,
we pre-allocate an output buffer to speed up decompression. The default is 1 MiB.
Override at build time with `-define:COMPRESS_OUTPUT_ALLOCATE_MIN=size_in_bytes`.
*/
COMPRESS_OUTPUT_ALLOCATE_MIN :: int(#config(COMPRESS_OUTPUT_ALLOCATE_MIN, 1 << 20));
/*
This bounds the maximum a buffer will resize to as needed, or the maximum we'll
pre-allocate if you inform the decompression routine you know the payload size.
For reference, the largest payload a GZIP file can describe is max(u32le) bytes,
i.e. one byte shy of 4 GiB.
Override at build time with `-define:COMPRESS_OUTPUT_ALLOCATE_MAX=size_in_bytes`.
*/
when size_of(uintptr) == 8 {
/*
For 64-bit platforms, we default the max buffer size to 4 GiB (1 << 32),
which comfortably covers GZIP's and PKZIP's maximum payload size.
*/
COMPRESS_OUTPUT_ALLOCATE_MAX :: int(#config(COMPRESS_OUTPUT_ALLOCATE_MAX, 1 << 32));
} else {
/*
For 32-bit platforms, we default the max buffer size to 512 MiB, because
`int` is 32 bits wide and address space is limited.
*/
COMPRESS_OUTPUT_ALLOCATE_MAX :: int(#config(COMPRESS_OUTPUT_ALLOCATE_MAX, 1 << 29));
}
// when #config(TRACY_ENABLE, false) { import tracy "shared:odin-tracy" }
Error :: union {
@@ -46,6 +79,20 @@ GZIP_Error :: enum {
Comment_Too_Long,
Payload_Length_Invalid,
Payload_CRC_Invalid,
/*
GZIP's payload length field is a u32le, so a payload can be at most
max(u32le) bytes — one byte short of 4 GiB. If you tell the decompressor
to expect more than that, it's necessarily an error.
*/
Payload_Size_Exceeds_Max_Payload,
/*
For buffered instead of streamed output, the payload size can't exceed
the max set by the `COMPRESS_OUTPUT_ALLOCATE_MAX` switch in compress/common.odin.
You can tweak this setting using `-define:COMPRESS_OUTPUT_ALLOCATE_MAX=size_in_bytes`
*/
Output_Exceeds_COMPRESS_OUTPUT_ALLOCATE_MAX,
}
ZIP_Error :: enum {
@@ -79,7 +126,7 @@ Context :: struct #packed {
input_data: []u8,
output: io.Stream,
output_buf: [dynamic]u8,
output_buf: ^[dynamic]u8,
bytes_written: i64,
/*
@@ -103,9 +150,10 @@ Context :: struct #packed {
*/
input_fully_in_memory: b8,
input_refills_from_stream: b8,
reserved_flags: [2]b8,
output_to_stream: b8,
reserved_flag: b8,
}
#assert(size_of(Context) == 128);
// #assert(size_of(Context) == 128);
/*
Compression algorithm context

View File

@@ -45,7 +45,7 @@ main :: proc() {
if len(args) < 2 {
stderr("No input file specified.\n");
err := load(TEST, &buf);
err := load(slice=TEST, buf=&buf, known_gzip_size=len(TEST));
if err == nil {
stdout("Displaying test vector: ");
stdout(bytes.buffer_to_string(&buf));

View File

@@ -21,6 +21,8 @@ import "core:io"
import "core:bytes"
import "core:hash"
// import "core:fmt"
/*
A GZIP stream begins with the two magic bytes 0x1f, 0x8b (RFC 1952).
Read as a little-endian u16, the first byte (0x1f) lands in the low byte,
so the enum value is written byte-swapped: 0x8b << 8 | 0x1f.
*/
Magic :: enum u16le {
GZIP = 0x8b << 8 | 0x1f,
}
@@ -99,7 +101,9 @@ E_GZIP :: compress.GZIP_Error;
E_ZLIB :: compress.ZLIB_Error;
E_Deflate :: compress.Deflate_Error;
load_from_slice :: proc(slice: []u8, buf: ^bytes.Buffer, allocator := context.allocator) -> (err: Error) {
GZIP_MAX_PAYLOAD_SIZE :: int(max(u32le));
load_from_slice :: proc(slice: []u8, buf: ^bytes.Buffer, known_gzip_size := -1, expected_output_size := -1, allocator := context.allocator) -> (err: Error) {
r := bytes.Reader{};
bytes.reader_init(&r, slice);
@@ -111,33 +115,47 @@ load_from_slice :: proc(slice: []u8, buf: ^bytes.Buffer, allocator := context.al
input_fully_in_memory = true,
input_refills_from_stream = true,
};
err = load_from_stream(ctx, buf, allocator);
err = load_from_stream(ctx, buf, known_gzip_size, expected_output_size, allocator);
return err;
}
load_from_file :: proc(filename: string, buf: ^bytes.Buffer, allocator := context.allocator) -> (err: Error) {
load_from_file :: proc(filename: string, buf: ^bytes.Buffer, expected_output_size := -1, allocator := context.allocator) -> (err: Error) {
data, ok := os.read_entire_file(filename, allocator);
defer delete(data);
err = E_General.File_Not_Found;
if ok {
err = load_from_slice(data, buf, allocator);
err = load_from_slice(data, buf, len(data), expected_output_size, allocator);
}
return;
}
load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator := context.allocator) -> (err: Error) {
load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, known_gzip_size := -1, expected_output_size := -1, allocator := context.allocator) -> (err: Error) {
buf := buf;
expected_output_size := expected_output_size;
input_data_consumed := 0;
ws := bytes.buffer_to_stream(buf);
ctx.output = ws;
if expected_output_size > GZIP_MAX_PAYLOAD_SIZE {
return E_GZIP.Payload_Size_Exceeds_Max_Payload;
}
if expected_output_size > compress.COMPRESS_OUTPUT_ALLOCATE_MAX {
return E_GZIP.Output_Exceeds_COMPRESS_OUTPUT_ALLOCATE_MAX;
}
b: []u8;
header, e := compress.read_data(ctx, Header);
if e != .None {
return E_General.File_Too_Short;
}
input_data_consumed += size_of(Header);
if header.magic != .GZIP {
return E_GZIP.Invalid_GZIP_Signature;
@@ -163,6 +181,8 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
if .extra in header.flags {
xlen, e_extra := compress.read_data(ctx, u16le);
input_data_consumed += 2;
if e_extra != .None {
return E_General.Stream_Too_Short;
}
@@ -184,6 +204,7 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
return E_General.Stream_Too_Short;
}
xlen -= 2;
input_data_consumed += 2;
field_length, field_error = compress.read_data(ctx, u16le);
if field_error != .None {
@@ -191,6 +212,7 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
return E_General.Stream_Too_Short;
}
xlen -= 2;
input_data_consumed += 2;
if xlen <= 0 {
// We're not going to try and recover by scanning for a ZLIB header.
@@ -206,6 +228,7 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
return E_General.Stream_Too_Short;
}
xlen -= field_length;
input_data_consumed += int(field_length);
// printf("%v\n", string(field_data));
}
@@ -227,6 +250,7 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
if name_error != .None {
return E_General.Stream_Too_Short;
}
input_data_consumed += 1;
if b[0] == 0 {
break;
}
@@ -250,6 +274,7 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
if comment_error != .None {
return E_General.Stream_Too_Short;
}
input_data_consumed += 1;
if b[0] == 0 {
break;
}
@@ -265,6 +290,7 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
if .header_crc in header.flags {
crc_error: io.Error;
_, crc_error = compress.read_slice(ctx, 2);
input_data_consumed += 2;
if crc_error != .None {
return E_General.Stream_Too_Short;
}
@@ -280,7 +306,43 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
code_buffer := compress.Code_Buffer{};
cb := &code_buffer;
zlib_error := zlib.inflate_raw(ctx, &code_buffer);
payload_u32le: u32le;
// fmt.printf("known_gzip_size: %v | expected_output_size: %v\n", known_gzip_size, expected_output_size);
if expected_output_size > -1 {
/*
We already checked that it's not larger than the output buffer max,
or GZIP length field's max.
We'll just pass it on to `zlib.inflate_raw`;
*/
} else {
/*
If we know the size of the GZIP file *and* it is fully in memory,
then we can peek at the unpacked size at the end.
We'll still want to ensure there's capacity left in the output buffer when we write, of course.
*/
if ctx.input_fully_in_memory && known_gzip_size > -1 {
offset := known_gzip_size - input_data_consumed - 4;
if len(ctx.input_data) >= offset + 4 {
length_bytes := ctx.input_data[offset:][:4];
payload_u32le = (^u32le)(&length_bytes[0])^;
expected_output_size = int(payload_u32le);
}
} else {
/*
TODO(Jeroen): When reading a GZIP from a stream, check if impl_seek is present.
If so, we can seek to the end, grab the size from the footer, and seek back to payload start.
*/
}
}
// fmt.printf("GZIP: Expected Payload Size: %v\n", expected_output_size);
zlib_error := zlib.inflate_raw(z=ctx, cb=&code_buffer, expected_output_size=expected_output_size);
if zlib_error != nil {
return zlib_error;
}
@@ -300,9 +362,7 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
}
}
payload_crc := transmute(u32le)payload_crc_b;
payload_len: u32le;
payload_len, footer_error = compress.read_data(ctx, u32le);
payload_u32le, footer_error = compress.read_data(ctx, u32le);
payload := bytes.buffer_to_bytes(buf);
crc32 := u32le(hash.crc32(payload));
@@ -311,7 +371,7 @@ load_from_stream :: proc(ctx: ^compress.Context, buf: ^bytes.Buffer, allocator :
return E_GZIP.Payload_CRC_Invalid;
}
if len(payload) != int(payload_len) {
if len(payload) != int(payload_u32le) {
return E_GZIP.Payload_Length_Invalid;
}
return nil;

View File

@@ -35,11 +35,13 @@ main :: proc() {
171, 15, 18, 59, 138, 112, 63, 23, 205, 110, 254, 136, 109, 78, 231,
63, 234, 138, 133, 204,
};
OUTPUT_SIZE :: 438;
buf: bytes.Buffer;
// We can pass ", true" to inflate a raw DEFLATE stream instead of a ZLIB wrapped one.
err := inflate(ODIN_DEMO, &buf);
err := inflate(input=ODIN_DEMO, buf=&buf, expected_output_size=OUTPUT_SIZE);
defer bytes.buffer_destroy(&buf);
if err != nil {
@@ -47,5 +49,5 @@ main :: proc() {
}
s := bytes.buffer_to_string(&buf);
fmt.printf("Input: %v bytes, output (%v bytes):\n%v\n", len(ODIN_DEMO), len(s), s);
assert(len(s) == 438);
assert(len(s) == OUTPUT_SIZE);
}

View File

@@ -16,6 +16,8 @@ import "core:io"
import "core:bytes"
import "core:hash"
// import "core:fmt"
// when #config(TRACY_ENABLE, false) { import tracy "shared:odin-tracy" }
/*
@@ -397,7 +399,7 @@ parse_huffman_block :: proc(z: ^Context, cb: ^Code_Buffer, z_repeat, z_offset: ^
}
@(optimization_mode="speed")
inflate_from_stream :: proc(using ctx: ^Context, raw := false, allocator := context.allocator) -> (err: Error) #no_bounds_check {
inflate_from_stream :: proc(using ctx: ^Context, raw := false, expected_output_size := -1, allocator := context.allocator) -> (err: Error) #no_bounds_check {
/*
ctx.input must be an io.Stream backed by an implementation that supports:
- read
@@ -461,7 +463,7 @@ inflate_from_stream :: proc(using ctx: ^Context, raw := false, allocator := cont
}
// Parse ZLIB stream without header.
err = inflate_raw(ctx, cb);
err = inflate_raw(z=ctx, cb=cb, expected_output_size=expected_output_size);
if err != nil {
return err;
}
@@ -483,12 +485,29 @@ inflate_from_stream :: proc(using ctx: ^Context, raw := false, allocator := cont
}
@(optimization_mode="speed")
inflate_from_stream_raw :: proc(z: ^Context, cb: ^Code_Buffer, allocator := context.allocator) -> (err: Error) #no_bounds_check {
inflate_from_stream_raw :: proc(z: ^Context, cb: ^Code_Buffer, expected_output_size := -1, allocator := context.allocator) -> (err: Error) #no_bounds_check {
when #config(TRACY_ENABLE, false) { tracy.ZoneN("Inflate Raw"); }
final := u32(0);
type := u32(0);
cb.num_bits = 0;
buf := (^bytes.Buffer)(z.output.stream_data);
z.output_buf = &buf.buf;
// fmt.printf("ZLIB: Expected Payload Size: %v\n", expected_output_size);
if expected_output_size > -1 && expected_output_size <= compress.COMPRESS_OUTPUT_ALLOCATE_MAX {
reserve(z.output_buf, expected_output_size);
// resize (z.output_buf, expected_output_size);
} else {
reserve(z.output_buf, compress.COMPRESS_OUTPUT_ALLOCATE_MIN);
}
// reserve(&z.output_buf, compress.COMPRESS_OUTPUT_ALLOCATE_MIN);
// resize (&z.output_buf, compress.COMPRESS_OUTPUT_ALLOCATE_MIN);
// fmt.printf("ZLIB: buf: %v\n", buf);
// fmt.printf("ZLIB: output_buf: %v\n", z.output_buf);
// fmt.printf("ZLIB: z.output: %v\n", z.output);
cb.num_bits = 0;
cb.code_buffer = 0;
z_repeat: ^Huffman_Table;
@@ -519,6 +538,10 @@ inflate_from_stream_raw :: proc(z: ^Context, cb: ^Code_Buffer, allocator := cont
cb.last = mem.make_dynamic_array_len_cap([dynamic]u8, cb.window_mask + 1, cb.window_mask + 1, allocator);
defer delete(cb.last);
final := u32(0);
type := u32(0);
for {
final = compress.read_bits_lsb(z, cb, 1);
type = compress.read_bits_lsb(z, cb, 2);
@@ -659,10 +682,15 @@ inflate_from_stream_raw :: proc(z: ^Context, cb: ^Code_Buffer, allocator := cont
}
}
// fmt.printf("ZLIB: Bytes written: %v\n", z.bytes_written);
if int(z.bytes_written) != len(buf.buf) {
resize(&buf.buf, int(z.bytes_written));
}
return nil;
}
inflate_from_byte_array :: proc(input: []u8, buf: ^bytes.Buffer, raw := false) -> (err: Error) {
inflate_from_byte_array :: proc(input: []u8, buf: ^bytes.Buffer, raw := false, expected_output_size := -1) -> (err: Error) {
ctx := Context{};
r := bytes.Reader{};
@@ -673,15 +701,15 @@ inflate_from_byte_array :: proc(input: []u8, buf: ^bytes.Buffer, raw := false) -
ctx.input_fully_in_memory = true;
buf := buf;
ws := bytes.buffer_to_stream(buf);
ws := bytes.buffer_to_stream(buf);
ctx.output = ws;
err = inflate_from_stream(&ctx, raw);
err = inflate_from_stream(ctx=&ctx, raw=raw, expected_output_size=expected_output_size);
return err;
}
inflate_from_byte_array_raw :: proc(input: []u8, buf: ^bytes.Buffer, cb: ^Code_Buffer, raw := false) -> (err: Error) {
inflate_from_byte_array_raw :: proc(input: []u8, buf: ^bytes.Buffer, cb: ^Code_Buffer, raw := false, expected_output_size := -1) -> (err: Error) {
ctx := Context{};
r := bytes.Reader{};
@@ -692,10 +720,10 @@ inflate_from_byte_array_raw :: proc(input: []u8, buf: ^bytes.Buffer, cb: ^Code_B
ctx.input_fully_in_memory = true;
buf := buf;
ws := bytes.buffer_to_stream(buf);
ws := bytes.buffer_to_stream(buf);
ctx.output = ws;
return inflate_from_stream_raw(&ctx, cb);
return inflate_from_stream_raw(z=&ctx, cb=cb, expected_output_size=expected_output_size);
}
inflate :: proc{inflate_from_stream, inflate_from_byte_array};