From 5a82f06ba10dcdcb3cf384d30e0f3c879cb05ef1 Mon Sep 17 00:00:00 2001 From: Shane Shrybman Date: Mon, 9 Mar 2026 14:51:35 -0400 Subject: [PATCH 01/21] Add virtual arena tests for bug #5821 --- tests/core/mem/test_core_mem.odin | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/core/mem/test_core_mem.odin b/tests/core/mem/test_core_mem.odin index 9d64e50a3..3afd49a06 100644 --- a/tests/core/mem/test_core_mem.odin +++ b/tests/core/mem/test_core_mem.odin @@ -57,6 +57,33 @@ test_align_bumping_block_limit :: proc(t: ^testing.T) { testing.expect(t, len(data) == 896) } +@(test) +test_large_minimum_block_size :: proc(t: ^testing.T) { + a: virtual.Arena + defer virtual.arena_destroy(&a) + + init_err := virtual.arena_init_growing(&a, 16*mem.Megabyte) + testing.expect_value(t, init_err, nil) + + align : uint = 4 + for _ in 0..<6 { + data, err := virtual.arena_alloc(&a, 18874368, align) + testing.expect_value(t, err, nil) + testing.expect(t, len(data) == 18874368) + testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0) + align *= 2 + virtual.arena_free_all(&a) + } + + align = 4 + for _ in 0..<32 { + data, err := virtual.arena_alloc(&a, 1048576, align) + testing.expect_value(t, err, nil) + testing.expect(t, len(data) == 1048576) + testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0) + } +} + @(test) tlsf_test_overlap_and_zero :: proc(t: ^testing.T) { default_allocator := context.allocator From 5d80809dc845b890f75288b3e2b644ada42c4498 Mon Sep 17 00:00:00 2001 From: Shane Shrybman Date: Mon, 9 Mar 2026 14:58:58 -0400 Subject: [PATCH 02/21] Fix virtual arena memory block overcommit bug #5821 --- core/mem/virtual/arena.odin | 4 ++-- core/mem/virtual/virtual.odin | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/mem/virtual/arena.odin b/core/mem/virtual/arena.odin index b515aa3cf..bcf3ee702 100644 --- a/core/mem/virtual/arena.odin +++ b/core/mem/virtual/arena.odin @@ -141,9 +141,9 @@ 
arena_alloc_unguarded :: proc(arena: ^Arena, size: uint, alignment: uint, loc := needed := mem.align_forward_uint(size, alignment) needed = max(needed, arena.default_commit_size) - block_size := max(needed, arena.minimum_block_size) + block_size := max(needed, arena.minimum_block_size) + alignment - new_block := memory_block_alloc(needed, block_size, alignment, {}) or_return + new_block := memory_block_alloc(needed, block_size) or_return new_block.prev = arena.curr_block arena.curr_block = new_block arena.total_reserved += new_block.reserved diff --git a/core/mem/virtual/virtual.odin b/core/mem/virtual/virtual.odin index 3f388acf3..d37c61267 100644 --- a/core/mem/virtual/virtual.odin +++ b/core/mem/virtual/virtual.odin @@ -154,7 +154,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint, pmblock.committed = platform_total_commit block.committed = pmblock.committed - base_offset - + assert(block.committed <= block.reserved) } return } @@ -174,7 +174,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint, err = .Out_Of_Memory return } - assert(block.committed <= block.reserved) + do_commit_if_necessary(block, size, default_commit_size) or_return data = block.base[block.used+alignment_offset:][:min_size] From 9df092759e46e67bbbd49fad21bb95442e08bf5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Botero?= Date: Sat, 10 Jan 2026 10:21:00 -0500 Subject: [PATCH 03/21] Add support for other Android architectures --- src/build_settings.cpp | 30 ++++++++++++++++++++++++++++-- src/linker.cpp | 9 +++++---- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/build_settings.cpp b/src/build_settings.cpp index f46b7d247..bcaf72f90 100644 --- a/src/build_settings.cpp +++ b/src/build_settings.cpp @@ -1683,7 +1683,20 @@ gb_internal void init_android_values(bool with_sdk) { gb_exit(1); } - bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = concatenate_strings(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN, 
str_lit("sysroot/usr/lib/aarch64-linux-android/")); + switch (bc->metrics.arch) { + case TargetArch_arm64: + bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("aarch64-linux-android"); + break; + case TargetArch_arm32: + bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("arm-linux-androideabi"); + break; + case TargetArch_amd64: + bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("x86_64-linux-android"); + break; + case TargetArch_i386: + bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("i686-linux-android"); + break; + } char buf[32] = {}; gb_snprintf(buf, gb_size_of(buf), "%d/", bc->ODIN_ANDROID_API_LEVEL); @@ -1958,9 +1971,22 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta } else if (metrics->os == TargetOs_linux && subtarget == Subtarget_Android) { switch (metrics->arch) { case TargetArch_arm64: - bc->metrics.target_triplet = str_lit("aarch64-none-linux-android"); + bc->metrics.target_triplet = str_lit("aarch64-linux-android"); bc->reloc_mode = RelocMode_PIC; break; + case TargetArch_arm32: + bc->metrics.target_triplet = str_lit("armv7a-linux-androideabi"); + bc->reloc_mode = RelocMode_PIC; + break; + case TargetArch_amd64: + bc->metrics.target_triplet = str_lit("x86_64-linux-android"); + bc->reloc_mode = RelocMode_PIC; + break; + case TargetArch_i386: + bc->metrics.target_triplet = str_lit("i686-linux-android"); + bc->reloc_mode = RelocMode_PIC; + break; + default: GB_PANIC("Unknown architecture for -subtarget:android"); } diff --git a/src/linker.cpp b/src/linker.cpp index e48486d9a..12f016cea 100644 --- a/src/linker.cpp +++ b/src/linker.cpp @@ -676,7 +676,7 @@ try_cross_linking:; defer (gb_string_free(glue)); glue = gb_string_append_fmt(glue, "bin/clang"); - glue = gb_string_append_fmt(glue, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL); + glue = gb_string_append_fmt(glue, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL); glue = gb_string_appendc(glue, "-c \""); glue = 
gb_string_append_length(glue, ODIN_ANDROID_NDK.text, ODIN_ANDROID_NDK.len); glue = gb_string_appendc(glue, "sources/android/native_app_glue/android_native_app_glue.c"); @@ -697,8 +697,9 @@ try_cross_linking:; glue = gb_string_appendc(glue, "\"-I"); glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len); - glue = gb_string_appendc(glue, "sysroot/usr/include/aarch64-linux-android/"); - glue = gb_string_appendc(glue, "\" "); + glue = gb_string_appendc(glue, "sysroot/usr/include/"); + glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.text, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.len); + glue = gb_string_appendc(glue, "/\" "); glue = gb_string_appendc(glue, "-Wno-macro-redefined "); @@ -969,7 +970,7 @@ try_cross_linking:; gbString ndk_bin_directory = gb_string_make_length(temporary_allocator(), ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len); link_command_line = gb_string_appendc(link_command_line, ndk_bin_directory); link_command_line = gb_string_appendc(link_command_line, "bin/clang"); - link_command_line = gb_string_append_fmt(link_command_line, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL); + link_command_line = gb_string_append_fmt(link_command_line, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL); } else { link_command_line = gb_string_appendc(link_command_line, clang_path); } From 273ab7e3bb49a0b09d46f42041b58316196a8ee8 Mon Sep 17 00:00:00 2001 From: Laytan Laats Date: Wed, 11 Mar 2026 22:16:35 +0100 Subject: [PATCH 04/21] nbio: fix posix big send/recv wrongly check if done --- core/nbio/impl_posix.odin | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/nbio/impl_posix.odin b/core/nbio/impl_posix.odin index 8469c9ade..3ecb5d8a3 100644 --- a/core/nbio/impl_posix.odin +++ b/core/nbio/impl_posix.odin @@ -805,7 +805,7 @@ send_exec :: proc(op: ^Operation) -> Op_Result { op.send.sent += n - if op.send.sent < total { + if n < 
total { return send_exec(op) } @@ -869,7 +869,7 @@ recv_exec :: proc(op: ^Operation) -> Op_Result { assert(is_tcp || op.recv.received == 0) op.recv.received += n - if is_tcp && n != 0 && op.recv.received < total { + if is_tcp && n != 0 && n < total { return recv_exec(op) } From b4405d01f86d35d916e8b21704e2d0c2a6b17116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20Vaeng=20R=C3=B8tnes?= Date: Thu, 12 Mar 2026 16:37:05 +0100 Subject: [PATCH 05/21] Fix typo in error message for using statement Change "It you do require..." to "If you do require..." --- src/check_type.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/check_type.cpp b/src/check_type.cpp index 82e70dd33..ed0d6528e 100644 --- a/src/check_type.cpp +++ b/src/check_type.cpp @@ -1853,7 +1853,7 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para if (is_using && (feature_flags & OptInFeatureFlag_UsingStmt) == 0) { ERROR_BLOCK(); error(param, "'using' has been disallowed as it is considered bad practice to use as a statement/procedure parameter outside of immediate refactoring"); - error_line("\tIt you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n"); + error_line("\tIf you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n"); } if (type_expr == nullptr) { From 27667ce36bf76e39aad3becf469a6fbcbb2d85fe Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Fri, 13 Mar 2026 11:54:15 +0100 Subject: [PATCH 06/21] =?UTF-8?q?`iff`=20->=20`if=20and=20only=20if=20(?= =?UTF-8?q?=E2=9F=BA)`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- base/runtime/core.odin | 2 +- base/runtime/core_builtin.odin | 8 ++++---- .../random_generator_chacha8_simd256.odin | 2 +- core/bufio/reader.odin | 2 +- core/bufio/writer.odin | 2 +- core/bytes/bytes.odin | 4 ++-- 
core/container/avl/avl.odin | 18 ++++++++--------- core/container/rbtree/rbtree.odin | 20 +++++++++---------- core/crypto/_aes/hw_intel/api.odin | 2 +- .../_chacha20/simd128/chacha20_simd128.odin | 2 +- .../_chacha20/simd256/chacha20_simd256.odin | 2 +- core/crypto/_fiat/field_p256r1/field.odin | 2 +- core/crypto/_fiat/field_p384r1/field.odin | 2 +- core/crypto/_subtle/subtle.odin | 10 +++++----- core/crypto/aead/aead.odin | 2 +- core/crypto/aead/low_level.odin | 2 +- core/crypto/aegis/aegis.odin | 2 +- core/crypto/aegis/aegis_impl_hw_gen.odin | 2 +- core/crypto/aegis/aegis_impl_hw_intel.odin | 2 +- core/crypto/aes/aes_gcm.odin | 2 +- core/crypto/aes/aes_impl_hw_gen.odin | 2 +- core/crypto/aes/aes_impl_hw_intel.odin | 2 +- core/crypto/blake2b/blake2b.odin | 2 +- core/crypto/blake2s/blake2s.odin | 2 +- .../chacha20poly1305/chacha20poly1305.odin | 2 +- core/crypto/crypto.odin | 14 ++++++------- core/crypto/deoxysii/deoxysii.odin | 2 +- .../crypto/deoxysii/deoxysii_impl_hw_gen.odin | 2 +- .../deoxysii/deoxysii_impl_hw_intel.odin | 2 +- core/crypto/ecdh/ecdh.odin | 10 +++++----- core/crypto/ecdsa/ecdsa.odin | 10 +++++----- core/crypto/ecdsa/ecdsa_asn1.odin | 2 +- core/crypto/ecdsa/ecdsa_verify.odin | 4 ++-- core/crypto/ed25519/ed25519.odin | 10 +++++----- core/crypto/hash/low_level.odin | 2 +- core/crypto/hmac/hmac.odin | 2 +- core/crypto/kmac/kmac.odin | 2 +- core/crypto/legacy/keccak/keccak.odin | 2 +- core/crypto/legacy/md5/md5.odin | 2 +- core/crypto/legacy/sha1/sha1.odin | 2 +- core/crypto/pbkdf2/pbkdf2.odin | 2 +- core/crypto/poly1305/poly1305.odin | 2 +- core/crypto/ristretto255/ristretto255.odin | 10 +++++----- .../ristretto255/ristretto255_scalar.odin | 4 ++-- core/crypto/sha2/sha2.odin | 2 +- core/crypto/sha2/sha2_impl_hw_gen.odin | 2 +- core/crypto/sha2/sha2_impl_hw_intel.odin | 2 +- core/crypto/sha3/sha3.odin | 2 +- core/crypto/sm3/sm3.odin | 2 +- core/crypto/tuplehash/tuplehash.odin | 2 +- core/io/io.odin | 2 +- core/sys/darwin/Foundation/NSBlock.odin | 2 
+- src/check_expr.cpp | 6 +++--- src/llvm_backend_expr.cpp | 4 ++-- vendor/lua/5.4/include/luaconf.h | 2 +- vendor/portmidi/portmidi.odin | 4 ++-- 56 files changed, 109 insertions(+), 109 deletions(-) diff --git a/base/runtime/core.odin b/base/runtime/core.odin index 983f104e3..e2ed78452 100644 --- a/base/runtime/core.odin +++ b/base/runtime/core.odin @@ -141,7 +141,7 @@ Type_Info_Struct :: struct { flags: Type_Info_Struct_Flags, - // These are only set iff this structure is an SOA structure + // These are only set if and only if (⟺) this structure is an SOA structure soa_kind: Type_Info_Struct_Soa_Kind, soa_len: i32, soa_base_type: ^Type_Info, diff --git a/base/runtime/core_builtin.odin b/base/runtime/core_builtin.odin index 974b2f048..4d8b493f7 100644 --- a/base/runtime/core_builtin.odin +++ b/base/runtime/core_builtin.odin @@ -1094,7 +1094,7 @@ card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int { -// Evaluates the condition and panics the program iff the condition is false. +// Evaluates the condition and panics the program if and only if (⟺) the condition is false. // This uses the `context.assertion_failure_procedure` to assert. // // This routine will be ignored when `ODIN_DISABLE_ASSERT` is true. @@ -1118,7 +1118,7 @@ assert :: proc(condition: bool, message := #caller_expression(condition), loc := } } -// Evaluates the condition and panics the program iff the condition is false. +// Evaluates the condition and panics the program if and only if (⟺) the condition is false. // This uses the `context.assertion_failure_procedure` to assert. // This routine ignores `ODIN_DISABLE_ASSERT`, and will always execute. @builtin @@ -1158,7 +1158,7 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! { p("not yet implemented", message, loc) } -// Evaluates the condition and panics the program iff the condition is false. +// Evaluates the condition and panics the program if and only if (⟺) the condition is false. 
// This uses the `default_assertion_contextless_failure_proc` to assert. // // This routine will be ignored when `ODIN_DISABLE_ASSERT` is true. @@ -1178,7 +1178,7 @@ assert_contextless :: proc "contextless" (condition: bool, message := #caller_ex } } -// Evaluates the condition and panics the program iff the condition is false. +// Evaluates the condition and panics the program if and only if (⟺) the condition is false. // This uses the `default_assertion_contextless_failure_proc` to assert. @builtin ensure_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) { diff --git a/base/runtime/random_generator_chacha8_simd256.odin b/base/runtime/random_generator_chacha8_simd256.odin index c0985f456..f2ccb6934 100644 --- a/base/runtime/random_generator_chacha8_simd256.odin +++ b/base/runtime/random_generator_chacha8_simd256.odin @@ -136,7 +136,7 @@ chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) { // // LLVM appears not to consider "this instruction is totally // awful on the given microarchitcture", which leads to - // `VPCOMPRESSED` being generated iff AVX512 support is + // `VPCOMPRESSED` being generated if and only if (⟺) AVX512 support is // enabled for `intrinsics.simd_masked_compress_store`. // On Zen 4, this leads to a 50% performance regression vs // the 128-bit SIMD code. 
diff --git a/core/bufio/reader.odin b/core/bufio/reader.odin index 8e30542f1..e361612d2 100644 --- a/core/bufio/reader.odin +++ b/core/bufio/reader.odin @@ -45,7 +45,7 @@ reader_init_with_buf :: proc(b: ^Reader, rd: io.Reader, buf: []byte) { b.buf = buf } -// reader_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set +// reader_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set reader_destroy :: proc(b: ^Reader) { delete(b.buf, b.buf_allocator) b^ = {} diff --git a/core/bufio/writer.odin b/core/bufio/writer.odin index 666c05b67..9d5ef3481 100644 --- a/core/bufio/writer.odin +++ b/core/bufio/writer.odin @@ -35,7 +35,7 @@ writer_init_with_buf :: proc(b: ^Writer, wr: io.Writer, buf: []byte) { b.buf = buf } -// writer_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set +// writer_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set writer_destroy :: proc(b: ^Writer) { delete(b.buf, b.buf_allocator) b^ = {} diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index 33978b3df..55eca5386 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -1460,7 +1460,7 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc return subslices[:] } -// alias returns true iff a and b have a non-zero length, and any part of +// alias returns true if and only if (⟺) a and b have a non-zero length, and any part of // a overlaps with b. 
alias :: proc "contextless" (a, b: []byte) -> bool { a_len, b_len := len(a), len(b) @@ -1474,7 +1474,7 @@ alias :: proc "contextless" (a, b: []byte) -> bool { return a_start <= b_end && b_start <= a_end } -// alias_inexactly returns true iff a and b have a non-zero length, +// alias_inexactly returns true if and only if (⟺) a and b have a non-zero length, // the base pointer of a and b are NOT equal, and any part of a overlaps // with b (ie: `alias(a, b)` with an exception that returns false for // `a == b`, `b = a[:len(a)-69]` and similar conditions). diff --git a/core/container/avl/avl.odin b/core/container/avl/avl.odin index 6c7216c29..1208cd213 100644 --- a/core/container/avl/avl.odin +++ b/core/container/avl/avl.odin @@ -100,20 +100,20 @@ len :: proc "contextless" (t: ^$T/Tree($Value)) -> int { return t._size } -// first returns the first node in the tree (in-order) or nil iff +// first returns the first node in the tree (in-order) or nil if and only if (⟺) // the tree is empty. first :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) { return tree_first_or_last_in_order(t, Direction.Backward) } -// last returns the last element in the tree (in-order) or nil iff +// last returns the last element in the tree (in-order) or nil if and only if (⟺) // the tree is empty. last :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) { return tree_first_or_last_in_order(t, Direction.Forward) } // find finds the value in the tree, and returns the corresponding -// node or nil iff the value is not present. +// node or nil if and only if (⟺) the value is not present. find :: proc(t: ^$T/Tree($Value), value: Value) -> ^Node(Value) { cur := t._root descend_loop: for cur != nil { @@ -168,7 +168,7 @@ find_or_insert :: proc( return } -// remove removes a node or value from the tree, and returns true iff the +// remove removes a node or value from the tree, and returns true if and only if (⟺) the // removal was successful. 
While the node's value will be left intact, // the node itself will be freed via the tree's node allocator. remove :: proc { @@ -176,7 +176,7 @@ remove :: proc { remove_node, } -// remove_value removes a value from the tree, and returns true iff the +// remove_value removes a value from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's value will be left intact, // the node itself will be freed via the tree's node allocator. remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = true) -> bool { @@ -187,7 +187,7 @@ remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = t return remove_node(t, n, call_on_remove) } -// remove_node removes a node from the tree, and returns true iff the +// remove_node removes a node from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's value will be left intact, // the node itself will be freed via the tree's node allocator. remove_node :: proc(t: ^$T/Tree($Value), node: ^Node(Value), call_on_remove: bool = true) -> bool { @@ -281,14 +281,14 @@ iterator_from_pos :: proc "contextless" ( } // iterator_get returns the node currently pointed to by the iterator, -// or nil iff the node has been removed, the tree is empty, or the end +// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end // of the tree has been reached. iterator_get :: proc "contextless" (it: ^$I/Iterator($Value)) -> ^Node(Value) { return it._cur } // iterator_remove removes the node currently pointed to by the iterator, -// and returns true iff the removal was successful. Semantics are the +// and returns true if and only if (⟺) the removal was successful. Semantics are the // same as the Tree remove. 
iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -> bool { if it._cur == nil { @@ -304,7 +304,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) - } // iterator_next advances the iterator and returns the (node, true) or -// or (nil, false) iff the end of the tree has been reached. +// or (nil, false) if and only if (⟺) the end of the tree has been reached. // // Note: The first call to iterator_next will return the first node instead // of advancing the iterator. diff --git a/core/container/rbtree/rbtree.odin b/core/container/rbtree/rbtree.odin index e892188d7..c138838df 100644 --- a/core/container/rbtree/rbtree.odin +++ b/core/container/rbtree/rbtree.odin @@ -95,19 +95,19 @@ len :: proc "contextless" (t: $T/Tree($Key, $Value)) -> (node_count: int) { return t._size } -// first returns the first node in the tree (in-order) or nil iff +// first returns the first node in the tree (in-order) or nil if and only if (⟺) // the tree is empty. first :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) { return tree_first_or_last_in_order(t, Direction.Backward) } -// last returns the last element in the tree (in-order) or nil iff +// last returns the last element in the tree (in-order) or nil if and only if (⟺) // the tree is empty. last :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) { return tree_first_or_last_in_order(t, Direction.Forward) } -// find finds the key in the tree, and returns the corresponding node, or nil iff the value is not present. +// find finds the key in the tree, and returns the corresponding node, or nil if and only if (⟺) the value is not present. 
find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) { node = t._root for node != nil { @@ -120,7 +120,7 @@ find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) { return node } -// find_value finds the key in the tree, and returns the corresponding value, or nil iff the value is not present. +// find_value finds the key in the tree, and returns the corresponding value, or nil if and only if (⟺) the value is not present. find_value :: proc(t: $T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok { if n := find(t, key); n != nil { return n.value, true @@ -154,7 +154,7 @@ find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n: return n, true, nil } -// remove removes a node or value from the tree, and returns true iff the +// remove removes a node or value from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's value will be left intact, // the node itself will be freed via the tree's node allocator. remove :: proc { @@ -162,7 +162,7 @@ remove :: proc { remove_node, } -// remove_value removes a value from the tree, and returns true iff the +// remove_value removes a value from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's key + value will be left intact, // the node itself will be freed via the tree's node allocator. remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) -> bool { @@ -173,7 +173,7 @@ remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) return remove_node(t, n, call_on_remove) } -// remove_node removes a node from the tree, and returns true iff the +// remove_node removes a node from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's key + value will be left intact, // the node itself will be freed via the tree's node allocator. 
remove_node :: proc(t: ^$T/Tree($Key, $Value), node: ^$N/Node(Key, Value), call_on_remove := true) -> (found: bool) { @@ -235,14 +235,14 @@ iterator_from_pos :: proc "contextless" (t: ^$T/Tree($Key, $Value), pos: ^Node(K } // iterator_get returns the node currently pointed to by the iterator, -// or nil iff the node has been removed, the tree is empty, or the end +// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end // of the tree has been reached. iterator_get :: proc "contextless" (it: ^$I/Iterator($Key, $Value)) -> ^Node(Key, Value) { return it._cur } // iterator_remove removes the node currently pointed to by the iterator, -// and returns true iff the removal was successful. Semantics are the +// and returns true if and only if (⟺) the removal was successful. Semantics are the // same as the Tree remove. iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = true) -> bool { if it._cur == nil { @@ -258,7 +258,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = t } // iterator_next advances the iterator and returns the (node, true) or -// or (nil, false) iff the end of the tree has been reached. +// or (nil, false) if and only if (⟺) the end of the tree has been reached. // // Note: The first call to iterator_next will return the first node instead // of advancing the iterator. diff --git a/core/crypto/_aes/hw_intel/api.odin b/core/crypto/_aes/hw_intel/api.odin index ce769fc10..9547d8f84 100644 --- a/core/crypto/_aes/hw_intel/api.odin +++ b/core/crypto/_aes/hw_intel/api.odin @@ -3,7 +3,7 @@ package aes_hw_intel import "core:sys/info" -// is_supported returns true iff hardware accelerated AES +// is_supported returns true if and only if (⟺) hardware accelerated AES // is supported. 
is_supported :: proc "contextless" () -> bool { // Note: Everything with AES-NI and PCLMULQDQ has support for diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin index 9da0a54ea..fd48074df 100644 --- a/core/crypto/_chacha20/simd128/chacha20_simd128.odin +++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin @@ -215,7 +215,7 @@ _store_simd128 :: #force_inline proc "contextless" ( intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3) } -// is_performant returns true iff the target and current host both support +// is_performant returns true if and only if (⟺) the target and current host both support // "enough" 128-bit SIMD to make this implementation performant. is_performant :: proc "contextless" () -> bool { when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 { diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256.odin b/core/crypto/_chacha20/simd256/chacha20_simd256.odin index 407fbac56..c2f709aec 100644 --- a/core/crypto/_chacha20/simd256/chacha20_simd256.odin +++ b/core/crypto/_chacha20/simd256/chacha20_simd256.odin @@ -36,7 +36,7 @@ _VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0} @(private = "file") _VEC_TWO: simd.u64x4 : {2, 0, 2, 0} -// is_performant returns true iff the target and current host both support +// is_performant returns true if and only if (⟺) the target and current host both support // "enough" SIMD to make this implementation performant. 
is_performant :: proc "contextless" () -> bool { req_features :: info.CPU_Features{.avx, .avx2} diff --git a/core/crypto/_fiat/field_p256r1/field.odin b/core/crypto/_fiat/field_p256r1/field.odin index f39bee4a9..f7dd978aa 100644 --- a/core/crypto/_fiat/field_p256r1/field.odin +++ b/core/crypto/_fiat/field_p256r1/field.odin @@ -69,7 +69,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> tmp: Montgomery_Domain_Field_Element = --- fe_sub(&tmp, arg1, arg2) - // This will only underflow iff arg1 == arg2, and we return the borrow, + // This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow, // which will be 1. is_eq := subtle.u64_is_zero(fe_non_zero(&tmp)) diff --git a/core/crypto/_fiat/field_p384r1/field.odin b/core/crypto/_fiat/field_p384r1/field.odin index 5cb5cd05e..2bddff18c 100644 --- a/core/crypto/_fiat/field_p384r1/field.odin +++ b/core/crypto/_fiat/field_p384r1/field.odin @@ -75,7 +75,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> tmp: Montgomery_Domain_Field_Element = --- fe_sub(&tmp, arg1, arg2) - // This will only underflow iff arg1 == arg2, and we return the borrow, + // This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow, // which will be 1. is_eq := subtle.u64_is_zero(fe_non_zero(&tmp)) diff --git a/core/crypto/_subtle/subtle.odin b/core/crypto/_subtle/subtle.odin index 89328072c..454066e4a 100644 --- a/core/crypto/_subtle/subtle.odin +++ b/core/crypto/_subtle/subtle.odin @@ -5,17 +5,17 @@ package _subtle import "core:math/bits" -// byte_eq returns 1 iff a == b, 0 otherwise. +// byte_eq returns 1 if and only if (⟺) a == b, 0 otherwise. @(optimization_mode="none") byte_eq :: proc "contextless" (a, b: byte) -> int { v := a ~ b - // v == 0 iff a == b. The subtraction will underflow, setting the + // v == 0 if and only if (⟺) a == b. The subtraction will underflow, setting the // sign bit, which will get returned. 
return int((u32(v)-1) >> 31) } -// u64_eq returns 1 iff a == b, 0 otherwise. +// u64_eq returns 1 if and only if (⟺) a == b, 0 otherwise. @(optimization_mode="none") u64_eq :: proc "contextless" (a, b: u64) -> u64 { _, borrow := bits.sub_u64(0, a ~ b, 0) @@ -27,14 +27,14 @@ eq :: proc { u64_eq, } -// u64_is_zero returns 1 iff a == 0, 0 otherwise. +// u64_is_zero returns 1 if and only if (⟺) a == 0, 0 otherwise. @(optimization_mode="none") u64_is_zero :: proc "contextless" (a: u64) -> u64 { _, borrow := bits.sub_u64(a, 1, 0) return borrow } -// u64_is_non_zero returns 1 iff a != 0, 0 otherwise. +// u64_is_non_zero returns 1 if and only if (⟺) a != 0, 0 otherwise. @(optimization_mode="none") u64_is_non_zero :: proc "contextless" (a: u64) -> u64 { is_zero := u64_is_zero(a) diff --git a/core/crypto/aead/aead.odin b/core/crypto/aead/aead.odin index c8f324929..ed14a41f3 100644 --- a/core/crypto/aead/aead.odin +++ b/core/crypto/aead/aead.odin @@ -13,7 +13,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte, // open authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided algorithm, key, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and ciphertext MUST alias exactly or not at all. diff --git a/core/crypto/aead/low_level.odin b/core/crypto/aead/low_level.odin index c80574a0d..c89d85823 100644 --- a/core/crypto/aead/low_level.odin +++ b/core/crypto/aead/low_level.odin @@ -183,7 +183,7 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. 
If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. diff --git a/core/crypto/aegis/aegis.odin b/core/crypto/aegis/aegis.odin index fbb19f1ae..5aee61767 100644 --- a/core/crypto/aegis/aegis.odin +++ b/core/crypto/aegis/aegis.odin @@ -144,7 +144,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // open authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. diff --git a/core/crypto/aegis/aegis_impl_hw_gen.odin b/core/crypto/aegis/aegis_impl_hw_gen.odin index 5ec2f3d6e..db38e71bc 100644 --- a/core/crypto/aegis/aegis_impl_hw_gen.odin +++ b/core/crypto/aegis/aegis_impl_hw_gen.odin @@ -7,7 +7,7 @@ ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported" @(private) State_HW :: struct {} -// is_hardware_accelerated returns true iff hardware accelerated AEGIS +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS // is supported. is_hardware_accelerated :: proc "contextless" () -> bool { return false diff --git a/core/crypto/aegis/aegis_impl_hw_intel.odin b/core/crypto/aegis/aegis_impl_hw_intel.odin index 7673b6b28..8b767908c 100644 --- a/core/crypto/aegis/aegis_impl_hw_intel.odin +++ b/core/crypto/aegis/aegis_impl_hw_intel.odin @@ -20,7 +20,7 @@ State_HW :: struct { rate: int, } -// is_hardware_accelerated returns true iff hardware accelerated AEGIS +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS // is supported. 
is_hardware_accelerated :: proc "contextless" () -> bool { return aes.is_hardware_accelerated() diff --git a/core/crypto/aes/aes_gcm.odin b/core/crypto/aes/aes_gcm.odin index bb87788ac..0acd95d2f 100644 --- a/core/crypto/aes/aes_gcm.odin +++ b/core/crypto/aes/aes_gcm.odin @@ -65,7 +65,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) { // open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context_GCM, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. diff --git a/core/crypto/aes/aes_impl_hw_gen.odin b/core/crypto/aes/aes_impl_hw_gen.odin index 0c9ec6edc..506298751 100644 --- a/core/crypto/aes/aes_impl_hw_gen.odin +++ b/core/crypto/aes/aes_impl_hw_gen.odin @@ -4,7 +4,7 @@ package aes @(private = "file") ERR_HW_NOT_SUPPORTED :: "crypto/aes: hardware implementation unsupported" -// is_hardware_accelerated returns true iff hardware accelerated AES +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES // is supported. is_hardware_accelerated :: proc "contextless" () -> bool { return false diff --git a/core/crypto/aes/aes_impl_hw_intel.odin b/core/crypto/aes/aes_impl_hw_intel.odin index 0f1fa6143..96a1811f3 100644 --- a/core/crypto/aes/aes_impl_hw_intel.odin +++ b/core/crypto/aes/aes_impl_hw_intel.odin @@ -3,7 +3,7 @@ package aes import "core:crypto/_aes/hw_intel" -// is_hardware_accelerated returns true iff hardware accelerated AES +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES // is supported. 
is_hardware_accelerated :: proc "contextless" () -> bool { return hw_intel.is_supported() diff --git a/core/crypto/blake2b/blake2b.odin b/core/crypto/blake2b/blake2b.odin index 6c2c5c1e9..8cce6dac8 100644 --- a/core/crypto/blake2b/blake2b.odin +++ b/core/crypto/blake2b/blake2b.odin @@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _blake2.final(ctx, hash, finalize_clone) diff --git a/core/crypto/blake2s/blake2s.odin b/core/crypto/blake2s/blake2s.odin index 902f992b3..35e278f72 100644 --- a/core/crypto/blake2s/blake2s.odin +++ b/core/crypto/blake2s/blake2s.odin @@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _blake2.final(ctx, hash, finalize_clone) diff --git a/core/crypto/chacha20poly1305/chacha20poly1305.odin b/core/crypto/chacha20poly1305/chacha20poly1305.odin index 0504acab0..c405a2736 100644 --- a/core/crypto/chacha20poly1305/chacha20poly1305.odin +++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin @@ -136,7 +136,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // open authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. diff --git a/core/crypto/crypto.odin b/core/crypto/crypto.odin index b36bc2004..f4ddbfbe7 100644 --- a/core/crypto/crypto.odin +++ b/core/crypto/crypto.odin @@ -8,15 +8,15 @@ import subtle "core:crypto/_subtle" // Omit large precomputed tables, trading off performance for size. COMPACT_IMPLS: bool : #config(ODIN_CRYPTO_COMPACT, false) -// HAS_RAND_BYTES is true iff the runtime provides a cryptographic +// HAS_RAND_BYTES is true if and only if (⟺) the runtime provides a cryptographic // entropy source. HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES -// compare_constant_time returns 1 iff a and b are equal, 0 otherwise. +// compare_constant_time returns 1 if and only if (⟺) a and b are equal, 0 otherwise. // // The execution time of this routine is constant regardless of the contents // of the slices being compared, as long as the length of the slices is equal. -// If the length of the two slices is different, it will early-return 0. +// If the length of the two slices is different, it will early-return 0.
compare_constant_time :: proc "contextless" (a, b: []byte) -> int { // If the length of the slices is different, early return. // @@ -31,7 +31,7 @@ compare_constant_time :: proc "contextless" (a, b: []byte) -> int { return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n) } -// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by +// compare_byte_ptrs_constant_time returns 1 if and only if (⟺) the bytes pointed to by // a and b are equal, 0 otherwise. // // The execution time of this routine is constant regardless of the @@ -46,12 +46,12 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i v |= x[i] ~ y[i] } - // After the loop, v == 0 iff a == b. The subtraction will underflow - // iff v == 0, setting the sign-bit, which gets returned. + // After the loop, v == 0 if and only if (⟺) a == b. The subtraction will underflow + // if and only if (⟺) v == 0, setting the sign-bit, which gets returned. return subtle.eq(0, v) } -// is_zero_constant_time returns 1 iff b is all 0s, 0 otherwise. +// is_zero_constant_time returns 1 if and only if (⟺) b is all 0s, 0 otherwise. is_zero_constant_time :: proc "contextless" (b: []byte) -> int { v: byte for b_ in b { diff --git a/core/crypto/deoxysii/deoxysii.odin b/core/crypto/deoxysii/deoxysii.odin index 829d3d3ad..ffe9b4b32 100644 --- a/core/crypto/deoxysii/deoxysii.odin +++ b/core/crypto/deoxysii/deoxysii.odin @@ -122,7 +122,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // open authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. 
diff --git a/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin index b0705ca62..89dae7229 100644 --- a/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin +++ b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin @@ -4,7 +4,7 @@ package deoxysii @(private = "file") ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported" -// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II // is supported. is_hardware_accelerated :: proc "contextless" () -> bool { return false diff --git a/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin b/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin index cdad16f42..88c569d53 100644 --- a/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin +++ b/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin @@ -21,7 +21,7 @@ _PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0} @(private = "file") _PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0} -// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II // is supported. is_hardware_accelerated :: proc "contextless" () -> bool { return aes.is_hardware_accelerated() diff --git a/core/crypto/ecdh/ecdh.odin b/core/crypto/ecdh/ecdh.odin index af60f5649..6a8f6e466 100644 --- a/core/crypto/ecdh/ecdh.odin +++ b/core/crypto/ecdh/ecdh.odin @@ -104,7 +104,7 @@ Public_Key :: struct { } // private_key_generate uses the system entropy source to generate a new -// Private_Key. This will only fail iff the system entropy source is +// Private_Key. This will only fail if and only if (⟺) the system entropy source is // missing or broken. 
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool { private_key_clear(priv_key) @@ -142,7 +142,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool { } // private_key_set_bytes decodes a byte-encoded private key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool { private_key_clear(priv_key) @@ -245,7 +245,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) { } } -// private_key_equal returns true iff the private keys are equal, +// private_key_equal returns true if and only if (⟺) the private keys are equal, // in constant time. private_key_equal :: proc(p, q: ^Private_Key) -> bool { if p._curve != q._curve { @@ -276,7 +276,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) { } // public_key_set_bytes decodes a byte-encoded public key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool { public_key_clear(pub_key) @@ -365,7 +365,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) { } } -// public_key_equal returns true iff the public keys are equal, +// public_key_equal returns true if and only if (⟺) the public keys are equal, // in constant time. public_key_equal :: proc(p, q: ^Public_Key) -> bool { if p._curve != q._curve { diff --git a/core/crypto/ecdsa/ecdsa.odin b/core/crypto/ecdsa/ecdsa.odin index 241d50987..6c71feef7 100644 --- a/core/crypto/ecdsa/ecdsa.odin +++ b/core/crypto/ecdsa/ecdsa.odin @@ -79,7 +79,7 @@ Public_Key :: struct { } // private_key_generate uses the system entropy source to generate a new -// Private_Key. This will only fail iff the system entropy source is +// Private_Key. 
This will only fail if and only if (⟺) the system entropy source is // missing or broken. private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool { private_key_clear(priv_key) @@ -111,7 +111,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool { } // private_key_set_bytes decodes a byte-encoded private key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool { private_key_clear(priv_key) @@ -194,7 +194,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) { } } -// private_key_equal returns true iff the private keys are equal, +// private_key_equal returns true if and only if (⟺) the private keys are equal, // in constant time. private_key_equal :: proc(p, q: ^Private_Key) -> bool { if p._curve != q._curve { @@ -219,7 +219,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) { } // public_key_set_bytes decodes a byte-encoded public key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool { public_key_clear(pub_key) @@ -296,7 +296,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) { } } -// public_key_equal returns true iff the public keys are equal, +// public_key_equal returns true if and only if (⟺) the public keys are equal, // in constant time. 
public_key_equal :: proc(p, q: ^Public_Key) -> bool { if p._curve != q._curve { diff --git a/core/crypto/ecdsa/ecdsa_asn1.odin b/core/crypto/ecdsa/ecdsa_asn1.odin index 0b423286d..74c9d65e6 100644 --- a/core/crypto/ecdsa/ecdsa_asn1.odin +++ b/core/crypto/ecdsa/ecdsa_asn1.odin @@ -141,7 +141,7 @@ parse_asn1_sig :: proc(sig: []byte) -> (r, s: []byte, ok: bool) { return nil, nil, false } - // DER requires a leading 0 iff the sign bit of the leading byte + // DER requires a leading 0 if and only if (⟺) the sign bit of the leading byte // is set to distinguish between positive and negative integers, // and the minimal length representation. `r` and `s` are always // going to be unsigned, so we validate malformed DER and strip diff --git a/core/crypto/ecdsa/ecdsa_verify.odin b/core/crypto/ecdsa/ecdsa_verify.odin index 6b4e3dd4a..bd973a8df 100644 --- a/core/crypto/ecdsa/ecdsa_verify.odin +++ b/core/crypto/ecdsa/ecdsa_verify.odin @@ -3,7 +3,7 @@ package ecdsa import "core:crypto/hash" import secec "core:crypto/_weierstrass" -// verify_raw returns true iff sig is a valid signature by pub_key over +// verify_raw returns true if and only if (⟺) sig is a valid signature by pub_key over // msg, hased using hash_algo, per the verification procedure specifed // in SEC 1, Version 2.0, Section 4.1.4. // @@ -33,7 +33,7 @@ verify_raw :: proc(pub_key: ^Public_Key, hash_algo: hash.Algorithm, msg, sig: [] panic("crypto/ecdsa: invalid curve") } -// verify_asn1 returns true iff sig is a valid signature by pub_key over +// verify_asn1 returns true if and only if (⟺) sig is a valid signature by pub_key over // msg, hased using hash_algo, per the verification procedure specifed // in SEC 1, Version 2.0, Section 4.1.4. 
// diff --git a/core/crypto/ed25519/ed25519.odin b/core/crypto/ed25519/ed25519.odin index 817c8d34b..2020c0633 100644 --- a/core/crypto/ed25519/ed25519.odin +++ b/core/crypto/ed25519/ed25519.odin @@ -48,7 +48,7 @@ Public_Key :: struct { } // private_key_generate uses the system entropy source to generate a new -// Private_Key. This will only fail iff the system entropy source is +// Private_Key. This will only fail if and only if (⟺) the system entropy source is // missing or broken. private_key_generate :: proc(priv_key: ^Private_Key) -> bool { private_key_clear(priv_key) @@ -67,7 +67,7 @@ private_key_generate :: proc(priv_key: ^Private_Key) -> bool { } // private_key_set_bytes decodes a byte-encoded private key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool { if len(b) != PRIVATE_KEY_SIZE { return false @@ -167,7 +167,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) { } // public_key_set_bytes decodes a byte-encoded public key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> bool { if len(b) != PUBLIC_KEY_SIZE { return false @@ -205,14 +205,14 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) { copy(dst, pub_key._b[:]) } -// public_key_equal returns true iff pub_key is equal to other. +// public_key_equal returns true if and only if (⟺) pub_key is equal to other. public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool { ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key") return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1 } -// verify returns true iff sig is a valid signature by pub_key over msg. +// verify returns true if and only if (⟺) sig is a valid signature by pub_key over msg. 
// // The optional `allow_small_order_A` parameter will make this // implementation strictly compatible with FIPS 186-5, at the expense of diff --git a/core/crypto/hash/low_level.odin b/core/crypto/hash/low_level.odin index 242eadd5f..44b2a8100 100644 --- a/core/crypto/hash/low_level.odin +++ b/core/crypto/hash/low_level.odin @@ -235,7 +235,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { switch &impl in ctx._impl { diff --git a/core/crypto/hmac/hmac.odin b/core/crypto/hmac/hmac.odin index d28c03b5b..56decc0ef 100644 --- a/core/crypto/hmac/hmac.odin +++ b/core/crypto/hmac/hmac.odin @@ -21,7 +21,7 @@ sum :: proc(algorithm: hash.Algorithm, dst, msg, key: []byte) { } // verify will verify the HMAC tag computed with the specified algorithm -// and key over msg and return true iff the tag is valid. It requires +// and key over msg and return true if and only if (⟺) the tag is valid. It requires // that the tag is correctly sized. verify :: proc(algorithm: hash.Algorithm, tag, msg, key: []byte) -> bool { tag_buf: [hash.MAX_DIGEST_SIZE]byte diff --git a/core/crypto/kmac/kmac.odin b/core/crypto/kmac/kmac.odin index 4ecff4f12..f0c27739a 100644 --- a/core/crypto/kmac/kmac.odin +++ b/core/crypto/kmac/kmac.odin @@ -32,7 +32,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) { } // verify will verify the KMAC tag computed with the specified security -// strength, key and domain separator over msg and return true iff the +// strength, key and domain separator over msg and return true if and only if (⟺) the // tag is valid. 
verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool { derived_tag := make([]byte, len(tag), allocator) diff --git a/core/crypto/legacy/keccak/keccak.odin b/core/crypto/legacy/keccak/keccak.odin index ec6af2565..ffca1c95c 100644 --- a/core/crypto/legacy/keccak/keccak.odin +++ b/core/crypto/legacy/keccak/keccak.odin @@ -77,7 +77,7 @@ update :: proc "contextless" (ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _sha3.final((^_sha3.Context)(ctx), hash, finalize_clone) diff --git a/core/crypto/legacy/md5/md5.odin b/core/crypto/legacy/md5/md5.odin index 399a789ed..4bbc5d32a 100644 --- a/core/crypto/legacy/md5/md5.odin +++ b/core/crypto/legacy/md5/md5.odin @@ -69,7 +69,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { ensure(ctx.is_initialized) diff --git a/core/crypto/legacy/sha1/sha1.odin b/core/crypto/legacy/sha1/sha1.odin index f9adcc3d1..892f893a6 100644 --- a/core/crypto/legacy/sha1/sha1.odin +++ b/core/crypto/legacy/sha1/sha1.odin @@ -76,7 +76,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. 
// -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { ensure(ctx.is_initialized) diff --git a/core/crypto/pbkdf2/pbkdf2.odin b/core/crypto/pbkdf2/pbkdf2.odin index 9d8394031..c27ec4aa2 100644 --- a/core/crypto/pbkdf2/pbkdf2.odin +++ b/core/crypto/pbkdf2/pbkdf2.odin @@ -66,7 +66,7 @@ derive :: proc( dst_blk = dst_blk[h_len:] } - // Instead of rounding l up, just proceass the one extra block iff + // Instead of rounding l up, just process the one extra block if and only if (⟺) // r != 0. if r > 0 { tmp: [hash.MAX_DIGEST_SIZE]byte diff --git a/core/crypto/poly1305/poly1305.odin b/core/crypto/poly1305/poly1305.odin index 69e2e3ad3..ed2bec82d 100644 --- a/core/crypto/poly1305/poly1305.odin +++ b/core/crypto/poly1305/poly1305.odin @@ -33,7 +33,7 @@ sum :: proc(dst, msg, key: []byte) { } // verify will verify the Poly1305 tag computed with the key over msg and -// return true iff the tag is valid. It requires that the tag is correctly +// return true if and only if (⟺) the tag is valid. It requires that the tag is correctly // sized. verify :: proc(tag, msg, key: []byte) -> bool { ctx: Context = --- diff --git a/core/crypto/ristretto255/ristretto255.odin b/core/crypto/ristretto255/ristretto255.odin index 3724aee7a..ef899d1da 100644 --- a/core/crypto/ristretto255/ristretto255.odin +++ b/core/crypto/ristretto255/ristretto255.odin @@ -360,7 +360,7 @@ ge_double_scalarmult_generator_vartime :: proc( ge._is_initialized = true } -// ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`. +// ge_cond_negate sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = -a` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl are undefined, ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) { _ge_ensure_initialized([]^Group_Element{a}) @@ -369,7 +369,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) { ge._is_initialized = true } -// ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`. +// ge_cond_assign sets `ge = ge` if and only if (⟺) `ctrl == 0` and `ge = a` if and only if (⟺) `ctrl == 1`. // Behavior for all other values of ctrl are undefined, ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) { _ge_ensure_initialized([]^Group_Element{ge, a}) @@ -377,7 +377,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) { grp.ge_cond_assign(&ge._p, &a._p, ctrl) } -// ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`. +// ge_cond_select sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = b` if and only if (⟺) `ctrl == 1`. // Behavior for all other values of ctrl are undefined, ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) { _ge_ensure_initialized([]^Group_Element{a, b}) @@ -386,7 +386,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) { ge._is_initialized = true } -// ge_equal returns 1 iff `a == b`, and 0 otherwise. +// ge_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise. @(require_results) ge_equal :: proc(a, b: ^Group_Element) -> int { _ge_ensure_initialized([]^Group_Element{a, b}) @@ -405,7 +405,7 @@ ge_equal :: proc(a, b: ^Group_Element) -> int { return ret } -// ge_is_identity returns 1 iff `ge` is the identity element, and 0 otherwise. +// ge_is_identity returns 1 if and only if (⟺) `ge` is the identity element, and 0 otherwise. 
@(require_results) ge_is_identity :: proc(ge: ^Group_Element) -> int { return ge_equal(ge, &GE_IDENTITY) diff --git a/core/crypto/ristretto255/ristretto255_scalar.odin b/core/crypto/ristretto255/ristretto255_scalar.odin index 75844b3f4..743e02ef3 100644 --- a/core/crypto/ristretto255/ristretto255_scalar.odin +++ b/core/crypto/ristretto255/ristretto255_scalar.odin @@ -80,13 +80,13 @@ sc_square :: proc "contextless" (sc, a: ^Scalar) { grp.sc_square(sc, a) } -// sc_cond_assign sets `sc = sc` iff `ctrl == 0` and `sc = a` iff `ctrl == 1`. +// sc_cond_assign sets `sc = sc` if and only if (⟺) `ctrl == 0` and `sc = a` if and only if (⟺) `ctrl == 1`. // Behavior for all other values of ctrl are undefined, sc_cond_assign :: proc(sc, a: ^Scalar, ctrl: int) { grp.sc_cond_assign(sc, a, ctrl) } -// sc_equal returns 1 iff `a == b`, and 0 otherwise. +// sc_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise. @(require_results) sc_equal :: proc(a, b: ^Scalar) -> int { return grp.sc_equal(a, b) diff --git a/core/crypto/sha2/sha2.odin b/core/crypto/sha2/sha2.odin index 36fa4aa02..dc41462e4 100644 --- a/core/crypto/sha2/sha2.odin +++ b/core/crypto/sha2/sha2.odin @@ -191,7 +191,7 @@ update :: proc(ctx: ^$T, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) { ensure(ctx.is_initialized) diff --git a/core/crypto/sha2/sha2_impl_hw_gen.odin b/core/crypto/sha2/sha2_impl_hw_gen.odin index 85c7f8b28..837d0656d 100644 --- a/core/crypto/sha2/sha2_impl_hw_gen.odin +++ b/core/crypto/sha2/sha2_impl_hw_gen.odin @@ -4,7 +4,7 @@ package sha2 @(private = "file") ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported" -// is_hardware_accelerated_256 returns true iff hardware accelerated +// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated // SHA-224/SHA-256 is supported. is_hardware_accelerated_256 :: proc "contextless" () -> bool { return false diff --git a/core/crypto/sha2/sha2_impl_hw_intel.odin b/core/crypto/sha2/sha2_impl_hw_intel.odin index 83ef58a12..3f6ebb746 100644 --- a/core/crypto/sha2/sha2_impl_hw_intel.odin +++ b/core/crypto/sha2/sha2_impl_hw_intel.odin @@ -49,7 +49,7 @@ K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814} K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7} -// is_hardware_accelerated_256 returns true iff hardware accelerated +// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated // SHA-224/SHA-256 is supported. is_hardware_accelerated_256 :: proc "contextless" () -> bool { req_features :: info.CPU_Features{ diff --git a/core/crypto/sha3/sha3.odin b/core/crypto/sha3/sha3.odin index 2ca70963a..2f8d95092 100644 --- a/core/crypto/sha3/sha3.odin +++ b/core/crypto/sha3/sha3.odin @@ -79,7 +79,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _sha3.final((^_sha3.Context)(ctx), hash, finalize_clone) diff --git a/core/crypto/sm3/sm3.odin b/core/crypto/sm3/sm3.odin index ac38ca417..6f1d788e0 100644 --- a/core/crypto/sm3/sm3.odin +++ b/core/crypto/sm3/sm3.odin @@ -80,7 +80,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { ensure(ctx.is_initialized) diff --git a/core/crypto/tuplehash/tuplehash.odin b/core/crypto/tuplehash/tuplehash.odin index 5c8d8e39b..a45c1b120 100644 --- a/core/crypto/tuplehash/tuplehash.odin +++ b/core/crypto/tuplehash/tuplehash.odin @@ -31,7 +31,7 @@ write_element :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _sha3.final_cshake((^_sha3.Context)(ctx), hash, finalize_clone) diff --git a/core/io/io.odin b/core/io/io.odin index 9fae30946..631a232aa 100644 --- a/core/io/io.odin +++ b/core/io/io.odin @@ -436,7 +436,7 @@ copy_buffer :: proc(dst: Writer, src: Reader, buf: []byte) -> (written: i64, err // copy_n copies n bytes (or till an error) from src to dst. // It returns the number of bytes copied and the first error that occurred whilst copying, if any. 
-// On return, written == n IFF err == nil +// On return, written == n if and only if (⟺) err == nil copy_n :: proc(dst: Writer, src: Reader, n: i64) -> (written: i64, err: Error) { nsrc := limited_reader_init(&Limited_Reader{}, src, n) written, err = copy(dst, nsrc) diff --git a/core/sys/darwin/Foundation/NSBlock.odin b/core/sys/darwin/Foundation/NSBlock.odin index 34e562d75..a5b99abc5 100644 --- a/core/sys/darwin/Foundation/NSBlock.odin +++ b/core/sys/darwin/Foundation/NSBlock.odin @@ -82,7 +82,7 @@ internal_block_literal_make :: proc (is_global: bool, user_data: rawptr, user_pr BLOCK_HAS_COPY_DISPOSE :: 1 << 25 BLOCK_HAS_CTOR :: 1 << 26 // helpers have C++ code BLOCK_IS_GLOBAL :: 1 << 28 - BLOCK_HAS_STRET :: 1 << 29 // IFF BLOCK_HAS_SIGNATURE + BLOCK_HAS_STRET :: 1 << 29 // if and only if (⟺) BLOCK_HAS_SIGNATURE BLOCK_HAS_SIGNATURE :: 1 << 30 bl.isa = is_global ? &_NSConcreteGlobalBlock : &_NSConcreteStackBlock diff --git a/src/check_expr.cpp b/src/check_expr.cpp index 80df35edc..3dd4808c3 100644 --- a/src/check_expr.cpp +++ b/src/check_expr.cpp @@ -4460,9 +4460,9 @@ gb_internal void check_binary_expr(CheckerContext *c, Operand *x, Ast *node, Typ truncated: r = a - b*trunc(a/b) floored: r = a - b*floor(a/b) - IFF a/0 == 0, then (a%0 == a) or (a%%0 == a) - IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0) - IFF a/0 == 0b111..., then (a%0 == a) or (a%%0 == a) + If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a) + If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0) + If and only if (⟺) a/0 == 0b111..., then (a%0 == a) or (a%%0 == a) */ switch (zero_behaviour) { diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index 1685f9627..fc68561b3 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -1424,8 +1424,8 @@ gb_internal LLVMValueRef lb_integer_modulo(lbProcedure *p, LLVMValueRef lhs, LLV truncated: r = a - b*trunc(a/b) floored: r = a - b*floor(a/b) - IFF a/0 == 0, then (a%0 == a) or (a%%0 == a) - IFF a/0 == a, then 
(a%0 == 0) or (a%%0 == 0) + If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a) + If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0) */ switch (behaviour) { diff --git a/vendor/lua/5.4/include/luaconf.h b/vendor/lua/5.4/include/luaconf.h index 3ad294e4f..fbfa5781b 100644 --- a/vendor/lua/5.4/include/luaconf.h +++ b/vendor/lua/5.4/include/luaconf.h @@ -71,7 +71,7 @@ /* -@@ LUAI_IS32INT is true iff 'int' has (at least) 32 bits. +@@ LUAI_IS32INT is true if and only if (⟺) 'int' has (at least) 32 bits. */ #define LUAI_IS32INT ((UINT_MAX >> 30) >= 3) diff --git a/vendor/portmidi/portmidi.odin b/vendor/portmidi/portmidi.odin index 58d7c2ec2..1f2aca286 100644 --- a/vendor/portmidi/portmidi.odin +++ b/vendor/portmidi/portmidi.odin @@ -119,8 +119,8 @@ DeviceInfo :: struct { structVersion: c.int, /**< this internal structure version */ interf: cstring, /**< underlying MIDI API, e.g. MMSystem or DirectX */ name: cstring, /**< device name, e.g. USB MidiSport 1x1 */ - input: b32, /**< true iff input is available */ - output: b32, /**< true iff output is available */ + input: b32, /**< true if and only if (⟺) input is available */ + output: b32, /**< true if and only if (⟺) output is available */ opened: b32, /**< used by generic PortMidi code to do error checking on arguments */ } From 004a65e3826740230a4e65a77c1c2273befb3549 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Thu, 12 Mar 2026 02:40:20 +0900 Subject: [PATCH 07/21] core/crypto/_aes/hw_intel: Remove a copy of zero_explicit --- .../_aes/hw_intel/hw_intel_keysched.odin | 26 ++----------------- 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin index 96108442d..7b339c5f5 100644 --- a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin +++ b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin @@ -24,6 +24,7 @@ package aes_hw_intel import "base:intrinsics" +import "core:crypto" import 
"core:crypto/_aes" import "core:simd/x86" @@ -173,28 +174,5 @@ keysched :: proc(ctx: ^Context, key: []byte) { ctx._num_rounds = num_rounds - zero_explicit(&sks, size_of(sks)) + crypto.zero_explicit(&sks, size_of(sks)) } - -/* -Set each byte of a memory range to zero. - -This procedure copies the value `0` into the `len` bytes of a memory range, -starting at address `data`. - -This procedure returns the pointer to `data`. - -Unlike the `zero()` procedure, which can be optimized away or reordered by the -compiler under certain circumstances, `zero_explicit()` procedure can not be -optimized away or reordered with other memory access operations, and the -compiler assumes volatile semantics of the memory. -*/ -zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr { - // This routine tries to avoid the compiler optimizing away the call, - // so that it is always executed. It is intended to provide - // equivalent semantics to those provided by the C11 Annex K 3.7.4.1 - // memset_s call. 
- intrinsics.mem_zero_volatile(data, len) // Use the volatile mem_zero - intrinsics.atomic_thread_fence(.Seq_Cst) // Prevent reordering - return data -} \ No newline at end of file From 48b1f3b8301ad3e0977555bd4e0a4cc8434f4216 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Thu, 12 Mar 2026 02:44:21 +0900 Subject: [PATCH 08/21] core/simd/arm: Initial import --- core/simd/arm/aes.odin | 34 ++++++++++++++++++++++++++++++++++ core/simd/arm/doc.odin | 2 ++ core/simd/arm/types.odin | 5 +++++ 3 files changed, 41 insertions(+) create mode 100644 core/simd/arm/aes.odin create mode 100644 core/simd/arm/doc.odin create mode 100644 core/simd/arm/types.odin diff --git a/core/simd/arm/aes.odin b/core/simd/arm/aes.odin new file mode 100644 index 000000000..acafb9f1e --- /dev/null +++ b/core/simd/arm/aes.odin @@ -0,0 +1,34 @@ +#+build arm64,arm32 +package simd_arm + +@(require_results,enable_target_feature="aes") +vaeseq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t { + return _vaeseq_u8(data, key) +} + +@(require_results,enable_target_feature="aes") +vaesdq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t { + return _vaesdq_u8(data, key) +} + +@(require_results,enable_target_feature="aes") +vaesmcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t { + return _vaesmcq_u8(data) +} + +@(require_results,enable_target_feature="aes") +vaesimcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t { + return _vaesimcq_u8(data) +} + +@(private,default_calling_convention="none") +foreign _ { + @(link_name = "llvm.aarch64.crypto.aese" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aese") + _vaeseq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t --- + @(link_name = "llvm.aarch64.crypto.aesd" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesd") + _vaesdq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t --- + @(link_name = "llvm.aarch64.crypto.aesmc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesmc") + _vaesmcq_u8 :: proc(data: 
uint8x16_t) -> uint8x16_t --- + @(link_name = "llvm.aarch64.crypto.aesimc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesimc") + _vaesimcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t --- +} diff --git a/core/simd/arm/doc.odin b/core/simd/arm/doc.odin new file mode 100644 index 000000000..ecedc7bac --- /dev/null +++ b/core/simd/arm/doc.odin @@ -0,0 +1,2 @@ +// `SIMD` intrinsics specific to ARMv8 `arm32` and `arm64` architectures. +package simd_arm \ No newline at end of file diff --git a/core/simd/arm/types.odin b/core/simd/arm/types.odin new file mode 100644 index 000000000..7c86483a7 --- /dev/null +++ b/core/simd/arm/types.odin @@ -0,0 +1,5 @@ +#+build arm64,arm32 +package simd_arm + +uint8x16_t :: #simd[16]u8 +uint32x4_t :: #simd[4]u32 From 6aeed0e20e05ff2c17c7fc17b10fcc9518f3395a Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Thu, 12 Mar 2026 20:35:55 +0900 Subject: [PATCH 09/21] core/crypto/_aes/hw: Initial import --- core/crypto/_aes/hw/api.odin | 69 +++++++ .../ghash.odin => hw/ghash_intel.odin} | 2 +- core/crypto/_aes/hw/intrinsics_arm.odin | 115 +++++++++++ core/crypto/_aes/hw/intrinsics_intel.odin | 55 ++++++ core/crypto/_aes/hw/keysched_hw.odin | 181 ++++++++++++++++++ core/crypto/_aes/hw/unsupported.odin | 11 ++ core/crypto/_aes/hw_intel/api.odin | 38 ---- .../_aes/hw_intel/hw_intel_keysched.odin | 178 ----------------- core/crypto/aes/aes_gcm_hw_intel.odin | 20 +- core/crypto/aes/aes_impl_hw_intel.odin | 8 +- 10 files changed, 446 insertions(+), 231 deletions(-) create mode 100644 core/crypto/_aes/hw/api.odin rename core/crypto/_aes/{hw_intel/ghash.odin => hw/ghash_intel.odin} (99%) create mode 100644 core/crypto/_aes/hw/intrinsics_arm.odin create mode 100644 core/crypto/_aes/hw/intrinsics_intel.odin create mode 100644 core/crypto/_aes/hw/keysched_hw.odin create mode 100644 core/crypto/_aes/hw/unsupported.odin delete mode 100644 core/crypto/_aes/hw_intel/api.odin delete mode 100644 core/crypto/_aes/hw_intel/hw_intel_keysched.odin diff --git 
a/core/crypto/_aes/hw/api.odin b/core/crypto/_aes/hw/api.odin new file mode 100644 index 000000000..09f674657 --- /dev/null +++ b/core/crypto/_aes/hw/api.odin @@ -0,0 +1,69 @@ +package aes_hw + +@(require) import "core:sys/info" + +// is_supported returns true if and only if (⟺) hardware accelerated AES +// is supported. +is_supported :: proc "contextless" () -> bool { + when ODIN_ARCH == .amd64 { + // Note: Everything with AES-NI has support for + // the required SSE extxtensions. + req_features :: info.CPU_Features{ + .sse2, + .ssse3, + .sse41, + .aes, + } + return info.cpu_features() >= req_features + } else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + req_features :: info.CPU_Features{ + .asimd, + .aes, + } + return info.cpu_features() >= req_features + } else { + return false + } +} + +// is_ghash_supported returns true if and only if (⟺) hardware accelerated +// GHASH is supported. +is_ghash_supported :: proc "contextless" () -> bool { + // Just having hardware GHASH is silly. + if !is_supported() { + return false + } + + when ODIN_ARCH == .amd64 { + return info.cpu_features() >= info.CPU_Features{ + .pclmulqdq, + } + } else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32{ + // Once we can actually use this, we can re-enable this. + // + // return info.cpu_features() >= info.CPU_Features{ + // .pmull, + // } + return false + } else { + return false + } +} + +// Context is a keyed AES (ECB) instance. +Context :: struct { + // Note: The ideal thing to do is for the expanded round keys to be + // arrays of `u8x16`, however that implies alignment (or using AVX). + // + // All the people using e-waste processors that don't support an + // instruction set that has been around for over 10 years are why + // we can't have nice things. + _sk_exp_enc: [15][16]byte, + _sk_exp_dec: [15][16]byte, + _num_rounds: int, +} + +// init initializes a context for AES with the provided key. 
+init :: proc(ctx: ^Context, key: []byte) { + keysched(ctx, key) +} diff --git a/core/crypto/_aes/hw_intel/ghash.odin b/core/crypto/_aes/hw/ghash_intel.odin similarity index 99% rename from core/crypto/_aes/hw_intel/ghash.odin rename to core/crypto/_aes/hw/ghash_intel.odin index 5f51b614b..d80816d5d 100644 --- a/core/crypto/_aes/hw_intel/ghash.odin +++ b/core/crypto/_aes/hw/ghash_intel.odin @@ -21,7 +21,7 @@ // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #+build amd64 -package aes_hw_intel +package aes_hw import "base:intrinsics" import "core:crypto/_aes" diff --git a/core/crypto/_aes/hw/intrinsics_arm.odin b/core/crypto/_aes/hw/intrinsics_arm.odin new file mode 100644 index 000000000..ccd8efa8f --- /dev/null +++ b/core/crypto/_aes/hw/intrinsics_arm.odin @@ -0,0 +1,115 @@ +#+build arm64,arm32 +package aes_hw + +import "core:simd" +import "core:simd/arm" + +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/ + +TARGET_FEATURES :: "neon,aes" +HAS_GHASH :: false // Temporary + +@(require_results, enable_target_feature = "aes") +aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{})), key) +} + +@(require_results, enable_target_feature = "aes") +aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key) +} + +@(require_results, enable_target_feature = "aes") +aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{})), key) +} + +@(require_results, enable_target_feature = "aes") +aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key) +} + +aesimc :: arm.vaesimcq_u8 + +@(require_results, enable_target_feature = "aes") +aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, 
$IMM8: u8) -> simd.u8x16 { + a := arm.vaeseq_u8(data, simd.u8x16{}) // AESE does ShiftRows and SubBytes on A + + // Undo ShiftRows step from AESE and extract X1 and X3 + dest := simd.swizzle( + a, + 0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1) + 0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1)) + 0x0c, 0x09, 0x06, 0x03, // SubBytes(X3) + 0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3)) + ) + + rcons := simd.u8x16{ + 0, 0, 0, 0, + IMM8, 0, 0, 0, + 0, 0, 0, 0, + IMM8, 0, 0, 0, + } + + return simd.bit_xor(dest, rcons) +} + +// The keyschedule implementation is easier to read with some extra +// Intel intrinsics that are emulated by built-in LLVM ops anyway. + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + shift :: IMM8 & 0xff + + // This needs to emit behavior identical to PSLLDQ which is as follows: + // + // TEMP := COUNT + // IF (TEMP > 15) THEN TEMP := 16; FI + // DEST := DEST << (TEMP * 8) + // DEST[MAXVL-1:128] (Unmodified) + + return simd.shuffle( + simd.u8x16{}, + a, + 0 when shift > 15 else (16 - shift + 0), + 1 when shift > 15 else (16 - shift + 1), + 2 when shift > 15 else (16 - shift + 2), + 3 when shift > 15 else (16 - shift + 3), + 4 when shift > 15 else (16 - shift + 4), + 5 when shift > 15 else (16 - shift + 5), + 6 when shift > 15 else (16 - shift + 6), + 7 when shift > 15 else (16 - shift + 7), + 8 when shift > 15 else (16 - shift + 8), + 9 when shift > 15 else (16 - shift + 9), + 10 when shift > 15 else (16 - shift + 10), + 11 when shift > 15 else (16 - shift + 11), + 12 when shift > 15 else (16 - shift + 12), + 13 when shift > 15 else (16 - shift + 13), + 14 when shift > 15 else (16 - shift + 14), + 15 when shift > 15 else (16 - shift + 15), + ) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + v := transmute(simd.i32x4)a + return 
transmute(simd.u8x16)simd.shuffle( + v, + v, + IMM8 & 0b11, + (IMM8 >> 2) & 0b11, + (IMM8 >> 4) & 0b11, + (IMM8 >> 6) & 0b11, + ) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 { + return transmute(simd.u8x16)simd.shuffle( + transmute(simd.u32x4)(a), + transmute(simd.u32x4)(b), + u32(MASK) & 0b11, + (u32(MASK)>>2) & 0b11, + ((u32(MASK)>>4) & 0b11)+4, + ((u32(MASK)>>6) & 0b11)+4) +} diff --git a/core/crypto/_aes/hw/intrinsics_intel.odin b/core/crypto/_aes/hw/intrinsics_intel.odin new file mode 100644 index 000000000..25399dfae --- /dev/null +++ b/core/crypto/_aes/hw/intrinsics_intel.odin @@ -0,0 +1,55 @@ +#+build amd64 +package aes_hw + +import "core:simd" +import "core:simd/x86" + +// Intel/RISC-V semantics. + +TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes" +HAS_GHASH :: true + +@(require_results, enable_target_feature = "aes") +aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesimc :: #force_inline proc "c" (data: simd.u8x16) -> 
simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesimc_si128(transmute(x86.__m128i)(data))) +} + +@(require_results, enable_target_feature = "aes") +aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK)) +} diff --git a/core/crypto/_aes/hw/keysched_hw.odin b/core/crypto/_aes/hw/keysched_hw.odin new file mode 100644 index 000000000..7d85c43b7 --- /dev/null +++ b/core/crypto/_aes/hw/keysched_hw.odin @@ -0,0 +1,181 @@ +// Copyright (c) 2017 Thomas Pornin +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR +// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#+build amd64,arm64,arm32 +package aes_hw + +import "base:intrinsics" +import "core:crypto" +import "core:crypto/_aes" +import "core:simd" + +// Inspiration taken from BearSSL's AES-NI implementation. +// +// Note: This assumes that the SROA optimization pass is enabled to be +// anything resembling performant otherwise, LLVM will not elide a massive +// number of redundant loads/stores it generates for every intrinsic call. + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 { + k1, k2 := k1, k2 + + k2 = _mm_shuffle_epi32(k2, 0xff) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + return simd.bit_xor(k1, k2) +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) { + k1, k2, k3 := k1_^, k2_^, k3 + + k3 = _mm_shuffle_epi32(k3, 0x55) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, k3) + + tmp := k2 + k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04)) + k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff)) + + k1_, k2_ := k1_, k2_ + k1_^, k2_^ = k1, k2 + + r1 := _mm_shuffle_ps(tmp, k1, 0x44) + r2 := 
_mm_shuffle_ps(k1, k2, 0x4e) + + return r1, r2 +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 { + k1, k2, k3 := k1_^, k2_^, k3 + + k3 = _mm_shuffle_epi32(k3, 0x55) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, k3) + + k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04)) + k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff)) + + k1_, k2_ := k1_, k2_ + k1_^, k2_^ = k1, k2 + + return k1 +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 { + k1, k2 := k1, k2 + + k2 = _mm_shuffle_epi32(k2, 0xaa) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + return simd.bit_xor(k1, k2) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) { + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds]) + for i in 1 ..< num_rounds { + tmp := aesimc(sks[i]) + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), tmp) + } + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0]) +} + +@(private, enable_target_feature = TARGET_FEATURES) +keysched :: proc(ctx: ^Context, key: []byte) { + sks: [15]simd.u8x16 = --- + + // Compute the encryption keys. 
+ num_rounds, key_len := 0, len(key) + switch key_len { + case _aes.KEY_SIZE_128: + sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01)) + sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02)) + sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04)) + sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08)) + sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10)) + sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20)) + sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40)) + sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80)) + sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b)) + sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36)) + num_rounds = _aes.ROUNDS_128 + case _aes.KEY_SIZE_192: + k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + + k1_tmp: [16]byte + copy(k1_tmp[:], key[16:24]) + k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp)) + crypto.zero_explicit(&k1_tmp, size_of(k1_tmp)) + + sks[0] = k0 + sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01)) + sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02)) + sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04)) + sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08)) + sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10)) + sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20)) + sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40)) + sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80)) + num_rounds = _aes.ROUNDS_192 + + case _aes.KEY_SIZE_256: + sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:]))) + sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01)) + sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01)) + sks[4] = 
expand_step128(sks[2], aeskeygenassist(sks[3], 0x02)) + sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02)) + sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04)) + sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04)) + sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08)) + sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08)) + sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10)) + sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10)) + sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20)) + sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20)) + sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40)) + num_rounds = _aes.ROUNDS_256 + case: + panic("crypto/aes: invalid AES key size") + } + for i in 0 ..= num_rounds { + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i]) + } + + // Compute the decryption keys. GCM and CTR do not need this, however + // ECB, CBC, OCB3, etc do. + derive_dec_keys(ctx, &sks, num_rounds) + + ctx._num_rounds = num_rounds + + crypto.zero_explicit(&sks, size_of(sks)) +} diff --git a/core/crypto/_aes/hw/unsupported.odin b/core/crypto/_aes/hw/unsupported.odin new file mode 100644 index 000000000..3fb31b6b8 --- /dev/null +++ b/core/crypto/_aes/hw/unsupported.odin @@ -0,0 +1,11 @@ +#+build !amd64 +#+build !arm64 +#+build !arm32 +package aes_hw + +HAS_GHASH :: false + +@(private) +keysched :: proc(ctx: ^Context, key: []byte) { + panic("crypto/aes: hardware implementation unsupported") +} diff --git a/core/crypto/_aes/hw_intel/api.odin b/core/crypto/_aes/hw_intel/api.odin deleted file mode 100644 index 9547d8f84..000000000 --- a/core/crypto/_aes/hw_intel/api.odin +++ /dev/null @@ -1,38 +0,0 @@ -#+build amd64 -package aes_hw_intel - -import "core:sys/info" - -// is_supported returns true if and only if (⟺) hardware accelerated AES -// is supported. 
-is_supported :: proc "contextless" () -> bool { - // Note: Everything with AES-NI and PCLMULQDQ has support for - // the required SSE extxtensions. - req_features :: info.CPU_Features{ - .sse2, - .ssse3, - .sse41, - .aes, - .pclmulqdq, - } - return info.cpu_features() >= req_features -} - -// Context is a keyed AES (ECB) instance. -Context :: struct { - // Note: The ideal thing to do is for the expanded round keys to be - // arrays of `__m128i`, however that implies alignment (or using AVX). - // - // All the people using e-waste processors that don't support an - // insturction set that has been around for over 10 years are why - // we can't have nice things. - _sk_exp_enc: [15][16]byte, - _sk_exp_dec: [15][16]byte, - _num_rounds: int, -} - -// init initializes a context for AES with the provided key. -init :: proc(ctx: ^Context, key: []byte) { - keysched(ctx, key) -} - diff --git a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin deleted file mode 100644 index 7b339c5f5..000000000 --- a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (c) 2017 Thomas Pornin -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR -// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY -// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#+build amd64 -package aes_hw_intel - -import "base:intrinsics" -import "core:crypto" -import "core:crypto/_aes" -import "core:simd/x86" - -// Intel AES-NI based implementation. Inspiration taken from BearSSL. -// -// Note: This assumes that the SROA optimization pass is enabled to be -// anything resembling performat otherwise, LLVM will not elide a massive -// number of redundant loads/stores it generates for every intrinsic call. - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i { - k1, k2 := k1, k2 - - k2 = x86._mm_shuffle_epi32(k2, 0xff) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - return x86._mm_xor_si128(k1, k2) -} - -@(private = "file", require_results, enable_target_feature = "sse,sse2") -expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) { - k1, k2, k3 := k1_^, k2_^, k3 - - k3 = x86._mm_shuffle_epi32(k3, 0x55) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, k3) - - tmp := k2 - k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04)) - k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff)) - - k1_, k2_ 
:= k1_, k2_ - k1_^, k2_^ = k1, k2 - - r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44)) - r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e)) - - return r1, r2 -} - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i { - k1, k2, k3 := k1_^, k2_^, k3 - - k3 = x86._mm_shuffle_epi32(k3, 0x55) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, k3) - - k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04)) - k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff)) - - k1_, k2_ := k1_, k2_ - k1_^, k2_^ = k1, k2 - - return k1 -} - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i { - k1, k2 := k1, k2 - - k2 = x86._mm_shuffle_epi32(k2, 0xaa) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - return x86._mm_xor_si128(k1, k2) -} - -@(private = "file", enable_target_feature = "aes") -derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) { - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds]) - for i in 1 ..< num_rounds { - tmp := x86._mm_aesimc_si128(sks[i]) - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp) - } - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0]) -} - -@(private, enable_target_feature = "sse,sse2,aes") -keysched :: proc(ctx: ^Context, key: []byte) { - sks: [15]x86.__m128i = --- - - // Compute the encryption keys. 
- num_rounds, key_len := 0, len(key) - switch key_len { - case _aes.KEY_SIZE_128: - sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01)) - sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02)) - sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04)) - sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08)) - sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10)) - sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20)) - sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40)) - sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80)) - sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b)) - sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36)) - num_rounds = _aes.ROUNDS_128 - case _aes.KEY_SIZE_192: - k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - k1 := x86.__m128i{ - intrinsics.unaligned_load((^i64)(raw_data(key[16:]))), - 0, - } - sks[0] = k0 - sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01)) - sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02)) - sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04)) - sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08)) - sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10)) - sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20)) - sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40)) - sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80)) - num_rounds = _aes.ROUNDS_192 - case _aes.KEY_SIZE_256: - sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - sks[1] = 
intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:]))) - sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01)) - sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01)) - sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02)) - sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02)) - sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04)) - sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04)) - sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08)) - sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08)) - sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10)) - sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10)) - sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20)) - sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20)) - sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40)) - num_rounds = _aes.ROUNDS_256 - case: - panic("crypto/aes: invalid AES key size") - } - for i in 0 ..= num_rounds { - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i]) - } - - // Compute the decryption keys. GCM and CTR do not need this, however - // ECB, CBC, OCB3, etc do. 
- derive_dec_keys(ctx, &sks, num_rounds) - - ctx._num_rounds = num_rounds - - crypto.zero_explicit(&sks, size_of(sks)) -} diff --git a/core/crypto/aes/aes_gcm_hw_intel.odin b/core/crypto/aes/aes_gcm_hw_intel.odin index c6e564773..75c97be80 100644 --- a/core/crypto/aes/aes_gcm_hw_intel.odin +++ b/core/crypto/aes/aes_gcm_hw_intel.odin @@ -4,7 +4,7 @@ package aes import "base:intrinsics" import "core:crypto" import "core:crypto/_aes" -import "core:crypto/_aes/hw_intel" +import aes_hw "core:crypto/_aes/hw" import "core:encoding/endian" import "core:simd/x86" @@ -17,7 +17,7 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [ init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) // Note: Our GHASH implementation handles appending padding. - hw_intel.ghash(s[:], h[:], aad) + aes_hw.ghash(s[:], h[:], aad) gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true) final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext)) copy(tag, s[:]) @@ -35,7 +35,7 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: s: [_aes.GHASH_TAG_SIZE]byte init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) - hw_intel.ghash(s[:], h[:], aad) + aes_hw.ghash(s[:], h[:], aad) gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false) final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext)) @@ -71,11 +71,11 @@ init_ghash_hw :: proc( } else { // If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV), // and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64). - hw_intel.ghash(j0[:], h[:], iv) + aes_hw.ghash(j0[:], h[:], iv) tmp: [_aes.GHASH_BLOCK_SIZE]byte endian.unchecked_put_u64be(tmp[8:], u64(l) * 8) - hw_intel.ghash(j0[:], h[:], tmp[:]) + aes_hw.ghash(j0[:], h[:], tmp[:]) } // ECB encrypt j0, so that we can just XOR with the tag. 
@@ -94,7 +94,7 @@ final_ghash_hw :: proc( endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8) endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8) - hw_intel.ghash(s[:], h[:], blk[:]) + aes_hw.ghash(s[:], h[:], blk[:]) j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0)) s_vec := intrinsics.unaligned_load((^x86.__m128i)(s)) s_vec = x86._mm_xor_si128(s_vec, j0_vec) @@ -131,7 +131,7 @@ gctr_hw :: proc( nr_blocks := len(src) / BLOCK_SIZE for nr_blocks >= CTR_STRIDE_HW { if !is_seal { - hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) } #unroll for i in 0 ..< CTR_STRIDE_HW { @@ -174,7 +174,7 @@ gctr_hw :: proc( xor_blocks_hw(dst, src, blks[:]) if is_seal { - hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) + aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) } src = src[CTR_STRIDE_BYTES_HW:] @@ -186,7 +186,7 @@ gctr_hw :: proc( for n := len(src); n > 0; { l := min(n, BLOCK_SIZE) if !is_seal { - hw_intel.ghash(s[:], h[:], src[:l]) + aes_hw.ghash(s[:], h[:], src[:l]) } blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr) @@ -219,7 +219,7 @@ gctr_hw :: proc( copy(dst, blk[:l]) } if is_seal { - hw_intel.ghash(s[:], h[:], dst[:l]) + aes_hw.ghash(s[:], h[:], dst[:l]) } dst = dst[l:] diff --git a/core/crypto/aes/aes_impl_hw_intel.odin b/core/crypto/aes/aes_impl_hw_intel.odin index 96a1811f3..fe3849eda 100644 --- a/core/crypto/aes/aes_impl_hw_intel.odin +++ b/core/crypto/aes/aes_impl_hw_intel.odin @@ -1,18 +1,18 @@ #+build amd64 package aes -import "core:crypto/_aes/hw_intel" +import aes_hw "core:crypto/_aes/hw" // is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES // is supported. 
is_hardware_accelerated :: proc "contextless" () -> bool { - return hw_intel.is_supported() + return aes_hw.is_supported() } @(private) -Context_Impl_Hardware :: hw_intel.Context +Context_Impl_Hardware :: aes_hw.Context @(private, enable_target_feature = "sse2,aes") init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) { - hw_intel.init(ctx, key) + aes_hw.init(ctx, key) } From df1a9661008607f0cb3612cd0b3d78ad0787ceb3 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Thu, 12 Mar 2026 22:06:22 +0900 Subject: [PATCH 10/21] core/crypto/aegis: Migrate to generic SIMD + HW AES --- core/crypto/aegis/aegis_impl_hw.odin | 397 +++++++++++++++++++++ core/crypto/aegis/aegis_impl_hw_gen.odin | 2 + core/crypto/aegis/aegis_impl_hw_intel.odin | 389 -------------------- 3 files changed, 399 insertions(+), 389 deletions(-) create mode 100644 core/crypto/aegis/aegis_impl_hw.odin delete mode 100644 core/crypto/aegis/aegis_impl_hw_intel.odin diff --git a/core/crypto/aegis/aegis_impl_hw.odin b/core/crypto/aegis/aegis_impl_hw.odin new file mode 100644 index 000000000..4a39b341c --- /dev/null +++ b/core/crypto/aegis/aegis_impl_hw.odin @@ -0,0 +1,397 @@ +#+build amd64,arm64,arm32 +package aegis + +import "base:intrinsics" +import "core:crypto" +import aes_hw "core:crypto/_aes/hw" +import "core:encoding/endian" +import "core:simd" + +@(private) +State_HW :: struct { + s0: simd.u8x16, + s1: simd.u8x16, + s2: simd.u8x16, + s3: simd.u8x16, + s4: simd.u8x16, + s5: simd.u8x16, + s6: simd.u8x16, + s7: simd.u8x16, + rate: int, +} + +when ODIN_ARCH == .amd64 { + @(private="file") + TARGET_FEATURES :: "sse2,aes" +} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + @(private="file") + TARGET_FEATURES :: "neon,aes" +} + +// is_hardware_accelerated returns true if and only if (⟺) hardware +// accelerated AEGIS is supported. 
+is_hardware_accelerated :: proc "contextless" () -> bool { + return aes_hw.is_supported() +} + +@(private, enable_target_feature = TARGET_FEATURES) +init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) { + switch ctx._key_len { + case KEY_SIZE_128L: + key := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0])) + iv := intrinsics.unaligned_load((^simd.u8x16)(raw_data(iv))) + + st.s0 = simd.bit_xor(key, iv) + st.s1 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0])) + st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0])) + st.s3 = st.s1 + st.s4 = st.s0 + st.s5 = simd.bit_xor(key, st.s2) // key ^ C0 + st.s6 = simd.bit_xor(key, st.s1) // key ^ C1 + st.s7 = st.s5 + st.rate = _RATE_128L + + for _ in 0 ..< 10 { + update_hw_128l(st, iv, key) + } + case KEY_SIZE_256: + k0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0])) + k1 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[16])) + n0 := intrinsics.unaligned_load((^simd.u8x16)(&iv[0])) + n1 := intrinsics.unaligned_load((^simd.u8x16)(&iv[16])) + + st.s0 = simd.bit_xor(k0, n0) + st.s1 = simd.bit_xor(k1, n1) + st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0])) + st.s3 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0])) + st.s4 = simd.bit_xor(k0, st.s3) // k0 ^ C0 + st.s5 = simd.bit_xor(k1, st.s2) // k1 ^ C1 + st.rate = _RATE_256 + + u0, u1 := st.s0, st.s1 + for _ in 0 ..< 4 { + update_hw_256(st, k0) + update_hw_256(st, k1) + update_hw_256(st, u0) + update_hw_256(st, u1) + } + } +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: simd.u8x16) { + s0_ := aes_hw.aesenc(st.s7, simd.bit_xor(st.s0, m0)) + s1_ := aes_hw.aesenc(st.s0, st.s1) + s2_ := aes_hw.aesenc(st.s1, st.s2) + s3_ := aes_hw.aesenc(st.s2, st.s3) + s4_ := aes_hw.aesenc(st.s3, simd.bit_xor(st.s4, m1)) + s5_ := aes_hw.aesenc(st.s4, st.s5) + s6_ := aes_hw.aesenc(st.s5, st.s6) + s7_ := aes_hw.aesenc(st.s6, st.s7) + st.s0, st.s1, st.s2, 
st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_ +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: simd.u8x16) { + s0_ := aes_hw.aesenc(st.s5, simd.bit_xor(st.s0, m)) + s1_ := aes_hw.aesenc(st.s0, st.s1) + s2_ := aes_hw.aesenc(st.s1, st.s2) + s3_ := aes_hw.aesenc(st.s2, st.s3) + s4_ := aes_hw.aesenc(st.s3, st.s4) + s5_ := aes_hw.aesenc(st.s4, st.s5) + st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_ +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { + t0 := intrinsics.unaligned_load((^simd.u8x16)(&ai[0])) + t1 := intrinsics.unaligned_load((^simd.u8x16)(&ai[16])) + update_hw_128l(st, t0, t1) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { + m := intrinsics.unaligned_load((^simd.u8x16)(&ai[0])) + update_hw_256(st, m) +} + +@(private, enable_target_feature = TARGET_FEATURES) +absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check { + ai, l := aad, len(aad) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + absorb_hw_128l(st, ai) + ai = ai[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + absorb_hw_256(st, ai) + + ai = ai[_RATE_256:] + l -= _RATE_256 + } + } + + // Pad out the remainder with `0`s till it is rate sized. + if l > 0 { + tmp: [_RATE_MAX]byte // AAD is not confidential. 
+ copy(tmp[:], ai) + switch st.rate { + case _RATE_128L: + absorb_hw_128l(st, tmp[:]) + case _RATE_256: + absorb_hw_256(st, tmp[:]) + } + } +} + +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) +z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (simd.u8x16, simd.u8x16) { + z0 := simd.bit_xor( + st.s6, + simd.bit_xor( + st.s1, + simd.bit_and(st.s2, st.s3), + ), + ) + z1 := simd.bit_xor( + st.s2, + simd.bit_xor( + st.s5, + simd.bit_and(st.s6, st.s7), + ), + ) + return z0, z1 +} + +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) +z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> simd.u8x16 { + return simd.bit_xor( + st.s1, + simd.bit_xor( + st.s4, + simd.bit_xor( + st.s5, + simd.bit_and(st.s2, st.s3), + ), + ), + ) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { + z0, z1 := z_hw_128l(st) + + t0 := intrinsics.unaligned_load((^simd.u8x16)(&xi[0])) + t1 := intrinsics.unaligned_load((^simd.u8x16)(&xi[16])) + update_hw_128l(st, t0, t1) + + out0 := simd.bit_xor(t0, z0) + out1 := simd.bit_xor(t1, z1) + intrinsics.unaligned_store((^simd.u8x16)(&ci[0]), out0) + intrinsics.unaligned_store((^simd.u8x16)(&ci[16]), out1) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { + z := z_hw_256(st) + + xi_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(xi))) + update_hw_256(st, xi_) + + ci_ := simd.bit_xor(xi_, z) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(ci)), ci_) +} + +@(private, enable_target_feature = TARGET_FEATURES) +enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { + ci, xi, l := dst, src, len(src) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + enc_hw_128l(st, ci, xi) + ci = ci[_RATE_128L:] + xi = 
xi[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + enc_hw_256(st, ci, xi) + ci = ci[_RATE_256:] + xi = xi[_RATE_256:] + l -= _RATE_256 + } + } + + // Pad out the remainder with `0`s till it is rate sized. + if l > 0 { + tmp: [_RATE_MAX]byte // Ciphertext is not confidential. + copy(tmp[:], xi) + switch st.rate { + case _RATE_128L: + enc_hw_128l(st, tmp[:], tmp[:]) + case _RATE_256: + enc_hw_256(st, tmp[:], tmp[:]) + } + copy(ci, tmp[:l]) + } +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { + z0, z1 := z_hw_128l(st) + + t0 := intrinsics.unaligned_load((^simd.u8x16)(&ci[0])) + t1 := intrinsics.unaligned_load((^simd.u8x16)(&ci[16])) + out0 := simd.bit_xor(t0, z0) + out1 := simd.bit_xor(t1, z1) + + update_hw_128l(st, out0, out1) + intrinsics.unaligned_store((^simd.u8x16)(&xi[0]), out0) + intrinsics.unaligned_store((^simd.u8x16)(&xi[16]), out1) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { + z := z_hw_256(st) + + ci_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(ci))) + xi_ := simd.bit_xor(ci_, z) + + update_hw_256(st, xi_) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(xi)), xi_) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { + tmp: [_RATE_128L]byte + defer crypto.zero_explicit(&tmp, size_of(tmp)) + + z0, z1 := z_hw_128l(st) + copy(tmp[:], cn) + + t0 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) + t1 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[16])) + out0 := simd.bit_xor(t0, z0) + out1 := simd.bit_xor(t1, z1) + + intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), out0) + intrinsics.unaligned_store((^simd.u8x16)(&tmp[16]), out1) + copy(xn, tmp[:]) + + for off 
:= len(xn); off < _RATE_128L; off += 1 { + tmp[off] = 0 + } + out0 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) // v0 + out1 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[16])) // v1 + update_hw_128l(st, out0, out1) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { + tmp: [_RATE_256]byte + defer crypto.zero_explicit(&tmp, size_of(tmp)) + + z := z_hw_256(st) + copy(tmp[:], cn) + + cn_ := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) + xn_ := simd.bit_xor(cn_, z) + + intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), xn_) + copy(xn, tmp[:]) + + for off := len(xn); off < _RATE_256; off += 1 { + tmp[off] = 0 + } + xn_ = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) + update_hw_256(st, xn_) +} + +@(private, enable_target_feature = TARGET_FEATURES) +dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { + xi, ci, l := dst, src, len(src) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + dec_hw_128l(st, xi, ci) + xi = xi[_RATE_128L:] + ci = ci[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + dec_hw_256(st, xi, ci) + xi = xi[_RATE_256:] + ci = ci[_RATE_256:] + l -= _RATE_256 + } + } + + // Process the remainder. 
+ if l > 0 { + switch st.rate { + case _RATE_128L: + dec_partial_hw_128l(st, xi, ci) + case _RATE_256: + dec_partial_hw_256(st, xi, ci) + } + } +} + +@(private, enable_target_feature = TARGET_FEATURES) +finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) { + tmp: [16]byte + endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8) + endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8) + + t := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) + + t0, t1: simd.u8x16 = ---, --- + switch st.rate { + case _RATE_128L: + t = simd.bit_xor(st.s2, t) + for _ in 0 ..< 7 { + update_hw_128l(st, t, t) + } + + t0 = simd.bit_xor(st.s0, st.s1) + t0 = simd.bit_xor(t0, st.s2) + t0 = simd.bit_xor(t0, st.s3) + + t1 = simd.bit_xor(st.s4, st.s5) + t1 = simd.bit_xor(t1, st.s6) + if len(tag) == TAG_SIZE_256 { + t1 = simd.bit_xor(t1, st.s7) + } + case _RATE_256: + t = simd.bit_xor(st.s3, t) + for _ in 0 ..< 7 { + update_hw_256(st, t) + } + + t0 = simd.bit_xor(st.s0, st.s1) + t0 = simd.bit_xor(t0, st.s2) + + t1 = simd.bit_xor(st.s3, st.s4) + t1 = simd.bit_xor(t1, st.s5) + } + switch len(tag) { + case TAG_SIZE_128: + t0 = simd.bit_xor(t0, t1) + intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0) + case TAG_SIZE_256: + intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0) + intrinsics.unaligned_store((^simd.u8x16)(&tag[16]), t1) + } +} + +@(private) +reset_state_hw :: proc "contextless" (st: ^State_HW) { + crypto.zero_explicit(st, size_of(st^)) +} diff --git a/core/crypto/aegis/aegis_impl_hw_gen.odin b/core/crypto/aegis/aegis_impl_hw_gen.odin index db38e71bc..8f8b4c5da 100644 --- a/core/crypto/aegis/aegis_impl_hw_gen.odin +++ b/core/crypto/aegis/aegis_impl_hw_gen.odin @@ -1,4 +1,6 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package aegis @(private = "file") diff --git a/core/crypto/aegis/aegis_impl_hw_intel.odin b/core/crypto/aegis/aegis_impl_hw_intel.odin deleted file mode 100644 index 8b767908c..000000000 --- 
a/core/crypto/aegis/aegis_impl_hw_intel.odin +++ /dev/null @@ -1,389 +0,0 @@ -#+build amd64 -package aegis - -import "base:intrinsics" -import "core:crypto" -import "core:crypto/aes" -import "core:encoding/endian" -import "core:simd/x86" - -@(private) -State_HW :: struct { - s0: x86.__m128i, - s1: x86.__m128i, - s2: x86.__m128i, - s3: x86.__m128i, - s4: x86.__m128i, - s5: x86.__m128i, - s6: x86.__m128i, - s7: x86.__m128i, - rate: int, -} - -// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS -// is supported. -is_hardware_accelerated :: proc "contextless" () -> bool { - return aes.is_hardware_accelerated() -} - -@(private, enable_target_feature = "sse2,aes") -init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) { - switch ctx._key_len { - case KEY_SIZE_128L: - key := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0])) - iv := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv))) - - st.s0 = x86._mm_xor_si128(key, iv) - st.s1 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0])) - st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0])) - st.s3 = st.s1 - st.s4 = st.s0 - st.s5 = x86._mm_xor_si128(key, st.s2) // key ^ C0 - st.s6 = x86._mm_xor_si128(key, st.s1) // key ^ C1 - st.s7 = st.s5 - st.rate = _RATE_128L - - for _ in 0 ..< 10 { - update_hw_128l(st, iv, key) - } - case KEY_SIZE_256: - k0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0])) - k1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16])) - n0 := intrinsics.unaligned_load((^x86.__m128i)(&iv[0])) - n1 := intrinsics.unaligned_load((^x86.__m128i)(&iv[16])) - - st.s0 = x86._mm_xor_si128(k0, n0) - st.s1 = x86._mm_xor_si128(k1, n1) - st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0])) - st.s3 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0])) - st.s4 = x86._mm_xor_si128(k0, st.s3) // k0 ^ C0 - st.s5 = x86._mm_xor_si128(k1, st.s2) // k1 ^ C1 - st.rate = _RATE_256 - - u0, u1 := st.s0, st.s1 - for _ in 0 ..< 4 { - update_hw_256(st, 
k0) - update_hw_256(st, k1) - update_hw_256(st, u0) - update_hw_256(st, u1) - } - } -} - -@(private = "file", enable_target_feature = "sse2,aes") -update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) { - s0_ := x86._mm_aesenc_si128(st.s7, x86._mm_xor_si128(st.s0, m0)) - s1_ := x86._mm_aesenc_si128(st.s0, st.s1) - s2_ := x86._mm_aesenc_si128(st.s1, st.s2) - s3_ := x86._mm_aesenc_si128(st.s2, st.s3) - s4_ := x86._mm_aesenc_si128(st.s3, x86._mm_xor_si128(st.s4, m1)) - s5_ := x86._mm_aesenc_si128(st.s4, st.s5) - s6_ := x86._mm_aesenc_si128(st.s5, st.s6) - s7_ := x86._mm_aesenc_si128(st.s6, st.s7) - st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_ -} - -@(private = "file", enable_target_feature = "sse2,aes") -update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) { - s0_ := x86._mm_aesenc_si128(st.s5, x86._mm_xor_si128(st.s0, m)) - s1_ := x86._mm_aesenc_si128(st.s0, st.s1) - s2_ := x86._mm_aesenc_si128(st.s1, st.s2) - s3_ := x86._mm_aesenc_si128(st.s2, st.s3) - s4_ := x86._mm_aesenc_si128(st.s3, st.s4) - s5_ := x86._mm_aesenc_si128(st.s4, st.s5) - st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_ -} - -@(private = "file", enable_target_feature = "sse2,aes") -absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { - t0 := intrinsics.unaligned_load((^x86.__m128i)(&ai[0])) - t1 := intrinsics.unaligned_load((^x86.__m128i)(&ai[16])) - update_hw_128l(st, t0, t1) -} - -@(private = "file", enable_target_feature = "sse2,aes") -absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { - m := intrinsics.unaligned_load((^x86.__m128i)(&ai[0])) - update_hw_256(st, m) -} - -@(private, enable_target_feature = "sse2,aes") -absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check { - ai, l := aad, len(aad) - - switch st.rate { - case _RATE_128L: - for l >= _RATE_128L { - absorb_hw_128l(st, 
ai) - ai = ai[_RATE_128L:] - l -= _RATE_128L - } - case _RATE_256: - for l >= _RATE_256 { - absorb_hw_256(st, ai) - - ai = ai[_RATE_256:] - l -= _RATE_256 - } - } - - // Pad out the remainder with `0`s till it is rate sized. - if l > 0 { - tmp: [_RATE_MAX]byte // AAD is not confidential. - copy(tmp[:], ai) - switch st.rate { - case _RATE_128L: - absorb_hw_128l(st, tmp[:]) - case _RATE_256: - absorb_hw_256(st, tmp[:]) - } - } -} - -@(private = "file", enable_target_feature = "sse2", require_results) -z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) { - z0 := x86._mm_xor_si128( - st.s6, - x86._mm_xor_si128( - st.s1, - x86._mm_and_si128(st.s2, st.s3), - ), - ) - z1 := x86._mm_xor_si128( - st.s2, - x86._mm_xor_si128( - st.s5, - x86._mm_and_si128(st.s6, st.s7), - ), - ) - return z0, z1 -} - -@(private = "file", enable_target_feature = "sse2", require_results) -z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i { - return x86._mm_xor_si128( - st.s1, - x86._mm_xor_si128( - st.s4, - x86._mm_xor_si128( - st.s5, - x86._mm_and_si128(st.s2, st.s3), - ), - ), - ) -} - -@(private = "file", enable_target_feature = "sse2,aes") -enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { - z0, z1 := z_hw_128l(st) - - t0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0])) - t1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16])) - update_hw_128l(st, t0, t1) - - out0 := x86._mm_xor_si128(t0, z0) - out1 := x86._mm_xor_si128(t1, z1) - intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), out0) - intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), out1) -} - -@(private = "file", enable_target_feature = "sse2,aes") -enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { - z := z_hw_256(st) - - xi_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi))) - update_hw_256(st, xi_) - - ci_ := x86._mm_xor_si128(xi_, z) - 
intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ci_) -} - -@(private, enable_target_feature = "sse2,aes") -enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { - ci, xi, l := dst, src, len(src) - - switch st.rate { - case _RATE_128L: - for l >= _RATE_128L { - enc_hw_128l(st, ci, xi) - ci = ci[_RATE_128L:] - xi = xi[_RATE_128L:] - l -= _RATE_128L - } - case _RATE_256: - for l >= _RATE_256 { - enc_hw_256(st, ci, xi) - ci = ci[_RATE_256:] - xi = xi[_RATE_256:] - l -= _RATE_256 - } - } - - // Pad out the remainder with `0`s till it is rate sized. - if l > 0 { - tmp: [_RATE_MAX]byte // Ciphertext is not confidential. - copy(tmp[:], xi) - switch st.rate { - case _RATE_128L: - enc_hw_128l(st, tmp[:], tmp[:]) - case _RATE_256: - enc_hw_256(st, tmp[:], tmp[:]) - } - copy(ci, tmp[:l]) - } -} - -@(private = "file", enable_target_feature = "sse2,aes") -dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { - z0, z1 := z_hw_128l(st) - - t0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0])) - t1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16])) - out0 := x86._mm_xor_si128(t0, z0) - out1 := x86._mm_xor_si128(t1, z1) - - update_hw_128l(st, out0, out1) - intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), out0) - intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), out1) -} - -@(private = "file", enable_target_feature = "sse2,aes") -dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { - z := z_hw_256(st) - - ci_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci))) - xi_ := x86._mm_xor_si128(ci_, z) - - update_hw_256(st, xi_) - intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), xi_) -} - -@(private = "file", enable_target_feature = "sse2,aes") -dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { - tmp: [_RATE_128L]byte - defer crypto.zero_explicit(&tmp, size_of(tmp)) - - z0, z1 := 
z_hw_128l(st) - copy(tmp[:], cn) - - t0 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) - t1 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) - out0 := x86._mm_xor_si128(t0, z0) - out1 := x86._mm_xor_si128(t1, z1) - - intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), out0) - intrinsics.unaligned_store((^x86.__m128i)(&tmp[16]), out1) - copy(xn, tmp[:]) - - for off := len(xn); off < _RATE_128L; off += 1 { - tmp[off] = 0 - } - out0 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) // v0 - out1 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) // v1 - update_hw_128l(st, out0, out1) -} - -@(private = "file", enable_target_feature = "sse2,aes") -dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { - tmp: [_RATE_256]byte - defer crypto.zero_explicit(&tmp, size_of(tmp)) - - z := z_hw_256(st) - copy(tmp[:], cn) - - cn_ := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) - xn_ := x86._mm_xor_si128(cn_, z) - - intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), xn_) - copy(xn, tmp[:]) - - for off := len(xn); off < _RATE_256; off += 1 { - tmp[off] = 0 - } - xn_ = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) - update_hw_256(st, xn_) -} - -@(private, enable_target_feature = "sse2,aes") -dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { - xi, ci, l := dst, src, len(src) - - switch st.rate { - case _RATE_128L: - for l >= _RATE_128L { - dec_hw_128l(st, xi, ci) - xi = xi[_RATE_128L:] - ci = ci[_RATE_128L:] - l -= _RATE_128L - } - case _RATE_256: - for l >= _RATE_256 { - dec_hw_256(st, xi, ci) - xi = xi[_RATE_256:] - ci = ci[_RATE_256:] - l -= _RATE_256 - } - } - - // Process the remainder. 
- if l > 0 { - switch st.rate { - case _RATE_128L: - dec_partial_hw_128l(st, xi, ci) - case _RATE_256: - dec_partial_hw_256(st, xi, ci) - } - } -} - -@(private, enable_target_feature = "sse2,aes") -finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) { - tmp: [16]byte - endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8) - endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8) - - t := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) - - t0, t1: x86.__m128i = ---, --- - switch st.rate { - case _RATE_128L: - t = x86._mm_xor_si128(st.s2, t) - for _ in 0 ..< 7 { - update_hw_128l(st, t, t) - } - - t0 = x86._mm_xor_si128(st.s0, st.s1) - t0 = x86._mm_xor_si128(t0, st.s2) - t0 = x86._mm_xor_si128(t0, st.s3) - - t1 = x86._mm_xor_si128(st.s4, st.s5) - t1 = x86._mm_xor_si128(t1, st.s6) - if len(tag) == TAG_SIZE_256 { - t1 = x86._mm_xor_si128(t1, st.s7) - } - case _RATE_256: - t = x86._mm_xor_si128(st.s3, t) - for _ in 0 ..< 7 { - update_hw_256(st, t) - } - - t0 = x86._mm_xor_si128(st.s0, st.s1) - t0 = x86._mm_xor_si128(t0, st.s2) - - t1 = x86._mm_xor_si128(st.s3, st.s4) - t1 = x86._mm_xor_si128(t1, st.s5) - } - switch len(tag) { - case TAG_SIZE_128: - t0 = x86._mm_xor_si128(t0, t1) - intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0) - case TAG_SIZE_256: - intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0) - intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), t1) - } -} - -@(private) -reset_state_hw :: proc "contextless" (st: ^State_HW) { - crypto.zero_explicit(st, size_of(st^)) -} From e09923f585dc11a644a58903044dcec3574c89c7 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Fri, 13 Mar 2026 02:04:41 +0900 Subject: [PATCH 11/21] core/crypto/deoxysii: Migrate to generic SIMD + HW AES --- ...pl_hw_intel.odin => deoxysii_impl_hw.odin} | 241 ++++++++++-------- .../crypto/deoxysii/deoxysii_impl_hw_gen.odin | 2 + 2 files changed, 138 insertions(+), 105 deletions(-) rename core/crypto/deoxysii/{deoxysii_impl_hw_intel.odin => 
deoxysii_impl_hw.odin} (58%) diff --git a/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin b/core/crypto/deoxysii/deoxysii_impl_hw.odin similarity index 58% rename from core/crypto/deoxysii/deoxysii_impl_hw_intel.odin rename to core/crypto/deoxysii/deoxysii_impl_hw.odin index 88c569d53..47f9ab55f 100644 --- a/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin +++ b/core/crypto/deoxysii/deoxysii_impl_hw.odin @@ -1,152 +1,183 @@ -#+build amd64 +#+build amd64,arm64,arm32 package deoxysii import "base:intrinsics" import "core:crypto" -import "core:crypto/aes" +import aes_hw "core:crypto/_aes/hw" import "core:simd" -import "core:simd/x86" // This processes a maximum of 4 blocks at a time, as that is suitable // for most current hardware that doesn't say "Xeon". +// +// TODO/perf: ARM should be able to do 8 at a time. + +when ODIN_ARCH == .amd64 { + @(private="file") + TARGET_FEATURES :: "sse2,ssse3,aes" +} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + @(private="file") + TARGET_FEATURES :: "neon,aes" +} @(private = "file") -_BIT_ENC :: x86.__m128i{0x80, 0} +_BIT_ENC :: simd.u8x16{0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} @(private = "file") -_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0} +_PREFIX_AD_BLOCK :: simd.u8x16{ + PREFIX_AD_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} @(private = "file") -_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0} +_PREFIX_AD_FINAL :: simd.u8x16{ + PREFIX_AD_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} @(private = "file") -_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0} +_PREFIX_MSG_BLOCK :: simd.u8x16{ + PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} @(private = "file") -_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0} +_PREFIX_MSG_FINAL :: simd.u8x16{ + PREFIX_MSG_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} // 
is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II // is supported. is_hardware_accelerated :: proc "contextless" () -> bool { - return aes.is_hardware_accelerated() + return aes_hw.is_supported() } -@(private = "file", enable_target_feature = "sse4.1", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) auth_tweak :: #force_inline proc "contextless" ( - prefix: x86.__m128i, + prefix: simd.u8x16, block_nr: int, -) -> x86.__m128i { - return x86._mm_insert_epi64(prefix, i64(intrinsics.byte_swap(u64(block_nr))), 1) -} +) -> simd.u8x16 { + when ODIN_ENDIAN == .Little { + block_nr_u64 := intrinsics.byte_swap(u64(block_nr)) + } else { + block_nr_u64 := u64(block_nr) + } -@(private = "file", enable_target_feature = "sse2", require_results) -enc_tweak :: #force_inline proc "contextless" ( - tag: x86.__m128i, - block_nr: int, -) -> x86.__m128i { - return x86._mm_xor_si128( - x86._mm_or_si128(tag, _BIT_ENC), - x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))}, + return simd.bit_or( + prefix, + transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}), ) } -@(private = "file", enable_target_feature = "ssse3", require_results) -h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i { - return transmute(x86.__m128i)h(transmute(simd.u8x16)tk1) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) +enc_tweak :: #force_inline proc "contextless" ( + tag: simd.u8x16, + block_nr: int, +) -> simd.u8x16 { + when ODIN_ENDIAN == .Little { + block_nr_u64 := intrinsics.byte_swap(u64(block_nr)) + } else { + block_nr_u64 := u64(block_nr) + } + + return simd.bit_xor( + simd.bit_or(tag, _BIT_ENC), + transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}), + ) } -@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_x4 :: #force_inline proc "contextless" ( ctx: ^Context, - 
s_0, s_1, s_2, s_3: x86.__m128i, - tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i, -) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check { + s_0, s_1, s_2, s_3: simd.u8x16, + tweak_0, tweak_1, tweak_2, tweak_3: simd.u8x16, +) -> (simd.u8x16, simd.u8x16, simd.u8x16, simd.u8x16) #no_bounds_check { s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3 tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3 - sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0])) - stk_0 := x86._mm_xor_si128(tk1_0, sk) - stk_1 := x86._mm_xor_si128(tk1_1, sk) - stk_2 := x86._mm_xor_si128(tk1_2, sk) - stk_3 := x86._mm_xor_si128(tk1_3, sk) + sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0])) + stk_0 := simd.bit_xor(tk1_0, sk) + stk_1 := simd.bit_xor(tk1_1, sk) + stk_2 := simd.bit_xor(tk1_2, sk) + stk_3 := simd.bit_xor(tk1_3, sk) - s_0 = x86._mm_xor_si128(s_0, stk_0) - s_1 = x86._mm_xor_si128(s_1, stk_1) - s_2 = x86._mm_xor_si128(s_2, stk_2) - s_3 = x86._mm_xor_si128(s_3, stk_3) + s_0 = simd.bit_xor(s_0, stk_0) + s_1 = simd.bit_xor(s_1, stk_1) + s_2 = simd.bit_xor(s_2, stk_2) + s_3 = simd.bit_xor(s_3, stk_3) for i in 1 ..= BC_ROUNDS { - sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i])) + sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i])) - tk1_0 = h_(tk1_0) - tk1_1 = h_(tk1_1) - tk1_2 = h_(tk1_2) - tk1_3 = h_(tk1_3) + tk1_0 = h(tk1_0) + tk1_1 = h(tk1_1) + tk1_2 = h(tk1_2) + tk1_3 = h(tk1_3) - stk_0 = x86._mm_xor_si128(tk1_0, sk) - stk_1 = x86._mm_xor_si128(tk1_1, sk) - stk_2 = x86._mm_xor_si128(tk1_2, sk) - stk_3 = x86._mm_xor_si128(tk1_3, sk) + stk_0 = simd.bit_xor(tk1_0, sk) + stk_1 = simd.bit_xor(tk1_1, sk) + stk_2 = simd.bit_xor(tk1_2, sk) + stk_3 = simd.bit_xor(tk1_3, sk) - s_0 = x86._mm_aesenc_si128(s_0, stk_0) - s_1 = x86._mm_aesenc_si128(s_1, stk_1) - s_2 = x86._mm_aesenc_si128(s_2, stk_2) - s_3 = x86._mm_aesenc_si128(s_3, stk_3) + s_0 = aes_hw.aesenc(s_0, stk_0) + s_1 = aes_hw.aesenc(s_1, stk_1) + s_2 = 
aes_hw.aesenc(s_2, stk_2) + s_3 = aes_hw.aesenc(s_3, stk_3) } return s_0, s_1, s_2, s_3 } -@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_x1 :: #force_inline proc "contextless" ( ctx: ^Context, - s: x86.__m128i, - tweak: x86.__m128i, -) -> x86.__m128i #no_bounds_check { + s: simd.u8x16, + tweak: simd.u8x16, +) -> simd.u8x16 #no_bounds_check { s, tk1 := s, tweak - sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0])) - stk := x86._mm_xor_si128(tk1, sk) + sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0])) + stk := simd.bit_xor(tk1, sk) - s = x86._mm_xor_si128(s, stk) + s = simd.bit_xor(s, stk) for i in 1 ..= BC_ROUNDS { - sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i])) + sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i])) - tk1 = h_(tk1) + tk1 = h(tk1) - stk = x86._mm_xor_si128(tk1, sk) + stk = simd.bit_xor(tk1, sk) - s = x86._mm_aesenc_si128(s, stk) + s = aes_hw.aesenc(s, stk) } return s } -@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_absorb :: proc "contextless" ( ctx: ^Context, - tag: x86.__m128i, + tag: simd.u8x16, src: []byte, - tweak_prefix: x86.__m128i, + tweak_prefix: simd.u8x16, stk_block_nr: int, -) -> (x86.__m128i, int) #no_bounds_check { +) -> (simd.u8x16, int) #no_bounds_check { src, stk_block_nr, tag := src, stk_block_nr, tag nr_blocks := len(src) / BLOCK_SIZE for nr_blocks >= 4 { d_0, d_1, d_2, d_3 := bc_x4( ctx, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))), - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))), - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))), + 
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))), auth_tweak(tweak_prefix, stk_block_nr), auth_tweak(tweak_prefix, stk_block_nr + 1), auth_tweak(tweak_prefix, stk_block_nr + 2), auth_tweak(tweak_prefix, stk_block_nr + 3), ) - tag = x86._mm_xor_si128(tag, d_0) - tag = x86._mm_xor_si128(tag, d_1) - tag = x86._mm_xor_si128(tag, d_2) - tag = x86._mm_xor_si128(tag, d_3) + tag = simd.bit_xor(tag, d_0) + tag = simd.bit_xor(tag, d_1) + tag = simd.bit_xor(tag, d_2) + tag = simd.bit_xor(tag, d_3) src = src[4*BLOCK_SIZE:] stk_block_nr += 4 @@ -156,11 +187,11 @@ bc_absorb :: proc "contextless" ( for nr_blocks > 0 { d := bc_x1( ctx, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))), auth_tweak(tweak_prefix, stk_block_nr), ) - tag = x86._mm_xor_si128(tag, d) + tag = simd.bit_xor(tag, d) src = src[BLOCK_SIZE:] stk_block_nr += 1 @@ -170,29 +201,29 @@ bc_absorb :: proc "contextless" ( return tag, stk_block_nr } -@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_final :: proc "contextless" ( ctx: ^Context, - tag: x86.__m128i, + tag: simd.u8x16, iv: []byte, -) -> x86.__m128i { +) -> simd.u8x16 { tmp: [BLOCK_SIZE]byte tmp[0] = PREFIX_TAG << PREFIX_SHIFT copy(tmp[1:], iv) - tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp)) + tweak := intrinsics.unaligned_load((^simd.u8x16)(&tmp)) return bc_x1(ctx, tag, tweak) } -@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_encrypt :: proc "contextless" ( ctx: ^Context, dst: []byte, src: []byte, - iv: x86.__m128i, - tweak_tag: x86.__m128i, + iv: simd.u8x16, + tweak_tag: simd.u8x16, 
stk_block_nr: int, ) -> int { dst, src, stk_block_nr := dst, src, stk_block_nr @@ -209,31 +240,31 @@ bc_encrypt :: proc "contextless" ( ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst)), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst)), + simd.bit_xor( d_0, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))), ), ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst[BLOCK_SIZE:])), + simd.bit_xor( d_1, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))), ), ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst[2*BLOCK_SIZE:])), + simd.bit_xor( d_2, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))), ), ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst[3*BLOCK_SIZE:])), + simd.bit_xor( d_3, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))), ), ) @@ -250,10 +281,10 @@ bc_encrypt :: proc "contextless" ( ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst)), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst)), + simd.bit_xor( d, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))), ), ) @@ -269,7 +300,7 @@ bc_encrypt :: proc "contextless" ( e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check { tmp: [BLOCK_SIZE]byte copy(tmp[1:], iv) - iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp))) + iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp))) // 
Algorithm 3 // @@ -282,7 +313,7 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) // if A_∗ != nil then // Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗)) // end - auth: x86.__m128i + auth: simd.u8x16 n: int aad := aad @@ -341,14 +372,14 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) copy(dst[n*BLOCK_SIZE:], m_star[:]) } - intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(tag)), auth) } @(private, require_results) d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { tmp: [BLOCK_SIZE]byte copy(tmp[1:], iv) - iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp))) + iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp))) // Algorithm 4 // @@ -360,7 +391,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte // if C_∗ != nil then // M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N) // end - auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag))) + auth := intrinsics.unaligned_load((^simd.u8x16)(raw_data(tag))) m := ciphertext n := bc_encrypt(ctx, dst, m, iv_, auth, 0) @@ -385,7 +416,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte // if A∗ != nil then // Auth <- Auth ^ EK(0110| | l_a, pad10∗(A_∗)) // end - auth = x86.__m128i{0, 0} + auth = simd.u8x16{} aad := aad auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0) aad = aad[BLOCK_SIZE*n:] @@ -424,7 +455,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte // Tag verification // if tag0 = tag then return (M_1 || ... 
|| M_l || M_∗) // else return false - intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(&tmp)), auth) ok := crypto.compare_constant_time(tmp[:], tag) == 1 crypto.zero_explicit(&tmp, size_of(tmp)) diff --git a/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin index 89dae7229..7f5444535 100644 --- a/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin +++ b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin @@ -1,4 +1,6 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package deoxysii @(private = "file") From 3139b7e755a9cea4af922a0fef47e0a372f1fba6 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Fri, 13 Mar 2026 15:00:21 +0900 Subject: [PATCH 12/21] core/crypto/aes: Migrate to generic SIMD + HW AES --- ...{aes_ctr_hw_intel.odin => aes_ctr_hw.odin} | 62 ++++----- core/crypto/aes/aes_ecb_hw.odin | 59 +++++++++ core/crypto/aes/aes_ecb_hw_intel.odin | 58 --------- core/crypto/aes/aes_gcm.odin | 5 + ...{aes_gcm_hw_intel.odin => aes_gcm_hw.odin} | 122 ++++++++++++------ ...es_impl_hw_intel.odin => aes_impl_hw.odin} | 4 +- core/crypto/aes/aes_impl_hw_gen.odin | 2 + 7 files changed, 184 insertions(+), 128 deletions(-) rename core/crypto/aes/{aes_ctr_hw_intel.odin => aes_ctr_hw.odin} (62%) create mode 100644 core/crypto/aes/aes_ecb_hw.odin delete mode 100644 core/crypto/aes/aes_ecb_hw_intel.odin rename core/crypto/aes/{aes_gcm_hw_intel.odin => aes_gcm_hw.odin} (63%) rename core/crypto/aes/{aes_impl_hw_intel.odin => aes_impl_hw.odin} (81%) diff --git a/core/crypto/aes/aes_ctr_hw_intel.odin b/core/crypto/aes/aes_ctr_hw.odin similarity index 62% rename from core/crypto/aes/aes_ctr_hw_intel.odin rename to core/crypto/aes/aes_ctr_hw.odin index f30122c86..859b63a40 100644 --- a/core/crypto/aes/aes_ctr_hw_intel.odin +++ b/core/crypto/aes/aes_ctr_hw.odin @@ -1,30 +1,32 @@ -#+build amd64 +#+build amd64,arm64,arm32 package aes import "base:intrinsics" import "core:crypto/_aes" 
+import aes_hw "core:crypto/_aes/hw" +import "core:encoding/endian" import "core:math/bits" -import "core:simd/x86" +import "core:simd" @(private) CTR_STRIDE_HW :: 4 @(private) CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE -@(private, enable_target_feature = "sse2,aes") +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check { hw_ctx := ctx._impl.(Context_Impl_Hardware) - sks: [15]x86.__m128i = --- + sks: [15]simd.u8x16 = --- for i in 0 ..= hw_ctx._num_rounds { - sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i])) + sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&hw_ctx._sk_exp_enc[i])) } - hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) { - ret := x86.__m128i{ - i64(intrinsics.byte_swap(hi)), - i64(intrinsics.byte_swap(lo)), - } + hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (simd.u8x16, u64, u64) { + buf: [BLOCK_SIZE]byte = --- + endian.unchecked_put_u64be(buf[0:], hi) + endian.unchecked_put_u64be(buf[8:], lo) + ret := intrinsics.unaligned_load((^simd.u8x16)(&buf)) hi, lo := hi, lo carry: u64 @@ -46,42 +48,42 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b nr_blocks := nr_blocks ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo - blks: [CTR_STRIDE_HW]x86.__m128i = --- + blks: [CTR_STRIDE_HW]simd.u8x16 = --- for nr_blocks >= CTR_STRIDE_HW { #unroll for i in 0..< CTR_STRIDE_HW { blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo) } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_xor_si128(blks[i], sks[0]) + blks[i] = simd.bit_xor(blks[i], sks[0]) } #unroll for i in 1 ..= 9 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } switch hw_ctx._num_rounds { case _aes.ROUNDS_128: #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], 
sks[10]) + blks[i] = aes_hw.aesenclast(blks[i], sks[10]) } case _aes.ROUNDS_192: #unroll for i in 10 ..= 11 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12]) + blks[i] = aes_hw.aesenclast(blks[i], sks[12]) } case _aes.ROUNDS_256: #unroll for i in 10 ..= 13 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14]) + blks[i] = aes_hw.aesenclast(blks[i], sks[14]) } } @@ -98,23 +100,23 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b for nr_blocks > 0 { blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo) - blks[0] = x86._mm_xor_si128(blks[0], sks[0]) + blks[0] = simd.bit_xor(blks[0], sks[0]) #unroll for i in 1 ..= 9 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } switch hw_ctx._num_rounds { case _aes.ROUNDS_128: - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10]) + blks[0] = aes_hw.aesenclast(blks[0], sks[10]) case _aes.ROUNDS_192: #unroll for i in 10 ..= 11 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12]) + blks[0] = aes_hw.aesenclast(blks[0], sks[12]) case _aes.ROUNDS_256: #unroll for i in 10 ..= 13 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14]) + blks[0] = aes_hw.aesenclast(blks[0], sks[14]) } xor_blocks_hw(dst, src, blks[:1]) @@ -133,18 +135,18 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b zero_explicit(&sks, size_of(sks)) } -@(private, enable_target_feature = "sse2") -xor_blocks_hw :: 
proc(dst, src: []byte, blocks: []x86.__m128i) { +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) +xor_blocks_hw :: proc(dst, src: []byte, blocks: []simd.u8x16) { #no_bounds_check { if src != nil { for i in 0 ..< len(blocks) { off := i * BLOCK_SIZE - tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:]))) - blocks[i] = x86._mm_xor_si128(blocks[i], tmp) + tmp := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[off:]))) + blocks[i] = simd.bit_xor(blocks[i], tmp) } } for i in 0 ..< len(blocks) { - intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i]) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i]) } } } diff --git a/core/crypto/aes/aes_ecb_hw.odin b/core/crypto/aes/aes_ecb_hw.odin new file mode 100644 index 000000000..87a006d9b --- /dev/null +++ b/core/crypto/aes/aes_ecb_hw.odin @@ -0,0 +1,59 @@ +#+build amd64,arm64,arm32 +package aes + +import "base:intrinsics" +import "core:crypto/_aes" +import aes_hw "core:crypto/_aes/hw" +import "core:simd" + +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) +encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { + blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))) + + blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[0]))) + #unroll for i in 1 ..= 9 { + blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))) + } + switch ctx._num_rounds { + case _aes.ROUNDS_128: + blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[10]))) + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))) + } + blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[12]))) + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + blk = aes_hw.aesenc(blk, 
intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))) + } + blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[14]))) + } + + intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk) +} + +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) +decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { + blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))) + + blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[0]))) + #unroll for i in 1 ..= 9 { + blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))) + } + switch ctx._num_rounds { + case _aes.ROUNDS_128: + blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[10]))) + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))) + } + blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[12]))) + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))) + } + blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[14]))) + } + + intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk) +} diff --git a/core/crypto/aes/aes_ecb_hw_intel.odin b/core/crypto/aes/aes_ecb_hw_intel.odin deleted file mode 100644 index f1d44a25f..000000000 --- a/core/crypto/aes/aes_ecb_hw_intel.odin +++ /dev/null @@ -1,58 +0,0 @@ -#+build amd64 -package aes - -import "base:intrinsics" -import "core:crypto/_aes" -import "core:simd/x86" - -@(private, enable_target_feature = "sse2,aes") -encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { - blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))) - - blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0]))) - #unroll for i in 1 ..= 9 { 
- blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) - } - switch ctx._num_rounds { - case _aes.ROUNDS_128: - blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10]))) - case _aes.ROUNDS_192: - #unroll for i in 10 ..= 11 { - blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) - } - blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12]))) - case _aes.ROUNDS_256: - #unroll for i in 10 ..= 13 { - blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) - } - blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14]))) - } - - intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk) -} - -@(private, enable_target_feature = "sse2,aes") -decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { - blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))) - - blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0]))) - #unroll for i in 1 ..= 9 { - blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) - } - switch ctx._num_rounds { - case _aes.ROUNDS_128: - blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10]))) - case _aes.ROUNDS_192: - #unroll for i in 10 ..= 11 { - blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) - } - blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12]))) - case _aes.ROUNDS_256: - #unroll for i in 10 ..= 13 { - blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) - } - blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14]))) - } - - 
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk) -} diff --git a/core/crypto/aes/aes_gcm.odin b/core/crypto/aes/aes_gcm.odin index 0acd95d2f..531844a32 100644 --- a/core/crypto/aes/aes_gcm.odin +++ b/core/crypto/aes/aes_gcm.odin @@ -4,6 +4,7 @@ import "core:bytes" import "core:crypto" import "core:crypto/_aes" import "core:crypto/_aes/ct64" +import aes_hw "core:crypto/_aes/hw" import "core:encoding/endian" // GCM_IV_SIZE is the default size of the GCM IV in bytes. @@ -26,6 +27,10 @@ Context_GCM :: struct { // init_gcm initializes a Context_GCM with the provided key. init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) { + when aes_hw.HAS_GHASH { + impl := aes_hw.is_ghash_supported() ? impl : .Portable + + } init_impl(&ctx._impl, key, impl) ctx._is_initialized = true } diff --git a/core/crypto/aes/aes_gcm_hw_intel.odin b/core/crypto/aes/aes_gcm_hw.odin similarity index 63% rename from core/crypto/aes/aes_gcm_hw_intel.odin rename to core/crypto/aes/aes_gcm_hw.odin index 75c97be80..13c035a20 100644 --- a/core/crypto/aes/aes_gcm_hw_intel.odin +++ b/core/crypto/aes/aes_gcm_hw.odin @@ -1,12 +1,13 @@ -#+build amd64 +#+build amd64,arm64,arm32 package aes import "base:intrinsics" import "core:crypto" import "core:crypto/_aes" +@(require) import "core:crypto/_aes/ct64" import aes_hw "core:crypto/_aes/hw" import "core:encoding/endian" -import "core:simd/x86" +import "core:simd" @(private) gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) { @@ -17,7 +18,11 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [ init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) // Note: Our GHASH implementation handles appending padding. 
- aes_hw.ghash(s[:], h[:], aad) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], aad) + } else { + ct64.ghash(s[:], h[:], aad) + } gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true) final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext)) copy(tag, s[:]) @@ -35,7 +40,11 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: s: [_aes.GHASH_TAG_SIZE]byte init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) - aes_hw.ghash(s[:], h[:], aad) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], aad) + } else { + ct64.ghash(s[:], h[:], aad) + } gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false) final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext)) @@ -71,18 +80,26 @@ init_ghash_hw :: proc( } else { // If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV), // and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64). - aes_hw.ghash(j0[:], h[:], iv) + when aes_hw.HAS_GHASH { + aes_hw.ghash(j0[:], h[:], iv) + } else { + ct64.ghash(j0[:], h[:], iv) + } tmp: [_aes.GHASH_BLOCK_SIZE]byte endian.unchecked_put_u64be(tmp[8:], u64(l) * 8) - aes_hw.ghash(j0[:], h[:], tmp[:]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(j0[:], h[:], tmp[:]) + } else { + ct64.ghash(j0[:], h[:], tmp[:]) + } } // ECB encrypt j0, so that we can just XOR with the tag. 
encrypt_block_hw(ctx, j0_enc[:], j0[:]) } -@(private = "file", enable_target_feature = "sse2") +@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES) final_ghash_hw :: proc( s: ^[_aes.GHASH_BLOCK_SIZE]byte, h: ^[_aes.GHASH_KEY_SIZE]byte, @@ -94,14 +111,18 @@ final_ghash_hw :: proc( endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8) endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8) - aes_hw.ghash(s[:], h[:], blk[:]) - j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0)) - s_vec := intrinsics.unaligned_load((^x86.__m128i)(s)) - s_vec = x86._mm_xor_si128(s_vec, j0_vec) - intrinsics.unaligned_store((^x86.__m128i)(s), s_vec) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], blk[:]) + } else { + ct64.ghash(s[:], h[:], blk[:]) + } + j0_vec := intrinsics.unaligned_load((^simd.u8x16)(j0)) + s_vec := intrinsics.unaligned_load((^simd.u8x16)(s)) + s_vec = simd.bit_xor(s_vec, j0_vec) + intrinsics.unaligned_store((^simd.u8x16)(s), s_vec) } -@(private = "file", enable_target_feature = "sse2,sse4.1,aes") +@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES) gctr_hw :: proc( ctx: ^Context_Impl_Hardware, dst: []byte, @@ -111,13 +132,13 @@ gctr_hw :: proc( iv: ^[_aes.GHASH_BLOCK_SIZE]byte, is_seal: bool, ) #no_bounds_check { - sks: [15]x86.__m128i = --- + sks: [15]simd.u8x16 = --- for i in 0 ..= ctx._num_rounds { - sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])) + sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])) } // Setup the counter block - ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv)) + ctr_blk := intrinsics.unaligned_load((^simd.u8x16)(iv)) ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1 src, dst := src, dst @@ -127,11 +148,15 @@ gctr_hw :: proc( // This results in an unreadable mess, so we opt for simplicity // as performance is adequate. 
- blks: [CTR_STRIDE_HW]x86.__m128i = --- + blks: [CTR_STRIDE_HW]simd.u8x16 = --- nr_blocks := len(src) / BLOCK_SIZE for nr_blocks >= CTR_STRIDE_HW { if !is_seal { - aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + } else { + ct64.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + } } #unroll for i in 0 ..< CTR_STRIDE_HW { @@ -139,42 +164,46 @@ gctr_hw :: proc( } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_xor_si128(blks[i], sks[0]) + blks[i] = simd.bit_xor(blks[i], sks[0]) } #unroll for i in 1 ..= 9 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } switch ctx._num_rounds { case _aes.ROUNDS_128: #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10]) + blks[i] = aes_hw.aesenclast(blks[i], sks[10]) } case _aes.ROUNDS_192: #unroll for i in 10 ..= 11 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12]) + blks[i] = aes_hw.aesenclast(blks[i], sks[12]) } case _aes.ROUNDS_256: #unroll for i in 10 ..= 13 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14]) + blks[i] = aes_hw.aesenclast(blks[i], sks[14]) } } xor_blocks_hw(dst, src, blks[:]) if is_seal { - aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) + } else { + ct64.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) + } } src = src[CTR_STRIDE_BYTES_HW:] @@ -186,28 +215,32 @@ gctr_hw :: proc( for n := len(src); n > 0; { l := min(n, BLOCK_SIZE) if !is_seal { - 
aes_hw.ghash(s[:], h[:], src[:l]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], src[:l]) + } else { + ct64.ghash(s[:], h[:], src[:l]) + } } blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr) - blks[0] = x86._mm_xor_si128(blks[0], sks[0]) + blks[0] = simd.bit_xor(blks[0], sks[0]) #unroll for i in 1 ..= 9 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } switch ctx._num_rounds { case _aes.ROUNDS_128: - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10]) + blks[0] = aes_hw.aesenclast(blks[0], sks[10]) case _aes.ROUNDS_192: #unroll for i in 10 ..= 11 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12]) + blks[0] = aes_hw.aesenclast(blks[0], sks[12]) case _aes.ROUNDS_256: #unroll for i in 10 ..= 13 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14]) + blks[0] = aes_hw.aesenclast(blks[0], sks[14]) } if l == BLOCK_SIZE { @@ -219,7 +252,11 @@ gctr_hw :: proc( copy(dst, blk[:l]) } if is_seal { - aes_hw.ghash(s[:], h[:], dst[:l]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], dst[:l]) + } else { + ct64.ghash(s[:], h[:], dst[:l]) + } } dst = dst[l:] @@ -235,8 +272,17 @@ gctr_hw :: proc( // the compiler. 
// // src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity` -@(private = "file", enable_target_feature = "sse4.1") -hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) { - ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3) +@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES) +hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^simd.u8x16, ctr: u32) -> (simd.u8x16, u32) { + when ODIN_ENDIAN == .Little { + ctr_be := intrinsics.byte_swap(ctr) + } else { + ctr_be := ctr + } + + ret := transmute(simd.u8x16)( + simd.replace(transmute(simd.u32x4)(src^), 3, ctr_be) + ) + return ret, ctr + 1 } diff --git a/core/crypto/aes/aes_impl_hw_intel.odin b/core/crypto/aes/aes_impl_hw.odin similarity index 81% rename from core/crypto/aes/aes_impl_hw_intel.odin rename to core/crypto/aes/aes_impl_hw.odin index fe3849eda..fe93966f8 100644 --- a/core/crypto/aes/aes_impl_hw_intel.odin +++ b/core/crypto/aes/aes_impl_hw.odin @@ -1,4 +1,4 @@ -#+build amd64 +#+build amd64,arm64,arm32 package aes import aes_hw "core:crypto/_aes/hw" @@ -12,7 +12,7 @@ is_hardware_accelerated :: proc "contextless" () -> bool { @(private) Context_Impl_Hardware :: aes_hw.Context -@(private, enable_target_feature = "sse2,aes") +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) { aes_hw.init(ctx, key) } diff --git a/core/crypto/aes/aes_impl_hw_gen.odin b/core/crypto/aes/aes_impl_hw_gen.odin index 506298751..10b08b7b5 100644 --- a/core/crypto/aes/aes_impl_hw_gen.odin +++ b/core/crypto/aes/aes_impl_hw_gen.odin @@ -1,4 +1,6 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package aes @(private = "file") From af8853473af19ee9c424d0974392b1e26d820441 Mon Sep 17 00:00:00 2001 From: Laytan Date: Tue, 11 Nov 2025 19:14:40 +0100 Subject: [PATCH 13/21] query host features instead of only host cpu for more accurate -microarch:native --- 
src/llvm_backend.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp index 931813f42..600df6a20 100644 --- a/src/llvm_backend.cpp +++ b/src/llvm_backend.cpp @@ -66,6 +66,19 @@ gb_internal String get_final_microarchitecture() { gb_internal String get_default_features() { BuildContext *bc = &build_context; + if (bc->microarch == str_lit("native")) { + String features = make_string_c(LLVMGetHostCPUFeatures()); + + // Update the features string so LLVM uses it later. + if (bc->target_features_string.len > 0) { + bc->target_features_string = concatenate3_strings(permanent_allocator(), features, str_lit(","), bc->target_features_string); + } else { + bc->target_features_string = features; + } + + return features; + } + int off = 0; for (int i = 0; i < bc->metrics.arch; i += 1) { off += target_microarch_counts[i]; From f56ec37d1b7f532a941d683181fe031fa7f74499 Mon Sep 17 00:00:00 2001 From: Laytan Laats Date: Fri, 13 Mar 2026 20:43:03 +0100 Subject: [PATCH 14/21] fix type info of u16 could not be found --- src/checker.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/checker.cpp b/src/checker.cpp index 8acc5f4ae..f0ba94d24 100644 --- a/src/checker.cpp +++ b/src/checker.cpp @@ -2240,7 +2240,7 @@ gb_internal void add_type_info_type_internal(CheckerContext *c, Type *t) { case Type_BitSet: add_type_info_type_internal(c, bt->BitSet.elem); - add_type_info_type_internal(c, bt->BitSet.underlying); + add_type_info_type_internal(c, bit_set_to_int(bt)); break; case Type_Pointer: @@ -2479,7 +2479,7 @@ gb_internal void add_min_dep_type_info(Checker *c, Type *t) { case Type_BitSet: add_min_dep_type_info(c, bt->BitSet.elem); - add_min_dep_type_info(c, bt->BitSet.underlying); + add_min_dep_type_info(c, bit_set_to_int(bt)); break; case Type_Pointer: From adb2890d2bf05b4f3764418cc7bd6a5c16f9e67e Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Sat, 14 Mar 2026 04:12:17 +0900 Subject: [PATCH 15/21] 
core/simd/arm: Formatting fixes (NFC) --- core/simd/arm/aes.odin | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/simd/arm/aes.odin b/core/simd/arm/aes.odin index acafb9f1e..b1f44e52c 100644 --- a/core/simd/arm/aes.odin +++ b/core/simd/arm/aes.odin @@ -1,27 +1,27 @@ #+build arm64,arm32 package simd_arm -@(require_results,enable_target_feature="aes") +@(require_results, enable_target_feature = "aes") vaeseq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t { return _vaeseq_u8(data, key) } -@(require_results,enable_target_feature="aes") +@(require_results, enable_target_feature = "aes") vaesdq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t { return _vaesdq_u8(data, key) } -@(require_results,enable_target_feature="aes") +@(require_results, enable_target_feature = "aes") vaesmcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t { return _vaesmcq_u8(data) } -@(require_results,enable_target_feature="aes") +@(require_results, enable_target_feature = "aes") vaesimcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t { return _vaesimcq_u8(data) } -@(private,default_calling_convention="none") +@(private, default_calling_convention = "none") foreign _ { @(link_name = "llvm.aarch64.crypto.aese" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aese") _vaeseq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t --- From 3a59e8c84950eabf6c50a684a0e02dbde250ec22 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Sat, 14 Mar 2026 05:09:35 +0900 Subject: [PATCH 16/21] core/simd/arm: Add the SHA intrinsics The SHA3 ones aren't in the developer.arm.com documentation.
--- core/simd/arm/sha.odin | 108 +++++++++++++++++++++++++++++++++++++++ core/simd/arm/types.odin | 4 ++ 2 files changed, 112 insertions(+) create mode 100644 core/simd/arm/sha.odin diff --git a/core/simd/arm/sha.odin b/core/simd/arm/sha.odin new file mode 100644 index 000000000..ca87c9795 --- /dev/null +++ b/core/simd/arm/sha.odin @@ -0,0 +1,108 @@ +#+build arm64,arm32 +package simd_arm + +@(require_results, enable_target_feature = "sha2") +vsha1cq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1cq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1pq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1pq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1mq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1mq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1h_u32 :: #force_inline proc "c" (e: uint32_t) -> uint32_t { + return _vsha1h_u32(e) +} + +@(require_results, enable_target_feature = "sha2") +vsha1su0q_u32 :: #force_inline proc "c" (w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t { + return _vsha1su0q_u32(w0_3, w4_7, w8_11) +} + +@(require_results, enable_target_feature = "sha2") +vsha1su1q_u32 :: #force_inline proc "c" (tw0_3, w12_15: uint32x4_t) -> uint32x4_t { + return _vsha1su1q_u32(tw0_3, w12_15) +} + +@(require_results, enable_target_feature = "sha2") +vsha256hq_u32 :: #force_inline proc "c" (hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t { + return _vsha256hq_u32(hash_abcd, hash_efgh, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha256h2q_u32 :: #force_inline proc "c" (hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t { + return _vsha256h2q_u32(hash_efgh, hash_abcd, wk) +} + +@(require_results, enable_target_feature = "sha2") 
+vsha256su0q_u32 :: #force_inline proc "c" (w0_3, w4_7: uint32x4_t) -> uint32x4_t { + return _vsha256su0q_u32(w0_3, w4_7) +} + +@(require_results, enable_target_feature = "sha2") +vsha256su1q_u32 :: #force_inline proc "c" (tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t { + return _vsha256su1q_u32(tw0_3, w8_11, w12_15) +} + +// Note: The SHA512 instructions are part of the `sha3` feature set. + +@(require_results, enable_target_feature = "sha3") +vsha512hq_u64 :: #force_inline proc "c" (hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t { + return _vsha512hq_u64(hash_ed, hash_gf, kwh_kwh2) +} + +@(require_results, enable_target_feature = "sha3") +vsha512h2q_u64 :: #force_inline proc "c" (sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t { + return _vsha512h2q_u64(sum_ab, hash_c_, hash_ab) +} + +@(require_results, enable_target_feature = "sha3") +vsha512su0q_u64 :: #force_inline proc "c" (w0_1, w2_: uint64x2_t) -> uint64x2_t { + return _vsha512su0q_u64(w0_1, w2_) +} + +@(require_results, enable_target_feature = "sha3") +vsha512su1q_u64 :: #force_inline proc "c" (s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t { + return _vsha512su1q_u64(s01_s02, w14_15, w9_10) +} + +@(private, default_calling_convention = "none") +foreign _ { + @(link_name = "llvm.aarch64.crypto.sha1c" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1c") + _vsha1cq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1p" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1p") + _vsha1pq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1m" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1m") + _vsha1mq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1h") + _vsha1h_u32 :: proc(e: uint32_t) -> uint32_t --- + @(link_name = 
"llvm.aarch64.crypto.sha1su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su0") + _vsha1su0q_u32 :: proc(w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su1") + _vsha1su1q_u32 :: proc(tw0_3, w12_15: uint32x4_t) -> uint32x4_t --- + + @(link_name = "llvm.aarch64.crypto.sha256h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h") + _vsha256hq_u32 :: proc(hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h2") + _vsha256h2q_u32 :: proc(hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su0") + _vsha256su0q_u32 :: proc(w0_3, w4_7: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su1") + _vsha256su1q_u32 :: proc(tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t --- + + @(link_name = "llvm.aarch64.crypto.sha512h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h") + _vsha512hq_u64 :: proc(hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h2") + _vsha512h2q_u64 :: proc(sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su0") + _vsha512su0q_u64 :: proc(w0_1, w2_: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su1") + _vsha512su1q_u64 :: proc(s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t --- +} diff --git a/core/simd/arm/types.odin b/core/simd/arm/types.odin index 7c86483a7..05e3540b6 100644 --- a/core/simd/arm/types.odin +++ b/core/simd/arm/types.odin @@ -1,5 +1,9 @@ #+build arm64,arm32 package 
simd_arm +// Type aliases to match `arm_neon.h`. +uint32_t :: u32 + uint8x16_t :: #simd[16]u8 uint32x4_t :: #simd[4]u32 +uint64x2_t :: #simd[2]u64 From 84b38810f11802511101345daeefdd943a43d622 Mon Sep 17 00:00:00 2001 From: mlgudi Date: Sun, 15 Mar 2026 02:46:30 +0000 Subject: [PATCH 17/21] powmod: fix Montgomery branch calling Barrett implementation --- core/math/big/prime.odin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin index 1c772143b..7fa6d8e4a 100644 --- a/core/math/big/prime.odin +++ b/core/math/big/prime.odin @@ -101,7 +101,7 @@ internal_int_power_modulo :: proc(res, G, X, P: ^Int, allocator := context.alloc If the modulus is odd or dr != 0 use the montgomery method. */ if internal_int_is_odd(P) || dr != 0 { - return _private_int_exponent_mod(res, G, X, P, dr) + return _private_int_exponent_mod_fast(res, G, X, P, dr) } /* From 76da2c32334aefa69daa7f3c3a8ea6de3c0adf97 Mon Sep 17 00:00:00 2001 From: mlgudi Date: Sun, 15 Mar 2026 02:47:26 +0000 Subject: [PATCH 18/21] mul_high: fix aliasing bug when dest overlaps input --- core/math/big/private.odin | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/core/math/big/private.odin b/core/math/big/private.odin index 506f68165..1feb433b0 100644 --- a/core/math/big/private.odin +++ b/core/math/big/private.odin @@ -439,8 +439,14 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex return _private_int_mul_high_comba(dest, a, b, digits) } - internal_grow(dest, a.used + b.used + 1) or_return - dest.used = a.used + b.used + 1 + /* + Set up temporary output `Int`, which we'll swap for `dest` when done. + */ + + t := &Int{} + + internal_grow(t, a.used + b.used + 1) or_return + t.used = a.used + b.used + 1 pa := a.used pb := b.used @@ -451,20 +457,23 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex /* Calculate the double precision result. 
*/ - r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry) + r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry) /* Get the lower part. */ - dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK)) + t.digit[ix + iy] = DIGIT(r & _WORD(_MASK)) /* Carry the carry. */ carry = DIGIT(r >> _WORD(_DIGIT_BITS)) } - dest.digit[ix + pb] = carry + t.digit[ix + pb] = carry } + + internal_swap(dest, t) + internal_destroy(t) return internal_clamp(dest) } From 9194b599ec5f3b3edbc88c31ef19ebc372cebdf2 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Sun, 15 Mar 2026 00:09:25 +0900 Subject: [PATCH 19/21] core/crypto/sha2: Add ARMv8 SHA256 acceleration --- core/crypto/sha2/sha256_impl_hw_arm.odin | 224 ++++++++++++++++++ ...w_intel.odin => sha256_impl_hw_intel.odin} | 0 core/crypto/sha2/sha2_impl_hw_gen.odin | 2 + 3 files changed, 226 insertions(+) create mode 100644 core/crypto/sha2/sha256_impl_hw_arm.odin rename core/crypto/sha2/{sha2_impl_hw_intel.odin => sha256_impl_hw_intel.odin} (100%) diff --git a/core/crypto/sha2/sha256_impl_hw_arm.odin b/core/crypto/sha2/sha256_impl_hw_arm.odin new file mode 100644 index 000000000..618cc6fff --- /dev/null +++ b/core/crypto/sha2/sha256_impl_hw_arm.odin @@ -0,0 +1,224 @@ +#+build arm64,arm32 +package sha2 + +// Based on the public domain code by Jeffrey Walton, though +// realistically, there only is one sensible way to write this. +// +// See: https://github.com/noloader/SHA-Intrinsics + +import "base:intrinsics" +import "core:simd" +import "core:simd/arm" +import "core:sys/info" + +// is_hardware_accelerated_256 returns true if and only if (⟺) hardware +// accelerated SHA-224/SHA-256 is supported. 
+is_hardware_accelerated_256 :: proc "contextless" () -> bool { + req_features :: info.CPU_Features{ + .asimd, + .sha256, + } + return info.cpu_features() >= req_features +} + +@(private = "file") +K_0 :: simd.u32x4{0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5} +@(private = "file") +K_1 :: simd.u32x4{0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5} +@(private = "file") +K_2 :: simd.u32x4{0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3} +@(private = "file") +K_3 :: simd.u32x4{0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174} +@(private = "file") +K_4 :: simd.u32x4{0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC} +@(private = "file") +K_5 :: simd.u32x4{0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA} +@(private = "file") +K_6 :: simd.u32x4{0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7} +@(private = "file") +K_7 :: simd.u32x4{0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967} +@(private = "file") +K_8 :: simd.u32x4{0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13} +@(private = "file") +K_9 :: simd.u32x4{0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85} +@(private = "file") +K_10 :: simd.u32x4{0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3} +@(private = "file") +K_11 :: simd.u32x4{0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070} +@(private = "file") +K_12 :: simd.u32x4{0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5} +@(private = "file") +K_13 :: simd.u32x4{0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3} +@(private = "file") +K_14 :: simd.u32x4{0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208} +@(private = "file") +K_15 :: simd.u32x4{0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2} + +@(private, enable_target_feature = "neon,sha2") +sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check { + state_0 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[0])) + state_1 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[4])) + + data := data + for len(data) >= BLOCK_SIZE_256 { + // Save state + abef_save, cdgh_save := state_0, state_1 + + // Load message + 
msg_0 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data))) + msg_1 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[16:]))) + msg_2 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[32:]))) + msg_3 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[48:]))) + + // Reverse for little endian + when ODIN_ENDIAN == .Little { + msg_0 = byteswap_u32x4(msg_0) + msg_1 = byteswap_u32x4(msg_1) + msg_2 = byteswap_u32x4(msg_2) + msg_3 = byteswap_u32x4(msg_3) + } + + tmp_0 := simd.add(msg_0, K_0) + + // Rounds 0-3 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 := state_0 + tmp_1 := simd.add(msg_1, K_1) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 4-7 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_2) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 8-11 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_3) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 12-15 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_4) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 16-19 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_5) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 20-23 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_6) + 
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 24-27 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_7) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 28-31 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_8) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 32-35 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_9) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 36-39 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_10) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 40-43 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_11) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 44-47 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_12) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 48-51 + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_13) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + + // Rounds 52-55 + 
tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_14) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + + // Rounds 56-59 + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_15) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + + // Rounds 60-63 + tmp_2 = state_0 + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + + // Combine state + state_0 = simd.add(state_0, abef_save) + state_1 = simd.add(state_1, cdgh_save) + + data = data[BLOCK_SIZE_256:] + } + + intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[0]), state_0) + intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[4]), state_1) +} + +when ODIN_ENDIAN == .Little { + @(private = "file", enable_target_feature = "neon") + byteswap_u32x4 :: #force_inline proc "contextless" (a: simd.u32x4) -> simd.u32x4 { + return transmute(simd.u32x4)( + simd.shuffle( + transmute(simd.u8x16)(a), + transmute(simd.u8x16)(a), + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + ) + ) + } +} \ No newline at end of file diff --git a/core/crypto/sha2/sha2_impl_hw_intel.odin b/core/crypto/sha2/sha256_impl_hw_intel.odin similarity index 100% rename from core/crypto/sha2/sha2_impl_hw_intel.odin rename to core/crypto/sha2/sha256_impl_hw_intel.odin diff --git a/core/crypto/sha2/sha2_impl_hw_gen.odin b/core/crypto/sha2/sha2_impl_hw_gen.odin index 837d0656d..d735e3c61 100644 --- a/core/crypto/sha2/sha2_impl_hw_gen.odin +++ b/core/crypto/sha2/sha2_impl_hw_gen.odin @@ -1,4 +1,6 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package sha2 @(private = "file") From f82fbc94051955c2024d6a9e8299034a86b29b16 Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Sun, 15 Mar 2026 11:06:22 +0100 Subject: [PATCH 20/21] -vet-tabs --- core/math/big/private.odin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/math/big/private.odin b/core/math/big/private.odin index 
1feb433b0..1ca706c6d 100644 --- a/core/math/big/private.odin +++ b/core/math/big/private.odin @@ -472,7 +472,7 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex t.digit[ix + pb] = carry } - internal_swap(dest, t) + internal_swap(dest, t) internal_destroy(t) return internal_clamp(dest) } From ca73cd395f0eef92a618a12db67e1971dc922a6e Mon Sep 17 00:00:00 2001 From: gingerBill Date: Sun, 15 Mar 2026 10:33:52 +0000 Subject: [PATCH 21/21] Fix #6412 --- src/check_expr.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/check_expr.cpp b/src/check_expr.cpp index 80df35edc..aa3e26580 100644 --- a/src/check_expr.cpp +++ b/src/check_expr.cpp @@ -2955,14 +2955,21 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper if (check_is_assignable_to(c, x, y->type) || check_is_assignable_to(c, y, x->type)) { + if (x->type->failure || y->type->failure) { + // skip any failures + x->mode = Addressing_Value; + x->type = t_untyped_bool; + return; + } + Type *err_type = x->type; bool defined = false; switch (op) { case Token_CmpEq: case Token_NotEq: - defined = (is_type_comparable(x->type) && is_type_comparable(y->type)) || - (is_operand_nil(*x) && type_has_nil(y->type)) || - (is_operand_nil(*y) && type_has_nil(x->type)); + defined = ((is_operand_nil(*x) && type_has_nil(y->type)) || + (is_operand_nil(*y) && type_has_nil(x->type)) || + is_type_comparable(x->type) && is_type_comparable(y->type)); break; case Token_Lt: case Token_Gt: