diff --git a/base/runtime/core.odin b/base/runtime/core.odin index 52993286a..dbd8ba468 100644 --- a/base/runtime/core.odin +++ b/base/runtime/core.odin @@ -141,7 +141,7 @@ Type_Info_Struct :: struct { flags: Type_Info_Struct_Flags, - // These are only set iff this structure is an SOA structure + // These are only set if and only if (⟺) this structure is an SOA structure soa_kind: Type_Info_Struct_Soa_Kind, soa_len: i32, soa_base_type: ^Type_Info, diff --git a/base/runtime/core_builtin.odin b/base/runtime/core_builtin.odin index ad3420770..80c8f8752 100644 --- a/base/runtime/core_builtin.odin +++ b/base/runtime/core_builtin.odin @@ -1525,7 +1525,7 @@ card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int { -// Evaluates the condition and panics the program iff the condition is false. +// Evaluates the condition and panics the program if and only if (⟺) the condition is false. // This uses the `context.assertion_failure_procedure` to assert. // // This routine will be ignored when `ODIN_DISABLE_ASSERT` is true. @@ -1549,7 +1549,7 @@ assert :: proc(condition: bool, message := #caller_expression(condition), loc := } } -// Evaluates the condition and panics the program iff the condition is false. +// Evaluates the condition and panics the program if and only if (⟺) the condition is false. // This uses the `context.assertion_failure_procedure` to assert. // This routine ignores `ODIN_DISABLE_ASSERT`, and will always execute. @builtin @@ -1589,7 +1589,7 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! { p("not yet implemented", message, loc) } -// Evaluates the condition and panics the program iff the condition is false. +// Evaluates the condition and panics the program if and only if (⟺) the condition is false. // This uses the `default_assertion_contextless_failure_proc` to assert. // // This routine will be ignored when `ODIN_DISABLE_ASSERT` is true. 
@@ -1609,7 +1609,7 @@ assert_contextless :: proc "contextless" (condition: bool, message := #caller_ex } } -// Evaluates the condition and panics the program iff the condition is false. +// Evaluates the condition and panics the program if and only if (⟺) the condition is false. // This uses the `default_assertion_contextless_failure_proc` to assert. @builtin ensure_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) { diff --git a/base/runtime/random_generator_chacha8_simd256.odin b/base/runtime/random_generator_chacha8_simd256.odin index c0985f456..f2ccb6934 100644 --- a/base/runtime/random_generator_chacha8_simd256.odin +++ b/base/runtime/random_generator_chacha8_simd256.odin @@ -136,7 +136,7 @@ chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) { // // LLVM appears not to consider "this instruction is totally // awful on the given microarchitcture", which leads to - // `VPCOMPRESSED` being generated iff AVX512 support is + // `VPCOMPRESSED` being generated if and only if (⟺) AVX512 support is // enabled for `intrinsics.simd_masked_compress_store`. // On Zen 4, this leads to a 50% performance regression vs // the 128-bit SIMD code. 
diff --git a/core/bufio/reader.odin b/core/bufio/reader.odin index 8e30542f1..e361612d2 100644 --- a/core/bufio/reader.odin +++ b/core/bufio/reader.odin @@ -45,7 +45,7 @@ reader_init_with_buf :: proc(b: ^Reader, rd: io.Reader, buf: []byte) { b.buf = buf } -// reader_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set +// reader_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set reader_destroy :: proc(b: ^Reader) { delete(b.buf, b.buf_allocator) b^ = {} diff --git a/core/bufio/writer.odin b/core/bufio/writer.odin index 666c05b67..9d5ef3481 100644 --- a/core/bufio/writer.odin +++ b/core/bufio/writer.odin @@ -35,7 +35,7 @@ writer_init_with_buf :: proc(b: ^Writer, wr: io.Writer, buf: []byte) { b.buf = buf } -// writer_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set +// writer_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set writer_destroy :: proc(b: ^Writer) { delete(b.buf, b.buf_allocator) b^ = {} diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index 33978b3df..55eca5386 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -1460,7 +1460,7 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc return subslices[:] } -// alias returns true iff a and b have a non-zero length, and any part of +// alias returns true if and only if (⟺) a and b have a non-zero length, and any part of // a overlaps with b. 
alias :: proc "contextless" (a, b: []byte) -> bool { a_len, b_len := len(a), len(b) @@ -1474,7 +1474,7 @@ alias :: proc "contextless" (a, b: []byte) -> bool { return a_start <= b_end && b_start <= a_end } -// alias_inexactly returns true iff a and b have a non-zero length, +// alias_inexactly returns true if and only if (⟺) a and b have a non-zero length, // the base pointer of a and b are NOT equal, and any part of a overlaps // with b (ie: `alias(a, b)` with an exception that returns false for // `a == b`, `b = a[:len(a)-69]` and similar conditions). diff --git a/core/container/avl/avl.odin b/core/container/avl/avl.odin index 6c7216c29..1208cd213 100644 --- a/core/container/avl/avl.odin +++ b/core/container/avl/avl.odin @@ -100,20 +100,20 @@ len :: proc "contextless" (t: ^$T/Tree($Value)) -> int { return t._size } -// first returns the first node in the tree (in-order) or nil iff +// first returns the first node in the tree (in-order) or nil if and only if (⟺) // the tree is empty. first :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) { return tree_first_or_last_in_order(t, Direction.Backward) } -// last returns the last element in the tree (in-order) or nil iff +// last returns the last element in the tree (in-order) or nil if and only if (⟺) // the tree is empty. last :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) { return tree_first_or_last_in_order(t, Direction.Forward) } // find finds the value in the tree, and returns the corresponding -// node or nil iff the value is not present. +// node or nil if and only if (⟺) the value is not present. find :: proc(t: ^$T/Tree($Value), value: Value) -> ^Node(Value) { cur := t._root descend_loop: for cur != nil { @@ -168,7 +168,7 @@ find_or_insert :: proc( return } -// remove removes a node or value from the tree, and returns true iff the +// remove removes a node or value from the tree, and returns true if and only if (⟺) the // removal was successful. 
While the node's value will be left intact, // the node itself will be freed via the tree's node allocator. remove :: proc { @@ -176,7 +176,7 @@ remove :: proc { remove_node, } -// remove_value removes a value from the tree, and returns true iff the +// remove_value removes a value from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's value will be left intact, // the node itself will be freed via the tree's node allocator. remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = true) -> bool { @@ -187,7 +187,7 @@ remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = t return remove_node(t, n, call_on_remove) } -// remove_node removes a node from the tree, and returns true iff the +// remove_node removes a node from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's value will be left intact, // the node itself will be freed via the tree's node allocator. remove_node :: proc(t: ^$T/Tree($Value), node: ^Node(Value), call_on_remove: bool = true) -> bool { @@ -281,14 +281,14 @@ iterator_from_pos :: proc "contextless" ( } // iterator_get returns the node currently pointed to by the iterator, -// or nil iff the node has been removed, the tree is empty, or the end +// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end // of the tree has been reached. iterator_get :: proc "contextless" (it: ^$I/Iterator($Value)) -> ^Node(Value) { return it._cur } // iterator_remove removes the node currently pointed to by the iterator, -// and returns true iff the removal was successful. Semantics are the +// and returns true if and only if (⟺) the removal was successful. Semantics are the // same as the Tree remove. 
iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -> bool { if it._cur == nil { @@ -304,7 +304,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) - } // iterator_next advances the iterator and returns the (node, true) or -// or (nil, false) iff the end of the tree has been reached. +// or (nil, false) if and only if (⟺) the end of the tree has been reached. // // Note: The first call to iterator_next will return the first node instead // of advancing the iterator. diff --git a/core/container/rbtree/rbtree.odin b/core/container/rbtree/rbtree.odin index e892188d7..c138838df 100644 --- a/core/container/rbtree/rbtree.odin +++ b/core/container/rbtree/rbtree.odin @@ -95,19 +95,19 @@ len :: proc "contextless" (t: $T/Tree($Key, $Value)) -> (node_count: int) { return t._size } -// first returns the first node in the tree (in-order) or nil iff +// first returns the first node in the tree (in-order) or nil if and only if (⟺) // the tree is empty. first :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) { return tree_first_or_last_in_order(t, Direction.Backward) } -// last returns the last element in the tree (in-order) or nil iff +// last returns the last element in the tree (in-order) or nil if and only if (⟺) // the tree is empty. last :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) { return tree_first_or_last_in_order(t, Direction.Forward) } -// find finds the key in the tree, and returns the corresponding node, or nil iff the value is not present. +// find finds the key in the tree, and returns the corresponding node, or nil if and only if (⟺) the value is not present. 
find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) { node = t._root for node != nil { @@ -120,7 +120,7 @@ find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) { return node } -// find_value finds the key in the tree, and returns the corresponding value, or nil iff the value is not present. +// find_value finds the key in the tree, and returns the corresponding value, or nil if and only if (⟺) the value is not present. find_value :: proc(t: $T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok { if n := find(t, key); n != nil { return n.value, true @@ -154,7 +154,7 @@ find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n: return n, true, nil } -// remove removes a node or value from the tree, and returns true iff the +// remove removes a node or value from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's value will be left intact, // the node itself will be freed via the tree's node allocator. remove :: proc { @@ -162,7 +162,7 @@ remove :: proc { remove_node, } -// remove_value removes a value from the tree, and returns true iff the +// remove_value removes a value from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's key + value will be left intact, // the node itself will be freed via the tree's node allocator. remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) -> bool { @@ -173,7 +173,7 @@ remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) return remove_node(t, n, call_on_remove) } -// remove_node removes a node from the tree, and returns true iff the +// remove_node removes a node from the tree, and returns true if and only if (⟺) the // removal was successful. While the node's key + value will be left intact, // the node itself will be freed via the tree's node allocator. 
remove_node :: proc(t: ^$T/Tree($Key, $Value), node: ^$N/Node(Key, Value), call_on_remove := true) -> (found: bool) { @@ -235,14 +235,14 @@ iterator_from_pos :: proc "contextless" (t: ^$T/Tree($Key, $Value), pos: ^Node(K } // iterator_get returns the node currently pointed to by the iterator, -// or nil iff the node has been removed, the tree is empty, or the end +// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end // of the tree has been reached. iterator_get :: proc "contextless" (it: ^$I/Iterator($Key, $Value)) -> ^Node(Key, Value) { return it._cur } // iterator_remove removes the node currently pointed to by the iterator, -// and returns true iff the removal was successful. Semantics are the +// and returns true if and only if (⟺) the removal was successful. Semantics are the // same as the Tree remove. iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = true) -> bool { if it._cur == nil { @@ -258,7 +258,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = t } // iterator_next advances the iterator and returns the (node, true) or -// or (nil, false) iff the end of the tree has been reached. +// or (nil, false) if and only if (⟺) the end of the tree has been reached. // // Note: The first call to iterator_next will return the first node instead // of advancing the iterator. diff --git a/core/crypto/_aes/hw/api.odin b/core/crypto/_aes/hw/api.odin new file mode 100644 index 000000000..09f674657 --- /dev/null +++ b/core/crypto/_aes/hw/api.odin @@ -0,0 +1,69 @@ +package aes_hw + +@(require) import "core:sys/info" + +// is_supported returns true if and only if (⟺) hardware accelerated AES +// is supported. +is_supported :: proc "contextless" () -> bool { + when ODIN_ARCH == .amd64 { + // Note: Everything with AES-NI has support for + // the required SSE extxtensions. 
+ req_features :: info.CPU_Features{ + .sse2, + .ssse3, + .sse41, + .aes, + } + return info.cpu_features() >= req_features + } else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + req_features :: info.CPU_Features{ + .asimd, + .aes, + } + return info.cpu_features() >= req_features + } else { + return false + } +} + +// is_ghash_supported returns true if and only if (⟺) hardware accelerated +// GHASH is supported. +is_ghash_supported :: proc "contextless" () -> bool { + // Just having hardware GHASH is silly. + if !is_supported() { + return false + } + + when ODIN_ARCH == .amd64 { + return info.cpu_features() >= info.CPU_Features{ + .pclmulqdq, + } + } else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32{ + // Once we can actually use this, we can re-enable this. + // + // return info.cpu_features() >= info.CPU_Features{ + // .pmull, + // } + return false + } else { + return false + } +} + +// Context is a keyed AES (ECB) instance. +Context :: struct { + // Note: The ideal thing to do is for the expanded round keys to be + // arrays of `u8x16`, however that implies alignment (or using AVX). + // + // All the people using e-waste processors that don't support an + // instruction set that has been around for over 10 years are why + // we can't have nice things. + _sk_exp_enc: [15][16]byte, + _sk_exp_dec: [15][16]byte, + _num_rounds: int, +} + +// init initializes a context for AES with the provided key. +init :: proc(ctx: ^Context, key: []byte) { + keysched(ctx, key) +} diff --git a/core/crypto/_aes/hw_intel/ghash.odin b/core/crypto/_aes/hw/ghash_intel.odin similarity index 99% rename from core/crypto/_aes/hw_intel/ghash.odin rename to core/crypto/_aes/hw/ghash_intel.odin index 5f51b614b..d80816d5d 100644 --- a/core/crypto/_aes/hw_intel/ghash.odin +++ b/core/crypto/_aes/hw/ghash_intel.odin @@ -21,7 +21,7 @@ // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#+build amd64 -package aes_hw_intel +package aes_hw import "base:intrinsics" import "core:crypto/_aes" diff --git a/core/crypto/_aes/hw/intrinsics_arm.odin b/core/crypto/_aes/hw/intrinsics_arm.odin new file mode 100644 index 000000000..ccd8efa8f --- /dev/null +++ b/core/crypto/_aes/hw/intrinsics_arm.odin @@ -0,0 +1,115 @@ +#+build arm64,arm32 +package aes_hw + +import "core:simd" +import "core:simd/arm" + +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/ + +TARGET_FEATURES :: "neon,aes" +HAS_GHASH :: false // Temporary + +@(require_results, enable_target_feature = "aes") +aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{})), key) +} + +@(require_results, enable_target_feature = "aes") +aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key) +} + +@(require_results, enable_target_feature = "aes") +aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{})), key) +} + +@(require_results, enable_target_feature = "aes") +aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key) +} + +aesimc :: arm.vaesimcq_u8 + +@(require_results, enable_target_feature = "aes") +aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 { + a := arm.vaeseq_u8(data, simd.u8x16{}) // AESE does ShiftRows and SubBytes on A + + // Undo ShiftRows step from AESE and extract X1 and X3 + dest := simd.swizzle( + a, + 0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1) + 0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1)) + 0x0c, 0x09, 0x06, 0x03, // SubBytes(X3) + 0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3)) + ) + + rcons := simd.u8x16{ + 0, 0, 0, 0, + IMM8, 0, 0, 0, + 0, 0, 0, 0, + IMM8, 0, 0, 0, + } + + return 
simd.bit_xor(dest, rcons) +} + +// The keyschedule implementation is easier to read with some extra +// Intel intrinsics that are emulated by built-in LLVM ops anyway. + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + shift :: IMM8 & 0xff + + // This needs to emit behavior identical to PSLLDQ which is as follows: + // + // TEMP := COUNT + // IF (TEMP > 15) THEN TEMP := 16; FI + // DEST := DEST << (TEMP * 8) + // DEST[MAXVL-1:128] (Unmodified) + + return simd.shuffle( + simd.u8x16{}, + a, + 0 when shift > 15 else (16 - shift + 0), + 1 when shift > 15 else (16 - shift + 1), + 2 when shift > 15 else (16 - shift + 2), + 3 when shift > 15 else (16 - shift + 3), + 4 when shift > 15 else (16 - shift + 4), + 5 when shift > 15 else (16 - shift + 5), + 6 when shift > 15 else (16 - shift + 6), + 7 when shift > 15 else (16 - shift + 7), + 8 when shift > 15 else (16 - shift + 8), + 9 when shift > 15 else (16 - shift + 9), + 10 when shift > 15 else (16 - shift + 10), + 11 when shift > 15 else (16 - shift + 11), + 12 when shift > 15 else (16 - shift + 12), + 13 when shift > 15 else (16 - shift + 13), + 14 when shift > 15 else (16 - shift + 14), + 15 when shift > 15 else (16 - shift + 15), + ) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + v := transmute(simd.i32x4)a + return transmute(simd.u8x16)simd.shuffle( + v, + v, + IMM8 & 0b11, + (IMM8 >> 2) & 0b11, + (IMM8 >> 4) & 0b11, + (IMM8 >> 6) & 0b11, + ) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 { + return transmute(simd.u8x16)simd.shuffle( + transmute(simd.u32x4)(a), + transmute(simd.u32x4)(b), + u32(MASK) & 0b11, + (u32(MASK)>>2) & 0b11, + ((u32(MASK)>>4) & 0b11)+4, + ((u32(MASK)>>6) & 
0b11)+4) +} diff --git a/core/crypto/_aes/hw/intrinsics_intel.odin b/core/crypto/_aes/hw/intrinsics_intel.odin new file mode 100644 index 000000000..25399dfae --- /dev/null +++ b/core/crypto/_aes/hw/intrinsics_intel.odin @@ -0,0 +1,55 @@ +#+build amd64 +package aes_hw + +import "core:simd" +import "core:simd/x86" + +// Intel/RISC-V semantics. + +TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes" +HAS_GHASH :: true + +@(require_results, enable_target_feature = "aes") +aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesimc :: #force_inline proc "c" (data: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesimc_si128(transmute(x86.__m128i)(data))) +} + +@(require_results, enable_target_feature = "aes") +aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> 
simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK)) +} diff --git a/core/crypto/_aes/hw/keysched_hw.odin b/core/crypto/_aes/hw/keysched_hw.odin new file mode 100644 index 000000000..7d85c43b7 --- /dev/null +++ b/core/crypto/_aes/hw/keysched_hw.odin @@ -0,0 +1,181 @@ +// Copyright (c) 2017 Thomas Pornin +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR +// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#+build amd64,arm64,arm32 +package aes_hw + +import "base:intrinsics" +import "core:crypto" +import "core:crypto/_aes" +import "core:simd" + +// Inspiration taken from BearSSL's AES-NI implementation. +// +// Note: This assumes that the SROA optimization pass is enabled to be +// anything resembling performant otherwise, LLVM will not elide a massive +// number of redundant loads/stores it generates for every intrinsic call. + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 { + k1, k2 := k1, k2 + + k2 = _mm_shuffle_epi32(k2, 0xff) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + return simd.bit_xor(k1, k2) +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) { + k1, k2, k3 := k1_^, k2_^, k3 + + k3 = _mm_shuffle_epi32(k3, 0x55) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, k3) + + tmp := k2 + k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04)) + k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff)) + + k1_, k2_ := k1_, k2_ + k1_^, k2_^ = k1, k2 + + r1 := _mm_shuffle_ps(tmp, k1, 0x44) + r2 := _mm_shuffle_ps(k1, k2, 0x4e) + + return r1, r2 +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 { + k1, k2, k3 := k1_^, k2_^, k3 + + k3 = _mm_shuffle_epi32(k3, 0x55) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, k3) + + k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04)) + 
k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff)) + + k1_, k2_ := k1_, k2_ + k1_^, k2_^ = k1, k2 + + return k1 +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 { + k1, k2 := k1, k2 + + k2 = _mm_shuffle_epi32(k2, 0xaa) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + return simd.bit_xor(k1, k2) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) { + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds]) + for i in 1 ..< num_rounds { + tmp := aesimc(sks[i]) + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), tmp) + } + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0]) +} + +@(private, enable_target_feature = TARGET_FEATURES) +keysched :: proc(ctx: ^Context, key: []byte) { + sks: [15]simd.u8x16 = --- + + // Compute the encryption keys. 
+ num_rounds, key_len := 0, len(key) + switch key_len { + case _aes.KEY_SIZE_128: + sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01)) + sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02)) + sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04)) + sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08)) + sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10)) + sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20)) + sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40)) + sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80)) + sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b)) + sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36)) + num_rounds = _aes.ROUNDS_128 + case _aes.KEY_SIZE_192: + k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + + k1_tmp: [16]byte + copy(k1_tmp[:], key[16:24]) + k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp)) + crypto.zero_explicit(&k1_tmp, size_of(k1_tmp)) + + sks[0] = k0 + sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01)) + sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02)) + sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04)) + sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08)) + sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10)) + sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20)) + sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40)) + sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80)) + num_rounds = _aes.ROUNDS_192 + + case _aes.KEY_SIZE_256: + sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:]))) + sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01)) + sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01)) + sks[4] = 
expand_step128(sks[2], aeskeygenassist(sks[3], 0x02)) + sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02)) + sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04)) + sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04)) + sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08)) + sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08)) + sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10)) + sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10)) + sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20)) + sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20)) + sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40)) + num_rounds = _aes.ROUNDS_256 + case: + panic("crypto/aes: invalid AES key size") + } + for i in 0 ..= num_rounds { + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i]) + } + + // Compute the decryption keys. GCM and CTR do not need this, however + // ECB, CBC, OCB3, etc do. + derive_dec_keys(ctx, &sks, num_rounds) + + ctx._num_rounds = num_rounds + + crypto.zero_explicit(&sks, size_of(sks)) +} diff --git a/core/crypto/_aes/hw/unsupported.odin b/core/crypto/_aes/hw/unsupported.odin new file mode 100644 index 000000000..3fb31b6b8 --- /dev/null +++ b/core/crypto/_aes/hw/unsupported.odin @@ -0,0 +1,11 @@ +#+build !amd64 +#+build !arm64 +#+build !arm32 +package aes_hw + +HAS_GHASH :: false + +@(private) +keysched :: proc(ctx: ^Context, key: []byte) { + panic("crypto/aes: hardware implementation unsupported") +} diff --git a/core/crypto/_aes/hw_intel/api.odin b/core/crypto/_aes/hw_intel/api.odin deleted file mode 100644 index ce769fc10..000000000 --- a/core/crypto/_aes/hw_intel/api.odin +++ /dev/null @@ -1,38 +0,0 @@ -#+build amd64 -package aes_hw_intel - -import "core:sys/info" - -// is_supported returns true iff hardware accelerated AES -// is supported. 
-is_supported :: proc "contextless" () -> bool { - // Note: Everything with AES-NI and PCLMULQDQ has support for - // the required SSE extxtensions. - req_features :: info.CPU_Features{ - .sse2, - .ssse3, - .sse41, - .aes, - .pclmulqdq, - } - return info.cpu_features() >= req_features -} - -// Context is a keyed AES (ECB) instance. -Context :: struct { - // Note: The ideal thing to do is for the expanded round keys to be - // arrays of `__m128i`, however that implies alignment (or using AVX). - // - // All the people using e-waste processors that don't support an - // insturction set that has been around for over 10 years are why - // we can't have nice things. - _sk_exp_enc: [15][16]byte, - _sk_exp_dec: [15][16]byte, - _num_rounds: int, -} - -// init initializes a context for AES with the provided key. -init :: proc(ctx: ^Context, key: []byte) { - keysched(ctx, key) -} - diff --git a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin deleted file mode 100644 index 96108442d..000000000 --- a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2017 Thomas Pornin -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR -// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY -// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#+build amd64 -package aes_hw_intel - -import "base:intrinsics" -import "core:crypto/_aes" -import "core:simd/x86" - -// Intel AES-NI based implementation. Inspiration taken from BearSSL. -// -// Note: This assumes that the SROA optimization pass is enabled to be -// anything resembling performat otherwise, LLVM will not elide a massive -// number of redundant loads/stores it generates for every intrinsic call. - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i { - k1, k2 := k1, k2 - - k2 = x86._mm_shuffle_epi32(k2, 0xff) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - return x86._mm_xor_si128(k1, k2) -} - -@(private = "file", require_results, enable_target_feature = "sse,sse2") -expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) { - k1, k2, k3 := k1_^, k2_^, k3 - - k3 = x86._mm_shuffle_epi32(k3, 0x55) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, k3) - - tmp := k2 - k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04)) - k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff)) - - k1_, k2_ := k1_, k2_ - k1_^, 
k2_^ = k1, k2 - - r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44)) - r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e)) - - return r1, r2 -} - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i { - k1, k2, k3 := k1_^, k2_^, k3 - - k3 = x86._mm_shuffle_epi32(k3, 0x55) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, k3) - - k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04)) - k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff)) - - k1_, k2_ := k1_, k2_ - k1_^, k2_^ = k1, k2 - - return k1 -} - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i { - k1, k2 := k1, k2 - - k2 = x86._mm_shuffle_epi32(k2, 0xaa) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - return x86._mm_xor_si128(k1, k2) -} - -@(private = "file", enable_target_feature = "aes") -derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) { - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds]) - for i in 1 ..< num_rounds { - tmp := x86._mm_aesimc_si128(sks[i]) - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp) - } - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0]) -} - -@(private, enable_target_feature = "sse,sse2,aes") -keysched :: proc(ctx: ^Context, key: []byte) { - sks: [15]x86.__m128i = --- - - // Compute the encryption keys. 
- num_rounds, key_len := 0, len(key) - switch key_len { - case _aes.KEY_SIZE_128: - sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01)) - sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02)) - sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04)) - sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08)) - sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10)) - sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20)) - sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40)) - sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80)) - sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b)) - sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36)) - num_rounds = _aes.ROUNDS_128 - case _aes.KEY_SIZE_192: - k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - k1 := x86.__m128i{ - intrinsics.unaligned_load((^i64)(raw_data(key[16:]))), - 0, - } - sks[0] = k0 - sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01)) - sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02)) - sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04)) - sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08)) - sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10)) - sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20)) - sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40)) - sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80)) - num_rounds = _aes.ROUNDS_192 - case _aes.KEY_SIZE_256: - sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - sks[1] = 
intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:]))) - sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01)) - sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01)) - sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02)) - sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02)) - sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04)) - sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04)) - sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08)) - sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08)) - sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10)) - sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10)) - sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20)) - sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20)) - sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40)) - num_rounds = _aes.ROUNDS_256 - case: - panic("crypto/aes: invalid AES key size") - } - for i in 0 ..= num_rounds { - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i]) - } - - // Compute the decryption keys. GCM and CTR do not need this, however - // ECB, CBC, OCB3, etc do. - derive_dec_keys(ctx, &sks, num_rounds) - - ctx._num_rounds = num_rounds - - zero_explicit(&sks, size_of(sks)) -} - -/* -Set each byte of a memory range to zero. - -This procedure copies the value `0` into the `len` bytes of a memory range, -starting at address `data`. - -This procedure returns the pointer to `data`. 
- -Unlike the `zero()` procedure, which can be optimized away or reordered by the -compiler under certain circumstances, `zero_explicit()` procedure can not be -optimized away or reordered with other memory access operations, and the -compiler assumes volatile semantics of the memory. -*/ -zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr { - // This routine tries to avoid the compiler optimizing away the call, - // so that it is always executed. It is intended to provide - // equivalent semantics to those provided by the C11 Annex K 3.7.4.1 - // memset_s call. - intrinsics.mem_zero_volatile(data, len) // Use the volatile mem_zero - intrinsics.atomic_thread_fence(.Seq_Cst) // Prevent reordering - return data -} \ No newline at end of file diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin index 9da0a54ea..fd48074df 100644 --- a/core/crypto/_chacha20/simd128/chacha20_simd128.odin +++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin @@ -215,7 +215,7 @@ _store_simd128 :: #force_inline proc "contextless" ( intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3) } -// is_performant returns true iff the target and current host both support +// is_performant returns true if and only if (⟺) the target and current host both support // "enough" 128-bit SIMD to make this implementation performant. 
is_performant :: proc "contextless" () -> bool { when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 { diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256.odin b/core/crypto/_chacha20/simd256/chacha20_simd256.odin index 407fbac56..c2f709aec 100644 --- a/core/crypto/_chacha20/simd256/chacha20_simd256.odin +++ b/core/crypto/_chacha20/simd256/chacha20_simd256.odin @@ -36,7 +36,7 @@ _VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0} @(private = "file") _VEC_TWO: simd.u64x4 : {2, 0, 2, 0} -// is_performant returns true iff the target and current host both support +// is_performant returns true if and only if (⟺) the target and current host both support // "enough" SIMD to make this implementation performant. is_performant :: proc "contextless" () -> bool { req_features :: info.CPU_Features{.avx, .avx2} diff --git a/core/crypto/_fiat/field_p256r1/field.odin b/core/crypto/_fiat/field_p256r1/field.odin index f39bee4a9..f7dd978aa 100644 --- a/core/crypto/_fiat/field_p256r1/field.odin +++ b/core/crypto/_fiat/field_p256r1/field.odin @@ -69,7 +69,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> tmp: Montgomery_Domain_Field_Element = --- fe_sub(&tmp, arg1, arg2) - // This will only underflow iff arg1 == arg2, and we return the borrow, + // This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow, // which will be 1. 
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp)) diff --git a/core/crypto/_fiat/field_p384r1/field.odin b/core/crypto/_fiat/field_p384r1/field.odin index 5cb5cd05e..2bddff18c 100644 --- a/core/crypto/_fiat/field_p384r1/field.odin +++ b/core/crypto/_fiat/field_p384r1/field.odin @@ -75,7 +75,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> tmp: Montgomery_Domain_Field_Element = --- fe_sub(&tmp, arg1, arg2) - // This will only underflow iff arg1 == arg2, and we return the borrow, + // This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow, // which will be 1. is_eq := subtle.u64_is_zero(fe_non_zero(&tmp)) diff --git a/core/crypto/_subtle/subtle.odin b/core/crypto/_subtle/subtle.odin index 89328072c..454066e4a 100644 --- a/core/crypto/_subtle/subtle.odin +++ b/core/crypto/_subtle/subtle.odin @@ -5,17 +5,17 @@ package _subtle import "core:math/bits" -// byte_eq returns 1 iff a == b, 0 otherwise. +// byte_eq returns 1 if and only if (⟺) a == b, 0 otherwise. @(optimization_mode="none") byte_eq :: proc "contextless" (a, b: byte) -> int { v := a ~ b - // v == 0 iff a == b. The subtraction will underflow, setting the + // v == 0 if and only if (⟺) a == b. The subtraction will underflow, setting the // sign bit, which will get returned. return int((u32(v)-1) >> 31) } -// u64_eq returns 1 iff a == b, 0 otherwise. +// u64_eq returns 1 if and only if (⟺) a == b, 0 otherwise. @(optimization_mode="none") u64_eq :: proc "contextless" (a, b: u64) -> u64 { _, borrow := bits.sub_u64(0, a ~ b, 0) @@ -27,14 +27,14 @@ eq :: proc { u64_eq, } -// u64_is_zero returns 1 iff a == 0, 0 otherwise. +// u64_is_zero returns 1 if and only if (⟺) a == 0, 0 otherwise. @(optimization_mode="none") u64_is_zero :: proc "contextless" (a: u64) -> u64 { _, borrow := bits.sub_u64(a, 1, 0) return borrow } -// u64_is_non_zero returns 1 iff a != 0, 0 otherwise. +// u64_is_non_zero returns 1 if and only if (⟺) a != 0, 0 otherwise. 
@(optimization_mode="none") u64_is_non_zero :: proc "contextless" (a: u64) -> u64 { is_zero := u64_is_zero(a) diff --git a/core/crypto/aead/aead.odin b/core/crypto/aead/aead.odin index c8f324929..ed14a41f3 100644 --- a/core/crypto/aead/aead.odin +++ b/core/crypto/aead/aead.odin @@ -13,7 +13,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte, // open authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided algorithm, key, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and ciphertext MUST alias exactly or not at all. diff --git a/core/crypto/aead/low_level.odin b/core/crypto/aead/low_level.odin index c80574a0d..c89d85823 100644 --- a/core/crypto/aead/low_level.odin +++ b/core/crypto/aead/low_level.odin @@ -183,7 +183,7 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. diff --git a/core/crypto/aegis/aegis.odin b/core/crypto/aegis/aegis.odin index fbb19f1ae..5aee61767 100644 --- a/core/crypto/aegis/aegis.odin +++ b/core/crypto/aegis/aegis.odin @@ -144,7 +144,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // open authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. 
If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. diff --git a/core/crypto/aegis/aegis_impl_hw.odin b/core/crypto/aegis/aegis_impl_hw.odin new file mode 100644 index 000000000..4a39b341c --- /dev/null +++ b/core/crypto/aegis/aegis_impl_hw.odin @@ -0,0 +1,397 @@ +#+build amd64,arm64,arm32 +package aegis + +import "base:intrinsics" +import "core:crypto" +import aes_hw "core:crypto/_aes/hw" +import "core:encoding/endian" +import "core:simd" + +@(private) +State_HW :: struct { + s0: simd.u8x16, + s1: simd.u8x16, + s2: simd.u8x16, + s3: simd.u8x16, + s4: simd.u8x16, + s5: simd.u8x16, + s6: simd.u8x16, + s7: simd.u8x16, + rate: int, +} + +when ODIN_ARCH == .amd64 { + @(private="file") + TARGET_FEATURES :: "sse2,aes" +} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + @(private="file") + TARGET_FEATURES :: "neon,aes" +} + +// is_hardware_accelerated returns true if and only if (⟺) hardware +// accelerated AEGIS is supported. 
+is_hardware_accelerated :: proc "contextless" () -> bool { + return aes_hw.is_supported() +} + +@(private, enable_target_feature = TARGET_FEATURES) +init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) { + switch ctx._key_len { + case KEY_SIZE_128L: + key := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0])) + iv := intrinsics.unaligned_load((^simd.u8x16)(raw_data(iv))) + + st.s0 = simd.bit_xor(key, iv) + st.s1 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0])) + st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0])) + st.s3 = st.s1 + st.s4 = st.s0 + st.s5 = simd.bit_xor(key, st.s2) // key ^ C0 + st.s6 = simd.bit_xor(key, st.s1) // key ^ C1 + st.s7 = st.s5 + st.rate = _RATE_128L + + for _ in 0 ..< 10 { + update_hw_128l(st, iv, key) + } + case KEY_SIZE_256: + k0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0])) + k1 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[16])) + n0 := intrinsics.unaligned_load((^simd.u8x16)(&iv[0])) + n1 := intrinsics.unaligned_load((^simd.u8x16)(&iv[16])) + + st.s0 = simd.bit_xor(k0, n0) + st.s1 = simd.bit_xor(k1, n1) + st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0])) + st.s3 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0])) + st.s4 = simd.bit_xor(k0, st.s3) // k0 ^ C0 + st.s5 = simd.bit_xor(k1, st.s2) // k1 ^ C1 + st.rate = _RATE_256 + + u0, u1 := st.s0, st.s1 + for _ in 0 ..< 4 { + update_hw_256(st, k0) + update_hw_256(st, k1) + update_hw_256(st, u0) + update_hw_256(st, u1) + } + } +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: simd.u8x16) { + s0_ := aes_hw.aesenc(st.s7, simd.bit_xor(st.s0, m0)) + s1_ := aes_hw.aesenc(st.s0, st.s1) + s2_ := aes_hw.aesenc(st.s1, st.s2) + s3_ := aes_hw.aesenc(st.s2, st.s3) + s4_ := aes_hw.aesenc(st.s3, simd.bit_xor(st.s4, m1)) + s5_ := aes_hw.aesenc(st.s4, st.s5) + s6_ := aes_hw.aesenc(st.s5, st.s6) + s7_ := aes_hw.aesenc(st.s6, st.s7) + st.s0, st.s1, st.s2, 
st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_ +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: simd.u8x16) { + s0_ := aes_hw.aesenc(st.s5, simd.bit_xor(st.s0, m)) + s1_ := aes_hw.aesenc(st.s0, st.s1) + s2_ := aes_hw.aesenc(st.s1, st.s2) + s3_ := aes_hw.aesenc(st.s2, st.s3) + s4_ := aes_hw.aesenc(st.s3, st.s4) + s5_ := aes_hw.aesenc(st.s4, st.s5) + st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_ +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { + t0 := intrinsics.unaligned_load((^simd.u8x16)(&ai[0])) + t1 := intrinsics.unaligned_load((^simd.u8x16)(&ai[16])) + update_hw_128l(st, t0, t1) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { + m := intrinsics.unaligned_load((^simd.u8x16)(&ai[0])) + update_hw_256(st, m) +} + +@(private, enable_target_feature = TARGET_FEATURES) +absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check { + ai, l := aad, len(aad) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + absorb_hw_128l(st, ai) + ai = ai[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + absorb_hw_256(st, ai) + + ai = ai[_RATE_256:] + l -= _RATE_256 + } + } + + // Pad out the remainder with `0`s till it is rate sized. + if l > 0 { + tmp: [_RATE_MAX]byte // AAD is not confidential. 
+ copy(tmp[:], ai) + switch st.rate { + case _RATE_128L: + absorb_hw_128l(st, tmp[:]) + case _RATE_256: + absorb_hw_256(st, tmp[:]) + } + } +} + +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) +z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (simd.u8x16, simd.u8x16) { + z0 := simd.bit_xor( + st.s6, + simd.bit_xor( + st.s1, + simd.bit_and(st.s2, st.s3), + ), + ) + z1 := simd.bit_xor( + st.s2, + simd.bit_xor( + st.s5, + simd.bit_and(st.s6, st.s7), + ), + ) + return z0, z1 +} + +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) +z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> simd.u8x16 { + return simd.bit_xor( + st.s1, + simd.bit_xor( + st.s4, + simd.bit_xor( + st.s5, + simd.bit_and(st.s2, st.s3), + ), + ), + ) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { + z0, z1 := z_hw_128l(st) + + t0 := intrinsics.unaligned_load((^simd.u8x16)(&xi[0])) + t1 := intrinsics.unaligned_load((^simd.u8x16)(&xi[16])) + update_hw_128l(st, t0, t1) + + out0 := simd.bit_xor(t0, z0) + out1 := simd.bit_xor(t1, z1) + intrinsics.unaligned_store((^simd.u8x16)(&ci[0]), out0) + intrinsics.unaligned_store((^simd.u8x16)(&ci[16]), out1) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { + z := z_hw_256(st) + + xi_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(xi))) + update_hw_256(st, xi_) + + ci_ := simd.bit_xor(xi_, z) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(ci)), ci_) +} + +@(private, enable_target_feature = TARGET_FEATURES) +enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { + ci, xi, l := dst, src, len(src) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + enc_hw_128l(st, ci, xi) + ci = ci[_RATE_128L:] + xi = 
xi[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + enc_hw_256(st, ci, xi) + ci = ci[_RATE_256:] + xi = xi[_RATE_256:] + l -= _RATE_256 + } + } + + // Pad out the remainder with `0`s till it is rate sized. + if l > 0 { + tmp: [_RATE_MAX]byte // Ciphertext is not confidential. + copy(tmp[:], xi) + switch st.rate { + case _RATE_128L: + enc_hw_128l(st, tmp[:], tmp[:]) + case _RATE_256: + enc_hw_256(st, tmp[:], tmp[:]) + } + copy(ci, tmp[:l]) + } +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { + z0, z1 := z_hw_128l(st) + + t0 := intrinsics.unaligned_load((^simd.u8x16)(&ci[0])) + t1 := intrinsics.unaligned_load((^simd.u8x16)(&ci[16])) + out0 := simd.bit_xor(t0, z0) + out1 := simd.bit_xor(t1, z1) + + update_hw_128l(st, out0, out1) + intrinsics.unaligned_store((^simd.u8x16)(&xi[0]), out0) + intrinsics.unaligned_store((^simd.u8x16)(&xi[16]), out1) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { + z := z_hw_256(st) + + ci_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(ci))) + xi_ := simd.bit_xor(ci_, z) + + update_hw_256(st, xi_) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(xi)), xi_) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { + tmp: [_RATE_128L]byte + defer crypto.zero_explicit(&tmp, size_of(tmp)) + + z0, z1 := z_hw_128l(st) + copy(tmp[:], cn) + + t0 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) + t1 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[16])) + out0 := simd.bit_xor(t0, z0) + out1 := simd.bit_xor(t1, z1) + + intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), out0) + intrinsics.unaligned_store((^simd.u8x16)(&tmp[16]), out1) + copy(xn, tmp[:]) + + for off 
:= len(xn); off < _RATE_128L; off += 1 { + tmp[off] = 0 + } + out0 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) // v0 + out1 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[16])) // v1 + update_hw_128l(st, out0, out1) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { + tmp: [_RATE_256]byte + defer crypto.zero_explicit(&tmp, size_of(tmp)) + + z := z_hw_256(st) + copy(tmp[:], cn) + + cn_ := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) + xn_ := simd.bit_xor(cn_, z) + + intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), xn_) + copy(xn, tmp[:]) + + for off := len(xn); off < _RATE_256; off += 1 { + tmp[off] = 0 + } + xn_ = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) + update_hw_256(st, xn_) +} + +@(private, enable_target_feature = TARGET_FEATURES) +dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { + xi, ci, l := dst, src, len(src) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + dec_hw_128l(st, xi, ci) + xi = xi[_RATE_128L:] + ci = ci[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + dec_hw_256(st, xi, ci) + xi = xi[_RATE_256:] + ci = ci[_RATE_256:] + l -= _RATE_256 + } + } + + // Process the remainder. 
+ if l > 0 { + switch st.rate { + case _RATE_128L: + dec_partial_hw_128l(st, xi, ci) + case _RATE_256: + dec_partial_hw_256(st, xi, ci) + } + } +} + +@(private, enable_target_feature = TARGET_FEATURES) +finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) { + tmp: [16]byte + endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8) + endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8) + + t := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) + + t0, t1: simd.u8x16 = ---, --- + switch st.rate { + case _RATE_128L: + t = simd.bit_xor(st.s2, t) + for _ in 0 ..< 7 { + update_hw_128l(st, t, t) + } + + t0 = simd.bit_xor(st.s0, st.s1) + t0 = simd.bit_xor(t0, st.s2) + t0 = simd.bit_xor(t0, st.s3) + + t1 = simd.bit_xor(st.s4, st.s5) + t1 = simd.bit_xor(t1, st.s6) + if len(tag) == TAG_SIZE_256 { + t1 = simd.bit_xor(t1, st.s7) + } + case _RATE_256: + t = simd.bit_xor(st.s3, t) + for _ in 0 ..< 7 { + update_hw_256(st, t) + } + + t0 = simd.bit_xor(st.s0, st.s1) + t0 = simd.bit_xor(t0, st.s2) + + t1 = simd.bit_xor(st.s3, st.s4) + t1 = simd.bit_xor(t1, st.s5) + } + switch len(tag) { + case TAG_SIZE_128: + t0 = simd.bit_xor(t0, t1) + intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0) + case TAG_SIZE_256: + intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0) + intrinsics.unaligned_store((^simd.u8x16)(&tag[16]), t1) + } +} + +@(private) +reset_state_hw :: proc "contextless" (st: ^State_HW) { + crypto.zero_explicit(st, size_of(st^)) +} diff --git a/core/crypto/aegis/aegis_impl_hw_gen.odin b/core/crypto/aegis/aegis_impl_hw_gen.odin index 5ec2f3d6e..8f8b4c5da 100644 --- a/core/crypto/aegis/aegis_impl_hw_gen.odin +++ b/core/crypto/aegis/aegis_impl_hw_gen.odin @@ -1,4 +1,6 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package aegis @(private = "file") @@ -7,7 +9,7 @@ ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported" @(private) State_HW :: struct {} -// is_hardware_accelerated returns true iff hardware accelerated AEGIS +// 
is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS // is supported. is_hardware_accelerated :: proc "contextless" () -> bool { return false diff --git a/core/crypto/aegis/aegis_impl_hw_intel.odin b/core/crypto/aegis/aegis_impl_hw_intel.odin deleted file mode 100644 index 7673b6b28..000000000 --- a/core/crypto/aegis/aegis_impl_hw_intel.odin +++ /dev/null @@ -1,389 +0,0 @@ -#+build amd64 -package aegis - -import "base:intrinsics" -import "core:crypto" -import "core:crypto/aes" -import "core:encoding/endian" -import "core:simd/x86" - -@(private) -State_HW :: struct { - s0: x86.__m128i, - s1: x86.__m128i, - s2: x86.__m128i, - s3: x86.__m128i, - s4: x86.__m128i, - s5: x86.__m128i, - s6: x86.__m128i, - s7: x86.__m128i, - rate: int, -} - -// is_hardware_accelerated returns true iff hardware accelerated AEGIS -// is supported. -is_hardware_accelerated :: proc "contextless" () -> bool { - return aes.is_hardware_accelerated() -} - -@(private, enable_target_feature = "sse2,aes") -init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) { - switch ctx._key_len { - case KEY_SIZE_128L: - key := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0])) - iv := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv))) - - st.s0 = x86._mm_xor_si128(key, iv) - st.s1 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0])) - st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0])) - st.s3 = st.s1 - st.s4 = st.s0 - st.s5 = x86._mm_xor_si128(key, st.s2) // key ^ C0 - st.s6 = x86._mm_xor_si128(key, st.s1) // key ^ C1 - st.s7 = st.s5 - st.rate = _RATE_128L - - for _ in 0 ..< 10 { - update_hw_128l(st, iv, key) - } - case KEY_SIZE_256: - k0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0])) - k1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16])) - n0 := intrinsics.unaligned_load((^x86.__m128i)(&iv[0])) - n1 := intrinsics.unaligned_load((^x86.__m128i)(&iv[16])) - - st.s0 = x86._mm_xor_si128(k0, n0) - st.s1 = x86._mm_xor_si128(k1, 
n1) - st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0])) - st.s3 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0])) - st.s4 = x86._mm_xor_si128(k0, st.s3) // k0 ^ C0 - st.s5 = x86._mm_xor_si128(k1, st.s2) // k1 ^ C1 - st.rate = _RATE_256 - - u0, u1 := st.s0, st.s1 - for _ in 0 ..< 4 { - update_hw_256(st, k0) - update_hw_256(st, k1) - update_hw_256(st, u0) - update_hw_256(st, u1) - } - } -} - -@(private = "file", enable_target_feature = "sse2,aes") -update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) { - s0_ := x86._mm_aesenc_si128(st.s7, x86._mm_xor_si128(st.s0, m0)) - s1_ := x86._mm_aesenc_si128(st.s0, st.s1) - s2_ := x86._mm_aesenc_si128(st.s1, st.s2) - s3_ := x86._mm_aesenc_si128(st.s2, st.s3) - s4_ := x86._mm_aesenc_si128(st.s3, x86._mm_xor_si128(st.s4, m1)) - s5_ := x86._mm_aesenc_si128(st.s4, st.s5) - s6_ := x86._mm_aesenc_si128(st.s5, st.s6) - s7_ := x86._mm_aesenc_si128(st.s6, st.s7) - st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_ -} - -@(private = "file", enable_target_feature = "sse2,aes") -update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) { - s0_ := x86._mm_aesenc_si128(st.s5, x86._mm_xor_si128(st.s0, m)) - s1_ := x86._mm_aesenc_si128(st.s0, st.s1) - s2_ := x86._mm_aesenc_si128(st.s1, st.s2) - s3_ := x86._mm_aesenc_si128(st.s2, st.s3) - s4_ := x86._mm_aesenc_si128(st.s3, st.s4) - s5_ := x86._mm_aesenc_si128(st.s4, st.s5) - st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_ -} - -@(private = "file", enable_target_feature = "sse2,aes") -absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { - t0 := intrinsics.unaligned_load((^x86.__m128i)(&ai[0])) - t1 := intrinsics.unaligned_load((^x86.__m128i)(&ai[16])) - update_hw_128l(st, t0, t1) -} - -@(private = "file", enable_target_feature = "sse2,aes") -absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { - m := 
intrinsics.unaligned_load((^x86.__m128i)(&ai[0])) - update_hw_256(st, m) -} - -@(private, enable_target_feature = "sse2,aes") -absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check { - ai, l := aad, len(aad) - - switch st.rate { - case _RATE_128L: - for l >= _RATE_128L { - absorb_hw_128l(st, ai) - ai = ai[_RATE_128L:] - l -= _RATE_128L - } - case _RATE_256: - for l >= _RATE_256 { - absorb_hw_256(st, ai) - - ai = ai[_RATE_256:] - l -= _RATE_256 - } - } - - // Pad out the remainder with `0`s till it is rate sized. - if l > 0 { - tmp: [_RATE_MAX]byte // AAD is not confidential. - copy(tmp[:], ai) - switch st.rate { - case _RATE_128L: - absorb_hw_128l(st, tmp[:]) - case _RATE_256: - absorb_hw_256(st, tmp[:]) - } - } -} - -@(private = "file", enable_target_feature = "sse2", require_results) -z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) { - z0 := x86._mm_xor_si128( - st.s6, - x86._mm_xor_si128( - st.s1, - x86._mm_and_si128(st.s2, st.s3), - ), - ) - z1 := x86._mm_xor_si128( - st.s2, - x86._mm_xor_si128( - st.s5, - x86._mm_and_si128(st.s6, st.s7), - ), - ) - return z0, z1 -} - -@(private = "file", enable_target_feature = "sse2", require_results) -z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i { - return x86._mm_xor_si128( - st.s1, - x86._mm_xor_si128( - st.s4, - x86._mm_xor_si128( - st.s5, - x86._mm_and_si128(st.s2, st.s3), - ), - ), - ) -} - -@(private = "file", enable_target_feature = "sse2,aes") -enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { - z0, z1 := z_hw_128l(st) - - t0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0])) - t1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16])) - update_hw_128l(st, t0, t1) - - out0 := x86._mm_xor_si128(t0, z0) - out1 := x86._mm_xor_si128(t1, z1) - intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), out0) - intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), out1) -} - -@(private = 
"file", enable_target_feature = "sse2,aes") -enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { - z := z_hw_256(st) - - xi_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi))) - update_hw_256(st, xi_) - - ci_ := x86._mm_xor_si128(xi_, z) - intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ci_) -} - -@(private, enable_target_feature = "sse2,aes") -enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { - ci, xi, l := dst, src, len(src) - - switch st.rate { - case _RATE_128L: - for l >= _RATE_128L { - enc_hw_128l(st, ci, xi) - ci = ci[_RATE_128L:] - xi = xi[_RATE_128L:] - l -= _RATE_128L - } - case _RATE_256: - for l >= _RATE_256 { - enc_hw_256(st, ci, xi) - ci = ci[_RATE_256:] - xi = xi[_RATE_256:] - l -= _RATE_256 - } - } - - // Pad out the remainder with `0`s till it is rate sized. - if l > 0 { - tmp: [_RATE_MAX]byte // Ciphertext is not confidential. - copy(tmp[:], xi) - switch st.rate { - case _RATE_128L: - enc_hw_128l(st, tmp[:], tmp[:]) - case _RATE_256: - enc_hw_256(st, tmp[:], tmp[:]) - } - copy(ci, tmp[:l]) - } -} - -@(private = "file", enable_target_feature = "sse2,aes") -dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { - z0, z1 := z_hw_128l(st) - - t0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0])) - t1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16])) - out0 := x86._mm_xor_si128(t0, z0) - out1 := x86._mm_xor_si128(t1, z1) - - update_hw_128l(st, out0, out1) - intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), out0) - intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), out1) -} - -@(private = "file", enable_target_feature = "sse2,aes") -dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { - z := z_hw_256(st) - - ci_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci))) - xi_ := x86._mm_xor_si128(ci_, z) - - update_hw_256(st, xi_) - 
intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), xi_) -} - -@(private = "file", enable_target_feature = "sse2,aes") -dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { - tmp: [_RATE_128L]byte - defer crypto.zero_explicit(&tmp, size_of(tmp)) - - z0, z1 := z_hw_128l(st) - copy(tmp[:], cn) - - t0 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) - t1 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) - out0 := x86._mm_xor_si128(t0, z0) - out1 := x86._mm_xor_si128(t1, z1) - - intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), out0) - intrinsics.unaligned_store((^x86.__m128i)(&tmp[16]), out1) - copy(xn, tmp[:]) - - for off := len(xn); off < _RATE_128L; off += 1 { - tmp[off] = 0 - } - out0 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) // v0 - out1 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) // v1 - update_hw_128l(st, out0, out1) -} - -@(private = "file", enable_target_feature = "sse2,aes") -dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { - tmp: [_RATE_256]byte - defer crypto.zero_explicit(&tmp, size_of(tmp)) - - z := z_hw_256(st) - copy(tmp[:], cn) - - cn_ := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) - xn_ := x86._mm_xor_si128(cn_, z) - - intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), xn_) - copy(xn, tmp[:]) - - for off := len(xn); off < _RATE_256; off += 1 { - tmp[off] = 0 - } - xn_ = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) - update_hw_256(st, xn_) -} - -@(private, enable_target_feature = "sse2,aes") -dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { - xi, ci, l := dst, src, len(src) - - switch st.rate { - case _RATE_128L: - for l >= _RATE_128L { - dec_hw_128l(st, xi, ci) - xi = xi[_RATE_128L:] - ci = ci[_RATE_128L:] - l -= _RATE_128L - } - case _RATE_256: - for l >= _RATE_256 { - dec_hw_256(st, xi, ci) - xi = xi[_RATE_256:] - ci = ci[_RATE_256:] - l -= _RATE_256 - } - 
} - - // Process the remainder. - if l > 0 { - switch st.rate { - case _RATE_128L: - dec_partial_hw_128l(st, xi, ci) - case _RATE_256: - dec_partial_hw_256(st, xi, ci) - } - } -} - -@(private, enable_target_feature = "sse2,aes") -finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) { - tmp: [16]byte - endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8) - endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8) - - t := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) - - t0, t1: x86.__m128i = ---, --- - switch st.rate { - case _RATE_128L: - t = x86._mm_xor_si128(st.s2, t) - for _ in 0 ..< 7 { - update_hw_128l(st, t, t) - } - - t0 = x86._mm_xor_si128(st.s0, st.s1) - t0 = x86._mm_xor_si128(t0, st.s2) - t0 = x86._mm_xor_si128(t0, st.s3) - - t1 = x86._mm_xor_si128(st.s4, st.s5) - t1 = x86._mm_xor_si128(t1, st.s6) - if len(tag) == TAG_SIZE_256 { - t1 = x86._mm_xor_si128(t1, st.s7) - } - case _RATE_256: - t = x86._mm_xor_si128(st.s3, t) - for _ in 0 ..< 7 { - update_hw_256(st, t) - } - - t0 = x86._mm_xor_si128(st.s0, st.s1) - t0 = x86._mm_xor_si128(t0, st.s2) - - t1 = x86._mm_xor_si128(st.s3, st.s4) - t1 = x86._mm_xor_si128(t1, st.s5) - } - switch len(tag) { - case TAG_SIZE_128: - t0 = x86._mm_xor_si128(t0, t1) - intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0) - case TAG_SIZE_256: - intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0) - intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), t1) - } -} - -@(private) -reset_state_hw :: proc "contextless" (st: ^State_HW) { - crypto.zero_explicit(st, size_of(st^)) -} diff --git a/core/crypto/aes/aes_ctr_hw_intel.odin b/core/crypto/aes/aes_ctr_hw.odin similarity index 62% rename from core/crypto/aes/aes_ctr_hw_intel.odin rename to core/crypto/aes/aes_ctr_hw.odin index f30122c86..859b63a40 100644 --- a/core/crypto/aes/aes_ctr_hw_intel.odin +++ b/core/crypto/aes/aes_ctr_hw.odin @@ -1,30 +1,32 @@ -#+build amd64 +#+build amd64,arm64,arm32 package aes import "base:intrinsics" import 
"core:crypto/_aes" +import aes_hw "core:crypto/_aes/hw" +import "core:encoding/endian" import "core:math/bits" -import "core:simd/x86" +import "core:simd" @(private) CTR_STRIDE_HW :: 4 @(private) CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE -@(private, enable_target_feature = "sse2,aes") +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check { hw_ctx := ctx._impl.(Context_Impl_Hardware) - sks: [15]x86.__m128i = --- + sks: [15]simd.u8x16 = --- for i in 0 ..= hw_ctx._num_rounds { - sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i])) + sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&hw_ctx._sk_exp_enc[i])) } - hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) { - ret := x86.__m128i{ - i64(intrinsics.byte_swap(hi)), - i64(intrinsics.byte_swap(lo)), - } + hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (simd.u8x16, u64, u64) { + buf: [BLOCK_SIZE]byte = --- + endian.unchecked_put_u64be(buf[0:], hi) + endian.unchecked_put_u64be(buf[8:], lo) + ret := intrinsics.unaligned_load((^simd.u8x16)(&buf)) hi, lo := hi, lo carry: u64 @@ -46,42 +48,42 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b nr_blocks := nr_blocks ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo - blks: [CTR_STRIDE_HW]x86.__m128i = --- + blks: [CTR_STRIDE_HW]simd.u8x16 = --- for nr_blocks >= CTR_STRIDE_HW { #unroll for i in 0..< CTR_STRIDE_HW { blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo) } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_xor_si128(blks[i], sks[0]) + blks[i] = simd.bit_xor(blks[i], sks[0]) } #unroll for i in 1 ..= 9 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } switch hw_ctx._num_rounds { case _aes.ROUNDS_128: #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = 
x86._mm_aesenclast_si128(blks[i], sks[10]) + blks[i] = aes_hw.aesenclast(blks[i], sks[10]) } case _aes.ROUNDS_192: #unroll for i in 10 ..= 11 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12]) + blks[i] = aes_hw.aesenclast(blks[i], sks[12]) } case _aes.ROUNDS_256: #unroll for i in 10 ..= 13 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14]) + blks[i] = aes_hw.aesenclast(blks[i], sks[14]) } } @@ -98,23 +100,23 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b for nr_blocks > 0 { blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo) - blks[0] = x86._mm_xor_si128(blks[0], sks[0]) + blks[0] = simd.bit_xor(blks[0], sks[0]) #unroll for i in 1 ..= 9 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } switch hw_ctx._num_rounds { case _aes.ROUNDS_128: - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10]) + blks[0] = aes_hw.aesenclast(blks[0], sks[10]) case _aes.ROUNDS_192: #unroll for i in 10 ..= 11 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12]) + blks[0] = aes_hw.aesenclast(blks[0], sks[12]) case _aes.ROUNDS_256: #unroll for i in 10 ..= 13 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14]) + blks[0] = aes_hw.aesenclast(blks[0], sks[14]) } xor_blocks_hw(dst, src, blks[:1]) @@ -133,18 +135,18 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b zero_explicit(&sks, size_of(sks)) } -@(private, 
enable_target_feature = "sse2") -xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) { +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) +xor_blocks_hw :: proc(dst, src: []byte, blocks: []simd.u8x16) { #no_bounds_check { if src != nil { for i in 0 ..< len(blocks) { off := i * BLOCK_SIZE - tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:]))) - blocks[i] = x86._mm_xor_si128(blocks[i], tmp) + tmp := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[off:]))) + blocks[i] = simd.bit_xor(blocks[i], tmp) } } for i in 0 ..< len(blocks) { - intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i]) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i]) } } } diff --git a/core/crypto/aes/aes_ecb_hw.odin b/core/crypto/aes/aes_ecb_hw.odin new file mode 100644 index 000000000..87a006d9b --- /dev/null +++ b/core/crypto/aes/aes_ecb_hw.odin @@ -0,0 +1,59 @@ +#+build amd64,arm64,arm32 +package aes + +import "base:intrinsics" +import "core:crypto/_aes" +import aes_hw "core:crypto/_aes/hw" +import "core:simd" + +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) +encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { + blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))) + + blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[0]))) + #unroll for i in 1 ..= 9 { + blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))) + } + switch ctx._num_rounds { + case _aes.ROUNDS_128: + blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[10]))) + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))) + } + blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[12]))) + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + blk = aes_hw.aesenc(blk, 
intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))) + } + blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[14]))) + } + + intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk) +} + +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) +decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { + blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))) + + blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[0]))) + #unroll for i in 1 ..= 9 { + blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))) + } + switch ctx._num_rounds { + case _aes.ROUNDS_128: + blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[10]))) + case _aes.ROUNDS_192: + #unroll for i in 10 ..= 11 { + blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))) + } + blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[12]))) + case _aes.ROUNDS_256: + #unroll for i in 10 ..= 13 { + blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))) + } + blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[14]))) + } + + intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk) +} diff --git a/core/crypto/aes/aes_ecb_hw_intel.odin b/core/crypto/aes/aes_ecb_hw_intel.odin deleted file mode 100644 index f1d44a25f..000000000 --- a/core/crypto/aes/aes_ecb_hw_intel.odin +++ /dev/null @@ -1,58 +0,0 @@ -#+build amd64 -package aes - -import "base:intrinsics" -import "core:crypto/_aes" -import "core:simd/x86" - -@(private, enable_target_feature = "sse2,aes") -encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { - blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))) - - blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0]))) - #unroll for i in 1 ..= 9 { 
- blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) - } - switch ctx._num_rounds { - case _aes.ROUNDS_128: - blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10]))) - case _aes.ROUNDS_192: - #unroll for i in 10 ..= 11 { - blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) - } - blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12]))) - case _aes.ROUNDS_256: - #unroll for i in 10 ..= 13 { - blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))) - } - blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14]))) - } - - intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk) -} - -@(private, enable_target_feature = "sse2,aes") -decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) { - blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))) - - blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0]))) - #unroll for i in 1 ..= 9 { - blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) - } - switch ctx._num_rounds { - case _aes.ROUNDS_128: - blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10]))) - case _aes.ROUNDS_192: - #unroll for i in 10 ..= 11 { - blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) - } - blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12]))) - case _aes.ROUNDS_256: - #unroll for i in 10 ..= 13 { - blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))) - } - blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14]))) - } - - 
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk) -} diff --git a/core/crypto/aes/aes_gcm.odin b/core/crypto/aes/aes_gcm.odin index bb87788ac..531844a32 100644 --- a/core/crypto/aes/aes_gcm.odin +++ b/core/crypto/aes/aes_gcm.odin @@ -4,6 +4,7 @@ import "core:bytes" import "core:crypto" import "core:crypto/_aes" import "core:crypto/_aes/ct64" +import aes_hw "core:crypto/_aes/hw" import "core:encoding/endian" // GCM_IV_SIZE is the default size of the GCM IV in bytes. @@ -26,6 +27,10 @@ Context_GCM :: struct { // init_gcm initializes a Context_GCM with the provided key. init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) { + when aes_hw.HAS_GHASH { + impl := aes_hw.is_ghash_supported() ? impl : .Portable + + } init_impl(&ctx._impl, key, impl) ctx._is_initialized = true } @@ -65,7 +70,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) { // open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context_GCM, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. 
diff --git a/core/crypto/aes/aes_gcm_hw_intel.odin b/core/crypto/aes/aes_gcm_hw.odin similarity index 63% rename from core/crypto/aes/aes_gcm_hw_intel.odin rename to core/crypto/aes/aes_gcm_hw.odin index c6e564773..13c035a20 100644 --- a/core/crypto/aes/aes_gcm_hw_intel.odin +++ b/core/crypto/aes/aes_gcm_hw.odin @@ -1,12 +1,13 @@ -#+build amd64 +#+build amd64,arm64,arm32 package aes import "base:intrinsics" import "core:crypto" import "core:crypto/_aes" -import "core:crypto/_aes/hw_intel" +@(require) import "core:crypto/_aes/ct64" +import aes_hw "core:crypto/_aes/hw" import "core:encoding/endian" -import "core:simd/x86" +import "core:simd" @(private) gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) { @@ -17,7 +18,11 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [ init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) // Note: Our GHASH implementation handles appending padding. - hw_intel.ghash(s[:], h[:], aad) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], aad) + } else { + ct64.ghash(s[:], h[:], aad) + } gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true) final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext)) copy(tag, s[:]) @@ -35,7 +40,11 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: s: [_aes.GHASH_TAG_SIZE]byte init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) - hw_intel.ghash(s[:], h[:], aad) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], aad) + } else { + ct64.ghash(s[:], h[:], aad) + } gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false) final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext)) @@ -71,18 +80,26 @@ init_ghash_hw :: proc( } else { // If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV), // and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64). 
- hw_intel.ghash(j0[:], h[:], iv) + when aes_hw.HAS_GHASH { + aes_hw.ghash(j0[:], h[:], iv) + } else { + ct64.ghash(j0[:], h[:], iv) + } tmp: [_aes.GHASH_BLOCK_SIZE]byte endian.unchecked_put_u64be(tmp[8:], u64(l) * 8) - hw_intel.ghash(j0[:], h[:], tmp[:]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(j0[:], h[:], tmp[:]) + } else { + ct64.ghash(j0[:], h[:], tmp[:]) + } } // ECB encrypt j0, so that we can just XOR with the tag. encrypt_block_hw(ctx, j0_enc[:], j0[:]) } -@(private = "file", enable_target_feature = "sse2") +@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES) final_ghash_hw :: proc( s: ^[_aes.GHASH_BLOCK_SIZE]byte, h: ^[_aes.GHASH_KEY_SIZE]byte, @@ -94,14 +111,18 @@ final_ghash_hw :: proc( endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8) endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8) - hw_intel.ghash(s[:], h[:], blk[:]) - j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0)) - s_vec := intrinsics.unaligned_load((^x86.__m128i)(s)) - s_vec = x86._mm_xor_si128(s_vec, j0_vec) - intrinsics.unaligned_store((^x86.__m128i)(s), s_vec) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], blk[:]) + } else { + ct64.ghash(s[:], h[:], blk[:]) + } + j0_vec := intrinsics.unaligned_load((^simd.u8x16)(j0)) + s_vec := intrinsics.unaligned_load((^simd.u8x16)(s)) + s_vec = simd.bit_xor(s_vec, j0_vec) + intrinsics.unaligned_store((^simd.u8x16)(s), s_vec) } -@(private = "file", enable_target_feature = "sse2,sse4.1,aes") +@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES) gctr_hw :: proc( ctx: ^Context_Impl_Hardware, dst: []byte, @@ -111,13 +132,13 @@ gctr_hw :: proc( iv: ^[_aes.GHASH_BLOCK_SIZE]byte, is_seal: bool, ) #no_bounds_check { - sks: [15]x86.__m128i = --- + sks: [15]simd.u8x16 = --- for i in 0 ..= ctx._num_rounds { - sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])) + sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])) } // Setup the counter block - ctr_blk := 
intrinsics.unaligned_load((^x86.__m128i)(iv)) + ctr_blk := intrinsics.unaligned_load((^simd.u8x16)(iv)) ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1 src, dst := src, dst @@ -127,11 +148,15 @@ gctr_hw :: proc( // This results in an unreadable mess, so we opt for simplicity // as performance is adequate. - blks: [CTR_STRIDE_HW]x86.__m128i = --- + blks: [CTR_STRIDE_HW]simd.u8x16 = --- nr_blocks := len(src) / BLOCK_SIZE for nr_blocks >= CTR_STRIDE_HW { if !is_seal { - hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + } else { + ct64.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + } } #unroll for i in 0 ..< CTR_STRIDE_HW { @@ -139,42 +164,46 @@ gctr_hw :: proc( } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_xor_si128(blks[i], sks[0]) + blks[i] = simd.bit_xor(blks[i], sks[0]) } #unroll for i in 1 ..= 9 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } switch ctx._num_rounds { case _aes.ROUNDS_128: #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10]) + blks[i] = aes_hw.aesenclast(blks[i], sks[10]) } case _aes.ROUNDS_192: #unroll for i in 10 ..= 11 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12]) + blks[i] = aes_hw.aesenclast(blks[i], sks[12]) } case _aes.ROUNDS_256: #unroll for i in 10 ..= 13 { #unroll for j in 0 ..< CTR_STRIDE_HW { - blks[j] = x86._mm_aesenc_si128(blks[j], sks[i]) + blks[j] = aes_hw.aesenc(blks[j], sks[i]) } } #unroll for i in 0 ..< CTR_STRIDE_HW { - blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14]) + blks[i] = aes_hw.aesenclast(blks[i], sks[14]) } } xor_blocks_hw(dst, src, blks[:]) if is_seal { - hw_intel.ghash(s[:], h[:], 
dst[:CTR_STRIDE_BYTES_HW]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) + } else { + ct64.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) + } } src = src[CTR_STRIDE_BYTES_HW:] @@ -186,28 +215,32 @@ gctr_hw :: proc( for n := len(src); n > 0; { l := min(n, BLOCK_SIZE) if !is_seal { - hw_intel.ghash(s[:], h[:], src[:l]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], src[:l]) + } else { + ct64.ghash(s[:], h[:], src[:l]) + } } blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr) - blks[0] = x86._mm_xor_si128(blks[0], sks[0]) + blks[0] = simd.bit_xor(blks[0], sks[0]) #unroll for i in 1 ..= 9 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } switch ctx._num_rounds { case _aes.ROUNDS_128: - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10]) + blks[0] = aes_hw.aesenclast(blks[0], sks[10]) case _aes.ROUNDS_192: #unroll for i in 10 ..= 11 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12]) + blks[0] = aes_hw.aesenclast(blks[0], sks[12]) case _aes.ROUNDS_256: #unroll for i in 10 ..= 13 { - blks[0] = x86._mm_aesenc_si128(blks[0], sks[i]) + blks[0] = aes_hw.aesenc(blks[0], sks[i]) } - blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14]) + blks[0] = aes_hw.aesenclast(blks[0], sks[14]) } if l == BLOCK_SIZE { @@ -219,7 +252,11 @@ gctr_hw :: proc( copy(dst, blk[:l]) } if is_seal { - hw_intel.ghash(s[:], h[:], dst[:l]) + when aes_hw.HAS_GHASH { + aes_hw.ghash(s[:], h[:], dst[:l]) + } else { + ct64.ghash(s[:], h[:], dst[:l]) + } } dst = dst[l:] @@ -235,8 +272,17 @@ gctr_hw :: proc( // the compiler. 
// // src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity` -@(private = "file", enable_target_feature = "sse4.1") -hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) { - ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3) +@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES) +hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^simd.u8x16, ctr: u32) -> (simd.u8x16, u32) { + when ODIN_ENDIAN == .Little { + ctr_be := intrinsics.byte_swap(ctr) + } else { + ctr_be := ctr + } + + ret := transmute(simd.u8x16)( + simd.replace(transmute(simd.u32x4)(src^), 3, ctr_be) + ) + return ret, ctr + 1 } diff --git a/core/crypto/aes/aes_impl_hw.odin b/core/crypto/aes/aes_impl_hw.odin new file mode 100644 index 000000000..fe93966f8 --- /dev/null +++ b/core/crypto/aes/aes_impl_hw.odin @@ -0,0 +1,18 @@ +#+build amd64,arm64,arm32 +package aes + +import aes_hw "core:crypto/_aes/hw" + +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES +// is supported. +is_hardware_accelerated :: proc "contextless" () -> bool { + return aes_hw.is_supported() +} + +@(private) +Context_Impl_Hardware :: aes_hw.Context + +@(private, enable_target_feature = aes_hw.TARGET_FEATURES) +init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) { + aes_hw.init(ctx, key) +} diff --git a/core/crypto/aes/aes_impl_hw_gen.odin b/core/crypto/aes/aes_impl_hw_gen.odin index 0c9ec6edc..10b08b7b5 100644 --- a/core/crypto/aes/aes_impl_hw_gen.odin +++ b/core/crypto/aes/aes_impl_hw_gen.odin @@ -1,10 +1,12 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package aes @(private = "file") ERR_HW_NOT_SUPPORTED :: "crypto/aes: hardware implementation unsupported" -// is_hardware_accelerated returns true iff hardware accelerated AES +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES // is supported. 
is_hardware_accelerated :: proc "contextless" () -> bool { return false diff --git a/core/crypto/aes/aes_impl_hw_intel.odin b/core/crypto/aes/aes_impl_hw_intel.odin deleted file mode 100644 index 0f1fa6143..000000000 --- a/core/crypto/aes/aes_impl_hw_intel.odin +++ /dev/null @@ -1,18 +0,0 @@ -#+build amd64 -package aes - -import "core:crypto/_aes/hw_intel" - -// is_hardware_accelerated returns true iff hardware accelerated AES -// is supported. -is_hardware_accelerated :: proc "contextless" () -> bool { - return hw_intel.is_supported() -} - -@(private) -Context_Impl_Hardware :: hw_intel.Context - -@(private, enable_target_feature = "sse2,aes") -init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) { - hw_intel.init(ctx, key) -} diff --git a/core/crypto/blake2b/blake2b.odin b/core/crypto/blake2b/blake2b.odin index 6c2c5c1e9..8cce6dac8 100644 --- a/core/crypto/blake2b/blake2b.odin +++ b/core/crypto/blake2b/blake2b.odin @@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _blake2.final(ctx, hash, finalize_clone) diff --git a/core/crypto/blake2s/blake2s.odin b/core/crypto/blake2s/blake2s.odin index 902f992b3..35e278f72 100644 --- a/core/crypto/blake2s/blake2s.odin +++ b/core/crypto/blake2s/blake2s.odin @@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. 
// -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _blake2.final(ctx, hash, finalize_clone) diff --git a/core/crypto/chacha20poly1305/chacha20poly1305.odin b/core/crypto/chacha20poly1305/chacha20poly1305.odin index 0504acab0..c405a2736 100644 --- a/core/crypto/chacha20poly1305/chacha20poly1305.odin +++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin @@ -136,7 +136,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // open authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. diff --git a/core/crypto/crypto.odin b/core/crypto/crypto.odin index b36bc2004..f4ddbfbe7 100644 --- a/core/crypto/crypto.odin +++ b/core/crypto/crypto.odin @@ -8,15 +8,15 @@ import subtle "core:crypto/_subtle" // Omit large precomputed tables, trading off performance for size. COMPACT_IMPLS: bool : #config(ODIN_CRYPTO_COMPACT, false) -// HAS_RAND_BYTES is true iff the runtime provides a cryptographic +// HAS_RAND_BYTES is true if and only if (⟺) the runtime provides a cryptographic // entropy source. HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES -// compare_constant_time returns 1 iff a and b are equal, 0 otherwise. +// compare_constant_time returns 1 if and only if (⟺) a and b are equal, 0 otherwise. // // The execution time of this routine is constant regardless of the contents // of the slices being compared, as long as the length of the slices is equal. 
-// If the length of the two slices is different, it will early-return 0. +// If the length of the two slices is different, it will early-return 0. compare_constant_time :: proc "contextless" (a, b: []byte) -> int { // If the length of the slices is different, early return. // @@ -31,7 +31,7 @@ compare_constant_time :: proc "contextless" (a, b: []byte) -> int { return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n) } -// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by +// compare_byte_ptrs_constant_time returns 1 if and only if (⟺) the bytes pointed to by // a and b are equal, 0 otherwise. // // The execution time of this routine is constant regardless of the @@ -46,12 +46,12 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i v |= x[i] ~ y[i] } - // After the loop, v == 0 iff a == b. The subtraction will underflow - // iff v == 0, setting the sign-bit, which gets returned. + // After the loop, v == 0 if and only if (⟺) a == b. The subtraction will underflow + // if and only if (⟺) v == 0, setting the sign-bit, which gets returned. return subtle.eq(0, v) } -// is_zero_constant_time returns 1 iff b is all 0s, 0 otherwise. +// is_zero_constant_time returns 1 if and only if (⟺) b is all 0s, 0 otherwise. is_zero_constant_time :: proc "contextless" (b: []byte) -> int { v: byte for b_ in b { diff --git a/core/crypto/deoxysii/deoxysii.odin b/core/crypto/deoxysii/deoxysii.odin index 829d3d3ad..ffe9b4b32 100644 --- a/core/crypto/deoxysii/deoxysii.odin +++ b/core/crypto/deoxysii/deoxysii.odin @@ -122,7 +122,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // open authenticates the aad and ciphertext, and decrypts the ciphertext, // with the provided Context, iv, and tag, and stores the output in dst, -// returning true iff the authentication was successful. If authentication +// returning true if and only if (⟺) the authentication was successful. 
If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. diff --git a/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin b/core/crypto/deoxysii/deoxysii_impl_hw.odin similarity index 57% rename from core/crypto/deoxysii/deoxysii_impl_hw_intel.odin rename to core/crypto/deoxysii/deoxysii_impl_hw.odin index cdad16f42..47f9ab55f 100644 --- a/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin +++ b/core/crypto/deoxysii/deoxysii_impl_hw.odin @@ -1,152 +1,183 @@ -#+build amd64 +#+build amd64,arm64,arm32 package deoxysii import "base:intrinsics" import "core:crypto" -import "core:crypto/aes" +import aes_hw "core:crypto/_aes/hw" import "core:simd" -import "core:simd/x86" // This processes a maximum of 4 blocks at a time, as that is suitable // for most current hardware that doesn't say "Xeon". +// +// TODO/perf: ARM should be able to do 8 at a time. + +when ODIN_ARCH == .amd64 { + @(private="file") + TARGET_FEATURES :: "sse2,ssse3,aes" +} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + @(private="file") + TARGET_FEATURES :: "neon,aes" +} @(private = "file") -_BIT_ENC :: x86.__m128i{0x80, 0} +_BIT_ENC :: simd.u8x16{0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} @(private = "file") -_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0} +_PREFIX_AD_BLOCK :: simd.u8x16{ + PREFIX_AD_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} @(private = "file") -_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0} +_PREFIX_AD_FINAL :: simd.u8x16{ + PREFIX_AD_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} @(private = "file") -_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0} +_PREFIX_MSG_BLOCK :: simd.u8x16{ + PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} @(private = "file") -_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0} +_PREFIX_MSG_FINAL :: simd.u8x16{ + 
PREFIX_MSG_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} -// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II // is supported. is_hardware_accelerated :: proc "contextless" () -> bool { - return aes.is_hardware_accelerated() + return aes_hw.is_supported() } -@(private = "file", enable_target_feature = "sse4.1", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) auth_tweak :: #force_inline proc "contextless" ( - prefix: x86.__m128i, + prefix: simd.u8x16, block_nr: int, -) -> x86.__m128i { - return x86._mm_insert_epi64(prefix, i64(intrinsics.byte_swap(u64(block_nr))), 1) -} +) -> simd.u8x16 { + when ODIN_ENDIAN == .Little { + block_nr_u64 := intrinsics.byte_swap(u64(block_nr)) + } else { + block_nr_u64 := u64(block_nr) + } -@(private = "file", enable_target_feature = "sse2", require_results) -enc_tweak :: #force_inline proc "contextless" ( - tag: x86.__m128i, - block_nr: int, -) -> x86.__m128i { - return x86._mm_xor_si128( - x86._mm_or_si128(tag, _BIT_ENC), - x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))}, + return simd.bit_or( + prefix, + transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}), ) } -@(private = "file", enable_target_feature = "ssse3", require_results) -h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i { - return transmute(x86.__m128i)h(transmute(simd.u8x16)tk1) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) +enc_tweak :: #force_inline proc "contextless" ( + tag: simd.u8x16, + block_nr: int, +) -> simd.u8x16 { + when ODIN_ENDIAN == .Little { + block_nr_u64 := intrinsics.byte_swap(u64(block_nr)) + } else { + block_nr_u64 := u64(block_nr) + } + + return simd.bit_xor( + simd.bit_or(tag, _BIT_ENC), + transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}), + ) } -@(private = "file", enable_target_feature = 
"sse2,ssse3,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_x4 :: #force_inline proc "contextless" ( ctx: ^Context, - s_0, s_1, s_2, s_3: x86.__m128i, - tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i, -) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check { + s_0, s_1, s_2, s_3: simd.u8x16, + tweak_0, tweak_1, tweak_2, tweak_3: simd.u8x16, +) -> (simd.u8x16, simd.u8x16, simd.u8x16, simd.u8x16) #no_bounds_check { s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3 tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3 - sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0])) - stk_0 := x86._mm_xor_si128(tk1_0, sk) - stk_1 := x86._mm_xor_si128(tk1_1, sk) - stk_2 := x86._mm_xor_si128(tk1_2, sk) - stk_3 := x86._mm_xor_si128(tk1_3, sk) + sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0])) + stk_0 := simd.bit_xor(tk1_0, sk) + stk_1 := simd.bit_xor(tk1_1, sk) + stk_2 := simd.bit_xor(tk1_2, sk) + stk_3 := simd.bit_xor(tk1_3, sk) - s_0 = x86._mm_xor_si128(s_0, stk_0) - s_1 = x86._mm_xor_si128(s_1, stk_1) - s_2 = x86._mm_xor_si128(s_2, stk_2) - s_3 = x86._mm_xor_si128(s_3, stk_3) + s_0 = simd.bit_xor(s_0, stk_0) + s_1 = simd.bit_xor(s_1, stk_1) + s_2 = simd.bit_xor(s_2, stk_2) + s_3 = simd.bit_xor(s_3, stk_3) for i in 1 ..= BC_ROUNDS { - sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i])) + sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i])) - tk1_0 = h_(tk1_0) - tk1_1 = h_(tk1_1) - tk1_2 = h_(tk1_2) - tk1_3 = h_(tk1_3) + tk1_0 = h(tk1_0) + tk1_1 = h(tk1_1) + tk1_2 = h(tk1_2) + tk1_3 = h(tk1_3) - stk_0 = x86._mm_xor_si128(tk1_0, sk) - stk_1 = x86._mm_xor_si128(tk1_1, sk) - stk_2 = x86._mm_xor_si128(tk1_2, sk) - stk_3 = x86._mm_xor_si128(tk1_3, sk) + stk_0 = simd.bit_xor(tk1_0, sk) + stk_1 = simd.bit_xor(tk1_1, sk) + stk_2 = simd.bit_xor(tk1_2, sk) + stk_3 = simd.bit_xor(tk1_3, sk) - s_0 = x86._mm_aesenc_si128(s_0, stk_0) - s_1 = 
x86._mm_aesenc_si128(s_1, stk_1) - s_2 = x86._mm_aesenc_si128(s_2, stk_2) - s_3 = x86._mm_aesenc_si128(s_3, stk_3) + s_0 = aes_hw.aesenc(s_0, stk_0) + s_1 = aes_hw.aesenc(s_1, stk_1) + s_2 = aes_hw.aesenc(s_2, stk_2) + s_3 = aes_hw.aesenc(s_3, stk_3) } return s_0, s_1, s_2, s_3 } -@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_x1 :: #force_inline proc "contextless" ( ctx: ^Context, - s: x86.__m128i, - tweak: x86.__m128i, -) -> x86.__m128i #no_bounds_check { + s: simd.u8x16, + tweak: simd.u8x16, +) -> simd.u8x16 #no_bounds_check { s, tk1 := s, tweak - sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0])) - stk := x86._mm_xor_si128(tk1, sk) + sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0])) + stk := simd.bit_xor(tk1, sk) - s = x86._mm_xor_si128(s, stk) + s = simd.bit_xor(s, stk) for i in 1 ..= BC_ROUNDS { - sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i])) + sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i])) - tk1 = h_(tk1) + tk1 = h(tk1) - stk = x86._mm_xor_si128(tk1, sk) + stk = simd.bit_xor(tk1, sk) - s = x86._mm_aesenc_si128(s, stk) + s = aes_hw.aesenc(s, stk) } return s } -@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_absorb :: proc "contextless" ( ctx: ^Context, - tag: x86.__m128i, + tag: simd.u8x16, src: []byte, - tweak_prefix: x86.__m128i, + tweak_prefix: simd.u8x16, stk_block_nr: int, -) -> (x86.__m128i, int) #no_bounds_check { +) -> (simd.u8x16, int) #no_bounds_check { src, stk_block_nr, tag := src, stk_block_nr, tag nr_blocks := len(src) / BLOCK_SIZE for nr_blocks >= 4 { d_0, d_1, d_2, d_3 := bc_x4( ctx, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))), - 
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))), - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))), auth_tweak(tweak_prefix, stk_block_nr), auth_tweak(tweak_prefix, stk_block_nr + 1), auth_tweak(tweak_prefix, stk_block_nr + 2), auth_tweak(tweak_prefix, stk_block_nr + 3), ) - tag = x86._mm_xor_si128(tag, d_0) - tag = x86._mm_xor_si128(tag, d_1) - tag = x86._mm_xor_si128(tag, d_2) - tag = x86._mm_xor_si128(tag, d_3) + tag = simd.bit_xor(tag, d_0) + tag = simd.bit_xor(tag, d_1) + tag = simd.bit_xor(tag, d_2) + tag = simd.bit_xor(tag, d_3) src = src[4*BLOCK_SIZE:] stk_block_nr += 4 @@ -156,11 +187,11 @@ bc_absorb :: proc "contextless" ( for nr_blocks > 0 { d := bc_x1( ctx, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))), auth_tweak(tweak_prefix, stk_block_nr), ) - tag = x86._mm_xor_si128(tag, d) + tag = simd.bit_xor(tag, d) src = src[BLOCK_SIZE:] stk_block_nr += 1 @@ -170,29 +201,29 @@ bc_absorb :: proc "contextless" ( return tag, stk_block_nr } -@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +@(private = "file", enable_target_feature = TARGET_FEATURES, require_results) bc_final :: proc "contextless" ( ctx: ^Context, - tag: x86.__m128i, + tag: simd.u8x16, iv: []byte, -) -> x86.__m128i { +) -> simd.u8x16 { tmp: [BLOCK_SIZE]byte tmp[0] = PREFIX_TAG << PREFIX_SHIFT copy(tmp[1:], iv) - tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp)) + tweak := intrinsics.unaligned_load((^simd.u8x16)(&tmp)) return bc_x1(ctx, tag, tweak) } -@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +@(private = "file", enable_target_feature = 
TARGET_FEATURES, require_results) bc_encrypt :: proc "contextless" ( ctx: ^Context, dst: []byte, src: []byte, - iv: x86.__m128i, - tweak_tag: x86.__m128i, + iv: simd.u8x16, + tweak_tag: simd.u8x16, stk_block_nr: int, ) -> int { dst, src, stk_block_nr := dst, src, stk_block_nr @@ -209,31 +240,31 @@ bc_encrypt :: proc "contextless" ( ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst)), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst)), + simd.bit_xor( d_0, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))), ), ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst[BLOCK_SIZE:])), + simd.bit_xor( d_1, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))), ), ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst[2*BLOCK_SIZE:])), + simd.bit_xor( d_2, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))), ), ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst[3*BLOCK_SIZE:])), + simd.bit_xor( d_3, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))), ), ) @@ -250,10 +281,10 @@ bc_encrypt :: proc "contextless" ( ) intrinsics.unaligned_store( - (^x86.__m128i)(raw_data(dst)), - x86._mm_xor_si128( + (^simd.u8x16)(raw_data(dst)), + simd.bit_xor( d, - intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))), ), ) @@ -269,7 +300,7 @@ bc_encrypt :: proc "contextless" ( e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) 
#no_bounds_check { tmp: [BLOCK_SIZE]byte copy(tmp[1:], iv) - iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp))) + iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp))) // Algorithm 3 // @@ -282,7 +313,7 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) // if A_∗ != nil then // Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗)) // end - auth: x86.__m128i + auth: simd.u8x16 n: int aad := aad @@ -341,14 +372,14 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) copy(dst[n*BLOCK_SIZE:], m_star[:]) } - intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(tag)), auth) } @(private, require_results) d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { tmp: [BLOCK_SIZE]byte copy(tmp[1:], iv) - iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp))) + iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp))) // Algorithm 4 // @@ -360,7 +391,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte // if C_∗ != nil then // M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N) // end - auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag))) + auth := intrinsics.unaligned_load((^simd.u8x16)(raw_data(tag))) m := ciphertext n := bc_encrypt(ctx, dst, m, iv_, auth, 0) @@ -385,7 +416,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte // if A∗ != nil then // Auth <- Auth ^ EK(0110| | l_a, pad10∗(A_∗)) // end - auth = x86.__m128i{0, 0} + auth = simd.u8x16{} aad := aad auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0) aad = aad[BLOCK_SIZE*n:] @@ -424,7 +455,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte // Tag verification // if tag0 = tag then return (M_1 || ... 
|| M_l || M_∗) // else return false - intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth) + intrinsics.unaligned_store((^simd.u8x16)(raw_data(&tmp)), auth) ok := crypto.compare_constant_time(tmp[:], tag) == 1 crypto.zero_explicit(&tmp, size_of(tmp)) diff --git a/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin index b0705ca62..7f5444535 100644 --- a/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin +++ b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin @@ -1,10 +1,12 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package deoxysii @(private = "file") ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported" -// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II +// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II // is supported. is_hardware_accelerated :: proc "contextless" () -> bool { return false diff --git a/core/crypto/ecdh/ecdh.odin b/core/crypto/ecdh/ecdh.odin index af60f5649..6a8f6e466 100644 --- a/core/crypto/ecdh/ecdh.odin +++ b/core/crypto/ecdh/ecdh.odin @@ -104,7 +104,7 @@ Public_Key :: struct { } // private_key_generate uses the system entropy source to generate a new -// Private_Key. This will only fail iff the system entropy source is +// Private_Key. This will only fail if and only if (⟺) the system entropy source is // missing or broken. private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool { private_key_clear(priv_key) @@ -142,7 +142,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool { } // private_key_set_bytes decodes a byte-encoded private key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. 
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool { private_key_clear(priv_key) @@ -245,7 +245,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) { } } -// private_key_equal returns true iff the private keys are equal, +// private_key_equal returns true if and only if (⟺) the private keys are equal, // in constant time. private_key_equal :: proc(p, q: ^Private_Key) -> bool { if p._curve != q._curve { @@ -276,7 +276,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) { } // public_key_set_bytes decodes a byte-encoded public key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool { public_key_clear(pub_key) @@ -365,7 +365,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) { } } -// public_key_equal returns true iff the public keys are equal, +// public_key_equal returns true if and only if (⟺) the public keys are equal, // in constant time. public_key_equal :: proc(p, q: ^Public_Key) -> bool { if p._curve != q._curve { diff --git a/core/crypto/ecdsa/ecdsa.odin b/core/crypto/ecdsa/ecdsa.odin index 241d50987..6c71feef7 100644 --- a/core/crypto/ecdsa/ecdsa.odin +++ b/core/crypto/ecdsa/ecdsa.odin @@ -79,7 +79,7 @@ Public_Key :: struct { } // private_key_generate uses the system entropy source to generate a new -// Private_Key. This will only fail iff the system entropy source is +// Private_Key. This will only fail if and only if (⟺) the system entropy source is // missing or broken. private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool { private_key_clear(priv_key) @@ -111,7 +111,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool { } // private_key_set_bytes decodes a byte-encoded private key, and returns -// true iff the operation was successful. 
+// true if and only if (⟺) the operation was successful. private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool { private_key_clear(priv_key) @@ -194,7 +194,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) { } } -// private_key_equal returns true iff the private keys are equal, +// private_key_equal returns true if and only if (⟺) the private keys are equal, // in constant time. private_key_equal :: proc(p, q: ^Private_Key) -> bool { if p._curve != q._curve { @@ -219,7 +219,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) { } // public_key_set_bytes decodes a byte-encoded public key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool { public_key_clear(pub_key) @@ -296,7 +296,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) { } } -// public_key_equal returns true iff the public keys are equal, +// public_key_equal returns true if and only if (⟺) the public keys are equal, // in constant time. public_key_equal :: proc(p, q: ^Public_Key) -> bool { if p._curve != q._curve { diff --git a/core/crypto/ecdsa/ecdsa_asn1.odin b/core/crypto/ecdsa/ecdsa_asn1.odin index 0b423286d..74c9d65e6 100644 --- a/core/crypto/ecdsa/ecdsa_asn1.odin +++ b/core/crypto/ecdsa/ecdsa_asn1.odin @@ -141,7 +141,7 @@ parse_asn1_sig :: proc(sig: []byte) -> (r, s: []byte, ok: bool) { return nil, nil, false } - // DER requires a leading 0 iff the sign bit of the leading byte + // DER requires a leading 0 if and only if (⟺) the sign bit of the leading byte // is set to distinguish between positive and negative integers, // and the minimal length representation. 
`r` and `s` are always // going to be unsigned, so we validate malformed DER and strip diff --git a/core/crypto/ecdsa/ecdsa_verify.odin b/core/crypto/ecdsa/ecdsa_verify.odin index 6b4e3dd4a..bd973a8df 100644 --- a/core/crypto/ecdsa/ecdsa_verify.odin +++ b/core/crypto/ecdsa/ecdsa_verify.odin @@ -3,7 +3,7 @@ package ecdsa import "core:crypto/hash" import secec "core:crypto/_weierstrass" -// verify_raw returns true iff sig is a valid signature by pub_key over +// verify_raw returns true if and only if (⟺) sig is a valid signature by pub_key over // msg, hased using hash_algo, per the verification procedure specifed // in SEC 1, Version 2.0, Section 4.1.4. // @@ -33,7 +33,7 @@ verify_raw :: proc(pub_key: ^Public_Key, hash_algo: hash.Algorithm, msg, sig: [] panic("crypto/ecdsa: invalid curve") } -// verify_asn1 returns true iff sig is a valid signature by pub_key over +// verify_asn1 returns true if and only if (⟺) sig is a valid signature by pub_key over // msg, hased using hash_algo, per the verification procedure specifed // in SEC 1, Version 2.0, Section 4.1.4. // diff --git a/core/crypto/ed25519/ed25519.odin b/core/crypto/ed25519/ed25519.odin index 817c8d34b..2020c0633 100644 --- a/core/crypto/ed25519/ed25519.odin +++ b/core/crypto/ed25519/ed25519.odin @@ -48,7 +48,7 @@ Public_Key :: struct { } // private_key_generate uses the system entropy source to generate a new -// Private_Key. This will only fail iff the system entropy source is +// Private_Key. This will only fail if and only if (⟺) the system entropy source is // missing or broken. private_key_generate :: proc(priv_key: ^Private_Key) -> bool { private_key_clear(priv_key) @@ -67,7 +67,7 @@ private_key_generate :: proc(priv_key: ^Private_Key) -> bool { } // private_key_set_bytes decodes a byte-encoded private key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. 
private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool { if len(b) != PRIVATE_KEY_SIZE { return false @@ -167,7 +167,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) { } // public_key_set_bytes decodes a byte-encoded public key, and returns -// true iff the operation was successful. +// true if and only if (⟺) the operation was successful. public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> bool { if len(b) != PUBLIC_KEY_SIZE { return false @@ -205,14 +205,14 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) { copy(dst, pub_key._b[:]) } -// public_key_equal returns true iff pub_key is equal to other. +// public_key_equal returns true if and only if (⟺) pub_key is equal to other. public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool { ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key") return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1 } -// verify returns true iff sig is a valid signature by pub_key over msg. +// verify returns true if and only if (⟺) sig is a valid signature by pub_key over msg. // // The optional `allow_small_order_A` parameter will make this // implementation strictly compatible with FIPS 186-5, at the expense of diff --git a/core/crypto/hash/low_level.odin b/core/crypto/hash/low_level.odin index 242eadd5f..44b2a8100 100644 --- a/core/crypto/hash/low_level.odin +++ b/core/crypto/hash/low_level.odin @@ -235,7 +235,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { switch &impl in ctx._impl { diff --git a/core/crypto/hmac/hmac.odin b/core/crypto/hmac/hmac.odin index d28c03b5b..56decc0ef 100644 --- a/core/crypto/hmac/hmac.odin +++ b/core/crypto/hmac/hmac.odin @@ -21,7 +21,7 @@ sum :: proc(algorithm: hash.Algorithm, dst, msg, key: []byte) { } // verify will verify the HMAC tag computed with the specified algorithm -// and key over msg and return true iff the tag is valid. It requires +// and key over msg and return true if and only if (⟺) the tag is valid. It requires // that the tag is correctly sized. verify :: proc(algorithm: hash.Algorithm, tag, msg, key: []byte) -> bool { tag_buf: [hash.MAX_DIGEST_SIZE]byte diff --git a/core/crypto/kmac/kmac.odin b/core/crypto/kmac/kmac.odin index 4ecff4f12..f0c27739a 100644 --- a/core/crypto/kmac/kmac.odin +++ b/core/crypto/kmac/kmac.odin @@ -32,7 +32,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) { } // verify will verify the KMAC tag computed with the specified security -// strength, key and domain separator over msg and return true iff the +// strength, key and domain separator over msg and return true if and only if (⟺) the // tag is valid. verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool { derived_tag := make([]byte, len(tag), allocator) diff --git a/core/crypto/legacy/keccak/keccak.odin b/core/crypto/legacy/keccak/keccak.odin index ec6af2565..ffca1c95c 100644 --- a/core/crypto/legacy/keccak/keccak.odin +++ b/core/crypto/legacy/keccak/keccak.odin @@ -77,7 +77,7 @@ update :: proc "contextless" (ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. 
// -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _sha3.final((^_sha3.Context)(ctx), hash, finalize_clone) diff --git a/core/crypto/legacy/md5/md5.odin b/core/crypto/legacy/md5/md5.odin index 399a789ed..4bbc5d32a 100644 --- a/core/crypto/legacy/md5/md5.odin +++ b/core/crypto/legacy/md5/md5.odin @@ -69,7 +69,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { ensure(ctx.is_initialized) diff --git a/core/crypto/legacy/sha1/sha1.odin b/core/crypto/legacy/sha1/sha1.odin index f9adcc3d1..892f893a6 100644 --- a/core/crypto/legacy/sha1/sha1.odin +++ b/core/crypto/legacy/sha1/sha1.odin @@ -76,7 +76,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { ensure(ctx.is_initialized) diff --git a/core/crypto/pbkdf2/pbkdf2.odin b/core/crypto/pbkdf2/pbkdf2.odin index 9d8394031..c27ec4aa2 100644 --- a/core/crypto/pbkdf2/pbkdf2.odin +++ b/core/crypto/pbkdf2/pbkdf2.odin @@ -66,7 +66,7 @@ derive :: proc( dst_blk = dst_blk[h_len:] } - // Instead of rounding l up, just proceass the one extra block iff + // Instead of rounding l up, just process the one extra block if and only if (⟺) // r != 0. if r > 0 { tmp: [hash.MAX_DIGEST_SIZE]byte diff --git a/core/crypto/poly1305/poly1305.odin b/core/crypto/poly1305/poly1305.odin index 69e2e3ad3..ed2bec82d 100644 --- a/core/crypto/poly1305/poly1305.odin +++ b/core/crypto/poly1305/poly1305.odin @@ -33,7 +33,7 @@ sum :: proc(dst, msg, key: []byte) { } // verify will verify the Poly1305 tag computed with the key over msg and -// return true iff the tag is valid. It requires that the tag is correctly +// return true if and only if (⟺) the tag is valid. It requires that the tag is correctly // sized. verify :: proc(tag, msg, key: []byte) -> bool { ctx: Context = --- diff --git a/core/crypto/ristretto255/ristretto255.odin b/core/crypto/ristretto255/ristretto255.odin index 3724aee7a..ef899d1da 100644 --- a/core/crypto/ristretto255/ristretto255.odin +++ b/core/crypto/ristretto255/ristretto255.odin @@ -360,7 +360,7 @@ ge_double_scalarmult_generator_vartime :: proc( ge._is_initialized = true } -// ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`. +// ge_cond_negate sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = -a` if and only if (⟺) `ctrl == 1`. // Behavior for all other values of ctrl are undefined, ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) { _ge_ensure_initialized([]^Group_Element{a}) @@ -369,7 +369,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) { ge._is_initialized = true } -// ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`. 
+// ge_cond_assign sets `ge = ge` if and only if (⟺) `ctrl == 0` and `ge = a` if and only if (⟺) `ctrl == 1`. // Behavior for all other values of ctrl are undefined, ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) { _ge_ensure_initialized([]^Group_Element{ge, a}) @@ -377,7 +377,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) { grp.ge_cond_assign(&ge._p, &a._p, ctrl) } -// ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`. +// ge_cond_select sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = b` if and only if (⟺) `ctrl == 1`. // Behavior for all other values of ctrl are undefined, ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) { _ge_ensure_initialized([]^Group_Element{a, b}) @@ -386,7 +386,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) { ge._is_initialized = true } -// ge_equal returns 1 iff `a == b`, and 0 otherwise. +// ge_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise. @(require_results) ge_equal :: proc(a, b: ^Group_Element) -> int { _ge_ensure_initialized([]^Group_Element{a, b}) @@ -405,7 +405,7 @@ ge_equal :: proc(a, b: ^Group_Element) -> int { return ret } -// ge_is_identity returns 1 iff `ge` is the identity element, and 0 otherwise. +// ge_is_identity returns 1 if and only if (⟺) `ge` is the identity element, and 0 otherwise. @(require_results) ge_is_identity :: proc(ge: ^Group_Element) -> int { return ge_equal(ge, &GE_IDENTITY) diff --git a/core/crypto/ristretto255/ristretto255_scalar.odin b/core/crypto/ristretto255/ristretto255_scalar.odin index 75844b3f4..743e02ef3 100644 --- a/core/crypto/ristretto255/ristretto255_scalar.odin +++ b/core/crypto/ristretto255/ristretto255_scalar.odin @@ -80,13 +80,13 @@ sc_square :: proc "contextless" (sc, a: ^Scalar) { grp.sc_square(sc, a) } -// sc_cond_assign sets `sc = sc` iff `ctrl == 0` and `sc = a` iff `ctrl == 1`. +// sc_cond_assign sets `sc = sc` if and only if (⟺) `ctrl == 0` and `sc = a` if and only if (⟺) `ctrl == 1`. 
// Behavior for all other values of ctrl are undefined, sc_cond_assign :: proc(sc, a: ^Scalar, ctrl: int) { grp.sc_cond_assign(sc, a, ctrl) } -// sc_equal returns 1 iff `a == b`, and 0 otherwise. +// sc_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise. @(require_results) sc_equal :: proc(a, b: ^Scalar) -> int { return grp.sc_equal(a, b) diff --git a/core/crypto/sha2/sha2.odin b/core/crypto/sha2/sha2.odin index 36fa4aa02..dc41462e4 100644 --- a/core/crypto/sha2/sha2.odin +++ b/core/crypto/sha2/sha2.odin @@ -191,7 +191,7 @@ update :: proc(ctx: ^$T, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) { ensure(ctx.is_initialized) diff --git a/core/crypto/sha2/sha256_impl_hw_arm.odin b/core/crypto/sha2/sha256_impl_hw_arm.odin new file mode 100644 index 000000000..618cc6fff --- /dev/null +++ b/core/crypto/sha2/sha256_impl_hw_arm.odin @@ -0,0 +1,224 @@ +#+build arm64,arm32 +package sha2 + +// Based on the public domain code by Jeffrey Walton, though +// realistically, there only is one sensible way to write this. +// +// See: https://github.com/noloader/SHA-Intrinsics + +import "base:intrinsics" +import "core:simd" +import "core:simd/arm" +import "core:sys/info" + +// is_hardware_accelerated_256 returns true if and only if (⟺) hardware +// accelerated SHA-224/SHA-256 is supported. 
+is_hardware_accelerated_256 :: proc "contextless" () -> bool { + req_features :: info.CPU_Features{ + .asimd, + .sha256, + } + return info.cpu_features() >= req_features +} + +@(private = "file") +K_0 :: simd.u32x4{0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5} +@(private = "file") +K_1 :: simd.u32x4{0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5} +@(private = "file") +K_2 :: simd.u32x4{0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3} +@(private = "file") +K_3 :: simd.u32x4{0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174} +@(private = "file") +K_4 :: simd.u32x4{0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC} +@(private = "file") +K_5 :: simd.u32x4{0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA} +@(private = "file") +K_6 :: simd.u32x4{0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7} +@(private = "file") +K_7 :: simd.u32x4{0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967} +@(private = "file") +K_8 :: simd.u32x4{0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13} +@(private = "file") +K_9 :: simd.u32x4{0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85} +@(private = "file") +K_10 :: simd.u32x4{0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3} +@(private = "file") +K_11 :: simd.u32x4{0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070} +@(private = "file") +K_12 :: simd.u32x4{0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5} +@(private = "file") +K_13 :: simd.u32x4{0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3} +@(private = "file") +K_14 :: simd.u32x4{0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208} +@(private = "file") +K_15 :: simd.u32x4{0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2} + +@(private, enable_target_feature = "neon,sha2") +sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check { + state_0 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[0])) + state_1 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[4])) + + data := data + for len(data) >= BLOCK_SIZE_256 { + // Save state + abef_save, cdgh_save := state_0, state_1 + + // Load message + 
msg_0 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data))) + msg_1 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[16:]))) + msg_2 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[32:]))) + msg_3 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[48:]))) + + // Reverse for little endian + when ODIN_ENDIAN == .Little { + msg_0 = byteswap_u32x4(msg_0) + msg_1 = byteswap_u32x4(msg_1) + msg_2 = byteswap_u32x4(msg_2) + msg_3 = byteswap_u32x4(msg_3) + } + + tmp_0 := simd.add(msg_0, K_0) + + // Rounds 0-3 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 := state_0 + tmp_1 := simd.add(msg_1, K_1) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 4-7 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_2) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 8-11 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_3) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 12-15 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_4) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 16-19 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_5) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 20-23 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_6) + 
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 24-27 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_7) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 28-31 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_8) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 32-35 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_9) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 36-39 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_10) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 40-43 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_11) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 44-47 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_12) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 48-51 + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_13) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + + // Rounds 52-55 + 
tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_14) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + + // Rounds 56-59 + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_15) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + + // Rounds 60-63 + tmp_2 = state_0 + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + + // Combine state + state_0 = simd.add(state_0, abef_save) + state_1 = simd.add(state_1, cdgh_save) + + data = data[BLOCK_SIZE_256:] + } + + intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[0]), state_0) + intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[4]), state_1) +} + +when ODIN_ENDIAN == .Little { + @(private = "file", enable_target_feature = "neon") + byteswap_u32x4 :: #force_inline proc "contextless" (a: simd.u32x4) -> simd.u32x4 { + return transmute(simd.u32x4)( + simd.shuffle( + transmute(simd.u8x16)(a), + transmute(simd.u8x16)(a), + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + ) + ) + } +} \ No newline at end of file diff --git a/core/crypto/sha2/sha2_impl_hw_intel.odin b/core/crypto/sha2/sha256_impl_hw_intel.odin similarity index 99% rename from core/crypto/sha2/sha2_impl_hw_intel.odin rename to core/crypto/sha2/sha256_impl_hw_intel.odin index 83ef58a12..3f6ebb746 100644 --- a/core/crypto/sha2/sha2_impl_hw_intel.odin +++ b/core/crypto/sha2/sha256_impl_hw_intel.odin @@ -49,7 +49,7 @@ K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814} K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7} -// is_hardware_accelerated_256 returns true iff hardware accelerated +// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated // SHA-224/SHA-256 is supported. 
is_hardware_accelerated_256 :: proc "contextless" () -> bool { req_features :: info.CPU_Features{ diff --git a/core/crypto/sha2/sha2_impl_hw_gen.odin b/core/crypto/sha2/sha2_impl_hw_gen.odin index 85c7f8b28..d735e3c61 100644 --- a/core/crypto/sha2/sha2_impl_hw_gen.odin +++ b/core/crypto/sha2/sha2_impl_hw_gen.odin @@ -1,10 +1,12 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package sha2 @(private = "file") ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported" -// is_hardware_accelerated_256 returns true iff hardware accelerated +// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated // SHA-224/SHA-256 is supported. is_hardware_accelerated_256 :: proc "contextless" () -> bool { return false diff --git a/core/crypto/sha3/sha3.odin b/core/crypto/sha3/sha3.odin index 2ca70963a..2f8d95092 100644 --- a/core/crypto/sha3/sha3.odin +++ b/core/crypto/sha3/sha3.odin @@ -79,7 +79,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _sha3.final((^_sha3.Context)(ctx), hash, finalize_clone) diff --git a/core/crypto/sm3/sm3.odin b/core/crypto/sm3/sm3.odin index ac38ca417..6f1d788e0 100644 --- a/core/crypto/sm3/sm3.odin +++ b/core/crypto/sm3/sm3.odin @@ -80,7 +80,7 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { ensure(ctx.is_initialized) diff --git a/core/crypto/tuplehash/tuplehash.odin b/core/crypto/tuplehash/tuplehash.odin index 5c8d8e39b..a45c1b120 100644 --- a/core/crypto/tuplehash/tuplehash.odin +++ b/core/crypto/tuplehash/tuplehash.odin @@ -31,7 +31,7 @@ write_element :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the digest to hash, and calls // reset on the Context. // -// Iff finalize_clone is set, final will work on a copy of the Context, +// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _sha3.final_cshake((^_sha3.Context)(ctx), hash, finalize_clone) diff --git a/core/io/io.odin b/core/io/io.odin index 9fae30946..631a232aa 100644 --- a/core/io/io.odin +++ b/core/io/io.odin @@ -436,7 +436,7 @@ copy_buffer :: proc(dst: Writer, src: Reader, buf: []byte) -> (written: i64, err // copy_n copies n bytes (or till an error) from src to dst. // It returns the number of bytes copied and the first error that occurred whilst copying, if any. -// On return, written == n IFF err == nil +// On return, written == n if and only if (⟺) err == nil copy_n :: proc(dst: Writer, src: Reader, n: i64) -> (written: i64, err: Error) { nsrc := limited_reader_init(&Limited_Reader{}, src, n) written, err = copy(dst, nsrc) diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin index 1c772143b..7fa6d8e4a 100644 --- a/core/math/big/prime.odin +++ b/core/math/big/prime.odin @@ -101,7 +101,7 @@ internal_int_power_modulo :: proc(res, G, X, P: ^Int, allocator := context.alloc If the modulus is odd or dr != 0 use the montgomery method. 
*/ if internal_int_is_odd(P) || dr != 0 { - return _private_int_exponent_mod(res, G, X, P, dr) + return _private_int_exponent_mod_fast(res, G, X, P, dr) } /* diff --git a/core/math/big/private.odin b/core/math/big/private.odin index 506f68165..1ca706c6d 100644 --- a/core/math/big/private.odin +++ b/core/math/big/private.odin @@ -439,8 +439,14 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex return _private_int_mul_high_comba(dest, a, b, digits) } - internal_grow(dest, a.used + b.used + 1) or_return - dest.used = a.used + b.used + 1 + /* + Set up temporary output `Int`, which we'll swap for `dest` when done. + */ + + t := &Int{} + + internal_grow(t, a.used + b.used + 1) or_return + t.used = a.used + b.used + 1 pa := a.used pb := b.used @@ -451,20 +457,23 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex /* Calculate the double precision result. */ - r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry) + r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry) /* Get the lower part. */ - dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK)) + t.digit[ix + iy] = DIGIT(r & _WORD(_MASK)) /* Carry the carry. 
*/ carry = DIGIT(r >> _WORD(_DIGIT_BITS)) } - dest.digit[ix + pb] = carry + t.digit[ix + pb] = carry } + + internal_swap(dest, t) + internal_destroy(t) return internal_clamp(dest) } diff --git a/core/mem/virtual/arena.odin b/core/mem/virtual/arena.odin index b515aa3cf..bcf3ee702 100644 --- a/core/mem/virtual/arena.odin +++ b/core/mem/virtual/arena.odin @@ -141,9 +141,9 @@ arena_alloc_unguarded :: proc(arena: ^Arena, size: uint, alignment: uint, loc := needed := mem.align_forward_uint(size, alignment) needed = max(needed, arena.default_commit_size) - block_size := max(needed, arena.minimum_block_size) + block_size := max(needed, arena.minimum_block_size) + alignment - new_block := memory_block_alloc(needed, block_size, alignment, {}) or_return + new_block := memory_block_alloc(needed, block_size) or_return new_block.prev = arena.curr_block arena.curr_block = new_block arena.total_reserved += new_block.reserved diff --git a/core/mem/virtual/virtual.odin b/core/mem/virtual/virtual.odin index 3f388acf3..d37c61267 100644 --- a/core/mem/virtual/virtual.odin +++ b/core/mem/virtual/virtual.odin @@ -154,7 +154,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint, pmblock.committed = platform_total_commit block.committed = pmblock.committed - base_offset - + assert(block.committed <= block.reserved) } return } @@ -174,7 +174,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint, err = .Out_Of_Memory return } - assert(block.committed <= block.reserved) + do_commit_if_necessary(block, size, default_commit_size) or_return data = block.base[block.used+alignment_offset:][:min_size] diff --git a/core/nbio/impl_posix.odin b/core/nbio/impl_posix.odin index 3845882da..0d3f57e9c 100644 --- a/core/nbio/impl_posix.odin +++ b/core/nbio/impl_posix.odin @@ -804,7 +804,7 @@ send_exec :: proc(op: ^Operation) -> Op_Result { op.send.sent += n - if op.send.sent < total { + if n < total { return send_exec(op) } @@ -868,7 +868,7 @@ 
recv_exec :: proc(op: ^Operation) -> Op_Result { assert(is_tcp || op.recv.received == 0) op.recv.received += n - if is_tcp && n != 0 && op.recv.received < total { + if is_tcp && n != 0 && n < total { return recv_exec(op) } diff --git a/core/simd/arm/aes.odin b/core/simd/arm/aes.odin new file mode 100644 index 000000000..b1f44e52c --- /dev/null +++ b/core/simd/arm/aes.odin @@ -0,0 +1,34 @@ +#+build arm64,arm32 +package simd_arm + +@(require_results, enable_target_feature = "aes") +vaeseq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t { + return _vaeseq_u8(data, key) +} + +@(require_results, enable_target_feature = "aes") +vaesdq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t { + return _vaesdq_u8(data, key) +} + +@(require_results, enable_target_feature = "aes") +vaesmcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t { + return _vaesmcq_u8(data) +} + +@(require_results,enable_target_feature = "aes") +vaesimcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t { + return _vaesimcq_u8(data) +} + +@(private, default_calling_convention = "none") +foreign _ { + @(link_name = "llvm.aarch64.crypto.aese" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aese") + _vaeseq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t --- + @(link_name = "llvm.aarch64.crypto.aesd" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesd") + _vaesdq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t --- + @(link_name = "llvm.aarch64.crypto.aesmc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesmc") + _vaesmcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t --- + @(link_name = "llvm.aarch64.crypto.aesimc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesimc") + _vaesimcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t --- +} diff --git a/core/simd/arm/doc.odin b/core/simd/arm/doc.odin new file mode 100644 index 000000000..ecedc7bac --- /dev/null +++ b/core/simd/arm/doc.odin @@ -0,0 +1,2 @@ +// `SIMD` intrinsics specific to ARMv8 `arm32` and `arm64` 
architectures. +package simd_arm \ No newline at end of file diff --git a/core/simd/arm/sha.odin b/core/simd/arm/sha.odin new file mode 100644 index 000000000..ca87c9795 --- /dev/null +++ b/core/simd/arm/sha.odin @@ -0,0 +1,108 @@ +#+build arm64,arm32 +package simd_arm + +@(require_results, enable_target_feature = "sha2") +vsha1cq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1cq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1pq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1pq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1mq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1mq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1h_u32 :: #force_inline proc "c" (e: uint32_t) -> uint32_t { + return _vsha1h_u32(e) +} + +@(require_results, enable_target_feature = "sha2") +vsha1su0q_u32 :: #force_inline proc "c" (w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t { + return _vsha1su0q_u32(w0_3, w4_7, w8_11) +} + +@(require_results, enable_target_feature = "sha2") +vsha1su1q_u32 :: #force_inline proc "c" (tw0_3, w12_15: uint32x4_t) -> uint32x4_t { + return _vsha1su1q_u32(tw0_3, w12_15) +} + +@(require_results, enable_target_feature = "sha2") +vsha256hq_u32 :: #force_inline proc "c" (hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t { + return _vsha256hq_u32(hash_abcd, hash_efgh, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha256h2q_u32 :: #force_inline proc "c" (hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t { + return _vsha256h2q_u32(hash_efgh, hash_abcd, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha256su0q_u32 :: #force_inline proc "c" (w0_3, w4_7: uint32x4_t) -> uint32x4_t { + return _vsha256su0q_u32(w0_3, w4_7) +} + 
+@(require_results, enable_target_feature = "sha2") +vsha256su1q_u32 :: #force_inline proc "c" (tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t { + return _vsha256su1q_u32(tw0_3, w8_11, w12_15) +} + +// Note: The SHA512 instructions are part of the `sha3` feature set. + +@(require_results, enable_target_feature = "sha3") +vsha512hq_u64 :: #force_inline proc "c" (hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t { + return _vsha512hq_u64(hash_ed, hash_gf, kwh_kwh2) +} + +@(require_results, enable_target_feature = "sha3") +vsha512h2q_u64 :: #force_inline proc "c" (sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t { + return _vsha512h2q_u64(sum_ab, hash_c_, hash_ab) +} + +@(require_results, enable_target_feature = "sha3") +vsha512su0q_u64 :: #force_inline proc "c" (w0_1, w2_: uint64x2_t) -> uint64x2_t { + return _vsha512su0q_u64(w0_1, w2_) +} + +@(require_results, enable_target_feature = "sha3") +vsha512su1q_u64 :: #force_inline proc "c" (s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t { + return _vsha512su1q_u64(s01_s02, w14_15, w9_10) +} + +@(private, default_calling_convention = "none") +foreign _ { + @(link_name = "llvm.aarch64.crypto.sha1c" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1c") + _vsha1cq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1p" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1p") + _vsha1pq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1m" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1m") + _vsha1mq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1h") + _vsha1h_u32 :: proc(e: uint32_t) -> uint32_t --- + @(link_name = "llvm.aarch64.crypto.sha1su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su0") + _vsha1su0q_u32 :: proc(w0_3, w4_7, w8_11: uint32x4_t) 
-> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su1") + _vsha1su1q_u32 :: proc(tw0_3, w12_15: uint32x4_t) -> uint32x4_t --- + + @(link_name = "llvm.aarch64.crypto.sha256h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h") + _vsha256hq_u32 :: proc(hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h2") + _vsha256h2q_u32 :: proc(hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su0") + _vsha256su0q_u32 :: proc(w0_3, w4_7: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su1") + _vsha256su1q_u32 :: proc(tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t --- + + @(link_name = "llvm.aarch64.crypto.sha512h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h") + _vsha512hq_u64 :: proc(hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h2") + _vsha512h2q_u64 :: proc(sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su0") + _vsha512su0q_u64 :: proc(w0_1, w2_: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su1") + _vsha512su1q_u64 :: proc(s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t --- +} diff --git a/core/simd/arm/types.odin b/core/simd/arm/types.odin new file mode 100644 index 000000000..05e3540b6 --- /dev/null +++ b/core/simd/arm/types.odin @@ -0,0 +1,9 @@ +#+build arm64,arm32 +package simd_arm + +// Type aliases to match `arm_neon.h`. 
+uint32_t :: u32 + +uint8x16_t :: #simd[16]u8 +uint32x4_t :: #simd[4]u32 +uint64x2_t :: #simd[2]u64 diff --git a/core/sys/darwin/Foundation/NSBlock.odin b/core/sys/darwin/Foundation/NSBlock.odin index 34e562d75..a5b99abc5 100644 --- a/core/sys/darwin/Foundation/NSBlock.odin +++ b/core/sys/darwin/Foundation/NSBlock.odin @@ -82,7 +82,7 @@ internal_block_literal_make :: proc (is_global: bool, user_data: rawptr, user_pr BLOCK_HAS_COPY_DISPOSE :: 1 << 25 BLOCK_HAS_CTOR :: 1 << 26 // helpers have C++ code BLOCK_IS_GLOBAL :: 1 << 28 - BLOCK_HAS_STRET :: 1 << 29 // IFF BLOCK_HAS_SIGNATURE + BLOCK_HAS_STRET :: 1 << 29 // if and only if (⟺) BLOCK_HAS_SIGNATURE BLOCK_HAS_SIGNATURE :: 1 << 30 bl.isa = is_global ? &_NSConcreteGlobalBlock : &_NSConcreteStackBlock diff --git a/src/build_settings.cpp b/src/build_settings.cpp index c2ef7b259..1d836b1ec 100644 --- a/src/build_settings.cpp +++ b/src/build_settings.cpp @@ -1683,7 +1683,20 @@ gb_internal void init_android_values(bool with_sdk) { gb_exit(1); } - bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = concatenate_strings(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN, str_lit("sysroot/usr/lib/aarch64-linux-android/")); + switch (bc->metrics.arch) { + case TargetArch_arm64: + bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("aarch64-linux-android"); + break; + case TargetArch_arm32: + bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("arm-linux-androideabi"); + break; + case TargetArch_amd64: + bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("x86_64-linux-android"); + break; + case TargetArch_i386: + bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("i686-linux-android"); + break; + } char buf[32] = {}; gb_snprintf(buf, gb_size_of(buf), "%d/", bc->ODIN_ANDROID_API_LEVEL); @@ -1958,9 +1971,22 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta } else if (metrics->os == TargetOs_linux && subtarget == Subtarget_Android) { switch (metrics->arch) { case TargetArch_arm64: - bc->metrics.target_triplet = 
str_lit("aarch64-none-linux-android"); + bc->metrics.target_triplet = str_lit("aarch64-linux-android"); bc->reloc_mode = RelocMode_PIC; break; + case TargetArch_arm32: + bc->metrics.target_triplet = str_lit("armv7a-linux-androideabi"); + bc->reloc_mode = RelocMode_PIC; + break; + case TargetArch_amd64: + bc->metrics.target_triplet = str_lit("x86_64-linux-android"); + bc->reloc_mode = RelocMode_PIC; + break; + case TargetArch_i386: + bc->metrics.target_triplet = str_lit("i686-linux-android"); + bc->reloc_mode = RelocMode_PIC; + break; + default: GB_PANIC("Unknown architecture for -subtarget:android"); } diff --git a/src/check_expr.cpp b/src/check_expr.cpp index 39af640d8..492b38ffa 100644 --- a/src/check_expr.cpp +++ b/src/check_expr.cpp @@ -2971,14 +2971,21 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper if (check_is_assignable_to(c, x, y->type) || check_is_assignable_to(c, y, x->type)) { + if (x->type->failure || y->type->failure) { + // // skip any failures + x->mode = Addressing_Value; + x->type = t_untyped_bool; + return; + } + Type *err_type = x->type; bool defined = false; switch (op) { case Token_CmpEq: case Token_NotEq: - defined = (is_type_comparable(x->type) && is_type_comparable(y->type)) || - (is_operand_nil(*x) && type_has_nil(y->type)) || - (is_operand_nil(*y) && type_has_nil(x->type)); + defined = ((is_operand_nil(*x) && type_has_nil(y->type)) || + (is_operand_nil(*y) && type_has_nil(x->type)) || + is_type_comparable(x->type) && is_type_comparable(y->type)); break; case Token_Lt: case Token_Gt: @@ -4476,9 +4483,9 @@ gb_internal void check_binary_expr(CheckerContext *c, Operand *x, Ast *node, Typ truncated: r = a - b*trunc(a/b) floored: r = a - b*floor(a/b) - IFF a/0 == 0, then (a%0 == a) or (a%%0 == a) - IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0) - IFF a/0 == 0b111..., then (a%0 == a) or (a%%0 == a) + If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a) + If and only if (⟺) a/0 == a, then (a%0 == 0) or 
(a%%0 == 0) + If and only if (⟺) a/0 == 0b111..., then (a%0 == a) or (a%%0 == a) */ switch (zero_behaviour) { diff --git a/src/check_type.cpp b/src/check_type.cpp index 5cfc24981..071e30c18 100644 --- a/src/check_type.cpp +++ b/src/check_type.cpp @@ -1853,7 +1853,7 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para if (is_using && (feature_flags & OptInFeatureFlag_UsingStmt) == 0) { ERROR_BLOCK(); error(param, "'using' has been disallowed as it is considered bad practice to use as a statement/procedure parameter outside of immediate refactoring"); - error_line("\tIt you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n"); + error_line("\tIf you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n"); } if (type_expr == nullptr) { diff --git a/src/checker.cpp b/src/checker.cpp index cd5842b10..ec5abf067 100644 --- a/src/checker.cpp +++ b/src/checker.cpp @@ -2240,7 +2240,7 @@ gb_internal void add_type_info_type_internal(CheckerContext *c, Type *t) { case Type_BitSet: add_type_info_type_internal(c, bt->BitSet.elem); - add_type_info_type_internal(c, bt->BitSet.underlying); + add_type_info_type_internal(c, bit_set_to_int(bt)); break; case Type_Pointer: @@ -2484,7 +2484,7 @@ gb_internal void add_min_dep_type_info(Checker *c, Type *t) { case Type_BitSet: add_min_dep_type_info(c, bt->BitSet.elem); - add_min_dep_type_info(c, bt->BitSet.underlying); + add_min_dep_type_info(c, bit_set_to_int(bt)); break; case Type_Pointer: diff --git a/src/linker.cpp b/src/linker.cpp index e48486d9a..12f016cea 100644 --- a/src/linker.cpp +++ b/src/linker.cpp @@ -676,7 +676,7 @@ try_cross_linking:; defer (gb_string_free(glue)); glue = gb_string_append_fmt(glue, "bin/clang"); - glue = gb_string_append_fmt(glue, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL); + glue = gb_string_append_fmt(glue, " 
--target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL); glue = gb_string_appendc(glue, "-c \""); glue = gb_string_append_length(glue, ODIN_ANDROID_NDK.text, ODIN_ANDROID_NDK.len); glue = gb_string_appendc(glue, "sources/android/native_app_glue/android_native_app_glue.c"); @@ -697,8 +697,9 @@ try_cross_linking:; glue = gb_string_appendc(glue, "\"-I"); glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len); - glue = gb_string_appendc(glue, "sysroot/usr/include/aarch64-linux-android/"); - glue = gb_string_appendc(glue, "\" "); + glue = gb_string_appendc(glue, "sysroot/usr/include/"); + glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.text, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.len); + glue = gb_string_appendc(glue, "/\" "); glue = gb_string_appendc(glue, "-Wno-macro-redefined "); @@ -969,7 +970,7 @@ try_cross_linking:; gbString ndk_bin_directory = gb_string_make_length(temporary_allocator(), ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len); link_command_line = gb_string_appendc(link_command_line, ndk_bin_directory); link_command_line = gb_string_appendc(link_command_line, "bin/clang"); - link_command_line = gb_string_append_fmt(link_command_line, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL); + link_command_line = gb_string_append_fmt(link_command_line, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL); } else { link_command_line = gb_string_appendc(link_command_line, clang_path); } diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp index a4259b01e..81596c9ec 100644 --- a/src/llvm_backend.cpp +++ b/src/llvm_backend.cpp @@ -66,6 +66,19 @@ gb_internal String get_final_microarchitecture() { gb_internal String get_default_features() { BuildContext *bc = &build_context; + if (bc->microarch == str_lit("native")) { + String features = make_string_c(LLVMGetHostCPUFeatures()); + + // Update the features string so LLVM uses it 
later. + if (bc->target_features_string.len > 0) { + bc->target_features_string = concatenate3_strings(permanent_allocator(), features, str_lit(","), bc->target_features_string); + } else { + bc->target_features_string = features; + } + + return features; + } + int off = 0; for (int i = 0; i < bc->metrics.arch; i += 1) { off += target_microarch_counts[i]; diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index af498214e..b3aa57f16 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -1424,8 +1424,8 @@ gb_internal LLVMValueRef lb_integer_modulo(lbProcedure *p, LLVMValueRef lhs, LLV truncated: r = a - b*trunc(a/b) floored: r = a - b*floor(a/b) - IFF a/0 == 0, then (a%0 == a) or (a%%0 == a) - IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0) + If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a) + If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0) */ switch (behaviour) { diff --git a/tests/core/mem/test_core_mem.odin b/tests/core/mem/test_core_mem.odin index 9d64e50a3..3afd49a06 100644 --- a/tests/core/mem/test_core_mem.odin +++ b/tests/core/mem/test_core_mem.odin @@ -57,6 +57,33 @@ test_align_bumping_block_limit :: proc(t: ^testing.T) { testing.expect(t, len(data) == 896) } +@(test) +test_large_minimum_block_size :: proc(t: ^testing.T) { + a: virtual.Arena + defer virtual.arena_destroy(&a) + + init_err := virtual.arena_init_growing(&a, 16*mem.Megabyte) + testing.expect_value(t, init_err, nil) + + align : uint = 4 + for _ in 0..<6 { + data, err := virtual.arena_alloc(&a, 18874368, align) + testing.expect_value(t, err, nil) + testing.expect(t, len(data) == 18874368) + testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0) + align *= 2 + virtual.arena_free_all(&a) + } + + align = 4 + for _ in 0..<32 { + data, err := virtual.arena_alloc(&a, 1048576, align) + testing.expect_value(t, err, nil) + testing.expect(t, len(data) == 1048576) + testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0) + } +} + 
@(test) tlsf_test_overlap_and_zero :: proc(t: ^testing.T) { default_allocator := context.allocator diff --git a/vendor/lua/5.4/include/luaconf.h b/vendor/lua/5.4/include/luaconf.h index 3ad294e4f..fbfa5781b 100644 --- a/vendor/lua/5.4/include/luaconf.h +++ b/vendor/lua/5.4/include/luaconf.h @@ -71,7 +71,7 @@ /* -@@ LUAI_IS32INT is true iff 'int' has (at least) 32 bits. +@@ LUAI_IS32INT is true if and only if (⟺) 'int' has (at least) 32 bits. */ #define LUAI_IS32INT ((UINT_MAX >> 30) >= 3) diff --git a/vendor/portmidi/portmidi.odin b/vendor/portmidi/portmidi.odin index 58d7c2ec2..1f2aca286 100644 --- a/vendor/portmidi/portmidi.odin +++ b/vendor/portmidi/portmidi.odin @@ -119,8 +119,8 @@ DeviceInfo :: struct { structVersion: c.int, /**< this internal structure version */ interf: cstring, /**< underlying MIDI API, e.g. MMSystem or DirectX */ name: cstring, /**< device name, e.g. USB MidiSport 1x1 */ - input: b32, /**< true iff input is available */ - output: b32, /**< true iff output is available */ + input: b32, /**< true if and only if (⟺) input is available */ + output: b32, /**< true if and only if (⟺) output is available */ opened: b32, /**< used by generic PortMidi code to do error checking on arguments */ }