From adb2890d2bf05b4f3764418cc7bd6a5c16f9e67e Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Sat, 14 Mar 2026 04:12:17 +0900 Subject: [PATCH 1/3] core/simd/arm: Formatting fixes (NFC) --- core/simd/arm/aes.odin | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/simd/arm/aes.odin b/core/simd/arm/aes.odin index acafb9f1e..b1f44e52c 100644 --- a/core/simd/arm/aes.odin +++ b/core/simd/arm/aes.odin @@ -1,27 +1,27 @@ #+build arm64,arm32 package simd_arm -@(require_results,enable_target_feature="aes") +@(require_results, enable_target_feature = "aes") vaeseq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t { return _vaeseq_u8(data, key) } -@(require_results,enable_target_feature="aes") +@(require_results, enable_target_feature = "aes") vaesdq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t { return _vaesdq_u8(data, key) } -@(require_results,enable_target_feature="aes") +@(require_results, enable_target_feature = "aes") vaesmcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t { return _vaesmcq_u8(data) } -@(require_results,enable_target_feature="aes") +@(require_results, enable_target_feature = "aes") vaesimcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t { return _vaesimcq_u8(data) } -@(private,default_calling_convention="none") +@(private, default_calling_convention = "none") foreign _ { @(link_name = "llvm.aarch64.crypto.aese" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aese") _vaeseq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t --- From 3a59e8c84950eabf6c50a684a0e02dbde250ec22 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Sat, 14 Mar 2026 05:09:35 +0900 Subject: [PATCH 2/3] core/simd/arm: Add the SHA intrinsics The SHA3 ones aren't in the developer.arm.com documentation. 
--- core/simd/arm/sha.odin | 108 +++++++++++++++++++++++++++++++++++++++ core/simd/arm/types.odin | 4 ++ 2 files changed, 112 insertions(+) create mode 100644 core/simd/arm/sha.odin diff --git a/core/simd/arm/sha.odin b/core/simd/arm/sha.odin new file mode 100644 index 000000000..ca87c9795 --- /dev/null +++ b/core/simd/arm/sha.odin @@ -0,0 +1,108 @@ +#+build arm64,arm32 +package simd_arm + +@(require_results, enable_target_feature = "sha2") +vsha1cq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1cq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1pq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1pq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1mq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t { + return _vsha1mq_u32(hash_abcd, e, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha1h_u32 :: #force_inline proc "c" (e: uint32_t) -> uint32_t { + return _vsha1h_u32(e) +} + +@(require_results, enable_target_feature = "sha2") +vsha1su0q_u32 :: #force_inline proc "c" (w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t { + return _vsha1su0q_u32(w0_3, w4_7, w8_11) +} + +@(require_results, enable_target_feature = "sha2") +vsha1su1q_u32 :: #force_inline proc "c" (tw0_3, w12_15: uint32x4_t) -> uint32x4_t { + return _vsha1su1q_u32(tw0_3, w12_15) +} + +@(require_results, enable_target_feature = "sha2") +vsha256hq_u32 :: #force_inline proc "c" (hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t { + return _vsha256hq_u32(hash_abcd, hash_efgh, wk) +} + +@(require_results, enable_target_feature = "sha2") +vsha256h2q_u32 :: #force_inline proc "c" (hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t { + return _vsha256h2q_u32(hash_efgh, hash_abcd, wk) +} + +@(require_results, enable_target_feature = "sha2") 
+vsha256su0q_u32 :: #force_inline proc "c" (w0_3, w4_7: uint32x4_t) -> uint32x4_t { + return _vsha256su0q_u32(w0_3, w4_7) +} + +@(require_results, enable_target_feature = "sha2") +vsha256su1q_u32 :: #force_inline proc "c" (tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t { + return _vsha256su1q_u32(tw0_3, w8_11, w12_15) +} + +// Note: The SHA512 instructions are part of the `sha3` feature set. + +@(require_results, enable_target_feature = "sha3") +vsha512hq_u64 :: #force_inline proc "c" (hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t { + return _vsha512hq_u64(hash_ed, hash_gf, kwh_kwh2) +} + +@(require_results, enable_target_feature = "sha3") +vsha512h2q_u64 :: #force_inline proc "c" (sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t { + return _vsha512h2q_u64(sum_ab, hash_c_, hash_ab) +} + +@(require_results, enable_target_feature = "sha3") +vsha512su0q_u64 :: #force_inline proc "c" (w0_1, w2_: uint64x2_t) -> uint64x2_t { + return _vsha512su0q_u64(w0_1, w2_) +} + +@(require_results, enable_target_feature = "sha3") +vsha512su1q_u64 :: #force_inline proc "c" (s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t { + return _vsha512su1q_u64(s01_s02, w14_15, w9_10) +} + +@(private, default_calling_convention = "none") +foreign _ { + @(link_name = "llvm.aarch64.crypto.sha1c" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1c") + _vsha1cq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1p" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1p") + _vsha1pq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1m" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1m") + _vsha1mq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1h") + _vsha1h_u32 :: proc(e: uint32_t) -> uint32_t --- + @(link_name = 
"llvm.aarch64.crypto.sha1su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su0") + _vsha1su0q_u32 :: proc(w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha1su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su1") + _vsha1su1q_u32 :: proc(tw0_3, w12_15: uint32x4_t) -> uint32x4_t --- + + @(link_name = "llvm.aarch64.crypto.sha256h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h") + _vsha256hq_u32 :: proc(hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h2") + _vsha256h2q_u32 :: proc(hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su0") + _vsha256su0q_u32 :: proc(w0_3, w4_7: uint32x4_t) -> uint32x4_t --- + @(link_name = "llvm.aarch64.crypto.sha256su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su1") + _vsha256su1q_u32 :: proc(tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t --- + + @(link_name = "llvm.aarch64.crypto.sha512h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h") + _vsha512hq_u64 :: proc(hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h2") + _vsha512h2q_u64 :: proc(sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su0") + _vsha512su0q_u64 :: proc(w0_1, w2_: uint64x2_t) -> uint64x2_t --- + @(link_name = "llvm.aarch64.crypto.sha512su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su1") + _vsha512su1q_u64 :: proc(s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t --- +} diff --git a/core/simd/arm/types.odin b/core/simd/arm/types.odin index 7c86483a7..05e3540b6 100644 --- a/core/simd/arm/types.odin +++ b/core/simd/arm/types.odin @@ -1,5 +1,9 @@ #+build arm64,arm32 package 
simd_arm +// Type aliases to match `arm_neon.h`. +uint32_t :: u32 + uint8x16_t :: #simd[16]u8 uint32x4_t :: #simd[4]u32 +uint64x2_t :: #simd[2]u64 From 9194b599ec5f3b3edbc88c31ef19ebc372cebdf2 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Sun, 15 Mar 2026 00:09:25 +0900 Subject: [PATCH 3/3] core/crypto/sha2: Add ARMv8 SHA256 acceleration --- core/crypto/sha2/sha256_impl_hw_arm.odin | 224 ++++++++++++++++++ ...w_intel.odin => sha256_impl_hw_intel.odin} | 0 core/crypto/sha2/sha2_impl_hw_gen.odin | 2 + 3 files changed, 226 insertions(+) create mode 100644 core/crypto/sha2/sha256_impl_hw_arm.odin rename core/crypto/sha2/{sha2_impl_hw_intel.odin => sha256_impl_hw_intel.odin} (100%) diff --git a/core/crypto/sha2/sha256_impl_hw_arm.odin b/core/crypto/sha2/sha256_impl_hw_arm.odin new file mode 100644 index 000000000..618cc6fff --- /dev/null +++ b/core/crypto/sha2/sha256_impl_hw_arm.odin @@ -0,0 +1,224 @@ +#+build arm64,arm32 +package sha2 + +// Based on the public domain code by Jeffrey Walton, though +// realistically, there only is one sensible way to write this. +// +// See: https://github.com/noloader/SHA-Intrinsics + +import "base:intrinsics" +import "core:simd" +import "core:simd/arm" +import "core:sys/info" + +// is_hardware_accelerated_256 returns true if and only if (⟺) hardware +// accelerated SHA-224/SHA-256 is supported. 
+is_hardware_accelerated_256 :: proc "contextless" () -> bool { + req_features :: info.CPU_Features{ + .asimd, + .sha256, + } + return info.cpu_features() >= req_features +} + +@(private = "file") +K_0 :: simd.u32x4{0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5} +@(private = "file") +K_1 :: simd.u32x4{0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5} +@(private = "file") +K_2 :: simd.u32x4{0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3} +@(private = "file") +K_3 :: simd.u32x4{0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174} +@(private = "file") +K_4 :: simd.u32x4{0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC} +@(private = "file") +K_5 :: simd.u32x4{0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA} +@(private = "file") +K_6 :: simd.u32x4{0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7} +@(private = "file") +K_7 :: simd.u32x4{0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967} +@(private = "file") +K_8 :: simd.u32x4{0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13} +@(private = "file") +K_9 :: simd.u32x4{0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85} +@(private = "file") +K_10 :: simd.u32x4{0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3} +@(private = "file") +K_11 :: simd.u32x4{0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070} +@(private = "file") +K_12 :: simd.u32x4{0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5} +@(private = "file") +K_13 :: simd.u32x4{0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3} +@(private = "file") +K_14 :: simd.u32x4{0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208} +@(private = "file") +K_15 :: simd.u32x4{0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2} + +@(private, enable_target_feature = "neon,sha2") +sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check { + state_0 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[0])) + state_1 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[4])) + + data := data + for len(data) >= BLOCK_SIZE_256 { + // Save state + abef_save, cdgh_save := state_0, state_1 + + // Load message + 
msg_0 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data))) + msg_1 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[16:]))) + msg_2 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[32:]))) + msg_3 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[48:]))) + + // Reverse for little endian + when ODIN_ENDIAN == .Little { + msg_0 = byteswap_u32x4(msg_0) + msg_1 = byteswap_u32x4(msg_1) + msg_2 = byteswap_u32x4(msg_2) + msg_3 = byteswap_u32x4(msg_3) + } + + tmp_0 := simd.add(msg_0, K_0) + + // Rounds 0-3 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 := state_0 + tmp_1 := simd.add(msg_1, K_1) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 4-7 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_2) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 8-11 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_3) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 12-15 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_4) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 16-19 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_5) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 20-23 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_6) + 
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 24-27 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_7) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 28-31 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_8) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 32-35 + msg_0 = arm.vsha256su0q_u32(msg_0, msg_1) + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_9) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3) + + // Rounds 36-39 + msg_1 = arm.vsha256su0q_u32(msg_1, msg_2) + tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_10) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0) + + // Rounds 40-43 + msg_2 = arm.vsha256su0q_u32(msg_2, msg_3) + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_11) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1) + + // Rounds 44-47 + msg_3 = arm.vsha256su0q_u32(msg_3, msg_0) + tmp_2 = state_0 + tmp_0 = simd.add(msg_0, K_12) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2) + + // Rounds 48-51 + tmp_2 = state_0 + tmp_1 = simd.add(msg_1, K_13) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + + // Rounds 52-55 + 
tmp_2 = state_0 + tmp_0 = simd.add(msg_2, K_14) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + + // Rounds 56-59 + tmp_2 = state_0 + tmp_1 = simd.add(msg_3, K_15) + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0) + + // Rounds 60-63 + tmp_2 = state_0 + state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1) + state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1) + + // Combine state + state_0 = simd.add(state_0, abef_save) + state_1 = simd.add(state_1, cdgh_save) + + data = data[BLOCK_SIZE_256:] + } + + intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[0]), state_0) + intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[4]), state_1) +} + +when ODIN_ENDIAN == .Little { + @(private = "file", enable_target_feature = "neon") + byteswap_u32x4 :: #force_inline proc "contextless" (a: simd.u32x4) -> simd.u32x4 { + return transmute(simd.u32x4)( + simd.shuffle( + transmute(simd.u8x16)(a), + transmute(simd.u8x16)(a), + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + ) + ) + } +} diff --git a/core/crypto/sha2/sha2_impl_hw_intel.odin b/core/crypto/sha2/sha256_impl_hw_intel.odin similarity index 100% rename from core/crypto/sha2/sha2_impl_hw_intel.odin rename to core/crypto/sha2/sha256_impl_hw_intel.odin diff --git a/core/crypto/sha2/sha2_impl_hw_gen.odin b/core/crypto/sha2/sha2_impl_hw_gen.odin index 837d0656d..d735e3c61 100644 --- a/core/crypto/sha2/sha2_impl_hw_gen.odin +++ b/core/crypto/sha2/sha2_impl_hw_gen.odin @@ -1,4 +1,6 @@ #+build !amd64 +#+build !arm64 +#+build !arm32 package sha2 @(private = "file")