From 9051f5a0757de1b15f21c74e8dce38919775e2dd Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Mon, 16 Mar 2026 15:51:04 +0900 Subject: [PATCH 1/3] core/simd/x86: Fix _mm_blend_epi16 --- core/simd/x86/sse41.odin | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/core/simd/x86/sse41.odin b/core/simd/x86/sse41.odin index 81089ed63..510f5d8e9 100644 --- a/core/simd/x86/sse41.odin +++ b/core/simd/x86/sse41.odin @@ -26,7 +26,18 @@ _mm_blendv_epi8 :: #force_inline proc "c" (a, b, mask: __m128i) -> __m128i { } @(require_results, enable_target_feature="sse4.1") _mm_blend_epi16 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i { - return transmute(__m128i)pblendw(transmute(i16x8)a, transmute(i16x8)b, IMM8) + return transmute(__m128i)simd.shuffle( + transmute(i16x8)b, + transmute(i16x8)a, + 0 when (IMM8 >> 0) & 1 == 1 else 8, + 1 when (IMM8 >> 1) & 1 == 1 else 9, + 2 when (IMM8 >> 2) & 1 == 1 else 10, + 3 when (IMM8 >> 3) & 1 == 1 else 11, + 4 when (IMM8 >> 4) & 1 == 1 else 12, + 5 when (IMM8 >> 5) & 1 == 1 else 13, + 6 when (IMM8 >> 6) & 1 == 1 else 14, + 7 when (IMM8 >> 7) & 1 == 1 else 15, + ) } @(require_results, enable_target_feature="sse4.1") _mm_blendv_pd :: #force_inline proc "c" (a, b, mask: __m128d) -> __m128d { @@ -303,8 +314,6 @@ foreign _ { blendpd :: proc(a, b: __m128d, #const imm2: u8) -> __m128d --- @(link_name = "llvm.x86.sse41.blendps") blendps :: proc(a, b: __m128, #const imm4: u8) -> __m128 --- - @(link_name = "llvm.x86.sse41.pblendw") - pblendw :: proc(a: i16x8, b: i16x8, #const imm8: u8) -> i16x8 --- @(link_name = "llvm.x86.sse41.insertps") insertps :: proc(a, b: __m128, #const imm8: u8) -> __m128 --- @(link_name = "llvm.x86.sse41.pmaxsb") From a030fb659672f5831bbcc9de7c864e5762fdeb7e Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Mon, 16 Mar 2026 15:51:54 +0900 Subject: [PATCH 2/3] core/crypto/sha2: Remove the broken Intel intrinsic kludge --- core/crypto/sha2/sha256_impl_hw_intel.odin | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/core/crypto/sha2/sha256_impl_hw_intel.odin b/core/crypto/sha2/sha256_impl_hw_intel.odin index 3f6ebb746..fcec80a3c 100644 --- a/core/crypto/sha2/sha256_impl_hw_intel.odin +++ b/core/crypto/sha2/sha256_impl_hw_intel.odin @@ -70,8 +70,7 @@ sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bou tmp = x86._mm_shuffle_epi32(tmp, 0xb1) // CDAB state_1 = x86._mm_shuffle_epi32(state_1, 0x1b) // EFGH state_0 := x86._mm_alignr_epi8(tmp, state_1, 8) // ABEF - // state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH - state_1 = kludge_mm_blend_epi16_0xf0(state_1, tmp) + state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH data := data for len(data) >= BLOCK_SIZE_256 { @@ -238,18 +237,9 @@ sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bou // Write back the updated state tmp = x86._mm_shuffle_epi32(state_0, 0x1b) // FEBA state_1 = x86._mm_shuffle_epi32(state_1, 0xb1) // DCHG - // state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA - state_0 = kludge_mm_blend_epi16_0xf0(tmp, state_1) + state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA state_1 = x86._mm_alignr_epi8(state_1, tmp, 8) // ABEF intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[0]), state_0) intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[4]), state_1) } - -@(private = "file") -kludge_mm_blend_epi16_0xf0 :: #force_inline proc "contextless"(a, b: x86.__m128i) -> x86.__m128i { - // HACK HACK HACK: LLVM got rid of `llvm.x86.sse41.pblendw`. - a_ := simd.to_array(a) - b_ := simd.to_array(b) - return x86.__m128i{a_[0], b_[1]} -} From ccb56de19a386532228baf2c83a7d06493fb8abc Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Mon, 16 Mar 2026 14:58:23 +0900 Subject: [PATCH 3/3] core/crypto/sha2: Add scaffolding for SHA512 acceleration --- core/crypto/sha2/sha2.odin | 20 ++++++++++++++++--- ...pl_hw_gen.odin => sha256_impl_hw_gen.odin} | 8 +++----- core/crypto/sha2/sha512_impl_hw_gen.odin | 12 +++++++++++ 3 files changed, 32 insertions(+), 8 deletions(-) rename core/crypto/sha2/{sha2_impl_hw_gen.odin => sha256_impl_hw_gen.odin} (68%) create mode 100644 core/crypto/sha2/sha512_impl_hw_gen.odin diff --git a/core/crypto/sha2/sha2.odin b/core/crypto/sha2/sha2.odin index dc41462e4..a878e1c2a 100644 --- a/core/crypto/sha2/sha2.odin +++ b/core/crypto/sha2/sha2.odin @@ -44,7 +44,8 @@ Context_256 :: struct { length: u64, md_bits: int, - is_initialized: bool, + is_hw_accelerated: bool, + is_initialized: bool, } // Context_512 is a SHA-384, SHA-512 or SHA-512/256 instance. @@ -55,7 +56,8 @@ Context_512 :: struct { length: u64, md_bits: int, - is_initialized: bool, + is_hw_accelerated: bool, + is_initialized: bool, } // init_224 initializes a Context_256 for SHA-224. @@ -88,6 +90,9 @@ init_512_256 :: proc(ctx: ^Context_512) { _init(ctx) } +@(private) +ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported" + @(private) _init :: proc(ctx: ^$T) { when T == Context_256 { @@ -113,6 +118,8 @@ _init :: proc(ctx: ^$T) { case: panic("crypto/sha2: invalid digest output length") } + + ctx.is_hw_accelerated = is_hardware_accelerated_256() } else when T == Context_512 { switch ctx.md_bits { case 256: @@ -148,6 +155,8 @@ _init :: proc(ctx: ^$T) { case: panic("crypto/sha2: invalid digest output length") } + + ctx.is_hw_accelerated = is_hardware_accelerated_512() } ctx.length = 0 @@ -399,7 +408,7 @@ SHA512_F4 :: #force_inline proc "contextless" (x: u64) -> u64 { @(private) sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check { when T == Context_256 { - if is_hardware_accelerated_256() { + if ctx.is_hw_accelerated { sha256_transf_hw(ctx, data) return } @@ -410,6 +419,11 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check { CURR_BLOCK_SIZE :: BLOCK_SIZE_256 } else when T == Context_512 { + if ctx.is_hw_accelerated { + sha512_transf_hw(ctx, data) + return + } + w: [SHA512_ROUNDS]u64 wv: [8]u64 t1, t2: u64 diff --git a/core/crypto/sha2/sha2_impl_hw_gen.odin b/core/crypto/sha2/sha256_impl_hw_gen.odin similarity index 68% rename from core/crypto/sha2/sha2_impl_hw_gen.odin rename to core/crypto/sha2/sha256_impl_hw_gen.odin index d735e3c61..ad384caaa 100644 --- a/core/crypto/sha2/sha2_impl_hw_gen.odin +++ b/core/crypto/sha2/sha256_impl_hw_gen.odin @@ -3,15 +3,13 @@ #+build !arm32 package sha2 -@(private = "file") -ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported" - -// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated -// SHA-224/SHA-256 is supported. +// is_hardware_accelerated_256 returns true if and only if (⟺) hardware +// accelerated SHA-224/SHA-256 is supported. is_hardware_accelerated_256 :: proc "contextless" () -> bool { return false } +@(private) sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) { panic_contextless(ERR_HW_NOT_SUPPORTED) } diff --git a/core/crypto/sha2/sha512_impl_hw_gen.odin b/core/crypto/sha2/sha512_impl_hw_gen.odin new file mode 100644 index 000000000..5fd518d80 --- /dev/null +++ b/core/crypto/sha2/sha512_impl_hw_gen.odin @@ -0,0 +1,12 @@ +package sha2 + +// is_hardware_accelerated_512 returns true if and only if (⟺) hardware +// accelerated SHA-384/SHA-512/SHA-512/256 are supported. +is_hardware_accelerated_512 :: proc "contextless" () -> bool { + return false +} + +@(private) +sha512_transf_hw :: proc "contextless" (ctx: ^Context_512, data: []byte) { + panic_contextless(ERR_HW_NOT_SUPPORTED) +}