From 36a3f691c15c3bae16b23865c1bdd398b3d01ac5 Mon Sep 17 00:00:00 2001 From: Yawning Angel Date: Mon, 16 Mar 2026 18:30:14 +0900 Subject: [PATCH] core/crypto/sha2: Add Aarch64 SHA512 acceleration --- core/crypto/sha2/sha2.odin | 30 +- core/crypto/sha2/sha512_impl_hw_arm.odin | 498 +++++++++++++++++++++++ core/crypto/sha2/sha512_impl_hw_gen.odin | 3 +- 3 files changed, 515 insertions(+), 16 deletions(-) create mode 100644 core/crypto/sha2/sha512_impl_hw_arm.odin diff --git a/core/crypto/sha2/sha2.odin b/core/crypto/sha2/sha2.odin index a878e1c2a..2eac9240b 100644 --- a/core/crypto/sha2/sha2.odin +++ b/core/crypto/sha2/sha2.odin @@ -276,7 +276,7 @@ reset :: proc(ctx: ^$T) { SHA2 implementation */ -@(private, rodata) +@(private = "file", rodata) SHA256_K := [64]u32 { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, @@ -296,7 +296,7 @@ SHA256_K := [64]u32 { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, } -@(private, rodata) +@(private = "file", rodata) SHA512_K := [80]u64 { 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, @@ -345,67 +345,67 @@ SHA256_ROUNDS :: 64 @(private) SHA512_ROUNDS :: 80 -@(private) +@(private = "file") SHA256_CH :: #force_inline proc "contextless" (x, y, z: u32) -> u32 { return (x & y) ~ (~x & z) } -@(private) +@(private = "file") SHA256_MAJ :: #force_inline proc "contextless" (x, y, z: u32) -> u32 { return (x & y) ~ (x & z) ~ (y & z) } -@(private) +@(private = "file") SHA512_CH :: #force_inline proc "contextless" (x, y, z: u64) -> u64 { return (x & y) ~ (~x & z) } -@(private) +@(private = "file") SHA512_MAJ :: #force_inline proc "contextless" (x, y, z: u64) -> u64 { return (x & y) ~ (x & z) ~ (y & z) } -@(private) +@(private = "file") SHA256_F1 :: #force_inline proc "contextless" (x: u32) -> u32 { return bits.rotate_left32(x, 30) ~ bits.rotate_left32(x, 19) ~ bits.rotate_left32(x, 10) } -@(private) +@(private = "file") SHA256_F2 :: #force_inline proc "contextless" (x: u32) -> u32 { return bits.rotate_left32(x, 26) ~ bits.rotate_left32(x, 21) ~ bits.rotate_left32(x, 7) } -@(private) +@(private = "file") SHA256_F3 :: #force_inline proc "contextless" (x: u32) -> u32 { return bits.rotate_left32(x, 25) ~ bits.rotate_left32(x, 14) ~ (x >> 3) } -@(private) +@(private = "file") SHA256_F4 :: #force_inline proc "contextless" (x: u32) -> u32 { return bits.rotate_left32(x, 15) ~ bits.rotate_left32(x, 13) ~ (x >> 10) } -@(private) +@(private = "file") SHA512_F1 :: #force_inline proc "contextless" (x: u64) -> u64 { return bits.rotate_left64(x, 36) ~ bits.rotate_left64(x, 30) ~ bits.rotate_left64(x, 25) } -@(private) +@(private = "file") SHA512_F2 :: #force_inline proc "contextless" (x: u64) -> u64 { return bits.rotate_left64(x, 50) ~ bits.rotate_left64(x, 46) ~ bits.rotate_left64(x, 23) } -@(private) +@(private = "file") SHA512_F3 :: #force_inline proc "contextless" (x: u64) -> u64 { return bits.rotate_left64(x, 63) ~ bits.rotate_left64(x, 56) ~ (x >> 7) } -@(private) +@(private = "file") SHA512_F4 :: #force_inline proc "contextless" (x: u64) -> u64 { return bits.rotate_left64(x, 45) ~ bits.rotate_left64(x, 3) ~ (x >> 6) } -@(private) +@(private = "file") sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check { when T == Context_256 { if ctx.is_hw_accelerated { diff --git a/core/crypto/sha2/sha512_impl_hw_arm.odin b/core/crypto/sha2/sha512_impl_hw_arm.odin new file mode 100644 index 000000000..27ef83f5a --- /dev/null +++ b/core/crypto/sha2/sha512_impl_hw_arm.odin @@ -0,0 +1,498 @@ +// The round function's intrinsic calls are based on: +// https://github.com/LostInCompilation/HashMe/blob/main/src/SHA512_Hardware.cpp +// +// The zlib License +// +// Copyright (C) 2024 Marc Schöndorf +// +// This software is provided 'as-is', without any express or implied warranty. In +// no event will the authors be held liable for any damages arising from the use of +// this software. +// +// Permission is granted to anyone to use this software for any purpose, including +// commercial applications, and to alter it and redistribute it freely, subject to +// the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not claim +// that you wrote the original software. If you use this software in a product, +// an acknowledgment in the product documentation would be appreciated but is +// not required. +// +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// +// 3. This notice may not be removed or altered from any source distribution. + +#+build arm64 +package sha2 + +import "base:intrinsics" +import "core:simd" +import "core:simd/arm" +import "core:sys/info" + +// is_hardware_accelerated_512 returns true if and only if (⟺) hardware +// accelerated SHA-384, SHA-512, and SHA-512/256 are supported. +is_hardware_accelerated_512 :: proc "contextless" () -> bool { + req_features :: info.CPU_Features{ + .asimd, + .sha512, + .sha3, // XXX: LLVM groups these under `sha3`. + } + return info.cpu_features() >= req_features +} + +@(private = "file") +K_0 :: simd.u64x2{0x428a2f98d728ae22, 0x7137449123ef65cd} +@(private = "file") +K_1 :: simd.u64x2{0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc} +@(private = "file") +K_2 :: simd.u64x2{0x3956c25bf348b538, 0x59f111f1b605d019} +@(private = "file") +K_3 :: simd.u64x2{0x923f82a4af194f9b, 0xab1c5ed5da6d8118} +@(private = "file") +K_4 :: simd.u64x2{0xd807aa98a3030242, 0x12835b0145706fbe} +@(private = "file") +K_5 :: simd.u64x2{0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2} +@(private = "file") +K_6 :: simd.u64x2{0x72be5d74f27b896f, 0x80deb1fe3b1696b1} +@(private = "file") +K_7 :: simd.u64x2{0x9bdc06a725c71235, 0xc19bf174cf692694} +@(private = "file") +K_8 :: simd.u64x2{0xe49b69c19ef14ad2, 0xefbe4786384f25e3} +@(private = "file") +K_9 :: simd.u64x2{0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65} +@(private = "file") +K_10 :: simd.u64x2{0x2de92c6f592b0275, 0x4a7484aa6ea6e483} +@(private = "file") +K_11 :: simd.u64x2{0x5cb0a9dcbd41fbd4, 0x76f988da831153b5} +@(private = "file") +K_12 :: simd.u64x2{0x983e5152ee66dfab, 0xa831c66d2db43210} +@(private = "file") +K_13 :: simd.u64x2{0xb00327c898fb213f, 0xbf597fc7beef0ee4} +@(private = "file") +K_14 :: simd.u64x2{0xc6e00bf33da88fc2, 0xd5a79147930aa725} +@(private = "file") +K_15 :: simd.u64x2{0x06ca6351e003826f, 0x142929670a0e6e70} +@(private = "file") +K_16 :: simd.u64x2{0x27b70a8546d22ffc, 0x2e1b21385c26c926} +@(private = "file") +K_17 :: simd.u64x2{0x4d2c6dfc5ac42aed, 0x53380d139d95b3df} +@(private = "file") +K_18 :: simd.u64x2{0x650a73548baf63de, 0x766a0abb3c77b2a8} +@(private = "file") +K_19 :: simd.u64x2{0x81c2c92e47edaee6, 0x92722c851482353b} +@(private = "file") +K_20 :: simd.u64x2{0xa2bfe8a14cf10364, 0xa81a664bbc423001} +@(private = "file") +K_21 :: simd.u64x2{0xc24b8b70d0f89791, 0xc76c51a30654be30} +@(private = "file") +K_22 :: simd.u64x2{0xd192e819d6ef5218, 0xd69906245565a910} +@(private = "file") +K_23 :: simd.u64x2{0xf40e35855771202a, 0x106aa07032bbd1b8} +@(private = "file") +K_24 :: simd.u64x2{0x19a4c116b8d2d0c8, 0x1e376c085141ab53} +@(private = "file") +K_25 :: simd.u64x2{0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8} +@(private = "file") +K_26 :: simd.u64x2{0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb} +@(private = "file") +K_27 :: simd.u64x2{0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3} +@(private = "file") +K_28 :: simd.u64x2{0x748f82ee5defb2fc, 0x78a5636f43172f60} +@(private = "file") +K_29 :: simd.u64x2{0x84c87814a1f0ab72, 0x8cc702081a6439ec} +@(private = "file") +K_30 :: simd.u64x2{0x90befffa23631e28, 0xa4506cebde82bde9} +@(private = "file") +K_31 :: simd.u64x2{0xbef9a3f7b2c67915, 0xc67178f2e372532b} +@(private = "file") +K_32 :: simd.u64x2{0xca273eceea26619c, 0xd186b8c721c0c207} +@(private = "file") +K_33 :: simd.u64x2{0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178} +@(private = "file") +K_34 :: simd.u64x2{0x06f067aa72176fba, 0x0a637dc5a2c898a6} +@(private = "file") +K_35 :: simd.u64x2{0x113f9804bef90dae, 0x1b710b35131c471b} +@(private = "file") +K_36 :: simd.u64x2{0x28db77f523047d84, 0x32caab7b40c72493} +@(private = "file") +K_37 :: simd.u64x2{0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c} +@(private = "file") +K_38 :: simd.u64x2{0x4cc5d4becb3e42b6, 0x597f299cfc657e2a} +@(private = "file") +K_39 :: simd.u64x2{0x5fcb6fab3ad6faec, 0x6c44198c4a475817} + +@(private, enable_target_feature = "neon,sha3") +sha512_transf_hw :: proc "contextless" (ctx: ^Context_512, data: []byte) #no_bounds_check { + state_0 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[0])) + state_1 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[2])) + state_2 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[4])) + state_3 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[6])) + + data := data + for len(data) >= BLOCK_SIZE_512 { + ab_save, cd_save, ef_save, gh_save := state_0, state_1, state_2, state_3 + + // Load message + msg_0 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data))) + msg_1 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[16:]))) + msg_2 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[32:]))) + msg_3 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[48:]))) + msg_4 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[64:]))) + msg_5 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[80:]))) + msg_6 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[96:]))) + msg_7 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[112:]))) + + // Reverse for little endian + when ODIN_ENDIAN == .Little { + msg_0 = byteswap_u64x2(msg_0) + msg_1 = byteswap_u64x2(msg_1) + msg_2 = byteswap_u64x2(msg_2) + msg_3 = byteswap_u64x2(msg_3) + msg_4 = byteswap_u64x2(msg_4) + msg_5 = byteswap_u64x2(msg_5) + msg_6 = byteswap_u64x2(msg_6) + msg_7 = byteswap_u64x2(msg_7) + } + + // Rounds 0-1 + msg_k := simd.add(msg_0, K_0) + tmp_0 := simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 := arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2)) + + // Rounds 2-3 + msg_k = simd.add(msg_1, K_1) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2)) + + // Rounds 4-5 + msg_k = simd.add(msg_2, K_2) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2)) + + // Rounds 6-7 + msg_k = simd.add(msg_3, K_3) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2)) + + // Rounds 8-9 + msg_k = simd.add(msg_4, K_4) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2)) + + // Rounds 10-11 + msg_k = simd.add(msg_5, K_5) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2)) + + // Rounds 12-13 + msg_k = simd.add(msg_6, K_6) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2)) + + // Rounds 14-15 + msg_k = simd.add(msg_7, K_7) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2)) + + // Rounds 16-17 + msg_k = simd.add(msg_0, K_8) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2)) + + // Rounds 18-19 + msg_k = simd.add(msg_1, K_9) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2)) + + // Rounds 20-21 + msg_k = simd.add(msg_2, K_10) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2)) + + // Rounds 22-23 + msg_k = simd.add(msg_3, K_11) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2)) + + // Rounds 24-25 + msg_k = simd.add(msg_4, K_12) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2)) + + // Rounds 26-27 + msg_k = simd.add(msg_5, K_13) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2)) + + // Rounds 28-29 + msg_k = simd.add(msg_6, K_14) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2)) + + // Rounds 30-31 + msg_k = simd.add(msg_7, K_15) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2)) + + // Rounds 32-33 + msg_k = simd.add(msg_0, K_16) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2)) + + // Rounds 34-35 + msg_k = simd.add(msg_1, K_17) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2)) + + // Rounds 36-37 + msg_k = simd.add(msg_2, K_18) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2)) + + // Rounds 38-39 + msg_k = simd.add(msg_3, K_19) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2)) + + // Rounds 40-41 + msg_k = simd.add(msg_4, K_20) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2)) + + // Rounds 42-43 + msg_k = simd.add(msg_5, K_21) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2)) + + // Rounds 44-45 + msg_k = simd.add(msg_6, K_22) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2)) + + // Rounds 46-47 + msg_k = simd.add(msg_7, K_23) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2)) + + // Rounds 48-49 + msg_k = simd.add(msg_0, K_24) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2)) + + // Rounds 50-51 + msg_k = simd.add(msg_1, K_25) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2)) + + // Rounds 52-53 + msg_k = simd.add(msg_2, K_26) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2)) + + // Rounds 54-55 + msg_k = simd.add(msg_3, K_27) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2)) + + // Rounds 56-57 + msg_k = simd.add(msg_4, K_28) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2)) + + // Rounds 58-59 + msg_k = simd.add(msg_5, K_29) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2)) + + // Rounds 60-61 + msg_k = simd.add(msg_6, K_30) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2)) + + // Rounds 62-63 + msg_k = simd.add(msg_7, K_31) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2)) + + // Rounds 64-65 + msg_k = simd.add(msg_0, K_32) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + + // Rounds 66-67 + msg_k = simd.add(msg_1, K_33) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + + // Rounds 68-69 + msg_k = simd.add(msg_2, K_34) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + + // Rounds 70-71 + msg_k = simd.add(msg_3, K_35) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + + // Rounds 72-73 + msg_k = simd.add(msg_4, K_36) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2)) + state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0) + state_1 = simd.add(state_1, tmp_1) + + // Rounds 74-75 + msg_k = simd.add(msg_5, K_37) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2)) + state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3) + state_0 = simd.add(state_0, tmp_1) + + // Rounds 76-77 + msg_k = simd.add(msg_6, K_38) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2)) + state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2) + state_3 = simd.add(state_3, tmp_1) + + // Rounds 78-79 + msg_k = simd.add(msg_7, K_39) + tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0) + tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2)) + state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1) + state_2 = simd.add(state_2, tmp_1) + + // Combine state + state_0 = simd.add(state_0, ab_save) + state_1 = simd.add(state_1, cd_save) + state_2 = simd.add(state_2, ef_save) + state_3 = simd.add(state_3, gh_save) + + data = data[BLOCK_SIZE_512:] + } + + intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[0]), state_0) + intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[2]), state_1) + intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[4]), state_2) + intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[6]), state_3) +} + +when ODIN_ENDIAN == .Little { + @(private = "file", enable_target_feature = "neon") + byteswap_u64x2 :: #force_inline proc "contextless" (a: simd.u64x2) -> simd.u64x2 { + return transmute(simd.u64x2)( + simd.shuffle( + transmute(simd.u8x16)(a), + transmute(simd.u8x16)(a), + 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, + ) + ) + } +} diff --git a/core/crypto/sha2/sha512_impl_hw_gen.odin b/core/crypto/sha2/sha512_impl_hw_gen.odin index 5fd518d80..701f82709 100644 --- a/core/crypto/sha2/sha512_impl_hw_gen.odin +++ b/core/crypto/sha2/sha512_impl_hw_gen.odin @@ -1,7 +1,8 @@ +#+build !arm64 package sha2 // is_hardware_accelerated_512 returns true if and only if (⟺) hardware -// accelerated SHA-384/SHA-512/SHA-512/256 are supported. +// accelerated SHA-384, SHA-512, and SHA-512/256 are supported. is_hardware_accelerated_512 :: proc "contextless" () -> bool { return false }