mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-06 06:38:20 +00:00
Merge branch 'master' into bill/fixed-capacity-dynamic-array
This commit is contained in:
@@ -141,7 +141,7 @@ Type_Info_Struct :: struct {
|
||||
|
||||
flags: Type_Info_Struct_Flags,
|
||||
|
||||
// These are only set iff this structure is an SOA structure
|
||||
// These are only set if and only if (⟺) this structure is an SOA structure
|
||||
soa_kind: Type_Info_Struct_Soa_Kind,
|
||||
soa_len: i32,
|
||||
soa_base_type: ^Type_Info,
|
||||
|
||||
@@ -1525,7 +1525,7 @@ card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int {
|
||||
|
||||
|
||||
|
||||
// Evaluates the condition and panics the program iff the condition is false.
|
||||
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
|
||||
// This uses the `context.assertion_failure_procedure` to assert.
|
||||
//
|
||||
// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
|
||||
@@ -1549,7 +1549,7 @@ assert :: proc(condition: bool, message := #caller_expression(condition), loc :=
|
||||
}
|
||||
}
|
||||
|
||||
// Evaluates the condition and panics the program iff the condition is false.
|
||||
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
|
||||
// This uses the `context.assertion_failure_procedure` to assert.
|
||||
// This routine ignores `ODIN_DISABLE_ASSERT`, and will always execute.
|
||||
@builtin
|
||||
@@ -1589,7 +1589,7 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! {
|
||||
p("not yet implemented", message, loc)
|
||||
}
|
||||
|
||||
// Evaluates the condition and panics the program iff the condition is false.
|
||||
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
|
||||
// This uses the `default_assertion_contextless_failure_proc` to assert.
|
||||
//
|
||||
// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
|
||||
@@ -1609,7 +1609,7 @@ assert_contextless :: proc "contextless" (condition: bool, message := #caller_ex
|
||||
}
|
||||
}
|
||||
|
||||
// Evaluates the condition and panics the program iff the condition is false.
|
||||
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
|
||||
// This uses the `default_assertion_contextless_failure_proc` to assert.
|
||||
@builtin
|
||||
ensure_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) {
|
||||
|
||||
@@ -136,7 +136,7 @@ chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) {
|
||||
//
|
||||
// LLVM appears not to consider "this instruction is totally
|
||||
// awful on the given microarchitcture", which leads to
|
||||
// `VPCOMPRESSED` being generated iff AVX512 support is
|
||||
// `VPCOMPRESSED` being generated if and only if (⟺) AVX512 support is
|
||||
// enabled for `intrinsics.simd_masked_compress_store`.
|
||||
// On Zen 4, this leads to a 50% performance regression vs
|
||||
// the 128-bit SIMD code.
|
||||
|
||||
@@ -45,7 +45,7 @@ reader_init_with_buf :: proc(b: ^Reader, rd: io.Reader, buf: []byte) {
|
||||
b.buf = buf
|
||||
}
|
||||
|
||||
// reader_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
|
||||
// reader_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
|
||||
reader_destroy :: proc(b: ^Reader) {
|
||||
delete(b.buf, b.buf_allocator)
|
||||
b^ = {}
|
||||
|
||||
@@ -35,7 +35,7 @@ writer_init_with_buf :: proc(b: ^Writer, wr: io.Writer, buf: []byte) {
|
||||
b.buf = buf
|
||||
}
|
||||
|
||||
// writer_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
|
||||
// writer_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
|
||||
writer_destroy :: proc(b: ^Writer) {
|
||||
delete(b.buf, b.buf_allocator)
|
||||
b^ = {}
|
||||
|
||||
@@ -1460,7 +1460,7 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc
|
||||
return subslices[:]
|
||||
}
|
||||
|
||||
// alias returns true iff a and b have a non-zero length, and any part of
|
||||
// alias returns true if and only if (⟺) a and b have a non-zero length, and any part of
|
||||
// a overlaps with b.
|
||||
alias :: proc "contextless" (a, b: []byte) -> bool {
|
||||
a_len, b_len := len(a), len(b)
|
||||
@@ -1474,7 +1474,7 @@ alias :: proc "contextless" (a, b: []byte) -> bool {
|
||||
return a_start <= b_end && b_start <= a_end
|
||||
}
|
||||
|
||||
// alias_inexactly returns true iff a and b have a non-zero length,
|
||||
// alias_inexactly returns true if and only if (⟺) a and b have a non-zero length,
|
||||
// the base pointer of a and b are NOT equal, and any part of a overlaps
|
||||
// with b (ie: `alias(a, b)` with an exception that returns false for
|
||||
// `a == b`, `b = a[:len(a)-69]` and similar conditions).
|
||||
|
||||
@@ -100,20 +100,20 @@ len :: proc "contextless" (t: ^$T/Tree($Value)) -> int {
|
||||
return t._size
|
||||
}
|
||||
|
||||
// first returns the first node in the tree (in-order) or nil iff
|
||||
// first returns the first node in the tree (in-order) or nil if and only if (⟺)
|
||||
// the tree is empty.
|
||||
first :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
|
||||
return tree_first_or_last_in_order(t, Direction.Backward)
|
||||
}
|
||||
|
||||
// last returns the last element in the tree (in-order) or nil iff
|
||||
// last returns the last element in the tree (in-order) or nil if and only if (⟺)
|
||||
// the tree is empty.
|
||||
last :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
|
||||
return tree_first_or_last_in_order(t, Direction.Forward)
|
||||
}
|
||||
|
||||
// find finds the value in the tree, and returns the corresponding
|
||||
// node or nil iff the value is not present.
|
||||
// node or nil if and only if (⟺) the value is not present.
|
||||
find :: proc(t: ^$T/Tree($Value), value: Value) -> ^Node(Value) {
|
||||
cur := t._root
|
||||
descend_loop: for cur != nil {
|
||||
@@ -168,7 +168,7 @@ find_or_insert :: proc(
|
||||
return
|
||||
}
|
||||
|
||||
// remove removes a node or value from the tree, and returns true iff the
|
||||
// remove removes a node or value from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove :: proc {
|
||||
@@ -176,7 +176,7 @@ remove :: proc {
|
||||
remove_node,
|
||||
}
|
||||
|
||||
// remove_value removes a value from the tree, and returns true iff the
|
||||
// remove_value removes a value from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = true) -> bool {
|
||||
@@ -187,7 +187,7 @@ remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = t
|
||||
return remove_node(t, n, call_on_remove)
|
||||
}
|
||||
|
||||
// remove_node removes a node from the tree, and returns true iff the
|
||||
// remove_node removes a node from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove_node :: proc(t: ^$T/Tree($Value), node: ^Node(Value), call_on_remove: bool = true) -> bool {
|
||||
@@ -281,14 +281,14 @@ iterator_from_pos :: proc "contextless" (
|
||||
}
|
||||
|
||||
// iterator_get returns the node currently pointed to by the iterator,
|
||||
// or nil iff the node has been removed, the tree is empty, or the end
|
||||
// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
|
||||
// of the tree has been reached.
|
||||
iterator_get :: proc "contextless" (it: ^$I/Iterator($Value)) -> ^Node(Value) {
|
||||
return it._cur
|
||||
}
|
||||
|
||||
// iterator_remove removes the node currently pointed to by the iterator,
|
||||
// and returns true iff the removal was successful. Semantics are the
|
||||
// and returns true if and only if (⟺) the removal was successful. Semantics are the
|
||||
// same as the Tree remove.
|
||||
iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -> bool {
|
||||
if it._cur == nil {
|
||||
@@ -304,7 +304,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -
|
||||
}
|
||||
|
||||
// iterator_next advances the iterator and returns the (node, true) or
|
||||
// or (nil, false) iff the end of the tree has been reached.
|
||||
// or (nil, false) if and only if (⟺) the end of the tree has been reached.
|
||||
//
|
||||
// Note: The first call to iterator_next will return the first node instead
|
||||
// of advancing the iterator.
|
||||
|
||||
@@ -95,19 +95,19 @@ len :: proc "contextless" (t: $T/Tree($Key, $Value)) -> (node_count: int) {
|
||||
return t._size
|
||||
}
|
||||
|
||||
// first returns the first node in the tree (in-order) or nil iff
|
||||
// first returns the first node in the tree (in-order) or nil if and only if (⟺)
|
||||
// the tree is empty.
|
||||
first :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
|
||||
return tree_first_or_last_in_order(t, Direction.Backward)
|
||||
}
|
||||
|
||||
// last returns the last element in the tree (in-order) or nil iff
|
||||
// last returns the last element in the tree (in-order) or nil if and only if (⟺)
|
||||
// the tree is empty.
|
||||
last :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
|
||||
return tree_first_or_last_in_order(t, Direction.Forward)
|
||||
}
|
||||
|
||||
// find finds the key in the tree, and returns the corresponding node, or nil iff the value is not present.
|
||||
// find finds the key in the tree, and returns the corresponding node, or nil if and only if (⟺) the value is not present.
|
||||
find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
|
||||
node = t._root
|
||||
for node != nil {
|
||||
@@ -120,7 +120,7 @@ find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
|
||||
return node
|
||||
}
|
||||
|
||||
// find_value finds the key in the tree, and returns the corresponding value, or nil iff the value is not present.
|
||||
// find_value finds the key in the tree, and returns the corresponding value, or nil if and only if (⟺) the value is not present.
|
||||
find_value :: proc(t: $T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok {
|
||||
if n := find(t, key); n != nil {
|
||||
return n.value, true
|
||||
@@ -154,7 +154,7 @@ find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n:
|
||||
return n, true, nil
|
||||
}
|
||||
|
||||
// remove removes a node or value from the tree, and returns true iff the
|
||||
// remove removes a node or value from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove :: proc {
|
||||
@@ -162,7 +162,7 @@ remove :: proc {
|
||||
remove_node,
|
||||
}
|
||||
|
||||
// remove_value removes a value from the tree, and returns true iff the
|
||||
// remove_value removes a value from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's key + value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) -> bool {
|
||||
@@ -173,7 +173,7 @@ remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true)
|
||||
return remove_node(t, n, call_on_remove)
|
||||
}
|
||||
|
||||
// remove_node removes a node from the tree, and returns true iff the
|
||||
// remove_node removes a node from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's key + value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove_node :: proc(t: ^$T/Tree($Key, $Value), node: ^$N/Node(Key, Value), call_on_remove := true) -> (found: bool) {
|
||||
@@ -235,14 +235,14 @@ iterator_from_pos :: proc "contextless" (t: ^$T/Tree($Key, $Value), pos: ^Node(K
|
||||
}
|
||||
|
||||
// iterator_get returns the node currently pointed to by the iterator,
|
||||
// or nil iff the node has been removed, the tree is empty, or the end
|
||||
// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
|
||||
// of the tree has been reached.
|
||||
iterator_get :: proc "contextless" (it: ^$I/Iterator($Key, $Value)) -> ^Node(Key, Value) {
|
||||
return it._cur
|
||||
}
|
||||
|
||||
// iterator_remove removes the node currently pointed to by the iterator,
|
||||
// and returns true iff the removal was successful. Semantics are the
|
||||
// and returns true if and only if (⟺) the removal was successful. Semantics are the
|
||||
// same as the Tree remove.
|
||||
iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = true) -> bool {
|
||||
if it._cur == nil {
|
||||
@@ -258,7 +258,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = t
|
||||
}
|
||||
|
||||
// iterator_next advances the iterator and returns the (node, true) or
|
||||
// or (nil, false) iff the end of the tree has been reached.
|
||||
// or (nil, false) if and only if (⟺) the end of the tree has been reached.
|
||||
//
|
||||
// Note: The first call to iterator_next will return the first node instead
|
||||
// of advancing the iterator.
|
||||
|
||||
69
core/crypto/_aes/hw/api.odin
Normal file
69
core/crypto/_aes/hw/api.odin
Normal file
@@ -0,0 +1,69 @@
|
||||
package aes_hw
|
||||
|
||||
@(require) import "core:sys/info"
|
||||
|
||||
// is_supported returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_supported :: proc "contextless" () -> bool {
|
||||
when ODIN_ARCH == .amd64 {
|
||||
// Note: Everything with AES-NI has support for
|
||||
// the required SSE extxtensions.
|
||||
req_features :: info.CPU_Features{
|
||||
.sse2,
|
||||
.ssse3,
|
||||
.sse41,
|
||||
.aes,
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
|
||||
req_features :: info.CPU_Features{
|
||||
.asimd,
|
||||
.aes,
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// is_ghash_supported returns true if and only if (⟺) hardware accelerated
|
||||
// GHASH is supported.
|
||||
is_ghash_supported :: proc "contextless" () -> bool {
|
||||
// Just having hardware GHASH is silly.
|
||||
if !is_supported() {
|
||||
return false
|
||||
}
|
||||
|
||||
when ODIN_ARCH == .amd64 {
|
||||
return info.cpu_features() >= info.CPU_Features{
|
||||
.pclmulqdq,
|
||||
}
|
||||
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32{
|
||||
// Once we can actually use this, we can re-enable this.
|
||||
//
|
||||
// return info.cpu_features() >= info.CPU_Features{
|
||||
// .pmull,
|
||||
// }
|
||||
return false
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
|
||||
Context :: struct {
|
||||
// Note: The ideal thing to do is for the expanded round keys to be
|
||||
// arrays of `u8x16`, however that implies alignment (or using AVX).
|
||||
//
|
||||
// All the people using e-waste processors that don't support an
|
||||
// instruction set that has been around for over 10 years are why
|
||||
// we can't have nice things.
|
||||
_sk_exp_enc: [15][16]byte,
|
||||
_sk_exp_dec: [15][16]byte,
|
||||
_num_rounds: int,
|
||||
}
|
||||
|
||||
// init initializes a context for AES with the provided key.
|
||||
init :: proc(ctx: ^Context, key: []byte) {
|
||||
keysched(ctx, key)
|
||||
}
|
||||
@@ -21,7 +21,7 @@
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
package aes_hw
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
115
core/crypto/_aes/hw/intrinsics_arm.odin
Normal file
115
core/crypto/_aes/hw/intrinsics_arm.odin
Normal file
@@ -0,0 +1,115 @@
|
||||
#+build arm64,arm32
|
||||
package aes_hw
|
||||
|
||||
import "core:simd"
|
||||
import "core:simd/arm"
|
||||
|
||||
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/
|
||||
|
||||
TARGET_FEATURES :: "neon,aes"
|
||||
HAS_GHASH :: false // Temporary
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
|
||||
return simd.bit_xor(arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{})), key)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
|
||||
return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
|
||||
return simd.bit_xor(arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{})), key)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
|
||||
return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key)
|
||||
}
|
||||
|
||||
aesimc :: arm.vaesimcq_u8
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
|
||||
a := arm.vaeseq_u8(data, simd.u8x16{}) // AESE does ShiftRows and SubBytes on A
|
||||
|
||||
// Undo ShiftRows step from AESE and extract X1 and X3
|
||||
dest := simd.swizzle(
|
||||
a,
|
||||
0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1)
|
||||
0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1))
|
||||
0x0c, 0x09, 0x06, 0x03, // SubBytes(X3)
|
||||
0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3))
|
||||
)
|
||||
|
||||
rcons := simd.u8x16{
|
||||
0, 0, 0, 0,
|
||||
IMM8, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
IMM8, 0, 0, 0,
|
||||
}
|
||||
|
||||
return simd.bit_xor(dest, rcons)
|
||||
}
|
||||
|
||||
// The keyschedule implementation is easier to read with some extra
|
||||
// Intel intrinsics that are emulated by built-in LLVM ops anyway.
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
|
||||
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
|
||||
shift :: IMM8 & 0xff
|
||||
|
||||
// This needs to emit behavior identical to PSLLDQ which is as follows:
|
||||
//
|
||||
// TEMP := COUNT
|
||||
// IF (TEMP > 15) THEN TEMP := 16; FI
|
||||
// DEST := DEST << (TEMP * 8)
|
||||
// DEST[MAXVL-1:128] (Unmodified)
|
||||
|
||||
return simd.shuffle(
|
||||
simd.u8x16{},
|
||||
a,
|
||||
0 when shift > 15 else (16 - shift + 0),
|
||||
1 when shift > 15 else (16 - shift + 1),
|
||||
2 when shift > 15 else (16 - shift + 2),
|
||||
3 when shift > 15 else (16 - shift + 3),
|
||||
4 when shift > 15 else (16 - shift + 4),
|
||||
5 when shift > 15 else (16 - shift + 5),
|
||||
6 when shift > 15 else (16 - shift + 6),
|
||||
7 when shift > 15 else (16 - shift + 7),
|
||||
8 when shift > 15 else (16 - shift + 8),
|
||||
9 when shift > 15 else (16 - shift + 9),
|
||||
10 when shift > 15 else (16 - shift + 10),
|
||||
11 when shift > 15 else (16 - shift + 11),
|
||||
12 when shift > 15 else (16 - shift + 12),
|
||||
13 when shift > 15 else (16 - shift + 13),
|
||||
14 when shift > 15 else (16 - shift + 14),
|
||||
15 when shift > 15 else (16 - shift + 15),
|
||||
)
|
||||
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
|
||||
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
|
||||
v := transmute(simd.i32x4)a
|
||||
return transmute(simd.u8x16)simd.shuffle(
|
||||
v,
|
||||
v,
|
||||
IMM8 & 0b11,
|
||||
(IMM8 >> 2) & 0b11,
|
||||
(IMM8 >> 4) & 0b11,
|
||||
(IMM8 >> 6) & 0b11,
|
||||
)
|
||||
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
|
||||
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)simd.shuffle(
|
||||
transmute(simd.u32x4)(a),
|
||||
transmute(simd.u32x4)(b),
|
||||
u32(MASK) & 0b11,
|
||||
(u32(MASK)>>2) & 0b11,
|
||||
((u32(MASK)>>4) & 0b11)+4,
|
||||
((u32(MASK)>>6) & 0b11)+4)
|
||||
}
|
||||
55
core/crypto/_aes/hw/intrinsics_intel.odin
Normal file
55
core/crypto/_aes/hw/intrinsics_intel.odin
Normal file
@@ -0,0 +1,55 @@
|
||||
#+build amd64
|
||||
package aes_hw
|
||||
|
||||
import "core:simd"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel/RISC-V semantics.
|
||||
|
||||
TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes"
|
||||
HAS_GHASH :: true
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aesimc :: #force_inline proc "c" (data: simd.u8x16) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_aesimc_si128(transmute(x86.__m128i)(data)))
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8))
|
||||
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
|
||||
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8))
|
||||
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
|
||||
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8))
|
||||
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
|
||||
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
|
||||
return transmute(simd.u8x16)(x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK))
|
||||
}
|
||||
181
core/crypto/_aes/hw/keysched_hw.odin
Normal file
181
core/crypto/_aes/hw/keysched_hw.odin
Normal file
@@ -0,0 +1,181 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64,arm32
|
||||
package aes_hw
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd"
|
||||
|
||||
// Inspiration taken from BearSSL's AES-NI implementation.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performant otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
|
||||
expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = _mm_shuffle_epi32(k2, 0xff)
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
return simd.bit_xor(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
|
||||
expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = _mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, k3)
|
||||
|
||||
tmp := k2
|
||||
k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
|
||||
k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
r1 := _mm_shuffle_ps(tmp, k1, 0x44)
|
||||
r2 := _mm_shuffle_ps(k1, k2, 0x4e)
|
||||
|
||||
return r1, r2
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
|
||||
expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = _mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, k3)
|
||||
|
||||
k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
|
||||
k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
return k1
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
|
||||
expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = _mm_shuffle_epi32(k2, 0xaa)
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
|
||||
return simd.bit_xor(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) {
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds])
|
||||
for i in 1 ..< num_rounds {
|
||||
tmp := aesimc(sks[i])
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
|
||||
}
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0])
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = TARGET_FEATURES)
|
||||
keysched :: proc(ctx: ^Context, key: []byte) {
|
||||
sks: [15]simd.u8x16 = ---
|
||||
|
||||
// Compute the encryption keys.
|
||||
num_rounds, key_len := 0, len(key)
|
||||
switch key_len {
|
||||
case _aes.KEY_SIZE_128:
|
||||
sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
|
||||
sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01))
|
||||
sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02))
|
||||
sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04))
|
||||
sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08))
|
||||
sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10))
|
||||
sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20))
|
||||
sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40))
|
||||
sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80))
|
||||
sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b))
|
||||
sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36))
|
||||
num_rounds = _aes.ROUNDS_128
|
||||
case _aes.KEY_SIZE_192:
|
||||
k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
|
||||
|
||||
k1_tmp: [16]byte
|
||||
copy(k1_tmp[:], key[16:24])
|
||||
k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp))
|
||||
crypto.zero_explicit(&k1_tmp, size_of(k1_tmp))
|
||||
|
||||
sks[0] = k0
|
||||
sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01))
|
||||
sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02))
|
||||
sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04))
|
||||
sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08))
|
||||
sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10))
|
||||
sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20))
|
||||
sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40))
|
||||
sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80))
|
||||
num_rounds = _aes.ROUNDS_192
|
||||
|
||||
case _aes.KEY_SIZE_256:
|
||||
sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
|
||||
sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
|
||||
sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01))
|
||||
sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01))
|
||||
sks[4] = expand_step128(sks[2], aeskeygenassist(sks[3], 0x02))
|
||||
sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02))
|
||||
sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04))
|
||||
sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04))
|
||||
sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08))
|
||||
sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08))
|
||||
sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10))
|
||||
sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10))
|
||||
sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20))
|
||||
sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20))
|
||||
sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40))
|
||||
num_rounds = _aes.ROUNDS_256
|
||||
case:
|
||||
panic("crypto/aes: invalid AES key size")
|
||||
}
|
||||
for i in 0 ..= num_rounds {
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i])
|
||||
}
|
||||
|
||||
// Compute the decryption keys. GCM and CTR do not need this, however
|
||||
// ECB, CBC, OCB3, etc do.
|
||||
derive_dec_keys(ctx, &sks, num_rounds)
|
||||
|
||||
ctx._num_rounds = num_rounds
|
||||
|
||||
crypto.zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
11
core/crypto/_aes/hw/unsupported.odin
Normal file
11
core/crypto/_aes/hw/unsupported.odin
Normal file
@@ -0,0 +1,11 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package aes_hw
|
||||
|
||||
HAS_GHASH :: false
|
||||
|
||||
@(private)
|
||||
keysched :: proc(ctx: ^Context, key: []byte) {
|
||||
panic("crypto/aes: hardware implementation unsupported")
|
||||
}
|
||||
@@ -1,38 +0,0 @@
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "core:sys/info"
|
||||
|
||||
// is_supported returns true iff hardware accelerated AES
|
||||
// is supported.
|
||||
is_supported :: proc "contextless" () -> bool {
|
||||
// Note: Everything with AES-NI and PCLMULQDQ has support for
|
||||
// the required SSE extxtensions.
|
||||
req_features :: info.CPU_Features{
|
||||
.sse2,
|
||||
.ssse3,
|
||||
.sse41,
|
||||
.aes,
|
||||
.pclmulqdq,
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
|
||||
Context :: struct {
|
||||
// Note: The ideal thing to do is for the expanded round keys to be
|
||||
// arrays of `__m128i`, however that implies alignment (or using AVX).
|
||||
//
|
||||
// All the people using e-waste processors that don't support an
|
||||
// insturction set that has been around for over 10 years are why
|
||||
// we can't have nice things.
|
||||
_sk_exp_enc: [15][16]byte,
|
||||
_sk_exp_dec: [15][16]byte,
|
||||
_num_rounds: int,
|
||||
}
|
||||
|
||||
// init initializes a context for AES with the provided key.
|
||||
init :: proc(ctx: ^Context, key: []byte) {
|
||||
keysched(ctx, key)
|
||||
}
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel AES-NI based implementation. Inspiration taken from BearSSL.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performat otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xff)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse,sse2")
|
||||
expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
tmp := k2
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
|
||||
r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
|
||||
|
||||
return r1, r2
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
return k1
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xaa)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "aes")
|
||||
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
|
||||
for i in 1 ..< num_rounds {
|
||||
tmp := x86._mm_aesimc_si128(sks[i])
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
|
||||
}
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse,sse2,aes")
|
||||
keysched :: proc(ctx: ^Context, key: []byte) {
|
||||
sks: [15]x86.__m128i = ---
|
||||
|
||||
// Compute the encryption keys.
|
||||
num_rounds, key_len := 0, len(key)
|
||||
switch key_len {
|
||||
case _aes.KEY_SIZE_128:
|
||||
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
|
||||
sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
|
||||
sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
|
||||
sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
|
||||
sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
|
||||
sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
|
||||
sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
|
||||
sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
|
||||
sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
|
||||
sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
|
||||
num_rounds = _aes.ROUNDS_128
|
||||
case _aes.KEY_SIZE_192:
|
||||
k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
k1 := x86.__m128i{
|
||||
intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
|
||||
0,
|
||||
}
|
||||
sks[0] = k0
|
||||
sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
|
||||
sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
|
||||
sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
|
||||
sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
|
||||
sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
|
||||
sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
|
||||
sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
|
||||
sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
|
||||
num_rounds = _aes.ROUNDS_192
|
||||
case _aes.KEY_SIZE_256:
|
||||
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
|
||||
sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
|
||||
sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
|
||||
sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
|
||||
sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
|
||||
sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
|
||||
sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
|
||||
sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
|
||||
sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
|
||||
sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
|
||||
sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
|
||||
sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
|
||||
sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
|
||||
sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
|
||||
num_rounds = _aes.ROUNDS_256
|
||||
case:
|
||||
panic("crypto/aes: invalid AES key size")
|
||||
}
|
||||
for i in 0 ..= num_rounds {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
|
||||
}
|
||||
|
||||
// Compute the decryption keys. GCM and CTR do not need this, however
|
||||
// ECB, CBC, OCB3, etc do.
|
||||
derive_dec_keys(ctx, &sks, num_rounds)
|
||||
|
||||
ctx._num_rounds = num_rounds
|
||||
|
||||
zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
|
||||
/*
|
||||
Set each byte of a memory range to zero.
|
||||
|
||||
This procedure copies the value `0` into the `len` bytes of a memory range,
|
||||
starting at address `data`.
|
||||
|
||||
This procedure returns the pointer to `data`.
|
||||
|
||||
Unlike the `zero()` procedure, which can be optimized away or reordered by the
|
||||
compiler under certain circumstances, `zero_explicit()` procedure can not be
|
||||
optimized away or reordered with other memory access operations, and the
|
||||
compiler assumes volatile semantics of the memory.
|
||||
*/
|
||||
zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr {
|
||||
// This routine tries to avoid the compiler optimizing away the call,
|
||||
// so that it is always executed. It is intended to provide
|
||||
// equivalent semantics to those provided by the C11 Annex K 3.7.4.1
|
||||
// memset_s call.
|
||||
intrinsics.mem_zero_volatile(data, len) // Use the volatile mem_zero
|
||||
intrinsics.atomic_thread_fence(.Seq_Cst) // Prevent reordering
|
||||
return data
|
||||
}
|
||||
@@ -215,7 +215,7 @@ _store_simd128 :: #force_inline proc "contextless" (
|
||||
intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3)
|
||||
}
|
||||
|
||||
// is_performant returns true iff the target and current host both support
|
||||
// is_performant returns true if and only if (⟺) the target and current host both support
|
||||
// "enough" 128-bit SIMD to make this implementation performant.
|
||||
is_performant :: proc "contextless" () -> bool {
|
||||
when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 {
|
||||
|
||||
@@ -36,7 +36,7 @@ _VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0}
|
||||
@(private = "file")
|
||||
_VEC_TWO: simd.u64x4 : {2, 0, 2, 0}
|
||||
|
||||
// is_performant returns true iff the target and current host both support
|
||||
// is_performant returns true if and only if (⟺) the target and current host both support
|
||||
// "enough" SIMD to make this implementation performant.
|
||||
is_performant :: proc "contextless" () -> bool {
|
||||
req_features :: info.CPU_Features{.avx, .avx2}
|
||||
|
||||
@@ -69,7 +69,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
|
||||
tmp: Montgomery_Domain_Field_Element = ---
|
||||
fe_sub(&tmp, arg1, arg2)
|
||||
|
||||
// This will only underflow iff arg1 == arg2, and we return the borrow,
|
||||
// This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
|
||||
// which will be 1.
|
||||
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
|
||||
tmp: Montgomery_Domain_Field_Element = ---
|
||||
fe_sub(&tmp, arg1, arg2)
|
||||
|
||||
// This will only underflow iff arg1 == arg2, and we return the borrow,
|
||||
// This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
|
||||
// which will be 1.
|
||||
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))
|
||||
|
||||
|
||||
@@ -5,17 +5,17 @@ package _subtle
|
||||
|
||||
import "core:math/bits"
|
||||
|
||||
// byte_eq returns 1 iff a == b, 0 otherwise.
|
||||
// byte_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
|
||||
@(optimization_mode="none")
|
||||
byte_eq :: proc "contextless" (a, b: byte) -> int {
|
||||
v := a ~ b
|
||||
|
||||
// v == 0 iff a == b. The subtraction will underflow, setting the
|
||||
// v == 0 if and only if (⟺) a == b. The subtraction will underflow, setting the
|
||||
// sign bit, which will get returned.
|
||||
return int((u32(v)-1) >> 31)
|
||||
}
|
||||
|
||||
// u64_eq returns 1 iff a == b, 0 otherwise.
|
||||
// u64_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
|
||||
@(optimization_mode="none")
|
||||
u64_eq :: proc "contextless" (a, b: u64) -> u64 {
|
||||
_, borrow := bits.sub_u64(0, a ~ b, 0)
|
||||
@@ -27,14 +27,14 @@ eq :: proc {
|
||||
u64_eq,
|
||||
}
|
||||
|
||||
// u64_is_zero returns 1 iff a == 0, 0 otherwise.
|
||||
// u64_is_zero returns 1 if and only if (⟺) a == 0, 0 otherwise.
|
||||
@(optimization_mode="none")
|
||||
u64_is_zero :: proc "contextless" (a: u64) -> u64 {
|
||||
_, borrow := bits.sub_u64(a, 1, 0)
|
||||
return borrow
|
||||
}
|
||||
|
||||
// u64_is_non_zero returns 1 iff a != 0, 0 otherwise.
|
||||
// u64_is_non_zero returns 1 if and only if (⟺) a != 0, 0 otherwise.
|
||||
@(optimization_mode="none")
|
||||
u64_is_non_zero :: proc "contextless" (a: u64) -> u64 {
|
||||
is_zero := u64_is_zero(a)
|
||||
|
||||
@@ -13,7 +13,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte,
|
||||
|
||||
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided algorithm, key, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and ciphertext MUST alias exactly or not at all.
|
||||
|
||||
@@ -183,7 +183,7 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
@@ -144,7 +144,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
397
core/crypto/aegis/aegis_impl_hw.odin
Normal file
397
core/crypto/aegis/aegis_impl_hw.odin
Normal file
@@ -0,0 +1,397 @@
|
||||
#+build amd64,arm32
|
||||
package aegis
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
import "core:simd"
|
||||
|
||||
@(private)
|
||||
State_HW :: struct {
|
||||
s0: simd.u8x16,
|
||||
s1: simd.u8x16,
|
||||
s2: simd.u8x16,
|
||||
s3: simd.u8x16,
|
||||
s4: simd.u8x16,
|
||||
s5: simd.u8x16,
|
||||
s6: simd.u8x16,
|
||||
s7: simd.u8x16,
|
||||
rate: int,
|
||||
}
|
||||
|
||||
when ODIN_ARCH == .amd64 {
|
||||
@(private="file")
|
||||
TARGET_FEATURES :: "sse2,aes"
|
||||
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
|
||||
@(private="file")
|
||||
TARGET_FEATURES :: "neon,aes"
|
||||
}
|
||||
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware
|
||||
// accelerated AEGIS is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return aes_hw.is_supported()
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = TARGET_FEATURES)
|
||||
init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
|
||||
switch ctx._key_len {
|
||||
case KEY_SIZE_128L:
|
||||
key := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
|
||||
iv := intrinsics.unaligned_load((^simd.u8x16)(raw_data(iv)))
|
||||
|
||||
st.s0 = simd.bit_xor(key, iv)
|
||||
st.s1 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))
|
||||
st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
|
||||
st.s3 = st.s1
|
||||
st.s4 = st.s0
|
||||
st.s5 = simd.bit_xor(key, st.s2) // key ^ C0
|
||||
st.s6 = simd.bit_xor(key, st.s1) // key ^ C1
|
||||
st.s7 = st.s5
|
||||
st.rate = _RATE_128L
|
||||
|
||||
for _ in 0 ..< 10 {
|
||||
update_hw_128l(st, iv, key)
|
||||
}
|
||||
case KEY_SIZE_256:
|
||||
k0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
|
||||
k1 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[16]))
|
||||
n0 := intrinsics.unaligned_load((^simd.u8x16)(&iv[0]))
|
||||
n1 := intrinsics.unaligned_load((^simd.u8x16)(&iv[16]))
|
||||
|
||||
st.s0 = simd.bit_xor(k0, n0)
|
||||
st.s1 = simd.bit_xor(k1, n1)
|
||||
st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))
|
||||
st.s3 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
|
||||
st.s4 = simd.bit_xor(k0, st.s3) // k0 ^ C0
|
||||
st.s5 = simd.bit_xor(k1, st.s2) // k1 ^ C1
|
||||
st.rate = _RATE_256
|
||||
|
||||
u0, u1 := st.s0, st.s1
|
||||
for _ in 0 ..< 4 {
|
||||
update_hw_256(st, k0)
|
||||
update_hw_256(st, k1)
|
||||
update_hw_256(st, u0)
|
||||
update_hw_256(st, u1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: simd.u8x16) {
|
||||
s0_ := aes_hw.aesenc(st.s7, simd.bit_xor(st.s0, m0))
|
||||
s1_ := aes_hw.aesenc(st.s0, st.s1)
|
||||
s2_ := aes_hw.aesenc(st.s1, st.s2)
|
||||
s3_ := aes_hw.aesenc(st.s2, st.s3)
|
||||
s4_ := aes_hw.aesenc(st.s3, simd.bit_xor(st.s4, m1))
|
||||
s5_ := aes_hw.aesenc(st.s4, st.s5)
|
||||
s6_ := aes_hw.aesenc(st.s5, st.s6)
|
||||
s7_ := aes_hw.aesenc(st.s6, st.s7)
|
||||
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: simd.u8x16) {
|
||||
s0_ := aes_hw.aesenc(st.s5, simd.bit_xor(st.s0, m))
|
||||
s1_ := aes_hw.aesenc(st.s0, st.s1)
|
||||
s2_ := aes_hw.aesenc(st.s1, st.s2)
|
||||
s3_ := aes_hw.aesenc(st.s2, st.s3)
|
||||
s4_ := aes_hw.aesenc(st.s3, st.s4)
|
||||
s5_ := aes_hw.aesenc(st.s4, st.s5)
|
||||
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
|
||||
t0 := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
|
||||
t1 := intrinsics.unaligned_load((^simd.u8x16)(&ai[16]))
|
||||
update_hw_128l(st, t0, t1)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
|
||||
m := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
|
||||
update_hw_256(st, m)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = TARGET_FEATURES)
|
||||
absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
|
||||
ai, l := aad, len(aad)
|
||||
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
for l >= _RATE_128L {
|
||||
absorb_hw_128l(st, ai)
|
||||
ai = ai[_RATE_128L:]
|
||||
l -= _RATE_128L
|
||||
}
|
||||
case _RATE_256:
|
||||
for l >= _RATE_256 {
|
||||
absorb_hw_256(st, ai)
|
||||
|
||||
ai = ai[_RATE_256:]
|
||||
l -= _RATE_256
|
||||
}
|
||||
}
|
||||
|
||||
// Pad out the remainder with `0`s till it is rate sized.
|
||||
if l > 0 {
|
||||
tmp: [_RATE_MAX]byte // AAD is not confidential.
|
||||
copy(tmp[:], ai)
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
absorb_hw_128l(st, tmp[:])
|
||||
case _RATE_256:
|
||||
absorb_hw_256(st, tmp[:])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (simd.u8x16, simd.u8x16) {
|
||||
z0 := simd.bit_xor(
|
||||
st.s6,
|
||||
simd.bit_xor(
|
||||
st.s1,
|
||||
simd.bit_and(st.s2, st.s3),
|
||||
),
|
||||
)
|
||||
z1 := simd.bit_xor(
|
||||
st.s2,
|
||||
simd.bit_xor(
|
||||
st.s5,
|
||||
simd.bit_and(st.s6, st.s7),
|
||||
),
|
||||
)
|
||||
return z0, z1
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> simd.u8x16 {
|
||||
return simd.bit_xor(
|
||||
st.s1,
|
||||
simd.bit_xor(
|
||||
st.s4,
|
||||
simd.bit_xor(
|
||||
st.s5,
|
||||
simd.bit_and(st.s2, st.s3),
|
||||
),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
|
||||
z0, z1 := z_hw_128l(st)
|
||||
|
||||
t0 := intrinsics.unaligned_load((^simd.u8x16)(&xi[0]))
|
||||
t1 := intrinsics.unaligned_load((^simd.u8x16)(&xi[16]))
|
||||
update_hw_128l(st, t0, t1)
|
||||
|
||||
out0 := simd.bit_xor(t0, z0)
|
||||
out1 := simd.bit_xor(t1, z1)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&ci[0]), out0)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&ci[16]), out1)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
|
||||
z := z_hw_256(st)
|
||||
|
||||
xi_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(xi)))
|
||||
update_hw_256(st, xi_)
|
||||
|
||||
ci_ := simd.bit_xor(xi_, z)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(ci)), ci_)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = TARGET_FEATURES)
|
||||
enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
|
||||
ci, xi, l := dst, src, len(src)
|
||||
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
for l >= _RATE_128L {
|
||||
enc_hw_128l(st, ci, xi)
|
||||
ci = ci[_RATE_128L:]
|
||||
xi = xi[_RATE_128L:]
|
||||
l -= _RATE_128L
|
||||
}
|
||||
case _RATE_256:
|
||||
for l >= _RATE_256 {
|
||||
enc_hw_256(st, ci, xi)
|
||||
ci = ci[_RATE_256:]
|
||||
xi = xi[_RATE_256:]
|
||||
l -= _RATE_256
|
||||
}
|
||||
}
|
||||
|
||||
// Pad out the remainder with `0`s till it is rate sized.
|
||||
if l > 0 {
|
||||
tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
|
||||
copy(tmp[:], xi)
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
enc_hw_128l(st, tmp[:], tmp[:])
|
||||
case _RATE_256:
|
||||
enc_hw_256(st, tmp[:], tmp[:])
|
||||
}
|
||||
copy(ci, tmp[:l])
|
||||
}
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
|
||||
z0, z1 := z_hw_128l(st)
|
||||
|
||||
t0 := intrinsics.unaligned_load((^simd.u8x16)(&ci[0]))
|
||||
t1 := intrinsics.unaligned_load((^simd.u8x16)(&ci[16]))
|
||||
out0 := simd.bit_xor(t0, z0)
|
||||
out1 := simd.bit_xor(t1, z1)
|
||||
|
||||
update_hw_128l(st, out0, out1)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&xi[0]), out0)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&xi[16]), out1)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
|
||||
z := z_hw_256(st)
|
||||
|
||||
ci_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(ci)))
|
||||
xi_ := simd.bit_xor(ci_, z)
|
||||
|
||||
update_hw_256(st, xi_)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(xi)), xi_)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
|
||||
tmp: [_RATE_128L]byte
|
||||
defer crypto.zero_explicit(&tmp, size_of(tmp))
|
||||
|
||||
z0, z1 := z_hw_128l(st)
|
||||
copy(tmp[:], cn)
|
||||
|
||||
t0 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
|
||||
t1 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[16]))
|
||||
out0 := simd.bit_xor(t0, z0)
|
||||
out1 := simd.bit_xor(t1, z1)
|
||||
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), out0)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&tmp[16]), out1)
|
||||
copy(xn, tmp[:])
|
||||
|
||||
for off := len(xn); off < _RATE_128L; off += 1 {
|
||||
tmp[off] = 0
|
||||
}
|
||||
out0 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) // v0
|
||||
out1 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[16])) // v1
|
||||
update_hw_128l(st, out0, out1)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
|
||||
dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
|
||||
tmp: [_RATE_256]byte
|
||||
defer crypto.zero_explicit(&tmp, size_of(tmp))
|
||||
|
||||
z := z_hw_256(st)
|
||||
copy(tmp[:], cn)
|
||||
|
||||
cn_ := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
|
||||
xn_ := simd.bit_xor(cn_, z)
|
||||
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), xn_)
|
||||
copy(xn, tmp[:])
|
||||
|
||||
for off := len(xn); off < _RATE_256; off += 1 {
|
||||
tmp[off] = 0
|
||||
}
|
||||
xn_ = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
|
||||
update_hw_256(st, xn_)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = TARGET_FEATURES)
|
||||
dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
|
||||
xi, ci, l := dst, src, len(src)
|
||||
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
for l >= _RATE_128L {
|
||||
dec_hw_128l(st, xi, ci)
|
||||
xi = xi[_RATE_128L:]
|
||||
ci = ci[_RATE_128L:]
|
||||
l -= _RATE_128L
|
||||
}
|
||||
case _RATE_256:
|
||||
for l >= _RATE_256 {
|
||||
dec_hw_256(st, xi, ci)
|
||||
xi = xi[_RATE_256:]
|
||||
ci = ci[_RATE_256:]
|
||||
l -= _RATE_256
|
||||
}
|
||||
}
|
||||
|
||||
// Process the remainder.
|
||||
if l > 0 {
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
dec_partial_hw_128l(st, xi, ci)
|
||||
case _RATE_256:
|
||||
dec_partial_hw_256(st, xi, ci)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = TARGET_FEATURES)
|
||||
finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
|
||||
tmp: [16]byte
|
||||
endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
|
||||
endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
|
||||
|
||||
t := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
|
||||
|
||||
t0, t1: simd.u8x16 = ---, ---
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
t = simd.bit_xor(st.s2, t)
|
||||
for _ in 0 ..< 7 {
|
||||
update_hw_128l(st, t, t)
|
||||
}
|
||||
|
||||
t0 = simd.bit_xor(st.s0, st.s1)
|
||||
t0 = simd.bit_xor(t0, st.s2)
|
||||
t0 = simd.bit_xor(t0, st.s3)
|
||||
|
||||
t1 = simd.bit_xor(st.s4, st.s5)
|
||||
t1 = simd.bit_xor(t1, st.s6)
|
||||
if len(tag) == TAG_SIZE_256 {
|
||||
t1 = simd.bit_xor(t1, st.s7)
|
||||
}
|
||||
case _RATE_256:
|
||||
t = simd.bit_xor(st.s3, t)
|
||||
for _ in 0 ..< 7 {
|
||||
update_hw_256(st, t)
|
||||
}
|
||||
|
||||
t0 = simd.bit_xor(st.s0, st.s1)
|
||||
t0 = simd.bit_xor(t0, st.s2)
|
||||
|
||||
t1 = simd.bit_xor(st.s3, st.s4)
|
||||
t1 = simd.bit_xor(t1, st.s5)
|
||||
}
|
||||
switch len(tag) {
|
||||
case TAG_SIZE_128:
|
||||
t0 = simd.bit_xor(t0, t1)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0)
|
||||
case TAG_SIZE_256:
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(&tag[16]), t1)
|
||||
}
|
||||
}
|
||||
|
||||
@(private)
|
||||
reset_state_hw :: proc "contextless" (st: ^State_HW) {
|
||||
crypto.zero_explicit(st, size_of(st^))
|
||||
}
|
||||
@@ -1,4 +1,6 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package aegis
|
||||
|
||||
@(private = "file")
|
||||
@@ -7,7 +9,7 @@ ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported"
|
||||
@(private)
|
||||
State_HW :: struct {}
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AEGIS
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return false
|
||||
|
||||
@@ -1,389 +0,0 @@
|
||||
#+build amd64
|
||||
package aegis
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/aes"
|
||||
import "core:encoding/endian"
|
||||
import "core:simd/x86"
|
||||
|
||||
@(private)
|
||||
State_HW :: struct {
|
||||
s0: x86.__m128i,
|
||||
s1: x86.__m128i,
|
||||
s2: x86.__m128i,
|
||||
s3: x86.__m128i,
|
||||
s4: x86.__m128i,
|
||||
s5: x86.__m128i,
|
||||
s6: x86.__m128i,
|
||||
s7: x86.__m128i,
|
||||
rate: int,
|
||||
}
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AEGIS
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return aes.is_hardware_accelerated()
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
|
||||
switch ctx._key_len {
|
||||
case KEY_SIZE_128L:
|
||||
key := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
|
||||
iv := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv)))
|
||||
|
||||
st.s0 = x86._mm_xor_si128(key, iv)
|
||||
st.s1 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
|
||||
st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
|
||||
st.s3 = st.s1
|
||||
st.s4 = st.s0
|
||||
st.s5 = x86._mm_xor_si128(key, st.s2) // key ^ C0
|
||||
st.s6 = x86._mm_xor_si128(key, st.s1) // key ^ C1
|
||||
st.s7 = st.s5
|
||||
st.rate = _RATE_128L
|
||||
|
||||
for _ in 0 ..< 10 {
|
||||
update_hw_128l(st, iv, key)
|
||||
}
|
||||
case KEY_SIZE_256:
|
||||
k0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
|
||||
k1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16]))
|
||||
n0 := intrinsics.unaligned_load((^x86.__m128i)(&iv[0]))
|
||||
n1 := intrinsics.unaligned_load((^x86.__m128i)(&iv[16]))
|
||||
|
||||
st.s0 = x86._mm_xor_si128(k0, n0)
|
||||
st.s1 = x86._mm_xor_si128(k1, n1)
|
||||
st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
|
||||
st.s3 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
|
||||
st.s4 = x86._mm_xor_si128(k0, st.s3) // k0 ^ C0
|
||||
st.s5 = x86._mm_xor_si128(k1, st.s2) // k1 ^ C1
|
||||
st.rate = _RATE_256
|
||||
|
||||
u0, u1 := st.s0, st.s1
|
||||
for _ in 0 ..< 4 {
|
||||
update_hw_256(st, k0)
|
||||
update_hw_256(st, k1)
|
||||
update_hw_256(st, u0)
|
||||
update_hw_256(st, u1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) {
|
||||
s0_ := x86._mm_aesenc_si128(st.s7, x86._mm_xor_si128(st.s0, m0))
|
||||
s1_ := x86._mm_aesenc_si128(st.s0, st.s1)
|
||||
s2_ := x86._mm_aesenc_si128(st.s1, st.s2)
|
||||
s3_ := x86._mm_aesenc_si128(st.s2, st.s3)
|
||||
s4_ := x86._mm_aesenc_si128(st.s3, x86._mm_xor_si128(st.s4, m1))
|
||||
s5_ := x86._mm_aesenc_si128(st.s4, st.s5)
|
||||
s6_ := x86._mm_aesenc_si128(st.s5, st.s6)
|
||||
s7_ := x86._mm_aesenc_si128(st.s6, st.s7)
|
||||
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) {
|
||||
s0_ := x86._mm_aesenc_si128(st.s5, x86._mm_xor_si128(st.s0, m))
|
||||
s1_ := x86._mm_aesenc_si128(st.s0, st.s1)
|
||||
s2_ := x86._mm_aesenc_si128(st.s1, st.s2)
|
||||
s3_ := x86._mm_aesenc_si128(st.s2, st.s3)
|
||||
s4_ := x86._mm_aesenc_si128(st.s3, st.s4)
|
||||
s5_ := x86._mm_aesenc_si128(st.s4, st.s5)
|
||||
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
|
||||
t0 := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
|
||||
t1 := intrinsics.unaligned_load((^x86.__m128i)(&ai[16]))
|
||||
update_hw_128l(st, t0, t1)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
|
||||
m := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
|
||||
update_hw_256(st, m)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
|
||||
ai, l := aad, len(aad)
|
||||
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
for l >= _RATE_128L {
|
||||
absorb_hw_128l(st, ai)
|
||||
ai = ai[_RATE_128L:]
|
||||
l -= _RATE_128L
|
||||
}
|
||||
case _RATE_256:
|
||||
for l >= _RATE_256 {
|
||||
absorb_hw_256(st, ai)
|
||||
|
||||
ai = ai[_RATE_256:]
|
||||
l -= _RATE_256
|
||||
}
|
||||
}
|
||||
|
||||
// Pad out the remainder with `0`s till it is rate sized.
|
||||
if l > 0 {
|
||||
tmp: [_RATE_MAX]byte // AAD is not confidential.
|
||||
copy(tmp[:], ai)
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
absorb_hw_128l(st, tmp[:])
|
||||
case _RATE_256:
|
||||
absorb_hw_256(st, tmp[:])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2", require_results)
|
||||
z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) {
|
||||
z0 := x86._mm_xor_si128(
|
||||
st.s6,
|
||||
x86._mm_xor_si128(
|
||||
st.s1,
|
||||
x86._mm_and_si128(st.s2, st.s3),
|
||||
),
|
||||
)
|
||||
z1 := x86._mm_xor_si128(
|
||||
st.s2,
|
||||
x86._mm_xor_si128(
|
||||
st.s5,
|
||||
x86._mm_and_si128(st.s6, st.s7),
|
||||
),
|
||||
)
|
||||
return z0, z1
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2", require_results)
|
||||
z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i {
|
||||
return x86._mm_xor_si128(
|
||||
st.s1,
|
||||
x86._mm_xor_si128(
|
||||
st.s4,
|
||||
x86._mm_xor_si128(
|
||||
st.s5,
|
||||
x86._mm_and_si128(st.s2, st.s3),
|
||||
),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
|
||||
z0, z1 := z_hw_128l(st)
|
||||
|
||||
t0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0]))
|
||||
t1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16]))
|
||||
update_hw_128l(st, t0, t1)
|
||||
|
||||
out0 := x86._mm_xor_si128(t0, z0)
|
||||
out1 := x86._mm_xor_si128(t1, z1)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), out0)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), out1)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
|
||||
z := z_hw_256(st)
|
||||
|
||||
xi_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi)))
|
||||
update_hw_256(st, xi_)
|
||||
|
||||
ci_ := x86._mm_xor_si128(xi_, z)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ci_)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
|
||||
ci, xi, l := dst, src, len(src)
|
||||
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
for l >= _RATE_128L {
|
||||
enc_hw_128l(st, ci, xi)
|
||||
ci = ci[_RATE_128L:]
|
||||
xi = xi[_RATE_128L:]
|
||||
l -= _RATE_128L
|
||||
}
|
||||
case _RATE_256:
|
||||
for l >= _RATE_256 {
|
||||
enc_hw_256(st, ci, xi)
|
||||
ci = ci[_RATE_256:]
|
||||
xi = xi[_RATE_256:]
|
||||
l -= _RATE_256
|
||||
}
|
||||
}
|
||||
|
||||
// Pad out the remainder with `0`s till it is rate sized.
|
||||
if l > 0 {
|
||||
tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
|
||||
copy(tmp[:], xi)
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
enc_hw_128l(st, tmp[:], tmp[:])
|
||||
case _RATE_256:
|
||||
enc_hw_256(st, tmp[:], tmp[:])
|
||||
}
|
||||
copy(ci, tmp[:l])
|
||||
}
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
|
||||
z0, z1 := z_hw_128l(st)
|
||||
|
||||
t0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0]))
|
||||
t1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16]))
|
||||
out0 := x86._mm_xor_si128(t0, z0)
|
||||
out1 := x86._mm_xor_si128(t1, z1)
|
||||
|
||||
update_hw_128l(st, out0, out1)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), out0)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), out1)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
|
||||
z := z_hw_256(st)
|
||||
|
||||
ci_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci)))
|
||||
xi_ := x86._mm_xor_si128(ci_, z)
|
||||
|
||||
update_hw_256(st, xi_)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), xi_)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
|
||||
tmp: [_RATE_128L]byte
|
||||
defer crypto.zero_explicit(&tmp, size_of(tmp))
|
||||
|
||||
z0, z1 := z_hw_128l(st)
|
||||
copy(tmp[:], cn)
|
||||
|
||||
t0 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
|
||||
t1 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[16]))
|
||||
out0 := x86._mm_xor_si128(t0, z0)
|
||||
out1 := x86._mm_xor_si128(t1, z1)
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), out0)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&tmp[16]), out1)
|
||||
copy(xn, tmp[:])
|
||||
|
||||
for off := len(xn); off < _RATE_128L; off += 1 {
|
||||
tmp[off] = 0
|
||||
}
|
||||
out0 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) // v0
|
||||
out1 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) // v1
|
||||
update_hw_128l(st, out0, out1)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,aes")
|
||||
dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
|
||||
tmp: [_RATE_256]byte
|
||||
defer crypto.zero_explicit(&tmp, size_of(tmp))
|
||||
|
||||
z := z_hw_256(st)
|
||||
copy(tmp[:], cn)
|
||||
|
||||
cn_ := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
|
||||
xn_ := x86._mm_xor_si128(cn_, z)
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), xn_)
|
||||
copy(xn, tmp[:])
|
||||
|
||||
for off := len(xn); off < _RATE_256; off += 1 {
|
||||
tmp[off] = 0
|
||||
}
|
||||
xn_ = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
|
||||
update_hw_256(st, xn_)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
|
||||
xi, ci, l := dst, src, len(src)
|
||||
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
for l >= _RATE_128L {
|
||||
dec_hw_128l(st, xi, ci)
|
||||
xi = xi[_RATE_128L:]
|
||||
ci = ci[_RATE_128L:]
|
||||
l -= _RATE_128L
|
||||
}
|
||||
case _RATE_256:
|
||||
for l >= _RATE_256 {
|
||||
dec_hw_256(st, xi, ci)
|
||||
xi = xi[_RATE_256:]
|
||||
ci = ci[_RATE_256:]
|
||||
l -= _RATE_256
|
||||
}
|
||||
}
|
||||
|
||||
// Process the remainder.
|
||||
if l > 0 {
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
dec_partial_hw_128l(st, xi, ci)
|
||||
case _RATE_256:
|
||||
dec_partial_hw_256(st, xi, ci)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
|
||||
tmp: [16]byte
|
||||
endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
|
||||
endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
|
||||
|
||||
t := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
|
||||
|
||||
t0, t1: x86.__m128i = ---, ---
|
||||
switch st.rate {
|
||||
case _RATE_128L:
|
||||
t = x86._mm_xor_si128(st.s2, t)
|
||||
for _ in 0 ..< 7 {
|
||||
update_hw_128l(st, t, t)
|
||||
}
|
||||
|
||||
t0 = x86._mm_xor_si128(st.s0, st.s1)
|
||||
t0 = x86._mm_xor_si128(t0, st.s2)
|
||||
t0 = x86._mm_xor_si128(t0, st.s3)
|
||||
|
||||
t1 = x86._mm_xor_si128(st.s4, st.s5)
|
||||
t1 = x86._mm_xor_si128(t1, st.s6)
|
||||
if len(tag) == TAG_SIZE_256 {
|
||||
t1 = x86._mm_xor_si128(t1, st.s7)
|
||||
}
|
||||
case _RATE_256:
|
||||
t = x86._mm_xor_si128(st.s3, t)
|
||||
for _ in 0 ..< 7 {
|
||||
update_hw_256(st, t)
|
||||
}
|
||||
|
||||
t0 = x86._mm_xor_si128(st.s0, st.s1)
|
||||
t0 = x86._mm_xor_si128(t0, st.s2)
|
||||
|
||||
t1 = x86._mm_xor_si128(st.s3, st.s4)
|
||||
t1 = x86._mm_xor_si128(t1, st.s5)
|
||||
}
|
||||
switch len(tag) {
|
||||
case TAG_SIZE_128:
|
||||
t0 = x86._mm_xor_si128(t0, t1)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0)
|
||||
case TAG_SIZE_256:
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), t1)
|
||||
}
|
||||
}
|
||||
|
||||
@(private)
|
||||
reset_state_hw :: proc "contextless" (st: ^State_HW) {
|
||||
crypto.zero_explicit(st, size_of(st^))
|
||||
}
|
||||
@@ -1,30 +1,32 @@
|
||||
#+build amd64
|
||||
#+build amd64,arm32
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
import "core:math/bits"
|
||||
import "core:simd/x86"
|
||||
import "core:simd"
|
||||
|
||||
@(private)
|
||||
CTR_STRIDE_HW :: 4
|
||||
@(private)
|
||||
CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
|
||||
hw_ctx := ctx._impl.(Context_Impl_Hardware)
|
||||
|
||||
sks: [15]x86.__m128i = ---
|
||||
sks: [15]simd.u8x16 = ---
|
||||
for i in 0 ..= hw_ctx._num_rounds {
|
||||
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
|
||||
sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&hw_ctx._sk_exp_enc[i]))
|
||||
}
|
||||
|
||||
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
|
||||
ret := x86.__m128i{
|
||||
i64(intrinsics.byte_swap(hi)),
|
||||
i64(intrinsics.byte_swap(lo)),
|
||||
}
|
||||
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (simd.u8x16, u64, u64) {
|
||||
buf: [BLOCK_SIZE]byte = ---
|
||||
endian.unchecked_put_u64be(buf[0:], hi)
|
||||
endian.unchecked_put_u64be(buf[8:], lo)
|
||||
ret := intrinsics.unaligned_load((^simd.u8x16)(&buf))
|
||||
|
||||
hi, lo := hi, lo
|
||||
carry: u64
|
||||
@@ -46,42 +48,42 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
|
||||
nr_blocks := nr_blocks
|
||||
ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo
|
||||
|
||||
blks: [CTR_STRIDE_HW]x86.__m128i = ---
|
||||
blks: [CTR_STRIDE_HW]simd.u8x16 = ---
|
||||
for nr_blocks >= CTR_STRIDE_HW {
|
||||
#unroll for i in 0..< CTR_STRIDE_HW {
|
||||
blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
|
||||
blks[i] = simd.bit_xor(blks[i], sks[0])
|
||||
}
|
||||
#unroll for i in 1 ..= 9 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
switch hw_ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[10])
|
||||
}
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[12])
|
||||
}
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[14])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,23 +100,23 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
|
||||
for nr_blocks > 0 {
|
||||
blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
|
||||
|
||||
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
|
||||
blks[0] = simd.bit_xor(blks[0], sks[0])
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
switch hw_ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[10])
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[12])
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[14])
|
||||
}
|
||||
|
||||
xor_blocks_hw(dst, src, blks[:1])
|
||||
@@ -133,18 +135,18 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
|
||||
zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2")
|
||||
xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) {
|
||||
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
xor_blocks_hw :: proc(dst, src: []byte, blocks: []simd.u8x16) {
|
||||
#no_bounds_check {
|
||||
if src != nil {
|
||||
for i in 0 ..< len(blocks) {
|
||||
off := i * BLOCK_SIZE
|
||||
tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:])))
|
||||
blocks[i] = x86._mm_xor_si128(blocks[i], tmp)
|
||||
tmp := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[off:])))
|
||||
blocks[i] = simd.bit_xor(blocks[i], tmp)
|
||||
}
|
||||
}
|
||||
for i in 0 ..< len(blocks) {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
59
core/crypto/aes/aes_ecb_hw.odin
Normal file
59
core/crypto/aes/aes_ecb_hw.odin
Normal file
@@ -0,0 +1,59 @@
|
||||
#+build amd64,arm32
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:simd"
|
||||
|
||||
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
|
||||
blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))
|
||||
|
||||
blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[0])))
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[10])))
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[12])))
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[14])))
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
|
||||
blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))
|
||||
|
||||
blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[0])))
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[10])))
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[12])))
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[14])))
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk)
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
#+build amd64
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd/x86"
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
|
||||
blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
|
||||
|
||||
blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0])))
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10])))
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12])))
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14])))
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
|
||||
blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
|
||||
|
||||
blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0])))
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10])))
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12])))
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14])))
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import "core:bytes"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:crypto/_aes/ct64"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
|
||||
// GCM_IV_SIZE is the default size of the GCM IV in bytes.
|
||||
@@ -26,6 +27,10 @@ Context_GCM :: struct {
|
||||
|
||||
// init_gcm initializes a Context_GCM with the provided key.
|
||||
init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
|
||||
when aes_hw.HAS_GHASH {
|
||||
impl := aes_hw.is_ghash_supported() ? impl : .Portable
|
||||
|
||||
}
|
||||
init_impl(&ctx._impl, key, impl)
|
||||
ctx._is_initialized = true
|
||||
}
|
||||
@@ -65,7 +70,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context_GCM, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
#+build amd64
|
||||
#+build amd64,arm32
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
@(require) import "core:crypto/_aes/ct64"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
import "core:simd/x86"
|
||||
import "core:simd"
|
||||
|
||||
@(private)
|
||||
gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
|
||||
@@ -17,7 +18,11 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
|
||||
|
||||
// Note: Our GHASH implementation handles appending padding.
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], aad)
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], aad)
|
||||
}
|
||||
gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
|
||||
copy(tag, s[:])
|
||||
@@ -35,7 +40,11 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag:
|
||||
s: [_aes.GHASH_TAG_SIZE]byte
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], aad)
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], aad)
|
||||
}
|
||||
gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
|
||||
|
||||
@@ -71,18 +80,26 @@ init_ghash_hw :: proc(
|
||||
} else {
|
||||
// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
|
||||
// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
|
||||
hw_intel.ghash(j0[:], h[:], iv)
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(j0[:], h[:], iv)
|
||||
} else {
|
||||
ct64.ghash(j0[:], h[:], iv)
|
||||
}
|
||||
|
||||
tmp: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
|
||||
hw_intel.ghash(j0[:], h[:], tmp[:])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(j0[:], h[:], tmp[:])
|
||||
} else {
|
||||
ct64.ghash(j0[:], h[:], tmp[:])
|
||||
}
|
||||
}
|
||||
|
||||
// ECB encrypt j0, so that we can just XOR with the tag.
|
||||
encrypt_block_hw(ctx, j0_enc[:], j0[:])
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2")
|
||||
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
final_ghash_hw :: proc(
|
||||
s: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
h: ^[_aes.GHASH_KEY_SIZE]byte,
|
||||
@@ -94,14 +111,18 @@ final_ghash_hw :: proc(
|
||||
endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
|
||||
endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], blk[:])
|
||||
j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
|
||||
s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
|
||||
s_vec = x86._mm_xor_si128(s_vec, j0_vec)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], blk[:])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], blk[:])
|
||||
}
|
||||
j0_vec := intrinsics.unaligned_load((^simd.u8x16)(j0))
|
||||
s_vec := intrinsics.unaligned_load((^simd.u8x16)(s))
|
||||
s_vec = simd.bit_xor(s_vec, j0_vec)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(s), s_vec)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
|
||||
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
gctr_hw :: proc(
|
||||
ctx: ^Context_Impl_Hardware,
|
||||
dst: []byte,
|
||||
@@ -111,13 +132,13 @@ gctr_hw :: proc(
|
||||
iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
is_seal: bool,
|
||||
) #no_bounds_check {
|
||||
sks: [15]x86.__m128i = ---
|
||||
sks: [15]simd.u8x16 = ---
|
||||
for i in 0 ..= ctx._num_rounds {
|
||||
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
|
||||
sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))
|
||||
}
|
||||
|
||||
// Setup the counter block
|
||||
ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv))
|
||||
ctr_blk := intrinsics.unaligned_load((^simd.u8x16)(iv))
|
||||
ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1
|
||||
|
||||
src, dst := src, dst
|
||||
@@ -127,11 +148,15 @@ gctr_hw :: proc(
|
||||
// This results in an unreadable mess, so we opt for simplicity
|
||||
// as performance is adequate.
|
||||
|
||||
blks: [CTR_STRIDE_HW]x86.__m128i = ---
|
||||
blks: [CTR_STRIDE_HW]simd.u8x16 = ---
|
||||
nr_blocks := len(src) / BLOCK_SIZE
|
||||
for nr_blocks >= CTR_STRIDE_HW {
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
@@ -139,42 +164,46 @@ gctr_hw :: proc(
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
|
||||
blks[i] = simd.bit_xor(blks[i], sks[0])
|
||||
}
|
||||
#unroll for i in 1 ..= 9 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[10])
|
||||
}
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[12])
|
||||
}
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[14])
|
||||
}
|
||||
}
|
||||
|
||||
xor_blocks_hw(dst, src, blks[:])
|
||||
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
}
|
||||
|
||||
src = src[CTR_STRIDE_BYTES_HW:]
|
||||
@@ -186,28 +215,32 @@ gctr_hw :: proc(
|
||||
for n := len(src); n > 0; {
|
||||
l := min(n, BLOCK_SIZE)
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:l])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], src[:l])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], src[:l])
|
||||
}
|
||||
}
|
||||
|
||||
blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
|
||||
|
||||
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
|
||||
blks[0] = simd.bit_xor(blks[0], sks[0])
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[10])
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[12])
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[14])
|
||||
}
|
||||
|
||||
if l == BLOCK_SIZE {
|
||||
@@ -219,7 +252,11 @@ gctr_hw :: proc(
|
||||
copy(dst, blk[:l])
|
||||
}
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:l])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], dst[:l])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], dst[:l])
|
||||
}
|
||||
}
|
||||
|
||||
dst = dst[l:]
|
||||
@@ -235,8 +272,17 @@ gctr_hw :: proc(
|
||||
// the compiler.
|
||||
//
|
||||
// src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity`
|
||||
@(private = "file", enable_target_feature = "sse4.1")
|
||||
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
|
||||
ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
|
||||
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^simd.u8x16, ctr: u32) -> (simd.u8x16, u32) {
|
||||
when ODIN_ENDIAN == .Little {
|
||||
ctr_be := intrinsics.byte_swap(ctr)
|
||||
} else {
|
||||
ctr_be := ctr
|
||||
}
|
||||
|
||||
ret := transmute(simd.u8x16)(
|
||||
simd.replace(transmute(simd.u32x4)(src^), 3, ctr_be)
|
||||
)
|
||||
|
||||
return ret, ctr + 1
|
||||
}
|
||||
18
core/crypto/aes/aes_impl_hw.odin
Normal file
18
core/crypto/aes/aes_impl_hw.odin
Normal file
@@ -0,0 +1,18 @@
|
||||
#+build amd64,arm32
|
||||
package aes
|
||||
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return aes_hw.is_supported()
|
||||
}
|
||||
|
||||
@(private)
|
||||
Context_Impl_Hardware :: aes_hw.Context
|
||||
|
||||
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
|
||||
aes_hw.init(ctx, key)
|
||||
}
|
||||
@@ -1,10 +1,12 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package aes
|
||||
|
||||
@(private = "file")
|
||||
ERR_HW_NOT_SUPPORTED :: "crypto/aes: hardware implementation unsupported"
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AES
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return false
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
#+build amd64
|
||||
package aes
|
||||
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return hw_intel.is_supported()
|
||||
}
|
||||
|
||||
@(private)
|
||||
Context_Impl_Hardware :: hw_intel.Context
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
|
||||
hw_intel.init(ctx, key)
|
||||
}
|
||||
@@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_blake2.final(ctx, hash, finalize_clone)
|
||||
|
||||
@@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_blake2.final(ctx, hash, finalize_clone)
|
||||
|
||||
@@ -136,7 +136,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
@@ -8,15 +8,15 @@ import subtle "core:crypto/_subtle"
|
||||
// Omit large precomputed tables, trading off performance for size.
|
||||
COMPACT_IMPLS: bool : #config(ODIN_CRYPTO_COMPACT, false)
|
||||
|
||||
// HAS_RAND_BYTES is true iff the runtime provides a cryptographic
|
||||
// HAS_RAND_BYTES is true if and only if (⟺) the runtime provides a cryptographic
|
||||
// entropy source.
|
||||
HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES
|
||||
|
||||
// compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
|
||||
// compare_constant_time returns 1 if and only if (⟺) a and b are equal, 0 otherwise.
|
||||
//
|
||||
// The execution time of this routine is constant regardless of the contents
|
||||
// of the slices being compared, as long as the length of the slices is equal.
|
||||
// If the length of the two slices is different, it will early-return 0.
|
||||
// If the length of the two slices is dif and only if (⟺)erent, it will early-return 0.
|
||||
compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
|
||||
// If the length of the slices is different, early return.
|
||||
//
|
||||
@@ -31,7 +31,7 @@ compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
|
||||
return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n)
|
||||
}
|
||||
|
||||
// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by
|
||||
// compare_byte_ptrs_constant_time returns 1 if and only if (⟺) the bytes pointed to by
|
||||
// a and b are equal, 0 otherwise.
|
||||
//
|
||||
// The execution time of this routine is constant regardless of the
|
||||
@@ -46,12 +46,12 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i
|
||||
v |= x[i] ~ y[i]
|
||||
}
|
||||
|
||||
// After the loop, v == 0 iff a == b. The subtraction will underflow
|
||||
// iff v == 0, setting the sign-bit, which gets returned.
|
||||
// After the loop, v == 0 if and only if (⟺) a == b. The subtraction will underflow
|
||||
// if and only if (⟺) v == 0, setting the sign-bit, which gets returned.
|
||||
return subtle.eq(0, v)
|
||||
}
|
||||
|
||||
// is_zero_constant_time returns 1 iff b is all 0s, 0 otherwise.
|
||||
// is_zero_constant_time returns 1 if and only if (⟺) b is all 0s, 0 otherwise.
|
||||
is_zero_constant_time :: proc "contextless" (b: []byte) -> int {
|
||||
v: byte
|
||||
for b_ in b {
|
||||
|
||||
@@ -122,7 +122,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
@@ -1,152 +1,183 @@
|
||||
#+build amd64
|
||||
#+build amd64,arm32
|
||||
package deoxysii
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/aes"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:simd"
|
||||
import "core:simd/x86"
|
||||
|
||||
// This processes a maximum of 4 blocks at a time, as that is suitable
|
||||
// for most current hardware that doesn't say "Xeon".
|
||||
//
|
||||
// TODO/perf: ARM should be able to do 8 at a time.
|
||||
|
||||
when ODIN_ARCH == .amd64 {
|
||||
@(private="file")
|
||||
TARGET_FEATURES :: "sse2,ssse3,aes"
|
||||
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
|
||||
@(private="file")
|
||||
TARGET_FEATURES :: "neon,aes"
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
_BIT_ENC :: x86.__m128i{0x80, 0}
|
||||
_BIT_ENC :: simd.u8x16{0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
|
||||
@(private = "file")
|
||||
_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0}
|
||||
_PREFIX_AD_BLOCK :: simd.u8x16{
|
||||
PREFIX_AD_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
@(private = "file")
|
||||
_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0}
|
||||
_PREFIX_AD_FINAL :: simd.u8x16{
|
||||
PREFIX_AD_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
@(private = "file")
|
||||
_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0}
|
||||
_PREFIX_MSG_BLOCK :: simd.u8x16{
|
||||
PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
@(private = "file")
|
||||
_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0}
|
||||
_PREFIX_MSG_FINAL :: simd.u8x16{
|
||||
PREFIX_MSG_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return aes.is_hardware_accelerated()
|
||||
return aes_hw.is_supported()
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse4.1", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
auth_tweak :: #force_inline proc "contextless" (
|
||||
prefix: x86.__m128i,
|
||||
prefix: simd.u8x16,
|
||||
block_nr: int,
|
||||
) -> x86.__m128i {
|
||||
return x86._mm_insert_epi64(prefix, i64(intrinsics.byte_swap(u64(block_nr))), 1)
|
||||
}
|
||||
) -> simd.u8x16 {
|
||||
when ODIN_ENDIAN == .Little {
|
||||
block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
|
||||
} else {
|
||||
block_nr_u64 := u64(block_nr)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2", require_results)
|
||||
enc_tweak :: #force_inline proc "contextless" (
|
||||
tag: x86.__m128i,
|
||||
block_nr: int,
|
||||
) -> x86.__m128i {
|
||||
return x86._mm_xor_si128(
|
||||
x86._mm_or_si128(tag, _BIT_ENC),
|
||||
x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))},
|
||||
return simd.bit_or(
|
||||
prefix,
|
||||
transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
|
||||
)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "ssse3", require_results)
|
||||
h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i {
|
||||
return transmute(x86.__m128i)h(transmute(simd.u8x16)tk1)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
enc_tweak :: #force_inline proc "contextless" (
|
||||
tag: simd.u8x16,
|
||||
block_nr: int,
|
||||
) -> simd.u8x16 {
|
||||
when ODIN_ENDIAN == .Little {
|
||||
block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
|
||||
} else {
|
||||
block_nr_u64 := u64(block_nr)
|
||||
}
|
||||
|
||||
return simd.bit_xor(
|
||||
simd.bit_or(tag, _BIT_ENC),
|
||||
transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
|
||||
)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_x4 :: #force_inline proc "contextless" (
|
||||
ctx: ^Context,
|
||||
s_0, s_1, s_2, s_3: x86.__m128i,
|
||||
tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i,
|
||||
) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check {
|
||||
s_0, s_1, s_2, s_3: simd.u8x16,
|
||||
tweak_0, tweak_1, tweak_2, tweak_3: simd.u8x16,
|
||||
) -> (simd.u8x16, simd.u8x16, simd.u8x16, simd.u8x16) #no_bounds_check {
|
||||
s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3
|
||||
tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3
|
||||
|
||||
sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
|
||||
stk_0 := x86._mm_xor_si128(tk1_0, sk)
|
||||
stk_1 := x86._mm_xor_si128(tk1_1, sk)
|
||||
stk_2 := x86._mm_xor_si128(tk1_2, sk)
|
||||
stk_3 := x86._mm_xor_si128(tk1_3, sk)
|
||||
sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
|
||||
stk_0 := simd.bit_xor(tk1_0, sk)
|
||||
stk_1 := simd.bit_xor(tk1_1, sk)
|
||||
stk_2 := simd.bit_xor(tk1_2, sk)
|
||||
stk_3 := simd.bit_xor(tk1_3, sk)
|
||||
|
||||
s_0 = x86._mm_xor_si128(s_0, stk_0)
|
||||
s_1 = x86._mm_xor_si128(s_1, stk_1)
|
||||
s_2 = x86._mm_xor_si128(s_2, stk_2)
|
||||
s_3 = x86._mm_xor_si128(s_3, stk_3)
|
||||
s_0 = simd.bit_xor(s_0, stk_0)
|
||||
s_1 = simd.bit_xor(s_1, stk_1)
|
||||
s_2 = simd.bit_xor(s_2, stk_2)
|
||||
s_3 = simd.bit_xor(s_3, stk_3)
|
||||
|
||||
for i in 1 ..= BC_ROUNDS {
|
||||
sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
|
||||
sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
|
||||
|
||||
tk1_0 = h_(tk1_0)
|
||||
tk1_1 = h_(tk1_1)
|
||||
tk1_2 = h_(tk1_2)
|
||||
tk1_3 = h_(tk1_3)
|
||||
tk1_0 = h(tk1_0)
|
||||
tk1_1 = h(tk1_1)
|
||||
tk1_2 = h(tk1_2)
|
||||
tk1_3 = h(tk1_3)
|
||||
|
||||
stk_0 = x86._mm_xor_si128(tk1_0, sk)
|
||||
stk_1 = x86._mm_xor_si128(tk1_1, sk)
|
||||
stk_2 = x86._mm_xor_si128(tk1_2, sk)
|
||||
stk_3 = x86._mm_xor_si128(tk1_3, sk)
|
||||
stk_0 = simd.bit_xor(tk1_0, sk)
|
||||
stk_1 = simd.bit_xor(tk1_1, sk)
|
||||
stk_2 = simd.bit_xor(tk1_2, sk)
|
||||
stk_3 = simd.bit_xor(tk1_3, sk)
|
||||
|
||||
s_0 = x86._mm_aesenc_si128(s_0, stk_0)
|
||||
s_1 = x86._mm_aesenc_si128(s_1, stk_1)
|
||||
s_2 = x86._mm_aesenc_si128(s_2, stk_2)
|
||||
s_3 = x86._mm_aesenc_si128(s_3, stk_3)
|
||||
s_0 = aes_hw.aesenc(s_0, stk_0)
|
||||
s_1 = aes_hw.aesenc(s_1, stk_1)
|
||||
s_2 = aes_hw.aesenc(s_2, stk_2)
|
||||
s_3 = aes_hw.aesenc(s_3, stk_3)
|
||||
}
|
||||
|
||||
return s_0, s_1, s_2, s_3
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_x1 :: #force_inline proc "contextless" (
|
||||
ctx: ^Context,
|
||||
s: x86.__m128i,
|
||||
tweak: x86.__m128i,
|
||||
) -> x86.__m128i #no_bounds_check {
|
||||
s: simd.u8x16,
|
||||
tweak: simd.u8x16,
|
||||
) -> simd.u8x16 #no_bounds_check {
|
||||
s, tk1 := s, tweak
|
||||
|
||||
sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
|
||||
stk := x86._mm_xor_si128(tk1, sk)
|
||||
sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
|
||||
stk := simd.bit_xor(tk1, sk)
|
||||
|
||||
s = x86._mm_xor_si128(s, stk)
|
||||
s = simd.bit_xor(s, stk)
|
||||
|
||||
for i in 1 ..= BC_ROUNDS {
|
||||
sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
|
||||
sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
|
||||
|
||||
tk1 = h_(tk1)
|
||||
tk1 = h(tk1)
|
||||
|
||||
stk = x86._mm_xor_si128(tk1, sk)
|
||||
stk = simd.bit_xor(tk1, sk)
|
||||
|
||||
s = x86._mm_aesenc_si128(s, stk)
|
||||
s = aes_hw.aesenc(s, stk)
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_absorb :: proc "contextless" (
|
||||
ctx: ^Context,
|
||||
tag: x86.__m128i,
|
||||
tag: simd.u8x16,
|
||||
src: []byte,
|
||||
tweak_prefix: x86.__m128i,
|
||||
tweak_prefix: simd.u8x16,
|
||||
stk_block_nr: int,
|
||||
) -> (x86.__m128i, int) #no_bounds_check {
|
||||
) -> (simd.u8x16, int) #no_bounds_check {
|
||||
src, stk_block_nr, tag := src, stk_block_nr, tag
|
||||
|
||||
nr_blocks := len(src) / BLOCK_SIZE
|
||||
for nr_blocks >= 4 {
|
||||
d_0, d_1, d_2, d_3 := bc_x4(
|
||||
ctx,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
|
||||
auth_tweak(tweak_prefix, stk_block_nr),
|
||||
auth_tweak(tweak_prefix, stk_block_nr + 1),
|
||||
auth_tweak(tweak_prefix, stk_block_nr + 2),
|
||||
auth_tweak(tweak_prefix, stk_block_nr + 3),
|
||||
)
|
||||
|
||||
tag = x86._mm_xor_si128(tag, d_0)
|
||||
tag = x86._mm_xor_si128(tag, d_1)
|
||||
tag = x86._mm_xor_si128(tag, d_2)
|
||||
tag = x86._mm_xor_si128(tag, d_3)
|
||||
tag = simd.bit_xor(tag, d_0)
|
||||
tag = simd.bit_xor(tag, d_1)
|
||||
tag = simd.bit_xor(tag, d_2)
|
||||
tag = simd.bit_xor(tag, d_3)
|
||||
|
||||
src = src[4*BLOCK_SIZE:]
|
||||
stk_block_nr += 4
|
||||
@@ -156,11 +187,11 @@ bc_absorb :: proc "contextless" (
|
||||
for nr_blocks > 0 {
|
||||
d := bc_x1(
|
||||
ctx,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
|
||||
auth_tweak(tweak_prefix, stk_block_nr),
|
||||
)
|
||||
|
||||
tag = x86._mm_xor_si128(tag, d)
|
||||
tag = simd.bit_xor(tag, d)
|
||||
|
||||
src = src[BLOCK_SIZE:]
|
||||
stk_block_nr += 1
|
||||
@@ -170,29 +201,29 @@ bc_absorb :: proc "contextless" (
|
||||
return tag, stk_block_nr
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_final :: proc "contextless" (
|
||||
ctx: ^Context,
|
||||
tag: x86.__m128i,
|
||||
tag: simd.u8x16,
|
||||
iv: []byte,
|
||||
) -> x86.__m128i {
|
||||
) -> simd.u8x16 {
|
||||
tmp: [BLOCK_SIZE]byte
|
||||
|
||||
tmp[0] = PREFIX_TAG << PREFIX_SHIFT
|
||||
copy(tmp[1:], iv)
|
||||
|
||||
tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp))
|
||||
tweak := intrinsics.unaligned_load((^simd.u8x16)(&tmp))
|
||||
|
||||
return bc_x1(ctx, tag, tweak)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_encrypt :: proc "contextless" (
|
||||
ctx: ^Context,
|
||||
dst: []byte,
|
||||
src: []byte,
|
||||
iv: x86.__m128i,
|
||||
tweak_tag: x86.__m128i,
|
||||
iv: simd.u8x16,
|
||||
tweak_tag: simd.u8x16,
|
||||
stk_block_nr: int,
|
||||
) -> int {
|
||||
dst, src, stk_block_nr := dst, src, stk_block_nr
|
||||
@@ -209,31 +240,31 @@ bc_encrypt :: proc "contextless" (
|
||||
)
|
||||
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst)),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst)),
|
||||
simd.bit_xor(
|
||||
d_0,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
|
||||
),
|
||||
)
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst[BLOCK_SIZE:])),
|
||||
simd.bit_xor(
|
||||
d_1,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
|
||||
),
|
||||
)
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst[2*BLOCK_SIZE:])),
|
||||
simd.bit_xor(
|
||||
d_2,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
|
||||
),
|
||||
)
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst[3*BLOCK_SIZE:])),
|
||||
simd.bit_xor(
|
||||
d_3,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -250,10 +281,10 @@ bc_encrypt :: proc "contextless" (
|
||||
)
|
||||
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst)),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst)),
|
||||
simd.bit_xor(
|
||||
d,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -269,7 +300,7 @@ bc_encrypt :: proc "contextless" (
|
||||
e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
|
||||
tmp: [BLOCK_SIZE]byte
|
||||
copy(tmp[1:], iv)
|
||||
iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
|
||||
iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))
|
||||
|
||||
// Algorithm 3
|
||||
//
|
||||
@@ -282,7 +313,7 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
|
||||
// if A_∗ != nil then
|
||||
// Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
|
||||
// end
|
||||
auth: x86.__m128i
|
||||
auth: simd.u8x16
|
||||
n: int
|
||||
|
||||
aad := aad
|
||||
@@ -341,14 +372,14 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
|
||||
copy(dst[n*BLOCK_SIZE:], m_star[:])
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(tag)), auth)
|
||||
}
|
||||
|
||||
@(private, require_results)
|
||||
d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
|
||||
tmp: [BLOCK_SIZE]byte
|
||||
copy(tmp[1:], iv)
|
||||
iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
|
||||
iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))
|
||||
|
||||
// Algorithm 4
|
||||
//
|
||||
@@ -360,7 +391,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
|
||||
// if C_∗ != nil then
|
||||
// M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N)
|
||||
// end
|
||||
auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag)))
|
||||
auth := intrinsics.unaligned_load((^simd.u8x16)(raw_data(tag)))
|
||||
|
||||
m := ciphertext
|
||||
n := bc_encrypt(ctx, dst, m, iv_, auth, 0)
|
||||
@@ -385,7 +416,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
|
||||
// if A∗ != nil then
|
||||
// Auth <- Auth ^ EK(0110| | l_a, pad10∗(A_∗))
|
||||
// end
|
||||
auth = x86.__m128i{0, 0}
|
||||
auth = simd.u8x16{}
|
||||
aad := aad
|
||||
auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
|
||||
aad = aad[BLOCK_SIZE*n:]
|
||||
@@ -424,7 +455,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
|
||||
// Tag verification
|
||||
// if tag0 = tag then return (M_1 || ... || M_l || M_∗)
|
||||
// else return false
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(&tmp)), auth)
|
||||
ok := crypto.compare_constant_time(tmp[:], tag) == 1
|
||||
|
||||
crypto.zero_explicit(&tmp, size_of(tmp))
|
||||
@@ -1,10 +1,12 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package deoxysii
|
||||
|
||||
@(private = "file")
|
||||
ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported"
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return false
|
||||
|
||||
@@ -104,7 +104,7 @@ Public_Key :: struct {
|
||||
}
|
||||
|
||||
// private_key_generate uses the system entropy source to generate a new
|
||||
// Private_Key. This will only fail iff the system entropy source is
|
||||
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
|
||||
// missing or broken.
|
||||
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
@@ -142,7 +142,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
|
||||
}
|
||||
|
||||
// private_key_set_bytes decodes a byte-encoded private key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
|
||||
@@ -245,7 +245,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
|
||||
}
|
||||
}
|
||||
|
||||
// private_key_equal returns true iff the private keys are equal,
|
||||
// private_key_equal returns true if and only if (⟺) the private keys are equal,
|
||||
// in constant time.
|
||||
private_key_equal :: proc(p, q: ^Private_Key) -> bool {
|
||||
if p._curve != q._curve {
|
||||
@@ -276,7 +276,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
|
||||
}
|
||||
|
||||
// public_key_set_bytes decodes a byte-encoded public key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
|
||||
public_key_clear(pub_key)
|
||||
|
||||
@@ -365,7 +365,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
|
||||
}
|
||||
}
|
||||
|
||||
// public_key_equal returns true iff the public keys are equal,
|
||||
// public_key_equal returns true if and only if (⟺) the public keys are equal,
|
||||
// in constant time.
|
||||
public_key_equal :: proc(p, q: ^Public_Key) -> bool {
|
||||
if p._curve != q._curve {
|
||||
|
||||
@@ -79,7 +79,7 @@ Public_Key :: struct {
|
||||
}
|
||||
|
||||
// private_key_generate uses the system entropy source to generate a new
|
||||
// Private_Key. This will only fail iff the system entropy source is
|
||||
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
|
||||
// missing or broken.
|
||||
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
@@ -111,7 +111,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
|
||||
}
|
||||
|
||||
// private_key_set_bytes decodes a byte-encoded private key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
|
||||
@@ -194,7 +194,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
|
||||
}
|
||||
}
|
||||
|
||||
// private_key_equal returns true iff the private keys are equal,
|
||||
// private_key_equal returns true if and only if (⟺) the private keys are equal,
|
||||
// in constant time.
|
||||
private_key_equal :: proc(p, q: ^Private_Key) -> bool {
|
||||
if p._curve != q._curve {
|
||||
@@ -219,7 +219,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
|
||||
}
|
||||
|
||||
// public_key_set_bytes decodes a byte-encoded public key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
|
||||
public_key_clear(pub_key)
|
||||
|
||||
@@ -296,7 +296,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
|
||||
}
|
||||
}
|
||||
|
||||
// public_key_equal returns true iff the public keys are equal,
|
||||
// public_key_equal returns true if and only if (⟺) the public keys are equal,
|
||||
// in constant time.
|
||||
public_key_equal :: proc(p, q: ^Public_Key) -> bool {
|
||||
if p._curve != q._curve {
|
||||
|
||||
@@ -141,7 +141,7 @@ parse_asn1_sig :: proc(sig: []byte) -> (r, s: []byte, ok: bool) {
|
||||
return nil, nil, false
|
||||
}
|
||||
|
||||
// DER requires a leading 0 iff the sign bit of the leading byte
|
||||
// DER requires a leading 0 if and only if (⟺) the sign bit of the leading byte
|
||||
// is set to distinguish between positive and negative integers,
|
||||
// and the minimal length representation. `r` and `s` are always
|
||||
// going to be unsigned, so we validate malformed DER and strip
|
||||
|
||||
@@ -3,7 +3,7 @@ package ecdsa
|
||||
import "core:crypto/hash"
|
||||
import secec "core:crypto/_weierstrass"
|
||||
|
||||
// verify_raw returns true iff sig is a valid signature by pub_key over
|
||||
// verify_raw returns true if and only if (⟺) sig is a valid signature by pub_key over
|
||||
// msg, hased using hash_algo, per the verification procedure specifed
|
||||
// in SEC 1, Version 2.0, Section 4.1.4.
|
||||
//
|
||||
@@ -33,7 +33,7 @@ verify_raw :: proc(pub_key: ^Public_Key, hash_algo: hash.Algorithm, msg, sig: []
|
||||
panic("crypto/ecdsa: invalid curve")
|
||||
}
|
||||
|
||||
// verify_asn1 returns true iff sig is a valid signature by pub_key over
|
||||
// verify_asn1 returns true if and only if (⟺) sig is a valid signature by pub_key over
|
||||
// msg, hased using hash_algo, per the verification procedure specifed
|
||||
// in SEC 1, Version 2.0, Section 4.1.4.
|
||||
//
|
||||
|
||||
@@ -48,7 +48,7 @@ Public_Key :: struct {
|
||||
}
|
||||
|
||||
// private_key_generate uses the system entropy source to generate a new
|
||||
// Private_Key. This will only fail iff the system entropy source is
|
||||
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
|
||||
// missing or broken.
|
||||
private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
@@ -67,7 +67,7 @@ private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
|
||||
}
|
||||
|
||||
// private_key_set_bytes decodes a byte-encoded private key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {
|
||||
if len(b) != PRIVATE_KEY_SIZE {
|
||||
return false
|
||||
@@ -167,7 +167,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
|
||||
}
|
||||
|
||||
// public_key_set_bytes decodes a byte-encoded public key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> bool {
|
||||
if len(b) != PUBLIC_KEY_SIZE {
|
||||
return false
|
||||
@@ -205,14 +205,14 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
|
||||
copy(dst, pub_key._b[:])
|
||||
}
|
||||
|
||||
// public_key_equal returns true iff pub_key is equal to other.
|
||||
// public_key_equal returns true if and only if (⟺) pub_key is equal to other.
|
||||
public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool {
|
||||
ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key")
|
||||
|
||||
return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1
|
||||
}
|
||||
|
||||
// verify returns true iff sig is a valid signature by pub_key over msg.
|
||||
// verify returns true if and only if (⟺) sig is a valid signature by pub_key over msg.
|
||||
//
|
||||
// The optional `allow_small_order_A` parameter will make this
|
||||
// implementation strictly compatible with FIPS 186-5, at the expense of
|
||||
|
||||
@@ -235,7 +235,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
switch &impl in ctx._impl {
|
||||
|
||||
@@ -21,7 +21,7 @@ sum :: proc(algorithm: hash.Algorithm, dst, msg, key: []byte) {
|
||||
}
|
||||
|
||||
// verify will verify the HMAC tag computed with the specified algorithm
|
||||
// and key over msg and return true iff the tag is valid. It requires
|
||||
// and key over msg and return true if and only if (⟺) the tag is valid. It requires
|
||||
// that the tag is correctly sized.
|
||||
verify :: proc(algorithm: hash.Algorithm, tag, msg, key: []byte) -> bool {
|
||||
tag_buf: [hash.MAX_DIGEST_SIZE]byte
|
||||
|
||||
@@ -32,7 +32,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) {
|
||||
}
|
||||
|
||||
// verify will verify the KMAC tag computed with the specified security
|
||||
// strength, key and domain separator over msg and return true iff the
|
||||
// strength, key and domain separator over msg and return true if and only if (⟺) the
|
||||
// tag is valid.
|
||||
verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool {
|
||||
derived_tag := make([]byte, len(tag), allocator)
|
||||
|
||||
@@ -77,7 +77,7 @@ update :: proc "contextless" (ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)
|
||||
|
||||
@@ -69,7 +69,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
ensure(ctx.is_initialized)
|
||||
|
||||
@@ -76,7 +76,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
ensure(ctx.is_initialized)
|
||||
|
||||
@@ -66,7 +66,7 @@ derive :: proc(
|
||||
dst_blk = dst_blk[h_len:]
|
||||
}
|
||||
|
||||
// Instead of rounding l up, just proceass the one extra block iff
|
||||
// Instead of rounding l up, just proceass the one extra block if and only if (⟺)
|
||||
// r != 0.
|
||||
if r > 0 {
|
||||
tmp: [hash.MAX_DIGEST_SIZE]byte
|
||||
|
||||
@@ -33,7 +33,7 @@ sum :: proc(dst, msg, key: []byte) {
|
||||
}
|
||||
|
||||
// verify will verify the Poly1305 tag computed with the key over msg and
|
||||
// return true iff the tag is valid. It requires that the tag is correctly
|
||||
// return true if and only if (⟺) the tag is valid. It requires that the tag is correctly
|
||||
// sized.
|
||||
verify :: proc(tag, msg, key: []byte) -> bool {
|
||||
ctx: Context = ---
|
||||
|
||||
@@ -360,7 +360,7 @@ ge_double_scalarmult_generator_vartime :: proc(
|
||||
ge._is_initialized = true
|
||||
}
|
||||
|
||||
// ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`.
|
||||
// ge_cond_negate sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = -a` if and only if (⟺) `ctrl == 1`.
|
||||
// Behavior for all other values of ctrl are undefined,
|
||||
ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
|
||||
_ge_ensure_initialized([]^Group_Element{a})
|
||||
@@ -369,7 +369,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
|
||||
ge._is_initialized = true
|
||||
}
|
||||
|
||||
// ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`.
|
||||
// ge_cond_assign sets `ge = ge` if and only if (⟺) `ctrl == 0` and `ge = a` if and only if (⟺) `ctrl == 1`.
|
||||
// Behavior for all other values of ctrl are undefined,
|
||||
ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
|
||||
_ge_ensure_initialized([]^Group_Element{ge, a})
|
||||
@@ -377,7 +377,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
|
||||
grp.ge_cond_assign(&ge._p, &a._p, ctrl)
|
||||
}
|
||||
|
||||
// ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`.
|
||||
// ge_cond_select sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = b` if and only if (⟺) `ctrl == 1`.
|
||||
// Behavior for all other values of ctrl are undefined,
|
||||
ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
|
||||
_ge_ensure_initialized([]^Group_Element{a, b})
|
||||
@@ -386,7 +386,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
|
||||
ge._is_initialized = true
|
||||
}
|
||||
|
||||
// ge_equal returns 1 iff `a == b`, and 0 otherwise.
|
||||
// ge_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
|
||||
@(require_results)
|
||||
ge_equal :: proc(a, b: ^Group_Element) -> int {
|
||||
_ge_ensure_initialized([]^Group_Element{a, b})
|
||||
@@ -405,7 +405,7 @@ ge_equal :: proc(a, b: ^Group_Element) -> int {
|
||||
return ret
|
||||
}
|
||||
|
||||
// ge_is_identity returns 1 iff `ge` is the identity element, and 0 otherwise.
|
||||
// ge_is_identity returns 1 if and only if (⟺) `ge` is the identity element, and 0 otherwise.
|
||||
@(require_results)
|
||||
ge_is_identity :: proc(ge: ^Group_Element) -> int {
|
||||
return ge_equal(ge, &GE_IDENTITY)
|
||||
|
||||
@@ -80,13 +80,13 @@ sc_square :: proc "contextless" (sc, a: ^Scalar) {
|
||||
grp.sc_square(sc, a)
|
||||
}
|
||||
|
||||
// sc_cond_assign sets `sc = sc` iff `ctrl == 0` and `sc = a` iff `ctrl == 1`.
|
||||
// sc_cond_assign sets `sc = sc` if and only if (⟺) `ctrl == 0` and `sc = a` if and only if (⟺) `ctrl == 1`.
|
||||
// Behavior for all other values of ctrl are undefined,
|
||||
sc_cond_assign :: proc(sc, a: ^Scalar, ctrl: int) {
|
||||
grp.sc_cond_assign(sc, a, ctrl)
|
||||
}
|
||||
|
||||
// sc_equal returns 1 iff `a == b`, and 0 otherwise.
|
||||
// sc_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
|
||||
@(require_results)
|
||||
sc_equal :: proc(a, b: ^Scalar) -> int {
|
||||
return grp.sc_equal(a, b)
|
||||
|
||||
@@ -191,7 +191,7 @@ update :: proc(ctx: ^$T, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
|
||||
ensure(ctx.is_initialized)
|
||||
|
||||
224
core/crypto/sha2/sha256_impl_hw_arm.odin
Normal file
224
core/crypto/sha2/sha256_impl_hw_arm.odin
Normal file
@@ -0,0 +1,224 @@
|
||||
#+build arm64,arm32
|
||||
package sha2
|
||||
|
||||
// Based on the public domain code by Jeffrey Walton, though
|
||||
// realistically, there only is one sensible way to write this.
|
||||
//
|
||||
// See: https://github.com/noloader/SHA-Intrinsics
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:simd"
|
||||
import "core:simd/arm"
|
||||
import "core:sys/info"
|
||||
|
||||
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware
|
||||
// accelerated SHA-224/SHA-256 is supported.
|
||||
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
|
||||
req_features :: info.CPU_Features{
|
||||
.asimd,
|
||||
.sha256,
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
K_0 :: simd.u32x4{0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5}
|
||||
@(private = "file")
|
||||
K_1 :: simd.u32x4{0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5}
|
||||
@(private = "file")
|
||||
K_2 :: simd.u32x4{0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3}
|
||||
@(private = "file")
|
||||
K_3 :: simd.u32x4{0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174}
|
||||
@(private = "file")
|
||||
K_4 :: simd.u32x4{0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC}
|
||||
@(private = "file")
|
||||
K_5 :: simd.u32x4{0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA}
|
||||
@(private = "file")
|
||||
K_6 :: simd.u32x4{0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7}
|
||||
@(private = "file")
|
||||
K_7 :: simd.u32x4{0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967}
|
||||
@(private = "file")
|
||||
K_8 :: simd.u32x4{0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13}
|
||||
@(private = "file")
|
||||
K_9 :: simd.u32x4{0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85}
|
||||
@(private = "file")
|
||||
K_10 :: simd.u32x4{0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3}
|
||||
@(private = "file")
|
||||
K_11 :: simd.u32x4{0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070}
|
||||
@(private = "file")
|
||||
K_12 :: simd.u32x4{0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5}
|
||||
@(private = "file")
|
||||
K_13 :: simd.u32x4{0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3}
|
||||
@(private = "file")
|
||||
K_14 :: simd.u32x4{0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208}
|
||||
@(private = "file")
|
||||
K_15 :: simd.u32x4{0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2}
|
||||
|
||||
@(private, enable_target_feature = "neon,sha2")
|
||||
sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
|
||||
state_0 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[0]))
|
||||
state_1 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[4]))
|
||||
|
||||
data := data
|
||||
for len(data) >= BLOCK_SIZE_256 {
|
||||
// Save state
|
||||
abef_save, cdgh_save := state_0, state_1
|
||||
|
||||
// Load message
|
||||
msg_0 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data)))
|
||||
msg_1 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[16:])))
|
||||
msg_2 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[32:])))
|
||||
msg_3 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[48:])))
|
||||
|
||||
// Reverse for little endian
|
||||
when ODIN_ENDIAN == .Little {
|
||||
msg_0 = byteswap_u32x4(msg_0)
|
||||
msg_1 = byteswap_u32x4(msg_1)
|
||||
msg_2 = byteswap_u32x4(msg_2)
|
||||
msg_3 = byteswap_u32x4(msg_3)
|
||||
}
|
||||
|
||||
tmp_0 := simd.add(msg_0, K_0)
|
||||
|
||||
// Rounds 0-3
|
||||
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
|
||||
tmp_2 := state_0
|
||||
tmp_1 := simd.add(msg_1, K_1)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
|
||||
|
||||
// Rounds 4-7
|
||||
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_2, K_2)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
|
||||
|
||||
// Rounds 8-11
|
||||
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_3, K_3)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
|
||||
|
||||
// Rounds 12-15
|
||||
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_0, K_4)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
|
||||
|
||||
// Rounds 16-19
|
||||
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_1, K_5)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
|
||||
|
||||
// Rounds 20-23
|
||||
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_2, K_6)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
|
||||
|
||||
// Rounds 24-27
|
||||
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_3, K_7)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
|
||||
|
||||
// Rounds 28-31
|
||||
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_0, K_8)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
|
||||
|
||||
// Rounds 32-35
|
||||
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_1, K_9)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
|
||||
|
||||
// Rounds 36-39
|
||||
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_2, K_10)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
|
||||
|
||||
// Rounds 40-43
|
||||
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_3, K_11)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
|
||||
|
||||
// Rounds 44-47
|
||||
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_0, K_12)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
|
||||
|
||||
// Rounds 48-51
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_1, K_13)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
|
||||
// Rounds 52-55
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_2, K_14)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
|
||||
// Rounds 56-59
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_3, K_15)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
|
||||
// Rounds 60-63
|
||||
tmp_2 = state_0
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
|
||||
// Combine state
|
||||
state_0 = simd.add(state_0, abef_save)
|
||||
state_1 = simd.add(state_1, cdgh_save)
|
||||
|
||||
data = data[BLOCK_SIZE_256:]
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[0]), state_0)
|
||||
intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[4]), state_1)
|
||||
}
|
||||
|
||||
when ODIN_ENDIAN == .Little {
|
||||
@(private = "file", enable_target_feature = "neon")
|
||||
byteswap_u32x4 :: #force_inline proc "contextless" (a: simd.u32x4) -> simd.u32x4 {
|
||||
return transmute(simd.u32x4)(
|
||||
simd.shuffle(
|
||||
transmute(simd.u8x16)(a),
|
||||
transmute(simd.u8x16)(a),
|
||||
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -49,7 +49,7 @@ K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
|
||||
K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}
|
||||
|
||||
|
||||
// is_hardware_accelerated_256 returns true iff hardware accelerated
|
||||
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated
|
||||
// SHA-224/SHA-256 is supported.
|
||||
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
|
||||
req_features :: info.CPU_Features{
|
||||
@@ -1,10 +1,12 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package sha2
|
||||
|
||||
@(private = "file")
|
||||
ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
|
||||
|
||||
// is_hardware_accelerated_256 returns true iff hardware accelerated
|
||||
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated
|
||||
// SHA-224/SHA-256 is supported.
|
||||
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
|
||||
return false
|
||||
|
||||
@@ -79,7 +79,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)
|
||||
|
||||
@@ -80,7 +80,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
ensure(ctx.is_initialized)
|
||||
|
||||
@@ -31,7 +31,7 @@ write_element :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_sha3.final_cshake((^_sha3.Context)(ctx), hash, finalize_clone)
|
||||
|
||||
@@ -436,7 +436,7 @@ copy_buffer :: proc(dst: Writer, src: Reader, buf: []byte) -> (written: i64, err
|
||||
|
||||
// copy_n copies n bytes (or till an error) from src to dst.
|
||||
// It returns the number of bytes copied and the first error that occurred whilst copying, if any.
|
||||
// On return, written == n IFF err == nil
|
||||
// On return, written == n if and only if (⟺) err == nil
|
||||
copy_n :: proc(dst: Writer, src: Reader, n: i64) -> (written: i64, err: Error) {
|
||||
nsrc := limited_reader_init(&Limited_Reader{}, src, n)
|
||||
written, err = copy(dst, nsrc)
|
||||
|
||||
@@ -101,7 +101,7 @@ internal_int_power_modulo :: proc(res, G, X, P: ^Int, allocator := context.alloc
|
||||
If the modulus is odd or dr != 0 use the montgomery method.
|
||||
*/
|
||||
if internal_int_is_odd(P) || dr != 0 {
|
||||
return _private_int_exponent_mod(res, G, X, P, dr)
|
||||
return _private_int_exponent_mod_fast(res, G, X, P, dr)
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -439,8 +439,14 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
|
||||
return _private_int_mul_high_comba(dest, a, b, digits)
|
||||
}
|
||||
|
||||
internal_grow(dest, a.used + b.used + 1) or_return
|
||||
dest.used = a.used + b.used + 1
|
||||
/*
|
||||
Set up temporary output `Int`, which we'll swap for `dest` when done.
|
||||
*/
|
||||
|
||||
t := &Int{}
|
||||
|
||||
internal_grow(t, a.used + b.used + 1) or_return
|
||||
t.used = a.used + b.used + 1
|
||||
|
||||
pa := a.used
|
||||
pb := b.used
|
||||
@@ -451,20 +457,23 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
|
||||
/*
|
||||
Calculate the double precision result.
|
||||
*/
|
||||
r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
|
||||
r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
|
||||
|
||||
/*
|
||||
Get the lower part.
|
||||
*/
|
||||
dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
|
||||
t.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
|
||||
|
||||
/*
|
||||
Carry the carry.
|
||||
*/
|
||||
carry = DIGIT(r >> _WORD(_DIGIT_BITS))
|
||||
}
|
||||
dest.digit[ix + pb] = carry
|
||||
t.digit[ix + pb] = carry
|
||||
}
|
||||
|
||||
internal_swap(dest, t)
|
||||
internal_destroy(t)
|
||||
return internal_clamp(dest)
|
||||
}
|
||||
|
||||
|
||||
@@ -141,9 +141,9 @@ arena_alloc_unguarded :: proc(arena: ^Arena, size: uint, alignment: uint, loc :=
|
||||
|
||||
needed := mem.align_forward_uint(size, alignment)
|
||||
needed = max(needed, arena.default_commit_size)
|
||||
block_size := max(needed, arena.minimum_block_size)
|
||||
block_size := max(needed, arena.minimum_block_size) + alignment
|
||||
|
||||
new_block := memory_block_alloc(needed, block_size, alignment, {}) or_return
|
||||
new_block := memory_block_alloc(needed, block_size) or_return
|
||||
new_block.prev = arena.curr_block
|
||||
arena.curr_block = new_block
|
||||
arena.total_reserved += new_block.reserved
|
||||
|
||||
@@ -154,7 +154,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint,
|
||||
|
||||
pmblock.committed = platform_total_commit
|
||||
block.committed = pmblock.committed - base_offset
|
||||
|
||||
assert(block.committed <= block.reserved)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -174,7 +174,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint,
|
||||
err = .Out_Of_Memory
|
||||
return
|
||||
}
|
||||
assert(block.committed <= block.reserved)
|
||||
|
||||
do_commit_if_necessary(block, size, default_commit_size) or_return
|
||||
|
||||
data = block.base[block.used+alignment_offset:][:min_size]
|
||||
|
||||
@@ -804,7 +804,7 @@ send_exec :: proc(op: ^Operation) -> Op_Result {
|
||||
|
||||
op.send.sent += n
|
||||
|
||||
if op.send.sent < total {
|
||||
if n < total {
|
||||
return send_exec(op)
|
||||
}
|
||||
|
||||
@@ -868,7 +868,7 @@ recv_exec :: proc(op: ^Operation) -> Op_Result {
|
||||
assert(is_tcp || op.recv.received == 0)
|
||||
op.recv.received += n
|
||||
|
||||
if is_tcp && n != 0 && op.recv.received < total {
|
||||
if is_tcp && n != 0 && n < total {
|
||||
return recv_exec(op)
|
||||
}
|
||||
|
||||
|
||||
34
core/simd/arm/aes.odin
Normal file
34
core/simd/arm/aes.odin
Normal file
@@ -0,0 +1,34 @@
|
||||
#+build arm64,arm32
|
||||
package simd_arm
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
vaeseq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t {
|
||||
return _vaeseq_u8(data, key)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
vaesdq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t {
|
||||
return _vaesdq_u8(data, key)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
|
||||
vaesmcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t {
|
||||
return _vaesmcq_u8(data)
|
||||
}
|
||||
|
||||
@(require_results,enable_target_feature = "aes")
|
||||
vaesimcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t {
|
||||
return _vaesimcq_u8(data)
|
||||
}
|
||||
|
||||
@(private, default_calling_convention = "none")
|
||||
foreign _ {
|
||||
@(link_name = "llvm.aarch64.crypto.aese" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aese")
|
||||
_vaeseq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.aesd" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesd")
|
||||
_vaesdq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.aesmc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesmc")
|
||||
_vaesmcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.aesimc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesimc")
|
||||
_vaesimcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t ---
|
||||
}
|
||||
2
core/simd/arm/doc.odin
Normal file
2
core/simd/arm/doc.odin
Normal file
@@ -0,0 +1,2 @@
|
||||
// `SIMD` intrinsics specific to ARMv8 `arm32` and `arm64` architectures.
|
||||
package simd_arm
|
||||
108
core/simd/arm/sha.odin
Normal file
108
core/simd/arm/sha.odin
Normal file
@@ -0,0 +1,108 @@
|
||||
#+build arm64,arm32
|
||||
package simd_arm
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha1cq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha1cq_u32(hash_abcd, e, wk)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha1pq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha1pq_u32(hash_abcd, e, wk)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha1mq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha1mq_u32(hash_abcd, e, wk)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha1h_u32 :: #force_inline proc "c" (e: uint32_t) -> uint32_t {
|
||||
return _vsha1h_u32(e)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha1su0q_u32 :: #force_inline proc "c" (w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha1su0q_u32(w0_3, w4_7, w8_11)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha1su1q_u32 :: #force_inline proc "c" (tw0_3, w12_15: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha1su1q_u32(tw0_3, w12_15)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha256hq_u32 :: #force_inline proc "c" (hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha256hq_u32(hash_abcd, hash_efgh, wk)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha256h2q_u32 :: #force_inline proc "c" (hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha256h2q_u32(hash_efgh, hash_abcd, wk)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha256su0q_u32 :: #force_inline proc "c" (w0_3, w4_7: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha256su0q_u32(w0_3, w4_7)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha2")
|
||||
vsha256su1q_u32 :: #force_inline proc "c" (tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t {
|
||||
return _vsha256su1q_u32(tw0_3, w8_11, w12_15)
|
||||
}
|
||||
|
||||
// Note: The SHA512 instructions are part of the `sha3` feature set.
|
||||
|
||||
@(require_results, enable_target_feature = "sha3")
|
||||
vsha512hq_u64 :: #force_inline proc "c" (hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t {
|
||||
return _vsha512hq_u64(hash_ed, hash_gf, kwh_kwh2)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha3")
|
||||
vsha512h2q_u64 :: #force_inline proc "c" (sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t {
|
||||
return _vsha512h2q_u64(sum_ab, hash_c_, hash_ab)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha3")
|
||||
vsha512su0q_u64 :: #force_inline proc "c" (w0_1, w2_: uint64x2_t) -> uint64x2_t {
|
||||
return _vsha512su0q_u64(w0_1, w2_)
|
||||
}
|
||||
|
||||
@(require_results, enable_target_feature = "sha3")
|
||||
vsha512su1q_u64 :: #force_inline proc "c" (s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t {
|
||||
return _vsha512su1q_u64(s01_s02, w14_15, w9_10)
|
||||
}
|
||||
|
||||
@(private, default_calling_convention = "none")
|
||||
foreign _ {
|
||||
@(link_name = "llvm.aarch64.crypto.sha1c" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1c")
|
||||
_vsha1cq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha1p" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1p")
|
||||
_vsha1pq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha1m" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1m")
|
||||
_vsha1mq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha1h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1h")
|
||||
_vsha1h_u32 :: proc(e: uint32_t) -> uint32_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha1su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su0")
|
||||
_vsha1su0q_u32 :: proc(w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha1su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su1")
|
||||
_vsha1su1q_u32 :: proc(tw0_3, w12_15: uint32x4_t) -> uint32x4_t ---
|
||||
|
||||
@(link_name = "llvm.aarch64.crypto.sha256h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h")
|
||||
_vsha256hq_u32 :: proc(hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha256h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h2")
|
||||
_vsha256h2q_u32 :: proc(hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha256su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su0")
|
||||
_vsha256su0q_u32 :: proc(w0_3, w4_7: uint32x4_t) -> uint32x4_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha256su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su1")
|
||||
_vsha256su1q_u32 :: proc(tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t ---
|
||||
|
||||
@(link_name = "llvm.aarch64.crypto.sha512h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h")
|
||||
_vsha512hq_u64 :: proc(hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha512h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h2")
|
||||
_vsha512h2q_u64 :: proc(sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha512su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su0")
|
||||
_vsha512su0q_u64 :: proc(w0_1, w2_: uint64x2_t) -> uint64x2_t ---
|
||||
@(link_name = "llvm.aarch64.crypto.sha512su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su1")
|
||||
_vsha512su1q_u64 :: proc(s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t ---
|
||||
}
|
||||
9
core/simd/arm/types.odin
Normal file
9
core/simd/arm/types.odin
Normal file
@@ -0,0 +1,9 @@
|
||||
#+build arm64,arm32
|
||||
package simd_arm
|
||||
|
||||
// Type aliases to match `arm_neon.h`.
|
||||
uint32_t :: u32
|
||||
|
||||
uint8x16_t :: #simd[16]u8
|
||||
uint32x4_t :: #simd[4]u32
|
||||
uint64x2_t :: #simd[2]u64
|
||||
@@ -82,7 +82,7 @@ internal_block_literal_make :: proc (is_global: bool, user_data: rawptr, user_pr
|
||||
BLOCK_HAS_COPY_DISPOSE :: 1 << 25
|
||||
BLOCK_HAS_CTOR :: 1 << 26 // helpers have C++ code
|
||||
BLOCK_IS_GLOBAL :: 1 << 28
|
||||
BLOCK_HAS_STRET :: 1 << 29 // IFF BLOCK_HAS_SIGNATURE
|
||||
BLOCK_HAS_STRET :: 1 << 29 // if and only if (⟺) BLOCK_HAS_SIGNATURE
|
||||
BLOCK_HAS_SIGNATURE :: 1 << 30
|
||||
|
||||
bl.isa = is_global ? &_NSConcreteGlobalBlock : &_NSConcreteStackBlock
|
||||
|
||||
@@ -1683,7 +1683,20 @@ gb_internal void init_android_values(bool with_sdk) {
|
||||
gb_exit(1);
|
||||
}
|
||||
|
||||
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = concatenate_strings(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN, str_lit("sysroot/usr/lib/aarch64-linux-android/"));
|
||||
switch (bc->metrics.arch) {
|
||||
case TargetArch_arm64:
|
||||
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("aarch64-linux-android");
|
||||
break;
|
||||
case TargetArch_arm32:
|
||||
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("arm-linux-androideabi");
|
||||
break;
|
||||
case TargetArch_amd64:
|
||||
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("x86_64-linux-android");
|
||||
break;
|
||||
case TargetArch_i386:
|
||||
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("i686-linux-android");
|
||||
break;
|
||||
}
|
||||
|
||||
char buf[32] = {};
|
||||
gb_snprintf(buf, gb_size_of(buf), "%d/", bc->ODIN_ANDROID_API_LEVEL);
|
||||
@@ -1958,9 +1971,22 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
|
||||
} else if (metrics->os == TargetOs_linux && subtarget == Subtarget_Android) {
|
||||
switch (metrics->arch) {
|
||||
case TargetArch_arm64:
|
||||
bc->metrics.target_triplet = str_lit("aarch64-none-linux-android");
|
||||
bc->metrics.target_triplet = str_lit("aarch64-linux-android");
|
||||
bc->reloc_mode = RelocMode_PIC;
|
||||
break;
|
||||
case TargetArch_arm32:
|
||||
bc->metrics.target_triplet = str_lit("armv7a-linux-androideabi");
|
||||
bc->reloc_mode = RelocMode_PIC;
|
||||
break;
|
||||
case TargetArch_amd64:
|
||||
bc->metrics.target_triplet = str_lit("x86_64-linux-android");
|
||||
bc->reloc_mode = RelocMode_PIC;
|
||||
break;
|
||||
case TargetArch_i386:
|
||||
bc->metrics.target_triplet = str_lit("i686-linux-android");
|
||||
bc->reloc_mode = RelocMode_PIC;
|
||||
break;
|
||||
|
||||
default:
|
||||
GB_PANIC("Unknown architecture for -subtarget:android");
|
||||
}
|
||||
|
||||
@@ -2971,14 +2971,21 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper
|
||||
|
||||
if (check_is_assignable_to(c, x, y->type) ||
|
||||
check_is_assignable_to(c, y, x->type)) {
|
||||
if (x->type->failure || y->type->failure) {
|
||||
// // skip any failures
|
||||
x->mode = Addressing_Value;
|
||||
x->type = t_untyped_bool;
|
||||
return;
|
||||
}
|
||||
|
||||
Type *err_type = x->type;
|
||||
bool defined = false;
|
||||
switch (op) {
|
||||
case Token_CmpEq:
|
||||
case Token_NotEq:
|
||||
defined = (is_type_comparable(x->type) && is_type_comparable(y->type)) ||
|
||||
(is_operand_nil(*x) && type_has_nil(y->type)) ||
|
||||
(is_operand_nil(*y) && type_has_nil(x->type));
|
||||
defined = ((is_operand_nil(*x) && type_has_nil(y->type)) ||
|
||||
(is_operand_nil(*y) && type_has_nil(x->type)) ||
|
||||
is_type_comparable(x->type) && is_type_comparable(y->type));
|
||||
break;
|
||||
case Token_Lt:
|
||||
case Token_Gt:
|
||||
@@ -4476,9 +4483,9 @@ gb_internal void check_binary_expr(CheckerContext *c, Operand *x, Ast *node, Typ
|
||||
truncated: r = a - b*trunc(a/b)
|
||||
floored: r = a - b*floor(a/b)
|
||||
|
||||
IFF a/0 == 0, then (a%0 == a) or (a%%0 == a)
|
||||
IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0)
|
||||
IFF a/0 == 0b111..., then (a%0 == a) or (a%%0 == a)
|
||||
If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a)
|
||||
If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0)
|
||||
If and only if (⟺) a/0 == 0b111..., then (a%0 == a) or (a%%0 == a)
|
||||
*/
|
||||
|
||||
switch (zero_behaviour) {
|
||||
|
||||
@@ -1853,7 +1853,7 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
|
||||
if (is_using && (feature_flags & OptInFeatureFlag_UsingStmt) == 0) {
|
||||
ERROR_BLOCK();
|
||||
error(param, "'using' has been disallowed as it is considered bad practice to use as a statement/procedure parameter outside of immediate refactoring");
|
||||
error_line("\tIt you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n");
|
||||
error_line("\tIf you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n");
|
||||
}
|
||||
|
||||
if (type_expr == nullptr) {
|
||||
|
||||
@@ -2240,7 +2240,7 @@ gb_internal void add_type_info_type_internal(CheckerContext *c, Type *t) {
|
||||
|
||||
case Type_BitSet:
|
||||
add_type_info_type_internal(c, bt->BitSet.elem);
|
||||
add_type_info_type_internal(c, bt->BitSet.underlying);
|
||||
add_type_info_type_internal(c, bit_set_to_int(bt));
|
||||
break;
|
||||
|
||||
case Type_Pointer:
|
||||
@@ -2484,7 +2484,7 @@ gb_internal void add_min_dep_type_info(Checker *c, Type *t) {
|
||||
|
||||
case Type_BitSet:
|
||||
add_min_dep_type_info(c, bt->BitSet.elem);
|
||||
add_min_dep_type_info(c, bt->BitSet.underlying);
|
||||
add_min_dep_type_info(c, bit_set_to_int(bt));
|
||||
break;
|
||||
|
||||
case Type_Pointer:
|
||||
|
||||
@@ -676,7 +676,7 @@ try_cross_linking:;
|
||||
defer (gb_string_free(glue));
|
||||
|
||||
glue = gb_string_append_fmt(glue, "bin/clang");
|
||||
glue = gb_string_append_fmt(glue, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL);
|
||||
glue = gb_string_append_fmt(glue, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL);
|
||||
glue = gb_string_appendc(glue, "-c \"");
|
||||
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK.text, ODIN_ANDROID_NDK.len);
|
||||
glue = gb_string_appendc(glue, "sources/android/native_app_glue/android_native_app_glue.c");
|
||||
@@ -697,8 +697,9 @@ try_cross_linking:;
|
||||
|
||||
glue = gb_string_appendc(glue, "\"-I");
|
||||
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
|
||||
glue = gb_string_appendc(glue, "sysroot/usr/include/aarch64-linux-android/");
|
||||
glue = gb_string_appendc(glue, "\" ");
|
||||
glue = gb_string_appendc(glue, "sysroot/usr/include/");
|
||||
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.text, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.len);
|
||||
glue = gb_string_appendc(glue, "/\" ");
|
||||
|
||||
|
||||
glue = gb_string_appendc(glue, "-Wno-macro-redefined ");
|
||||
@@ -969,7 +970,7 @@ try_cross_linking:;
|
||||
gbString ndk_bin_directory = gb_string_make_length(temporary_allocator(), ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
|
||||
link_command_line = gb_string_appendc(link_command_line, ndk_bin_directory);
|
||||
link_command_line = gb_string_appendc(link_command_line, "bin/clang");
|
||||
link_command_line = gb_string_append_fmt(link_command_line, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL);
|
||||
link_command_line = gb_string_append_fmt(link_command_line, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL);
|
||||
} else {
|
||||
link_command_line = gb_string_appendc(link_command_line, clang_path);
|
||||
}
|
||||
|
||||
@@ -66,6 +66,19 @@ gb_internal String get_final_microarchitecture() {
|
||||
gb_internal String get_default_features() {
|
||||
BuildContext *bc = &build_context;
|
||||
|
||||
if (bc->microarch == str_lit("native")) {
|
||||
String features = make_string_c(LLVMGetHostCPUFeatures());
|
||||
|
||||
// Update the features string so LLVM uses it later.
|
||||
if (bc->target_features_string.len > 0) {
|
||||
bc->target_features_string = concatenate3_strings(permanent_allocator(), features, str_lit(","), bc->target_features_string);
|
||||
} else {
|
||||
bc->target_features_string = features;
|
||||
}
|
||||
|
||||
return features;
|
||||
}
|
||||
|
||||
int off = 0;
|
||||
for (int i = 0; i < bc->metrics.arch; i += 1) {
|
||||
off += target_microarch_counts[i];
|
||||
|
||||
@@ -1424,8 +1424,8 @@ gb_internal LLVMValueRef lb_integer_modulo(lbProcedure *p, LLVMValueRef lhs, LLV
|
||||
truncated: r = a - b*trunc(a/b)
|
||||
floored: r = a - b*floor(a/b)
|
||||
|
||||
IFF a/0 == 0, then (a%0 == a) or (a%%0 == a)
|
||||
IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0)
|
||||
If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a)
|
||||
If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0)
|
||||
*/
|
||||
|
||||
switch (behaviour) {
|
||||
|
||||
@@ -57,6 +57,33 @@ test_align_bumping_block_limit :: proc(t: ^testing.T) {
|
||||
testing.expect(t, len(data) == 896)
|
||||
}
|
||||
|
||||
@(test)
|
||||
test_large_minimum_block_size :: proc(t: ^testing.T) {
|
||||
a: virtual.Arena
|
||||
defer virtual.arena_destroy(&a)
|
||||
|
||||
init_err := virtual.arena_init_growing(&a, 16*mem.Megabyte)
|
||||
testing.expect_value(t, init_err, nil)
|
||||
|
||||
align : uint = 4
|
||||
for _ in 0..<6 {
|
||||
data, err := virtual.arena_alloc(&a, 18874368, align)
|
||||
testing.expect_value(t, err, nil)
|
||||
testing.expect(t, len(data) == 18874368)
|
||||
testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0)
|
||||
align *= 2
|
||||
virtual.arena_free_all(&a)
|
||||
}
|
||||
|
||||
align = 4
|
||||
for _ in 0..<32 {
|
||||
data, err := virtual.arena_alloc(&a, 1048576, align)
|
||||
testing.expect_value(t, err, nil)
|
||||
testing.expect(t, len(data) == 1048576)
|
||||
testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0)
|
||||
}
|
||||
}
|
||||
|
||||
@(test)
|
||||
tlsf_test_overlap_and_zero :: proc(t: ^testing.T) {
|
||||
default_allocator := context.allocator
|
||||
|
||||
2
vendor/lua/5.4/include/luaconf.h
vendored
2
vendor/lua/5.4/include/luaconf.h
vendored
@@ -71,7 +71,7 @@
|
||||
|
||||
|
||||
/*
|
||||
@@ LUAI_IS32INT is true iff 'int' has (at least) 32 bits.
|
||||
@@ LUAI_IS32INT is true if and only if (⟺) 'int' has (at least) 32 bits.
|
||||
*/
|
||||
#define LUAI_IS32INT ((UINT_MAX >> 30) >= 3)
|
||||
|
||||
|
||||
4
vendor/portmidi/portmidi.odin
vendored
4
vendor/portmidi/portmidi.odin
vendored
@@ -119,8 +119,8 @@ DeviceInfo :: struct {
|
||||
structVersion: c.int, /**< this internal structure version */
|
||||
interf: cstring, /**< underlying MIDI API, e.g. MMSystem or DirectX */
|
||||
name: cstring, /**< device name, e.g. USB MidiSport 1x1 */
|
||||
input: b32, /**< true iff input is available */
|
||||
output: b32, /**< true iff output is available */
|
||||
input: b32, /**< true if and only if (⟺) input is available */
|
||||
output: b32, /**< true if and only if (⟺) output is available */
|
||||
opened: b32, /**< used by generic PortMidi code to do error checking on arguments */
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user