Merge branch 'master' into bill/fixed-capacity-dynamic-array

This commit is contained in:
gingerBill
2026-03-15 11:41:01 +00:00
85 changed files with 1762 additions and 1008 deletions

View File

@@ -141,7 +141,7 @@ Type_Info_Struct :: struct {
flags: Type_Info_Struct_Flags,
// These are only set iff this structure is an SOA structure
// These are only set if and only if (⟺) this structure is an SOA structure
soa_kind: Type_Info_Struct_Soa_Kind,
soa_len: i32,
soa_base_type: ^Type_Info,

View File

@@ -1525,7 +1525,7 @@ card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int {
// Evaluates the condition and panics the program iff the condition is false.
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
// This uses the `context.assertion_failure_procedure` to assert.
//
// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
@@ -1549,7 +1549,7 @@ assert :: proc(condition: bool, message := #caller_expression(condition), loc :=
}
}
// Evaluates the condition and panics the program iff the condition is false.
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
// This uses the `context.assertion_failure_procedure` to assert.
// This routine ignores `ODIN_DISABLE_ASSERT`, and will always execute.
@builtin
@@ -1589,7 +1589,7 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! {
p("not yet implemented", message, loc)
}
// Evaluates the condition and panics the program iff the condition is false.
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
// This uses the `default_assertion_contextless_failure_proc` to assert.
//
// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
@@ -1609,7 +1609,7 @@ assert_contextless :: proc "contextless" (condition: bool, message := #caller_ex
}
}
// Evaluates the condition and panics the program iff the condition is false.
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
// This uses the `default_assertion_contextless_failure_proc` to assert.
@builtin
ensure_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) {

View File

@@ -136,7 +136,7 @@ chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) {
//
// LLVM appears not to consider "this instruction is totally
// awful on the given microarchitecture", which leads to
// `VPCOMPRESSED` being generated iff AVX512 support is
// `VPCOMPRESSED` being generated if and only if (⟺) AVX512 support is
// enabled for `intrinsics.simd_masked_compress_store`.
// On Zen 4, this leads to a 50% performance regression vs
// the 128-bit SIMD code.

View File

@@ -45,7 +45,7 @@ reader_init_with_buf :: proc(b: ^Reader, rd: io.Reader, buf: []byte) {
b.buf = buf
}
// reader_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
// reader_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
reader_destroy :: proc(b: ^Reader) {
delete(b.buf, b.buf_allocator)
b^ = {}

View File

@@ -35,7 +35,7 @@ writer_init_with_buf :: proc(b: ^Writer, wr: io.Writer, buf: []byte) {
b.buf = buf
}
// writer_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
// writer_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
writer_destroy :: proc(b: ^Writer) {
delete(b.buf, b.buf_allocator)
b^ = {}

View File

@@ -1460,7 +1460,7 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc
return subslices[:]
}
// alias returns true iff a and b have a non-zero length, and any part of
// alias returns true if and only if (⟺) a and b have a non-zero length, and any part of
// a overlaps with b.
alias :: proc "contextless" (a, b: []byte) -> bool {
a_len, b_len := len(a), len(b)
@@ -1474,7 +1474,7 @@ alias :: proc "contextless" (a, b: []byte) -> bool {
return a_start <= b_end && b_start <= a_end
}
// alias_inexactly returns true iff a and b have a non-zero length,
// alias_inexactly returns true if and only if (⟺) a and b have a non-zero length,
// the base pointer of a and b are NOT equal, and any part of a overlaps
// with b (ie: `alias(a, b)` with an exception that returns false for
// `a == b`, `b = a[:len(a)-69]` and similar conditions).

View File

@@ -100,20 +100,20 @@ len :: proc "contextless" (t: ^$T/Tree($Value)) -> int {
return t._size
}
// first returns the first node in the tree (in-order) or nil iff
// first returns the first node in the tree (in-order) or nil if and only if (⟺)
// the tree is empty.
first :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
return tree_first_or_last_in_order(t, Direction.Backward)
}
// last returns the last element in the tree (in-order) or nil iff
// last returns the last element in the tree (in-order) or nil if and only if (⟺)
// the tree is empty.
last :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
return tree_first_or_last_in_order(t, Direction.Forward)
}
// find finds the value in the tree, and returns the corresponding
// node or nil iff the value is not present.
// node or nil if and only if (⟺) the value is not present.
find :: proc(t: ^$T/Tree($Value), value: Value) -> ^Node(Value) {
cur := t._root
descend_loop: for cur != nil {
@@ -168,7 +168,7 @@ find_or_insert :: proc(
return
}
// remove removes a node or value from the tree, and returns true iff the
// remove removes a node or value from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove :: proc {
@@ -176,7 +176,7 @@ remove :: proc {
remove_node,
}
// remove_value removes a value from the tree, and returns true iff the
// remove_value removes a value from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = true) -> bool {
@@ -187,7 +187,7 @@ remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = t
return remove_node(t, n, call_on_remove)
}
// remove_node removes a node from the tree, and returns true iff the
// remove_node removes a node from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove_node :: proc(t: ^$T/Tree($Value), node: ^Node(Value), call_on_remove: bool = true) -> bool {
@@ -281,14 +281,14 @@ iterator_from_pos :: proc "contextless" (
}
// iterator_get returns the node currently pointed to by the iterator,
// or nil iff the node has been removed, the tree is empty, or the end
// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
// of the tree has been reached.
iterator_get :: proc "contextless" (it: ^$I/Iterator($Value)) -> ^Node(Value) {
return it._cur
}
// iterator_remove removes the node currently pointed to by the iterator,
// and returns true iff the removal was successful. Semantics are the
// and returns true if and only if (⟺) the removal was successful. Semantics are the
// same as the Tree remove.
iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -> bool {
if it._cur == nil {
@@ -304,7 +304,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -
}
// iterator_next advances the iterator and returns the (node, true) or
// or (nil, false) iff the end of the tree has been reached.
// or (nil, false) if and only if (⟺) the end of the tree has been reached.
//
// Note: The first call to iterator_next will return the first node instead
// of advancing the iterator.

View File

@@ -95,19 +95,19 @@ len :: proc "contextless" (t: $T/Tree($Key, $Value)) -> (node_count: int) {
return t._size
}
// first returns the first node in the tree (in-order) or nil iff
// first returns the first node in the tree (in-order) or nil if and only if (⟺)
// the tree is empty.
first :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
return tree_first_or_last_in_order(t, Direction.Backward)
}
// last returns the last element in the tree (in-order) or nil iff
// last returns the last element in the tree (in-order) or nil if and only if (⟺)
// the tree is empty.
last :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
return tree_first_or_last_in_order(t, Direction.Forward)
}
// find finds the key in the tree, and returns the corresponding node, or nil iff the value is not present.
// find finds the key in the tree, and returns the corresponding node, or nil if and only if (⟺) the value is not present.
find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
node = t._root
for node != nil {
@@ -120,7 +120,7 @@ find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
return node
}
// find_value finds the key in the tree, and returns the corresponding value, or nil iff the value is not present.
// find_value finds the key in the tree, and returns the corresponding value, or nil if and only if (⟺) the value is not present.
find_value :: proc(t: $T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok {
if n := find(t, key); n != nil {
return n.value, true
@@ -154,7 +154,7 @@ find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n:
return n, true, nil
}
// remove removes a node or value from the tree, and returns true iff the
// remove removes a node or value from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove :: proc {
@@ -162,7 +162,7 @@ remove :: proc {
remove_node,
}
// remove_value removes a value from the tree, and returns true iff the
// remove_value removes a value from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's key + value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) -> bool {
@@ -173,7 +173,7 @@ remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true)
return remove_node(t, n, call_on_remove)
}
// remove_node removes a node from the tree, and returns true iff the
// remove_node removes a node from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's key + value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove_node :: proc(t: ^$T/Tree($Key, $Value), node: ^$N/Node(Key, Value), call_on_remove := true) -> (found: bool) {
@@ -235,14 +235,14 @@ iterator_from_pos :: proc "contextless" (t: ^$T/Tree($Key, $Value), pos: ^Node(K
}
// iterator_get returns the node currently pointed to by the iterator,
// or nil iff the node has been removed, the tree is empty, or the end
// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
// of the tree has been reached.
iterator_get :: proc "contextless" (it: ^$I/Iterator($Key, $Value)) -> ^Node(Key, Value) {
return it._cur
}
// iterator_remove removes the node currently pointed to by the iterator,
// and returns true iff the removal was successful. Semantics are the
// and returns true if and only if (⟺) the removal was successful. Semantics are the
// same as the Tree remove.
iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = true) -> bool {
if it._cur == nil {
@@ -258,7 +258,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = t
}
// iterator_next advances the iterator and returns the (node, true) or
// or (nil, false) iff the end of the tree has been reached.
// or (nil, false) if and only if (⟺) the end of the tree has been reached.
//
// Note: The first call to iterator_next will return the first node instead
// of advancing the iterator.

View File

@@ -0,0 +1,69 @@
package aes_hw
@(require) import "core:sys/info"
// is_supported returns true if and only if (⟺) hardware accelerated AES
// is supported.
is_supported :: proc "contextless" () -> bool {
when ODIN_ARCH == .amd64 {
// Note: Everything with AES-NI has support for
// the required SSE extensions.
req_features :: info.CPU_Features{
.sse2,
.ssse3,
.sse41,
.aes,
}
return info.cpu_features() >= req_features
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
req_features :: info.CPU_Features{
.asimd,
.aes,
}
return info.cpu_features() >= req_features
} else {
return false
}
}
// is_ghash_supported returns true if and only if (⟺) hardware accelerated
// GHASH is supported.
is_ghash_supported :: proc "contextless" () -> bool {
// Just having hardware GHASH is silly.
if !is_supported() {
return false
}
when ODIN_ARCH == .amd64 {
return info.cpu_features() >= info.CPU_Features{
.pclmulqdq,
}
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32{
// Once we can actually use this, we can re-enable this.
//
// return info.cpu_features() >= info.CPU_Features{
// .pmull,
// }
return false
} else {
return false
}
}
// Context is a keyed AES (ECB) instance.
Context :: struct {
// Note: The ideal thing to do is for the expanded round keys to be
// arrays of `u8x16`, however that implies alignment (or using AVX).
//
// All the people using e-waste processors that don't support an
// instruction set that has been around for over 10 years are why
// we can't have nice things.
_sk_exp_enc: [15][16]byte,
_sk_exp_dec: [15][16]byte,
_num_rounds: int,
}
// init initializes a context for AES with the provided key.
init :: proc(ctx: ^Context, key: []byte) {
keysched(ctx, key)
}

View File

@@ -21,7 +21,7 @@
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#+build amd64
package aes_hw_intel
package aes_hw
import "base:intrinsics"
import "core:crypto/_aes"

View File

@@ -0,0 +1,115 @@
#+build arm64,arm32
package aes_hw
import "core:simd"
import "core:simd/arm"
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/
TARGET_FEATURES :: "neon,aes"
HAS_GHASH :: false // Temporary
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return simd.bit_xor(arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{})), key)
}
@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key)
}
@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return simd.bit_xor(arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{})), key)
}
@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key)
}
aesimc :: arm.vaesimcq_u8
@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
a := arm.vaeseq_u8(data, simd.u8x16{}) // AESE does ShiftRows and SubBytes on A
// Undo ShiftRows step from AESE and extract X1 and X3
dest := simd.swizzle(
a,
0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1)
0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1))
0x0c, 0x09, 0x06, 0x03, // SubBytes(X3)
0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3))
)
rcons := simd.u8x16{
0, 0, 0, 0,
IMM8, 0, 0, 0,
0, 0, 0, 0,
IMM8, 0, 0, 0,
}
return simd.bit_xor(dest, rcons)
}
// The keyschedule implementation is easier to read with some extra
// Intel intrinsics that are emulated by built-in LLVM ops anyway.
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
shift :: IMM8 & 0xff
// This needs to emit behavior identical to PSLLDQ which is as follows:
//
// TEMP := COUNT
// IF (TEMP > 15) THEN TEMP := 16; FI
// DEST := DEST << (TEMP * 8)
// DEST[MAXVL-1:128] (Unmodified)
return simd.shuffle(
simd.u8x16{},
a,
0 when shift > 15 else (16 - shift + 0),
1 when shift > 15 else (16 - shift + 1),
2 when shift > 15 else (16 - shift + 2),
3 when shift > 15 else (16 - shift + 3),
4 when shift > 15 else (16 - shift + 4),
5 when shift > 15 else (16 - shift + 5),
6 when shift > 15 else (16 - shift + 6),
7 when shift > 15 else (16 - shift + 7),
8 when shift > 15 else (16 - shift + 8),
9 when shift > 15 else (16 - shift + 9),
10 when shift > 15 else (16 - shift + 10),
11 when shift > 15 else (16 - shift + 11),
12 when shift > 15 else (16 - shift + 12),
13 when shift > 15 else (16 - shift + 13),
14 when shift > 15 else (16 - shift + 14),
15 when shift > 15 else (16 - shift + 15),
)
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
v := transmute(simd.i32x4)a
return transmute(simd.u8x16)simd.shuffle(
v,
v,
IMM8 & 0b11,
(IMM8 >> 2) & 0b11,
(IMM8 >> 4) & 0b11,
(IMM8 >> 6) & 0b11,
)
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
return transmute(simd.u8x16)simd.shuffle(
transmute(simd.u32x4)(a),
transmute(simd.u32x4)(b),
u32(MASK) & 0b11,
(u32(MASK)>>2) & 0b11,
((u32(MASK)>>4) & 0b11)+4,
((u32(MASK)>>6) & 0b11)+4)
}

View File

@@ -0,0 +1,55 @@
#+build amd64
package aes_hw
import "core:simd"
import "core:simd/x86"
// Intel/RISC-V semantics.
TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes"
HAS_GHASH :: true
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
}
@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
}
@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
}
@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
}
@(require_results, enable_target_feature = "aes")
aesimc :: #force_inline proc "c" (data: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesimc_si128(transmute(x86.__m128i)(data)))
}
@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8))
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8))
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8))
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK))
}

View File

@@ -0,0 +1,181 @@
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#+build amd64,arm32
package aes_hw
import "base:intrinsics"
import "core:crypto"
import "core:crypto/_aes"
import "core:simd"
// Inspiration taken from BearSSL's AES-NI implementation.
//
// Note: This assumes that the SROA optimization pass is enabled to be
// anything resembling performant; otherwise, LLVM will not elide a massive
// number of redundant loads/stores it generates for every intrinsic call.
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
k1, k2 := k1, k2
k2 = _mm_shuffle_epi32(k2, 0xff)
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
return simd.bit_xor(k1, k2)
}
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) {
k1, k2, k3 := k1_^, k2_^, k3
k3 = _mm_shuffle_epi32(k3, 0x55)
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, k3)
tmp := k2
k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))
k1_, k2_ := k1_, k2_
k1_^, k2_^ = k1, k2
r1 := _mm_shuffle_ps(tmp, k1, 0x44)
r2 := _mm_shuffle_ps(k1, k2, 0x4e)
return r1, r2
}
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 {
k1, k2, k3 := k1_^, k2_^, k3
k3 = _mm_shuffle_epi32(k3, 0x55)
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, k3)
k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))
k1_, k2_ := k1_, k2_
k1_^, k2_^ = k1, k2
return k1
}
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
k1, k2 := k1, k2
k2 = _mm_shuffle_epi32(k2, 0xaa)
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
return simd.bit_xor(k1, k2)
}
@(private = "file", enable_target_feature = TARGET_FEATURES)
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) {
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds])
for i in 1 ..< num_rounds {
tmp := aesimc(sks[i])
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
}
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0])
}
@(private, enable_target_feature = TARGET_FEATURES)
keysched :: proc(ctx: ^Context, key: []byte) {
sks: [15]simd.u8x16 = ---
// Compute the encryption keys.
num_rounds, key_len := 0, len(key)
switch key_len {
case _aes.KEY_SIZE_128:
sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01))
sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02))
sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04))
sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08))
sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10))
sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20))
sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40))
sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80))
sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b))
sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36))
num_rounds = _aes.ROUNDS_128
case _aes.KEY_SIZE_192:
k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
k1_tmp: [16]byte
copy(k1_tmp[:], key[16:24])
k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp))
crypto.zero_explicit(&k1_tmp, size_of(k1_tmp))
sks[0] = k0
sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01))
sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02))
sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04))
sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08))
sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10))
sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20))
sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40))
sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80))
num_rounds = _aes.ROUNDS_192
case _aes.KEY_SIZE_256:
sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01))
sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01))
sks[4] = expand_step128(sks[2], aeskeygenassist(sks[3], 0x02))
sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02))
sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04))
sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04))
sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08))
sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08))
sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10))
sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10))
sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20))
sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20))
sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40))
num_rounds = _aes.ROUNDS_256
case:
panic("crypto/aes: invalid AES key size")
}
for i in 0 ..= num_rounds {
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i])
}
// Compute the decryption keys. GCM and CTR do not need this, however
// ECB, CBC, OCB3, etc do.
derive_dec_keys(ctx, &sks, num_rounds)
ctx._num_rounds = num_rounds
crypto.zero_explicit(&sks, size_of(sks))
}

View File

@@ -0,0 +1,11 @@
#+build !amd64
#+build !arm64
#+build !arm32
package aes_hw
HAS_GHASH :: false
@(private)
keysched :: proc(ctx: ^Context, key: []byte) {
panic("crypto/aes: hardware implementation unsupported")
}

View File

@@ -1,38 +0,0 @@
#+build amd64
package aes_hw_intel
import "core:sys/info"
// is_supported returns true iff hardware accelerated AES
// is supported.
is_supported :: proc "contextless" () -> bool {
// Note: Everything with AES-NI and PCLMULQDQ has support for
// the required SSE extxtensions.
req_features :: info.CPU_Features{
.sse2,
.ssse3,
.sse41,
.aes,
.pclmulqdq,
}
return info.cpu_features() >= req_features
}
// Context is a keyed AES (ECB) instance.
Context :: struct {
// Note: The ideal thing to do is for the expanded round keys to be
// arrays of `__m128i`, however that implies alignment (or using AVX).
//
// All the people using e-waste processors that don't support an
// insturction set that has been around for over 10 years are why
// we can't have nice things.
_sk_exp_enc: [15][16]byte,
_sk_exp_dec: [15][16]byte,
_num_rounds: int,
}
// init initializes a context for AES with the provided key.
init :: proc(ctx: ^Context, key: []byte) {
keysched(ctx, key)
}

View File

@@ -1,200 +0,0 @@
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#+build amd64
package aes_hw_intel
import "base:intrinsics"
import "core:crypto/_aes"
import "core:simd/x86"
// Intel AES-NI based implementation. Inspiration taken from BearSSL.
//
// Note: This assumes that the SROA optimization pass is enabled to be
// anything resembling performat otherwise, LLVM will not elide a massive
// number of redundant loads/stores it generates for every intrinsic call.
@(private = "file", require_results, enable_target_feature = "sse2")
expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
k1, k2 := k1, k2
k2 = x86._mm_shuffle_epi32(k2, 0xff)
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
return x86._mm_xor_si128(k1, k2)
}
@(private = "file", require_results, enable_target_feature = "sse,sse2")
expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
k1, k2, k3 := k1_^, k2_^, k3
k3 = x86._mm_shuffle_epi32(k3, 0x55)
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, k3)
tmp := k2
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
k1_, k2_ := k1_, k2_
k1_^, k2_^ = k1, k2
r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
return r1, r2
}
@(private = "file", require_results, enable_target_feature = "sse2")
expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
k1, k2, k3 := k1_^, k2_^, k3
k3 = x86._mm_shuffle_epi32(k3, 0x55)
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, k3)
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
k1_, k2_ := k1_, k2_
k1_^, k2_^ = k1, k2
return k1
}
@(private = "file", require_results, enable_target_feature = "sse2")
expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
k1, k2 := k1, k2
k2 = x86._mm_shuffle_epi32(k2, 0xaa)
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
return x86._mm_xor_si128(k1, k2)
}
// derive_dec_keys fills ctx._sk_exp_dec from the expanded encryption
// keys in sks, for the Equivalent Inverse Cipher: the encryption round
// keys in reverse order, with InvMixColumns (aesimc) applied to every
// key except the first and last.
@(private = "file", enable_target_feature = "aes")
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
for i in 1 ..< num_rounds {
tmp := x86._mm_aesimc_si128(sks[i])
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
}
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
}
// keysched expands `key` into the per-round encryption and decryption
// key schedules stored in ctx, using the AES-NI keygen-assist helpers
// above.  Panics if len(key) is not a valid AES key size (16/24/32).
@(private, enable_target_feature = "sse,sse2,aes")
keysched :: proc(ctx: ^Context, key: []byte) {
sks: [15]x86.__m128i = ---
// Compute the encryption keys.
num_rounds, key_len := 0, len(key)
switch key_len {
case _aes.KEY_SIZE_128:
// AES-128: one expand_step128 per round, rcon 0x01..0x36.
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
num_rounds = _aes.ROUNDS_128
case _aes.KEY_SIZE_192:
// AES-192: key is 128+64 bits, so round keys straddle 128-bit lanes;
// each a/b step pair yields three packed round keys.
k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
// Load the trailing 8 key bytes into the low lane, zero the high lane.
k1 := x86.__m128i{
intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
0,
}
sks[0] = k0
sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
num_rounds = _aes.ROUNDS_192
case _aes.KEY_SIZE_256:
// AES-256: alternate the 0xff step (with rcon, expand_step128) and
// the 0xaa step (no rcon, expand_step256b).
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
num_rounds = _aes.ROUNDS_256
case:
panic("crypto/aes: invalid AES key size")
}
for i in 0 ..= num_rounds {
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
}
// Compute the decryption keys. GCM and CTR do not need this, however
// ECB, CBC, OCB3, etc do.
derive_dec_keys(ctx, &sks, num_rounds)
ctx._num_rounds = num_rounds
zero_explicit(&sks, size_of(sks)) // scrub key material from the stack
}
/*
Set each byte of a memory range to zero.
This procedure copies the value `0` into the `len` bytes of a memory range,
starting at address `data`.
This procedure returns the pointer to `data`.
Unlike the `zero()` procedure, which can be optimized away or reordered by the
compiler under certain circumstances, `zero_explicit()` procedure can not be
optimized away or reordered with other memory access operations, and the
compiler assumes volatile semantics of the memory.
*/
// zero_explicit is "contextless" so it is callable from procedures that
// have no Odin context (e.g. other "contextless" crypto primitives).
zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr {
// This routine tries to avoid the compiler optimizing away the call,
// so that it is always executed. It is intended to provide
// equivalent semantics to those provided by the C11 Annex K 3.7.4.1
// memset_s call.
intrinsics.mem_zero_volatile(data, len) // Use the volatile mem_zero
intrinsics.atomic_thread_fence(.Seq_Cst) // Prevent reordering
return data
}

View File

@@ -215,7 +215,7 @@ _store_simd128 :: #force_inline proc "contextless" (
intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3)
}
// is_performant returns true iff the target and current host both support
// is_performant returns true if and only if (⟺) the target and current host both support
// "enough" 128-bit SIMD to make this implementation performant.
is_performant :: proc "contextless" () -> bool {
when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 {

View File

@@ -36,7 +36,7 @@ _VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0}
@(private = "file")
_VEC_TWO: simd.u64x4 : {2, 0, 2, 0}
// is_performant returns true iff the target and current host both support
// is_performant returns true if and only if (⟺) the target and current host both support
// "enough" SIMD to make this implementation performant.
is_performant :: proc "contextless" () -> bool {
req_features :: info.CPU_Features{.avx, .avx2}

View File

@@ -69,7 +69,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
tmp: Montgomery_Domain_Field_Element = ---
fe_sub(&tmp, arg1, arg2)
// This will only underflow iff arg1 == arg2, and we return the borrow,
// This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
// which will be 1.
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))

View File

@@ -75,7 +75,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
tmp: Montgomery_Domain_Field_Element = ---
fe_sub(&tmp, arg1, arg2)
// This will only underflow iff arg1 == arg2, and we return the borrow,
// This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
// which will be 1.
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))

View File

@@ -5,17 +5,17 @@ package _subtle
import "core:math/bits"
// byte_eq returns 1 iff a == b, 0 otherwise.
// byte_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
@(optimization_mode="none")
byte_eq :: proc "contextless" (a, b: byte) -> int {
v := a ~ b
// v == 0 iff a == b. The subtraction will underflow, setting the
// v == 0 if and only if (⟺) a == b. The subtraction will underflow, setting the
// sign bit, which will get returned.
return int((u32(v)-1) >> 31)
}
// u64_eq returns 1 iff a == b, 0 otherwise.
// u64_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
@(optimization_mode="none")
u64_eq :: proc "contextless" (a, b: u64) -> u64 {
_, borrow := bits.sub_u64(0, a ~ b, 0)
@@ -27,14 +27,14 @@ eq :: proc {
u64_eq,
}
// u64_is_zero returns 1 iff a == 0, 0 otherwise.
// u64_is_zero returns 1 if and only if (⟺) a == 0, 0 otherwise.
@(optimization_mode="none")
u64_is_zero :: proc "contextless" (a: u64) -> u64 {
_, borrow := bits.sub_u64(a, 1, 0)
return borrow
}
// u64_is_non_zero returns 1 iff a != 0, 0 otherwise.
// u64_is_non_zero returns 1 if and only if (⟺) a != 0, 0 otherwise.
@(optimization_mode="none")
u64_is_non_zero :: proc "contextless" (a: u64) -> u64 {
is_zero := u64_is_zero(a)

View File

@@ -13,7 +13,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte,
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided algorithm, key, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and ciphertext MUST alias exactly or not at all.

View File

@@ -183,7 +183,7 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
// open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -144,7 +144,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -0,0 +1,397 @@
#+build amd64,arm32
package aegis
import "base:intrinsics"
import "core:crypto"
import aes_hw "core:crypto/_aes/hw"
import "core:encoding/endian"
import "core:simd"
// State_HW is the hardware-accelerated AEGIS state: eight 128-bit state
// blocks (AEGIS-128L uses s0..s7; AEGIS-256 uses only s0..s5), plus the
// absorb/squeeze rate in bytes for the selected variant.
@(private)
State_HW :: struct {
s0: simd.u8x16,
s1: simd.u8x16,
s2: simd.u8x16,
s3: simd.u8x16,
s4: simd.u8x16,
s5: simd.u8x16,
s6: simd.u8x16,
s7: simd.u8x16,
// Rate in bytes (_RATE_128L or _RATE_256), set by init_hw.
rate: int,
}
// TARGET_FEATURES selects the CPU features required by the hardware AES
// round helpers (aes_hw.aesenc et al) on each architecture.
// NOTE(review): the file's build tag reads `#+build amd64,arm32`, yet
// this branch also matches .arm64 — confirm whether the tag should be
// `amd64, arm64, arm32` (arm32 vs arm64 typo?).
when ODIN_ARCH == .amd64 {
@(private="file")
TARGET_FEATURES :: "sse2,aes"
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
@(private="file")
TARGET_FEATURES :: "neon,aes"
}
// is_hardware_accelerated returns true if and only if hardware
// accelerated AEGIS is supported on the current host.
is_hardware_accelerated :: proc "contextless" () -> bool {
return aes_hw.is_supported()
}
// init_hw initializes the AEGIS state from ctx._key and iv, following
// the AEGIS-128L or AEGIS-256 Init() procedure, selected by ctx._key_len.
// Callers are expected to pass an iv of the size matching the variant.
@(private, enable_target_feature = TARGET_FEATURES)
init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
switch ctx._key_len {
case KEY_SIZE_128L:
key := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
iv := intrinsics.unaligned_load((^simd.u8x16)(raw_data(iv)))
st.s0 = simd.bit_xor(key, iv)
st.s1 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))
st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
st.s3 = st.s1
st.s4 = st.s0
st.s5 = simd.bit_xor(key, st.s2) // key ^ C0
st.s6 = simd.bit_xor(key, st.s1) // key ^ C1
st.s7 = st.s5
st.rate = _RATE_128L
// 10 initialization rounds, absorbing (iv, key) each round.
for _ in 0 ..< 10 {
update_hw_128l(st, iv, key)
}
case KEY_SIZE_256:
k0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
k1 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[16]))
n0 := intrinsics.unaligned_load((^simd.u8x16)(&iv[0]))
n1 := intrinsics.unaligned_load((^simd.u8x16)(&iv[16]))
st.s0 = simd.bit_xor(k0, n0)
st.s1 = simd.bit_xor(k1, n1)
st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))
st.s3 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
st.s4 = simd.bit_xor(k0, st.s3) // k0 ^ C0
st.s5 = simd.bit_xor(k1, st.s2) // k1 ^ C1
st.rate = _RATE_256
// u0/u1 capture k0^n0 and k1^n1 before the state is updated.
u0, u1 := st.s0, st.s1
// 4 x 4 = 16 initialization rounds, cycling k0, k1, k0^n0, k1^n1.
for _ in 0 ..< 4 {
update_hw_256(st, k0)
update_hw_256(st, k1)
update_hw_256(st, u0)
update_hw_256(st, u1)
}
}
}
// update_hw_128l is the AEGIS-128L Update() function: each state block
// is AES-round encrypted into its successor (wrapping s7 -> s0), with
// the message blocks m0/m1 mixed into the s0 and s4 inputs.  All new
// values are computed from the pre-update state before any is stored.
@(private = "file", enable_target_feature = TARGET_FEATURES)
update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: simd.u8x16) {
s0_ := aes_hw.aesenc(st.s7, simd.bit_xor(st.s0, m0))
s1_ := aes_hw.aesenc(st.s0, st.s1)
s2_ := aes_hw.aesenc(st.s1, st.s2)
s3_ := aes_hw.aesenc(st.s2, st.s3)
s4_ := aes_hw.aesenc(st.s3, simd.bit_xor(st.s4, m1))
s5_ := aes_hw.aesenc(st.s4, st.s5)
s6_ := aes_hw.aesenc(st.s5, st.s6)
s7_ := aes_hw.aesenc(st.s6, st.s7)
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_
}
// update_hw_256 is the AEGIS-256 Update() function: six state blocks,
// each AES-round encrypted into its successor (wrapping s5 -> s0), with
// the message block m mixed into the s0 input.  All new values are
// computed from the pre-update state before any is stored.
@(private = "file", enable_target_feature = TARGET_FEATURES)
update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: simd.u8x16) {
s0_ := aes_hw.aesenc(st.s5, simd.bit_xor(st.s0, m))
s1_ := aes_hw.aesenc(st.s0, st.s1)
s2_ := aes_hw.aesenc(st.s1, st.s2)
s3_ := aes_hw.aesenc(st.s2, st.s3)
s4_ := aes_hw.aesenc(st.s3, st.s4)
s5_ := aes_hw.aesenc(st.s4, st.s5)
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_
}
// absorb_hw_128l absorbs one full rate-sized (32-byte) AAD block:
// the two 16-byte halves are fed to Update as (t0, t1).
@(private = "file", enable_target_feature = TARGET_FEATURES)
absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
t0 := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
t1 := intrinsics.unaligned_load((^simd.u8x16)(&ai[16]))
update_hw_128l(st, t0, t1)
}
// absorb_hw_256 absorbs one full rate-sized (16-byte) AAD block.
@(private = "file", enable_target_feature = TARGET_FEATURES)
absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
m := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
update_hw_256(st, m)
}
// absorb_hw absorbs the entire AAD into the state at the variant's rate,
// zero-padding the final partial block (if any) to rate size.
@(private, enable_target_feature = TARGET_FEATURES)
absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
ai, l := aad, len(aad)
// Absorb all full rate-sized blocks.
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
absorb_hw_128l(st, ai)
ai = ai[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
absorb_hw_256(st, ai)
ai = ai[_RATE_256:]
l -= _RATE_256
}
}
// Pad out the remainder with `0`s till it is rate sized.
if l > 0 {
tmp: [_RATE_MAX]byte // AAD is not confidential.
copy(tmp[:], ai)
switch st.rate {
case _RATE_128L:
absorb_hw_128l(st, tmp[:])
case _RATE_256:
absorb_hw_256(st, tmp[:])
}
}
}
// z_hw_128l computes the AEGIS-128L keystream blocks from the state:
// z0 = s6 ^ s1 ^ (s2 & s3), z1 = s2 ^ s5 ^ (s6 & s7).
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (simd.u8x16, simd.u8x16) {
z0 := simd.bit_xor(
st.s6,
simd.bit_xor(
st.s1,
simd.bit_and(st.s2, st.s3),
),
)
z1 := simd.bit_xor(
st.s2,
simd.bit_xor(
st.s5,
simd.bit_and(st.s6, st.s7),
),
)
return z0, z1
}
// z_hw_256 computes the AEGIS-256 keystream block from the state:
// z = s1 ^ s4 ^ s5 ^ (s2 & s3).
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> simd.u8x16 {
return simd.bit_xor(
st.s1,
simd.bit_xor(
st.s4,
simd.bit_xor(
st.s5,
simd.bit_and(st.s2, st.s3),
),
),
)
}
// enc_hw_128l encrypts one full rate-sized (32-byte) block: the state is
// updated with the plaintext (xi), and the ciphertext (ci) is the
// plaintext XORed with the keystream derived from the pre-update state.
@(private = "file", enable_target_feature = TARGET_FEATURES)
enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
z0, z1 := z_hw_128l(st) // keystream from the pre-update state
t0 := intrinsics.unaligned_load((^simd.u8x16)(&xi[0]))
t1 := intrinsics.unaligned_load((^simd.u8x16)(&xi[16]))
update_hw_128l(st, t0, t1)
out0 := simd.bit_xor(t0, z0)
out1 := simd.bit_xor(t1, z1)
intrinsics.unaligned_store((^simd.u8x16)(&ci[0]), out0)
intrinsics.unaligned_store((^simd.u8x16)(&ci[16]), out1)
}
// enc_hw_256 encrypts one full rate-sized (16-byte) block: the state is
// updated with the plaintext, and the ciphertext is the plaintext XORed
// with the keystream derived from the pre-update state.
@(private = "file", enable_target_feature = TARGET_FEATURES)
enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
z := z_hw_256(st) // keystream from the pre-update state
xi_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(xi)))
update_hw_256(st, xi_)
ci_ := simd.bit_xor(xi_, z)
intrinsics.unaligned_store((^simd.u8x16)(raw_data(ci)), ci_)
}
// enc_hw encrypts src into dst at the variant's rate.  The trailing
// partial block is zero-padded before being absorbed, and only the
// first l ciphertext bytes are copied out.  dst must be at least
// len(src) bytes.
@(private, enable_target_feature = TARGET_FEATURES)
enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
ci, xi, l := dst, src, len(src)
// Encrypt all full rate-sized blocks.
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
enc_hw_128l(st, ci, xi)
ci = ci[_RATE_128L:]
xi = xi[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
enc_hw_256(st, ci, xi)
ci = ci[_RATE_256:]
xi = xi[_RATE_256:]
l -= _RATE_256
}
}
// Pad out the remainder with `0`s till it is rate sized.
if l > 0 {
tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
copy(tmp[:], xi)
switch st.rate {
case _RATE_128L:
enc_hw_128l(st, tmp[:], tmp[:])
case _RATE_256:
enc_hw_256(st, tmp[:], tmp[:])
}
copy(ci, tmp[:l]) // only the first l ciphertext bytes are output
}
}
// dec_hw_128l decrypts one full rate-sized (32-byte) block: the
// plaintext (xi) is the ciphertext (ci) XORed with the keystream, and
// the state is then updated with the recovered plaintext.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
z0, z1 := z_hw_128l(st) // keystream from the pre-update state
t0 := intrinsics.unaligned_load((^simd.u8x16)(&ci[0]))
t1 := intrinsics.unaligned_load((^simd.u8x16)(&ci[16]))
out0 := simd.bit_xor(t0, z0)
out1 := simd.bit_xor(t1, z1)
update_hw_128l(st, out0, out1) // absorb the plaintext
intrinsics.unaligned_store((^simd.u8x16)(&xi[0]), out0)
intrinsics.unaligned_store((^simd.u8x16)(&xi[16]), out1)
}
// dec_hw_256 decrypts one full rate-sized (16-byte) block: the plaintext
// is the ciphertext XORed with the keystream, and the state is then
// updated with the recovered plaintext.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
z := z_hw_256(st) // keystream from the pre-update state
ci_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(ci)))
xi_ := simd.bit_xor(ci_, z)
update_hw_256(st, xi_) // absorb the plaintext
intrinsics.unaligned_store((^simd.u8x16)(raw_data(xi)), xi_)
}
// dec_partial_hw_128l decrypts the final partial (< rate) block: the
// keystream is applied to the zero-padded ciphertext, the first len(xn)
// plaintext bytes are copied out, then the plaintext buffer is re-zeroed
// past len(xn) before being absorbed, so the state update matches the
// encryptor's zero-padded input.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
tmp: [_RATE_128L]byte
defer crypto.zero_explicit(&tmp, size_of(tmp)) // plaintext is confidential
z0, z1 := z_hw_128l(st)
copy(tmp[:], cn) // zero-pad the ciphertext to rate size
t0 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
t1 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[16]))
out0 := simd.bit_xor(t0, z0)
out1 := simd.bit_xor(t1, z1)
intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), out0)
intrinsics.unaligned_store((^simd.u8x16)(&tmp[16]), out1)
copy(xn, tmp[:])
// Re-zero the tail so the absorbed plaintext is zero-padded.
for off := len(xn); off < _RATE_128L; off += 1 {
tmp[off] = 0
}
out0 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) // v0
out1 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[16])) // v1
update_hw_128l(st, out0, out1)
}
// dec_partial_hw_256 decrypts the final partial (< rate) block, the
// AEGIS-256 analogue of dec_partial_hw_128l: decrypt the zero-padded
// ciphertext, emit len(xn) plaintext bytes, re-zero the tail, and
// absorb the zero-padded plaintext.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
tmp: [_RATE_256]byte
defer crypto.zero_explicit(&tmp, size_of(tmp)) // plaintext is confidential
z := z_hw_256(st)
copy(tmp[:], cn) // zero-pad the ciphertext to rate size
cn_ := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
xn_ := simd.bit_xor(cn_, z)
intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), xn_)
copy(xn, tmp[:])
// Re-zero the tail so the absorbed plaintext is zero-padded.
for off := len(xn); off < _RATE_256; off += 1 {
tmp[off] = 0
}
xn_ = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
update_hw_256(st, xn_)
}
// dec_hw decrypts src into dst at the variant's rate, handing the
// trailing partial block to the dec_partial_hw_* helpers.  dst must be
// at least len(src) bytes.
@(private, enable_target_feature = TARGET_FEATURES)
dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
xi, ci, l := dst, src, len(src)
// Decrypt all full rate-sized blocks.
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
dec_hw_128l(st, xi, ci)
xi = xi[_RATE_128L:]
ci = ci[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
dec_hw_256(st, xi, ci)
xi = xi[_RATE_256:]
ci = ci[_RATE_256:]
l -= _RATE_256
}
}
// Process the remainder.
if l > 0 {
switch st.rate {
case _RATE_128L:
dec_partial_hw_128l(st, xi, ci)
case _RATE_256:
dec_partial_hw_256(st, xi, ci)
}
}
}
// finalize_hw computes the authentication tag: the AAD and message bit
// lengths (as two little-endian u64s) are XORed into s2 (128L) or s3
// (256), absorbed for 7 update rounds, and the state is XOR-folded into
// a 16-byte (TAG_SIZE_128) or 32-byte (TAG_SIZE_256) tag.
@(private, enable_target_feature = TARGET_FEATURES)
finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
tmp: [16]byte
// Lengths are in bits, per the AEGIS specification.
endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
t := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
t0, t1: simd.u8x16 = ---, ---
switch st.rate {
case _RATE_128L:
t = simd.bit_xor(st.s2, t)
for _ in 0 ..< 7 {
update_hw_128l(st, t, t)
}
// 128-bit tag folds s0..s6 (^ s7 only for the 256-bit tag).
t0 = simd.bit_xor(st.s0, st.s1)
t0 = simd.bit_xor(t0, st.s2)
t0 = simd.bit_xor(t0, st.s3)
t1 = simd.bit_xor(st.s4, st.s5)
t1 = simd.bit_xor(t1, st.s6)
if len(tag) == TAG_SIZE_256 {
t1 = simd.bit_xor(t1, st.s7)
}
case _RATE_256:
t = simd.bit_xor(st.s3, t)
for _ in 0 ..< 7 {
update_hw_256(st, t)
}
t0 = simd.bit_xor(st.s0, st.s1)
t0 = simd.bit_xor(t0, st.s2)
t1 = simd.bit_xor(st.s3, st.s4)
t1 = simd.bit_xor(t1, st.s5)
}
switch len(tag) {
case TAG_SIZE_128:
// 128-bit tag: fold both halves together.
t0 = simd.bit_xor(t0, t1)
intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0)
case TAG_SIZE_256:
intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0)
intrinsics.unaligned_store((^simd.u8x16)(&tag[16]), t1)
}
}
// reset_state_hw securely erases the AEGIS state (keystream-equivalent
// secret material) via a non-elidable zeroization.
@(private)
reset_state_hw :: proc "contextless" (st: ^State_HW) {
crypto.zero_explicit(st, size_of(st^))
}

View File

@@ -1,4 +1,6 @@
#+build !amd64
#+build !arm64
#+build !arm32
package aegis
@(private = "file")
@@ -7,7 +9,7 @@ ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported"
@(private)
State_HW :: struct {}
// is_hardware_accelerated returns true iff hardware accelerated AEGIS
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return false

View File

@@ -1,389 +0,0 @@
#+build amd64
package aegis
import "base:intrinsics"
import "core:crypto"
import "core:crypto/aes"
import "core:encoding/endian"
import "core:simd/x86"
@(private)
State_HW :: struct {
s0: x86.__m128i,
s1: x86.__m128i,
s2: x86.__m128i,
s3: x86.__m128i,
s4: x86.__m128i,
s5: x86.__m128i,
s6: x86.__m128i,
s7: x86.__m128i,
rate: int,
}
// is_hardware_accelerated returns true iff hardware accelerated AEGIS
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return aes.is_hardware_accelerated()
}
@(private, enable_target_feature = "sse2,aes")
init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
switch ctx._key_len {
case KEY_SIZE_128L:
key := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
iv := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv)))
st.s0 = x86._mm_xor_si128(key, iv)
st.s1 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
st.s3 = st.s1
st.s4 = st.s0
st.s5 = x86._mm_xor_si128(key, st.s2) // key ^ C0
st.s6 = x86._mm_xor_si128(key, st.s1) // key ^ C1
st.s7 = st.s5
st.rate = _RATE_128L
for _ in 0 ..< 10 {
update_hw_128l(st, iv, key)
}
case KEY_SIZE_256:
k0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
k1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16]))
n0 := intrinsics.unaligned_load((^x86.__m128i)(&iv[0]))
n1 := intrinsics.unaligned_load((^x86.__m128i)(&iv[16]))
st.s0 = x86._mm_xor_si128(k0, n0)
st.s1 = x86._mm_xor_si128(k1, n1)
st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
st.s3 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
st.s4 = x86._mm_xor_si128(k0, st.s3) // k0 ^ C0
st.s5 = x86._mm_xor_si128(k1, st.s2) // k1 ^ C1
st.rate = _RATE_256
u0, u1 := st.s0, st.s1
for _ in 0 ..< 4 {
update_hw_256(st, k0)
update_hw_256(st, k1)
update_hw_256(st, u0)
update_hw_256(st, u1)
}
}
}
@(private = "file", enable_target_feature = "sse2,aes")
update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) {
s0_ := x86._mm_aesenc_si128(st.s7, x86._mm_xor_si128(st.s0, m0))
s1_ := x86._mm_aesenc_si128(st.s0, st.s1)
s2_ := x86._mm_aesenc_si128(st.s1, st.s2)
s3_ := x86._mm_aesenc_si128(st.s2, st.s3)
s4_ := x86._mm_aesenc_si128(st.s3, x86._mm_xor_si128(st.s4, m1))
s5_ := x86._mm_aesenc_si128(st.s4, st.s5)
s6_ := x86._mm_aesenc_si128(st.s5, st.s6)
s7_ := x86._mm_aesenc_si128(st.s6, st.s7)
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_
}
@(private = "file", enable_target_feature = "sse2,aes")
update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) {
s0_ := x86._mm_aesenc_si128(st.s5, x86._mm_xor_si128(st.s0, m))
s1_ := x86._mm_aesenc_si128(st.s0, st.s1)
s2_ := x86._mm_aesenc_si128(st.s1, st.s2)
s3_ := x86._mm_aesenc_si128(st.s2, st.s3)
s4_ := x86._mm_aesenc_si128(st.s3, st.s4)
s5_ := x86._mm_aesenc_si128(st.s4, st.s5)
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_
}
@(private = "file", enable_target_feature = "sse2,aes")
absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
t0 := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
t1 := intrinsics.unaligned_load((^x86.__m128i)(&ai[16]))
update_hw_128l(st, t0, t1)
}
@(private = "file", enable_target_feature = "sse2,aes")
absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
m := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
update_hw_256(st, m)
}
@(private, enable_target_feature = "sse2,aes")
absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
ai, l := aad, len(aad)
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
absorb_hw_128l(st, ai)
ai = ai[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
absorb_hw_256(st, ai)
ai = ai[_RATE_256:]
l -= _RATE_256
}
}
// Pad out the remainder with `0`s till it is rate sized.
if l > 0 {
tmp: [_RATE_MAX]byte // AAD is not confidential.
copy(tmp[:], ai)
switch st.rate {
case _RATE_128L:
absorb_hw_128l(st, tmp[:])
case _RATE_256:
absorb_hw_256(st, tmp[:])
}
}
}
@(private = "file", enable_target_feature = "sse2", require_results)
z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) {
z0 := x86._mm_xor_si128(
st.s6,
x86._mm_xor_si128(
st.s1,
x86._mm_and_si128(st.s2, st.s3),
),
)
z1 := x86._mm_xor_si128(
st.s2,
x86._mm_xor_si128(
st.s5,
x86._mm_and_si128(st.s6, st.s7),
),
)
return z0, z1
}
@(private = "file", enable_target_feature = "sse2", require_results)
z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i {
return x86._mm_xor_si128(
st.s1,
x86._mm_xor_si128(
st.s4,
x86._mm_xor_si128(
st.s5,
x86._mm_and_si128(st.s2, st.s3),
),
),
)
}
@(private = "file", enable_target_feature = "sse2,aes")
enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
z0, z1 := z_hw_128l(st)
t0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0]))
t1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16]))
update_hw_128l(st, t0, t1)
out0 := x86._mm_xor_si128(t0, z0)
out1 := x86._mm_xor_si128(t1, z1)
intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), out0)
intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), out1)
}
@(private = "file", enable_target_feature = "sse2,aes")
enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
z := z_hw_256(st)
xi_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi)))
update_hw_256(st, xi_)
ci_ := x86._mm_xor_si128(xi_, z)
intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ci_)
}
@(private, enable_target_feature = "sse2,aes")
enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
ci, xi, l := dst, src, len(src)
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
enc_hw_128l(st, ci, xi)
ci = ci[_RATE_128L:]
xi = xi[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
enc_hw_256(st, ci, xi)
ci = ci[_RATE_256:]
xi = xi[_RATE_256:]
l -= _RATE_256
}
}
// Pad out the remainder with `0`s till it is rate sized.
if l > 0 {
tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
copy(tmp[:], xi)
switch st.rate {
case _RATE_128L:
enc_hw_128l(st, tmp[:], tmp[:])
case _RATE_256:
enc_hw_256(st, tmp[:], tmp[:])
}
copy(ci, tmp[:l])
}
}
@(private = "file", enable_target_feature = "sse2,aes")
dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
z0, z1 := z_hw_128l(st)
t0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0]))
t1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16]))
out0 := x86._mm_xor_si128(t0, z0)
out1 := x86._mm_xor_si128(t1, z1)
update_hw_128l(st, out0, out1)
intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), out0)
intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), out1)
}
@(private = "file", enable_target_feature = "sse2,aes")
dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
z := z_hw_256(st)
ci_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci)))
xi_ := x86._mm_xor_si128(ci_, z)
update_hw_256(st, xi_)
intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), xi_)
}
@(private = "file", enable_target_feature = "sse2,aes")
dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
tmp: [_RATE_128L]byte
defer crypto.zero_explicit(&tmp, size_of(tmp))
z0, z1 := z_hw_128l(st)
copy(tmp[:], cn)
t0 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
t1 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[16]))
out0 := x86._mm_xor_si128(t0, z0)
out1 := x86._mm_xor_si128(t1, z1)
intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), out0)
intrinsics.unaligned_store((^x86.__m128i)(&tmp[16]), out1)
copy(xn, tmp[:])
for off := len(xn); off < _RATE_128L; off += 1 {
tmp[off] = 0
}
out0 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) // v0
out1 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) // v1
update_hw_128l(st, out0, out1)
}
@(private = "file", enable_target_feature = "sse2,aes")
dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
tmp: [_RATE_256]byte
defer crypto.zero_explicit(&tmp, size_of(tmp))
z := z_hw_256(st)
copy(tmp[:], cn)
cn_ := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
xn_ := x86._mm_xor_si128(cn_, z)
intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), xn_)
copy(xn, tmp[:])
for off := len(xn); off < _RATE_256; off += 1 {
tmp[off] = 0
}
xn_ = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
update_hw_256(st, xn_)
}
@(private, enable_target_feature = "sse2,aes")
dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
xi, ci, l := dst, src, len(src)
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
dec_hw_128l(st, xi, ci)
xi = xi[_RATE_128L:]
ci = ci[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
dec_hw_256(st, xi, ci)
xi = xi[_RATE_256:]
ci = ci[_RATE_256:]
l -= _RATE_256
}
}
// Process the remainder.
if l > 0 {
switch st.rate {
case _RATE_128L:
dec_partial_hw_128l(st, xi, ci)
case _RATE_256:
dec_partial_hw_256(st, xi, ci)
}
}
}
@(private, enable_target_feature = "sse2,aes")
finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
tmp: [16]byte
endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
t := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
t0, t1: x86.__m128i = ---, ---
switch st.rate {
case _RATE_128L:
t = x86._mm_xor_si128(st.s2, t)
for _ in 0 ..< 7 {
update_hw_128l(st, t, t)
}
t0 = x86._mm_xor_si128(st.s0, st.s1)
t0 = x86._mm_xor_si128(t0, st.s2)
t0 = x86._mm_xor_si128(t0, st.s3)
t1 = x86._mm_xor_si128(st.s4, st.s5)
t1 = x86._mm_xor_si128(t1, st.s6)
if len(tag) == TAG_SIZE_256 {
t1 = x86._mm_xor_si128(t1, st.s7)
}
case _RATE_256:
t = x86._mm_xor_si128(st.s3, t)
for _ in 0 ..< 7 {
update_hw_256(st, t)
}
t0 = x86._mm_xor_si128(st.s0, st.s1)
t0 = x86._mm_xor_si128(t0, st.s2)
t1 = x86._mm_xor_si128(st.s3, st.s4)
t1 = x86._mm_xor_si128(t1, st.s5)
}
switch len(tag) {
case TAG_SIZE_128:
t0 = x86._mm_xor_si128(t0, t1)
intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0)
case TAG_SIZE_256:
intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0)
intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), t1)
}
}
@(private)
reset_state_hw :: proc "contextless" (st: ^State_HW) {
crypto.zero_explicit(st, size_of(st^))
}

View File

@@ -1,30 +1,32 @@
#+build amd64
#+build amd64,arm32
package aes
import "base:intrinsics"
import "core:crypto/_aes"
import aes_hw "core:crypto/_aes/hw"
import "core:encoding/endian"
import "core:math/bits"
import "core:simd/x86"
import "core:simd"
@(private)
CTR_STRIDE_HW :: 4
@(private)
CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE
@(private, enable_target_feature = "sse2,aes")
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
hw_ctx := ctx._impl.(Context_Impl_Hardware)
sks: [15]x86.__m128i = ---
sks: [15]simd.u8x16 = ---
for i in 0 ..= hw_ctx._num_rounds {
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&hw_ctx._sk_exp_enc[i]))
}
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
ret := x86.__m128i{
i64(intrinsics.byte_swap(hi)),
i64(intrinsics.byte_swap(lo)),
}
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (simd.u8x16, u64, u64) {
buf: [BLOCK_SIZE]byte = ---
endian.unchecked_put_u64be(buf[0:], hi)
endian.unchecked_put_u64be(buf[8:], lo)
ret := intrinsics.unaligned_load((^simd.u8x16)(&buf))
hi, lo := hi, lo
carry: u64
@@ -46,42 +48,42 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
nr_blocks := nr_blocks
ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo
blks: [CTR_STRIDE_HW]x86.__m128i = ---
blks: [CTR_STRIDE_HW]simd.u8x16 = ---
for nr_blocks >= CTR_STRIDE_HW {
#unroll for i in 0..< CTR_STRIDE_HW {
blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
blks[i] = simd.bit_xor(blks[i], sks[0])
}
#unroll for i in 1 ..= 9 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
switch hw_ctx._num_rounds {
case _aes.ROUNDS_128:
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
blks[i] = aes_hw.aesenclast(blks[i], sks[10])
}
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
blks[i] = aes_hw.aesenclast(blks[i], sks[12])
}
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
blks[i] = aes_hw.aesenclast(blks[i], sks[14])
}
}
@@ -98,23 +100,23 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
for nr_blocks > 0 {
blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
blks[0] = simd.bit_xor(blks[0], sks[0])
#unroll for i in 1 ..= 9 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
switch hw_ctx._num_rounds {
case _aes.ROUNDS_128:
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
blks[0] = aes_hw.aesenclast(blks[0], sks[10])
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
blks[0] = aes_hw.aesenclast(blks[0], sks[12])
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
blks[0] = aes_hw.aesenclast(blks[0], sks[14])
}
xor_blocks_hw(dst, src, blks[:1])
@@ -133,18 +135,18 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
zero_explicit(&sks, size_of(sks))
}
@(private, enable_target_feature = "sse2")
xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) {
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
xor_blocks_hw :: proc(dst, src: []byte, blocks: []simd.u8x16) {
#no_bounds_check {
if src != nil {
for i in 0 ..< len(blocks) {
off := i * BLOCK_SIZE
tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:])))
blocks[i] = x86._mm_xor_si128(blocks[i], tmp)
tmp := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[off:])))
blocks[i] = simd.bit_xor(blocks[i], tmp)
}
}
for i in 0 ..< len(blocks) {
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
}
}
}

View File

@@ -0,0 +1,59 @@
#+build amd64,arm32
package aes
import "base:intrinsics"
import "core:crypto/_aes"
import aes_hw "core:crypto/_aes/hw"
import "core:simd"
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))
blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[0])))
#unroll for i in 1 ..= 9 {
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[10])))
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
}
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[12])))
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
}
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[14])))
}
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk)
}
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))
blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[0])))
#unroll for i in 1 ..= 9 {
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[10])))
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
}
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[12])))
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
}
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[14])))
}
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk)
}

View File

@@ -1,58 +0,0 @@
#+build amd64
package aes
import "base:intrinsics"
import "core:crypto/_aes"
import "core:simd/x86"
@(private, enable_target_feature = "sse2,aes")
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0])))
#unroll for i in 1 ..= 9 {
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10])))
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
}
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12])))
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
}
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14])))
}
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
}
@(private, enable_target_feature = "sse2,aes")
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0])))
#unroll for i in 1 ..= 9 {
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10])))
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
}
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12])))
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
}
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14])))
}
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
}

View File

@@ -4,6 +4,7 @@ import "core:bytes"
import "core:crypto"
import "core:crypto/_aes"
import "core:crypto/_aes/ct64"
import aes_hw "core:crypto/_aes/hw"
import "core:encoding/endian"
// GCM_IV_SIZE is the default size of the GCM IV in bytes.
@@ -26,6 +27,10 @@ Context_GCM :: struct {
// init_gcm initializes a Context_GCM with the provided key.
init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
when aes_hw.HAS_GHASH {
impl := aes_hw.is_ghash_supported() ? impl : .Portable
}
init_impl(&ctx._impl, key, impl)
ctx._is_initialized = true
}
@@ -65,7 +70,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
// open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context_GCM, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -1,12 +1,13 @@
#+build amd64
#+build amd64,arm32
package aes
import "base:intrinsics"
import "core:crypto"
import "core:crypto/_aes"
import "core:crypto/_aes/hw_intel"
@(require) import "core:crypto/_aes/ct64"
import aes_hw "core:crypto/_aes/hw"
import "core:encoding/endian"
import "core:simd/x86"
import "core:simd"
@(private)
gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
@@ -17,7 +18,11 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
// Note: Our GHASH implementation handles appending padding.
hw_intel.ghash(s[:], h[:], aad)
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], aad)
} else {
ct64.ghash(s[:], h[:], aad)
}
gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
copy(tag, s[:])
@@ -35,7 +40,11 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag:
s: [_aes.GHASH_TAG_SIZE]byte
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
hw_intel.ghash(s[:], h[:], aad)
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], aad)
} else {
ct64.ghash(s[:], h[:], aad)
}
gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
@@ -71,18 +80,26 @@ init_ghash_hw :: proc(
} else {
// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
hw_intel.ghash(j0[:], h[:], iv)
when aes_hw.HAS_GHASH {
aes_hw.ghash(j0[:], h[:], iv)
} else {
ct64.ghash(j0[:], h[:], iv)
}
tmp: [_aes.GHASH_BLOCK_SIZE]byte
endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
hw_intel.ghash(j0[:], h[:], tmp[:])
when aes_hw.HAS_GHASH {
aes_hw.ghash(j0[:], h[:], tmp[:])
} else {
ct64.ghash(j0[:], h[:], tmp[:])
}
}
// ECB encrypt j0, so that we can just XOR with the tag.
encrypt_block_hw(ctx, j0_enc[:], j0[:])
}
@(private = "file", enable_target_feature = "sse2")
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
final_ghash_hw :: proc(
s: ^[_aes.GHASH_BLOCK_SIZE]byte,
h: ^[_aes.GHASH_KEY_SIZE]byte,
@@ -94,14 +111,18 @@ final_ghash_hw :: proc(
endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
hw_intel.ghash(s[:], h[:], blk[:])
j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
s_vec = x86._mm_xor_si128(s_vec, j0_vec)
intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], blk[:])
} else {
ct64.ghash(s[:], h[:], blk[:])
}
j0_vec := intrinsics.unaligned_load((^simd.u8x16)(j0))
s_vec := intrinsics.unaligned_load((^simd.u8x16)(s))
s_vec = simd.bit_xor(s_vec, j0_vec)
intrinsics.unaligned_store((^simd.u8x16)(s), s_vec)
}
@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
gctr_hw :: proc(
ctx: ^Context_Impl_Hardware,
dst: []byte,
@@ -111,13 +132,13 @@ gctr_hw :: proc(
iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
is_seal: bool,
) #no_bounds_check {
sks: [15]x86.__m128i = ---
sks: [15]simd.u8x16 = ---
for i in 0 ..= ctx._num_rounds {
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))
}
// Setup the counter block
ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv))
ctr_blk := intrinsics.unaligned_load((^simd.u8x16)(iv))
ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1
src, dst := src, dst
@@ -127,11 +148,15 @@ gctr_hw :: proc(
// This results in an unreadable mess, so we opt for simplicity
// as performance is adequate.
blks: [CTR_STRIDE_HW]x86.__m128i = ---
blks: [CTR_STRIDE_HW]simd.u8x16 = ---
nr_blocks := len(src) / BLOCK_SIZE
for nr_blocks >= CTR_STRIDE_HW {
if !is_seal {
hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
} else {
ct64.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
@@ -139,42 +164,46 @@ gctr_hw :: proc(
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
blks[i] = simd.bit_xor(blks[i], sks[0])
}
#unroll for i in 1 ..= 9 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
blks[i] = aes_hw.aesenclast(blks[i], sks[10])
}
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
blks[i] = aes_hw.aesenclast(blks[i], sks[12])
}
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
blks[i] = aes_hw.aesenclast(blks[i], sks[14])
}
}
xor_blocks_hw(dst, src, blks[:])
if is_seal {
hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
} else {
ct64.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
}
}
src = src[CTR_STRIDE_BYTES_HW:]
@@ -186,28 +215,32 @@ gctr_hw :: proc(
for n := len(src); n > 0; {
l := min(n, BLOCK_SIZE)
if !is_seal {
hw_intel.ghash(s[:], h[:], src[:l])
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], src[:l])
} else {
ct64.ghash(s[:], h[:], src[:l])
}
}
blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
blks[0] = simd.bit_xor(blks[0], sks[0])
#unroll for i in 1 ..= 9 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
blks[0] = aes_hw.aesenclast(blks[0], sks[10])
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
blks[0] = aes_hw.aesenclast(blks[0], sks[12])
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
blks[0] = aes_hw.aesenclast(blks[0], sks[14])
}
if l == BLOCK_SIZE {
@@ -219,7 +252,11 @@ gctr_hw :: proc(
copy(dst, blk[:l])
}
if is_seal {
hw_intel.ghash(s[:], h[:], dst[:l])
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], dst[:l])
} else {
ct64.ghash(s[:], h[:], dst[:l])
}
}
dst = dst[l:]
@@ -235,8 +272,17 @@ gctr_hw :: proc(
// the compiler.
//
// src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity`
@(private = "file", enable_target_feature = "sse4.1")
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^simd.u8x16, ctr: u32) -> (simd.u8x16, u32) {
when ODIN_ENDIAN == .Little {
ctr_be := intrinsics.byte_swap(ctr)
} else {
ctr_be := ctr
}
ret := transmute(simd.u8x16)(
simd.replace(transmute(simd.u32x4)(src^), 3, ctr_be)
)
return ret, ctr + 1
}

View File

@@ -0,0 +1,18 @@
#+build amd64,arm32
package aes
import aes_hw "core:crypto/_aes/hw"
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return aes_hw.is_supported()
}
@(private)
Context_Impl_Hardware :: aes_hw.Context
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
aes_hw.init(ctx, key)
}

View File

@@ -1,10 +1,12 @@
#+build !amd64
#+build !arm64
#+build !arm32
package aes
@(private = "file")
ERR_HW_NOT_SUPPORTED :: "crypto/aes: hardware implementation unsupported"
// is_hardware_accelerated returns true iff hardware accelerated AES
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return false

View File

@@ -1,18 +0,0 @@
#+build amd64
package aes
import "core:crypto/_aes/hw_intel"
// is_hardware_accelerated returns true iff hardware accelerated AES
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return hw_intel.is_supported()
}
@(private)
Context_Impl_Hardware :: hw_intel.Context
@(private, enable_target_feature = "sse2,aes")
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
hw_intel.init(ctx, key)
}

View File

@@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_blake2.final(ctx, hash, finalize_clone)

View File

@@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_blake2.final(ctx, hash, finalize_clone)

View File

@@ -136,7 +136,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -8,15 +8,15 @@ import subtle "core:crypto/_subtle"
// Omit large precomputed tables, trading off performance for size.
COMPACT_IMPLS: bool : #config(ODIN_CRYPTO_COMPACT, false)
// HAS_RAND_BYTES is true iff the runtime provides a cryptographic
// HAS_RAND_BYTES is true if and only if (⟺) the runtime provides a cryptographic
// entropy source.
HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES
// compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
// compare_constant_time returns 1 if and only if (⟺) a and b are equal, 0 otherwise.
//
// The execution time of this routine is constant regardless of the contents
// of the slices being compared, as long as the length of the slices is equal.
// If the length of the two slices is different, it will early-return 0.
// If the length of the two slices is different, it will early-return 0.
compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
// If the length of the slices is different, early return.
//
@@ -31,7 +31,7 @@ compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n)
}
// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by
// compare_byte_ptrs_constant_time returns 1 if and only if (⟺) the bytes pointed to by
// a and b are equal, 0 otherwise.
//
// The execution time of this routine is constant regardless of the
@@ -46,12 +46,12 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i
v |= x[i] ~ y[i]
}
// After the loop, v == 0 iff a == b. The subtraction will underflow
// iff v == 0, setting the sign-bit, which gets returned.
// After the loop, v == 0 if and only if (⟺) a == b. The subtraction will underflow
// if and only if (⟺) v == 0, setting the sign-bit, which gets returned.
return subtle.eq(0, v)
}
// is_zero_constant_time returns 1 iff b is all 0s, 0 otherwise.
// is_zero_constant_time returns 1 if and only if (⟺) b is all 0s, 0 otherwise.
is_zero_constant_time :: proc "contextless" (b: []byte) -> int {
v: byte
for b_ in b {

View File

@@ -122,7 +122,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -1,152 +1,183 @@
#+build amd64
#+build amd64,arm32
package deoxysii
import "base:intrinsics"
import "core:crypto"
import "core:crypto/aes"
import aes_hw "core:crypto/_aes/hw"
import "core:simd"
import "core:simd/x86"
// This processes a maximum of 4 blocks at a time, as that is suitable
// for most current hardware that doesn't say "Xeon".
//
// TODO/perf: ARM should be able to do 8 at a time.
when ODIN_ARCH == .amd64 {
@(private="file")
TARGET_FEATURES :: "sse2,ssse3,aes"
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
@(private="file")
TARGET_FEATURES :: "neon,aes"
}
@(private = "file")
_BIT_ENC :: x86.__m128i{0x80, 0}
_BIT_ENC :: simd.u8x16{0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
@(private = "file")
_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0}
_PREFIX_AD_BLOCK :: simd.u8x16{
PREFIX_AD_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
@(private = "file")
_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0}
_PREFIX_AD_FINAL :: simd.u8x16{
PREFIX_AD_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
@(private = "file")
_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0}
_PREFIX_MSG_BLOCK :: simd.u8x16{
PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
@(private = "file")
_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0}
_PREFIX_MSG_FINAL :: simd.u8x16{
PREFIX_MSG_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return aes.is_hardware_accelerated()
return aes_hw.is_supported()
}
@(private = "file", enable_target_feature = "sse4.1", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
auth_tweak :: #force_inline proc "contextless" (
prefix: x86.__m128i,
prefix: simd.u8x16,
block_nr: int,
) -> x86.__m128i {
return x86._mm_insert_epi64(prefix, i64(intrinsics.byte_swap(u64(block_nr))), 1)
}
) -> simd.u8x16 {
when ODIN_ENDIAN == .Little {
block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
} else {
block_nr_u64 := u64(block_nr)
}
@(private = "file", enable_target_feature = "sse2", require_results)
enc_tweak :: #force_inline proc "contextless" (
tag: x86.__m128i,
block_nr: int,
) -> x86.__m128i {
return x86._mm_xor_si128(
x86._mm_or_si128(tag, _BIT_ENC),
x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))},
return simd.bit_or(
prefix,
transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
)
}
@(private = "file", enable_target_feature = "ssse3", require_results)
h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i {
return transmute(x86.__m128i)h(transmute(simd.u8x16)tk1)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
enc_tweak :: #force_inline proc "contextless" (
tag: simd.u8x16,
block_nr: int,
) -> simd.u8x16 {
when ODIN_ENDIAN == .Little {
block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
} else {
block_nr_u64 := u64(block_nr)
}
return simd.bit_xor(
simd.bit_or(tag, _BIT_ENC),
transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
)
}
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_x4 :: #force_inline proc "contextless" (
ctx: ^Context,
s_0, s_1, s_2, s_3: x86.__m128i,
tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i,
) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check {
s_0, s_1, s_2, s_3: simd.u8x16,
tweak_0, tweak_1, tweak_2, tweak_3: simd.u8x16,
) -> (simd.u8x16, simd.u8x16, simd.u8x16, simd.u8x16) #no_bounds_check {
s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3
tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3
sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
stk_0 := x86._mm_xor_si128(tk1_0, sk)
stk_1 := x86._mm_xor_si128(tk1_1, sk)
stk_2 := x86._mm_xor_si128(tk1_2, sk)
stk_3 := x86._mm_xor_si128(tk1_3, sk)
sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
stk_0 := simd.bit_xor(tk1_0, sk)
stk_1 := simd.bit_xor(tk1_1, sk)
stk_2 := simd.bit_xor(tk1_2, sk)
stk_3 := simd.bit_xor(tk1_3, sk)
s_0 = x86._mm_xor_si128(s_0, stk_0)
s_1 = x86._mm_xor_si128(s_1, stk_1)
s_2 = x86._mm_xor_si128(s_2, stk_2)
s_3 = x86._mm_xor_si128(s_3, stk_3)
s_0 = simd.bit_xor(s_0, stk_0)
s_1 = simd.bit_xor(s_1, stk_1)
s_2 = simd.bit_xor(s_2, stk_2)
s_3 = simd.bit_xor(s_3, stk_3)
for i in 1 ..= BC_ROUNDS {
sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
tk1_0 = h_(tk1_0)
tk1_1 = h_(tk1_1)
tk1_2 = h_(tk1_2)
tk1_3 = h_(tk1_3)
tk1_0 = h(tk1_0)
tk1_1 = h(tk1_1)
tk1_2 = h(tk1_2)
tk1_3 = h(tk1_3)
stk_0 = x86._mm_xor_si128(tk1_0, sk)
stk_1 = x86._mm_xor_si128(tk1_1, sk)
stk_2 = x86._mm_xor_si128(tk1_2, sk)
stk_3 = x86._mm_xor_si128(tk1_3, sk)
stk_0 = simd.bit_xor(tk1_0, sk)
stk_1 = simd.bit_xor(tk1_1, sk)
stk_2 = simd.bit_xor(tk1_2, sk)
stk_3 = simd.bit_xor(tk1_3, sk)
s_0 = x86._mm_aesenc_si128(s_0, stk_0)
s_1 = x86._mm_aesenc_si128(s_1, stk_1)
s_2 = x86._mm_aesenc_si128(s_2, stk_2)
s_3 = x86._mm_aesenc_si128(s_3, stk_3)
s_0 = aes_hw.aesenc(s_0, stk_0)
s_1 = aes_hw.aesenc(s_1, stk_1)
s_2 = aes_hw.aesenc(s_2, stk_2)
s_3 = aes_hw.aesenc(s_3, stk_3)
}
return s_0, s_1, s_2, s_3
}
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_x1 :: #force_inline proc "contextless" (
ctx: ^Context,
s: x86.__m128i,
tweak: x86.__m128i,
) -> x86.__m128i #no_bounds_check {
s: simd.u8x16,
tweak: simd.u8x16,
) -> simd.u8x16 #no_bounds_check {
s, tk1 := s, tweak
sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
stk := x86._mm_xor_si128(tk1, sk)
sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
stk := simd.bit_xor(tk1, sk)
s = x86._mm_xor_si128(s, stk)
s = simd.bit_xor(s, stk)
for i in 1 ..= BC_ROUNDS {
sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
tk1 = h_(tk1)
tk1 = h(tk1)
stk = x86._mm_xor_si128(tk1, sk)
stk = simd.bit_xor(tk1, sk)
s = x86._mm_aesenc_si128(s, stk)
s = aes_hw.aesenc(s, stk)
}
return s
}
@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_absorb :: proc "contextless" (
ctx: ^Context,
tag: x86.__m128i,
tag: simd.u8x16,
src: []byte,
tweak_prefix: x86.__m128i,
tweak_prefix: simd.u8x16,
stk_block_nr: int,
) -> (x86.__m128i, int) #no_bounds_check {
) -> (simd.u8x16, int) #no_bounds_check {
src, stk_block_nr, tag := src, stk_block_nr, tag
nr_blocks := len(src) / BLOCK_SIZE
for nr_blocks >= 4 {
d_0, d_1, d_2, d_3 := bc_x4(
ctx,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
auth_tweak(tweak_prefix, stk_block_nr),
auth_tweak(tweak_prefix, stk_block_nr + 1),
auth_tweak(tweak_prefix, stk_block_nr + 2),
auth_tweak(tweak_prefix, stk_block_nr + 3),
)
tag = x86._mm_xor_si128(tag, d_0)
tag = x86._mm_xor_si128(tag, d_1)
tag = x86._mm_xor_si128(tag, d_2)
tag = x86._mm_xor_si128(tag, d_3)
tag = simd.bit_xor(tag, d_0)
tag = simd.bit_xor(tag, d_1)
tag = simd.bit_xor(tag, d_2)
tag = simd.bit_xor(tag, d_3)
src = src[4*BLOCK_SIZE:]
stk_block_nr += 4
@@ -156,11 +187,11 @@ bc_absorb :: proc "contextless" (
for nr_blocks > 0 {
d := bc_x1(
ctx,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
auth_tweak(tweak_prefix, stk_block_nr),
)
tag = x86._mm_xor_si128(tag, d)
tag = simd.bit_xor(tag, d)
src = src[BLOCK_SIZE:]
stk_block_nr += 1
@@ -170,29 +201,29 @@ bc_absorb :: proc "contextless" (
return tag, stk_block_nr
}
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_final :: proc "contextless" (
ctx: ^Context,
tag: x86.__m128i,
tag: simd.u8x16,
iv: []byte,
) -> x86.__m128i {
) -> simd.u8x16 {
tmp: [BLOCK_SIZE]byte
tmp[0] = PREFIX_TAG << PREFIX_SHIFT
copy(tmp[1:], iv)
tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp))
tweak := intrinsics.unaligned_load((^simd.u8x16)(&tmp))
return bc_x1(ctx, tag, tweak)
}
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_encrypt :: proc "contextless" (
ctx: ^Context,
dst: []byte,
src: []byte,
iv: x86.__m128i,
tweak_tag: x86.__m128i,
iv: simd.u8x16,
tweak_tag: simd.u8x16,
stk_block_nr: int,
) -> int {
dst, src, stk_block_nr := dst, src, stk_block_nr
@@ -209,31 +240,31 @@ bc_encrypt :: proc "contextless" (
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst)),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst)),
simd.bit_xor(
d_0,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
),
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst[BLOCK_SIZE:])),
simd.bit_xor(
d_1,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
),
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst[2*BLOCK_SIZE:])),
simd.bit_xor(
d_2,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
),
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst[3*BLOCK_SIZE:])),
simd.bit_xor(
d_3,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
),
)
@@ -250,10 +281,10 @@ bc_encrypt :: proc "contextless" (
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst)),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst)),
simd.bit_xor(
d,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
),
)
@@ -269,7 +300,7 @@ bc_encrypt :: proc "contextless" (
e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
tmp: [BLOCK_SIZE]byte
copy(tmp[1:], iv)
iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))
// Algorithm 3
//
@@ -282,7 +313,7 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
// if A_ != nil then
// Auth <- Auth ^ EK(0110 || la, pad10(A_))
// end
auth: x86.__m128i
auth: simd.u8x16
n: int
aad := aad
@@ -341,14 +372,14 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
copy(dst[n*BLOCK_SIZE:], m_star[:])
}
intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth)
intrinsics.unaligned_store((^simd.u8x16)(raw_data(tag)), auth)
}
@(private, require_results)
d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
tmp: [BLOCK_SIZE]byte
copy(tmp[1:], iv)
iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))
// Algorithm 4
//
@@ -360,7 +391,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
// if C_ != nil then
// M_ <- C_ ^ EK(1 || tag ^ l, 0^8 || N)
// end
auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag)))
auth := intrinsics.unaligned_load((^simd.u8x16)(raw_data(tag)))
m := ciphertext
n := bc_encrypt(ctx, dst, m, iv_, auth, 0)
@@ -385,7 +416,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
// if A != nil then
// Auth <- Auth ^ EK(0110 || l_a, pad10(A_))
// end
auth = x86.__m128i{0, 0}
auth = simd.u8x16{}
aad := aad
auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
aad = aad[BLOCK_SIZE*n:]
@@ -424,7 +455,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
// Tag verification
// if tag0 = tag then return (M_1 || ... || M_l || M_)
// else return false
intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth)
intrinsics.unaligned_store((^simd.u8x16)(raw_data(&tmp)), auth)
ok := crypto.compare_constant_time(tmp[:], tag) == 1
crypto.zero_explicit(&tmp, size_of(tmp))

View File

@@ -1,10 +1,12 @@
#+build !amd64
#+build !arm64
#+build !arm32
package deoxysii
@(private = "file")
ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported"
// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return false

View File

@@ -104,7 +104,7 @@ Public_Key :: struct {
}
// private_key_generate uses the system entropy source to generate a new
// Private_Key. This will only fail iff the system entropy source is
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
// missing or broken.
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
private_key_clear(priv_key)
@@ -142,7 +142,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
}
// private_key_set_bytes decodes a byte-encoded private key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
private_key_clear(priv_key)
@@ -245,7 +245,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
}
}
// private_key_equal returns true iff the private keys are equal,
// private_key_equal returns true if and only if (⟺) the private keys are equal,
// in constant time.
private_key_equal :: proc(p, q: ^Private_Key) -> bool {
if p._curve != q._curve {
@@ -276,7 +276,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
}
// public_key_set_bytes decodes a byte-encoded public key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
public_key_clear(pub_key)
@@ -365,7 +365,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
}
}
// public_key_equal returns true iff the public keys are equal,
// public_key_equal returns true if and only if (⟺) the public keys are equal,
// in constant time.
public_key_equal :: proc(p, q: ^Public_Key) -> bool {
if p._curve != q._curve {

View File

@@ -79,7 +79,7 @@ Public_Key :: struct {
}
// private_key_generate uses the system entropy source to generate a new
// Private_Key. This will only fail iff the system entropy source is
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
// missing or broken.
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
private_key_clear(priv_key)
@@ -111,7 +111,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
}
// private_key_set_bytes decodes a byte-encoded private key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
private_key_clear(priv_key)
@@ -194,7 +194,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
}
}
// private_key_equal returns true iff the private keys are equal,
// private_key_equal returns true if and only if (⟺) the private keys are equal,
// in constant time.
private_key_equal :: proc(p, q: ^Private_Key) -> bool {
if p._curve != q._curve {
@@ -219,7 +219,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
}
// public_key_set_bytes decodes a byte-encoded public key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
public_key_clear(pub_key)
@@ -296,7 +296,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
}
}
// public_key_equal returns true iff the public keys are equal,
// public_key_equal returns true if and only if (⟺) the public keys are equal,
// in constant time.
public_key_equal :: proc(p, q: ^Public_Key) -> bool {
if p._curve != q._curve {

View File

@@ -141,7 +141,7 @@ parse_asn1_sig :: proc(sig: []byte) -> (r, s: []byte, ok: bool) {
return nil, nil, false
}
// DER requires a leading 0 iff the sign bit of the leading byte
// DER requires a leading 0 if and only if (⟺) the sign bit of the leading byte
// is set to distinguish between positive and negative integers,
// and the minimal length representation. `r` and `s` are always
// going to be unsigned, so we validate malformed DER and strip

View File

@@ -3,7 +3,7 @@ package ecdsa
import "core:crypto/hash"
import secec "core:crypto/_weierstrass"
// verify_raw returns true iff sig is a valid signature by pub_key over
// verify_raw returns true if and only if (⟺) sig is a valid signature by pub_key over
// msg, hashed using hash_algo, per the verification procedure specified
// in SEC 1, Version 2.0, Section 4.1.4.
//
@@ -33,7 +33,7 @@ verify_raw :: proc(pub_key: ^Public_Key, hash_algo: hash.Algorithm, msg, sig: []
panic("crypto/ecdsa: invalid curve")
}
// verify_asn1 returns true iff sig is a valid signature by pub_key over
// verify_asn1 returns true if and only if (⟺) sig is a valid signature by pub_key over
// msg, hashed using hash_algo, per the verification procedure specified
// in SEC 1, Version 2.0, Section 4.1.4.
//

View File

@@ -48,7 +48,7 @@ Public_Key :: struct {
}
// private_key_generate uses the system entropy source to generate a new
// Private_Key. This will only fail iff the system entropy source is
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
// missing or broken.
private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
private_key_clear(priv_key)
@@ -67,7 +67,7 @@ private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
}
// private_key_set_bytes decodes a byte-encoded private key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {
if len(b) != PRIVATE_KEY_SIZE {
return false
@@ -167,7 +167,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
}
// public_key_set_bytes decodes a byte-encoded public key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> bool {
if len(b) != PUBLIC_KEY_SIZE {
return false
@@ -205,14 +205,14 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
copy(dst, pub_key._b[:])
}
// public_key_equal returns true iff pub_key is equal to other.
// public_key_equal returns true if and only if (⟺) pub_key is equal to other.
public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool {
ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key")
return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1
}
// verify returns true iff sig is a valid signature by pub_key over msg.
// verify returns true if and only if (⟺) sig is a valid signature by pub_key over msg.
//
// The optional `allow_small_order_A` parameter will make this
// implementation strictly compatible with FIPS 186-5, at the expense of

View File

@@ -235,7 +235,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
switch &impl in ctx._impl {

View File

@@ -21,7 +21,7 @@ sum :: proc(algorithm: hash.Algorithm, dst, msg, key: []byte) {
}
// verify will verify the HMAC tag computed with the specified algorithm
// and key over msg and return true iff the tag is valid. It requires
// and key over msg and return true if and only if (⟺) the tag is valid. It requires
// that the tag is correctly sized.
verify :: proc(algorithm: hash.Algorithm, tag, msg, key: []byte) -> bool {
tag_buf: [hash.MAX_DIGEST_SIZE]byte

View File

@@ -32,7 +32,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) {
}
// verify will verify the KMAC tag computed with the specified security
// strength, key and domain separator over msg and return true iff the
// strength, key and domain separator over msg and return true if and only if (⟺) the
// tag is valid.
verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool {
derived_tag := make([]byte, len(tag), allocator)

View File

@@ -77,7 +77,7 @@ update :: proc "contextless" (ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)

View File

@@ -69,7 +69,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
ensure(ctx.is_initialized)

View File

@@ -76,7 +76,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
ensure(ctx.is_initialized)

View File

@@ -66,7 +66,7 @@ derive :: proc(
dst_blk = dst_blk[h_len:]
}
// Instead of rounding l up, just proceass the one extra block iff
// Instead of rounding l up, just process the one extra block if and only if (⟺)
// r != 0.
if r > 0 {
tmp: [hash.MAX_DIGEST_SIZE]byte

View File

@@ -33,7 +33,7 @@ sum :: proc(dst, msg, key: []byte) {
}
// verify will verify the Poly1305 tag computed with the key over msg and
// return true iff the tag is valid. It requires that the tag is correctly
// return true if and only if (⟺) the tag is valid. It requires that the tag is correctly
// sized.
verify :: proc(tag, msg, key: []byte) -> bool {
ctx: Context = ---

View File

@@ -360,7 +360,7 @@ ge_double_scalarmult_generator_vartime :: proc(
ge._is_initialized = true
}
// ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`.
// ge_cond_negate sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = -a` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl is undefined.
ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
_ge_ensure_initialized([]^Group_Element{a})
@@ -369,7 +369,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
ge._is_initialized = true
}
// ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`.
// ge_cond_assign sets `ge = ge` if and only if (⟺) `ctrl == 0` and `ge = a` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl is undefined.
ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
_ge_ensure_initialized([]^Group_Element{ge, a})
@@ -377,7 +377,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
grp.ge_cond_assign(&ge._p, &a._p, ctrl)
}
// ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`.
// ge_cond_select sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = b` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl is undefined.
ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
_ge_ensure_initialized([]^Group_Element{a, b})
@@ -386,7 +386,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
ge._is_initialized = true
}
// ge_equal returns 1 iff `a == b`, and 0 otherwise.
// ge_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
@(require_results)
ge_equal :: proc(a, b: ^Group_Element) -> int {
_ge_ensure_initialized([]^Group_Element{a, b})
@@ -405,7 +405,7 @@ ge_equal :: proc(a, b: ^Group_Element) -> int {
return ret
}
// ge_is_identity returns 1 iff `ge` is the identity element, and 0 otherwise.
// ge_is_identity returns 1 if and only if (⟺) `ge` is the identity element, and 0 otherwise.
@(require_results)
ge_is_identity :: proc(ge: ^Group_Element) -> int {
return ge_equal(ge, &GE_IDENTITY)

View File

@@ -80,13 +80,13 @@ sc_square :: proc "contextless" (sc, a: ^Scalar) {
grp.sc_square(sc, a)
}
// sc_cond_assign sets `sc = sc` iff `ctrl == 0` and `sc = a` iff `ctrl == 1`.
// sc_cond_assign sets `sc = sc` if and only if (⟺) `ctrl == 0` and `sc = a` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl is undefined.
sc_cond_assign :: proc(sc, a: ^Scalar, ctrl: int) {
grp.sc_cond_assign(sc, a, ctrl)
}
// sc_equal returns 1 iff `a == b`, and 0 otherwise.
// sc_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
@(require_results)
sc_equal :: proc(a, b: ^Scalar) -> int {
return grp.sc_equal(a, b)

View File

@@ -191,7 +191,7 @@ update :: proc(ctx: ^$T, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
ensure(ctx.is_initialized)

View File

@@ -0,0 +1,224 @@
#+build arm64,arm32
package sha2
// Based on the public domain code by Jeffrey Walton, though
// realistically, there only is one sensible way to write this.
//
// See: https://github.com/noloader/SHA-Intrinsics
import "base:intrinsics"
import "core:simd"
import "core:simd/arm"
import "core:sys/info"
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware
// accelerated SHA-224/SHA-256 is supported.
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
// Both ASIMD (NEON) and the SHA-256 crypto extension must be reported by
// the CPU for sha256_transf_hw to be usable.
req_features :: info.CPU_Features{
.asimd,
.sha256,
}
return info.cpu_features() >= req_features
}
// SHA-256 round constants (FIPS 180-4, Section 4.2.2), packed four per
// vector so each K_i can be added to one 4-lane message-schedule block
// in sha256_transf_hw.
@(private = "file")
K_0 :: simd.u32x4{0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5}
@(private = "file")
K_1 :: simd.u32x4{0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5}
@(private = "file")
K_2 :: simd.u32x4{0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3}
@(private = "file")
K_3 :: simd.u32x4{0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174}
@(private = "file")
K_4 :: simd.u32x4{0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC}
@(private = "file")
K_5 :: simd.u32x4{0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA}
@(private = "file")
K_6 :: simd.u32x4{0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7}
@(private = "file")
K_7 :: simd.u32x4{0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967}
@(private = "file")
K_8 :: simd.u32x4{0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13}
@(private = "file")
K_9 :: simd.u32x4{0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85}
@(private = "file")
K_10 :: simd.u32x4{0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3}
@(private = "file")
K_11 :: simd.u32x4{0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070}
@(private = "file")
K_12 :: simd.u32x4{0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5}
@(private = "file")
K_13 :: simd.u32x4{0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3}
@(private = "file")
K_14 :: simd.u32x4{0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208}
@(private = "file")
K_15 :: simd.u32x4{0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2}
// sha256_transf_hw consumes as many full 64-byte (BLOCK_SIZE_256) blocks
// from data as are available, updating the hash state in ctx.h in place,
// using the ARMv8 SHA-256 crypto instructions: vsha256hq/vsha256h2q for
// the compression rounds and vsha256su0q/vsha256su1q for the message
// schedule expansion.
//
// Any trailing partial block (< 64 bytes) is left unprocessed; the caller
// is expected to buffer it.
//
// state_0 appears to hold the ABEF lanes and state_1 the CDGH lanes of
// the working state (see the abef_save/cdgh_save names below).
@(private, enable_target_feature = "neon,sha2")
sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
state_0 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[0]))
state_1 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[4]))
data := data
for len(data) >= BLOCK_SIZE_256 {
// Save state for the feed-forward addition after the 64 rounds.
abef_save, cdgh_save := state_0, state_1
// Load message
msg_0 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data)))
msg_1 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[16:])))
msg_2 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[32:])))
msg_3 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[48:])))
// Reverse for little endian (the message words are big-endian).
when ODIN_ENDIAN == .Little {
msg_0 = byteswap_u32x4(msg_0)
msg_1 = byteswap_u32x4(msg_1)
msg_2 = byteswap_u32x4(msg_2)
msg_3 = byteswap_u32x4(msg_3)
}
tmp_0 := simd.add(msg_0, K_0)
// Rounds 0-3
// In each 4-round group below, tmp_2 preserves state_0 from before
// the vsha256hq update, for use as an input to vsha256h2q.
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
tmp_2 := state_0
tmp_1 := simd.add(msg_1, K_1)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
// Rounds 4-7
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
tmp_2 = state_0
tmp_0 = simd.add(msg_2, K_2)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
// Rounds 8-11
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
tmp_2 = state_0
tmp_1 = simd.add(msg_3, K_3)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
// Rounds 12-15
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
tmp_2 = state_0
tmp_0 = simd.add(msg_0, K_4)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
// Rounds 16-19
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
tmp_2 = state_0
tmp_1 = simd.add(msg_1, K_5)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
// Rounds 20-23
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
tmp_2 = state_0
tmp_0 = simd.add(msg_2, K_6)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
// Rounds 24-27
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
tmp_2 = state_0
tmp_1 = simd.add(msg_3, K_7)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
// Rounds 28-31
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
tmp_2 = state_0
tmp_0 = simd.add(msg_0, K_8)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
// Rounds 32-35
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
tmp_2 = state_0
tmp_1 = simd.add(msg_1, K_9)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
// Rounds 36-39
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
tmp_2 = state_0
tmp_0 = simd.add(msg_2, K_10)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
// Rounds 40-43
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
tmp_2 = state_0
tmp_1 = simd.add(msg_3, K_11)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
// Rounds 44-47
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
tmp_2 = state_0
tmp_0 = simd.add(msg_0, K_12)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
// Rounds 48-51 (no further schedule expansion needed from here on).
tmp_2 = state_0
tmp_1 = simd.add(msg_1, K_13)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
// Rounds 52-55
tmp_2 = state_0
tmp_0 = simd.add(msg_2, K_14)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
// Rounds 56-59
tmp_2 = state_0
tmp_1 = simd.add(msg_3, K_15)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
// Rounds 60-63
tmp_2 = state_0
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
// Combine state (Merkle-Damgard feed-forward with the saved state).
state_0 = simd.add(state_0, abef_save)
state_1 = simd.add(state_1, cdgh_save)
data = data[BLOCK_SIZE_256:]
}
intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[0]), state_0)
intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[4]), state_1)
}
when ODIN_ENDIAN == .Little {
// byteswap_u32x4 reverses the byte order within each 32-bit lane of a,
// via a byte-level shuffle; used to convert the big-endian message words
// to the host's little-endian layout.
@(private = "file", enable_target_feature = "neon")
byteswap_u32x4 :: #force_inline proc "contextless" (a: simd.u32x4) -> simd.u32x4 {
return transmute(simd.u32x4)(
simd.shuffle(
transmute(simd.u8x16)(a),
transmute(simd.u8x16)(a),
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
)
)
}
}

View File

@@ -49,7 +49,7 @@ K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}
// is_hardware_accelerated_256 returns true iff hardware accelerated
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated
// SHA-224/SHA-256 is supported.
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
req_features :: info.CPU_Features{

View File

@@ -1,10 +1,12 @@
#+build !amd64
#+build !arm64
#+build !arm32
package sha2
@(private = "file")
ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
// is_hardware_accelerated_256 returns true iff hardware accelerated
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated
// SHA-224/SHA-256 is supported.
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
return false

View File

@@ -79,7 +79,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)

View File

@@ -80,7 +80,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
ensure(ctx.is_initialized)

View File

@@ -31,7 +31,7 @@ write_element :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_sha3.final_cshake((^_sha3.Context)(ctx), hash, finalize_clone)

View File

@@ -436,7 +436,7 @@ copy_buffer :: proc(dst: Writer, src: Reader, buf: []byte) -> (written: i64, err
// copy_n copies n bytes (or till an error) from src to dst.
// It returns the number of bytes copied and the first error that occurred whilst copying, if any.
// On return, written == n IFF err == nil
// On return, written == n if and only if (⟺) err == nil
copy_n :: proc(dst: Writer, src: Reader, n: i64) -> (written: i64, err: Error) {
nsrc := limited_reader_init(&Limited_Reader{}, src, n)
written, err = copy(dst, nsrc)

View File

@@ -101,7 +101,7 @@ internal_int_power_modulo :: proc(res, G, X, P: ^Int, allocator := context.alloc
If the modulus is odd or dr != 0 use the montgomery method.
*/
if internal_int_is_odd(P) || dr != 0 {
return _private_int_exponent_mod(res, G, X, P, dr)
return _private_int_exponent_mod_fast(res, G, X, P, dr)
}
/*

View File

@@ -439,8 +439,14 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
return _private_int_mul_high_comba(dest, a, b, digits)
}
internal_grow(dest, a.used + b.used + 1) or_return
dest.used = a.used + b.used + 1
/*
Set up temporary output `Int`, which we'll swap for `dest` when done.
*/
t := &Int{}
internal_grow(t, a.used + b.used + 1) or_return
t.used = a.used + b.used + 1
pa := a.used
pb := b.used
@@ -451,20 +457,23 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
/*
Calculate the double precision result.
*/
r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
/*
Get the lower part.
*/
dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
t.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
/*
Carry the carry.
*/
carry = DIGIT(r >> _WORD(_DIGIT_BITS))
}
dest.digit[ix + pb] = carry
t.digit[ix + pb] = carry
}
internal_swap(dest, t)
internal_destroy(t)
return internal_clamp(dest)
}

View File

@@ -141,9 +141,9 @@ arena_alloc_unguarded :: proc(arena: ^Arena, size: uint, alignment: uint, loc :=
needed := mem.align_forward_uint(size, alignment)
needed = max(needed, arena.default_commit_size)
block_size := max(needed, arena.minimum_block_size)
block_size := max(needed, arena.minimum_block_size) + alignment
new_block := memory_block_alloc(needed, block_size, alignment, {}) or_return
new_block := memory_block_alloc(needed, block_size) or_return
new_block.prev = arena.curr_block
arena.curr_block = new_block
arena.total_reserved += new_block.reserved

View File

@@ -154,7 +154,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint,
pmblock.committed = platform_total_commit
block.committed = pmblock.committed - base_offset
assert(block.committed <= block.reserved)
}
return
}
@@ -174,7 +174,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint,
err = .Out_Of_Memory
return
}
assert(block.committed <= block.reserved)
do_commit_if_necessary(block, size, default_commit_size) or_return
data = block.base[block.used+alignment_offset:][:min_size]

View File

@@ -804,7 +804,7 @@ send_exec :: proc(op: ^Operation) -> Op_Result {
op.send.sent += n
if op.send.sent < total {
if n < total {
return send_exec(op)
}
@@ -868,7 +868,7 @@ recv_exec :: proc(op: ^Operation) -> Op_Result {
assert(is_tcp || op.recv.received == 0)
op.recv.received += n
if is_tcp && n != 0 && op.recv.received < total {
if is_tcp && n != 0 && n < total {
return recv_exec(op)
}

34
core/simd/arm/aes.odin Normal file
View File

@@ -0,0 +1,34 @@
#+build arm64,arm32
package simd_arm
// vaeseq_u8 performs a single AES encryption round step on data with the
// given round key (wraps the `llvm.*.crypto.aese` intrinsic bound below).
@(require_results, enable_target_feature = "aes")
vaeseq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t {
return _vaeseq_u8(data, key)
}
// vaesdq_u8 performs a single AES decryption round step on data with the
// given round key (wraps the `llvm.*.crypto.aesd` intrinsic bound below).
@(require_results, enable_target_feature = "aes")
vaesdq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t {
return _vaesdq_u8(data, key)
}
// vaesmcq_u8 applies the AES MixColumns transformation to data
// (wraps the `llvm.*.crypto.aesmc` intrinsic bound below).
@(require_results, enable_target_feature = "aes")
vaesmcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t {
return _vaesmcq_u8(data)
}
// vaesimcq_u8 applies the AES inverse MixColumns transformation to data
// (wraps the `llvm.*.crypto.aesimc` intrinsic bound below).
@(require_results, enable_target_feature = "aes")
vaesimcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t {
return _vaesimcq_u8(data)
}
// Raw LLVM crypto-intrinsic bindings; the architecture-specific link name
// is selected at compile time. Callers should use the public wrappers,
// which gate on the `aes` target feature.
@(private, default_calling_convention = "none")
foreign _ {
@(link_name = "llvm.aarch64.crypto.aese" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aese")
_vaeseq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t ---
@(link_name = "llvm.aarch64.crypto.aesd" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesd")
_vaesdq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t ---
@(link_name = "llvm.aarch64.crypto.aesmc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesmc")
_vaesmcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t ---
@(link_name = "llvm.aarch64.crypto.aesimc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesimc")
_vaesimcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t ---
}

2
core/simd/arm/doc.odin Normal file
View File

@@ -0,0 +1,2 @@
// `SIMD` intrinsics specific to ARMv8 `arm32` and `arm64` architectures.
package simd_arm

108
core/simd/arm/sha.odin Normal file
View File

@@ -0,0 +1,108 @@
#+build arm64,arm32
package simd_arm
// Wrappers for the ARMv8 Cryptographic Extension SHA-1/SHA-256/SHA-512
// instructions, mirroring the `vsha*` intrinsics from `arm_neon.h`.
// SHA-1/SHA-256 procedures require the `sha2` target feature;
// SHA-512 procedures require `sha3` (see note below).

// SHA-1 hash update, "choose" variant (SHA1C instruction).
@(require_results, enable_target_feature = "sha2")
vsha1cq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
return _vsha1cq_u32(hash_abcd, e, wk)
}
// SHA-1 hash update, "parity" variant (SHA1P instruction).
@(require_results, enable_target_feature = "sha2")
vsha1pq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
return _vsha1pq_u32(hash_abcd, e, wk)
}
// SHA-1 hash update, "majority" variant (SHA1M instruction).
@(require_results, enable_target_feature = "sha2")
vsha1mq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
return _vsha1mq_u32(hash_abcd, e, wk)
}
// SHA-1 fixed rotate of the `e` working value (SHA1H instruction).
@(require_results, enable_target_feature = "sha2")
vsha1h_u32 :: #force_inline proc "c" (e: uint32_t) -> uint32_t {
return _vsha1h_u32(e)
}
// SHA-1 message schedule update, part 1 (SHA1SU0 instruction).
@(require_results, enable_target_feature = "sha2")
vsha1su0q_u32 :: #force_inline proc "c" (w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t {
return _vsha1su0q_u32(w0_3, w4_7, w8_11)
}
// SHA-1 message schedule update, part 2 (SHA1SU1 instruction).
@(require_results, enable_target_feature = "sha2")
vsha1su1q_u32 :: #force_inline proc "c" (tw0_3, w12_15: uint32x4_t) -> uint32x4_t {
return _vsha1su1q_u32(tw0_3, w12_15)
}
// SHA-256 hash update, part 1 (SHA256H instruction).
@(require_results, enable_target_feature = "sha2")
vsha256hq_u32 :: #force_inline proc "c" (hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t {
return _vsha256hq_u32(hash_abcd, hash_efgh, wk)
}
// SHA-256 hash update, part 2 (SHA256H2 instruction).
@(require_results, enable_target_feature = "sha2")
vsha256h2q_u32 :: #force_inline proc "c" (hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t {
return _vsha256h2q_u32(hash_efgh, hash_abcd, wk)
}
// SHA-256 message schedule update, part 1 (SHA256SU0 instruction).
@(require_results, enable_target_feature = "sha2")
vsha256su0q_u32 :: #force_inline proc "c" (w0_3, w4_7: uint32x4_t) -> uint32x4_t {
return _vsha256su0q_u32(w0_3, w4_7)
}
// SHA-256 message schedule update, part 2 (SHA256SU1 instruction).
@(require_results, enable_target_feature = "sha2")
vsha256su1q_u32 :: #force_inline proc "c" (tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t {
return _vsha256su1q_u32(tw0_3, w8_11, w12_15)
}
// Note: The SHA512 instructions are part of the `sha3` feature set.
// SHA-512 hash update, part 1 (SHA512H instruction).
@(require_results, enable_target_feature = "sha3")
vsha512hq_u64 :: #force_inline proc "c" (hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t {
return _vsha512hq_u64(hash_ed, hash_gf, kwh_kwh2)
}
// SHA-512 hash update, part 2 (SHA512H2 instruction).
@(require_results, enable_target_feature = "sha3")
vsha512h2q_u64 :: #force_inline proc "c" (sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t {
return _vsha512h2q_u64(sum_ab, hash_c_, hash_ab)
}
// SHA-512 message schedule update, part 1 (SHA512SU0 instruction).
@(require_results, enable_target_feature = "sha3")
vsha512su0q_u64 :: #force_inline proc "c" (w0_1, w2_: uint64x2_t) -> uint64x2_t {
return _vsha512su0q_u64(w0_1, w2_)
}
// SHA-512 message schedule update, part 2 (SHA512SU1 instruction).
@(require_results, enable_target_feature = "sha3")
vsha512su1q_u64 :: #force_inline proc "c" (s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t {
return _vsha512su1q_u64(s01_s02, w14_15, w9_10)
}
// Raw LLVM crypto intrinsics; the link name is selected per
// architecture (AArch64 vs 32-bit ARM NEON) at compile time.
@(private, default_calling_convention = "none")
foreign _ {
@(link_name = "llvm.aarch64.crypto.sha1c" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1c")
_vsha1cq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha1p" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1p")
_vsha1pq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha1m" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1m")
_vsha1mq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha1h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1h")
_vsha1h_u32 :: proc(e: uint32_t) -> uint32_t ---
@(link_name = "llvm.aarch64.crypto.sha1su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su0")
_vsha1su0q_u32 :: proc(w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha1su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su1")
_vsha1su1q_u32 :: proc(tw0_3, w12_15: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha256h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h")
_vsha256hq_u32 :: proc(hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha256h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h2")
_vsha256h2q_u32 :: proc(hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha256su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su0")
_vsha256su0q_u32 :: proc(w0_3, w4_7: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha256su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su1")
_vsha256su1q_u32 :: proc(tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha512h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h")
_vsha512hq_u64 :: proc(hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t ---
@(link_name = "llvm.aarch64.crypto.sha512h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h2")
_vsha512h2q_u64 :: proc(sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t ---
@(link_name = "llvm.aarch64.crypto.sha512su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su0")
_vsha512su0q_u64 :: proc(w0_1, w2_: uint64x2_t) -> uint64x2_t ---
@(link_name = "llvm.aarch64.crypto.sha512su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su1")
_vsha512su1q_u64 :: proc(s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t ---
}

9
core/simd/arm/types.odin Normal file
View File

@@ -0,0 +1,9 @@
#+build arm64,arm32
package simd_arm
// Type aliases to match `arm_neon.h`.
// Scalar lane type used by the SHA-1 intrinsics.
uint32_t :: u32
// 128-bit vector types: 16x u8, 4x u32, and 2x u64 lanes respectively.
uint8x16_t :: #simd[16]u8
uint32x4_t :: #simd[4]u32
uint64x2_t :: #simd[2]u64

View File

@@ -82,7 +82,7 @@ internal_block_literal_make :: proc (is_global: bool, user_data: rawptr, user_pr
BLOCK_HAS_COPY_DISPOSE :: 1 << 25
BLOCK_HAS_CTOR :: 1 << 26 // helpers have C++ code
BLOCK_IS_GLOBAL :: 1 << 28
BLOCK_HAS_STRET :: 1 << 29 // IFF BLOCK_HAS_SIGNATURE
BLOCK_HAS_STRET :: 1 << 29 // if and only if (⟺) BLOCK_HAS_SIGNATURE
BLOCK_HAS_SIGNATURE :: 1 << 30
bl.isa = is_global ? &_NSConcreteGlobalBlock : &_NSConcreteStackBlock

View File

@@ -1683,7 +1683,20 @@ gb_internal void init_android_values(bool with_sdk) {
gb_exit(1);
}
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = concatenate_strings(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN, str_lit("sysroot/usr/lib/aarch64-linux-android/"));
switch (bc->metrics.arch) {
case TargetArch_arm64:
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("aarch64-linux-android");
break;
case TargetArch_arm32:
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("arm-linux-androideabi");
break;
case TargetArch_amd64:
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("x86_64-linux-android");
break;
case TargetArch_i386:
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("i686-linux-android");
break;
}
char buf[32] = {};
gb_snprintf(buf, gb_size_of(buf), "%d/", bc->ODIN_ANDROID_API_LEVEL);
@@ -1958,9 +1971,22 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
} else if (metrics->os == TargetOs_linux && subtarget == Subtarget_Android) {
switch (metrics->arch) {
case TargetArch_arm64:
bc->metrics.target_triplet = str_lit("aarch64-none-linux-android");
bc->metrics.target_triplet = str_lit("aarch64-linux-android");
bc->reloc_mode = RelocMode_PIC;
break;
case TargetArch_arm32:
bc->metrics.target_triplet = str_lit("armv7a-linux-androideabi");
bc->reloc_mode = RelocMode_PIC;
break;
case TargetArch_amd64:
bc->metrics.target_triplet = str_lit("x86_64-linux-android");
bc->reloc_mode = RelocMode_PIC;
break;
case TargetArch_i386:
bc->metrics.target_triplet = str_lit("i686-linux-android");
bc->reloc_mode = RelocMode_PIC;
break;
default:
GB_PANIC("Unknown architecture for -subtarget:android");
}

View File

@@ -2971,14 +2971,21 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper
if (check_is_assignable_to(c, x, y->type) ||
check_is_assignable_to(c, y, x->type)) {
if (x->type->failure || y->type->failure) {
// // skip any failures
x->mode = Addressing_Value;
x->type = t_untyped_bool;
return;
}
Type *err_type = x->type;
bool defined = false;
switch (op) {
case Token_CmpEq:
case Token_NotEq:
defined = (is_type_comparable(x->type) && is_type_comparable(y->type)) ||
(is_operand_nil(*x) && type_has_nil(y->type)) ||
(is_operand_nil(*y) && type_has_nil(x->type));
defined = ((is_operand_nil(*x) && type_has_nil(y->type)) ||
(is_operand_nil(*y) && type_has_nil(x->type)) ||
is_type_comparable(x->type) && is_type_comparable(y->type));
break;
case Token_Lt:
case Token_Gt:
@@ -4476,9 +4483,9 @@ gb_internal void check_binary_expr(CheckerContext *c, Operand *x, Ast *node, Typ
truncated: r = a - b*trunc(a/b)
floored: r = a - b*floor(a/b)
IFF a/0 == 0, then (a%0 == a) or (a%%0 == a)
IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0)
IFF a/0 == 0b111..., then (a%0 == a) or (a%%0 == a)
If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a)
If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0)
If and only if (⟺) a/0 == 0b111..., then (a%0 == a) or (a%%0 == a)
*/
switch (zero_behaviour) {

View File

@@ -1853,7 +1853,7 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
if (is_using && (feature_flags & OptInFeatureFlag_UsingStmt) == 0) {
ERROR_BLOCK();
error(param, "'using' has been disallowed as it is considered bad practice to use as a statement/procedure parameter outside of immediate refactoring");
error_line("\tIt you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n");
error_line("\tIf you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n");
}
if (type_expr == nullptr) {

View File

@@ -2240,7 +2240,7 @@ gb_internal void add_type_info_type_internal(CheckerContext *c, Type *t) {
case Type_BitSet:
add_type_info_type_internal(c, bt->BitSet.elem);
add_type_info_type_internal(c, bt->BitSet.underlying);
add_type_info_type_internal(c, bit_set_to_int(bt));
break;
case Type_Pointer:
@@ -2484,7 +2484,7 @@ gb_internal void add_min_dep_type_info(Checker *c, Type *t) {
case Type_BitSet:
add_min_dep_type_info(c, bt->BitSet.elem);
add_min_dep_type_info(c, bt->BitSet.underlying);
add_min_dep_type_info(c, bit_set_to_int(bt));
break;
case Type_Pointer:

View File

@@ -676,7 +676,7 @@ try_cross_linking:;
defer (gb_string_free(glue));
glue = gb_string_append_fmt(glue, "bin/clang");
glue = gb_string_append_fmt(glue, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL);
glue = gb_string_append_fmt(glue, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL);
glue = gb_string_appendc(glue, "-c \"");
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK.text, ODIN_ANDROID_NDK.len);
glue = gb_string_appendc(glue, "sources/android/native_app_glue/android_native_app_glue.c");
@@ -697,8 +697,9 @@ try_cross_linking:;
glue = gb_string_appendc(glue, "\"-I");
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
glue = gb_string_appendc(glue, "sysroot/usr/include/aarch64-linux-android/");
glue = gb_string_appendc(glue, "\" ");
glue = gb_string_appendc(glue, "sysroot/usr/include/");
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.text, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.len);
glue = gb_string_appendc(glue, "/\" ");
glue = gb_string_appendc(glue, "-Wno-macro-redefined ");
@@ -969,7 +970,7 @@ try_cross_linking:;
gbString ndk_bin_directory = gb_string_make_length(temporary_allocator(), ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
link_command_line = gb_string_appendc(link_command_line, ndk_bin_directory);
link_command_line = gb_string_appendc(link_command_line, "bin/clang");
link_command_line = gb_string_append_fmt(link_command_line, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL);
link_command_line = gb_string_append_fmt(link_command_line, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL);
} else {
link_command_line = gb_string_appendc(link_command_line, clang_path);
}

View File

@@ -66,6 +66,19 @@ gb_internal String get_final_microarchitecture() {
gb_internal String get_default_features() {
BuildContext *bc = &build_context;
if (bc->microarch == str_lit("native")) {
String features = make_string_c(LLVMGetHostCPUFeatures());
// Update the features string so LLVM uses it later.
if (bc->target_features_string.len > 0) {
bc->target_features_string = concatenate3_strings(permanent_allocator(), features, str_lit(","), bc->target_features_string);
} else {
bc->target_features_string = features;
}
return features;
}
int off = 0;
for (int i = 0; i < bc->metrics.arch; i += 1) {
off += target_microarch_counts[i];

View File

@@ -1424,8 +1424,8 @@ gb_internal LLVMValueRef lb_integer_modulo(lbProcedure *p, LLVMValueRef lhs, LLV
truncated: r = a - b*trunc(a/b)
floored: r = a - b*floor(a/b)
IFF a/0 == 0, then (a%0 == a) or (a%%0 == a)
IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0)
If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a)
If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0)
*/
switch (behaviour) {

View File

@@ -57,6 +57,33 @@ test_align_bumping_block_limit :: proc(t: ^testing.T) {
testing.expect(t, len(data) == 896)
}
@(test)
// Exercises a growing virtual arena whose minimum block size (16 MiB)
// is smaller than some requested allocations, checking that oversized
// and repeated allocations both succeed and honour alignment.
test_large_minimum_block_size :: proc(t: ^testing.T) {
a: virtual.Arena
defer virtual.arena_destroy(&a)
// 16 MiB minimum block size; allocations below exceed it (18874368 = 18 MiB),
// forcing the arena to allocate blocks larger than its configured minimum.
init_err := virtual.arena_init_growing(&a, 16*mem.Megabyte)
testing.expect_value(t, init_err, nil)
align : uint = 4
for _ in 0..<6 {
// Oversized allocation with a doubling alignment (4 up to 128);
// the arena is reset between iterations so each starts empty.
data, err := virtual.arena_alloc(&a, 18874368, align)
testing.expect_value(t, err, nil)
testing.expect(t, len(data) == 18874368)
// raw pointer must be aligned to `align` (align is a power of two here).
testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0)
align *= 2
virtual.arena_free_all(&a)
}
align = 4
// 32 consecutive 1 MiB allocations without freeing, filling and growing
// the arena past a single block.
// NOTE(review): `align` stays 4 in this loop (no doubling) — presumably
// intentional, since the focus here is accumulation rather than alignment.
for _ in 0..<32 {
data, err := virtual.arena_alloc(&a, 1048576, align)
testing.expect_value(t, err, nil)
testing.expect(t, len(data) == 1048576)
testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0)
}
}
@(test)
tlsf_test_overlap_and_zero :: proc(t: ^testing.T) {
default_allocator := context.allocator

View File

@@ -71,7 +71,7 @@
/*
@@ LUAI_IS32INT is true iff 'int' has (at least) 32 bits.
@@ LUAI_IS32INT is true if and only if (⟺) 'int' has (at least) 32 bits.
*/
#define LUAI_IS32INT ((UINT_MAX >> 30) >= 3)

View File

@@ -119,8 +119,8 @@ DeviceInfo :: struct {
structVersion: c.int, /**< this internal structure version */
interf: cstring, /**< underlying MIDI API, e.g. MMSystem or DirectX */
name: cstring, /**< device name, e.g. USB MidiSport 1x1 */
input: b32, /**< true iff input is available */
output: b32, /**< true iff output is available */
input: b32, /**< true if and only if () input is available */
output: b32, /**< true if and only if () output is available */
opened: b32, /**< used by generic PortMidi code to do error checking on arguments */
}