mirror of
https://github.com/odin-lang/Odin.git
synced 2026-05-26 21:58:14 +00:00
Merge branch 'master' into fix/field-first-writes-into-dynamic-soa
This commit is contained in:
2
.gitattributes
vendored
2
.gitattributes
vendored
@@ -8,3 +8,5 @@ vendor/box2d/lib/box2d_windows_amd64_sse2.lib filter=lfs diff=lfs merge=lfs -tex
|
||||
vendor/miniaudio/lib/miniaudio.lib filter=lfs diff=lfs merge=lfs -text
|
||||
vendor/sdl3/SDL3.dll filter=lfs diff=lfs merge=lfs -text
|
||||
vendor/sdl3/SDL3.lib filter=lfs diff=lfs merge=lfs -text
|
||||
vendor/sdl3/mixer/*.dll filter=lfs diff=lfs merge=lfs -text
|
||||
vendor/sdl3/mixer/*.lib filter=lfs diff=lfs merge=lfs -text
|
||||
|
||||
15
.github/workflows/ci.yml
vendored
15
.github/workflows/ci.yml
vendored
@@ -93,8 +93,8 @@ jobs:
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: |
|
||||
brew update
|
||||
brew install llvm@20 wasmtime lua@5.4 lld
|
||||
echo "$(brew --prefix llvm@20)/bin" >> $GITHUB_PATH
|
||||
brew install llvm@22 wasmtime lua@5.4 lld
|
||||
echo "$(brew --prefix llvm@22)/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Download LLVM (Ubuntu)
|
||||
if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-24.04-arm'
|
||||
@@ -140,9 +140,18 @@ jobs:
|
||||
- name: Optimized Core library tests
|
||||
run: ./odin test tests/core/speed.odin -o:speed -file -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address
|
||||
- name: Wycheproof tests
|
||||
run: ./odin test tests/core/crypto/wycheproof -vet -vet-tabs -strict-style -vet-style -vet-cast -warnings-as-errors -disallow-do -o:speed -microarch:native
|
||||
|
||||
run: ./odin test tests/core/crypto/wycheproof -vet -vet-tabs -strict-style -vet-style -vet-cast -warnings-as-errors -disallow-do -o:speed
|
||||
- name: Vendor library tests
|
||||
run: ./odin test tests/vendor -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address
|
||||
if: matrix.os != 'macos-15-intel' && matrix.os != 'macos-latest'
|
||||
- name: Vendor library tests (MacOS ARM)
|
||||
run: ./odin test tests/vendor -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address -extra-linker-flags:"-L/opt/homebrew/opt/lua@5.4/lib"
|
||||
if: matrix.os == 'macos-latest'
|
||||
- name: Vendor library tests (MacOS Intel)
|
||||
run: ./odin test tests/vendor -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address -extra-linker-flags:"-L/usr/local/opt/lua@5.4/lib"
|
||||
if: matrix.os == 'macos-15-intel'
|
||||
|
||||
- name: Internals tests
|
||||
run: ./odin test tests/internal -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address
|
||||
- name: GitHub Issue tests
|
||||
|
||||
1
.github/workflows/nightly.yml
vendored
1
.github/workflows/nightly.yml
vendored
@@ -58,6 +58,7 @@ jobs:
|
||||
musl-dev llvm20-dev clang20 git mold lz4 \
|
||||
libxml2-static llvm20-static zlib-static zstd-static \
|
||||
make &&
|
||||
git config --global --add safe.directory /src &&
|
||||
./ci/build_linux_static.sh
|
||||
'
|
||||
- name: Odin run
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// This is purely for documentation
|
||||
// This is purely for documentation
|
||||
#+build ignore
|
||||
package intrinsics
|
||||
|
||||
@@ -77,7 +77,9 @@ prefetch_write_instruction :: proc(address: rawptr, #const locality: i32 /* 0..=
|
||||
prefetch_write_data :: proc(address: rawptr, #const locality: i32 /* 0..=3 */) ---
|
||||
|
||||
// Compiler Hints
|
||||
expect :: proc(val, expected_val: $T) -> T ---
|
||||
expect :: proc(val, expected_val: $T) -> T ---
|
||||
likely :: proc(val: $T) -> T where type_is_boolean(T) ---
|
||||
unlikely :: proc(val: $T) -> T where type_is_boolean(T) ---
|
||||
|
||||
// Linux and Darwin Only
|
||||
syscall :: proc(id: uintptr, args: ..uintptr) -> uintptr ---
|
||||
@@ -180,6 +182,7 @@ type_is_bit_set :: proc($T: typeid) -> bool ---
|
||||
type_is_bit_field :: proc($T: typeid) -> bool ---
|
||||
type_is_simd_vector :: proc($T: typeid) -> bool ---
|
||||
type_is_matrix :: proc($T: typeid) -> bool ---
|
||||
type_is_fixed_capacity_dynamic_array :: proc($T: typeid) -> bool ---
|
||||
|
||||
type_has_nil :: proc($T: typeid) -> bool ---
|
||||
|
||||
@@ -202,6 +205,9 @@ type_bit_set_underlying_type :: proc($T: typeid) -> typeid where type_is_bit_set
|
||||
type_has_field :: proc($T: typeid, $name: string) -> bool ---
|
||||
type_field_type :: proc($T: typeid, $name: string) -> typeid ---
|
||||
|
||||
type_field_bit_size :: proc($T: typeid, $name: string) -> int where type_is_bit_field(T) ---
|
||||
type_field_bit_offset :: proc($T: typeid, $name: string) -> int where type_is_bit_field(T) ---
|
||||
|
||||
type_proc_parameter_count :: proc($T: typeid) -> int where type_is_proc(T) ---
|
||||
type_proc_return_count :: proc($T: typeid) -> int where type_is_proc(T) ---
|
||||
|
||||
@@ -222,6 +228,8 @@ type_is_superset_of :: proc($Super, $Sub: typeid) -> bool ---
|
||||
|
||||
type_field_index_of :: proc($T: typeid, $name: string) -> uintptr ---
|
||||
|
||||
type_fixed_capacity_dynamic_array_len_offset :: proc($T: typeid/[dynamic; $N]$E) -> uintptr ---
|
||||
|
||||
// "Contiguous" means that the set of enum constants, when sorted, have a difference of either 0 or 1 between consecutive values.
|
||||
// This is the exact opposite of "sparse".
|
||||
type_enum_is_contiguous :: proc($T: typeid) -> bool where type_is_enum(T) ---
|
||||
@@ -340,7 +348,11 @@ simd_trunc :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
|
||||
// rounding to the nearest integral value; if two values are equally near, rounds to the even one
|
||||
simd_nearest :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
|
||||
|
||||
simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---
|
||||
// NOTE(review): presumably a fast approximate reciprocal (1/x) per lane, inferred from the name — confirm.
// Only defined for vectors whose element type `T` is a float.
simd_approx_recip :: proc(x: #simd[N]T) -> #simd[N]T where type_is_float(T) ---
|
||||
// NOTE(review): presumably a fast approximate reciprocal square root (1/sqrt(x)) per lane, inferred from the name — confirm.
// Only defined for vectors whose element type `T` is a float.
simd_approx_recip_sqrt :: proc(x: #simd[N]T) -> #simd[N]T where type_is_float(T) ---
|
||||
|
||||
simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---
|
||||
simd_to_bits_signed :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), !type_is_unsigned(Integer) ---
|
||||
|
||||
// equivalent to a swizzle with descending indices, e.g. swizzle(a, 3, 2, 1, 0)
|
||||
simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---
|
||||
@@ -348,6 +360,16 @@ simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---
|
||||
simd_lanes_rotate_left :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
|
||||
simd_lanes_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
|
||||
|
||||
// return {b[0], a[1], b[2], a[3], ...}
|
||||
simd_odd_even :: proc(a, b: #simd[N]T) -> #simd[N]T ---
|
||||
|
||||
// Returns the sums of N consecutive lanes
|
||||
simd_sums_of_n :: proc(a: #simd[LANES]T, $N: uint) -> #simd[LANES/N]T where is_power_of_two(N) ---
|
||||
|
||||
simd_pairwise_add :: proc(a, b: #simd[LANES]T) -> #simd[LANES/N]T ---
|
||||
simd_pairwise_sub :: proc(a, b: #simd[LANES]T) -> #simd[LANES/N]T ---
|
||||
|
||||
|
||||
// Checks if the current target supports the given target features.
|
||||
//
|
||||
// Takes a constant comma-separated string (eg: "sha512,sse4.1"), or a procedure type which has either
|
||||
|
||||
@@ -39,6 +39,10 @@ Calling_Convention :: enum u8 {
|
||||
|
||||
Win64 = 9,
|
||||
SysV = 10,
|
||||
|
||||
Preserve_None = 11,
|
||||
Preserve_Most = 12,
|
||||
Preserve_All = 13,
|
||||
}
|
||||
|
||||
Type_Info_Enum_Value :: distinct i64
|
||||
@@ -137,7 +141,7 @@ Type_Info_Struct :: struct {
|
||||
|
||||
flags: Type_Info_Struct_Flags,
|
||||
|
||||
// These are only set iff this structure is an SOA structure
|
||||
// These are only set if and only if (⟺) this structure is an SOA structure
|
||||
soa_kind: Type_Info_Struct_Soa_Kind,
|
||||
soa_len: i32,
|
||||
soa_base_type: ^Type_Info,
|
||||
@@ -166,10 +170,11 @@ Type_Info_Map :: struct {
|
||||
map_info: ^Map_Info,
|
||||
}
|
||||
Type_Info_Bit_Set :: struct {
|
||||
elem: ^Type_Info,
|
||||
underlying: ^Type_Info, // Possibly nil
|
||||
lower: i64,
|
||||
upper: i64,
|
||||
elem: ^Type_Info,
|
||||
underlying: ^Type_Info,
|
||||
explicit_underlying: bool, // false = bit_set[T], true = bit_set[T, U]
|
||||
lower: i64,
|
||||
upper: i64,
|
||||
}
|
||||
Type_Info_Simd_Vector :: struct {
|
||||
elem: ^Type_Info,
|
||||
@@ -201,6 +206,14 @@ Type_Info_Bit_Field :: struct {
|
||||
field_count: int,
|
||||
}
|
||||
|
||||
Type_Info_Fixed_Capacity_Dynamic_Array :: struct {
|
||||
elem: ^Type_Info,
|
||||
elem_size: int,
|
||||
capacity: int,
|
||||
len_offset: uintptr,
|
||||
}
|
||||
|
||||
|
||||
Type_Info_Flag :: enum u8 {
|
||||
Comparable = 0,
|
||||
Simple_Compare = 1,
|
||||
@@ -241,6 +254,7 @@ Type_Info :: struct {
|
||||
Type_Info_Matrix,
|
||||
Type_Info_Soa_Pointer,
|
||||
Type_Info_Bit_Field,
|
||||
Type_Info_Fixed_Capacity_Dynamic_Array,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -420,6 +434,11 @@ Raw_Dynamic_Array :: struct {
|
||||
allocator: Allocator,
|
||||
}
|
||||
|
||||
Raw_Fixed_Capacity_Dynamic_Array :: struct($Capacity: uint, $T: typeid) {
|
||||
data: [Capacity]T,
|
||||
len: int,
|
||||
}
|
||||
|
||||
// The raw, type-erased representation of a map.
|
||||
//
|
||||
// 32-bytes on 64-bit
|
||||
@@ -654,9 +673,8 @@ type_info_base :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
|
||||
return base
|
||||
}
|
||||
|
||||
|
||||
// type_info_core returns the core-type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
|
||||
// returns the backing integer type of an enum or bit_set `^Type_Info`.
|
||||
// returns the backing integer type of an enum `^Type_Info`.
|
||||
// This is also aliased as `type_info_base_without_enum`
|
||||
@(require_results)
|
||||
type_info_core :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
|
||||
@@ -676,11 +694,35 @@ type_info_core :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
|
||||
return base
|
||||
}
|
||||
|
||||
// type_info_base_without_enum returns the core-type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
|
||||
|
||||
|
||||
// type_info_underlying walks a `^Type_Info` down to its underlying (backing) type.
// Each step unwraps one layer: a named type yields its base, an enum yields its base,
// a bit_set yields its underlying type, and a bit_field yields its backing type.
// The walk stops at the first `^Type_Info` that is none of these. A nil `info` yields nil.
@(require_results)
type_info_underlying :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
	current := info
	if current == nil {
		return nil
	}

	unwrap: for {
		#partial switch v in current.variant {
		case Type_Info_Named:
			current = v.base
		case Type_Info_Enum:
			current = v.base
		case Type_Info_Bit_Set:
			current = v.underlying
		case Type_Info_Bit_Field:
			current = v.backing_type
		case:
			// Not a wrapper type: this is the underlying type.
			break unwrap
		}
	}
	return current
}
|
||||
|
||||
// `type_info_base_without_enum` returns the core-type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
|
||||
// returns the backing integer type of an enum or bit_set `^Type_Info`.
|
||||
// This is also aliased as `type_info_core`
|
||||
type_info_base_without_enum :: type_info_core
|
||||
|
||||
@(require_results)
|
||||
__type_info_of :: proc "contextless" (id: typeid) -> ^Type_Info #no_bounds_check {
|
||||
n := u64(len(type_table))
|
||||
i := transmute(u64)id % n
|
||||
@@ -696,14 +738,16 @@ __type_info_of :: proc "contextless" (id: typeid) -> ^Type_Info #no_bounds_check
|
||||
|
||||
when !ODIN_NO_RTTI {
|
||||
// typeid_base returns the base-type of a `typeid` stripping the `distinct`ness from the first level
|
||||
@(require_results)
|
||||
typeid_base :: proc "contextless" (id: typeid) -> typeid {
|
||||
ti := type_info_of(id)
|
||||
ti = type_info_base(ti)
|
||||
return ti.id
|
||||
}
|
||||
// typeid_core returns the core-type of a `typeid` stripping the `distinct`ness from the first level AND/OR
|
||||
// returns the backing integer type of an enum or bit_set `typeid`.
|
||||
// returns the backing integer type of an enum `typeid`.
|
||||
// This is also aliased as `typeid_base_without_enum`
|
||||
@(require_results)
|
||||
typeid_core :: proc "contextless" (id: typeid) -> typeid {
|
||||
ti := type_info_core(type_info_of(id))
|
||||
return ti.id
|
||||
@@ -713,6 +757,12 @@ when !ODIN_NO_RTTI {
|
||||
// returns the backing integer type of an enum or bit_set `typeid`.
|
||||
// This is also aliased as `typeid_core`
|
||||
typeid_base_without_enum :: typeid_core
|
||||
|
||||
// typeid_underlying returns the `typeid` of the type that `type_info_underlying` resolves `id` to.
@(require_results)
typeid_underlying :: proc "contextless" (id: typeid) -> typeid {
	return type_info_underlying(type_info_of(id)).id
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -119,14 +119,14 @@ copy :: proc{copy_slice, copy_from_string, copy_from_string16}
|
||||
|
||||
|
||||
|
||||
// `unordered_remove` removed the element at the specified `index`. It does so by replacing the current end value
|
||||
// `unordered_remove_dynamic_array` removes the element at the specified `index`. It does so by replacing the current end value
|
||||
// with the old value, and reducing the length of the dynamic array by 1.
|
||||
//
|
||||
// Note: This is an O(1) operation.
|
||||
// Note: If you want the elements to remain in their order, use `ordered_remove`.
|
||||
// Note: If the index is out of bounds, this procedure will panic.
|
||||
@builtin
|
||||
unordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
|
||||
unordered_remove_dynamic_array :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
|
||||
bounds_check_error_loc(loc, index, len(array))
|
||||
n := len(array)-1
|
||||
if index != n {
|
||||
@@ -134,13 +134,13 @@ unordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #ca
|
||||
}
|
||||
(^Raw_Dynamic_Array)(array).len -= 1
|
||||
}
|
||||
// `ordered_remove` removed the element at the specified `index` whilst keeping the order of the other elements.
|
||||
// `ordered_remove_dynamic_array` removes the element at the specified `index` whilst keeping the order of the other elements.
|
||||
//
|
||||
// Note: This is an O(N) operation.
|
||||
// Note: If the elements do not have to remain in their order, prefer `unordered_remove`.
|
||||
// Note: If the index is out of bounds, this procedure will panic.
|
||||
@builtin
|
||||
ordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
|
||||
ordered_remove_dynamic_array :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
|
||||
bounds_check_error_loc(loc, index, len(array))
|
||||
if index+1 < len(array) {
|
||||
copy(array[index:], array[index+1:])
|
||||
@@ -148,12 +148,12 @@ ordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #call
|
||||
(^Raw_Dynamic_Array)(array).len -= 1
|
||||
}
|
||||
|
||||
// `remove_range` removes a range of elements specified by the range `lo` and `hi`, whilst keeping the order of the other elements.
|
||||
// `remove_range_dynamic_array` removes a range of elements specified by the range `lo` and `hi`, whilst keeping the order of the other elements.
|
||||
//
|
||||
// Note: This is an O(N) operation.
|
||||
// Note: If the range is out of bounds, this procedure will panic.
|
||||
@builtin
|
||||
remove_range :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #caller_location) #no_bounds_check {
|
||||
remove_range_dynamic_array :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #caller_location) #no_bounds_check {
|
||||
slice_expr_error_lo_hi_loc(loc, lo, hi, len(array))
|
||||
n := max(hi-lo, 0)
|
||||
if n > 0 {
|
||||
@@ -164,29 +164,117 @@ remove_range :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #calle
|
||||
}
|
||||
}
|
||||
|
||||
// `unordered_remove_fixed_capacity_dynamic_array` removes the element at the specified `index` by overwriting it
// with the current last element, and then reduces the length of the fixed capacity dynamic array by 1.
//
// Note: This is an O(1) operation.
// Note: If you want the elements to remain in their order, use `ordered_remove`.
// Note: If the index is out of bounds, this procedure will panic.
@builtin
unordered_remove_fixed_capacity_dynamic_array :: proc(array: ^$D/[dynamic; $N]$E, #any_int index: int, loc := #caller_location) #no_bounds_check {
	bounds_check_error_loc(loc, index, len(array))

	last := len(array)-1
	if index != last {
		// Move the last element into the vacated slot; order is not preserved.
		array[index] = array[last]
	}

	raw := (^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array)
	raw.len -= 1
}
|
||||
// `ordered_remove_fixed_capacity_dynamic_array` removes the element at the specified `index`, shifting the
// elements after it down by one so that the order of the remaining elements is kept.
//
// Note: This is an O(N) operation.
// Note: If the elements do not have to remain in their order, prefer `unordered_remove`.
// Note: If the index is out of bounds, this procedure will panic.
@builtin
ordered_remove_fixed_capacity_dynamic_array :: proc(array: ^$D/[dynamic; $N]$E, #any_int index: int, loc := #caller_location) #no_bounds_check {
	bounds_check_error_loc(loc, index, len(array))

	next := index+1
	if next < len(array) {
		// Close the gap left by the removed element.
		copy(array[index:], array[next:])
	}

	raw := (^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array)
	raw.len -= 1
}
|
||||
|
||||
// `pop` will remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
// `remove_range_fixed_capacity_dynamic_array` removes the elements in the range `lo` to `hi` (`hi` exclusive),
// whilst keeping the order of the other elements. An empty range leaves the array untouched.
//
// Note: This is an O(N) operation.
// Note: If the range is out of bounds, this procedure will panic.
@builtin
remove_range_fixed_capacity_dynamic_array :: proc(array: ^$D/[dynamic; $N]$E, #any_int lo, hi: int, loc := #caller_location) #no_bounds_check {
	slice_expr_error_lo_hi_loc(loc, lo, hi, len(array))

	count := max(hi-lo, 0)
	if count == 0 {
		return
	}

	if hi != len(array) {
		// Slide the tail down over the removed range.
		copy(array[lo:], array[hi:])
	}

	raw := (^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array)
	raw.len -= count
}
|
||||
|
||||
@builtin
|
||||
unordered_remove :: proc{
|
||||
unordered_remove_dynamic_array,
|
||||
unordered_remove_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
|
||||
@builtin
|
||||
ordered_remove :: proc{
|
||||
ordered_remove_dynamic_array,
|
||||
ordered_remove_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
@builtin
|
||||
remove_range :: proc{
|
||||
remove_range_dynamic_array,
|
||||
remove_range_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
|
||||
|
||||
// `pop_dynamic_array` will remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
//
|
||||
// Note: If the dynamic array has no elements (`len(array) == 0`), this procedure will panic.
|
||||
@builtin
|
||||
pop :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
|
||||
pop_dynamic_array :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
|
||||
assert(len(array) > 0, loc=loc)
|
||||
_pop_type_erased(&res, (^Raw_Dynamic_Array)(array), size_of(E))
|
||||
_pop_dynamic_array_type_erased(&res, (^Raw_Dynamic_Array)(array), size_of(E))
|
||||
return res
|
||||
}
|
||||
|
||||
_pop_type_erased :: proc(res: rawptr, array: ^Raw_Dynamic_Array, elem_size: int, loc := #caller_location) {
|
||||
_pop_dynamic_array_type_erased :: proc(res: rawptr, array: ^Raw_Dynamic_Array, elem_size: int) {
|
||||
end := rawptr(uintptr(array.data) + uintptr(elem_size*(array.len-1)))
|
||||
intrinsics.mem_copy_non_overlapping(res, end, elem_size)
|
||||
array.len -= 1
|
||||
}
|
||||
|
||||
|
||||
// `pop_fixed_capacity_dynamic_array` will remove and return the end value of fixed capacity dynamic array `array` and reduces the length of `array` by 1.
|
||||
//
|
||||
// Note: If the fixed capacity dynamic array has no elements (`len(array) == 0`), this procedure will panic.
|
||||
@builtin
|
||||
pop_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
|
||||
assert(len(array) > 0, loc=loc)
|
||||
|
||||
// `pop_safe` trys to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
elem_size :: size_of(E)
|
||||
end := rawptr(uintptr(array) + uintptr(elem_size*(len(array)-1)))
|
||||
intrinsics.mem_copy_non_overlapping(&res, end, elem_size)
|
||||
(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
|
||||
return res
|
||||
}
|
||||
|
||||
|
||||
// `pop` will remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
//
|
||||
// Note: If the dynamic array has no elements (`len(array) == 0`), this procedure will panic.
|
||||
@builtin
|
||||
pop :: proc{
|
||||
pop_dynamic_array,
|
||||
pop_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
// `pop_safe_dynamic_array` tries to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
// If the operation is not possible, it will return false.
|
||||
@builtin
|
||||
pop_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
|
||||
pop_safe_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
|
||||
if len(array) == 0 {
|
||||
return
|
||||
}
|
||||
@@ -195,11 +283,32 @@ pop_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #n
|
||||
return
|
||||
}
|
||||
|
||||
// `pop_front` will remove and return the first value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
// `pop_safe_fixed_capacity_dynamic_array` tries to remove and return the end value of fixed capacity dynamic array
// `array`, reducing the length of `array` by 1.
// If the array is empty, nothing is removed and `ok` is false.
@builtin
pop_safe_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E) -> (res: E, ok: bool) #no_bounds_check {
	count := len(array)
	if count > 0 {
		res = array[count-1]
		ok  = true
		(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
	}
	return
}
|
||||
|
||||
// `pop_safe` tries to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
// If the operation is not possible, it will return false.
|
||||
@builtin
|
||||
pop_safe :: proc{
|
||||
pop_safe_dynamic_array,
|
||||
pop_safe_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
|
||||
// `pop_front_dynamic_array` will remove and return the first value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
//
|
||||
// Note: If the dynamic array has no elements (`len(array) == 0`), this procedure will panic.
|
||||
@builtin
|
||||
pop_front :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
|
||||
pop_front_dynamic_array :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
|
||||
assert(len(array) > 0, loc=loc)
|
||||
res = array[0]
|
||||
if len(array) > 1 {
|
||||
@@ -209,10 +318,35 @@ pop_front :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #
|
||||
return res
|
||||
}
|
||||
|
||||
// `pop_front_safe` trys to return and remove the first value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
// `pop_front_fixed_capacity_dynamic_array` removes and returns the first value of fixed capacity dynamic array `array`,
// shifting the remaining elements down by one and reducing the length of `array` by 1.
//
// Note: If the fixed capacity dynamic array has no elements (`len(array) == 0`), the assertion in this procedure fails.
@builtin
pop_front_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
	assert(len(array) > 0, loc=loc)

	first := array[0]
	if len(array) > 1 {
		// Shift everything after the first element one slot towards the front.
		copy(array[:], array[1:])
	}

	raw := (^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array)
	raw.len -= 1
	return first
}
|
||||
|
||||
|
||||
// `pop_front` will remove and return the first value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
//
|
||||
// Note: If the dynamic array has no elements (`len(array) == 0`), this procedure will panic.
|
||||
@builtin
|
||||
pop_front :: proc{
|
||||
pop_front_dynamic_array,
|
||||
pop_front_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
|
||||
// `pop_front_safe_dynamic_array` tries to return and remove the first value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
// If the operation is not possible, it will return false.
|
||||
@builtin
|
||||
pop_front_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
|
||||
pop_front_safe_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
|
||||
if len(array) == 0 {
|
||||
return
|
||||
}
|
||||
@@ -224,12 +358,37 @@ pop_front_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bo
|
||||
return
|
||||
}
|
||||
|
||||
// `pop_front_safe_fixed_capacity_dynamic_array` tries to return and remove the first value of fixed capacity dynamic
// array `array`, shifting the remaining elements down by one and reducing the length of `array` by 1.
// If the array is empty, nothing is removed and `ok` is false.
@builtin
pop_front_safe_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E) -> (res: E, ok: bool) #no_bounds_check {
	if len(array) > 0 {
		res = array[0]
		ok  = true
		if len(array) > 1 {
			// Shift everything after the first element one slot towards the front.
			copy(array[:], array[1:])
		}
		(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
	}
	return
}
|
||||
|
||||
// `pop_front_safe` tries to return and remove the first value of dynamic array `array` and reduces the length of `array` by 1.
|
||||
// If the operation is not possible, it will return false.
|
||||
@builtin
|
||||
pop_front_safe :: proc {
|
||||
pop_front_safe_dynamic_array,
|
||||
pop_front_safe_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
|
||||
|
||||
// `clear` will set the length of a passed dynamic array or map to `0`
|
||||
@builtin
|
||||
clear :: proc{
|
||||
clear_dynamic_array,
|
||||
clear_map,
|
||||
clear_fixed_capacity_dynamic_array,
|
||||
|
||||
clear_soa_dynamic_array,
|
||||
}
|
||||
@@ -254,6 +413,7 @@ non_zero_reserve :: proc{
|
||||
@builtin
|
||||
resize :: proc{
|
||||
resize_dynamic_array,
|
||||
resize_fixed_capacity_dynamic_array,
|
||||
|
||||
resize_soa,
|
||||
}
|
||||
@@ -261,6 +421,7 @@ resize :: proc{
|
||||
@builtin
|
||||
non_zero_resize :: proc{
|
||||
non_zero_resize_dynamic_array,
|
||||
non_zero_resize_fixed_capacity_dynamic_array,
|
||||
|
||||
non_zero_resize_soa,
|
||||
}
|
||||
@@ -532,6 +693,8 @@ shrink_map :: proc(m: ^$T/map[$K]$V, loc := #caller_location) -> (did_shrink: bo
|
||||
|
||||
// The delete_key built-in procedure deletes the element with the specified key (m[key]) from the map.
|
||||
// If m is nil, or there is no such element, this procedure is a no-op
|
||||
// It is safe to use `delete_key` while iterating a map.
|
||||
// But if you iterate across a map and insert a new key, it could resize which means you are not iterating across all of the elements.
|
||||
@builtin
|
||||
delete_key :: proc(m: ^$T/map[$K]$V, key: K) -> (deleted_key: K, deleted_value: V) {
|
||||
if m != nil {
|
||||
@@ -669,6 +832,15 @@ non_zero_append_elem_string :: proc(array: ^$T/[dynamic]$E/u8, arg: $A/string, l
|
||||
return _append_elem_string(array, arg, false, loc)
|
||||
}
|
||||
|
||||
// `non_zero_append_elem_fixed_capacity_string` appends the bytes of a string to the end of a fixed capacity
// dynamic array of bytes. Returns the number of bytes appended, which is less than `len(arg)` when the
// remaining capacity is too small to hold the whole string.
//
// Note: Prefer using the procedure group `non_zero_append`.
@builtin
non_zero_append_elem_fixed_capacity_string :: proc "contextless" (array: ^$T/[dynamic; $N]$E/u8, arg: $A/string) -> (n: int) {
	// A string is a run of bytes, not a single element of type `E`, so it has to go through the
	// multi-element append (as `append_fixed_capacity_string` does) rather than `append_fixed_capacity_elem`.
	return append_fixed_capacity_elems(array, ..transmute([]E)arg)
}
|
||||
|
||||
|
||||
|
||||
// The append_string built-in procedure appends multiple strings to the end of a [dynamic]u8 like type
|
||||
//
|
||||
@@ -686,6 +858,57 @@ append_string :: proc(array: ^$T/[dynamic]$E/u8, args: ..string, loc := #caller_
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
// `append_fixed_capacity_elem` appends a single element to the end of a fixed capacity dynamic array.
// Returns the number of elements appended: 1 on success, or 0 when the array already holds `N` elements.
@builtin
append_fixed_capacity_elem :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #no_broadcast arg: E) -> (n: int) {
	raw := (^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array)

	if raw.len < N {
		// Zero-sized elements have nothing to store; only the length changes.
		when size_of(E) != 0 {
			#no_bounds_check raw.data[raw.len] = arg
		}
		raw.len += 1
		n = 1
	}
	return
}
|
||||
|
||||
|
||||
// `append_fixed_capacity_elems` appends as many of `args` as fit to the end of a fixed capacity dynamic array.
// Returns the number of elements actually appended, which is the smaller of the remaining capacity and `len(args)`.
@builtin
append_fixed_capacity_elems :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #no_broadcast args: ..E) -> (n: int) {
	raw := (^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array)

	// Clamp to whatever space is left.
	room := N - len(array)
	n = min(room, len(args))

	#no_bounds_check when size_of(E) != 0 {
		intrinsics.mem_copy(&raw.data[raw.len], raw_data(args), n*size_of(E))
	}

	raw.len += n
	return
}
|
||||
|
||||
// The append_fixed_capacity_string built-in procedure appends multiple strings to the end of a fixed capacity
// dynamic array of bytes. It stops at the first string that does not fit completely and returns the
// total number of bytes appended.
//
// Note: Prefer using the procedure group `append`.
@builtin
append_fixed_capacity_string :: proc "contextless" (array: ^$T/[dynamic; $N]$E/u8, args: ..string) -> (n: int) {
	for arg in args {
		appended := append_fixed_capacity_elems(array, ..transmute([]E)(arg))
		n += appended
		if appended < len(arg) {
			// The array is full; the remaining strings cannot be appended.
			break
		}
	}
	return
}
|
||||
|
||||
|
||||
// The append built-in procedure appends elements to the end of a dynamic array
|
||||
@builtin
|
||||
append :: proc{
|
||||
@@ -693,6 +916,10 @@ append :: proc{
|
||||
append_elems,
|
||||
append_elem_string,
|
||||
|
||||
append_fixed_capacity_elem,
|
||||
append_fixed_capacity_elems,
|
||||
append_fixed_capacity_string,
|
||||
|
||||
append_soa_elem,
|
||||
append_soa_elems,
|
||||
}
|
||||
@@ -703,6 +930,10 @@ non_zero_append :: proc{
|
||||
non_zero_append_elems,
|
||||
non_zero_append_elem_string,
|
||||
|
||||
append_fixed_capacity_elem,
|
||||
append_fixed_capacity_elems,
|
||||
non_zero_append_elem_fixed_capacity_string,
|
||||
|
||||
non_zero_append_soa_elem,
|
||||
non_zero_append_soa_elems,
|
||||
}
|
||||
@@ -711,7 +942,7 @@ non_zero_append :: proc{
|
||||
// `append_nothing` appends an empty value to a dynamic array. It returns `1, nil` if successful, and `0, err` when it was not possible,
|
||||
// whatever `err` happens to be.
|
||||
@builtin
|
||||
append_nothing :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
|
||||
append_nothing_dynamic_array :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
|
||||
if array == nil {
|
||||
return 0, nil
|
||||
}
|
||||
@@ -720,6 +951,27 @@ append_nothing :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: i
|
||||
return len(array)-prev_len, nil
|
||||
}
|
||||
|
||||
// `append_nothing_fixed_capacity_dynamic_array` grows a fixed capacity dynamic array by one element using
// `resize_fixed_capacity_dynamic_array`. `n` is the amount the length actually grew by.
// A nil `array` yields `0, true`; `ok` is false only when the resize itself reports failure.
@builtin
append_nothing_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E) -> (n: int, ok: bool) {
	if array != nil {
		old_len := len(array)
		resize_fixed_capacity_dynamic_array(array, old_len+1) or_return
		n = len(array) - old_len
	}
	return n, true
}
|
||||
|
||||
|
||||
// `append_nothing` appends an empty value to a dynamic array. For a `[dynamic]E` it returns `1, nil` if successful, and `0, err` when it was not possible,
|
||||
// whatever `err` happens to be. For a fixed capacity `[dynamic; N]E` it returns the number of elements added and a boolean instead of an allocator error.
|
||||
@builtin
|
||||
append_nothing :: proc{
|
||||
append_nothing_dynamic_array,
|
||||
append_nothing_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
|
||||
// `inject_at_elem` injects an element in a dynamic array at a specified index and moves the previous elements after that index "across"
|
||||
@builtin
|
||||
@@ -795,16 +1047,92 @@ inject_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, ar
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
// `inject_at_elem_fixed_capacity_dynamic_array` injects an element into a fixed capacity dynamic array at a specified index
// and moves the previous elements after that index "across".
//
// Returns false, leaving the array untouched, when `array` is nil or when `index` does not lie within the capacity `N`.
// NOTE(review): when the array is already full, the shift appears to drop the trailing element while still
// returning true (the resize clamps the length to `N`) - confirm that this saturating behaviour is intended.
@builtin
inject_at_elem_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E, #any_int index: int, #no_broadcast arg: E, loc := #caller_location) -> (ok: bool) #no_bounds_check {
	when !ODIN_NO_BOUNDS_CHECK {
		ensure(index >= 0, "Index must be positive.", loc)
	}
	if array == nil {
		return false
	}
	m :: 1

	// `resize_fixed_capacity_dynamic_array` clamps the requested length to `N` and still reports success,
	// so the index has to be validated here: this procedure runs with `#no_bounds_check`, and an index
	// outside of the capacity would otherwise slice and write past the backing storage.
	if index < 0 || index > N - m {
		return false
	}

	n := max(len(array), index)
	new_size := n + m

	resize(array, new_size) or_return
	when size_of(E) != 0 {
		copy(array[index + m:], array[index:])
		array[index] = arg
	}
	return true
}
|
||||
|
||||
// `inject_at_elems_fixed_capacity_dynamic_array` injects multiple elements into a fixed capacity dynamic array at a specified index
// and moves the previous elements after that index "across".
//
// Returns true without doing anything when `args` is empty.
// Returns false, leaving the array untouched, when `array` is nil or when `index + len(args)` exceeds the capacity `N`.
// NOTE(review): when the existing elements plus `args` exceed `N`, the shift appears to drop trailing elements
// while still returning true (the resize clamps the length to `N`) - confirm that this is intended.
@builtin
inject_at_elems_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool) #no_bounds_check {
	when !ODIN_NO_BOUNDS_CHECK {
		ensure(index >= 0, "Index must be positive.", loc)
	}
	if array == nil {
		return false
	}
	if len(args) == 0 {
		return true
	}

	m := len(args)

	// The resize below clamps to `N` and still succeeds, so make sure the injected elements themselves fit.
	// Written as `index > N - m` so that the check cannot overflow. Without it the `#no_bounds_check`
	// slices below could start past the end of the backing storage.
	if index < 0 || index > N - m {
		return false
	}

	n := max(len(array), index)
	new_size := n + m

	resize(array, new_size) or_return
	when size_of(E) != 0 {
		copy(array[index + m:], array[index:])
		copy(array[index:], args)
	}
	return true
}
|
||||
|
||||
// `inject_at_elem_string_fixed_capacity_dynamic_array` injects the bytes of a string into a fixed capacity dynamic array
// at a specified index and moves the previous elements after that index "across".
//
// Returns true without doing anything when `arg` is empty.
// Returns false, leaving the array untouched, when `array` is nil or when `index + len(arg)` exceeds the capacity `N`.
// NOTE(review): when the existing bytes plus `arg` exceed `N`, the shift appears to drop trailing bytes
// while still returning true (the resize clamps the length to `N`) - confirm that this is intended.
@builtin
inject_at_elem_string_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E/u8, #any_int index: int, arg: string, loc := #caller_location) -> (ok: bool) #no_bounds_check {
	when !ODIN_NO_BOUNDS_CHECK {
		ensure(index >= 0, "Index must be positive.", loc)
	}
	if array == nil {
		return false
	}
	if len(arg) == 0 {
		return true
	}

	m := len(arg)

	// The resize below clamps to `N` and still succeeds, so make sure the injected bytes themselves fit.
	// Written as `index > N - m` so that the check cannot overflow. Without it the `#no_bounds_check`
	// slices below could start past the end of the backing storage.
	if index < 0 || index > N - m {
		return false
	}

	n := max(len(array), index)
	new_size := n + m

	resize(array, new_size) or_return
	copy(array[index+m:], array[index:])
	copy(array[index:], arg)
	return true
}
|
||||
|
||||
|
||||
// `inject_at` injects something into a dynamic array at a specified index and moves the previous elements after that index "across"
|
||||
@builtin inject_at :: proc{
|
||||
@builtin
|
||||
inject_at :: proc{
|
||||
inject_at_elem,
|
||||
inject_at_elems,
|
||||
inject_at_elem_string,
|
||||
|
||||
inject_at_elem_fixed_capacity_dynamic_array,
|
||||
inject_at_elems_fixed_capacity_dynamic_array,
|
||||
inject_at_elem_string_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
|
||||
|
||||
// `assign_at_elem` assigns a value at a given index. If the requested index is smaller than the current
|
||||
// `assign_at_elem` assigns a value at a given index. If the requested index is past the end of the current
|
||||
// size of the dynamic array, it will attempt to `resize` to a new length of `index+1` and then assign at `index`.
|
||||
@builtin
|
||||
assign_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
|
||||
@@ -820,7 +1148,7 @@ assign_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, arg: E, loc
|
||||
}
|
||||
|
||||
|
||||
// `assign_at_elems` assigns a values at a given index. If the requested index is smaller than the current
|
||||
// `assign_at_elems` assigns a values at a given index. If the requested index is past the end of the current
|
||||
// size of the dynamic array, it will attempt to `resize` to a new length of `index+len(args)` and then assign at `index`.
|
||||
@builtin
|
||||
assign_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
|
||||
@@ -838,7 +1166,7 @@ assign_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadca
|
||||
return
|
||||
}
|
||||
|
||||
// `assign_at_elem_string` assigns a string at a given index. If the requested index is smaller than the current
|
||||
// `assign_at_elem_string` assigns a string at a given index. If the requested index is past the end of the current
|
||||
// size of the dynamic array, it will attempt to `resize` to a new length of `index+len(arg)` and then assign at `index`.
|
||||
@builtin
|
||||
assign_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, arg: string, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
|
||||
@@ -856,13 +1184,71 @@ assign_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, ar
|
||||
return
|
||||
}
|
||||
|
||||
// `assign_at` assigns a value at a given index. If the requested index is smaller than the current
|
||||
|
||||
// `assign_at_elem_fixed_capacity_dynamic_array` assigns a value at a given index. If the requested index is past the end of the current
// length of the array, the array is first resized to a new length of `index+1` and the value is then assigned at `index`.
//
// Returns false, leaving the array untouched, when `array` is nil or when `index` is negative or not below the capacity `N`.
@builtin
assign_at_elem_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #any_int index: int, arg: E) -> (ok: bool) #no_bounds_check {
	// This procedure runs with `#no_bounds_check`, and the fixed capacity resize clamps to `N` while still
	// reporting success, so an index outside of `0..<N` has to be rejected explicitly.
	if array == nil || index < 0 || index >= N {
		return false
	}
	if index >= len(array) {
		// There is no `loc` parameter in this contextless procedure, so the fixed capacity
		// resize (which takes only the array and the length) is called directly.
		resize_fixed_capacity_dynamic_array(array, index+1) or_return
	}
	array[index] = arg
	return true
}
|
||||
|
||||
|
||||
// `assign_at_elems_fixed_capacity_dynamic_array` assigns multiple values starting at a given index. If `index+len(args)` is past the end
// of the current length of the array, the array is first resized to that length and the values are then assigned starting at `index`.
//
// Returns true without doing anything when `args` is empty.
// Returns false, leaving the array untouched, when `array` is nil, `index` is negative, or `index+len(args)` exceeds the capacity `N`.
@builtin
assign_at_elems_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #any_int index: int, #no_broadcast args: ..E) -> (ok: bool) #no_bounds_check {
	if len(args) == 0 {
		return true
	}
	// This procedure runs with `#no_bounds_check`, and the fixed capacity resize clamps to `N` while still
	// reporting success, so the target range has to be validated explicitly.
	// Written as `index > N - len(args)` so that the check cannot overflow.
	if array == nil || index < 0 || index > N - len(args) {
		return false
	}

	new_size := index + len(args)
	if new_size > len(array) {
		// There is no `loc` parameter in this contextless procedure, so the fixed capacity
		// resize (which takes only the array and the length) is called directly.
		resize_fixed_capacity_dynamic_array(array, new_size) or_return
	}
	copy(array[index:], args)
	return true
}
|
||||
|
||||
// `assign_at_elem_string_fixed_capacity_dynamic_array` assigns the bytes of a string starting at a given index. If `index+len(arg)` is past
// the end of the current length of the array, the array is first resized to that length and the bytes are then assigned starting at `index`.
//
// Returns true without doing anything when `arg` is empty.
// Returns false, leaving the array untouched, when `array` is nil, `index` is negative, or `index+len(arg)` exceeds the capacity `N`.
@builtin
assign_at_elem_string_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E/u8, #any_int index: int, arg: string) -> (ok: bool) #no_bounds_check {
	if len(arg) == 0 {
		return true
	}
	// This procedure runs with `#no_bounds_check`, and the fixed capacity resize clamps to `N` while still
	// reporting success, so the target range has to be validated explicitly.
	// Written as `index > N - len(arg)` so that the check cannot overflow.
	if array == nil || index < 0 || index > N - len(arg) {
		return false
	}

	new_size := index + len(arg)
	if new_size > len(array) {
		// There is no `loc` parameter in this contextless procedure, so the fixed capacity
		// resize (which takes only the array and the length) is called directly.
		resize_fixed_capacity_dynamic_array(array, new_size) or_return
	}
	copy(array[index:], arg)
	return true
}
|
||||
|
||||
|
||||
// `assign_at` assigns a value at a given index. If the requested index is past the end of the current
|
||||
// size of the dynamic array, it will attempt to `resize` to a new length of `index+size_needed` and then assign at `index`.
|
||||
@builtin
|
||||
assign_at :: proc{
|
||||
assign_at_elem,
|
||||
assign_at_elems,
|
||||
assign_at_elem_string,
|
||||
|
||||
assign_at_elem_fixed_capacity_dynamic_array,
|
||||
assign_at_elems_fixed_capacity_dynamic_array,
|
||||
assign_at_elem_string_fixed_capacity_dynamic_array,
|
||||
}
|
||||
|
||||
|
||||
@@ -877,6 +1263,16 @@ clear_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) {
|
||||
}
|
||||
}
|
||||
|
||||
// `clear_fixed_capacity_dynamic_array` resets the length of the passed fixed capacity dynamic array to `0`.
// A nil `array` is ignored.
//
// Note: Prefer the procedure group `clear`.
@builtin
clear_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E) {
	if array == nil {
		return
	}
	raw := (^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array)
	raw.len = 0
}
|
||||
|
||||
// `reserve_dynamic_array` will try to reserve memory of a passed dynamic array or map to the requested element count (setting the `cap`).
|
||||
//
|
||||
// When a memory resize allocation is required, the memory will be asked to be zeroed (i.e. it calls `mem_resize`).
|
||||
@@ -996,6 +1392,43 @@ non_zero_resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: i
|
||||
return _resize_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), length, false, loc=loc)
|
||||
}
|
||||
|
||||
|
||||
|
||||
// `resize_fixed_capacity_dynamic_array` sets the `len` of the passed fixed capacity dynamic array to the requested
// element count, clamped to the range `0..=N`. When the array grows, the newly exposed elements are zeroed.
// Returns false only when `array` is nil; a request larger than `N` is clamped and still returns true.
//
// Note: Prefer the procedure group `resize`
@builtin
resize_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #any_int length: int) -> bool {
	if array == nil {
		return false
	}

	old_len := len(array)
	new_len := clamp(length, 0, N)

	if old_len < length {
		// Growing: zero the bytes of the elements between the old and the new (clamped) length.
		ELEM_SIZE :: size_of(E)
		intrinsics.mem_zero(([^]byte)(array)[old_len*ELEM_SIZE:], (new_len - old_len)*ELEM_SIZE)
	}

	(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len = new_len
	return true
}
|
||||
|
||||
// `non_zero_resize_fixed_capacity_dynamic_array` sets the `len` of the passed fixed capacity dynamic array to the requested
// element count, clamped to the range `0..=N`, without zeroing any elements.
// Returns false only when `array` is nil; a request larger than `N` is clamped and still returns true.
//
// Note: Prefer the procedure group `resize`
@builtin
non_zero_resize_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #any_int length: int) -> bool {
	if array != nil {
		(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len = clamp(length, 0, N)
		return true
	}
	return false
}
|
||||
|
||||
// Shrinks the capacity of a dynamic array down to the current length, or the given capacity.
|
||||
//
|
||||
// If `new_cap` is negative, then `len(array)` is used.
|
||||
@@ -1094,7 +1527,7 @@ card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int {
|
||||
|
||||
|
||||
|
||||
// Evaluates the condition and panics the program iff the condition is false.
|
||||
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
|
||||
// This uses the `context.assertion_failure_procedure` to assert.
|
||||
//
|
||||
// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
|
||||
@@ -1118,7 +1551,7 @@ assert :: proc(condition: bool, message := #caller_expression(condition), loc :=
|
||||
}
|
||||
}
|
||||
|
||||
// Evaluates the condition and panics the program iff the condition is false.
|
||||
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
|
||||
// This uses the `context.assertion_failure_procedure` to assert.
|
||||
// This routine ignores `ODIN_DISABLE_ASSERT`, and will always execute.
|
||||
@builtin
|
||||
@@ -1158,7 +1591,7 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! {
|
||||
p("not yet implemented", message, loc)
|
||||
}
|
||||
|
||||
// Evaluates the condition and panics the program iff the condition is false.
|
||||
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
|
||||
// This uses the `default_assertion_contextless_failure_proc` to assert.
|
||||
//
|
||||
// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
|
||||
@@ -1178,7 +1611,7 @@ assert_contextless :: proc "contextless" (condition: bool, message := #caller_ex
|
||||
}
|
||||
}
|
||||
|
||||
// Evaluates the condition and panics the program iff the condition is false.
|
||||
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
|
||||
// This uses the `default_assertion_contextless_failure_proc` to assert.
|
||||
@builtin
|
||||
ensure_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) {
|
||||
|
||||
@@ -97,15 +97,6 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint)
|
||||
|
||||
@(require_results)
|
||||
arena_alloc :: proc(arena: ^Arena, size, alignment: uint, loc := #caller_location) -> (data: []byte, err: Allocator_Error) {
|
||||
align_forward_uint :: proc "contextless" (ptr, align: uint) -> uint {
|
||||
p := ptr
|
||||
modulo := p & (align-1)
|
||||
if modulo != 0 {
|
||||
p += align - modulo
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
assert(alignment & (alignment-1) == 0, "non-power of two alignment", loc)
|
||||
|
||||
size := size
|
||||
|
||||
@@ -29,6 +29,30 @@ byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byt
|
||||
return ([^]byte)(data)[:max(len, 0)]
|
||||
}
|
||||
|
||||
// Rounds `ptr` up to the nearest multiple of `align`.
// `align` must be a power of two; this is checked with `assert`.
@(require_results)
align_forward_uint :: #force_inline proc "odin" (ptr, align: uint) -> uint {
	assert(is_power_of_two_uint(align))
	// For a power of two, `align-1` is a mask of the low bits that must be cleared after rounding up.
	mask := align - 1
	return (ptr + mask) &~ mask
}
|
||||
|
||||
// Rounds `ptr` up to the nearest multiple of `align` by delegating to `align_forward_uint`.
// `align` must be a positive power of two; this is checked with `assert`.
@(require_results)
align_forward_int :: #force_inline proc "odin" (ptr, align: int) -> int {
	assert(is_power_of_two_int(align))
	aligned := align_forward_uint(uint(ptr), uint(align))
	return int(aligned)
}
|
||||
|
||||
// Rounds `ptr` up to the nearest multiple of `align` by delegating to `align_forward_uint`,
// which performs the power of two check on `align`.
@(require_results)
align_forward_uintptr :: #force_inline proc "odin" (ptr, align: uintptr) -> uintptr {
	aligned := align_forward_uint(uint(ptr), uint(align))
	return uintptr(aligned)
}
|
||||
|
||||
// `align_forward` groups the `int`, `uint` and `uintptr` variants of rounding a value up to a multiple of `align`.
align_forward :: proc {
|
||||
align_forward_int,
|
||||
align_forward_uint,
|
||||
align_forward_uintptr,
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
is_power_of_two_int :: #force_inline proc "contextless" (x: int) -> bool {
|
||||
if x <= 0 {
|
||||
return false
|
||||
@@ -36,51 +60,17 @@ is_power_of_two_int :: #force_inline proc "contextless" (x: int) -> bool {
|
||||
return (x & (x-1)) == 0
|
||||
}
|
||||
|
||||
align_forward_int :: #force_inline proc "odin" (ptr, align: int) -> int {
|
||||
assert(is_power_of_two_int(align))
|
||||
|
||||
p := ptr
|
||||
modulo := p & (align-1)
|
||||
if modulo != 0 {
|
||||
p += align - modulo
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
is_power_of_two_uint :: #force_inline proc "contextless" (x: uint) -> bool {
|
||||
if x <= 0 {
|
||||
if x == 0 {
|
||||
return false
|
||||
}
|
||||
return (x & (x-1)) == 0
|
||||
}
|
||||
|
||||
align_forward_uint :: #force_inline proc "odin" (ptr, align: uint) -> uint {
|
||||
assert(is_power_of_two_uint(align))
|
||||
|
||||
p := ptr
|
||||
modulo := p & (align-1)
|
||||
if modulo != 0 {
|
||||
p += align - modulo
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
is_power_of_two_uintptr :: #force_inline proc "contextless" (x: uintptr) -> bool {
|
||||
if x <= 0 {
|
||||
return false
|
||||
}
|
||||
return (x & (x-1)) == 0
|
||||
}
|
||||
|
||||
align_forward_uintptr :: #force_inline proc "odin" (ptr, align: uintptr) -> uintptr {
|
||||
assert(is_power_of_two_uintptr(align))
|
||||
|
||||
p := ptr
|
||||
modulo := p & (align-1)
|
||||
if modulo != 0 {
|
||||
p += align - modulo
|
||||
}
|
||||
return p
|
||||
return is_power_of_two_uint(uint(x))
|
||||
}
|
||||
|
||||
is_power_of_two :: proc {
|
||||
@@ -89,12 +79,6 @@ is_power_of_two :: proc {
|
||||
is_power_of_two_uintptr,
|
||||
}
|
||||
|
||||
align_forward :: proc {
|
||||
align_forward_int,
|
||||
align_forward_uint,
|
||||
align_forward_uintptr,
|
||||
}
|
||||
|
||||
mem_zero :: proc "contextless" (data: rawptr, len: int) -> rawptr {
|
||||
if data == nil {
|
||||
return nil
|
||||
@@ -718,7 +702,7 @@ quaternion256_eq :: #force_inline proc "contextless" (a, b: quaternion256) -> bo
|
||||
quaternion256_ne :: #force_inline proc "contextless" (a, b: quaternion256) -> bool { return real(a) != real(b) || imag(a) != imag(b) || jmag(a) != jmag(b) || kmag(a) != kmag(b) }
|
||||
|
||||
|
||||
string_decode_rune :: proc "contextless" (s: string) -> (rune, int) {
|
||||
string_decode_rune :: proc "contextless" (s: string) -> (rune, int) #no_bounds_check {
|
||||
// NOTE(bill): Duplicated here to remove dependency on package unicode/utf8
|
||||
|
||||
@(static, rodata) accept_sizes := [256]u8{
|
||||
@@ -797,7 +781,7 @@ string_decode_rune :: proc "contextless" (s: string) -> (rune, int) {
|
||||
return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4
|
||||
}
|
||||
|
||||
string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) {
|
||||
string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) #no_bounds_check {
|
||||
RUNE_ERROR :: '\ufffd'
|
||||
RUNE_SELF :: 0x80
|
||||
UTF_MAX :: 4
|
||||
@@ -833,7 +817,7 @@ string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) {
|
||||
}
|
||||
|
||||
|
||||
string16_decode_rune :: proc "contextless" (s: string16) -> (rune, int) {
|
||||
string16_decode_rune :: proc "contextless" (s: string16) -> (rune, int) #no_bounds_check {
|
||||
REPLACEMENT_CHAR :: '\ufffd'
|
||||
_surr1 :: 0xd800
|
||||
_surr2 :: 0xdc00
|
||||
@@ -861,7 +845,7 @@ string16_decode_rune :: proc "contextless" (s: string16) -> (rune, int) {
|
||||
return r, w
|
||||
}
|
||||
|
||||
string16_decode_last_rune :: proc "contextless" (s: string16) -> (rune, int) {
|
||||
string16_decode_last_rune :: proc "contextless" (s: string16) -> (rune, int) #no_bounds_check {
|
||||
REPLACEMENT_CHAR :: '\ufffd'
|
||||
_surr1 :: 0xd800
|
||||
_surr2 :: 0xdc00
|
||||
|
||||
@@ -392,6 +392,12 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
|
||||
print_string("[]")
|
||||
print_type(info.elem)
|
||||
|
||||
case Type_Info_Fixed_Capacity_Dynamic_Array:
|
||||
print_string("[dynamic; ")
|
||||
print_u64(u64(info.capacity))
|
||||
print_string("]")
|
||||
print_type(info.elem)
|
||||
|
||||
case Type_Info_Map:
|
||||
print_string("map[")
|
||||
print_type(info.key)
|
||||
@@ -478,7 +484,7 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
|
||||
print_string("..")
|
||||
print_i64(info.upper)
|
||||
}
|
||||
if info.underlying != nil {
|
||||
if info.explicit_underlying {
|
||||
print_string("; ")
|
||||
print_type(info.underlying)
|
||||
}
|
||||
@@ -807,6 +813,12 @@ write_write_type :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, t
|
||||
write_string (i, buf, "[]") or_return
|
||||
write_write_type(i, buf, info.elem) or_return
|
||||
|
||||
case Type_Info_Fixed_Capacity_Dynamic_Array:
|
||||
write_string (i, buf, "[dynamic; ") or_return
|
||||
write_u64 (i, buf, u64(info.capacity)) or_return
|
||||
write_string (i, buf, "]") or_return
|
||||
write_write_type(i, buf, info.elem) or_return
|
||||
|
||||
case Type_Info_Map:
|
||||
write_string (i, buf, "map[") or_return
|
||||
write_write_type(i, buf, info.key) or_return
|
||||
@@ -893,7 +905,7 @@ write_write_type :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, t
|
||||
write_string(i, buf, "..") or_return
|
||||
write_i64 (i, buf, info.upper) or_return
|
||||
}
|
||||
if info.underlying != nil {
|
||||
if info.explicit_underlying {
|
||||
write_string (i, buf, "; ") or_return
|
||||
write_write_type(i, buf, info.underlying) or_return
|
||||
}
|
||||
|
||||
@@ -136,7 +136,7 @@ chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) {
|
||||
//
|
||||
// LLVM appears not to consider "this instruction is totally
|
||||
// awful on the given microarchitecture", which leads to
|
||||
// `VPCOMPRESSED` being generated iff AVX512 support is
|
||||
// `VPCOMPRESSED` being generated if and only if (⟺) AVX512 support is
|
||||
// enabled for `intrinsics.simd_masked_compress_store`.
|
||||
// On Zen 4, this leads to a 50% performance regression vs
|
||||
// the 128-bit SIMD code.
|
||||
|
||||
11
build.bat
11
build.bat
@@ -94,6 +94,7 @@ if %release_mode% EQU 0 ( rem Debug
|
||||
set compiler_warnings= ^
|
||||
-W4 -WX ^
|
||||
-wd4100 -wd4101 -wd4127 -wd4146 ^
|
||||
-wd4324 ^
|
||||
-wd4505 ^
|
||||
-wd4456 -wd4457
|
||||
|
||||
@@ -106,16 +107,6 @@ set libs= ^
|
||||
set odin_res=misc\odin.res
|
||||
set odin_rc=misc\odin.rc
|
||||
|
||||
rem DO NOT TOUCH!
|
||||
rem THIS TILDE STUFF IS FOR DEVELOPMENT ONLY!
|
||||
set tilde_backend=0
|
||||
if %tilde_backend% EQU 1 (
|
||||
set libs=%libs% src\tilde\tb.lib
|
||||
set compiler_defines=%compiler_defines% -DODIN_TILDE_BACKEND
|
||||
)
|
||||
rem DO NOT TOUCH!
|
||||
|
||||
|
||||
set linker_flags= -incremental:no -opt:ref -subsystem:console -MANIFEST:EMBED
|
||||
|
||||
if %release_mode% EQU 0 ( rem Debug
|
||||
|
||||
@@ -26,7 +26,7 @@ error() {
|
||||
exit 1
|
||||
}
|
||||
|
||||
SUPPORTED_LLVM_VERSIONS="21 20 19 18 17 14"
|
||||
SUPPORTED_LLVM_VERSIONS="22 21 20 19 18 17 14"
|
||||
|
||||
# Brew advises people not to add llvm to their $PATH, so try and use brew to find it.
|
||||
if [ -z "$LLVM_CONFIG" ] && [ -n "$(command -v brew)" ]; then
|
||||
@@ -78,8 +78,8 @@ LLVM_VERSION_MAJOR="$(echo $LLVM_VERSION | awk -F. '{print $1}')"
|
||||
LLVM_VERSION_MINOR="$(echo $LLVM_VERSION | awk -F. '{print $2}')"
|
||||
LLVM_VERSION_PATCH="$(echo $LLVM_VERSION | awk -F. '{print $3}')"
|
||||
|
||||
if [ $LLVM_VERSION_MAJOR -lt 14 ] || ([ $LLVM_VERSION_MAJOR -gt 14 ] && [ $LLVM_VERSION_MAJOR -lt 17 ]) || [ $LLVM_VERSION_MAJOR -gt 21 ]; then
|
||||
error "Invalid LLVM version $LLVM_VERSION: must be 14, 17, 18, 19, 20, or 21"
|
||||
if [ $LLVM_VERSION_MAJOR -lt 14 ] || ([ $LLVM_VERSION_MAJOR -gt 14 ] && [ $LLVM_VERSION_MAJOR -lt 17 ]) || [ $LLVM_VERSION_MAJOR -gt 22 ]; then
|
||||
error "Invalid LLVM version $LLVM_VERSION: must be 14, 17, 18, 19, 20, 21 or 22"
|
||||
fi
|
||||
|
||||
case "$OS_NAME" in
|
||||
|
||||
@@ -6,7 +6,14 @@ LLVM_CONFIG="llvm-config-20"
|
||||
|
||||
DISABLED_WARNINGS="-Wno-switch -Wno-macro-redefined -Wno-unused-value"
|
||||
|
||||
CPPFLAGS="-DODIN_VERSION_RAW=\"dev-$(date +"%Y-%m")\""
|
||||
if [ -d ".git" ] && [ -n "$(command -v git)" ]; then
|
||||
GIT_SHA=$(git show --pretty='%h' --no-patch --no-notes HEAD)
|
||||
GIT_DATE=$(git show "--pretty=%cd" "--date=format:%Y-%m" --no-patch --no-notes HEAD)
|
||||
CPPFLAGS="$CPPFLAGS -DGIT_SHA=\"$GIT_SHA\""
|
||||
else
|
||||
GIT_DATE=$(date +"%Y-%m")
|
||||
fi
|
||||
CPPFLAGS="$CPPFLAGS -DODIN_VERSION_RAW=\"dev-$GIT_DATE\""
|
||||
CXXFLAGS="-std=c++14 $($LLVM_CONFIG --cxxflags --ldflags)"
|
||||
|
||||
LDFLAGS="-static -lm -lzstd -lz -lffi -pthread -ldl -fuse-ld=mold"
|
||||
|
||||
@@ -45,7 +45,7 @@ reader_init_with_buf :: proc(b: ^Reader, rd: io.Reader, buf: []byte) {
|
||||
b.buf = buf
|
||||
}
|
||||
|
||||
// reader_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
|
||||
// reader_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
|
||||
reader_destroy :: proc(b: ^Reader) {
|
||||
delete(b.buf, b.buf_allocator)
|
||||
b^ = {}
|
||||
|
||||
@@ -35,7 +35,7 @@ writer_init_with_buf :: proc(b: ^Writer, wr: io.Writer, buf: []byte) {
|
||||
b.buf = buf
|
||||
}
|
||||
|
||||
// writer_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
|
||||
// writer_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
|
||||
writer_destroy :: proc(b: ^Writer) {
|
||||
delete(b.buf, b.buf_allocator)
|
||||
b^ = {}
|
||||
|
||||
@@ -1460,7 +1460,7 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc
|
||||
return subslices[:]
|
||||
}
|
||||
|
||||
// alias returns true iff a and b have a non-zero length, and any part of
|
||||
// alias returns true if and only if (⟺) a and b have a non-zero length, and any part of
|
||||
// a overlaps with b.
|
||||
alias :: proc "contextless" (a, b: []byte) -> bool {
|
||||
a_len, b_len := len(a), len(b)
|
||||
@@ -1474,7 +1474,7 @@ alias :: proc "contextless" (a, b: []byte) -> bool {
|
||||
return a_start <= b_end && b_start <= a_end
|
||||
}
|
||||
|
||||
// alias_inexactly returns true iff a and b have a non-zero length,
|
||||
// alias_inexactly returns true if and only if (⟺) a and b have a non-zero length,
|
||||
// the base pointer of a and b are NOT equal, and any part of a overlaps
|
||||
// with b (ie: `alias(a, b)` with an exception that returns false for
|
||||
// `a == b`, `b = a[:len(a)-69]` and similar conditions).
|
||||
|
||||
@@ -154,12 +154,12 @@ _nan_bit_pattern := ~u64(0)
|
||||
|
||||
// On amd64 Windows and Linux, float_t and double_t are respectively both
|
||||
// their usual types. On x86 it's not possible to define these types correctly
|
||||
// since they would be long double which Odin does have support for.
|
||||
// since they would be long double which Odin does NOT have support for.
|
||||
float_t :: float
|
||||
double_t :: double
|
||||
|
||||
NAN := transmute(double)(_nan_bit_pattern)
|
||||
INFINITY :: 1e5000
|
||||
INFINITY :: 0h7ff00000_00000000 // +Inf
|
||||
|
||||
HUGE_VALF :: INFINITY
|
||||
HUGE_VAL :: double(INFINITY)
|
||||
|
||||
@@ -360,23 +360,26 @@ refill_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width :=
|
||||
refill := u64(width)
|
||||
b := u64(0)
|
||||
|
||||
if z.num_bits > refill {
|
||||
return
|
||||
}
|
||||
|
||||
for {
|
||||
if z.num_bits > refill {
|
||||
break
|
||||
}
|
||||
if z.code_buffer == 0 && z.num_bits > 63 {
|
||||
z.num_bits = 0
|
||||
}
|
||||
if z.code_buffer >= 1 << uint(z.num_bits) {
|
||||
// Code buffer is malformed.
|
||||
z.num_bits = max(u64)
|
||||
return
|
||||
}
|
||||
if len(z.input_data) != 0 {
|
||||
b = u64(z.input_data[0])
|
||||
z.input_data = z.input_data[1:]
|
||||
} else {
|
||||
b = 0
|
||||
return
|
||||
}
|
||||
|
||||
z.code_buffer |= b << u8(z.num_bits)
|
||||
z.num_bits += 8
|
||||
if z.num_bits > refill {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -322,9 +322,6 @@ decode_huffman_slowpath :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Erro
|
||||
@(optimization_mode="favor_size")
|
||||
decode_huffman :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bounds_check {
|
||||
if z.num_bits < 16 {
|
||||
if z.num_bits > 63 {
|
||||
return 0, .Code_Buffer_Malformed
|
||||
}
|
||||
compress.refill_lsb(z)
|
||||
if z.num_bits > 63 {
|
||||
return 0, .Stream_Too_Short
|
||||
|
||||
@@ -100,20 +100,20 @@ len :: proc "contextless" (t: ^$T/Tree($Value)) -> int {
|
||||
return t._size
|
||||
}
|
||||
|
||||
// first returns the first node in the tree (in-order) or nil iff
|
||||
// first returns the first node in the tree (in-order) or nil if and only if (⟺)
|
||||
// the tree is empty.
|
||||
first :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
|
||||
return tree_first_or_last_in_order(t, Direction.Backward)
|
||||
}
|
||||
|
||||
// last returns the last element in the tree (in-order) or nil iff
|
||||
// last returns the last element in the tree (in-order) or nil if and only if (⟺)
|
||||
// the tree is empty.
|
||||
last :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
|
||||
return tree_first_or_last_in_order(t, Direction.Forward)
|
||||
}
|
||||
|
||||
// find finds the value in the tree, and returns the corresponding
|
||||
// node or nil iff the value is not present.
|
||||
// node or nil if and only if (⟺) the value is not present.
|
||||
find :: proc(t: ^$T/Tree($Value), value: Value) -> ^Node(Value) {
|
||||
cur := t._root
|
||||
descend_loop: for cur != nil {
|
||||
@@ -168,7 +168,7 @@ find_or_insert :: proc(
|
||||
return
|
||||
}
|
||||
|
||||
// remove removes a node or value from the tree, and returns true iff the
|
||||
// remove removes a node or value from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove :: proc {
|
||||
@@ -176,7 +176,7 @@ remove :: proc {
|
||||
remove_node,
|
||||
}
|
||||
|
||||
// remove_value removes a value from the tree, and returns true iff the
|
||||
// remove_value removes a value from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = true) -> bool {
|
||||
@@ -187,7 +187,7 @@ remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = t
|
||||
return remove_node(t, n, call_on_remove)
|
||||
}
|
||||
|
||||
// remove_node removes a node from the tree, and returns true iff the
|
||||
// remove_node removes a node from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove_node :: proc(t: ^$T/Tree($Value), node: ^Node(Value), call_on_remove: bool = true) -> bool {
|
||||
@@ -281,14 +281,14 @@ iterator_from_pos :: proc "contextless" (
|
||||
}
|
||||
|
||||
// iterator_get returns the node currently pointed to by the iterator,
|
||||
// or nil iff the node has been removed, the tree is empty, or the end
|
||||
// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
|
||||
// of the tree has been reached.
|
||||
iterator_get :: proc "contextless" (it: ^$I/Iterator($Value)) -> ^Node(Value) {
|
||||
return it._cur
|
||||
}
|
||||
|
||||
// iterator_remove removes the node currently pointed to by the iterator,
|
||||
// and returns true iff the removal was successful. Semantics are the
|
||||
// and returns true if and only if (⟺) the removal was successful. Semantics are the
|
||||
// same as the Tree remove.
|
||||
iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -> bool {
|
||||
if it._cur == nil {
|
||||
@@ -304,7 +304,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -
|
||||
}
|
||||
|
||||
// iterator_next advances the iterator and returns the (node, true) or
|
||||
// or (nil, false) iff the end of the tree has been reached.
|
||||
// (nil, false) if and only if (⟺) the end of the tree has been reached.
|
||||
//
|
||||
// Note: The first call to iterator_next will return the first node instead
|
||||
// of advancing the iterator.
|
||||
|
||||
@@ -236,16 +236,6 @@ back_ptr :: proc(q: ^$Q/Queue($T), loc := #caller_location) -> ^T {
|
||||
}
|
||||
|
||||
|
||||
@(deprecated="Use `front_ptr` instead")
|
||||
peek_front :: proc(q: ^$Q/Queue($T), loc := #caller_location) -> ^T {
|
||||
return front_ptr(q, loc)
|
||||
}
|
||||
|
||||
@(deprecated="Use `back_ptr` instead")
|
||||
peek_back :: proc(q: ^$Q/Queue($T), loc := #caller_location) -> ^T {
|
||||
return back_ptr(q, loc)
|
||||
}
|
||||
|
||||
/*
|
||||
Push an element to the back of the queue.
|
||||
|
||||
|
||||
@@ -95,19 +95,19 @@ len :: proc "contextless" (t: $T/Tree($Key, $Value)) -> (node_count: int) {
|
||||
return t._size
|
||||
}
|
||||
|
||||
// first returns the first node in the tree (in-order) or nil iff
|
||||
// first returns the first node in the tree (in-order) or nil if and only if (⟺)
|
||||
// the tree is empty.
|
||||
first :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
|
||||
return tree_first_or_last_in_order(t, Direction.Backward)
|
||||
}
|
||||
|
||||
// last returns the last element in the tree (in-order) or nil iff
|
||||
// last returns the last element in the tree (in-order) or nil if and only if (⟺)
|
||||
// the tree is empty.
|
||||
last :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
|
||||
return tree_first_or_last_in_order(t, Direction.Forward)
|
||||
}
|
||||
|
||||
// find finds the key in the tree, and returns the corresponding node, or nil iff the value is not present.
|
||||
// find finds the key in the tree, and returns the corresponding node, or nil if and only if (⟺) the value is not present.
|
||||
find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
|
||||
node = t._root
|
||||
for node != nil {
|
||||
@@ -120,7 +120,7 @@ find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
|
||||
return node
|
||||
}
|
||||
|
||||
// find_value finds the key in the tree, and returns the corresponding value, or nil iff the value is not present.
|
||||
// find_value finds the key in the tree, and returns the corresponding value, or nil if and only if (⟺) the value is not present.
|
||||
find_value :: proc(t: $T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok {
|
||||
if n := find(t, key); n != nil {
|
||||
return n.value, true
|
||||
@@ -154,7 +154,7 @@ find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n:
|
||||
return n, true, nil
|
||||
}
|
||||
|
||||
// remove removes a node or value from the tree, and returns true iff the
|
||||
// remove removes a node or value from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove :: proc {
|
||||
@@ -162,7 +162,7 @@ remove :: proc {
|
||||
remove_node,
|
||||
}
|
||||
|
||||
// remove_value removes a value from the tree, and returns true iff the
|
||||
// remove_key removes a key from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's key + value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) -> bool {
|
||||
@@ -173,7 +173,7 @@ remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true)
|
||||
return remove_node(t, n, call_on_remove)
|
||||
}
|
||||
|
||||
// remove_node removes a node from the tree, and returns true iff the
|
||||
// remove_node removes a node from the tree, and returns true if and only if (⟺) the
|
||||
// removal was successful. While the node's key + value will be left intact,
|
||||
// the node itself will be freed via the tree's node allocator.
|
||||
remove_node :: proc(t: ^$T/Tree($Key, $Value), node: ^$N/Node(Key, Value), call_on_remove := true) -> (found: bool) {
|
||||
@@ -235,14 +235,14 @@ iterator_from_pos :: proc "contextless" (t: ^$T/Tree($Key, $Value), pos: ^Node(K
|
||||
}
|
||||
|
||||
// iterator_get returns the node currently pointed to by the iterator,
|
||||
// or nil iff the node has been removed, the tree is empty, or the end
|
||||
// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
|
||||
// of the tree has been reached.
|
||||
iterator_get :: proc "contextless" (it: ^$I/Iterator($Key, $Value)) -> ^Node(Key, Value) {
|
||||
return it._cur
|
||||
}
|
||||
|
||||
// iterator_remove removes the node currently pointed to by the iterator,
|
||||
// and returns true iff the removal was successful. Semantics are the
|
||||
// and returns true if and only if (⟺) the removal was successful. Semantics are the
|
||||
// same as the Tree remove.
|
||||
iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = true) -> bool {
|
||||
if it._cur == nil {
|
||||
@@ -258,7 +258,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = t
|
||||
}
|
||||
|
||||
// iterator_next advances the iterator and returns the (node, true) or
|
||||
// or (nil, false) iff the end of the tree has been reached.
|
||||
// (nil, false) if and only if (⟺) the end of the tree has been reached.
|
||||
//
|
||||
// Note: The first call to iterator_next will return the first node instead
|
||||
// of advancing the iterator.
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
/*
|
||||
Deprecation Notice: Prefer using `[dynamic; N]T` (fixed capacity dynamic arrays).
|
||||
|
||||
A dynamic array-like interface on a stack-allocated, fixed-size array.
|
||||
|
||||
The `Small_Array` type is optimal for scenarios where you need
|
||||
|
||||
@@ -21,7 +21,7 @@ Example:
|
||||
}
|
||||
*/
|
||||
Small_Array :: struct($N: int, $T: typeid) where N >= 0 {
|
||||
data: [N]T,
|
||||
data: [N]T `fmt:",len"`,
|
||||
len: int,
|
||||
}
|
||||
|
||||
|
||||
@@ -29,7 +29,8 @@ freelist_clear :: proc(x: ^$X/Freelist_Array($T, $SHIFT)) {
|
||||
freelist_push_with_index :: proc(x: ^$X/Freelist_Array($T, $SHIFT), value: T, loc := #caller_location) -> (ptr: ^T, index: int, err: runtime.Allocator_Error) {
|
||||
if x.freelist != nil {
|
||||
slot := x.freelist
|
||||
idx, _ := freelist_linear_search(x, slot)
|
||||
idx, found := freelist_linear_search(x, slot)
|
||||
assert(found)
|
||||
x.freelist = (^^T)(slot)^
|
||||
slot^ = value
|
||||
return slot, idx, nil
|
||||
|
||||
69
core/crypto/_aes/hw/api.odin
Normal file
69
core/crypto/_aes/hw/api.odin
Normal file
@@ -0,0 +1,69 @@
|
||||
package aes_hw
|
||||
|
||||
@(require) import "core:sys/info"
|
||||
|
||||
// is_supported reports whether hardware accelerated AES can be used on
// the CPU the program is running on.
//
// Returns true if and only if (⟺) every CPU feature required for the
// current architecture is reported by `info.cpu_features()`.  On
// architectures other than amd64, arm64, and arm32 it is always false.
is_supported :: proc "contextless" () -> bool {
	when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
		return info.cpu_features() >= info.CPU_Features{.asimd, .aes}
	} else when ODIN_ARCH == .amd64 {
		// Note: Everything with AES-NI has support for
		// the required SSE extensions.
		return info.cpu_features() >= info.CPU_Features{.sse2, .ssse3, .sse41, .aes}
	} else {
		return false
	}
}
|
||||
|
||||
// is_ghash_supported reports whether hardware accelerated GHASH can be
// used.  It returns true if and only if (⟺) hardware AES is supported
// and the CPU additionally provides the carry-less multiply feature
// checked below; it is currently always false outside of amd64.
is_ghash_supported :: proc "contextless" () -> bool {
	// Just having hardware GHASH is silly.
	if !is_supported() {
		return false
	}

	when ODIN_ARCH == .amd64 {
		return info.cpu_features() >= info.CPU_Features{.pclmulqdq}
	} else {
		// arm64/arm32: Once we can actually use this, we can re-enable this.
		//
		// return info.cpu_features() >= info.CPU_Features{
		// 	.pmull,
		// }
		return false
	}
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
//
// It stores the expanded encryption and decryption round keys (room
// for 15 of each, 16 bytes per round key) together with the number of
// rounds selected by the key size.
Context :: struct {
	// Note: The ideal thing to do is for the expanded round keys to be
	// arrays of `u8x16`, however that implies alignment (or using AVX).
	// Processors that lack an instruction set which has been around
	// for over 10 years are the reason plain byte arrays are used
	// here instead.
	_sk_exp_enc: [15][16]byte,
	_sk_exp_dec: [15][16]byte,
	_num_rounds: int,
}
|
||||
|
||||
// init keys `ctx` for AES by running the key schedule over `key`.
//
// All validation is delegated to `keysched`, which panics when the key
// size is not a valid AES key size (or when no hardware implementation
// exists for the target architecture).
init :: proc(ctx: ^Context, key: []byte) {
	keysched(ctx, key)
}
|
||||
@@ -21,7 +21,7 @@
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
package aes_hw
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
115
core/crypto/_aes/hw/intrinsics_arm.odin
Normal file
115
core/crypto/_aes/hw/intrinsics_arm.odin
Normal file
@@ -0,0 +1,115 @@
|
||||
#+build arm64,arm32
|
||||
package aes_hw
|
||||
|
||||
import "core:simd"
|
||||
import "core:simd/arm"
|
||||
|
||||
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/
|
||||
|
||||
TARGET_FEATURES :: "neon,aes"
|
||||
HAS_GHASH :: false // Temporary
|
||||
|
||||
// aesdec performs one AES decryption round with x86 AESDEC semantics:
// the round key is XORed in *after* the inverse round transformations.
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// An all-zero key neutralises the key XOR built into the ARM
	// instruction, so that `key` can be applied last instead.
	zero_key: simd.u8x16
	state := arm.vaesdq_u8(data, zero_key)
	state = arm.vaesimcq_u8(state)
	return simd.bit_xor(state, key)
}
|
||||
|
||||
// aesdeclast performs the final AES decryption round (no inverse
// MixColumns step) with x86 AESDECLAST semantics: `key` is XORed in last.
@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Zero key: defer the key XOR until after the ARM instruction.
	zero_key: simd.u8x16
	state := arm.vaesdq_u8(data, zero_key)
	return simd.bit_xor(state, key)
}
|
||||
|
||||
// aesenc performs one AES encryption round with x86 AESENC semantics:
// the round key is XORed in *after* the round transformations.
@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// An all-zero key neutralises the key XOR built into the ARM
	// instruction, so that `key` can be applied last instead.
	zero_key: simd.u8x16
	state := arm.vaeseq_u8(data, zero_key)
	state = arm.vaesmcq_u8(state)
	return simd.bit_xor(state, key)
}
|
||||
|
||||
// aesenclast performs the final AES encryption round (no MixColumns
// step) with x86 AESENCLAST semantics: `key` is XORed in last.
@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Zero key: defer the key XOR until after the ARM instruction.
	zero_key: simd.u8x16
	state := arm.vaeseq_u8(data, zero_key)
	return simd.bit_xor(state, key)
}
|
||||
|
||||
// aesimc is the ARM counterpart of the x86 AESIMC operation; it is a
// direct alias of `arm.vaesimcq_u8` (AES inverse mix columns).
aesimc :: arm.vaesimcq_u8
|
||||
|
||||
// aeskeygenassist emulates the x86 AESKEYGENASSIST instruction, which
// assists AES key expansion.  `IMM8` is the round constant and must be
// a compile-time constant.
@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
	// The round constant is only XORed into the low byte of the two
	// rotated words (words 1 and 3 of the result).
	rcon_mask := simd.u8x16{
		0, 0, 0, 0,
		IMM8, 0, 0, 0,
		0, 0, 0, 0,
		IMM8, 0, 0, 0,
	}

	// AESE does ShiftRows and SubBytes on the input.
	subbed := arm.vaeseq_u8(data, simd.u8x16{})

	// Undo ShiftRows step from AESE and extract X1 and X3
	words := simd.swizzle(
		subbed,
		0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1)
		0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1))
		0x0c, 0x09, 0x06, 0x03, // SubBytes(X3)
		0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3))
	)

	return simd.bit_xor(words, rcon_mask)
}
|
||||
|
||||
// The keyschedule implementation is easier to read with some extra
|
||||
// Intel intrinsics that are emulated by built-in LLVM ops anyway.
|
||||
|
||||
// _mm_slli_si128 shifts the 128-bit vector `a` left by `IMM8` bytes,
// shifting in zeroes.  `IMM8` must be a compile-time constant.
//
// This needs to emit behavior identical to PSLLDQ which is as follows:
//
// TEMP := COUNT
// IF (TEMP > 15) THEN TEMP := 16; FI
// DEST := DEST << (TEMP * 8)
// DEST[MAXVL-1:128] (Unmodified)
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	shift :: IMM8 & 0xff

	// Shuffle indices 0..15 pick lanes of the zero vector and 16..31
	// pick lanes of `a`.  Output lane `i` uses index `BASE + i`, so a
	// shift above 15 selects only zero lanes.
	BASE :: 0 when shift > 15 else (16 - shift)

	return simd.shuffle(
		simd.u8x16{},
		a,
		BASE + 0,
		BASE + 1,
		BASE + 2,
		BASE + 3,
		BASE + 4,
		BASE + 5,
		BASE + 6,
		BASE + 7,
		BASE + 8,
		BASE + 9,
		BASE + 10,
		BASE + 11,
		BASE + 12,
		BASE + 13,
		BASE + 14,
		BASE + 15,
	)
}
|
||||
|
||||
// _mm_shuffle_epi32 emulates PSHUFD: each 32-bit lane of the result is
// the lane of `a` selected by the matching 2-bit field of `IMM8`
// (bits 0-1 choose lane 0, bits 2-3 lane 1, and so on).
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	SEL0 :: IMM8 & 0b11
	SEL1 :: (IMM8 >> 2) & 0b11
	SEL2 :: (IMM8 >> 4) & 0b11
	SEL3 :: (IMM8 >> 6) & 0b11

	lanes := transmute(simd.i32x4)a
	picked := simd.shuffle(lanes, lanes, SEL0, SEL1, SEL2, SEL3)
	return transmute(simd.u8x16)picked
}
|
||||
|
||||
// _mm_shuffle_ps emulates SHUFPS on 32-bit lanes: the two low result
// lanes are selected from `a` and the two high result lanes from `b`,
// each by a 2-bit field of `MASK`.
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
	// In the two-vector shuffle, indices 0..3 address `a` and 4..7
	// address `b`, hence the +4 on the upper two selectors.
	FROM_A_0 :: MASK & 0b11
	FROM_A_1 :: (MASK >> 2) & 0b11
	FROM_B_2 :: ((MASK >> 4) & 0b11) + 4
	FROM_B_3 :: ((MASK >> 6) & 0b11) + 4

	lo := transmute(simd.u32x4)(a)
	hi := transmute(simd.u32x4)(b)
	return transmute(simd.u8x16)simd.shuffle(lo, hi, FROM_A_0, FROM_A_1, FROM_B_2, FROM_B_3)
}
|
||||
55
core/crypto/_aes/hw/intrinsics_intel.odin
Normal file
55
core/crypto/_aes/hw/intrinsics_intel.odin
Normal file
@@ -0,0 +1,55 @@
|
||||
#+build amd64
|
||||
package aes_hw
|
||||
|
||||
import "core:simd"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel/RISC-V semantics.
|
||||
|
||||
TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes"
|
||||
HAS_GHASH :: true
|
||||
|
||||
// aesdec performs one AES decryption round on `data` with round key
// `key` via the AES-NI AESDEC instruction.
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	state := transmute(x86.__m128i)(data)
	round_key := transmute(x86.__m128i)(key)
	return transmute(simd.u8x16)(x86._mm_aesdec_si128(state, round_key))
}
|
||||
|
||||
// aesdeclast performs the final AES decryption round on `data` with
// round key `key` via the AES-NI AESDECLAST instruction.
@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	state := transmute(x86.__m128i)(data)
	round_key := transmute(x86.__m128i)(key)
	return transmute(simd.u8x16)(x86._mm_aesdeclast_si128(state, round_key))
}
|
||||
|
||||
// aesenc performs one AES encryption round on `data` with round key
// `key` via the AES-NI AESENC instruction.
@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	state := transmute(x86.__m128i)(data)
	round_key := transmute(x86.__m128i)(key)
	return transmute(simd.u8x16)(x86._mm_aesenc_si128(state, round_key))
}
|
||||
|
||||
// aesenclast performs the final AES encryption round on `data` with
// round key `key` via the AES-NI AESENCLAST instruction.
@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	state := transmute(x86.__m128i)(data)
	round_key := transmute(x86.__m128i)(key)
	return transmute(simd.u8x16)(x86._mm_aesenclast_si128(state, round_key))
}
|
||||
|
||||
// aesimc applies the AES-NI AESIMC (inverse mix columns) instruction
// to `data`; the key schedule uses it to derive decryption round keys.
@(require_results, enable_target_feature = "aes")
aesimc :: #force_inline proc "c" (data: simd.u8x16) -> simd.u8x16 {
	round_key := transmute(x86.__m128i)(data)
	return transmute(simd.u8x16)(x86._mm_aesimc_si128(round_key))
}
|
||||
|
||||
// aeskeygenassist wraps the AES-NI AESKEYGENASSIST instruction, which
// assists AES key expansion.  `IMM8` is the round constant and must be
// a compile-time constant.
@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
	key_word := transmute(x86.__m128i)(data)
	return transmute(simd.u8x16)(x86._mm_aeskeygenassist_si128(key_word, IMM8))
}
|
||||
|
||||
// _mm_slli_si128 shifts the 128-bit vector `a` left by `IMM8` bytes
// (PSLLDQ), operating on `simd.u8x16` instead of `x86.__m128i`.
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	value := transmute(x86.__m128i)(a)
	return transmute(simd.u8x16)(x86._mm_slli_si128(value, IMM8))
}
|
||||
|
||||
// _mm_shuffle_epi32 permutes the 32-bit lanes of `a` according to
// `IMM8` (PSHUFD), operating on `simd.u8x16` instead of `x86.__m128i`.
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	value := transmute(x86.__m128i)(a)
	return transmute(simd.u8x16)(x86._mm_shuffle_epi32(value, IMM8))
}
|
||||
|
||||
// _mm_shuffle_ps selects 32-bit lanes from `a` and `b` according to
// `MASK` (SHUFPS), operating on `simd.u8x16` instead of `x86.__m128`.
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
	lhs := transmute(x86.__m128)(a)
	rhs := transmute(x86.__m128)(b)
	return transmute(simd.u8x16)(x86._mm_shuffle_ps(lhs, rhs, MASK))
}
|
||||
181
core/crypto/_aes/hw/keysched_hw.odin
Normal file
181
core/crypto/_aes/hw/keysched_hw.odin
Normal file
@@ -0,0 +1,181 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
// Built for every architecture that has an intrinsics backend in this
// package; all remaining targets get the stub in unsupported.odin.
#+build amd64,arm64,arm32
|
||||
package aes_hw
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd"
|
||||
|
||||
// Inspiration taken from BearSSL's AES-NI implementation.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performant otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
// expand_step128 derives the next round key from the previous round
// key `k1` and the key-generation-assist output `k2`.
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
	// Broadcast the top 32-bit word of the assist value to all lanes.
	assist := _mm_shuffle_epi32(k2, 0xff)

	// Fold the previous key into itself with three successive 4-byte shifts.
	acc := simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	acc = simd.bit_xor(acc, _mm_slli_si128(acc, 0x04))
	acc = simd.bit_xor(acc, _mm_slli_si128(acc, 0x04))

	return simd.bit_xor(acc, assist)
}
|
||||
|
||||
// expand_step192a advances the AES-192 key schedule state held in
// `k1_^`/`k2_^` (both are updated in place) using the assist value
// `k3`, and returns two round keys assembled from the old and new state.
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) {
	// The first returned key needs the state as it was on entry.
	old_hi := k2_^

	lo := k1_^
	lo = simd.bit_xor(lo, _mm_slli_si128(lo, 0x04))
	lo = simd.bit_xor(lo, _mm_slli_si128(lo, 0x04))
	lo = simd.bit_xor(lo, _mm_slli_si128(lo, 0x04))
	lo = simd.bit_xor(lo, _mm_shuffle_epi32(k3, 0x55))

	hi := simd.bit_xor(old_hi, _mm_slli_si128(old_hi, 0x04))
	hi = simd.bit_xor(hi, _mm_shuffle_epi32(lo, 0xff))

	// Write the advanced state back for the next step.
	k1_^, k2_^ = lo, hi

	first := _mm_shuffle_ps(old_hi, lo, 0x44)
	second := _mm_shuffle_ps(lo, hi, 0x4e)
	return first, second
}
|
||||
|
||||
// expand_step192b advances the AES-192 key schedule state held in
// `k1_^`/`k2_^` (both are updated in place) using the assist value
// `k3`, and returns the new value of `k1_^` as the next round key.
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 {
	lo := k1_^
	lo = simd.bit_xor(lo, _mm_slli_si128(lo, 0x04))
	lo = simd.bit_xor(lo, _mm_slli_si128(lo, 0x04))
	lo = simd.bit_xor(lo, _mm_slli_si128(lo, 0x04))
	lo = simd.bit_xor(lo, _mm_shuffle_epi32(k3, 0x55))

	hi := k2_^
	hi = simd.bit_xor(hi, _mm_slli_si128(hi, 0x04))
	hi = simd.bit_xor(hi, _mm_shuffle_epi32(lo, 0xff))

	// Write the advanced state back for the next step.
	k1_^, k2_^ = lo, hi

	return lo
}
|
||||
|
||||
// expand_step256b derives the second round key of each AES-256 key
// schedule pair from the round key `k1` and the assist value `k2`.
// It matches expand_step128 except for the assist word it broadcasts.
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
	// Broadcast 32-bit word 2 of the assist value to all lanes.
	assist := _mm_shuffle_epi32(k2, 0xaa)

	acc := simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	acc = simd.bit_xor(acc, _mm_slli_si128(acc, 0x04))
	acc = simd.bit_xor(acc, _mm_slli_si128(acc, 0x04))

	return simd.bit_xor(acc, assist)
}
|
||||
|
||||
// derive_dec_keys fills `ctx._sk_exp_dec` from the encryption round
// keys in `sks`: the keys are stored in reverse order, and every key
// except the first and last is passed through `aesimc`.
@(private = "file", enable_target_feature = TARGET_FEATURES)
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) {
	dec := &ctx._sk_exp_dec

	// The outermost keys are copied over untransformed.
	intrinsics.unaligned_store((^simd.u8x16)(&dec[0]), sks[num_rounds])
	intrinsics.unaligned_store((^simd.u8x16)(&dec[num_rounds]), sks[0])

	for j in 1 ..< num_rounds {
		inv_key := aesimc(sks[num_rounds - j])
		intrinsics.unaligned_store((^simd.u8x16)(&dec[j]), inv_key)
	}
}
|
||||
|
||||
// keysched expands `key` into the encryption and decryption round keys
// stored in `ctx` and records the round count.
//
// Inputs:
// - ctx: The context to populate.
// - key: The raw AES key; its length selects AES-128/192/256.
//
// Panics if the key length is not a valid AES key size.  The local
// copy of the expanded keys is wiped before returning.
@(private, enable_target_feature = TARGET_FEATURES)
keysched :: proc(ctx: ^Context, key: []byte) {
	rk: [15]simd.u8x16 = ---
	rounds: int

	// Compute the encryption keys.
	switch len(key) {
	case _aes.KEY_SIZE_128:
		rk[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
		rk[1] = expand_step128(rk[0], aeskeygenassist(rk[0], 0x01))
		rk[2] = expand_step128(rk[1], aeskeygenassist(rk[1], 0x02))
		rk[3] = expand_step128(rk[2], aeskeygenassist(rk[2], 0x04))
		rk[4] = expand_step128(rk[3], aeskeygenassist(rk[3], 0x08))
		rk[5] = expand_step128(rk[4], aeskeygenassist(rk[4], 0x10))
		rk[6] = expand_step128(rk[5], aeskeygenassist(rk[5], 0x20))
		rk[7] = expand_step128(rk[6], aeskeygenassist(rk[6], 0x40))
		rk[8] = expand_step128(rk[7], aeskeygenassist(rk[7], 0x80))
		rk[9] = expand_step128(rk[8], aeskeygenassist(rk[8], 0x1b))
		rk[10] = expand_step128(rk[9], aeskeygenassist(rk[9], 0x36))
		rounds = _aes.ROUNDS_128
	case _aes.KEY_SIZE_192:
		lo := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))

		// Only 8 key bytes remain after the first 16, so stage them in
		// a zero-padded buffer to get a full vector, then wipe the
		// staging buffer.
		hi_bytes: [16]byte
		copy(hi_bytes[:], key[16:24])
		hi := intrinsics.unaligned_load((^simd.u8x16)(&hi_bytes))
		crypto.zero_explicit(&hi_bytes, size_of(hi_bytes))

		// `lo`/`hi` are advanced in place by every expand step.
		rk[0] = lo
		rk[1], rk[2] = expand_step192a(&lo, &hi, aeskeygenassist(hi, 0x01))
		rk[3] = expand_step192b(&lo, &hi, aeskeygenassist(hi, 0x02))
		rk[4], rk[5] = expand_step192a(&lo, &hi, aeskeygenassist(hi, 0x04))
		rk[6] = expand_step192b(&lo, &hi, aeskeygenassist(hi, 0x08))
		rk[7], rk[8] = expand_step192a(&lo, &hi, aeskeygenassist(hi, 0x10))
		rk[9] = expand_step192b(&lo, &hi, aeskeygenassist(hi, 0x20))
		rk[10], rk[11] = expand_step192a(&lo, &hi, aeskeygenassist(hi, 0x40))
		rk[12] = expand_step192b(&lo, &hi, aeskeygenassist(hi, 0x80))
		rounds = _aes.ROUNDS_192
	case _aes.KEY_SIZE_256:
		rk[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
		rk[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
		rk[2] = expand_step128(rk[0], aeskeygenassist(rk[1], 0x01))
		rk[3] = expand_step256b(rk[1], aeskeygenassist(rk[2], 0x01))
		rk[4] = expand_step128(rk[2], aeskeygenassist(rk[3], 0x02))
		rk[5] = expand_step256b(rk[3], aeskeygenassist(rk[4], 0x02))
		rk[6] = expand_step128(rk[4], aeskeygenassist(rk[5], 0x04))
		rk[7] = expand_step256b(rk[5], aeskeygenassist(rk[6], 0x04))
		rk[8] = expand_step128(rk[6], aeskeygenassist(rk[7], 0x08))
		rk[9] = expand_step256b(rk[7], aeskeygenassist(rk[8], 0x08))
		rk[10] = expand_step128(rk[8], aeskeygenassist(rk[9], 0x10))
		rk[11] = expand_step256b(rk[9], aeskeygenassist(rk[10], 0x10))
		rk[12] = expand_step128(rk[10], aeskeygenassist(rk[11], 0x20))
		rk[13] = expand_step256b(rk[11], aeskeygenassist(rk[12], 0x20))
		rk[14] = expand_step128(rk[12], aeskeygenassist(rk[13], 0x40))
		rounds = _aes.ROUNDS_256
	case:
		panic("crypto/aes: invalid AES key size")
	}

	ctx._num_rounds = rounds
	for idx in 0 ..= rounds {
		intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[idx]), rk[idx])
	}

	// Compute the decryption keys.  GCM and CTR do not need this, however
	// ECB, CBC, OCB3, etc do.
	derive_dec_keys(ctx, &rk, rounds)

	// Do not leave the expanded key material behind in the local array.
	crypto.zero_explicit(&rk, size_of(rk))
}
|
||||
11
core/crypto/_aes/hw/unsupported.odin
Normal file
11
core/crypto/_aes/hw/unsupported.odin
Normal file
@@ -0,0 +1,11 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package aes_hw
|
||||
|
||||
HAS_GHASH :: false
|
||||
|
||||
// keysched is the stand-in used on architectures without a hardware
// AES implementation; it unconditionally panics.
@(private)
keysched :: proc(ctx: ^Context, key: []byte) {
	panic("crypto/aes: hardware implementation unsupported")
}
|
||||
@@ -1,38 +0,0 @@
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "core:sys/info"
|
||||
|
||||
// is_supported returns true iff hardware accelerated AES
|
||||
// is supported.
|
||||
is_supported :: proc "contextless" () -> bool {
|
||||
// Note: Everything with AES-NI and PCLMULQDQ has support for
|
||||
// the required SSE extxtensions.
|
||||
req_features :: info.CPU_Features{
|
||||
.sse2,
|
||||
.ssse3,
|
||||
.sse41,
|
||||
.aes,
|
||||
.pclmulqdq,
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
|
||||
Context :: struct {
|
||||
// Note: The ideal thing to do is for the expanded round keys to be
|
||||
// arrays of `__m128i`, however that implies alignment (or using AVX).
|
||||
//
|
||||
// All the people using e-waste processors that don't support an
|
||||
// insturction set that has been around for over 10 years are why
|
||||
// we can't have nice things.
|
||||
_sk_exp_enc: [15][16]byte,
|
||||
_sk_exp_dec: [15][16]byte,
|
||||
_num_rounds: int,
|
||||
}
|
||||
|
||||
// init initializes a context for AES with the provided key.
|
||||
init :: proc(ctx: ^Context, key: []byte) {
|
||||
keysched(ctx, key)
|
||||
}
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel AES-NI based implementation. Inspiration taken from BearSSL.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performat otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xff)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse,sse2")
|
||||
expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
tmp := k2
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
|
||||
r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
|
||||
|
||||
return r1, r2
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
return k1
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xaa)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
// derive_dec_keys fills `ctx._sk_exp_dec` from the encryption round keys in
// `sks`.
//
// The key order is reversed: the first and last entries are copied as-is,
// and every entry in between is passed through `AESIMC` first.
@(private = "file", enable_target_feature = "aes")
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
	last := num_rounds

	// The two outermost round keys are used unmodified.
	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[last])
	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[last]), sks[0])

	// Inner round keys, walking the destination from the back.
	for dst_idx := last - 1; dst_idx >= 1; dst_idx -= 1 {
		mixed := x86._mm_aesimc_si128(sks[last - dst_idx])
		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[dst_idx]), mixed)
	}
}
|
||||
|
||||
// keysched expands `key` into the encryption and decryption round keys
// stored in `ctx`, and records the round count in `ctx._num_rounds`.
//
// `key` must be `_aes.KEY_SIZE_128`, `_aes.KEY_SIZE_192` or
// `_aes.KEY_SIZE_256` bytes long; any other length panics.
@(private, enable_target_feature = "sse,sse2,aes")
keysched :: proc(ctx: ^Context, key: []byte) {
	rk: [15]x86.__m128i = ---

	// Compute the encryption keys.
	rounds: int
	switch len(key) {
	case _aes.KEY_SIZE_128:
		rounds = _aes.ROUNDS_128

		rk[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
		rk[1] = expand_step128(rk[0], x86._mm_aeskeygenassist_si128(rk[0], 0x01))
		rk[2] = expand_step128(rk[1], x86._mm_aeskeygenassist_si128(rk[1], 0x02))
		rk[3] = expand_step128(rk[2], x86._mm_aeskeygenassist_si128(rk[2], 0x04))
		rk[4] = expand_step128(rk[3], x86._mm_aeskeygenassist_si128(rk[3], 0x08))
		rk[5] = expand_step128(rk[4], x86._mm_aeskeygenassist_si128(rk[4], 0x10))
		rk[6] = expand_step128(rk[5], x86._mm_aeskeygenassist_si128(rk[5], 0x20))
		rk[7] = expand_step128(rk[6], x86._mm_aeskeygenassist_si128(rk[6], 0x40))
		rk[8] = expand_step128(rk[7], x86._mm_aeskeygenassist_si128(rk[7], 0x80))
		rk[9] = expand_step128(rk[8], x86._mm_aeskeygenassist_si128(rk[8], 0x1b))
		rk[10] = expand_step128(rk[9], x86._mm_aeskeygenassist_si128(rk[9], 0x36))
	case _aes.KEY_SIZE_192:
		rounds = _aes.ROUNDS_192

		// The key is 24 bytes: a full vector, then 8 bytes in the low
		// half of a second vector whose high half is zero.
		lo := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
		hi := x86.__m128i{
			intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
			0,
		}

		// `lo`/`hi` carry the running state and are updated in place by
		// each expansion step.
		rk[0] = lo
		rk[1], rk[2] = expand_step192a(&lo, &hi, x86._mm_aeskeygenassist_si128(hi, 0x01))
		rk[3] = expand_step192b(&lo, &hi, x86._mm_aeskeygenassist_si128(hi, 0x02))
		rk[4], rk[5] = expand_step192a(&lo, &hi, x86._mm_aeskeygenassist_si128(hi, 0x04))
		rk[6] = expand_step192b(&lo, &hi, x86._mm_aeskeygenassist_si128(hi, 0x08))
		rk[7], rk[8] = expand_step192a(&lo, &hi, x86._mm_aeskeygenassist_si128(hi, 0x10))
		rk[9] = expand_step192b(&lo, &hi, x86._mm_aeskeygenassist_si128(hi, 0x20))
		rk[10], rk[11] = expand_step192a(&lo, &hi, x86._mm_aeskeygenassist_si128(hi, 0x40))
		rk[12] = expand_step192b(&lo, &hi, x86._mm_aeskeygenassist_si128(hi, 0x80))
	case _aes.KEY_SIZE_256:
		rounds = _aes.ROUNDS_256

		rk[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
		rk[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
		rk[2] = expand_step128(rk[0], x86._mm_aeskeygenassist_si128(rk[1], 0x01))
		rk[3] = expand_step256b(rk[1], x86._mm_aeskeygenassist_si128(rk[2], 0x01))
		rk[4] = expand_step128(rk[2], x86._mm_aeskeygenassist_si128(rk[3], 0x02))
		rk[5] = expand_step256b(rk[3], x86._mm_aeskeygenassist_si128(rk[4], 0x02))
		rk[6] = expand_step128(rk[4], x86._mm_aeskeygenassist_si128(rk[5], 0x04))
		rk[7] = expand_step256b(rk[5], x86._mm_aeskeygenassist_si128(rk[6], 0x04))
		rk[8] = expand_step128(rk[6], x86._mm_aeskeygenassist_si128(rk[7], 0x08))
		rk[9] = expand_step256b(rk[7], x86._mm_aeskeygenassist_si128(rk[8], 0x08))
		rk[10] = expand_step128(rk[8], x86._mm_aeskeygenassist_si128(rk[9], 0x10))
		rk[11] = expand_step256b(rk[9], x86._mm_aeskeygenassist_si128(rk[10], 0x10))
		rk[12] = expand_step128(rk[10], x86._mm_aeskeygenassist_si128(rk[11], 0x20))
		rk[13] = expand_step256b(rk[11], x86._mm_aeskeygenassist_si128(rk[12], 0x20))
		rk[14] = expand_step128(rk[12], x86._mm_aeskeygenassist_si128(rk[13], 0x40))
	case:
		panic("crypto/aes: invalid AES key size")
	}

	// There are `rounds + 1` encryption round keys.
	for i := 0; i <= rounds; i += 1 {
		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), rk[i])
	}

	// Compute the decryption keys.  GCM and CTR do not need this, however
	// ECB, CBC, OCB3, etc do.
	derive_dec_keys(ctx, &rk, rounds)

	ctx._num_rounds = rounds

	// Scrub the local copy of the expanded key.
	zero_explicit(&rk, size_of(rk))
}
|
||||
|
||||
/*
Zero a memory range in a way the compiler will not elide.

Writes `0` to each of the `len` bytes starting at address `data`, and
returns `data`.

Inputs:
- data: Start of the range to clear.
- len: Number of bytes to clear.

Returns:
- The `data` pointer, unchanged.

A plain `zero()` may be optimized away or reordered by the compiler under
certain circumstances.  `zero_explicit()` uses a volatile zeroing intrinsic
followed by a sequentially consistent fence, so that it is neither removed
nor reordered with other memory access operations.
*/
zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr {
	// Intended to provide equivalent semantics to the C11 Annex K 3.7.4.1
	// memset_s call: the store must always be executed.
	intrinsics.mem_zero_volatile(data, len)

	// Keep the zeroing from being reordered.
	intrinsics.atomic_thread_fence(.Seq_Cst)

	return data
}
|
||||
@@ -11,6 +11,7 @@ package _blake2
|
||||
*/
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:encoding/endian"
|
||||
|
||||
BLAKE2S_BLOCK_SIZE :: 64
|
||||
@@ -18,17 +19,12 @@ BLAKE2S_SIZE :: 32
|
||||
BLAKE2B_BLOCK_SIZE :: 128
|
||||
BLAKE2B_SIZE :: 64
|
||||
|
||||
MAX_SIZE :: 255
|
||||
|
||||
Blake2s_Context :: struct {
|
||||
h: [8]u32,
|
||||
t: [2]u32,
|
||||
f: [2]u32,
|
||||
x: [BLAKE2S_BLOCK_SIZE]byte,
|
||||
nx: int,
|
||||
ih: [8]u32,
|
||||
padded_key: [BLAKE2S_BLOCK_SIZE]byte,
|
||||
is_keyed: bool,
|
||||
size: byte,
|
||||
is_last_node: bool,
|
||||
|
||||
@@ -41,9 +37,6 @@ Blake2b_Context :: struct {
|
||||
f: [2]u64,
|
||||
x: [BLAKE2B_BLOCK_SIZE]byte,
|
||||
nx: int,
|
||||
ih: [8]u64,
|
||||
padded_key: [BLAKE2B_BLOCK_SIZE]byte,
|
||||
is_keyed: bool,
|
||||
size: byte,
|
||||
is_last_node: bool,
|
||||
|
||||
@@ -86,11 +79,12 @@ BLAKE2B_IV := [8]u64 {
|
||||
|
||||
init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
|
||||
when T == Blake2s_Context {
|
||||
max_size :: BLAKE2S_SIZE
|
||||
MAX_SIZE :: BLAKE2S_SIZE
|
||||
} else when T == Blake2b_Context {
|
||||
max_size :: BLAKE2B_SIZE
|
||||
MAX_SIZE :: BLAKE2B_SIZE
|
||||
}
|
||||
ensure_contextless(cfg.size <= max_size, "blake2: requested output size exceeeds algorithm max")
|
||||
ensure_contextless(cfg.size <= MAX_SIZE, "blake2: requested output size exceeeds algorithm max")
|
||||
ensure_contextless(len(cfg.key) <= MAX_SIZE, "blake2: requested key size exceeeds algorithm max")
|
||||
|
||||
// To save having to allocate a scratch buffer, use the internal
|
||||
// data buffer (`ctx.x`), as it is exactly the correct size.
|
||||
@@ -133,7 +127,7 @@ init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
|
||||
p[17] = cfg.tree.(Blake2_Tree).inner_hash_size
|
||||
}
|
||||
} else {
|
||||
p[2], p[3] = 1, 1
|
||||
p[2], p[3], p[4], p[5], p[6], p[7] = 1, 1, 0, 0, 0, 0
|
||||
}
|
||||
ctx.size = cfg.size
|
||||
for i := 0; i < 8; i += 1 {
|
||||
@@ -151,17 +145,11 @@ init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
|
||||
ctx.is_last_node = true
|
||||
}
|
||||
if len(cfg.key) > 0 {
|
||||
copy(ctx.padded_key[:], cfg.key)
|
||||
update(ctx, ctx.padded_key[:])
|
||||
ctx.is_keyed = true
|
||||
copy(ctx.x[:], cfg.key)
|
||||
ctx.nx = len(ctx.x)
|
||||
} else {
|
||||
ctx.nx = 0
|
||||
}
|
||||
copy(ctx.ih[:], ctx.h[:])
|
||||
copy(ctx.h[:], ctx.ih[:])
|
||||
if ctx.is_keyed {
|
||||
update(ctx, ctx.padded_key[:])
|
||||
}
|
||||
|
||||
ctx.nx = 0
|
||||
|
||||
ctx.is_initialized = true
|
||||
}
|
||||
@@ -171,22 +159,22 @@ update :: proc "contextless" (ctx: ^$T, p: []byte) {
|
||||
|
||||
p := p
|
||||
when T == Blake2s_Context {
|
||||
block_size :: BLAKE2S_BLOCK_SIZE
|
||||
BLOCK_SIZE :: BLAKE2S_BLOCK_SIZE
|
||||
} else when T == Blake2b_Context {
|
||||
block_size :: BLAKE2B_BLOCK_SIZE
|
||||
BLOCK_SIZE :: BLAKE2B_BLOCK_SIZE
|
||||
}
|
||||
|
||||
left := block_size - ctx.nx
|
||||
left := BLOCK_SIZE - ctx.nx
|
||||
if len(p) > left {
|
||||
copy(ctx.x[ctx.nx:], p[:left])
|
||||
p = p[left:]
|
||||
blocks(ctx, ctx.x[:])
|
||||
ctx.nx = 0
|
||||
}
|
||||
if len(p) > block_size {
|
||||
n := len(p) &~ (block_size - 1)
|
||||
if len(p) > BLOCK_SIZE {
|
||||
n := len(p) &~ (BLOCK_SIZE - 1)
|
||||
if n == len(p) {
|
||||
n -= block_size
|
||||
n -= BLOCK_SIZE
|
||||
}
|
||||
blocks(ctx, p[:n])
|
||||
p = p[n:]
|
||||
@@ -222,17 +210,11 @@ reset :: proc "contextless" (ctx: ^$T) {
|
||||
return
|
||||
}
|
||||
|
||||
zero_explicit(ctx, size_of(ctx^))
|
||||
crypto.zero_explicit(ctx, size_of(ctx^))
|
||||
}
|
||||
|
||||
@(private)
|
||||
blake2s_final :: proc "contextless" (ctx: ^Blake2s_Context, hash: []byte) {
|
||||
if ctx.is_keyed {
|
||||
for i := 0; i < len(ctx.padded_key); i += 1 {
|
||||
ctx.padded_key[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
dec := BLAKE2S_BLOCK_SIZE - u32(ctx.nx)
|
||||
if ctx.t[0] < dec {
|
||||
ctx.t[1] -= 1
|
||||
@@ -244,23 +226,20 @@ blake2s_final :: proc "contextless" (ctx: ^Blake2s_Context, hash: []byte) {
|
||||
ctx.f[1] = 0xffffffff
|
||||
}
|
||||
|
||||
for i := ctx.nx; i < BLAKE2S_BLOCK_SIZE; i+= 1 {
|
||||
ctx.x[i] = 0
|
||||
}
|
||||
blocks(ctx, ctx.x[:])
|
||||
|
||||
dst: [BLAKE2S_SIZE]byte
|
||||
for i := 0; i < BLAKE2S_SIZE / 4; i += 1 {
|
||||
endian.unchecked_put_u32le(dst[i * 4:], ctx.h[i])
|
||||
}
|
||||
copy(hash, dst[:])
|
||||
copy(hash, dst[:ctx.size])
|
||||
}
|
||||
|
||||
@(private)
|
||||
blake2b_final :: proc "contextless" (ctx: ^Blake2b_Context, hash: []byte) {
|
||||
if ctx.is_keyed {
|
||||
for i := 0; i < len(ctx.padded_key); i += 1 {
|
||||
ctx.padded_key[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
dec := BLAKE2B_BLOCK_SIZE - u64(ctx.nx)
|
||||
if ctx.t[0] < dec {
|
||||
ctx.t[1] -= 1
|
||||
@@ -272,6 +251,9 @@ blake2b_final :: proc "contextless" (ctx: ^Blake2b_Context, hash: []byte) {
|
||||
ctx.f[1] = 0xffffffffffffffff
|
||||
}
|
||||
|
||||
for i := ctx.nx; i < BLAKE2B_BLOCK_SIZE; i+= 1 {
|
||||
ctx.x[i] = 0
|
||||
}
|
||||
blocks(ctx, ctx.x[:])
|
||||
|
||||
dst: [BLAKE2B_SIZE]byte
|
||||
@@ -2877,27 +2859,3 @@ blake2b_blocks :: #force_inline proc "contextless" (ctx: ^Blake2b_Context, p: []
|
||||
ctx.h[0], ctx.h[1], ctx.h[2], ctx.h[3], ctx.h[4], ctx.h[5], ctx.h[6], ctx.h[7] =
|
||||
h0, h1, h2, h3, h4, h5, h6, h7
|
||||
}
|
||||
|
||||
/*
|
||||
Set each byte of a memory range to zero.
|
||||
|
||||
This procedure copies the value `0` into the `len` bytes of a memory range,
|
||||
starting at address `data`.
|
||||
|
||||
This procedure returns the pointer to `data`.
|
||||
|
||||
Unlike the `zero()` procedure, which can be optimized away or reordered by the
|
||||
compiler under certain circumstances, `zero_explicit()` procedure can not be
|
||||
optimized away or reordered with other memory access operations, and the
|
||||
compiler assumes volatile semantics of the memory.
|
||||
*/
|
||||
@(private)
|
||||
zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr {
|
||||
// This routine tries to avoid the compiler optimizing away the call,
|
||||
// so that it is always executed. It is intended to provide
|
||||
// equivalent semantics to those provided by the C11 Annex K 3.7.4.1
|
||||
// memset_s call.
|
||||
intrinsics.mem_zero_volatile(data, len) // Use the volatile mem_zero
|
||||
intrinsics.atomic_thread_fence(.Seq_Cst) // Prevent reordering
|
||||
return data
|
||||
}
|
||||
@@ -215,7 +215,7 @@ _store_simd128 :: #force_inline proc "contextless" (
|
||||
intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3)
|
||||
}
|
||||
|
||||
// is_performant returns true iff the target and current host both support
|
||||
// is_performant returns true if and only if (⟺) the target and current host both support
|
||||
// "enough" 128-bit SIMD to make this implementation performant.
|
||||
is_performant :: proc "contextless" () -> bool {
|
||||
when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 {
|
||||
|
||||
@@ -36,7 +36,7 @@ _VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0}
|
||||
@(private = "file")
|
||||
_VEC_TWO: simd.u64x4 : {2, 0, 2, 0}
|
||||
|
||||
// is_performant returns true iff the target and current host both support
|
||||
// is_performant returns true if and only if (⟺) the target and current host both support
|
||||
// "enough" SIMD to make this implementation performant.
|
||||
is_performant :: proc "contextless" () -> bool {
|
||||
req_features :: info.CPU_Features{.avx, .avx2}
|
||||
|
||||
@@ -69,7 +69,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
|
||||
tmp: Montgomery_Domain_Field_Element = ---
|
||||
fe_sub(&tmp, arg1, arg2)
|
||||
|
||||
// This will only underflow iff arg1 == arg2, and we return the borrow,
|
||||
// This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
|
||||
// which will be 1.
|
||||
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
|
||||
tmp: Montgomery_Domain_Field_Element = ---
|
||||
fe_sub(&tmp, arg1, arg2)
|
||||
|
||||
// This will only underflow iff arg1 == arg2, and we return the borrow,
|
||||
// This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
|
||||
// which will be 1.
|
||||
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))
|
||||
|
||||
|
||||
@@ -5,17 +5,17 @@ package _subtle
|
||||
|
||||
import "core:math/bits"
|
||||
|
||||
// byte_eq returns 1 iff a == b, 0 otherwise.
|
||||
// byte_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
|
||||
@(optimization_mode="none")
|
||||
byte_eq :: proc "contextless" (a, b: byte) -> int {
|
||||
v := a ~ b
|
||||
|
||||
// v == 0 iff a == b. The subtraction will underflow, setting the
|
||||
// v == 0 if and only if (⟺) a == b. The subtraction will underflow, setting the
|
||||
// sign bit, which will get returned.
|
||||
return int((u32(v)-1) >> 31)
|
||||
}
|
||||
|
||||
// u64_eq returns 1 iff a == b, 0 otherwise.
|
||||
// u64_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
|
||||
@(optimization_mode="none")
|
||||
u64_eq :: proc "contextless" (a, b: u64) -> u64 {
|
||||
_, borrow := bits.sub_u64(0, a ~ b, 0)
|
||||
@@ -27,14 +27,14 @@ eq :: proc {
|
||||
u64_eq,
|
||||
}
|
||||
|
||||
// u64_is_zero returns 1 iff a == 0, 0 otherwise.
|
||||
// u64_is_zero returns 1 if and only if (⟺) a == 0, 0 otherwise.
|
||||
@(optimization_mode="none")
|
||||
u64_is_zero :: proc "contextless" (a: u64) -> u64 {
|
||||
_, borrow := bits.sub_u64(a, 1, 0)
|
||||
return borrow
|
||||
}
|
||||
|
||||
// u64_is_non_zero returns 1 iff a != 0, 0 otherwise.
|
||||
// u64_is_non_zero returns 1 if and only if (⟺) a != 0, 0 otherwise.
|
||||
@(optimization_mode="none")
|
||||
u64_is_non_zero :: proc "contextless" (a: u64) -> u64 {
|
||||
is_zero := u64_is_zero(a)
|
||||
|
||||
@@ -13,7 +13,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte,
|
||||
|
||||
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided algorithm, key, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and ciphertext MUST alias exactly or not at all.
|
||||
|
||||
@@ -183,7 +183,7 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
@@ -144,7 +144,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
397
core/crypto/aegis/aegis_impl_hw.odin
Normal file
397
core/crypto/aegis/aegis_impl_hw.odin
Normal file
@@ -0,0 +1,397 @@
|
||||
#+build amd64,arm64,arm32
// arm64 must be listed here: this file selects "neon,aes" for both arm64 and
// arm32, and the generic fallback is excluded on amd64, arm64 and arm32.
|
||||
package aegis
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
import "core:simd"
|
||||
|
||||
// State_HW is the AEGIS state used by the hardware accelerated
// implementation: eight 128-bit blocks plus the active rate.
//
// The AEGIS-256 routines in this file only read and write `s0` through
// `s5`; the AEGIS-128L routines use all eight blocks.
@(private)
State_HW :: struct {
	s0, s1, s2, s3, s4, s5, s6, s7: simd.u8x16,
	rate: int, // `_RATE_128L` or `_RATE_256`, set by `init_hw`.
}
|
||||
|
||||
// TARGET_FEATURES is the `enable_target_feature` string applied to every
// hardware accelerated procedure in this file, chosen per architecture.
when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
	@(private="file")
	TARGET_FEATURES :: "neon,aes"
} else when ODIN_ARCH == .amd64 {
	@(private="file")
	TARGET_FEATURES :: "sse2,aes"
}
|
||||
|
||||
// is_hardware_accelerated reports whether hardware accelerated AEGIS can
// be used.  This is exactly the result of `aes_hw.is_supported()`.
is_hardware_accelerated :: proc "contextless" () -> bool {
	supported := aes_hw.is_supported()
	return supported
}
|
||||
|
||||
// init_hw initializes `st` from the key held in `ctx` and the nonce `iv`.
//
// The variant is picked by `ctx._key_len`.  For any other key length `st`
// is left untouched.
@(private, enable_target_feature = TARGET_FEATURES)
init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
	c0 := intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
	c1 := intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))

	switch ctx._key_len {
	case KEY_SIZE_128L:
		key := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
		nonce := intrinsics.unaligned_load((^simd.u8x16)(raw_data(iv)))

		key_nonce := simd.bit_xor(key, nonce)
		key_c0 := simd.bit_xor(key, c0)
		key_c1 := simd.bit_xor(key, c1)

		st.s0 = key_nonce
		st.s1 = c1
		st.s2 = c0
		st.s3 = c1
		st.s4 = key_nonce
		st.s5 = key_c0
		st.s6 = key_c1
		st.s7 = key_c0
		st.rate = _RATE_128L

		for _ in 0 ..< 10 {
			update_hw_128l(st, nonce, key)
		}
	case KEY_SIZE_256:
		k0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
		k1 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[16]))
		n0 := intrinsics.unaligned_load((^simd.u8x16)(&iv[0]))
		n1 := intrinsics.unaligned_load((^simd.u8x16)(&iv[16]))

		// key ^ nonce halves: part of the initial state, and also fed
		// back in during the mixing rounds below.
		kn0 := simd.bit_xor(k0, n0)
		kn1 := simd.bit_xor(k1, n1)

		st.s0 = kn0
		st.s1 = kn1
		st.s2 = c1
		st.s3 = c0
		st.s4 = simd.bit_xor(k0, c0)
		st.s5 = simd.bit_xor(k1, c1)
		st.rate = _RATE_256

		for _ in 0 ..< 4 {
			update_hw_256(st, k0)
			update_hw_256(st, k1)
			update_hw_256(st, kn0)
			update_hw_256(st, kn1)
		}
	}
}
|
||||
|
||||
// update_hw_128l applies one AEGIS-128L state update, absorbing the
// message blocks `m0` (into `s0`) and `m1` (into `s4`).
@(private = "file", enable_target_feature = TARGET_FEATURES)
update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: simd.u8x16) {
	// Snapshot first: every new block is derived from the old state.
	p0, p1, p2, p3 := st.s0, st.s1, st.s2, st.s3
	p4, p5, p6, p7 := st.s4, st.s5, st.s6, st.s7

	st.s0 = aes_hw.aesenc(p7, simd.bit_xor(p0, m0))
	st.s1 = aes_hw.aesenc(p0, p1)
	st.s2 = aes_hw.aesenc(p1, p2)
	st.s3 = aes_hw.aesenc(p2, p3)
	st.s4 = aes_hw.aesenc(p3, simd.bit_xor(p4, m1))
	st.s5 = aes_hw.aesenc(p4, p5)
	st.s6 = aes_hw.aesenc(p5, p6)
	st.s7 = aes_hw.aesenc(p6, p7)
}
|
||||
|
||||
// update_hw_256 applies one AEGIS-256 state update, absorbing the message
// block `m` into `s0`.  Only `s0` through `s5` are read and written.
@(private = "file", enable_target_feature = TARGET_FEATURES)
update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: simd.u8x16) {
	// Snapshot first: every new block is derived from the old state.
	p0, p1, p2 := st.s0, st.s1, st.s2
	p3, p4, p5 := st.s3, st.s4, st.s5

	st.s0 = aes_hw.aesenc(p5, simd.bit_xor(p0, m))
	st.s1 = aes_hw.aesenc(p0, p1)
	st.s2 = aes_hw.aesenc(p1, p2)
	st.s3 = aes_hw.aesenc(p2, p3)
	st.s4 = aes_hw.aesenc(p3, p4)
	st.s5 = aes_hw.aesenc(p4, p5)
}
|
||||
|
||||
// absorb_hw_128l absorbs the first 32 bytes of `ai` into the AEGIS-128L
// state as two 16-byte blocks.
@(private = "file", enable_target_feature = TARGET_FEATURES)
absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
	lo := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
	hi := intrinsics.unaligned_load((^simd.u8x16)(&ai[16]))

	update_hw_128l(st, lo, hi)
}
|
||||
|
||||
// absorb_hw_256 absorbs the first 16 bytes of `ai` into the AEGIS-256
// state.
@(private = "file", enable_target_feature = TARGET_FEATURES)
absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
	block := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))

	update_hw_256(st, block)
}
|
||||
|
||||
// absorb_hw absorbs the additional authenticated data `aad` into `st`.
//
// Whole rate-sized blocks are absorbed directly; a trailing partial block
// is zero padded up to the rate and absorbed last.
@(private, enable_target_feature = TARGET_FEATURES)
absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
	rest := aad

	// Full blocks.
	switch st.rate {
	case _RATE_128L:
		for len(rest) >= _RATE_128L {
			absorb_hw_128l(st, rest)
			rest = rest[_RATE_128L:]
		}
	case _RATE_256:
		for len(rest) >= _RATE_256 {
			absorb_hw_256(st, rest)
			rest = rest[_RATE_256:]
		}
	}

	if len(rest) == 0 {
		return
	}

	// Pad out the remainder with `0`s till it is rate sized.  The buffer
	// is not scrubbed as the AAD is not confidential.
	padded: [_RATE_MAX]byte
	copy(padded[:], rest)
	switch st.rate {
	case _RATE_128L:
		absorb_hw_128l(st, padded[:])
	case _RATE_256:
		absorb_hw_256(st, padded[:])
	}
}
|
||||
|
||||
// z_hw_128l returns the two AEGIS-128L keystream blocks for the current
// state:
//
//   z0 = s6 ^ s1 ^ (s2 & s3)
//   z1 = s2 ^ s5 ^ (s6 & s7)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (simd.u8x16, simd.u8x16) {
	and_23 := simd.bit_and(st.s2, st.s3)
	and_67 := simd.bit_and(st.s6, st.s7)

	z0 := simd.bit_xor(st.s6, simd.bit_xor(st.s1, and_23))
	z1 := simd.bit_xor(st.s2, simd.bit_xor(st.s5, and_67))

	return z0, z1
}
|
||||
|
||||
// z_hw_256 returns the AEGIS-256 keystream block for the current state:
//
//   z = s1 ^ s4 ^ s5 ^ (s2 & s3)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> simd.u8x16 {
	z := simd.bit_and(st.s2, st.s3)
	z = simd.bit_xor(st.s5, z)
	z = simd.bit_xor(st.s4, z)
	z = simd.bit_xor(st.s1, z)
	return z
}
|
||||
|
||||
// enc_hw_128l encrypts one 32-byte AEGIS-128L block from `xi` into `ci`.
//
// The keystream is taken from the state before the plaintext is absorbed.
// Both input blocks are loaded before anything is stored to `ci`.
@(private = "file", enable_target_feature = TARGET_FEATURES)
enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
	ks0, ks1 := z_hw_128l(st)

	m0 := intrinsics.unaligned_load((^simd.u8x16)(&xi[0]))
	m1 := intrinsics.unaligned_load((^simd.u8x16)(&xi[16]))
	update_hw_128l(st, m0, m1)

	intrinsics.unaligned_store((^simd.u8x16)(&ci[0]), simd.bit_xor(m0, ks0))
	intrinsics.unaligned_store((^simd.u8x16)(&ci[16]), simd.bit_xor(m1, ks1))
}
|
||||
|
||||
// enc_hw_256 encrypts one 16-byte AEGIS-256 block from `xi` into `ci`.
//
// The keystream is taken from the state before the plaintext is absorbed.
@(private = "file", enable_target_feature = TARGET_FEATURES)
enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
	keystream := z_hw_256(st)

	plain := intrinsics.unaligned_load((^simd.u8x16)(raw_data(xi)))
	update_hw_256(st, plain)

	intrinsics.unaligned_store((^simd.u8x16)(raw_data(ci)), simd.bit_xor(plain, keystream))
}
|
||||
|
||||
// enc_hw encrypts `src` into `dst`, updating `st` as it goes.
//
// Whole rate-sized blocks are processed directly.  A trailing partial block
// is zero padded, encrypted in a scratch buffer, and only the leading
// bytes matching the remaining input length are copied out.
@(private, enable_target_feature = TARGET_FEATURES)
enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
	out, rest := dst, src

	// Full blocks.
	switch st.rate {
	case _RATE_128L:
		for len(rest) >= _RATE_128L {
			enc_hw_128l(st, out, rest)
			out = out[_RATE_128L:]
			rest = rest[_RATE_128L:]
		}
	case _RATE_256:
		for len(rest) >= _RATE_256 {
			enc_hw_256(st, out, rest)
			out = out[_RATE_256:]
			rest = rest[_RATE_256:]
		}
	}

	remaining := len(rest)
	if remaining == 0 {
		return
	}

	// Pad out the remainder with `0`s till it is rate sized, and encrypt
	// it in place.  The buffer is not scrubbed as ciphertext is not
	// confidential.
	scratch: [_RATE_MAX]byte
	copy(scratch[:], rest)
	switch st.rate {
	case _RATE_128L:
		enc_hw_128l(st, scratch[:], scratch[:])
	case _RATE_256:
		enc_hw_256(st, scratch[:], scratch[:])
	}
	copy(out, scratch[:remaining])
}
|
||||
|
||||
// dec_hw_128l decrypts one 32-byte AEGIS-128L block from `ci` into `xi`.
//
// The recovered plaintext, not the ciphertext, is what gets absorbed into
// the state.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
	ks0, ks1 := z_hw_128l(st)

	c0 := intrinsics.unaligned_load((^simd.u8x16)(&ci[0]))
	c1 := intrinsics.unaligned_load((^simd.u8x16)(&ci[16]))

	m0 := simd.bit_xor(c0, ks0)
	m1 := simd.bit_xor(c1, ks1)
	update_hw_128l(st, m0, m1)

	intrinsics.unaligned_store((^simd.u8x16)(&xi[0]), m0)
	intrinsics.unaligned_store((^simd.u8x16)(&xi[16]), m1)
}
|
||||
|
||||
// dec_hw_256 decrypts one 16-byte AEGIS-256 block from `ci` into `xi`.
//
// The recovered plaintext, not the ciphertext, is what gets absorbed into
// the state.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
	keystream := z_hw_256(st)

	cipher := intrinsics.unaligned_load((^simd.u8x16)(raw_data(ci)))
	plain := simd.bit_xor(cipher, keystream)
	update_hw_256(st, plain)

	intrinsics.unaligned_store((^simd.u8x16)(raw_data(xi)), plain)
}
|
||||
|
||||
// dec_partial_hw_128l decrypts a final AEGIS-128L block that is shorter
// than the rate, from `cn` into `xn`.
//
// The work is done in a scratch buffer that is scrubbed on return, since
// it holds plaintext.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
	scratch: [_RATE_128L]byte
	defer crypto.zero_explicit(&scratch, size_of(scratch))

	ks0, ks1 := z_hw_128l(st)

	// Zero padded ciphertext -> plaintext followed by keystream bytes.
	copy(scratch[:], cn)
	c0 := intrinsics.unaligned_load((^simd.u8x16)(&scratch[0]))
	c1 := intrinsics.unaligned_load((^simd.u8x16)(&scratch[16]))
	intrinsics.unaligned_store((^simd.u8x16)(&scratch[0]), simd.bit_xor(c0, ks0))
	intrinsics.unaligned_store((^simd.u8x16)(&scratch[16]), simd.bit_xor(c1, ks1))
	copy(xn, scratch[:])

	// Clear everything past the plaintext, so that the block absorbed
	// into the state is the zero padded plaintext.
	for off in len(xn) ..< _RATE_128L {
		scratch[off] = 0
	}
	v0 := intrinsics.unaligned_load((^simd.u8x16)(&scratch[0]))
	v1 := intrinsics.unaligned_load((^simd.u8x16)(&scratch[16]))
	update_hw_128l(st, v0, v1)
}
|
||||
|
||||
// dec_partial_hw_256 decrypts a final AEGIS-256 block that is shorter than
// the rate, from `cn` into `xn`.
//
// The work is done in a scratch buffer that is scrubbed on return, since
// it holds plaintext.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
	scratch: [_RATE_256]byte
	defer crypto.zero_explicit(&scratch, size_of(scratch))

	keystream := z_hw_256(st)

	// Zero padded ciphertext -> plaintext followed by keystream bytes.
	copy(scratch[:], cn)
	cipher := intrinsics.unaligned_load((^simd.u8x16)(&scratch[0]))
	intrinsics.unaligned_store((^simd.u8x16)(&scratch[0]), simd.bit_xor(cipher, keystream))
	copy(xn, scratch[:])

	// Clear everything past the plaintext, so that the block absorbed
	// into the state is the zero padded plaintext.
	for off in len(xn) ..< _RATE_256 {
		scratch[off] = 0
	}
	padded := intrinsics.unaligned_load((^simd.u8x16)(&scratch[0]))
	update_hw_256(st, padded)
}
|
||||
|
||||
// dec_hw decrypts `src` into `dst`, updating `st` as it goes.
//
// Whole rate-sized blocks are processed directly, and a trailing partial
// block is handed to the matching `dec_partial_hw_*` routine.
@(private, enable_target_feature = TARGET_FEATURES)
dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
	out, rest := dst, src

	// Full blocks.
	switch st.rate {
	case _RATE_128L:
		for len(rest) >= _RATE_128L {
			dec_hw_128l(st, out, rest)
			out = out[_RATE_128L:]
			rest = rest[_RATE_128L:]
		}
	case _RATE_256:
		for len(rest) >= _RATE_256 {
			dec_hw_256(st, out, rest)
			out = out[_RATE_256:]
			rest = rest[_RATE_256:]
		}
	}

	if len(rest) == 0 {
		return
	}

	// Process the remainder.
	switch st.rate {
	case _RATE_128L:
		dec_partial_hw_128l(st, out, rest)
	case _RATE_256:
		dec_partial_hw_256(st, out, rest)
	}
}
|
||||
|
||||
// finalize_hw computes the authentication tag and writes it to `tag`.
//
// `ad_len` and `msg_len` are byte counts; they are converted to bit counts
// before being mixed into the state.  `len(tag)` selects the tag size, and
// nothing is written for any length other than `TAG_SIZE_128` or
// `TAG_SIZE_256`.
@(private, enable_target_feature = TARGET_FEATURES)
finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
	// Little-endian 64-bit bit lengths: AAD first, then message.
	lengths: [16]byte
	endian.unchecked_put_u64le(lengths[0:], u64(ad_len) * 8)
	endian.unchecked_put_u64le(lengths[8:], u64(msg_len) * 8)
	len_block := intrinsics.unaligned_load((^simd.u8x16)(&lengths[0]))

	want_256 := len(tag) == TAG_SIZE_256

	lo, hi: simd.u8x16 = ---, ---
	switch st.rate {
	case _RATE_128L:
		mix := simd.bit_xor(st.s2, len_block)
		for _ in 0 ..< 7 {
			update_hw_128l(st, mix, mix)
		}

		lo = simd.bit_xor(simd.bit_xor(simd.bit_xor(st.s0, st.s1), st.s2), st.s3)
		hi = simd.bit_xor(simd.bit_xor(st.s4, st.s5), st.s6)
		if want_256 {
			// `s7` only contributes to the 256-bit tag.
			hi = simd.bit_xor(hi, st.s7)
		}
	case _RATE_256:
		mix := simd.bit_xor(st.s3, len_block)
		for _ in 0 ..< 7 {
			update_hw_256(st, mix)
		}

		lo = simd.bit_xor(simd.bit_xor(st.s0, st.s1), st.s2)
		hi = simd.bit_xor(simd.bit_xor(st.s3, st.s4), st.s5)
	}

	switch len(tag) {
	case TAG_SIZE_128:
		// The 128-bit tag is the XOR of both halves.
		intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), simd.bit_xor(lo, hi))
	case TAG_SIZE_256:
		intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), lo)
		intrinsics.unaligned_store((^simd.u8x16)(&tag[16]), hi)
	}
}
|
||||
|
||||
// reset_state_hw scrubs the entire hardware state, including the rate.
@(private)
reset_state_hw :: proc "contextless" (st: ^State_HW) {
	crypto.zero_explicit(st, size_of(State_HW))
}
|
||||
@@ -1,4 +1,6 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package aegis
|
||||
|
||||
@(private = "file")
|
||||
@@ -7,7 +9,7 @@ ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported"
|
||||
@(private)
|
||||
State_HW :: struct {}
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AEGIS
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return false
|
||||
|
||||
@@ -1,389 +0,0 @@
|
||||
#+build amd64
|
||||
package aegis
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/aes"
|
||||
import "core:encoding/endian"
|
||||
import "core:simd/x86"
|
||||
|
||||
@(private)
|
||||
State_HW :: struct {
|
||||
s0: x86.__m128i,
|
||||
s1: x86.__m128i,
|
||||
s2: x86.__m128i,
|
||||
s3: x86.__m128i,
|
||||
s4: x86.__m128i,
|
||||
s5: x86.__m128i,
|
||||
s6: x86.__m128i,
|
||||
s7: x86.__m128i,
|
||||
rate: int,
|
||||
}
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AEGIS
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return aes.is_hardware_accelerated()
|
||||
}
|
||||
|
||||
// init_hw fills `st` from the key stored in `ctx` and the nonce in `iv`,
// then runs the initial update rounds of the selected variant.
//
// The variant is chosen by `ctx._key_len`.  For any value other than
// `KEY_SIZE_128L` or `KEY_SIZE_256` the procedure leaves `st` untouched.
@(private, enable_target_feature = "sse2,aes")
init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
	switch ctx._key_len {
	case KEY_SIZE_128L:
		k := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
		n := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv)))
		c1 := intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
		c0 := intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))

		k_xor_n := x86._mm_xor_si128(k, n)
		k_xor_c0 := x86._mm_xor_si128(k, c0)
		k_xor_c1 := x86._mm_xor_si128(k, c1)

		st.s0 = k_xor_n
		st.s1 = c1
		st.s2 = c0
		st.s3 = c1
		st.s4 = k_xor_n
		st.s5 = k_xor_c0
		st.s6 = k_xor_c1
		st.s7 = k_xor_c0
		st.rate = _RATE_128L

		// Ten rounds, each absorbing (nonce, key).
		for _ in 0 ..< 10 {
			update_hw_128l(st, n, k)
		}
	case KEY_SIZE_256:
		// The key and the nonce are each consumed as two 16-byte halves.
		k0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
		k1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16]))
		n0 := intrinsics.unaligned_load((^x86.__m128i)(&iv[0]))
		n1 := intrinsics.unaligned_load((^x86.__m128i)(&iv[16]))
		c1 := intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
		c0 := intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))

		k0_xor_n0 := x86._mm_xor_si128(k0, n0)
		k1_xor_n1 := x86._mm_xor_si128(k1, n1)

		st.s0 = k0_xor_n0
		st.s1 = k1_xor_n1
		st.s2 = c1
		st.s3 = c0
		st.s4 = x86._mm_xor_si128(k0, c0)
		st.s5 = x86._mm_xor_si128(k1, c1)
		st.rate = _RATE_256

		// Four passes over k0, k1, k0 ^ n0, k1 ^ n1 (the two XORed
		// values are the initial s0/s1, not the live state words).
		for _ in 0 ..< 4 {
			update_hw_256(st, k0)
			update_hw_256(st, k1)
			update_hw_256(st, k0_xor_n0)
			update_hw_256(st, k1_xor_n1)
		}
	}
}
|
||||
|
||||
// update_hw_128l advances the eight-word state by one round, mixing the
// message words `m0` into `s0` and `m1` into `s4`.
//
// Every new word is `_mm_aesenc_si128(previous word, own word)`, with the
// previous word of `s0` being `s7`.  All inputs are the pre-round values.
@(private = "file", enable_target_feature = "sse2,aes")
update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) {
	// Snapshot the state so that no freshly written word is read back.
	p0, p1, p2, p3 := st.s0, st.s1, st.s2, st.s3
	p4, p5, p6, p7 := st.s4, st.s5, st.s6, st.s7

	st.s0 = x86._mm_aesenc_si128(p7, x86._mm_xor_si128(p0, m0))
	st.s1 = x86._mm_aesenc_si128(p0, p1)
	st.s2 = x86._mm_aesenc_si128(p1, p2)
	st.s3 = x86._mm_aesenc_si128(p2, p3)
	st.s4 = x86._mm_aesenc_si128(p3, x86._mm_xor_si128(p4, m1))
	st.s5 = x86._mm_aesenc_si128(p4, p5)
	st.s6 = x86._mm_aesenc_si128(p5, p6)
	st.s7 = x86._mm_aesenc_si128(p6, p7)
}
|
||||
|
||||
// update_hw_256 advances the six-word state (`s0` through `s5`) by one
// round, mixing the message word `m` into `s0`.
//
// Every new word is `_mm_aesenc_si128(previous word, own word)`, with the
// previous word of `s0` being `s5`.  All inputs are the pre-round values.
@(private = "file", enable_target_feature = "sse2,aes")
update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) {
	// Snapshot the state so that no freshly written word is read back.
	p0, p1, p2 := st.s0, st.s1, st.s2
	p3, p4, p5 := st.s3, st.s4, st.s5

	st.s0 = x86._mm_aesenc_si128(p5, x86._mm_xor_si128(p0, m))
	st.s1 = x86._mm_aesenc_si128(p0, p1)
	st.s2 = x86._mm_aesenc_si128(p1, p2)
	st.s3 = x86._mm_aesenc_si128(p2, p3)
	st.s4 = x86._mm_aesenc_si128(p3, p4)
	st.s5 = x86._mm_aesenc_si128(p4, p5)
}
|
||||
|
||||
// absorb_hw_128l feeds the first 32 bytes of `ai` into the state as two
// 16-byte message words.
@(private = "file", enable_target_feature = "sse2,aes")
absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
	first := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
	second := intrinsics.unaligned_load((^x86.__m128i)(&ai[16]))
	update_hw_128l(st, first, second)
}
|
||||
|
||||
// absorb_hw_256 feeds the first 16 bytes of `ai` into the state as a
// single message word.
@(private = "file", enable_target_feature = "sse2,aes")
absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
	word := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
	update_hw_256(st, word)
}
|
||||
|
||||
// absorb_hw feeds `aad` into the state one rate-sized block at a time.
// A trailing partial block is zero-padded up to the rate before it is
// absorbed.
@(private, enable_target_feature = "sse2,aes")
absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
	rest := aad

	// Full blocks.
	switch st.rate {
	case _RATE_128L:
		for len(rest) >= _RATE_128L {
			absorb_hw_128l(st, rest)
			rest = rest[_RATE_128L:]
		}
	case _RATE_256:
		for len(rest) >= _RATE_256 {
			absorb_hw_256(st, rest)
			rest = rest[_RATE_256:]
		}
	}

	if len(rest) == 0 {
		return
	}

	// Zero-initialized scratch provides the padding.  It is not wiped
	// afterwards because the AAD is not confidential.
	padded: [_RATE_MAX]byte
	copy(padded[:], rest)
	switch st.rate {
	case _RATE_128L:
		absorb_hw_128l(st, padded[:])
	case _RATE_256:
		absorb_hw_256(st, padded[:])
	}
}
|
||||
|
||||
// z_hw_128l derives the two 16-byte words that the 128L encrypt/decrypt
// procedures XOR with the data:
//
//   z0 = s6 ^ s1 ^ (s2 & s3)
//   z1 = s2 ^ s5 ^ (s6 & s7)
//
// The state itself is only read.
@(private = "file", enable_target_feature = "sse2", require_results)
z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) {
	s2_and_s3 := x86._mm_and_si128(st.s2, st.s3)
	s6_and_s7 := x86._mm_and_si128(st.s6, st.s7)

	z0 := x86._mm_xor_si128(st.s6, x86._mm_xor_si128(st.s1, s2_and_s3))
	z1 := x86._mm_xor_si128(st.s2, x86._mm_xor_si128(st.s5, s6_and_s7))
	return z0, z1
}
|
||||
|
||||
// z_hw_256 derives the 16-byte word that the 256 encrypt/decrypt
// procedures XOR with the data:
//
//   z = s1 ^ s4 ^ s5 ^ (s2 & s3)
//
// The state itself is only read.
@(private = "file", enable_target_feature = "sse2", require_results)
z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i {
	s2_and_s3 := x86._mm_and_si128(st.s2, st.s3)
	inner := x86._mm_xor_si128(st.s5, s2_and_s3)
	middle := x86._mm_xor_si128(st.s4, inner)
	return x86._mm_xor_si128(st.s1, middle)
}
|
||||
|
||||
// enc_hw_128l encrypts one 32-byte block from `xi` into `ci` and absorbs
// the plaintext into the state.
//
// Both input words are loaded before anything is stored, so `ci` and `xi`
// may refer to the same memory.
@(private = "file", enable_target_feature = "sse2,aes")
enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
	// The keystream comes from the state as it is before the update.
	ks0, ks1 := z_hw_128l(st)

	pt0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0]))
	pt1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16]))
	update_hw_128l(st, pt0, pt1)

	ct0 := x86._mm_xor_si128(pt0, ks0)
	ct1 := x86._mm_xor_si128(pt1, ks1)
	intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), ct0)
	intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), ct1)
}
|
||||
|
||||
// enc_hw_256 encrypts one 16-byte block from `xi` into `ci` and absorbs
// the plaintext into the state.
//
// The input is loaded before the output is stored, so `ci` and `xi` may
// refer to the same memory.
@(private = "file", enable_target_feature = "sse2,aes")
enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
	// The keystream comes from the state as it is before the update.
	ks := z_hw_256(st)

	pt := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi)))
	update_hw_256(st, pt)

	ct := x86._mm_xor_si128(pt, ks)
	intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ct)
}
|
||||
|
||||
// enc_hw encrypts `src` into `dst` one rate-sized block at a time.
//
// A trailing partial block is zero-padded to a full block, encrypted in
// a scratch buffer, and only its first `len(remainder)` bytes are copied
// out to `dst`.
@(private, enable_target_feature = "sse2,aes")
enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
	ct, pt := dst, src

	// Full blocks.
	switch st.rate {
	case _RATE_128L:
		for len(pt) >= _RATE_128L {
			enc_hw_128l(st, ct, pt)
			ct, pt = ct[_RATE_128L:], pt[_RATE_128L:]
		}
	case _RATE_256:
		for len(pt) >= _RATE_256 {
			enc_hw_256(st, ct, pt)
			ct, pt = ct[_RATE_256:], pt[_RATE_256:]
		}
	}

	tail := len(pt)
	if tail == 0 {
		return
	}

	// Zero-initialized scratch provides the padding; the block is
	// encrypted in place.  It is not wiped afterwards ("Ciphertext is
	// not confidential").
	scratch: [_RATE_MAX]byte
	copy(scratch[:], pt)
	switch st.rate {
	case _RATE_128L:
		enc_hw_128l(st, scratch[:], scratch[:])
	case _RATE_256:
		enc_hw_256(st, scratch[:], scratch[:])
	}
	copy(ct, scratch[:tail])
}
|
||||
|
||||
// dec_hw_128l decrypts one 32-byte block from `ci` into `xi` and absorbs
// the recovered plaintext into the state.
//
// Both input words are loaded before anything is stored, so `xi` and `ci`
// may refer to the same memory.
@(private = "file", enable_target_feature = "sse2,aes")
dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
	// The keystream comes from the state as it is before the update.
	ks0, ks1 := z_hw_128l(st)

	ct0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0]))
	ct1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16]))
	pt0 := x86._mm_xor_si128(ct0, ks0)
	pt1 := x86._mm_xor_si128(ct1, ks1)

	// The state absorbs plaintext, never ciphertext.
	update_hw_128l(st, pt0, pt1)
	intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), pt0)
	intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), pt1)
}
|
||||
|
||||
// dec_hw_256 decrypts one 16-byte block from `ci` into `xi` and absorbs
// the recovered plaintext into the state.
//
// The input is loaded before the output is stored, so `xi` and `ci` may
// refer to the same memory.
@(private = "file", enable_target_feature = "sse2,aes")
dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
	// The keystream comes from the state as it is before the update.
	ks := z_hw_256(st)

	ct := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci)))
	pt := x86._mm_xor_si128(ct, ks)

	// The state absorbs plaintext, never ciphertext.
	update_hw_256(st, pt)
	intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), pt)
}
|
||||
|
||||
// dec_partial_hw_128l decrypts a final, shorter-than-rate ciphertext `cn`
// into `xn`, then absorbs the recovered plaintext, zero-padded to a full
// block, into the state.
//
// The scratch buffer holds plaintext, so it is wiped on return.
@(private = "file", enable_target_feature = "sse2,aes")
dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
	scratch: [_RATE_128L]byte
	defer crypto.zero_explicit(&scratch, size_of(scratch))

	ks0, ks1 := z_hw_128l(st)

	// Decrypt the zero-extended ciphertext as one full block.
	copy(scratch[:], cn)
	ct0 := intrinsics.unaligned_load((^x86.__m128i)(&scratch[0]))
	ct1 := intrinsics.unaligned_load((^x86.__m128i)(&scratch[16]))
	intrinsics.unaligned_store((^x86.__m128i)(&scratch[0]), x86._mm_xor_si128(ct0, ks0))
	intrinsics.unaligned_store((^x86.__m128i)(&scratch[16]), x86._mm_xor_si128(ct1, ks1))
	copy(xn, scratch[:])

	// Bytes past len(xn) are keystream XORed with padding rather than
	// plaintext; clear them so the state absorbs plaintext || zeroes.
	for off in len(xn) ..< _RATE_128L {
		scratch[off] = 0
	}
	v0 := intrinsics.unaligned_load((^x86.__m128i)(&scratch[0]))
	v1 := intrinsics.unaligned_load((^x86.__m128i)(&scratch[16]))
	update_hw_128l(st, v0, v1)
}
|
||||
|
||||
// dec_partial_hw_256 decrypts a final, shorter-than-rate ciphertext `cn`
// into `xn`, then absorbs the recovered plaintext, zero-padded to a full
// block, into the state.
//
// The scratch buffer holds plaintext, so it is wiped on return.
@(private = "file", enable_target_feature = "sse2,aes")
dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
	scratch: [_RATE_256]byte
	defer crypto.zero_explicit(&scratch, size_of(scratch))

	ks := z_hw_256(st)

	// Decrypt the zero-extended ciphertext as one full block.
	copy(scratch[:], cn)
	ct := intrinsics.unaligned_load((^x86.__m128i)(&scratch[0]))
	intrinsics.unaligned_store((^x86.__m128i)(&scratch[0]), x86._mm_xor_si128(ct, ks))
	copy(xn, scratch[:])

	// Bytes past len(xn) are keystream XORed with padding rather than
	// plaintext; clear them so the state absorbs plaintext || zeroes.
	for off in len(xn) ..< _RATE_256 {
		scratch[off] = 0
	}
	padded_pt := intrinsics.unaligned_load((^x86.__m128i)(&scratch[0]))
	update_hw_256(st, padded_pt)
}
|
||||
|
||||
// dec_hw decrypts `src` into `dst` one rate-sized block at a time, and
// hands a trailing partial block to the matching `dec_partial_hw_*`
// procedure.
@(private, enable_target_feature = "sse2,aes")
dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
	pt, ct := dst, src

	// Full blocks.
	switch st.rate {
	case _RATE_128L:
		for len(ct) >= _RATE_128L {
			dec_hw_128l(st, pt, ct)
			pt, ct = pt[_RATE_128L:], ct[_RATE_128L:]
		}
	case _RATE_256:
		for len(ct) >= _RATE_256 {
			dec_hw_256(st, pt, ct)
			pt, ct = pt[_RATE_256:], ct[_RATE_256:]
		}
	}

	if len(ct) == 0 {
		return
	}

	// Trailing partial block.
	switch st.rate {
	case _RATE_128L:
		dec_partial_hw_128l(st, pt, ct)
	case _RATE_256:
		dec_partial_hw_256(st, pt, ct)
	}
}
|
||||
|
||||
// finalize_hw runs the closing rounds and writes the authentication tag
// into `tag`.
//
// `ad_len` and `msg_len` are byte counts; each is multiplied by 8 and
// encoded as a little-endian 64-bit value to form the 16-byte length
// block.  That block is XORed with `s2` (128L) or `s3` (256) and fed
// through seven update rounds.
//
// `len(tag)` selects the output: `TAG_SIZE_128` stores the XOR of the two
// folded halves (16 bytes), `TAG_SIZE_256` stores both halves (32 bytes).
// For any other length nothing is written.
@(private, enable_target_feature = "sse2,aes")
finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
	lengths: [16]byte
	endian.unchecked_put_u64le(lengths[0:], u64(ad_len) * 8)
	endian.unchecked_put_u64le(lengths[8:], u64(msg_len) * 8)
	len_blk := intrinsics.unaligned_load((^x86.__m128i)(&lengths[0]))

	lo, hi: x86.__m128i = ---, ---
	switch st.rate {
	case _RATE_128L:
		m := x86._mm_xor_si128(st.s2, len_blk)
		for _ in 0 ..< 7 {
			update_hw_128l(st, m, m)
		}

		lo = x86._mm_xor_si128(x86._mm_xor_si128(x86._mm_xor_si128(st.s0, st.s1), st.s2), st.s3)
		hi = x86._mm_xor_si128(x86._mm_xor_si128(st.s4, st.s5), st.s6)
		// s7 is only folded into the long tag.
		if len(tag) == TAG_SIZE_256 {
			hi = x86._mm_xor_si128(hi, st.s7)
		}
	case _RATE_256:
		m := x86._mm_xor_si128(st.s3, len_blk)
		for _ in 0 ..< 7 {
			update_hw_256(st, m)
		}

		lo = x86._mm_xor_si128(x86._mm_xor_si128(st.s0, st.s1), st.s2)
		hi = x86._mm_xor_si128(x86._mm_xor_si128(st.s3, st.s4), st.s5)
	}

	switch len(tag) {
	case TAG_SIZE_128:
		folded := x86._mm_xor_si128(lo, hi)
		intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), folded)
	case TAG_SIZE_256:
		intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), lo)
		intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), hi)
	}
}
|
||||
|
||||
// reset_state_hw wipes the entire hardware state pointed to by `st`
// with `crypto.zero_explicit`.
@(private)
reset_state_hw :: proc "contextless" (st: ^State_HW) {
	crypto.zero_explicit(st, size_of(State_HW))
}
|
||||
@@ -1,30 +1,32 @@
|
||||
#+build amd64
|
||||
#+build amd64,arm32
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
import "core:math/bits"
|
||||
import "core:simd/x86"
|
||||
import "core:simd"
|
||||
|
||||
@(private)
|
||||
CTR_STRIDE_HW :: 4
|
||||
@(private)
|
||||
CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
|
||||
hw_ctx := ctx._impl.(Context_Impl_Hardware)
|
||||
|
||||
sks: [15]x86.__m128i = ---
|
||||
sks: [15]simd.u8x16 = ---
|
||||
for i in 0 ..= hw_ctx._num_rounds {
|
||||
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
|
||||
sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&hw_ctx._sk_exp_enc[i]))
|
||||
}
|
||||
|
||||
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
|
||||
ret := x86.__m128i{
|
||||
i64(intrinsics.byte_swap(hi)),
|
||||
i64(intrinsics.byte_swap(lo)),
|
||||
}
|
||||
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (simd.u8x16, u64, u64) {
|
||||
buf: [BLOCK_SIZE]byte = ---
|
||||
endian.unchecked_put_u64be(buf[0:], hi)
|
||||
endian.unchecked_put_u64be(buf[8:], lo)
|
||||
ret := intrinsics.unaligned_load((^simd.u8x16)(&buf))
|
||||
|
||||
hi, lo := hi, lo
|
||||
carry: u64
|
||||
@@ -46,42 +48,42 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
|
||||
nr_blocks := nr_blocks
|
||||
ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo
|
||||
|
||||
blks: [CTR_STRIDE_HW]x86.__m128i = ---
|
||||
blks: [CTR_STRIDE_HW]simd.u8x16 = ---
|
||||
for nr_blocks >= CTR_STRIDE_HW {
|
||||
#unroll for i in 0..< CTR_STRIDE_HW {
|
||||
blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
|
||||
blks[i] = simd.bit_xor(blks[i], sks[0])
|
||||
}
|
||||
#unroll for i in 1 ..= 9 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
switch hw_ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[10])
|
||||
}
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[12])
|
||||
}
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[14])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,23 +100,23 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
|
||||
for nr_blocks > 0 {
|
||||
blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
|
||||
|
||||
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
|
||||
blks[0] = simd.bit_xor(blks[0], sks[0])
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
switch hw_ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[10])
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[12])
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[14])
|
||||
}
|
||||
|
||||
xor_blocks_hw(dst, src, blks[:1])
|
||||
@@ -133,18 +135,18 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
|
||||
zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2")
|
||||
xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) {
|
||||
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
xor_blocks_hw :: proc(dst, src: []byte, blocks: []simd.u8x16) {
|
||||
#no_bounds_check {
|
||||
if src != nil {
|
||||
for i in 0 ..< len(blocks) {
|
||||
off := i * BLOCK_SIZE
|
||||
tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:])))
|
||||
blocks[i] = x86._mm_xor_si128(blocks[i], tmp)
|
||||
tmp := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[off:])))
|
||||
blocks[i] = simd.bit_xor(blocks[i], tmp)
|
||||
}
|
||||
}
|
||||
for i in 0 ..< len(blocks) {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -21,7 +21,7 @@ init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := DEFAULT_IMPLEMENTATION)
|
||||
encrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
|
||||
ensure(ctx._is_initialized)
|
||||
ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid dst size")
|
||||
ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid src size")
|
||||
ensure(len(src) == BLOCK_SIZE, "crypto/aes: invalid src size")
|
||||
|
||||
switch &impl in ctx._impl {
|
||||
case ct64.Context:
|
||||
@@ -35,7 +35,7 @@ encrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
|
||||
decrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
|
||||
ensure(ctx._is_initialized)
|
||||
ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid dst size")
|
||||
ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid src size")
|
||||
ensure(len(src) == BLOCK_SIZE, "crypto/aes: invalid src size")
|
||||
|
||||
switch &impl in ctx._impl {
|
||||
case ct64.Context:
|
||||
|
||||
59
core/crypto/aes/aes_ecb_hw.odin
Normal file
59
core/crypto/aes/aes_ecb_hw.odin
Normal file
@@ -0,0 +1,59 @@
|
||||
#+build amd64,arm32
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:simd"
|
||||
|
||||
// encrypt_block_hw encrypts the single 16-byte block at `src` into `dst`
// using the expanded encryption round keys in `ctx._sk_exp_enc`.
//
// Both slices are accessed through `raw_data`, so no length check is
// performed in this procedure.
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
	state := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))

	// Round key 0 is XORed in, then rounds 1 through 9 run for every key size.
	rk0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[0]))
	state = simd.bit_xor(state, rk0)
	#unroll for i in 1 ..= 9 {
		rk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))
		state = aes_hw.aesenc(state, rk)
	}

	// The remaining rounds depend on the round count; the last one
	// always uses `aesenclast`.
	switch ctx._num_rounds {
	case _aes.ROUNDS_128:
		rk_last := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[10]))
		state = aes_hw.aesenclast(state, rk_last)
	case _aes.ROUNDS_192:
		#unroll for i in 10 ..= 11 {
			rk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))
			state = aes_hw.aesenc(state, rk)
		}
		rk_last := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[12]))
		state = aes_hw.aesenclast(state, rk_last)
	case _aes.ROUNDS_256:
		#unroll for i in 10 ..= 13 {
			rk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))
			state = aes_hw.aesenc(state, rk)
		}
		rk_last := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[14]))
		state = aes_hw.aesenclast(state, rk_last)
	}

	intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), state)
}
|
||||
|
||||
// decrypt_block_hw decrypts the single 16-byte block at `src` into `dst`
// using the expanded decryption round keys in `ctx._sk_exp_dec`.
//
// Both slices are accessed through `raw_data`, so no length check is
// performed in this procedure.
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
	state := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))

	// Round key 0 is XORed in, then rounds 1 through 9 run for every key size.
	rk0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[0]))
	state = simd.bit_xor(state, rk0)
	#unroll for i in 1 ..= 9 {
		rk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))
		state = aes_hw.aesdec(state, rk)
	}

	// The remaining rounds depend on the round count; the last one
	// always uses `aesdeclast`.
	switch ctx._num_rounds {
	case _aes.ROUNDS_128:
		rk_last := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[10]))
		state = aes_hw.aesdeclast(state, rk_last)
	case _aes.ROUNDS_192:
		#unroll for i in 10 ..= 11 {
			rk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))
			state = aes_hw.aesdec(state, rk)
		}
		rk_last := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[12]))
		state = aes_hw.aesdeclast(state, rk_last)
	case _aes.ROUNDS_256:
		#unroll for i in 10 ..= 13 {
			rk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i]))
			state = aes_hw.aesdec(state, rk)
		}
		rk_last := intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[14]))
		state = aes_hw.aesdeclast(state, rk_last)
	}

	intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), state)
}
|
||||
@@ -1,58 +0,0 @@
|
||||
#+build amd64
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd/x86"
|
||||
|
||||
// encrypt_block_hw encrypts the single 16-byte block at `src` into `dst`
// using the expanded encryption round keys in `ctx._sk_exp_enc`.
//
// Both slices are accessed through `raw_data`, so no length check is
// performed in this procedure.
@(private, enable_target_feature = "sse2,aes")
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
	state := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))

	// Round key 0 is XORed in, then rounds 1 through 9 run for every key size.
	rk0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0]))
	state = x86._mm_xor_si128(state, rk0)
	#unroll for i in 1 ..= 9 {
		rk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
		state = x86._mm_aesenc_si128(state, rk)
	}

	// The remaining rounds depend on the round count; the last one
	// always uses `_mm_aesenclast_si128`.
	switch ctx._num_rounds {
	case _aes.ROUNDS_128:
		rk_last := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10]))
		state = x86._mm_aesenclast_si128(state, rk_last)
	case _aes.ROUNDS_192:
		#unroll for i in 10 ..= 11 {
			rk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
			state = x86._mm_aesenc_si128(state, rk)
		}
		rk_last := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12]))
		state = x86._mm_aesenclast_si128(state, rk_last)
	case _aes.ROUNDS_256:
		#unroll for i in 10 ..= 13 {
			rk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
			state = x86._mm_aesenc_si128(state, rk)
		}
		rk_last := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14]))
		state = x86._mm_aesenclast_si128(state, rk_last)
	}

	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), state)
}
|
||||
|
||||
// decrypt_block_hw decrypts the single 16-byte block at `src` into `dst`
// using the expanded decryption round keys in `ctx._sk_exp_dec`.
//
// Both slices are accessed through `raw_data`, so no length check is
// performed in this procedure.
@(private, enable_target_feature = "sse2,aes")
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
	state := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))

	// Round key 0 is XORed in, then rounds 1 through 9 run for every key size.
	rk0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0]))
	state = x86._mm_xor_si128(state, rk0)
	#unroll for i in 1 ..= 9 {
		rk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))
		state = x86._mm_aesdec_si128(state, rk)
	}

	// The remaining rounds depend on the round count; the last one
	// always uses `_mm_aesdeclast_si128`.
	switch ctx._num_rounds {
	case _aes.ROUNDS_128:
		rk_last := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10]))
		state = x86._mm_aesdeclast_si128(state, rk_last)
	case _aes.ROUNDS_192:
		#unroll for i in 10 ..= 11 {
			rk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))
			state = x86._mm_aesdec_si128(state, rk)
		}
		rk_last := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12]))
		state = x86._mm_aesdeclast_si128(state, rk_last)
	case _aes.ROUNDS_256:
		#unroll for i in 10 ..= 13 {
			rk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i]))
			state = x86._mm_aesdec_si128(state, rk)
		}
		rk_last := intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14]))
		state = x86._mm_aesdeclast_si128(state, rk_last)
	}

	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), state)
}
|
||||
@@ -4,6 +4,7 @@ import "core:bytes"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:crypto/_aes/ct64"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
|
||||
// GCM_IV_SIZE is the default size of the GCM IV in bytes.
|
||||
@@ -26,6 +27,10 @@ Context_GCM :: struct {
|
||||
|
||||
// init_gcm initializes a Context_GCM with the provided key.
|
||||
init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
|
||||
when aes_hw.HAS_GHASH {
|
||||
impl := aes_hw.is_ghash_supported() ? impl : .Portable
|
||||
|
||||
}
|
||||
init_impl(&ctx._impl, key, impl)
|
||||
ctx._is_initialized = true
|
||||
}
|
||||
@@ -65,7 +70,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context_GCM, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
#+build amd64
|
||||
#+build amd64,arm32
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
@(require) import "core:crypto/_aes/ct64"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
import "core:simd/x86"
|
||||
import "core:simd"
|
||||
|
||||
@(private)
|
||||
gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
|
||||
@@ -17,7 +18,11 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
|
||||
|
||||
// Note: Our GHASH implementation handles appending padding.
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], aad)
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], aad)
|
||||
}
|
||||
gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
|
||||
copy(tag, s[:])
|
||||
@@ -35,7 +40,11 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag:
|
||||
s: [_aes.GHASH_TAG_SIZE]byte
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], aad)
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], aad)
|
||||
}
|
||||
gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
|
||||
|
||||
@@ -71,18 +80,26 @@ init_ghash_hw :: proc(
|
||||
} else {
|
||||
// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
|
||||
// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
|
||||
hw_intel.ghash(j0[:], h[:], iv)
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(j0[:], h[:], iv)
|
||||
} else {
|
||||
ct64.ghash(j0[:], h[:], iv)
|
||||
}
|
||||
|
||||
tmp: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
|
||||
hw_intel.ghash(j0[:], h[:], tmp[:])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(j0[:], h[:], tmp[:])
|
||||
} else {
|
||||
ct64.ghash(j0[:], h[:], tmp[:])
|
||||
}
|
||||
}
|
||||
|
||||
// ECB encrypt j0, so that we can just XOR with the tag.
|
||||
encrypt_block_hw(ctx, j0_enc[:], j0[:])
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2")
|
||||
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
final_ghash_hw :: proc(
|
||||
s: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
h: ^[_aes.GHASH_KEY_SIZE]byte,
|
||||
@@ -94,14 +111,18 @@ final_ghash_hw :: proc(
|
||||
endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
|
||||
endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], blk[:])
|
||||
j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
|
||||
s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
|
||||
s_vec = x86._mm_xor_si128(s_vec, j0_vec)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], blk[:])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], blk[:])
|
||||
}
|
||||
j0_vec := intrinsics.unaligned_load((^simd.u8x16)(j0))
|
||||
s_vec := intrinsics.unaligned_load((^simd.u8x16)(s))
|
||||
s_vec = simd.bit_xor(s_vec, j0_vec)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(s), s_vec)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
|
||||
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
gctr_hw :: proc(
|
||||
ctx: ^Context_Impl_Hardware,
|
||||
dst: []byte,
|
||||
@@ -111,13 +132,13 @@ gctr_hw :: proc(
|
||||
iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
is_seal: bool,
|
||||
) #no_bounds_check {
|
||||
sks: [15]x86.__m128i = ---
|
||||
sks: [15]simd.u8x16 = ---
|
||||
for i in 0 ..= ctx._num_rounds {
|
||||
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
|
||||
sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))
|
||||
}
|
||||
|
||||
// Setup the counter block
|
||||
ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv))
|
||||
ctr_blk := intrinsics.unaligned_load((^simd.u8x16)(iv))
|
||||
ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1
|
||||
|
||||
src, dst := src, dst
|
||||
@@ -127,11 +148,15 @@ gctr_hw :: proc(
|
||||
// This results in an unreadable mess, so we opt for simplicity
|
||||
// as performance is adequate.
|
||||
|
||||
blks: [CTR_STRIDE_HW]x86.__m128i = ---
|
||||
blks: [CTR_STRIDE_HW]simd.u8x16 = ---
|
||||
nr_blocks := len(src) / BLOCK_SIZE
|
||||
for nr_blocks >= CTR_STRIDE_HW {
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
@@ -139,42 +164,46 @@ gctr_hw :: proc(
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
|
||||
blks[i] = simd.bit_xor(blks[i], sks[0])
|
||||
}
|
||||
#unroll for i in 1 ..= 9 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[10])
|
||||
}
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[12])
|
||||
}
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
blks[j] = aes_hw.aesenc(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
|
||||
blks[i] = aes_hw.aesenclast(blks[i], sks[14])
|
||||
}
|
||||
}
|
||||
|
||||
xor_blocks_hw(dst, src, blks[:])
|
||||
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
}
|
||||
|
||||
src = src[CTR_STRIDE_BYTES_HW:]
|
||||
@@ -186,28 +215,32 @@ gctr_hw :: proc(
|
||||
for n := len(src); n > 0; {
|
||||
l := min(n, BLOCK_SIZE)
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:l])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], src[:l])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], src[:l])
|
||||
}
|
||||
}
|
||||
|
||||
blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
|
||||
|
||||
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
|
||||
blks[0] = simd.bit_xor(blks[0], sks[0])
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[10])
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[12])
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
blks[0] = aes_hw.aesenc(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
|
||||
blks[0] = aes_hw.aesenclast(blks[0], sks[14])
|
||||
}
|
||||
|
||||
if l == BLOCK_SIZE {
|
||||
@@ -219,7 +252,11 @@ gctr_hw :: proc(
|
||||
copy(dst, blk[:l])
|
||||
}
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:l])
|
||||
when aes_hw.HAS_GHASH {
|
||||
aes_hw.ghash(s[:], h[:], dst[:l])
|
||||
} else {
|
||||
ct64.ghash(s[:], h[:], dst[:l])
|
||||
}
|
||||
}
|
||||
|
||||
dst = dst[l:]
|
||||
@@ -235,8 +272,17 @@ gctr_hw :: proc(
|
||||
// the compiler.
|
||||
//
|
||||
// src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity`
|
||||
@(private = "file", enable_target_feature = "sse4.1")
|
||||
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
|
||||
ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
|
||||
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^simd.u8x16, ctr: u32) -> (simd.u8x16, u32) {
|
||||
when ODIN_ENDIAN == .Little {
|
||||
ctr_be := intrinsics.byte_swap(ctr)
|
||||
} else {
|
||||
ctr_be := ctr
|
||||
}
|
||||
|
||||
ret := transmute(simd.u8x16)(
|
||||
simd.replace(transmute(simd.u32x4)(src^), 3, ctr_be)
|
||||
)
|
||||
|
||||
return ret, ctr + 1
|
||||
}
|
||||
18
core/crypto/aes/aes_impl_hw.odin
Normal file
18
core/crypto/aes/aes_impl_hw.odin
Normal file
@@ -0,0 +1,18 @@
|
||||
#+build amd64,arm64,arm32
|
||||
package aes
|
||||
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return aes_hw.is_supported()
|
||||
}
|
||||
|
||||
@(private)
|
||||
Context_Impl_Hardware :: aes_hw.Context
|
||||
|
||||
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
|
||||
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
|
||||
aes_hw.init(ctx, key)
|
||||
}
|
||||
@@ -1,10 +1,12 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package aes
|
||||
|
||||
@(private = "file")
|
||||
ERR_HW_NOT_SUPPORTED :: "crypto/aes: hardware implementation unsupported"
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AES
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return false
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
#+build amd64
|
||||
package aes
|
||||
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return hw_intel.is_supported()
|
||||
}
|
||||
|
||||
@(private)
|
||||
Context_Impl_Hardware :: hw_intel.Context
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
|
||||
hw_intel.init(ctx, key)
|
||||
}
|
||||
622
core/crypto/argon2id/argon2id.odin
Normal file
622
core/crypto/argon2id/argon2id.odin
Normal file
@@ -0,0 +1,622 @@
|
||||
/*
|
||||
package argon2id implements the Argon2id password hashing algorithm.
|
||||
|
||||
See: [[ https://datatracker.ietf.org/doc/rfc9106/ ]]
|
||||
*/
|
||||
package argon2id
|
||||
|
||||
import "core:crypto/blake2b"
|
||||
import "core:encoding/endian"
|
||||
import "core:math/bits"
|
||||
import "core:mem"
|
||||
|
||||
// Implementation based on the RFC, Monocypher (CC0-1.0), and the reference
|
||||
// code (CC0-1.0).
|
||||
|
||||
// MAX_INPUT_SIZE is the maximum size of the various inputs (password,
|
||||
// salt, secret, ad) in bytes.
|
||||
MAX_INPUT_SIZE :: (1 << 32) - 1
|
||||
|
||||
// MIN_PARALLELISM is the minimum allowed parallelism.
|
||||
MIN_PARALLELISM :: 1
|
||||
// MAX_PARALLELISM is the maximum allowed parallelism.
|
||||
MAX_PARALLELISM :: (1 << 24) - 1
|
||||
|
||||
// MIN_TAG_SIZE is the minimum digest size in bytes.
|
||||
MIN_TAG_SIZE :: 4
|
||||
// MAX_TAG_SIZE is the maximum digest size in bytes.
|
||||
MAX_TAG_SIZE :: (1 << 32) - 1
|
||||
|
||||
// RECOMMENDED_TAG_SIZE is the recommended tag size in bytes.
RECOMMENDED_TAG_SIZE :: 32 // 256-bits
// RECOMMENTED_TAG_SIZE is a misspelled alias of RECOMMENDED_TAG_SIZE,
// retained so that existing callers continue to compile.
RECOMMENTED_TAG_SIZE :: RECOMMENDED_TAG_SIZE
|
||||
// RECOMMENDED_SALT_SIZE is the recommended salt size in bytes.
|
||||
RECOMMENDED_SALT_SIZE :: 16 // 128-bits
|
||||
|
||||
@(private)
|
||||
V_RFC9106 :: 0x13
|
||||
@(private)
|
||||
Y_ID :: 0x02
|
||||
@(private)
|
||||
BLOCK_SIZE_BYTES :: 1024
|
||||
@(private)
|
||||
BLOCK_SIZE_U64 :: 128
|
||||
|
||||
// PARAMS_RFC9106 is the first recommended "uniformly safe" parameter set
|
||||
// per RFC 9106.
|
||||
@(rodata)
|
||||
PARAMS_RFC9106 := Parameters{
|
||||
memory_size = 2 * 1024 * 1024, // 2 GiB
|
||||
passes = 1,
|
||||
parallelism = 4,
|
||||
}
|
||||
|
||||
// PARAMS_RFC9106_SMALL is the second recommended "uniformly safe" parameter
|
||||
// set per RFC 9106 tailored for memory constrained environments.
|
||||
@(rodata)
|
||||
PARAMS_RFC9106_SMALL := Parameters{
|
||||
memory_size = 64 * 1024, // 64 MiB
|
||||
passes = 3,
|
||||
parallelism = 4,
|
||||
}
|
||||
|
||||
// PARAMS_OWASP is one of the recommended parameter sets from the OWASP
|
||||
// Password Storage Cheat Sheet (as of 2026/02). The cheat sheet contains
|
||||
// additional variations to this parameter set with various trade-offs
|
||||
// between `memory_size` and `passes` that are intended to provide
|
||||
// equivalent security.
|
||||
//
|
||||
// See: [[ https://cheatsheetseries.owasp.org/cheatsheets/Password_Storage_Cheat_Sheet.html ]]
|
||||
@(rodata)
|
||||
PARAMS_OWASP := Parameters{
|
||||
memory_size = 19 * 1024, // 19 MiB
|
||||
passes = 2,
|
||||
parallelism = 1,
|
||||
}
|
||||
|
||||
// PARAMS_OWASP_SMALL is equivalent in strength to PARAMS_OWASP, but
|
||||
// trades off less memory use for more CPU usage.
|
||||
@(rodata)
|
||||
PARAMS_OWASP_SMALL := Parameters{
|
||||
memory_size = 7 * 1024, // 7 MiB
|
||||
passes = 5,
|
||||
parallelism = 1,
|
||||
}
|
||||
|
||||
// Parameters is an Argon2id parameter set.
|
||||
Parameters :: struct {
|
||||
memory_size: u32, // m (KiB)
|
||||
passes: u32, // t
|
||||
parallelism: u32, // p
|
||||
}
|
||||
|
||||
@(private)
|
||||
Block :: [BLOCK_SIZE_U64]u64
|
||||
|
||||
// derive invokes Argon2id with the specified parameter set and inputs,
|
||||
// and outputs the derived key to dst.
|
||||
@(require_results)
|
||||
derive :: proc(
|
||||
parameters: ^Parameters,
|
||||
password: []byte, // P
|
||||
salt: []byte, // S
|
||||
dst: []byte,
|
||||
secret: []byte = nil, // K (aka `pepper`)
|
||||
ad: []byte = nil, // X
|
||||
sanitize := true,
|
||||
allocator := context.allocator, // Not temp as this can be large.
|
||||
) -> mem.Allocator_Error #no_bounds_check {
|
||||
if u64(len(password)) > MAX_INPUT_SIZE {
|
||||
panic("crypto/argon2id: invalid password size")
|
||||
}
|
||||
if u64(len(salt)) > MAX_INPUT_SIZE {
|
||||
panic("crypto/argon2id: invalid salt size")
|
||||
}
|
||||
if u64(len(secret)) > MAX_INPUT_SIZE {
|
||||
panic("crypto/argon2id: invalid secret size")
|
||||
}
|
||||
if u64(len(ad)) > MAX_INPUT_SIZE {
|
||||
panic("crypto/argon2id: invalid ad size")
|
||||
}
|
||||
if l := u64(len(dst)); l > MAX_TAG_SIZE || l < MIN_TAG_SIZE {
|
||||
panic("crypto/argon2id: invalid dst size")
|
||||
}
|
||||
|
||||
p, t, m := parameters.parallelism, parameters.passes, u64(parameters.memory_size)
|
||||
if p < MIN_PARALLELISM || p > MAX_PARALLELISM {
|
||||
panic("crypto/argon2id: invalid parallelism")
|
||||
}
|
||||
if t < 1 {
|
||||
panic("crypto/argon2id: invalid passes")
|
||||
}
|
||||
if m < 8 * u64(p) {
|
||||
panic("crypto/argon2id: insufficient memory size")
|
||||
}
|
||||
if m * BLOCK_SIZE_BYTES > u64(max(int)) {
|
||||
panic("crypto/argon2id: excessive memory size")
|
||||
}
|
||||
|
||||
// Allocate the memory as m' 1024-byte blocks, where m' is derived as:
|
||||
// m' = 4 * p * floor (m / 4p)
|
||||
//
|
||||
// For p lanes, the memory is organized in a matrix B[i][j] of
|
||||
// blocks with p rows (lanes) and q = m' / p columns.
|
||||
m_ := 4 * u64(p) * (m / u64(4 * p))
|
||||
b := mem.alloc_bytes_non_zeroed(
|
||||
int(m_) * BLOCK_SIZE_BYTES,
|
||||
alignment = mem.DEFAULT_PAGE_SIZE,
|
||||
allocator = allocator,
|
||||
) or_return
|
||||
defer delete(b, allocator)
|
||||
|
||||
block_buf: [BLOCK_SIZE_BYTES]byte = ---
|
||||
|
||||
blocks := ([^]Block)(raw_data(b))[:m_]
|
||||
segment_size := u32(m_ / u64(p) / 4)
|
||||
lane_size := segment_size * 4
|
||||
|
||||
// Establish H_0 as the 64-byte value as shown below. If K, X, or S
|
||||
// has zero length, it is just absent, but its length field remains.
|
||||
//
|
||||
// H_0 = H^(64)(LE32(p) || LE32(T) || LE32(m) || LE32(t) ||
|
||||
// LE32(v) || LE32(y) || LE32(length(P)) || P ||
|
||||
// LE32(length(S)) || S || LE32(length(K)) || K ||
|
||||
// LE32(length(X)) || X)
|
||||
{
|
||||
ctx: blake2b.Context
|
||||
blake2b.init(&ctx)
|
||||
|
||||
blake2b_update_u32le(&ctx, u32(p))
|
||||
blake2b_update_u32le(&ctx, u32(len(dst)))
|
||||
blake2b_update_u32le(&ctx, parameters.memory_size)
|
||||
blake2b_update_u32le(&ctx, t)
|
||||
blake2b_update_u32le(&ctx, V_RFC9106)
|
||||
blake2b_update_u32le(&ctx, Y_ID)
|
||||
blake2b_update_u32le(&ctx, u32(len(password)))
|
||||
blake2b.update(&ctx, password)
|
||||
blake2b_update_u32le(&ctx, u32(len(salt)))
|
||||
blake2b.update(&ctx, salt)
|
||||
blake2b_update_u32le(&ctx, u32(len(secret)))
|
||||
blake2b.update(&ctx, secret)
|
||||
blake2b_update_u32le(&ctx, u32(len(ad)))
|
||||
blake2b.update(&ctx, ad)
|
||||
|
||||
h_0: [blake2b.DIGEST_SIZE+8]byte
|
||||
blake2b.final(&ctx, h_0[:blake2b.DIGEST_SIZE])
|
||||
|
||||
// Compute B[i][0] for all i ranging from (and including) 0 to (not
|
||||
// including) p.
|
||||
//
|
||||
// B[i][0] = H'^(1024)(H_0 || LE32(0) || LE32(i))
|
||||
//
|
||||
// Compute B[i][1] for all i ranging from (and including) 0 to (not
|
||||
// including) p.
|
||||
//
|
||||
// B[i][1] = H'^(1024)(H_0 || LE32(1) || LE32(i))
|
||||
for l in u32(0) ..< p {
|
||||
for i in u32(0) ..< 2 {
|
||||
endian.unchecked_put_u32le(h_0[blake2b.DIGEST_SIZE:], i) // LE32({0,1})
|
||||
endian.unchecked_put_u32le(h_0[blake2b.DIGEST_SIZE+4:], l) // LE32(i)
|
||||
h_prime(block_buf[:], h_0[:])
|
||||
blk := &blocks[l * lane_size + i]
|
||||
for j in 0 ..< BLOCK_SIZE_U64 {
|
||||
blk[j] = endian.unchecked_get_u64le(block_buf[j*8:])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mem.zero_explicit(&h_0, size_of(h_0)) // No longer needed.
|
||||
}
|
||||
|
||||
// Compute B[i][j] for all i ranging from (and including) 0 to (not
|
||||
// including) p and for all j ranging from (and including) 2 to (not
|
||||
// including) q. The computation MUST proceed slicewise
|
||||
// (Section 3.4): first, blocks from slice 0 are computed for all
|
||||
// lanes (in an arbitrary order of lanes), then blocks from slice 1
|
||||
// are computed, etc. The block indices l and z are determined for
|
||||
// each i, j differently for Argon2d, Argon2i, and Argon2id.
|
||||
//
|
||||
// B[i][j] = G(B[i][j-1], B[l][z])
|
||||
//
|
||||
// If the number of passes t is larger than 1, we repeat step 5. We
|
||||
// compute B[i][0] and B[i][j] for all i ranging from (and including)
|
||||
// 0 to (not including) p and for all j ranging from (and including)
|
||||
// 1 to (not including) q. However, blocks are computed differently
|
||||
// as the old value is XORed with the new one:
|
||||
//
|
||||
// B[i][0] = G(B[i][q-1], B[l][z]) XOR B[i][0];
|
||||
// B[i][j] = G(B[i][j-1], B[l][z]) XOR B[i][j].
|
||||
constant_time := true // Start with constant time indexing.
|
||||
tmp, index_block: Block = ---, ---
|
||||
for pass in u32(0) ..< t {
|
||||
for slice in u32(0) ..< 4 {
|
||||
// The first slice of the first pass has blocks 0 and 1
|
||||
// pre-filled.
|
||||
pass_offset: u32 = pass == 0 && slice == 0 ? 2 : 0
|
||||
slice_offset := slice * segment_size
|
||||
|
||||
// 3.4.1.3. Argon2id
|
||||
//
|
||||
// If the pass number is 0 and the slice number is 0 or 1, then compute
|
||||
// J_1 and J_2 as for Argon2i, else compute J_1 and J_2 as for Argon2d.
|
||||
if slice == 2 {
|
||||
constant_time = false
|
||||
}
|
||||
|
||||
// Each segment can be processed in parallel, as long as
|
||||
// each iteration of the loop completes before proceeding
|
||||
// to the next. For simplicity we do this in serial
|
||||
// instead of using threads.
|
||||
for segment in u32(0) ..< u32(p) {
|
||||
index_ctr: u64 = 1
|
||||
for block in pass_offset ..< segment_size {
|
||||
// Current and previous blocks (indexes, not pointers)
|
||||
lane_offset := segment * lane_size
|
||||
segment_start := lane_offset + slice_offset
|
||||
current := segment_start + block
|
||||
previous := segment_start - 1
|
||||
switch {
|
||||
case block == 0 && slice_offset == 0:
|
||||
previous += lane_size
|
||||
case:
|
||||
previous += block
|
||||
}
|
||||
|
||||
index_seed: u64
|
||||
if constant_time {
|
||||
// 3.4.1.2. Argon2i
|
||||
//
|
||||
// For each segment, we do the following. First, we compute the value Z
|
||||
// as:
|
||||
//
|
||||
// Z= ( LE64(r) || LE64(l) || LE64(sl) || LE64(m') ||
|
||||
// LE64(t) || LE64(y) )
|
||||
//
|
||||
// Figure 11: Input to Compute J1,J2 in Argon2i
|
||||
//
|
||||
// where
|
||||
//
|
||||
// r: the pass number
|
||||
// l: the lane number
|
||||
// sl: the slice number
|
||||
// m': the total number of memory blocks
|
||||
// t: the total number of passes
|
||||
// y: the Argon2 type (0 for Argon2d, 1 for Argon2i, 2 for Argon2id)
|
||||
//
|
||||
// Then we compute:
|
||||
//
|
||||
// q/(128*SL) 1024-byte values
|
||||
// G(ZERO(1024),G(ZERO(1024),
|
||||
// Z || LE64(1) || ZERO(968) )),
|
||||
// G(ZERO(1024),G(ZERO(1024),
|
||||
// Z || LE64(2) || ZERO(968) )),... ,
|
||||
// G(ZERO(1024),G(ZERO(1024),
|
||||
// Z || LE64(q/(128*SL)) || ZERO(968) )),
|
||||
//
|
||||
// which are partitioned into q/(SL) 8-byte values X, which are viewed
|
||||
// as X1||X2 and converted to J_1=int32(X1) and J_2=int32(X2).
|
||||
//
|
||||
// The values r, l, sl, m', t, y, and i are represented as 8 bytes in
|
||||
// little endian.
|
||||
if block == pass_offset || (block % 128) == 0 {
|
||||
mem.zero(&index_block, size_of(index_block))
|
||||
index_block[0] = u64(pass)
|
||||
index_block[1] = u64(segment)
|
||||
index_block[2] = u64(slice)
|
||||
index_block[3] = u64(lane_size * p)
|
||||
index_block[4] = u64(t) // passes
|
||||
index_block[5] = Y_ID
|
||||
index_block[6] = index_ctr
|
||||
index_ctr += 1
|
||||
|
||||
copy(tmp[:], index_block[:])
|
||||
g_rounds(&index_block)
|
||||
xor_block(&index_block, &tmp)
|
||||
copy(tmp[:], index_block[:])
|
||||
g_rounds(&index_block)
|
||||
xor_block(&index_block, &tmp)
|
||||
}
|
||||
index_seed = index_block[block % 128]
|
||||
} else {
|
||||
// 3.4.1.1. Argon2d
|
||||
//
|
||||
// J_1 is given by the first 32 bits of block B[i][j-1], while J_2 is
|
||||
// given by the next 32 bits of block B[i][j-1]:
|
||||
//
|
||||
// J_1 = int32(extract(B[i][j-1], 0))
|
||||
// J_2 = int32(extract(B[i][j-1], 1))
|
||||
//
|
||||
// Figure 10: Deriving J1,J2 in Argon2d
|
||||
index_seed = blocks[previous][0]
|
||||
}
|
||||
|
||||
// 3.4.2. Mapping J_1 and J_2 to Reference Block Index [l][z]
|
||||
//
|
||||
// The value of l = J_2 mod p gives the index of the lane from which the
|
||||
// block will be taken. For the first pass (r=0) and the first slice
|
||||
// (sl=0), the block is taken from the current lane.
|
||||
//
|
||||
// The set W contains the indices that are referenced according to the
|
||||
// following rules:
|
||||
//
|
||||
// 1. If l is the current lane, then W includes the indices of all
|
||||
// blocks in the last SL - 1 = 3 segments computed and finished, as
|
||||
// well as the blocks computed in the current segment in the current
|
||||
// pass excluding B[i][j-1].
|
||||
//
|
||||
// 2. If l is not the current lane, then W includes the indices of all
|
||||
// blocks in the last SL - 1 = 3 segments computed and finished in
|
||||
// lane l. If B[i][j] is the first block of a segment, then the
|
||||
// very last index from W is excluded.
|
||||
//
|
||||
// Then take a block from W with a nonuniform distribution over [0, |W|)
|
||||
// using the following mapping:
|
||||
//
|
||||
// J_1 -> |W|(1 - J_1^2 / 2^(64))
|
||||
//
|
||||
// Figure 12: Computing J1
|
||||
//
|
||||
// To avoid floating point computation, the following approximation is
|
||||
// used:
|
||||
//
|
||||
// x = J_1^2 / 2^(32)
|
||||
// y = (|W| * x) / 2^(32)
|
||||
// zz = |W| - 1 - y
|
||||
//
|
||||
// Figure 13: Computing J1, Part 2
|
||||
//
|
||||
// Then take the zz-th index from W; it will be the z value for the
|
||||
// reference block index [l][z].
|
||||
next_slice: u32 = ((slice + 1) % 4) * segment_size
|
||||
window_start, nb_segments: u32
|
||||
lane := u32(index_seed >> 32) % p
|
||||
switch {
|
||||
case pass == 0:
|
||||
nb_segments = slice
|
||||
if slice == 0 {
|
||||
lane = segment
|
||||
}
|
||||
case:
|
||||
window_start = next_slice
|
||||
nb_segments = 3
|
||||
}
|
||||
window_size := nb_segments * segment_size
|
||||
if lane == segment {
|
||||
window_size += block - 1
|
||||
} else if block == 0 {
|
||||
window_size += ~u32(0)
|
||||
}
|
||||
|
||||
j1 := index_seed & 0xffffffff
|
||||
x := (j1 * j1) >> 32
|
||||
y := (u64(window_size) * x) >> 32
|
||||
z := (u64(window_size) - 1) - y
|
||||
ref := u32((u64(window_start) + z) % u64(lane_size))
|
||||
reference: u32 = lane * lane_size + ref
|
||||
|
||||
copy(tmp[:], blocks[previous][:])
|
||||
xor_block(&tmp, &blocks[reference])
|
||||
if pass == 0 {
|
||||
copy(blocks[current][:], tmp[:])
|
||||
} else {
|
||||
xor_block(&blocks[current], &tmp)
|
||||
}
|
||||
g_rounds(&tmp)
|
||||
xor_block(&blocks[current], &tmp)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
mem.zero_explicit(&tmp, size_of(tmp))
|
||||
mem.zero_explicit(&index_block, size_of(index_block))
|
||||
|
||||
// After t steps have been iterated, the final block C is computed
|
||||
// as the XOR of the last column:
|
||||
//
|
||||
// C = B[0][q-1] XOR B[1][q-1] XOR ... XOR B[p-1][q-1]
|
||||
idx := lane_size - 1
|
||||
last_block := &blocks[idx]
|
||||
for _ in 1 ..< p {
|
||||
idx += lane_size
|
||||
next_block := &blocks[idx]
|
||||
xor_block(next_block, last_block)
|
||||
last_block = next_block
|
||||
}
|
||||
|
||||
for v, i in last_block {
|
||||
endian.unchecked_put_u64le(block_buf[i*8:], v)
|
||||
}
|
||||
|
||||
// The output tag is computed as H'^T(C).
|
||||
h_prime(dst, block_buf[:])
|
||||
mem.zero_explicit(&block_buf, size_of(block_buf))
|
||||
|
||||
// Sanitize the working memory. While the RFC implies that this is
|
||||
// optional ("enable the memory-wiping option in the library call"),
|
||||
// the reference code defaults to enabling it.
|
||||
//
|
||||
// An opt-out is provided, as this can get somewhat expensive when
|
||||
// m gets large.
|
||||
if sanitize {
|
||||
mem.zero_explicit(raw_data(b), len(b))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// xor_block XORs every 64-bit word of src into the corresponding word
// of dst, in place (dst[i] = dst[i] XOR src[i]).
@(private)
xor_block :: #force_inline proc(dst, src: ^Block) {
	for idx in 0 ..< BLOCK_SIZE_U64 {
		dst[idx] ~= src[idx]
	}
}
|
||||
|
||||
@(private)
|
||||
blake2b_update_u32le :: #force_inline proc(ctx: ^blake2b.Context, i: u32) {
|
||||
tmp: [4]byte = ---
|
||||
endian.unchecked_put_u32le(tmp[:], i)
|
||||
blake2b.update(ctx, tmp[:])
|
||||
mem.zero_explicit(&tmp, size_of(tmp)) // Probably overkill.
|
||||
}
|
||||
|
||||
// 3.3. Variable-Length Hash Function H'
|
||||
//
|
||||
// Let V_i be a 64-byte block and W_i be its first 32 bytes. Then we
|
||||
// define function H' as follows:
|
||||
//
|
||||
// if T <= 64
|
||||
// H'^T(A) = H^T(LE32(T)||A)
|
||||
// else
|
||||
// r = ceil(T/32)-2
|
||||
// V_1 = H^(64)(LE32(T)||A)
|
||||
// V_2 = H^(64)(V_1)
|
||||
// ...
|
||||
// V_r = H^(64)(V_{r-1})
|
||||
// V_{r+1} = H^(T-32*r)(V_{r})
|
||||
// H'^T(X) = W_1 || W_2 || ... || W_r || V_{r+1}
|
||||
//
|
||||
// Figure 8: Function H' for Tag and Initial Block Computations
|
||||
@(private)
|
||||
h_prime :: proc(dst, src: []byte) {
|
||||
t := len(dst)
|
||||
ctx: blake2b.Context
|
||||
blake2b.init(&ctx, min(t, blake2b.DIGEST_SIZE))
|
||||
blake2b_update_u32le(&ctx, u32(t))
|
||||
blake2b.update(&ctx, src)
|
||||
blake2b.final(&ctx, dst)
|
||||
|
||||
if t > 64 {
|
||||
r := u32((u64(t) + 31) >> 5) - 2
|
||||
i: u32 = 1
|
||||
off_in := 0
|
||||
off_out := 32
|
||||
for i < r {
|
||||
blake2b.init(&ctx, blake2b.DIGEST_SIZE)
|
||||
blake2b.update(&ctx, dst[off_in:off_in+64])
|
||||
blake2b.final(&ctx, dst[off_out:])
|
||||
i += 1
|
||||
off_in += 32
|
||||
off_out += 32
|
||||
}
|
||||
blake2b.init(&ctx, t - int(32 * r))
|
||||
blake2b.update(&ctx, dst[off_in:off_in+64])
|
||||
blake2b.final(&ctx, dst[off_out:])
|
||||
}
|
||||
}
|
||||
|
||||
// GB(a, b, c, d) is defined as follows:
|
||||
//
|
||||
// a = (a + b + 2 * trunc(a) * trunc(b)) mod 2^(64)
|
||||
// d = (d XOR a) >>> 32
|
||||
// c = (c + d + 2 * trunc(c) * trunc(d)) mod 2^(64)
|
||||
// b = (b XOR c) >>> 24
|
||||
//
|
||||
// a = (a + b + 2 * trunc(a) * trunc(b)) mod 2^(64)
|
||||
// d = (d XOR a) >>> 16
|
||||
// c = (c + d + 2 * trunc(c) * trunc(d)) mod 2^(64)
|
||||
// b = (b XOR c) >>> 63
|
||||
//
|
||||
// Figure 19: Details of GB
|
||||
//
|
||||
// The modular additions in GB are combined with 64-bit multiplications.
|
||||
// Multiplications are the only difference from the original BLAKE2b
|
||||
// design. This choice is done to increase the circuit depth and thus
|
||||
// the running time of ASIC implementations, while having roughly the
|
||||
// same running time on CPUs thanks to parallelism and pipelining.
|
||||
// gb applies the two half-rounds of the GB mixing function to the four
// words (a, b, c, d) and returns the updated words in the same order.
@(private,require_results)
gb :: #force_inline proc(a, b, c, d: u64) -> (u64, u64, u64, u64) {
	// Work on local copies; the parameters themselves are immutable.
	wa, wb, wc, wd := a, b, c, d

	// mul_add(x, y) = x + y + 2 * trunc(x) * trunc(y) mod 2^64, where
	// trunc keeps only the low 32 bits of its argument.
	mul_add := #force_inline proc(x, y: u64) -> u64 {
		lo_x := u64(u32(x))
		lo_y := u64(u32(y))
		return x + y + ((lo_x * lo_y) << 1)
	}

	// First half: right-rotations by 32 and 24, expressed as the
	// equivalent left-rotations.
	wa = mul_add(wa, wb)
	wd = bits.rotate_left64(wd ~ wa, 32) // >>> 32
	wc = mul_add(wc, wd)
	wb = bits.rotate_left64(wb ~ wc, 40) // >>> 24

	// Second half: right-rotations by 16 and 63.
	wa = mul_add(wa, wb)
	wd = bits.rotate_left64(wd ~ wa, 48) // >>> 16
	wc = mul_add(wc, wd)
	wb = bits.rotate_left64(wb ~ wc, 1) // >>> 63

	return wa, wb, wc, wd
}
|
||||
|
||||
// 3.6. Permutation P
|
||||
//
|
||||
// Permutation P is based on the round function of BLAKE2b. The eight
|
||||
// 16-byte inputs S_0, S_1, ... , S_7 are viewed as a 4x4 matrix of
|
||||
// 64-bit words, where S_i = (v_{2*i+1} || v_{2*i}):
|
||||
//
|
||||
// v_0 v_1 v_2 v_3
|
||||
// v_4 v_5 v_6 v_7
|
||||
// v_8 v_9 v_10 v_11
|
||||
// v_12 v_13 v_14 v_15
|
||||
//
|
||||
// Figure 17: Matrix Element Labeling
|
||||
//
|
||||
// It works as follows:
|
||||
//
|
||||
// GB(v_0, v_4, v_8, v_12)
|
||||
// GB(v_1, v_5, v_9, v_13)
|
||||
// GB(v_2, v_6, v_10, v_14)
|
||||
// GB(v_3, v_7, v_11, v_15)
|
||||
//
|
||||
// GB(v_0, v_5, v_10, v_15)
|
||||
// GB(v_1, v_6, v_11, v_12)
|
||||
// GB(v_2, v_7, v_8, v_13)
|
||||
// GB(v_3, v_4, v_9, v_14)
|
||||
//
|
||||
// Figure 18: Feeding Matrix Elements to GB
|
||||
@(private,require_results)
|
||||
perm_p :: #force_inline proc(v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15: u64) -> (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64) {
|
||||
v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15 := v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15
|
||||
|
||||
v_0, v_4, v_8, v_12 = gb(v_0, v_4, v_8, v_12)
|
||||
v_1, v_5, v_9, v_13 = gb(v_1, v_5, v_9, v_13)
|
||||
v_2, v_6, v_10, v_14 = gb(v_2, v_6, v_10, v_14)
|
||||
v_3, v_7, v_11, v_15 = gb(v_3, v_7, v_11, v_15)
|
||||
|
||||
v_0, v_5, v_10, v_15 = gb(v_0, v_5, v_10, v_15)
|
||||
v_1, v_6, v_11, v_12 = gb(v_1, v_6, v_11, v_12)
|
||||
v_2, v_7, v_8, v_13 = gb(v_2, v_7, v_8, v_13)
|
||||
v_3, v_4, v_9, v_14 = gb(v_3, v_4, v_9, v_14)
|
||||
|
||||
return v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15
|
||||
}
|
||||
|
||||
// 3.5. Compression Function G
|
||||
//
|
||||
// The compression function G is built upon the BLAKE2b-based
|
||||
// transformation P. P operates on the 128-byte input, which can be
|
||||
// viewed as eight 16-byte registers:
|
||||
//
|
||||
// P(A_0, A_1, ... ,A_7) = (B_0, B_1, ... ,B_7)
|
||||
//
|
||||
// Figure 14: Blake Round Function P
|
||||
//
|
||||
// The compression function G(X, Y) operates on two 1024-byte blocks X
|
||||
// and Y. It first computes R = X XOR Y. Then R is viewed as an 8x8
|
||||
// matrix of 16-byte registers R_0, R_1, ... , R_63. Then P is first
|
||||
// applied to each row, and then to each column to get Z:
|
||||
//
|
||||
// ( Q_0, Q_1, Q_2, ... , Q_7) <- P( R_0, R_1, R_2, ... , R_7)
|
||||
// ( Q_8, Q_9, Q_10, ... , Q_15) <- P( R_8, R_9, R_10, ... , R_15)
|
||||
// ...
|
||||
// (Q_56, Q_57, Q_58, ... , Q_63) <- P(R_56, R_57, R_58, ... , R_63)
|
||||
// ( Z_0, Z_8, Z_16, ... , Z_56) <- P( Q_0, Q_8, Q_16, ... , Q_56)
|
||||
// ( Z_1, Z_9, Z_17, ... , Z_57) <- P( Q_1, Q_9, Q_17, ... , Q_57)
|
||||
// ...
|
||||
// ( Z_7, Z_15, Z 23, ... , Z_63) <- P( Q_7, Q_15, Q_23, ... , Q_63)
|
||||
//
|
||||
// Figure 15: Core of Compression Function G
|
||||
@(private)
|
||||
g_rounds :: proc(b: ^Block) {
|
||||
for i := 0; i < 128; i += 16 {
|
||||
b[i], b[i+1], b[i+2], b[i+3], b[i+4], b[i+5], b[i+6], b[i+7], b[i+8], b[i+9], b[i+10], b[i+11], b[i+12], b[i+13], b[i+14], b[i+15] = perm_p(b[i], b[i+1], b[i+2], b[i+3], b[i+4], b[i+5], b[i+6], b[i+7], b[i+8], b[i+9], b[i+10], b[i+11], b[i+12], b[i+13], b[i+14], b[i+15])
|
||||
}
|
||||
for i := 0; i < 16; i += 2 {
|
||||
b[i], b[i+1], b[i+16], b[i+17], b[i+32], b[i+33], b[i+48], b[i+49], b[i+64], b[i+65], b[i+80], b[i+81], b[i+96], b[i+97], b[i+112], b[i+113] = perm_p(b[i], b[i+1], b[i+16], b[i+17], b[i+32], b[i+33], b[i+48], b[i+49], b[i+64], b[i+65], b[i+80], b[i+81], b[i+96], b[i+97], b[i+112], b[i+113])
|
||||
}
|
||||
}
|
||||
@@ -28,13 +28,24 @@ Context :: _blake2.Blake2b_Context
|
||||
|
||||
// init initializes a Context with the default BLAKE2b config.
|
||||
init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
|
||||
ensure(digest_size <= _blake2.MAX_SIZE, "crypto/blake2b: invalid digest size")
|
||||
ensure(digest_size <= DIGEST_SIZE, "crypto/blake2b: invalid digest size")
|
||||
|
||||
cfg: _blake2.Blake2_Config
|
||||
cfg.size = u8(digest_size)
|
||||
_blake2.init(ctx, &cfg)
|
||||
}
|
||||
|
||||
// init_mac initializes a Context with a user provided key.
|
||||
init_mac :: proc(ctx: ^Context, key: []byte, digest_size := DIGEST_SIZE) {
|
||||
ensure(digest_size <= DIGEST_SIZE, "crypto/blake2b: invalid digest size")
|
||||
ensure(len(key) <= DIGEST_SIZE, "crypto/blake2b: invalid key size")
|
||||
|
||||
cfg: _blake2.Blake2_Config
|
||||
cfg.size = u8(digest_size)
|
||||
cfg.key = key
|
||||
_blake2.init(ctx, &cfg)
|
||||
}
|
||||
|
||||
// update adds more data to the Context.
|
||||
update :: proc(ctx: ^Context, data: []byte) {
|
||||
_blake2.update(ctx, data)
|
||||
@@ -43,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_blake2.final(ctx, hash, finalize_clone)
|
||||
|
||||
@@ -28,13 +28,24 @@ Context :: _blake2.Blake2s_Context
|
||||
|
||||
// init initializes a Context with the default BLAKE2s config.
|
||||
init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
|
||||
ensure(digest_size <= _blake2.MAX_SIZE, "crypto/blake2s: invalid digest size")
|
||||
ensure(digest_size <= DIGEST_SIZE, "crypto/blake2s: invalid digest size")
|
||||
|
||||
cfg: _blake2.Blake2_Config
|
||||
cfg.size = u8(digest_size)
|
||||
_blake2.init(ctx, &cfg)
|
||||
}
|
||||
|
||||
// init_mac initializes a Context with a user provided key.
|
||||
init_mac :: proc(ctx: ^Context, key: []byte, digest_size := DIGEST_SIZE) {
|
||||
ensure(digest_size <= DIGEST_SIZE, "crypto/blake2s: invalid digest size")
|
||||
ensure(len(key) <= DIGEST_SIZE, "crypto/blake2s: invalid key size")
|
||||
|
||||
cfg: _blake2.Blake2_Config
|
||||
cfg.size = u8(digest_size)
|
||||
cfg.key = key
|
||||
_blake2.init(ctx, &cfg)
|
||||
}
|
||||
|
||||
// update adds more data to the Context.
|
||||
update :: proc(ctx: ^Context, data: []byte) {
|
||||
_blake2.update(ctx, data)
|
||||
@@ -43,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_blake2.final(ctx, hash, finalize_clone)
|
||||
|
||||
@@ -136,7 +136,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
@@ -8,15 +8,15 @@ import subtle "core:crypto/_subtle"
|
||||
// Omit large precomputed tables, trading off performance for size.
|
||||
COMPACT_IMPLS: bool : #config(ODIN_CRYPTO_COMPACT, false)
|
||||
|
||||
// HAS_RAND_BYTES is true iff the runtime provides a cryptographic
|
||||
// HAS_RAND_BYTES is true if and only if (⟺) the runtime provides a cryptographic
|
||||
// entropy source.
|
||||
HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES
|
||||
|
||||
// compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
|
||||
// compare_constant_time returns 1 if and only if (⟺) a and b are equal, 0 otherwise.
|
||||
//
|
||||
// The execution time of this routine is constant regardless of the contents
|
||||
// of the slices being compared, as long as the length of the slices is equal.
|
||||
// If the length of the two slices is different, it will early-return 0.
|
||||
// If the length of the two slices is different, it will early-return 0.
|
||||
compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
|
||||
// If the length of the slices is different, early return.
|
||||
//
|
||||
@@ -31,7 +31,7 @@ compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
|
||||
return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n)
|
||||
}
|
||||
|
||||
// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by
|
||||
// compare_byte_ptrs_constant_time returns 1 if and only if (⟺) the bytes pointed to by
|
||||
// a and b are equal, 0 otherwise.
|
||||
//
|
||||
// The execution time of this routine is constant regardless of the
|
||||
@@ -46,12 +46,12 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i
|
||||
v |= x[i] ~ y[i]
|
||||
}
|
||||
|
||||
// After the loop, v == 0 iff a == b. The subtraction will underflow
|
||||
// iff v == 0, setting the sign-bit, which gets returned.
|
||||
// After the loop, v == 0 if and only if (⟺) a == b. The subtraction will underflow
|
||||
// if and only if (⟺) v == 0, setting the sign-bit, which gets returned.
|
||||
return subtle.eq(0, v)
|
||||
}
|
||||
|
||||
// is_zero_constant_time returns 1 iff b is all 0s, 0 otherwise.
|
||||
// is_zero_constant_time returns 1 if and only if (⟺) b is all 0s, 0 otherwise.
|
||||
is_zero_constant_time :: proc "contextless" (b: []byte) -> int {
|
||||
v: byte
|
||||
for b_ in b {
|
||||
|
||||
@@ -122,7 +122,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
|
||||
|
||||
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
// with the provided Context, iv, and tag, and stores the output in dst,
|
||||
// returning true iff the authentication was successful. If authentication
|
||||
// returning true if and only if (⟺) the authentication was successful. If authentication
|
||||
// fails, the destination buffer will be zeroed.
|
||||
//
|
||||
// dst and plaintext MUST alias exactly or not at all.
|
||||
|
||||
@@ -1,152 +1,183 @@
|
||||
#+build amd64
|
||||
#+build amd64,arm64,arm32
|
||||
package deoxysii
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/aes"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:simd"
|
||||
import "core:simd/x86"
|
||||
|
||||
// This processes a maximum of 4 blocks at a time, as that is suitable
|
||||
// for most current hardware that doesn't say "Xeon".
|
||||
//
|
||||
// TODO/perf: ARM should be able to do 8 at a time.
|
||||
|
||||
when ODIN_ARCH == .amd64 {
|
||||
@(private="file")
|
||||
TARGET_FEATURES :: "sse2,ssse3,aes"
|
||||
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
|
||||
@(private="file")
|
||||
TARGET_FEATURES :: "neon,aes"
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
_BIT_ENC :: x86.__m128i{0x80, 0}
|
||||
_BIT_ENC :: simd.u8x16{0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
|
||||
@(private = "file")
|
||||
_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0}
|
||||
_PREFIX_AD_BLOCK :: simd.u8x16{
|
||||
PREFIX_AD_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
@(private = "file")
|
||||
_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0}
|
||||
_PREFIX_AD_FINAL :: simd.u8x16{
|
||||
PREFIX_AD_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
@(private = "file")
|
||||
_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0}
|
||||
_PREFIX_MSG_BLOCK :: simd.u8x16{
|
||||
PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
@(private = "file")
|
||||
_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0}
|
||||
_PREFIX_MSG_FINAL :: simd.u8x16{
|
||||
PREFIX_MSG_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
}
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return aes.is_hardware_accelerated()
|
||||
return aes_hw.is_supported()
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse4.1", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
auth_tweak :: #force_inline proc "contextless" (
|
||||
prefix: x86.__m128i,
|
||||
prefix: simd.u8x16,
|
||||
block_nr: int,
|
||||
) -> x86.__m128i {
|
||||
return x86._mm_insert_epi64(prefix, i64(intrinsics.byte_swap(u64(block_nr))), 1)
|
||||
}
|
||||
) -> simd.u8x16 {
|
||||
when ODIN_ENDIAN == .Little {
|
||||
block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
|
||||
} else {
|
||||
block_nr_u64 := u64(block_nr)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2", require_results)
|
||||
enc_tweak :: #force_inline proc "contextless" (
|
||||
tag: x86.__m128i,
|
||||
block_nr: int,
|
||||
) -> x86.__m128i {
|
||||
return x86._mm_xor_si128(
|
||||
x86._mm_or_si128(tag, _BIT_ENC),
|
||||
x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))},
|
||||
return simd.bit_or(
|
||||
prefix,
|
||||
transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
|
||||
)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "ssse3", require_results)
|
||||
h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i {
|
||||
return transmute(x86.__m128i)h(transmute(simd.u8x16)tk1)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
enc_tweak :: #force_inline proc "contextless" (
|
||||
tag: simd.u8x16,
|
||||
block_nr: int,
|
||||
) -> simd.u8x16 {
|
||||
when ODIN_ENDIAN == .Little {
|
||||
block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
|
||||
} else {
|
||||
block_nr_u64 := u64(block_nr)
|
||||
}
|
||||
|
||||
return simd.bit_xor(
|
||||
simd.bit_or(tag, _BIT_ENC),
|
||||
transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
|
||||
)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_x4 :: #force_inline proc "contextless" (
|
||||
ctx: ^Context,
|
||||
s_0, s_1, s_2, s_3: x86.__m128i,
|
||||
tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i,
|
||||
) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check {
|
||||
s_0, s_1, s_2, s_3: simd.u8x16,
|
||||
tweak_0, tweak_1, tweak_2, tweak_3: simd.u8x16,
|
||||
) -> (simd.u8x16, simd.u8x16, simd.u8x16, simd.u8x16) #no_bounds_check {
|
||||
s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3
|
||||
tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3
|
||||
|
||||
sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
|
||||
stk_0 := x86._mm_xor_si128(tk1_0, sk)
|
||||
stk_1 := x86._mm_xor_si128(tk1_1, sk)
|
||||
stk_2 := x86._mm_xor_si128(tk1_2, sk)
|
||||
stk_3 := x86._mm_xor_si128(tk1_3, sk)
|
||||
sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
|
||||
stk_0 := simd.bit_xor(tk1_0, sk)
|
||||
stk_1 := simd.bit_xor(tk1_1, sk)
|
||||
stk_2 := simd.bit_xor(tk1_2, sk)
|
||||
stk_3 := simd.bit_xor(tk1_3, sk)
|
||||
|
||||
s_0 = x86._mm_xor_si128(s_0, stk_0)
|
||||
s_1 = x86._mm_xor_si128(s_1, stk_1)
|
||||
s_2 = x86._mm_xor_si128(s_2, stk_2)
|
||||
s_3 = x86._mm_xor_si128(s_3, stk_3)
|
||||
s_0 = simd.bit_xor(s_0, stk_0)
|
||||
s_1 = simd.bit_xor(s_1, stk_1)
|
||||
s_2 = simd.bit_xor(s_2, stk_2)
|
||||
s_3 = simd.bit_xor(s_3, stk_3)
|
||||
|
||||
for i in 1 ..= BC_ROUNDS {
|
||||
sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
|
||||
sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
|
||||
|
||||
tk1_0 = h_(tk1_0)
|
||||
tk1_1 = h_(tk1_1)
|
||||
tk1_2 = h_(tk1_2)
|
||||
tk1_3 = h_(tk1_3)
|
||||
tk1_0 = h(tk1_0)
|
||||
tk1_1 = h(tk1_1)
|
||||
tk1_2 = h(tk1_2)
|
||||
tk1_3 = h(tk1_3)
|
||||
|
||||
stk_0 = x86._mm_xor_si128(tk1_0, sk)
|
||||
stk_1 = x86._mm_xor_si128(tk1_1, sk)
|
||||
stk_2 = x86._mm_xor_si128(tk1_2, sk)
|
||||
stk_3 = x86._mm_xor_si128(tk1_3, sk)
|
||||
stk_0 = simd.bit_xor(tk1_0, sk)
|
||||
stk_1 = simd.bit_xor(tk1_1, sk)
|
||||
stk_2 = simd.bit_xor(tk1_2, sk)
|
||||
stk_3 = simd.bit_xor(tk1_3, sk)
|
||||
|
||||
s_0 = x86._mm_aesenc_si128(s_0, stk_0)
|
||||
s_1 = x86._mm_aesenc_si128(s_1, stk_1)
|
||||
s_2 = x86._mm_aesenc_si128(s_2, stk_2)
|
||||
s_3 = x86._mm_aesenc_si128(s_3, stk_3)
|
||||
s_0 = aes_hw.aesenc(s_0, stk_0)
|
||||
s_1 = aes_hw.aesenc(s_1, stk_1)
|
||||
s_2 = aes_hw.aesenc(s_2, stk_2)
|
||||
s_3 = aes_hw.aesenc(s_3, stk_3)
|
||||
}
|
||||
|
||||
return s_0, s_1, s_2, s_3
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_x1 :: #force_inline proc "contextless" (
|
||||
ctx: ^Context,
|
||||
s: x86.__m128i,
|
||||
tweak: x86.__m128i,
|
||||
) -> x86.__m128i #no_bounds_check {
|
||||
s: simd.u8x16,
|
||||
tweak: simd.u8x16,
|
||||
) -> simd.u8x16 #no_bounds_check {
|
||||
s, tk1 := s, tweak
|
||||
|
||||
sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
|
||||
stk := x86._mm_xor_si128(tk1, sk)
|
||||
sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
|
||||
stk := simd.bit_xor(tk1, sk)
|
||||
|
||||
s = x86._mm_xor_si128(s, stk)
|
||||
s = simd.bit_xor(s, stk)
|
||||
|
||||
for i in 1 ..= BC_ROUNDS {
|
||||
sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
|
||||
sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
|
||||
|
||||
tk1 = h_(tk1)
|
||||
tk1 = h(tk1)
|
||||
|
||||
stk = x86._mm_xor_si128(tk1, sk)
|
||||
stk = simd.bit_xor(tk1, sk)
|
||||
|
||||
s = x86._mm_aesenc_si128(s, stk)
|
||||
s = aes_hw.aesenc(s, stk)
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_absorb :: proc "contextless" (
|
||||
ctx: ^Context,
|
||||
tag: x86.__m128i,
|
||||
tag: simd.u8x16,
|
||||
src: []byte,
|
||||
tweak_prefix: x86.__m128i,
|
||||
tweak_prefix: simd.u8x16,
|
||||
stk_block_nr: int,
|
||||
) -> (x86.__m128i, int) #no_bounds_check {
|
||||
) -> (simd.u8x16, int) #no_bounds_check {
|
||||
src, stk_block_nr, tag := src, stk_block_nr, tag
|
||||
|
||||
nr_blocks := len(src) / BLOCK_SIZE
|
||||
for nr_blocks >= 4 {
|
||||
d_0, d_1, d_2, d_3 := bc_x4(
|
||||
ctx,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
|
||||
auth_tweak(tweak_prefix, stk_block_nr),
|
||||
auth_tweak(tweak_prefix, stk_block_nr + 1),
|
||||
auth_tweak(tweak_prefix, stk_block_nr + 2),
|
||||
auth_tweak(tweak_prefix, stk_block_nr + 3),
|
||||
)
|
||||
|
||||
tag = x86._mm_xor_si128(tag, d_0)
|
||||
tag = x86._mm_xor_si128(tag, d_1)
|
||||
tag = x86._mm_xor_si128(tag, d_2)
|
||||
tag = x86._mm_xor_si128(tag, d_3)
|
||||
tag = simd.bit_xor(tag, d_0)
|
||||
tag = simd.bit_xor(tag, d_1)
|
||||
tag = simd.bit_xor(tag, d_2)
|
||||
tag = simd.bit_xor(tag, d_3)
|
||||
|
||||
src = src[4*BLOCK_SIZE:]
|
||||
stk_block_nr += 4
|
||||
@@ -156,11 +187,11 @@ bc_absorb :: proc "contextless" (
|
||||
for nr_blocks > 0 {
|
||||
d := bc_x1(
|
||||
ctx,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
|
||||
auth_tweak(tweak_prefix, stk_block_nr),
|
||||
)
|
||||
|
||||
tag = x86._mm_xor_si128(tag, d)
|
||||
tag = simd.bit_xor(tag, d)
|
||||
|
||||
src = src[BLOCK_SIZE:]
|
||||
stk_block_nr += 1
|
||||
@@ -170,29 +201,29 @@ bc_absorb :: proc "contextless" (
|
||||
return tag, stk_block_nr
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_final :: proc "contextless" (
|
||||
ctx: ^Context,
|
||||
tag: x86.__m128i,
|
||||
tag: simd.u8x16,
|
||||
iv: []byte,
|
||||
) -> x86.__m128i {
|
||||
) -> simd.u8x16 {
|
||||
tmp: [BLOCK_SIZE]byte
|
||||
|
||||
tmp[0] = PREFIX_TAG << PREFIX_SHIFT
|
||||
copy(tmp[1:], iv)
|
||||
|
||||
tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp))
|
||||
tweak := intrinsics.unaligned_load((^simd.u8x16)(&tmp))
|
||||
|
||||
return bc_x1(ctx, tag, tweak)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
|
||||
bc_encrypt :: proc "contextless" (
|
||||
ctx: ^Context,
|
||||
dst: []byte,
|
||||
src: []byte,
|
||||
iv: x86.__m128i,
|
||||
tweak_tag: x86.__m128i,
|
||||
iv: simd.u8x16,
|
||||
tweak_tag: simd.u8x16,
|
||||
stk_block_nr: int,
|
||||
) -> int {
|
||||
dst, src, stk_block_nr := dst, src, stk_block_nr
|
||||
@@ -209,31 +240,31 @@ bc_encrypt :: proc "contextless" (
|
||||
)
|
||||
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst)),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst)),
|
||||
simd.bit_xor(
|
||||
d_0,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
|
||||
),
|
||||
)
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst[BLOCK_SIZE:])),
|
||||
simd.bit_xor(
|
||||
d_1,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
|
||||
),
|
||||
)
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst[2*BLOCK_SIZE:])),
|
||||
simd.bit_xor(
|
||||
d_2,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
|
||||
),
|
||||
)
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst[3*BLOCK_SIZE:])),
|
||||
simd.bit_xor(
|
||||
d_3,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -250,10 +281,10 @@ bc_encrypt :: proc "contextless" (
|
||||
)
|
||||
|
||||
intrinsics.unaligned_store(
|
||||
(^x86.__m128i)(raw_data(dst)),
|
||||
x86._mm_xor_si128(
|
||||
(^simd.u8x16)(raw_data(dst)),
|
||||
simd.bit_xor(
|
||||
d,
|
||||
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
|
||||
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -269,7 +300,7 @@ bc_encrypt :: proc "contextless" (
|
||||
e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
|
||||
tmp: [BLOCK_SIZE]byte
|
||||
copy(tmp[1:], iv)
|
||||
iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
|
||||
iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))
|
||||
|
||||
// Algorithm 3
|
||||
//
|
||||
@@ -282,7 +313,7 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
|
||||
// if A_∗ != nil then
|
||||
// Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
|
||||
// end
|
||||
auth: x86.__m128i
|
||||
auth: simd.u8x16
|
||||
n: int
|
||||
|
||||
aad := aad
|
||||
@@ -341,14 +372,14 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
|
||||
copy(dst[n*BLOCK_SIZE:], m_star[:])
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(tag)), auth)
|
||||
}
|
||||
|
||||
@(private, require_results)
|
||||
d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
|
||||
tmp: [BLOCK_SIZE]byte
|
||||
copy(tmp[1:], iv)
|
||||
iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
|
||||
iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))
|
||||
|
||||
// Algorithm 4
|
||||
//
|
||||
@@ -360,7 +391,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
|
||||
// if C_∗ != nil then
|
||||
// M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N)
|
||||
// end
|
||||
auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag)))
|
||||
auth := intrinsics.unaligned_load((^simd.u8x16)(raw_data(tag)))
|
||||
|
||||
m := ciphertext
|
||||
n := bc_encrypt(ctx, dst, m, iv_, auth, 0)
|
||||
@@ -385,7 +416,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
|
||||
// if A∗ != nil then
|
||||
// Auth <- Auth ^ EK(0110 || l_a, pad10∗(A_∗))
|
||||
// end
|
||||
auth = x86.__m128i{0, 0}
|
||||
auth = simd.u8x16{}
|
||||
aad := aad
|
||||
auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
|
||||
aad = aad[BLOCK_SIZE*n:]
|
||||
@@ -424,7 +455,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
|
||||
// Tag verification
|
||||
// if tag0 = tag then return (M_1 || ... || M_l || M_∗)
|
||||
// else return false
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth)
|
||||
intrinsics.unaligned_store((^simd.u8x16)(raw_data(&tmp)), auth)
|
||||
ok := crypto.compare_constant_time(tmp[:], tag) == 1
|
||||
|
||||
crypto.zero_explicit(&tmp, size_of(tmp))
|
||||
@@ -1,10 +1,12 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package deoxysii
|
||||
|
||||
@(private = "file")
|
||||
ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported"
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return false
|
||||
|
||||
@@ -104,7 +104,7 @@ Public_Key :: struct {
|
||||
}
|
||||
|
||||
// private_key_generate uses the system entropy source to generate a new
|
||||
// Private_Key. This will only fail iff the system entropy source is
|
||||
// Private_Key. This will fail if and only if (⟺) the system entropy source is
|
||||
// missing or broken.
|
||||
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
@@ -142,7 +142,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
|
||||
}
|
||||
|
||||
// private_key_set_bytes decodes a byte-encoded private key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
|
||||
@@ -245,7 +245,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
|
||||
}
|
||||
}
|
||||
|
||||
// private_key_equal returns true iff the private keys are equal,
|
||||
// private_key_equal returns true if and only if (⟺) the private keys are equal,
|
||||
// in constant time.
|
||||
private_key_equal :: proc(p, q: ^Private_Key) -> bool {
|
||||
if p._curve != q._curve {
|
||||
@@ -276,7 +276,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
|
||||
}
|
||||
|
||||
// public_key_set_bytes decodes a byte-encoded public key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
|
||||
public_key_clear(pub_key)
|
||||
|
||||
@@ -365,7 +365,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
|
||||
}
|
||||
}
|
||||
|
||||
// public_key_equal returns true iff the public keys are equal,
|
||||
// public_key_equal returns true if and only if (⟺) the public keys are equal,
|
||||
// in constant time.
|
||||
public_key_equal :: proc(p, q: ^Public_Key) -> bool {
|
||||
if p._curve != q._curve {
|
||||
|
||||
@@ -79,7 +79,7 @@ Public_Key :: struct {
|
||||
}
|
||||
|
||||
// private_key_generate uses the system entropy source to generate a new
|
||||
// Private_Key. This will only fail iff the system entropy source is
|
||||
// Private_Key. This will fail if and only if (⟺) the system entropy source is
|
||||
// missing or broken.
|
||||
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
@@ -111,7 +111,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
|
||||
}
|
||||
|
||||
// private_key_set_bytes decodes a byte-encoded private key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
|
||||
@@ -194,7 +194,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
|
||||
}
|
||||
}
|
||||
|
||||
// private_key_equal returns true iff the private keys are equal,
|
||||
// private_key_equal returns true if and only if (⟺) the private keys are equal,
|
||||
// in constant time.
|
||||
private_key_equal :: proc(p, q: ^Private_Key) -> bool {
|
||||
if p._curve != q._curve {
|
||||
@@ -219,7 +219,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
|
||||
}
|
||||
|
||||
// public_key_set_bytes decodes a byte-encoded public key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
|
||||
public_key_clear(pub_key)
|
||||
|
||||
@@ -296,7 +296,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
|
||||
}
|
||||
}
|
||||
|
||||
// public_key_equal returns true iff the public keys are equal,
|
||||
// public_key_equal returns true if and only if (⟺) the public keys are equal,
|
||||
// in constant time.
|
||||
public_key_equal :: proc(p, q: ^Public_Key) -> bool {
|
||||
if p._curve != q._curve {
|
||||
|
||||
@@ -141,7 +141,7 @@ parse_asn1_sig :: proc(sig: []byte) -> (r, s: []byte, ok: bool) {
|
||||
return nil, nil, false
|
||||
}
|
||||
|
||||
// DER requires a leading 0 iff the sign bit of the leading byte
|
||||
// DER requires a leading 0 if and only if (⟺) the sign bit of the leading byte
|
||||
// is set to distinguish between positive and negative integers,
|
||||
// and the minimal length representation. `r` and `s` are always
|
||||
// going to be unsigned, so we validate malformed DER and strip
|
||||
|
||||
@@ -3,7 +3,7 @@ package ecdsa
|
||||
import "core:crypto/hash"
|
||||
import secec "core:crypto/_weierstrass"
|
||||
|
||||
// verify_raw returns true iff sig is a valid signature by pub_key over
|
||||
// verify_raw returns true if and only if (⟺) sig is a valid signature by pub_key over
|
||||
// msg, hashed using hash_algo, per the verification procedure specified
|
||||
// in SEC 1, Version 2.0, Section 4.1.4.
|
||||
//
|
||||
@@ -33,7 +33,7 @@ verify_raw :: proc(pub_key: ^Public_Key, hash_algo: hash.Algorithm, msg, sig: []
|
||||
panic("crypto/ecdsa: invalid curve")
|
||||
}
|
||||
|
||||
// verify_asn1 returns true iff sig is a valid signature by pub_key over
|
||||
// verify_asn1 returns true if and only if (⟺) sig is a valid signature by pub_key over
|
||||
// msg, hashed using hash_algo, per the verification procedure specified
|
||||
// in SEC 1, Version 2.0, Section 4.1.4.
|
||||
//
|
||||
|
||||
@@ -48,7 +48,7 @@ Public_Key :: struct {
|
||||
}
|
||||
|
||||
// private_key_generate uses the system entropy source to generate a new
|
||||
// Private_Key. This will only fail iff the system entropy source is
|
||||
// Private_Key. This will fail if and only if (⟺) the system entropy source is
|
||||
// missing or broken.
|
||||
private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
|
||||
private_key_clear(priv_key)
|
||||
@@ -67,7 +67,7 @@ private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
|
||||
}
|
||||
|
||||
// private_key_set_bytes decodes a byte-encoded private key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {
|
||||
if len(b) != PRIVATE_KEY_SIZE {
|
||||
return false
|
||||
@@ -167,7 +167,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
|
||||
}
|
||||
|
||||
// public_key_set_bytes decodes a byte-encoded public key, and returns
|
||||
// true iff the operation was successful.
|
||||
// true if and only if (⟺) the operation was successful.
|
||||
public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> bool {
|
||||
if len(b) != PUBLIC_KEY_SIZE {
|
||||
return false
|
||||
@@ -205,14 +205,14 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
|
||||
copy(dst, pub_key._b[:])
|
||||
}
|
||||
|
||||
// public_key_equal returns true iff pub_key is equal to other.
|
||||
// public_key_equal returns true if and only if (⟺) pub_key is equal to other.
|
||||
public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool {
|
||||
ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key")
|
||||
|
||||
return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1
|
||||
}
|
||||
|
||||
// verify returns true iff sig is a valid signature by pub_key over msg.
|
||||
// verify returns true if and only if (⟺) sig is a valid signature by pub_key over msg.
|
||||
//
|
||||
// The optional `allow_small_order_A` parameter will make this
|
||||
// implementation strictly compatible with FIPS 186-5, at the expense of
|
||||
|
||||
@@ -235,7 +235,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
switch &impl in ctx._impl {
|
||||
|
||||
@@ -21,7 +21,7 @@ sum :: proc(algorithm: hash.Algorithm, dst, msg, key: []byte) {
|
||||
}
|
||||
|
||||
// verify will verify the HMAC tag computed with the specified algorithm
|
||||
// and key over msg and return true iff the tag is valid. It requires
|
||||
// and key over msg and return true if and only if (⟺) the tag is valid. It requires
|
||||
// that the tag is correctly sized.
|
||||
verify :: proc(algorithm: hash.Algorithm, tag, msg, key: []byte) -> bool {
|
||||
tag_buf: [hash.MAX_DIGEST_SIZE]byte
|
||||
|
||||
@@ -32,7 +32,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) {
|
||||
}
|
||||
|
||||
// verify will verify the KMAC tag computed with the specified security
|
||||
// strength, key and domain separator over msg and return true iff the
|
||||
// strength, key and domain separator over msg and return true if and only if (⟺) the
|
||||
// tag is valid.
|
||||
verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool {
|
||||
derived_tag := make([]byte, len(tag), allocator)
|
||||
|
||||
@@ -77,7 +77,7 @@ update :: proc "contextless" (ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)
|
||||
|
||||
@@ -69,7 +69,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
ensure(ctx.is_initialized)
|
||||
|
||||
@@ -76,7 +76,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
ensure(ctx.is_initialized)
|
||||
|
||||
@@ -66,7 +66,7 @@ derive :: proc(
|
||||
dst_blk = dst_blk[h_len:]
|
||||
}
|
||||
|
||||
// Instead of rounding l up, just proceass the one extra block iff
|
||||
// Instead of rounding l up, just process the one extra block if and only if (⟺)
|
||||
// r != 0.
|
||||
if r > 0 {
|
||||
tmp: [hash.MAX_DIGEST_SIZE]byte
|
||||
|
||||
@@ -33,7 +33,7 @@ sum :: proc(dst, msg, key: []byte) {
|
||||
}
|
||||
|
||||
// verify will verify the Poly1305 tag computed with the key over msg and
|
||||
// return true iff the tag is valid. It requires that the tag is correctly
|
||||
// return true if and only if (⟺) the tag is valid. It requires that the tag is correctly
|
||||
// sized.
|
||||
verify :: proc(tag, msg, key: []byte) -> bool {
|
||||
ctx: Context = ---
|
||||
|
||||
@@ -360,7 +360,7 @@ ge_double_scalarmult_generator_vartime :: proc(
|
||||
ge._is_initialized = true
|
||||
}
|
||||
|
||||
// ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`.
|
||||
// ge_cond_negate sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = -a` if and only if (⟺) `ctrl == 1`.
|
||||
// Behavior for all other values of ctrl is undefined.
|
||||
ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
|
||||
_ge_ensure_initialized([]^Group_Element{a})
|
||||
@@ -369,7 +369,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
|
||||
ge._is_initialized = true
|
||||
}
|
||||
|
||||
// ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`.
|
||||
// ge_cond_assign sets `ge = ge` if and only if (⟺) `ctrl == 0` and `ge = a` if and only if (⟺) `ctrl == 1`.
|
||||
// Behavior for all other values of ctrl is undefined.
|
||||
ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
|
||||
_ge_ensure_initialized([]^Group_Element{ge, a})
|
||||
@@ -377,7 +377,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
|
||||
grp.ge_cond_assign(&ge._p, &a._p, ctrl)
|
||||
}
|
||||
|
||||
// ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`.
|
||||
// ge_cond_select sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = b` if and only if (⟺) `ctrl == 1`.
|
||||
// Behavior for all other values of ctrl is undefined.
|
||||
ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
|
||||
_ge_ensure_initialized([]^Group_Element{a, b})
|
||||
@@ -386,7 +386,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
|
||||
ge._is_initialized = true
|
||||
}
|
||||
|
||||
// ge_equal returns 1 iff `a == b`, and 0 otherwise.
|
||||
// ge_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
|
||||
@(require_results)
|
||||
ge_equal :: proc(a, b: ^Group_Element) -> int {
|
||||
_ge_ensure_initialized([]^Group_Element{a, b})
|
||||
@@ -405,7 +405,7 @@ ge_equal :: proc(a, b: ^Group_Element) -> int {
|
||||
return ret
|
||||
}
|
||||
|
||||
// ge_is_identity returns 1 iff `ge` is the identity element, and 0 otherwise.
|
||||
// ge_is_identity returns 1 if and only if (⟺) `ge` is the identity element, and 0 otherwise.
|
||||
@(require_results)
|
||||
ge_is_identity :: proc(ge: ^Group_Element) -> int {
|
||||
return ge_equal(ge, &GE_IDENTITY)
|
||||
|
||||
@@ -80,13 +80,13 @@ sc_square :: proc "contextless" (sc, a: ^Scalar) {
|
||||
grp.sc_square(sc, a)
|
||||
}
|
||||
|
||||
// sc_cond_assign sets `sc = sc` iff `ctrl == 0` and `sc = a` iff `ctrl == 1`.
|
||||
// sc_cond_assign sets `sc = sc` if and only if (⟺) `ctrl == 0` and `sc = a` if and only if (⟺) `ctrl == 1`.
|
||||
// Behavior for all other values of ctrl is undefined.
|
||||
sc_cond_assign :: proc(sc, a: ^Scalar, ctrl: int) {
|
||||
grp.sc_cond_assign(sc, a, ctrl)
|
||||
}
|
||||
|
||||
// sc_equal returns 1 iff `a == b`, and 0 otherwise.
|
||||
// sc_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
|
||||
@(require_results)
|
||||
sc_equal :: proc(a, b: ^Scalar) -> int {
|
||||
return grp.sc_equal(a, b)
|
||||
|
||||
@@ -44,7 +44,8 @@ Context_256 :: struct {
|
||||
length: u64,
|
||||
md_bits: int,
|
||||
|
||||
is_initialized: bool,
|
||||
is_hw_accelerated: bool,
|
||||
is_initialized: bool,
|
||||
}
|
||||
|
||||
// Context_512 is a SHA-384, SHA-512 or SHA-512/256 instance.
|
||||
@@ -55,7 +56,8 @@ Context_512 :: struct {
|
||||
length: u64,
|
||||
md_bits: int,
|
||||
|
||||
is_initialized: bool,
|
||||
is_hw_accelerated: bool,
|
||||
is_initialized: bool,
|
||||
}
|
||||
|
||||
// init_224 initializes a Context_256 for SHA-224.
|
||||
@@ -88,6 +90,9 @@ init_512_256 :: proc(ctx: ^Context_512) {
|
||||
_init(ctx)
|
||||
}
|
||||
|
||||
@(private)
|
||||
ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
|
||||
|
||||
@(private)
|
||||
_init :: proc(ctx: ^$T) {
|
||||
when T == Context_256 {
|
||||
@@ -113,6 +118,8 @@ _init :: proc(ctx: ^$T) {
|
||||
case:
|
||||
panic("crypto/sha2: invalid digest output length")
|
||||
}
|
||||
|
||||
ctx.is_hw_accelerated = is_hardware_accelerated_256()
|
||||
} else when T == Context_512 {
|
||||
switch ctx.md_bits {
|
||||
case 256:
|
||||
@@ -148,6 +155,8 @@ _init :: proc(ctx: ^$T) {
|
||||
case:
|
||||
panic("crypto/sha2: invalid digest output length")
|
||||
}
|
||||
|
||||
ctx.is_hw_accelerated = is_hardware_accelerated_512()
|
||||
}
|
||||
|
||||
ctx.length = 0
|
||||
@@ -191,7 +200,7 @@ update :: proc(ctx: ^$T, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
|
||||
ensure(ctx.is_initialized)
|
||||
@@ -267,7 +276,7 @@ reset :: proc(ctx: ^$T) {
|
||||
SHA2 implementation
|
||||
*/
|
||||
|
||||
@(private, rodata)
|
||||
@(private = "file", rodata)
|
||||
SHA256_K := [64]u32 {
|
||||
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
|
||||
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
|
||||
@@ -287,7 +296,7 @@ SHA256_K := [64]u32 {
|
||||
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
|
||||
}
|
||||
|
||||
@(private, rodata)
|
||||
@(private = "file", rodata)
|
||||
SHA512_K := [80]u64 {
|
||||
0x428a2f98d728ae22, 0x7137449123ef65cd,
|
||||
0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
|
||||
@@ -336,70 +345,70 @@ SHA256_ROUNDS :: 64
|
||||
@(private)
|
||||
SHA512_ROUNDS :: 80
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA256_CH :: #force_inline proc "contextless" (x, y, z: u32) -> u32 {
|
||||
return (x & y) ~ (~x & z)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA256_MAJ :: #force_inline proc "contextless" (x, y, z: u32) -> u32 {
|
||||
return (x & y) ~ (x & z) ~ (y & z)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA512_CH :: #force_inline proc "contextless" (x, y, z: u64) -> u64 {
|
||||
return (x & y) ~ (~x & z)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA512_MAJ :: #force_inline proc "contextless" (x, y, z: u64) -> u64 {
|
||||
return (x & y) ~ (x & z) ~ (y & z)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA256_F1 :: #force_inline proc "contextless" (x: u32) -> u32 {
|
||||
return bits.rotate_left32(x, 30) ~ bits.rotate_left32(x, 19) ~ bits.rotate_left32(x, 10)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA256_F2 :: #force_inline proc "contextless" (x: u32) -> u32 {
|
||||
return bits.rotate_left32(x, 26) ~ bits.rotate_left32(x, 21) ~ bits.rotate_left32(x, 7)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA256_F3 :: #force_inline proc "contextless" (x: u32) -> u32 {
|
||||
return bits.rotate_left32(x, 25) ~ bits.rotate_left32(x, 14) ~ (x >> 3)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA256_F4 :: #force_inline proc "contextless" (x: u32) -> u32 {
|
||||
return bits.rotate_left32(x, 15) ~ bits.rotate_left32(x, 13) ~ (x >> 10)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA512_F1 :: #force_inline proc "contextless" (x: u64) -> u64 {
|
||||
return bits.rotate_left64(x, 36) ~ bits.rotate_left64(x, 30) ~ bits.rotate_left64(x, 25)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA512_F2 :: #force_inline proc "contextless" (x: u64) -> u64 {
|
||||
return bits.rotate_left64(x, 50) ~ bits.rotate_left64(x, 46) ~ bits.rotate_left64(x, 23)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA512_F3 :: #force_inline proc "contextless" (x: u64) -> u64 {
|
||||
return bits.rotate_left64(x, 63) ~ bits.rotate_left64(x, 56) ~ (x >> 7)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
SHA512_F4 :: #force_inline proc "contextless" (x: u64) -> u64 {
|
||||
return bits.rotate_left64(x, 45) ~ bits.rotate_left64(x, 3) ~ (x >> 6)
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check {
|
||||
when T == Context_256 {
|
||||
if is_hardware_accelerated_256() {
|
||||
if ctx.is_hw_accelerated {
|
||||
sha256_transf_hw(ctx, data)
|
||||
return
|
||||
}
|
||||
@@ -410,6 +419,11 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check {
|
||||
|
||||
CURR_BLOCK_SIZE :: BLOCK_SIZE_256
|
||||
} else when T == Context_512 {
|
||||
if ctx.is_hw_accelerated {
|
||||
sha512_transf_hw(ctx, data)
|
||||
return
|
||||
}
|
||||
|
||||
w: [SHA512_ROUNDS]u64
|
||||
wv: [8]u64
|
||||
t1, t2: u64
|
||||
|
||||
224
core/crypto/sha2/sha256_impl_hw_arm.odin
Normal file
224
core/crypto/sha2/sha256_impl_hw_arm.odin
Normal file
@@ -0,0 +1,224 @@
|
||||
#+build arm64,arm32
|
||||
package sha2
|
||||
|
||||
// Based on the public domain code by Jeffrey Walton, though
|
||||
// realistically, there only is one sensible way to write this.
|
||||
//
|
||||
// See: https://github.com/noloader/SHA-Intrinsics
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:simd"
|
||||
import "core:simd/arm"
|
||||
import "core:sys/info"
|
||||
|
||||
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware
|
||||
// accelerated SHA-224/SHA-256 is supported.
|
||||
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
|
||||
req_features :: info.CPU_Features{
|
||||
.asimd,
|
||||
.sha256,
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
K_0 :: simd.u32x4{0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5}
|
||||
@(private = "file")
|
||||
K_1 :: simd.u32x4{0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5}
|
||||
@(private = "file")
|
||||
K_2 :: simd.u32x4{0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3}
|
||||
@(private = "file")
|
||||
K_3 :: simd.u32x4{0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174}
|
||||
@(private = "file")
|
||||
K_4 :: simd.u32x4{0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC}
|
||||
@(private = "file")
|
||||
K_5 :: simd.u32x4{0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA}
|
||||
@(private = "file")
|
||||
K_6 :: simd.u32x4{0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7}
|
||||
@(private = "file")
|
||||
K_7 :: simd.u32x4{0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967}
|
||||
@(private = "file")
|
||||
K_8 :: simd.u32x4{0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13}
|
||||
@(private = "file")
|
||||
K_9 :: simd.u32x4{0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85}
|
||||
@(private = "file")
|
||||
K_10 :: simd.u32x4{0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3}
|
||||
@(private = "file")
|
||||
K_11 :: simd.u32x4{0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070}
|
||||
@(private = "file")
|
||||
K_12 :: simd.u32x4{0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5}
|
||||
@(private = "file")
|
||||
K_13 :: simd.u32x4{0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3}
|
||||
@(private = "file")
|
||||
K_14 :: simd.u32x4{0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208}
|
||||
@(private = "file")
|
||||
K_15 :: simd.u32x4{0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2}
|
||||
|
||||
@(private, enable_target_feature = "neon,sha2")
|
||||
sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
|
||||
state_0 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[0]))
|
||||
state_1 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[4]))
|
||||
|
||||
data := data
|
||||
for len(data) >= BLOCK_SIZE_256 {
|
||||
// Save state
|
||||
abef_save, cdgh_save := state_0, state_1
|
||||
|
||||
// Load message
|
||||
msg_0 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data)))
|
||||
msg_1 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[16:])))
|
||||
msg_2 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[32:])))
|
||||
msg_3 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[48:])))
|
||||
|
||||
// Reverse for little endian
|
||||
when ODIN_ENDIAN == .Little {
|
||||
msg_0 = byteswap_u32x4(msg_0)
|
||||
msg_1 = byteswap_u32x4(msg_1)
|
||||
msg_2 = byteswap_u32x4(msg_2)
|
||||
msg_3 = byteswap_u32x4(msg_3)
|
||||
}
|
||||
|
||||
tmp_0 := simd.add(msg_0, K_0)
|
||||
|
||||
// Rounds 0-3
|
||||
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
|
||||
tmp_2 := state_0
|
||||
tmp_1 := simd.add(msg_1, K_1)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
|
||||
|
||||
// Rounds 4-7
|
||||
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_2, K_2)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
|
||||
|
||||
// Rounds 8-11
|
||||
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_3, K_3)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
|
||||
|
||||
// Rounds 12-15
|
||||
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_0, K_4)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
|
||||
|
||||
// Rounds 16-19
|
||||
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_1, K_5)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
|
||||
|
||||
// Rounds 20-23
|
||||
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_2, K_6)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
|
||||
|
||||
// Rounds 24-27
|
||||
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_3, K_7)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
|
||||
|
||||
// Rounds 28-31
|
||||
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_0, K_8)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
|
||||
|
||||
// Rounds 32-35
|
||||
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_1, K_9)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
|
||||
|
||||
// Rounds 36-39
|
||||
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_2, K_10)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
|
||||
|
||||
// Rounds 40-43
|
||||
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_3, K_11)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
|
||||
|
||||
// Rounds 44-47
|
||||
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_0, K_12)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
|
||||
|
||||
// Rounds 48-51
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_1, K_13)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
|
||||
// Rounds 52-55
|
||||
tmp_2 = state_0
|
||||
tmp_0 = simd.add(msg_2, K_14)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
|
||||
// Rounds 56-59
|
||||
tmp_2 = state_0
|
||||
tmp_1 = simd.add(msg_3, K_15)
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
|
||||
|
||||
// Rounds 60-63
|
||||
tmp_2 = state_0
|
||||
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
|
||||
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
|
||||
|
||||
// Combine state
|
||||
state_0 = simd.add(state_0, abef_save)
|
||||
state_1 = simd.add(state_1, cdgh_save)
|
||||
|
||||
data = data[BLOCK_SIZE_256:]
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[0]), state_0)
|
||||
intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[4]), state_1)
|
||||
}
|
||||
|
||||
when ODIN_ENDIAN == .Little {
|
||||
@(private = "file", enable_target_feature = "neon")
|
||||
byteswap_u32x4 :: #force_inline proc "contextless" (a: simd.u32x4) -> simd.u32x4 {
|
||||
return transmute(simd.u32x4)(
|
||||
simd.shuffle(
|
||||
transmute(simd.u8x16)(a),
|
||||
transmute(simd.u8x16)(a),
|
||||
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -1,15 +1,15 @@
|
||||
#+build !amd64
|
||||
#+build !arm64
|
||||
#+build !arm32
|
||||
package sha2
|
||||
|
||||
@(private = "file")
|
||||
ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
|
||||
|
||||
// is_hardware_accelerated_256 returns true iff hardware accelerated
|
||||
// SHA-224/SHA-256 is supported.
|
||||
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware
|
||||
// accelerated SHA-224/SHA-256 is supported.
|
||||
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
@(private)
|
||||
sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) {
|
||||
panic_contextless(ERR_HW_NOT_SUPPORTED)
|
||||
}
|
||||
@@ -49,7 +49,7 @@ K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
|
||||
K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}
|
||||
|
||||
|
||||
// is_hardware_accelerated_256 returns true iff hardware accelerated
|
||||
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated
|
||||
// SHA-224/SHA-256 is supported.
|
||||
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
|
||||
req_features :: info.CPU_Features{
|
||||
@@ -70,8 +70,7 @@ sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bou
|
||||
tmp = x86._mm_shuffle_epi32(tmp, 0xb1) // CDAB
|
||||
state_1 = x86._mm_shuffle_epi32(state_1, 0x1b) // EFGH
|
||||
state_0 := x86._mm_alignr_epi8(tmp, state_1, 8) // ABEF
|
||||
// state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH
|
||||
state_1 = kludge_mm_blend_epi16_0xf0(state_1, tmp)
|
||||
state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH
|
||||
|
||||
data := data
|
||||
for len(data) >= BLOCK_SIZE_256 {
|
||||
@@ -238,18 +237,9 @@ sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bou
|
||||
// Write back the updated state
|
||||
tmp = x86._mm_shuffle_epi32(state_0, 0x1b) // FEBA
|
||||
state_1 = x86._mm_shuffle_epi32(state_1, 0xb1) // DCHG
|
||||
// state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA
|
||||
state_0 = kludge_mm_blend_epi16_0xf0(tmp, state_1)
|
||||
state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA
|
||||
state_1 = x86._mm_alignr_epi8(state_1, tmp, 8) // ABEF
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[0]), state_0)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[4]), state_1)
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
kludge_mm_blend_epi16_0xf0 :: #force_inline proc "contextless"(a, b: x86.__m128i) -> x86.__m128i {
|
||||
// HACK HACK HACK: LLVM got rid of `llvm.x86.sse41.pblendw`.
|
||||
a_ := simd.to_array(a)
|
||||
b_ := simd.to_array(b)
|
||||
return x86.__m128i{a_[0], b_[1]}
|
||||
}
|
||||
498
core/crypto/sha2/sha512_impl_hw_arm.odin
Normal file
498
core/crypto/sha2/sha512_impl_hw_arm.odin
Normal file
@@ -0,0 +1,498 @@
|
||||
// The round function's intrinsic calls are based on:
|
||||
// https://github.com/LostInCompilation/HashMe/blob/main/src/SHA512_Hardware.cpp
|
||||
//
|
||||
// The zlib License
|
||||
//
|
||||
// Copyright (C) 2024 Marc Schöndorf
|
||||
//
|
||||
// This software is provided 'as-is', without any express or implied warranty. In
|
||||
// no event will the authors be held liable for any damages arising from the use of
|
||||
// this software.
|
||||
//
|
||||
// Permission is granted to anyone to use this software for any purpose, including
|
||||
// commercial applications, and to alter it and redistribute it freely, subject to
|
||||
// the following restrictions:
|
||||
//
|
||||
// 1. The origin of this software must not be misrepresented; you must not claim
|
||||
// that you wrote the original software. If you use this software in a product,
|
||||
// an acknowledgment in the product documentation would be appreciated but is
|
||||
// not required.
|
||||
//
|
||||
// 2. Altered source versions must be plainly marked as such, and must not be
|
||||
// misrepresented as being the original software.
|
||||
//
|
||||
// 3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
#+build arm64
|
||||
package sha2
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:simd"
|
||||
import "core:simd/arm"
|
||||
import "core:sys/info"
|
||||
|
||||
// is_hardware_accelerated_512 returns true if and only if (⟺) hardware
|
||||
// accelerated SHA-384, SHA-512, and SHA-512/256 are supported.
|
||||
is_hardware_accelerated_512 :: proc "contextless" () -> bool {
|
||||
req_features :: info.CPU_Features{
|
||||
.asimd,
|
||||
.sha512,
|
||||
.sha3, // XXX: LLVM groups these under `sha3`.
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
K_0 :: simd.u64x2{0x428a2f98d728ae22, 0x7137449123ef65cd}
|
||||
@(private = "file")
|
||||
K_1 :: simd.u64x2{0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc}
|
||||
@(private = "file")
|
||||
K_2 :: simd.u64x2{0x3956c25bf348b538, 0x59f111f1b605d019}
|
||||
@(private = "file")
|
||||
K_3 :: simd.u64x2{0x923f82a4af194f9b, 0xab1c5ed5da6d8118}
|
||||
@(private = "file")
|
||||
K_4 :: simd.u64x2{0xd807aa98a3030242, 0x12835b0145706fbe}
|
||||
@(private = "file")
|
||||
K_5 :: simd.u64x2{0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2}
|
||||
@(private = "file")
|
||||
K_6 :: simd.u64x2{0x72be5d74f27b896f, 0x80deb1fe3b1696b1}
|
||||
@(private = "file")
|
||||
K_7 :: simd.u64x2{0x9bdc06a725c71235, 0xc19bf174cf692694}
|
||||
@(private = "file")
|
||||
K_8 :: simd.u64x2{0xe49b69c19ef14ad2, 0xefbe4786384f25e3}
|
||||
@(private = "file")
|
||||
K_9 :: simd.u64x2{0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65}
|
||||
@(private = "file")
|
||||
K_10 :: simd.u64x2{0x2de92c6f592b0275, 0x4a7484aa6ea6e483}
|
||||
@(private = "file")
|
||||
K_11 :: simd.u64x2{0x5cb0a9dcbd41fbd4, 0x76f988da831153b5}
|
||||
@(private = "file")
|
||||
K_12 :: simd.u64x2{0x983e5152ee66dfab, 0xa831c66d2db43210}
|
||||
@(private = "file")
|
||||
K_13 :: simd.u64x2{0xb00327c898fb213f, 0xbf597fc7beef0ee4}
|
||||
@(private = "file")
|
||||
K_14 :: simd.u64x2{0xc6e00bf33da88fc2, 0xd5a79147930aa725}
|
||||
@(private = "file")
|
||||
K_15 :: simd.u64x2{0x06ca6351e003826f, 0x142929670a0e6e70}
|
||||
@(private = "file")
|
||||
K_16 :: simd.u64x2{0x27b70a8546d22ffc, 0x2e1b21385c26c926}
|
||||
@(private = "file")
|
||||
K_17 :: simd.u64x2{0x4d2c6dfc5ac42aed, 0x53380d139d95b3df}
|
||||
@(private = "file")
|
||||
K_18 :: simd.u64x2{0x650a73548baf63de, 0x766a0abb3c77b2a8}
|
||||
@(private = "file")
|
||||
K_19 :: simd.u64x2{0x81c2c92e47edaee6, 0x92722c851482353b}
|
||||
@(private = "file")
|
||||
K_20 :: simd.u64x2{0xa2bfe8a14cf10364, 0xa81a664bbc423001}
|
||||
@(private = "file")
|
||||
K_21 :: simd.u64x2{0xc24b8b70d0f89791, 0xc76c51a30654be30}
|
||||
@(private = "file")
|
||||
K_22 :: simd.u64x2{0xd192e819d6ef5218, 0xd69906245565a910}
|
||||
@(private = "file")
|
||||
K_23 :: simd.u64x2{0xf40e35855771202a, 0x106aa07032bbd1b8}
|
||||
@(private = "file")
|
||||
K_24 :: simd.u64x2{0x19a4c116b8d2d0c8, 0x1e376c085141ab53}
|
||||
@(private = "file")
|
||||
K_25 :: simd.u64x2{0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8}
|
||||
@(private = "file")
|
||||
K_26 :: simd.u64x2{0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb}
|
||||
@(private = "file")
|
||||
K_27 :: simd.u64x2{0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3}
|
||||
@(private = "file")
|
||||
K_28 :: simd.u64x2{0x748f82ee5defb2fc, 0x78a5636f43172f60}
|
||||
@(private = "file")
|
||||
K_29 :: simd.u64x2{0x84c87814a1f0ab72, 0x8cc702081a6439ec}
|
||||
@(private = "file")
|
||||
K_30 :: simd.u64x2{0x90befffa23631e28, 0xa4506cebde82bde9}
|
||||
@(private = "file")
|
||||
K_31 :: simd.u64x2{0xbef9a3f7b2c67915, 0xc67178f2e372532b}
|
||||
@(private = "file")
|
||||
K_32 :: simd.u64x2{0xca273eceea26619c, 0xd186b8c721c0c207}
|
||||
@(private = "file")
|
||||
K_33 :: simd.u64x2{0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178}
|
||||
@(private = "file")
|
||||
K_34 :: simd.u64x2{0x06f067aa72176fba, 0x0a637dc5a2c898a6}
|
||||
@(private = "file")
|
||||
K_35 :: simd.u64x2{0x113f9804bef90dae, 0x1b710b35131c471b}
|
||||
@(private = "file")
|
||||
K_36 :: simd.u64x2{0x28db77f523047d84, 0x32caab7b40c72493}
|
||||
@(private = "file")
|
||||
K_37 :: simd.u64x2{0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c}
|
||||
@(private = "file")
|
||||
K_38 :: simd.u64x2{0x4cc5d4becb3e42b6, 0x597f299cfc657e2a}
|
||||
@(private = "file")
|
||||
K_39 :: simd.u64x2{0x5fcb6fab3ad6faec, 0x6c44198c4a475817}
|
||||
|
||||
@(private, enable_target_feature = "neon,sha3")
|
||||
sha512_transf_hw :: proc "contextless" (ctx: ^Context_512, data: []byte) #no_bounds_check {
|
||||
state_0 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[0]))
|
||||
state_1 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[2]))
|
||||
state_2 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[4]))
|
||||
state_3 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[6]))
|
||||
|
||||
data := data
|
||||
for len(data) >= BLOCK_SIZE_512 {
|
||||
ab_save, cd_save, ef_save, gh_save := state_0, state_1, state_2, state_3
|
||||
|
||||
// Load message
|
||||
msg_0 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data)))
|
||||
msg_1 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[16:])))
|
||||
msg_2 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[32:])))
|
||||
msg_3 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[48:])))
|
||||
msg_4 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[64:])))
|
||||
msg_5 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[80:])))
|
||||
msg_6 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[96:])))
|
||||
msg_7 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[112:])))
|
||||
|
||||
// Reverse for little endian
|
||||
when ODIN_ENDIAN == .Little {
|
||||
msg_0 = byteswap_u64x2(msg_0)
|
||||
msg_1 = byteswap_u64x2(msg_1)
|
||||
msg_2 = byteswap_u64x2(msg_2)
|
||||
msg_3 = byteswap_u64x2(msg_3)
|
||||
msg_4 = byteswap_u64x2(msg_4)
|
||||
msg_5 = byteswap_u64x2(msg_5)
|
||||
msg_6 = byteswap_u64x2(msg_6)
|
||||
msg_7 = byteswap_u64x2(msg_7)
|
||||
}
|
||||
|
||||
// Rounds 0-1
|
||||
msg_k := simd.add(msg_0, K_0)
|
||||
tmp_0 := simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 := arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2))
|
||||
|
||||
// Rounds 2-3
|
||||
msg_k = simd.add(msg_1, K_1)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2))
|
||||
|
||||
// Rounds 4-5
|
||||
msg_k = simd.add(msg_2, K_2)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2))
|
||||
|
||||
// Rounds 6-7
|
||||
msg_k = simd.add(msg_3, K_3)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2))
|
||||
|
||||
// Rounds 8-9
|
||||
msg_k = simd.add(msg_4, K_4)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2))
|
||||
|
||||
// Rounds 10-11
|
||||
msg_k = simd.add(msg_5, K_5)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2))
|
||||
|
||||
// Rounds 12-13
|
||||
msg_k = simd.add(msg_6, K_6)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2))
|
||||
|
||||
// Rounds 14-15
|
||||
msg_k = simd.add(msg_7, K_7)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2))
|
||||
|
||||
// Rounds 16-17
|
||||
msg_k = simd.add(msg_0, K_8)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2))
|
||||
|
||||
// Rounds 18-19
|
||||
msg_k = simd.add(msg_1, K_9)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2))
|
||||
|
||||
// Rounds 20-21
|
||||
msg_k = simd.add(msg_2, K_10)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2))
|
||||
|
||||
// Rounds 22-23
|
||||
msg_k = simd.add(msg_3, K_11)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2))
|
||||
|
||||
// Rounds 24-25
|
||||
msg_k = simd.add(msg_4, K_12)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2))
|
||||
|
||||
// Rounds 26-27
|
||||
msg_k = simd.add(msg_5, K_13)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2))
|
||||
|
||||
// Rounds 28-29
|
||||
msg_k = simd.add(msg_6, K_14)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2))
|
||||
|
||||
// Rounds 30-31
|
||||
msg_k = simd.add(msg_7, K_15)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2))
|
||||
|
||||
// Rounds 32-33
|
||||
msg_k = simd.add(msg_0, K_16)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2))
|
||||
|
||||
// Rounds 34-35
|
||||
msg_k = simd.add(msg_1, K_17)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2))
|
||||
|
||||
// Rounds 36-37
|
||||
msg_k = simd.add(msg_2, K_18)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2))
|
||||
|
||||
// Rounds 38-39
|
||||
msg_k = simd.add(msg_3, K_19)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2))
|
||||
|
||||
// Rounds 40-41
|
||||
msg_k = simd.add(msg_4, K_20)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2))
|
||||
|
||||
// Rounds 42-43
|
||||
msg_k = simd.add(msg_5, K_21)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2))
|
||||
|
||||
// Rounds 44-45
|
||||
msg_k = simd.add(msg_6, K_22)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2))
|
||||
|
||||
// Rounds 46-47
|
||||
msg_k = simd.add(msg_7, K_23)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2))
|
||||
|
||||
// Rounds 48-49
|
||||
msg_k = simd.add(msg_0, K_24)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2))
|
||||
|
||||
// Rounds 50-51
|
||||
msg_k = simd.add(msg_1, K_25)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2))
|
||||
|
||||
// Rounds 52-53
|
||||
msg_k = simd.add(msg_2, K_26)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2))
|
||||
|
||||
// Rounds 54-55
|
||||
msg_k = simd.add(msg_3, K_27)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2))
|
||||
|
||||
// Rounds 56-57
|
||||
msg_k = simd.add(msg_4, K_28)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2))
|
||||
|
||||
// Rounds 58-59
|
||||
msg_k = simd.add(msg_5, K_29)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2))
|
||||
|
||||
// Rounds 60-61
|
||||
msg_k = simd.add(msg_6, K_30)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2))
|
||||
|
||||
// Rounds 62-63
|
||||
msg_k = simd.add(msg_7, K_31)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2))
|
||||
|
||||
// Rounds 64-65
|
||||
msg_k = simd.add(msg_0, K_32)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
|
||||
// Rounds 66-67
|
||||
msg_k = simd.add(msg_1, K_33)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
|
||||
// Rounds 68-69
|
||||
msg_k = simd.add(msg_2, K_34)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
|
||||
// Rounds 70-71
|
||||
msg_k = simd.add(msg_3, K_35)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
|
||||
// Rounds 72-73
|
||||
msg_k = simd.add(msg_4, K_36)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
|
||||
state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
|
||||
state_1 = simd.add(state_1, tmp_1)
|
||||
|
||||
// Rounds 74-75
|
||||
msg_k = simd.add(msg_5, K_37)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
|
||||
state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
|
||||
state_0 = simd.add(state_0, tmp_1)
|
||||
|
||||
// Rounds 76-77
|
||||
msg_k = simd.add(msg_6, K_38)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
|
||||
state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
|
||||
state_3 = simd.add(state_3, tmp_1)
|
||||
|
||||
// Rounds 78-79
|
||||
msg_k = simd.add(msg_7, K_39)
|
||||
tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
|
||||
tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
|
||||
state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
|
||||
state_2 = simd.add(state_2, tmp_1)
|
||||
|
||||
// Combine state
|
||||
state_0 = simd.add(state_0, ab_save)
|
||||
state_1 = simd.add(state_1, cd_save)
|
||||
state_2 = simd.add(state_2, ef_save)
|
||||
state_3 = simd.add(state_3, gh_save)
|
||||
|
||||
data = data[BLOCK_SIZE_512:]
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[0]), state_0)
|
||||
intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[2]), state_1)
|
||||
intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[4]), state_2)
|
||||
intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[6]), state_3)
|
||||
}
|
||||
|
||||
when ODIN_ENDIAN == .Little {
|
||||
// byteswap_u64x2 reverses the byte order inside each of the two 64-bit
// lanes of `a` and returns the result; the lanes themselves stay in place.
// It is only declared on little-endian targets (see the enclosing `when`).
// NOTE(review): presumably used to convert SHA-512's big-endian message
// words on load - the call sites are outside this view, confirm there.
@(private = "file", enable_target_feature = "neon")
|
||||
byteswap_u64x2 :: #force_inline proc "contextless" (a: simd.u64x2) -> simd.u64x2 {
|
||||
// Reinterpret the vector as 16 bytes, permute them, then reinterpret back.
return transmute(simd.u64x2)(
|
||||
simd.shuffle(
|
||||
// Both operands are the same vector, so every index selects a byte of `a`.
transmute(simd.u8x16)(a),
|
||||
transmute(simd.u8x16)(a),
|
||||
// Output bytes 0..7: bytes 7..0 of the input (first u64 lane reversed).
7, 6, 5, 4, 3, 2, 1, 0,
|
||||
// Output bytes 8..15: bytes 15..8 of the input (second u64 lane reversed).
15, 14, 13, 12, 11, 10, 9, 8,
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
13
core/crypto/sha2/sha512_impl_hw_gen.odin
Normal file
13
core/crypto/sha2/sha512_impl_hw_gen.odin
Normal file
@@ -0,0 +1,13 @@
|
||||
#+build !arm64
|
||||
package sha2
|
||||
|
||||
// is_hardware_accelerated_512 returns true if and only if (⟺) hardware
|
||||
// accelerated SHA-384, SHA-512, and SHA-512/256 are supported.
|
||||
is_hardware_accelerated_512 :: proc "contextless" () -> bool {
|
||||
// This file is only built for non-arm64 targets (`#+build !arm64`), and it
// provides no hardware implementation, so the answer is always false.
return false
|
||||
}
|
||||
|
||||
// sha512_transf_hw is the placeholder hardware transform for targets built
// from this file (`#+build !arm64`). It does not touch `ctx` or `data`; it
// unconditionally panics with ERR_HW_NOT_SUPPORTED.
// NOTE(review): callers are presumably expected to check
// is_hardware_accelerated_512 before dispatching here - confirm at the call site.
@(private)
|
||||
sha512_transf_hw :: proc "contextless" (ctx: ^Context_512, data: []byte) {
|
||||
panic_contextless(ERR_HW_NOT_SUPPORTED)
|
||||
}
|
||||
@@ -79,7 +79,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)
|
||||
|
||||
@@ -80,7 +80,7 @@ update :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
ensure(ctx.is_initialized)
|
||||
|
||||
@@ -31,7 +31,7 @@ write_element :: proc(ctx: ^Context, data: []byte) {
|
||||
// final finalizes the Context, writes the digest to hash, and calls
|
||||
// reset on the Context.
|
||||
//
|
||||
// Iff finalize_clone is set, final will work on a copy of the Context,
|
||||
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
|
||||
// which is useful for calculating rolling digests.
|
||||
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
|
||||
_sha3.final_cshake((^_sha3.Context)(ctx), hash, finalize_clone)
|
||||
|
||||
@@ -285,6 +285,32 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
|
||||
}
|
||||
return
|
||||
|
||||
|
||||
case runtime.Type_Info_Fixed_Capacity_Dynamic_Array:
|
||||
array_data := uintptr(v.data)
|
||||
array_len := (^int)(array_data + info.len_offset)^
|
||||
if info.elem.id == byte {
|
||||
raw := runtime.Raw_Slice{v.data, array_len}
|
||||
return err_conv(_encode_bytes(e, transmute([]byte)raw))
|
||||
}
|
||||
|
||||
err_conv(_encode_u64(e, u64(array_len), .Array)) or_return
|
||||
|
||||
if impl, ok := _tag_implementations_type[info.elem.id]; ok {
|
||||
for i in 0..<array_len {
|
||||
data := array_data + uintptr(i*info.elem_size)
|
||||
impl->marshal(e, any{rawptr(data), info.elem.id}) or_return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
elem_ti := runtime.type_info_core(type_info_of(info.elem.id))
|
||||
for i in 0..<array_len {
|
||||
data := array_data + uintptr(i*info.elem_size)
|
||||
_marshal_into_encoder(e, any{rawptr(data), info.elem.id}, elem_ti) or_return
|
||||
}
|
||||
return
|
||||
|
||||
case runtime.Type_Info_Slice:
|
||||
if info.elem.id == byte {
|
||||
raw := (^[]byte)(v.data)
|
||||
|
||||
@@ -389,6 +389,23 @@ _unmarshal_bytes :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header
|
||||
n := copy(slice, bytes)
|
||||
assert(n == len(bytes))
|
||||
return
|
||||
|
||||
case reflect.Type_Info_Fixed_Capacity_Dynamic_Array:
|
||||
elem_base := reflect.type_info_base(t.elem)
|
||||
|
||||
if elem_base.id != byte { return _unsupported(v, hdr) }
|
||||
|
||||
bytes := err_conv(_decode_bytes(d, add, allocator=context.temp_allocator)) or_return
|
||||
defer delete(bytes, context.temp_allocator)
|
||||
|
||||
if len(bytes) > t.capacity { return _unsupported(v, hdr) }
|
||||
|
||||
// Copy into array type, delete original.
|
||||
slice := ([^]byte)(v.data)[:len(bytes)]
|
||||
n := copy(slice, bytes)
|
||||
assert(n == len(bytes))
|
||||
(^int)(uintptr(v.data) + t.len_offset)^ = n
|
||||
return
|
||||
}
|
||||
|
||||
return _unsupported(v, hdr)
|
||||
@@ -553,6 +570,21 @@ _unmarshal_array :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header
|
||||
if out_of_space { return _unsupported(v, hdr) }
|
||||
return
|
||||
|
||||
case reflect.Type_Info_Fixed_Capacity_Dynamic_Array:
|
||||
length, _ := err_conv(_decode_len_container(d, add)) or_return
|
||||
if length > t.capacity {
|
||||
return _unsupported(v, hdr)
|
||||
}
|
||||
|
||||
da := mem.Raw_Dynamic_Array{rawptr(v.data), 0, length, allocator }
|
||||
|
||||
out_of_space := assign_array(d, &da, t.elem, length, growable=false) or_return
|
||||
if out_of_space { return _unsupported(v, hdr) }
|
||||
|
||||
(^int)(uintptr(v.data) + t.len_offset)^ = length
|
||||
|
||||
return
|
||||
|
||||
case reflect.Type_Info_Complex:
|
||||
length, _ := err_conv(_decode_len_container(d, add)) or_return
|
||||
if length > 2 {
|
||||
@@ -661,8 +693,7 @@ _unmarshal_map :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header,
|
||||
unknown := length == -1
|
||||
fields := reflect.struct_fields_zipped(ti.id)
|
||||
|
||||
idx := 0
|
||||
for ; idx < len(fields) && (unknown || idx < length); idx += 1 {
|
||||
for idx := 0; unknown || idx < length; idx += 1 {
|
||||
// Decode key, keys can only be strings.
|
||||
key: string
|
||||
if keyv, kerr := decode_key(d, v, context.temp_allocator); unknown && kerr == .Break {
|
||||
@@ -710,16 +741,6 @@ _unmarshal_map :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header,
|
||||
_unmarshal_value(d, fany, _decode_header(r) or_return) or_return
|
||||
}
|
||||
|
||||
// If there are fields left in the map that did not get decoded into the struct, decode and discard them.
|
||||
if !unknown {
|
||||
for _ in idx..<length {
|
||||
key := err_conv(_decode_from_decoder(d, allocator=context.temp_allocator)) or_return
|
||||
destroy(key, context.temp_allocator)
|
||||
val := err_conv(_decode_from_decoder(d, allocator=context.temp_allocator)) or_return
|
||||
destroy(val, context.temp_allocator)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
|
||||
case reflect.Type_Info_Map:
|
||||
|
||||
@@ -320,6 +320,16 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
|
||||
}
|
||||
opt_write_end(w, opt, ']') or_return
|
||||
|
||||
case runtime.Type_Info_Fixed_Capacity_Dynamic_Array:
|
||||
opt_write_start(w, opt, '[') or_return
|
||||
len := (^int)(uintptr(v.data) + info.len_offset)^
|
||||
for i in 0..<len {
|
||||
opt_write_iteration(w, opt, i == 0) or_return
|
||||
data := uintptr(v.data) + uintptr(i*info.elem_size)
|
||||
marshal_to_writer(w, any{rawptr(data), info.elem.id}, opt) or_return
|
||||
}
|
||||
opt_write_end(w, opt, ']') or_return
|
||||
|
||||
case runtime.Type_Info_Slice:
|
||||
opt_write_start(w, opt, '[') or_return
|
||||
slice := cast(^mem.Raw_Slice)v.data
|
||||
|
||||
@@ -34,35 +34,6 @@ Register_User_Unmarshaler_Error :: enum {
|
||||
Unmarshaler_Previously_Found,
|
||||
}
|
||||
|
||||
// Example User Unmarshaler:
|
||||
// Custom Unmarshaler for `int`
|
||||
// Some_Unmarshaler :: proc(p: ^json.Parser, v: any) -> json.Unmarshal_Error {
|
||||
// token := p.curr_token.text
|
||||
// i, ok := strconv.parse_i64_of_base(token, 2)
|
||||
// if !ok {
|
||||
// return .Invalid_Data
|
||||
//
|
||||
// }
|
||||
// (^int)(v.data)^ = int(i)
|
||||
// return .None
|
||||
// }
|
||||
//
|
||||
// _main :: proc() {
|
||||
// // Ensure the json._user_unmarshaler map is initialized
|
||||
// json.set_user_unmarshalers(new(map[typeid]json.User_Unmarshaler))
|
||||
// reg_err := json.register_user_unmarshaler(type_info_of(int).id, Some_Unmarshaler)
|
||||
// assert(reg_err == .None)
|
||||
//
|
||||
// data := `{"value":101010}`
|
||||
// SomeType :: struct {
|
||||
// value: int,
|
||||
// }
|
||||
// y: SomeType
|
||||
//
|
||||
// unmarshal_err := json.unmarshal(transmute([]byte)data, &y)
|
||||
// fmt.println(y, unmarshal_err)
|
||||
// }
|
||||
|
||||
// NOTE(Jeroen): This is a pointer to prevent accidental additions
|
||||
// it is prefixed with `_` rather than marked with a private attribute so that users can access it if necessary
|
||||
_user_unmarshalers: ^map[typeid]User_Unmarshaler
|
||||
@@ -72,23 +43,60 @@ _user_unmarshalers: ^map[typeid]User_Unmarshaler
|
||||
// Inputs:
|
||||
// - m: A pointer to a map of typeids to User_Unmarshaler procs.
|
||||
//
|
||||
// NOTE: Must be called before using register_user_unmarshaler.
|
||||
// NOTE: Must be called before using `register_user_unmarshaler`.
|
||||
//
|
||||
set_user_unmarshalers :: proc(m: ^map[typeid]User_Unmarshaler) {
|
||||
// The registry pointer may only be installed once; a second call trips this assertion.
assert(_user_unmarshalers == nil, "set_user_unmarshalers must not be called more than once.")
|
||||
// Store the caller's map pointer in the package-level `_user_unmarshalers`;
// the map itself is not copied.
_user_unmarshalers = m
|
||||
}
|
||||
|
||||
// Registers a user-defined unmarshaler for a specific typeid
|
||||
//
|
||||
// Inputs:
|
||||
// - id: The typeid of the custom type.
|
||||
// - unmarshaler: The User_Unmarshaler function for the custom type.
|
||||
//
|
||||
// Returns: A Register_User_Unmarshaler_Error value indicating the success or failure of the operation.
|
||||
//
|
||||
// WARNING: set_user_unmarshalers must be called before using this procedure.
|
||||
//
|
||||
/*
|
||||
Registers a user-defined unmarshaler for a specific `typeid`.
|
||||
|
||||
WARNING: set_user_unmarshalers must be called before using this procedure.
|
||||
|
||||
Inputs:
|
||||
- id: The `typeid` of the custom type.
|
||||
- unmarshaler: The `User_Unmarshaler` function for the custom type.
|
||||
|
||||
Example:
|
||||
import "core:fmt"
|
||||
import "core:encoding/json"
|
||||
import "core:strconv"
|
||||
|
||||
// Custom Unmarshaler for `int`
|
||||
some_unmarshaler :: proc(p: ^json.Parser, v: any) -> json.Unmarshal_Error {
|
||||
token := p.curr_token.text
|
||||
i, ok := strconv.parse_i64_of_base(token, 2)
|
||||
if !ok {
|
||||
return .Invalid_Data
|
||||
}
|
||||
|
||||
(^int)(v.data)^ = int(i)
|
||||
|
||||
json.advance_token(p)
|
||||
return nil
|
||||
}
|
||||
|
||||
register_user_unmarshaler_example :: proc() {
|
||||
// Ensure the `json._user_unmarshalers` map is initialized.
|
||||
json.set_user_unmarshalers(new(map[typeid]json.User_Unmarshaler))
|
||||
reg_err := json.register_user_unmarshaler(typeid_of(int), some_unmarshaler)
|
||||
assert(reg_err == .None)
|
||||
|
||||
data := `{"value":101010}`
|
||||
SomeType :: struct {
|
||||
value: int,
|
||||
}
|
||||
y: SomeType
|
||||
|
||||
unmarshal_err := json.unmarshal(transmute([]byte)data, &y)
|
||||
fmt.println(y, unmarshal_err)
|
||||
}
|
||||
|
||||
Output:
|
||||
SomeType{value = 42} nil
|
||||
*/
|
||||
register_user_unmarshaler :: proc(id: typeid, unmarshaler: User_Unmarshaler) -> Register_User_Unmarshaler_Error {
|
||||
if _user_unmarshalers == nil {
|
||||
return .No_User_Unmarshaler
|
||||
|
||||
@@ -2592,11 +2592,19 @@ fmt_named_buitlin_custom_formatters :: proc(fi: ^Info, v: any, verb: rune, info:
|
||||
prec = 6
|
||||
buf[w] = 'm'
|
||||
}
|
||||
if fi.space {
|
||||
w -= 1
|
||||
buf[w] = ' '
|
||||
}
|
||||
w, u = ffrac(buf[:w], u, prec)
|
||||
w = fint(buf[:w], u)
|
||||
} else {
|
||||
w -= 1
|
||||
buf[w] = 's'
|
||||
if fi.space {
|
||||
w -= 1
|
||||
buf[w] = ' '
|
||||
}
|
||||
w, u = ffrac(buf[:w], u, 9)
|
||||
w = fint(buf[:w], u%60)
|
||||
u /= 60
|
||||
@@ -3238,6 +3246,21 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
|
||||
}
|
||||
fmt_array(fi, ptr, n, info.elem_size, info.elem, verb)
|
||||
|
||||
|
||||
case runtime.Type_Info_Fixed_Capacity_Dynamic_Array:
|
||||
n := (^int)(uintptr(v.data) + info.len_offset)^
|
||||
|
||||
ptr := v.data // data is stored at the start
|
||||
if ol, ok := fi.optional_len.?; ok {
|
||||
fi.optional_len = nil
|
||||
n = min(n, ol)
|
||||
} else if fi.use_nul_termination {
|
||||
fi.use_nul_termination = false
|
||||
fmt_array_nul_terminated(fi, ptr, n, info.elem_size, info.elem, verb)
|
||||
return
|
||||
}
|
||||
fmt_array(fi, ptr, n, info.elem_size, info.elem, verb)
|
||||
|
||||
case runtime.Type_Info_Simd_Vector:
|
||||
io.write_byte(fi.writer, '<', &fi.n)
|
||||
defer io.write_byte(fi.writer, '>', &fi.n)
|
||||
|
||||
@@ -436,7 +436,7 @@ copy_buffer :: proc(dst: Writer, src: Reader, buf: []byte) -> (written: i64, err
|
||||
|
||||
// copy_n copies n bytes (or till an error) from src to dst.
|
||||
// It returns the number of bytes copied and the first error that occurred whilst copying, if any.
|
||||
// On return, written == n IFF err == nil
|
||||
// On return, written == n if and only if (⟺) err == nil
|
||||
copy_n :: proc(dst: Writer, src: Reader, n: i64) -> (written: i64, err: Error) {
|
||||
nsrc := limited_reader_init(&Limited_Reader{}, src, n)
|
||||
written, err = copy(dst, nsrc)
|
||||
|
||||
@@ -101,7 +101,7 @@ internal_int_power_modulo :: proc(res, G, X, P: ^Int, allocator := context.alloc
|
||||
If the modulus is odd or dr != 0 use the montgomery method.
|
||||
*/
|
||||
if internal_int_is_odd(P) || dr != 0 {
|
||||
return _private_int_exponent_mod(res, G, X, P, dr)
|
||||
return _private_int_exponent_mod_fast(res, G, X, P, dr)
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -439,8 +439,14 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
|
||||
return _private_int_mul_high_comba(dest, a, b, digits)
|
||||
}
|
||||
|
||||
internal_grow(dest, a.used + b.used + 1) or_return
|
||||
dest.used = a.used + b.used + 1
|
||||
/*
|
||||
Set up temporary output `Int`, which we'll swap for `dest` when done.
|
||||
*/
|
||||
|
||||
t := &Int{}
|
||||
|
||||
internal_grow(t, a.used + b.used + 1) or_return
|
||||
t.used = a.used + b.used + 1
|
||||
|
||||
pa := a.used
|
||||
pb := b.used
|
||||
@@ -451,20 +457,23 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
|
||||
/*
|
||||
Calculate the double precision result.
|
||||
*/
|
||||
r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
|
||||
r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
|
||||
|
||||
/*
|
||||
Get the lower part.
|
||||
*/
|
||||
dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
|
||||
t.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
|
||||
|
||||
/*
|
||||
Carry the carry.
|
||||
*/
|
||||
carry = DIGIT(r >> _WORD(_DIGIT_BITS))
|
||||
}
|
||||
dest.digit[ix + pb] = carry
|
||||
t.digit[ix + pb] = carry
|
||||
}
|
||||
|
||||
internal_swap(dest, t)
|
||||
internal_destroy(t)
|
||||
return internal_clamp(dest)
|
||||
}
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ Fixed32_32 :: distinct Fixed(i64, 32)
|
||||
Fixed52_12 :: distinct Fixed(i64, 12)
|
||||
|
||||
|
||||
init_from_f64 :: proc(x: ^$T/Fixed($Backing, $Fraction_Width), val: f64) {
|
||||
init_from_f64 :: proc "contextless" (x: ^$T/Fixed($Backing, $Fraction_Width), val: f64) {
|
||||
i, f := math.modf(math.abs(val))
|
||||
x.i = Backing(f * (1<<Fraction_Width))
|
||||
x.i &= 1<<Fraction_Width - 1
|
||||
@@ -39,13 +39,13 @@ init_from_f64 :: proc(x: ^$T/Fixed($Backing, $Fraction_Width), val: f64) {
|
||||
}
|
||||
}
|
||||
|
||||
init_from_parts :: proc(x: ^$T/Fixed($Backing, $Fraction_Width), integer, fraction: Backing) {
|
||||
init_from_parts :: proc "contextless" (x: ^$T/Fixed($Backing, $Fraction_Width), integer, fraction: Backing) {
|
||||
x.i = fraction
|
||||
x.i &= 1<<Fraction_Width - 1
|
||||
x.i |= (integer << Fraction_Width)
|
||||
}
|
||||
|
||||
to_f64 :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> f64 {
|
||||
to_f64 :: proc "contextless" (x: $T/Fixed($Backing, $Fraction_Width)) -> f64 {
|
||||
sign := -1.0 if x.i < 0 else 1.0
|
||||
num := math.abs(x.i)
|
||||
res := f64(num >> Fraction_Width)
|
||||
@@ -55,39 +55,39 @@ to_f64 :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> f64 {
|
||||
|
||||
|
||||
@(require_results)
|
||||
add :: proc(x, y: $T/Fixed) -> T {
|
||||
add :: proc "contextless" (x, y: $T/Fixed) -> T {
|
||||
return {x.i + y.i}
|
||||
}
|
||||
@(require_results)
|
||||
sub :: proc(x, y: $T/Fixed) -> T {
|
||||
sub :: proc "contextless" (x, y: $T/Fixed) -> T {
|
||||
return {x.i - y.i}
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
mul :: proc(x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
|
||||
mul :: proc "contextless" (x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
|
||||
z.i = intrinsics.fixed_point_mul(x.i, y.i, Fraction_Width)
|
||||
return
|
||||
}
|
||||
@(require_results)
|
||||
mul_sat :: proc(x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
|
||||
mul_sat :: proc "contextless" (x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
|
||||
z.i = intrinsics.fixed_point_mul_sat(x.i, y.i, Fraction_Width)
|
||||
return
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
div :: proc(x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
|
||||
div :: proc "contextless" (x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
|
||||
z.i = intrinsics.fixed_point_div(x.i, y.i, Fraction_Width)
|
||||
return
|
||||
}
|
||||
@(require_results)
|
||||
div_sat :: proc(x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
|
||||
div_sat :: proc "contextless" (x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
|
||||
z.i = intrinsics.fixed_point_div_sat(x.i, y.i, Fraction_Width)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@(require_results)
|
||||
floor :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
|
||||
floor :: proc "contextless" (x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
|
||||
if x.i >= 0 {
|
||||
return x.i >> Fraction_Width
|
||||
} else {
|
||||
@@ -95,11 +95,11 @@ floor :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
|
||||
}
|
||||
}
|
||||
@(require_results)
|
||||
ceil :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
|
||||
ceil :: proc "contextless" (x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
|
||||
return (x.i + (1 << Fraction_Width - 1)) >> Fraction_Width
|
||||
}
|
||||
@(require_results)
|
||||
round :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
|
||||
round :: proc "contextless" (x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
|
||||
return (x.i + (1 << (Fraction_Width - 1))) >> Fraction_Width
|
||||
}
|
||||
|
||||
@@ -163,7 +163,7 @@ to_string :: proc(x: $T/Fixed($Backing, $Fraction_Width), allocator := context.a
|
||||
}
|
||||
|
||||
|
||||
@(private)
|
||||
@(rodata, private)
|
||||
_power_of_two_table := [129]string{
|
||||
"0.5",
|
||||
"1",
|
||||
@@ -295,8 +295,3 @@ _power_of_two_table := [129]string{
|
||||
"85070591730234615865843651857942052864",
|
||||
"170141183460469231731687303715884105728",
|
||||
}
|
||||
|
||||
@(deprecated="Use write instead")
|
||||
append :: proc(dst: []byte, x: $T/Fixed($Backing, $Fraction_Width)) -> string {
|
||||
return write(dst, x)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,18 @@ package rand
|
||||
import "base:intrinsics"
|
||||
import "base:runtime"
|
||||
|
||||
when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
|
||||
// LLVM thinks that using SIMD for read_u64 is good,
|
||||
// when it causes a ~3x performance regression. As
|
||||
// far as I can tell, this behavior is limited to
|
||||
// Intel.
|
||||
@(private = "file")
|
||||
TARGET_FEATURES :: "-sse,-avx,-avx2"
|
||||
} else {
|
||||
@(private = "file")
|
||||
TARGET_FEATURES :: ""
|
||||
}
|
||||
|
||||
/*
|
||||
The state for a xoshiro256** pseudorandom generator.
|
||||
*/
|
||||
@@ -10,8 +22,9 @@ Xoshiro256_Random_State :: struct {
|
||||
s: [4]u64,
|
||||
}
|
||||
|
||||
@(enable_target_feature = TARGET_FEATURES)
|
||||
xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) {
|
||||
@(require_results)
|
||||
@(require_results, enable_target_feature = TARGET_FEATURES)
|
||||
read_u64 :: proc "contextless" (r: ^Xoshiro256_Random_State) -> u64 {
|
||||
// xoshiro256** output function and state transition
|
||||
|
||||
@@ -27,7 +40,7 @@ xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Gene
|
||||
|
||||
return result
|
||||
|
||||
rotate_left64 :: proc "contextless" (x: u64, k: int) -> u64 {
|
||||
rotate_left64 :: #force_inline proc "contextless" (x: u64, k: int) -> u64 {
|
||||
n :: 64
|
||||
s := uint(k) & (n-1)
|
||||
return x << s | x >> (n-s)
|
||||
|
||||
@@ -536,15 +536,8 @@ scratch_alloc_bytes_non_zeroed :: proc(
|
||||
// we don't need to be so strict about every byte.
|
||||
aligned_size += alignment - 1
|
||||
}
|
||||
if aligned_size <= len(s.data) {
|
||||
offset := uintptr(0)
|
||||
if s.curr_offset+aligned_size <= len(s.data) {
|
||||
offset = uintptr(s.curr_offset)
|
||||
} else {
|
||||
// The allocation will cause an overflow past the boundary of the
|
||||
// space available, so reset to the starting offset.
|
||||
offset = 0
|
||||
}
|
||||
if s.curr_offset+aligned_size <= len(s.data) {
|
||||
offset := uintptr(s.curr_offset)
|
||||
start := uintptr(raw_data(s.data))
|
||||
ptr := rawptr(offset+start)
|
||||
// We keep track of the original base pointer without extra alignment
|
||||
|
||||
@@ -467,13 +467,7 @@ Check whether a number is a power of two.
|
||||
This procedure checks whether a given pointer-sized unsigned integer contains
|
||||
a power-of-two value.
|
||||
*/
|
||||
@(require_results)
|
||||
is_power_of_two :: proc "contextless" (x: uintptr) -> bool {
|
||||
if x <= 0 {
|
||||
return false
|
||||
}
|
||||
return (x & (x-1)) == 0
|
||||
}
|
||||
is_power_of_two :: runtime.is_power_of_two_uintptr
|
||||
|
||||
/*
|
||||
Check if a pointer is aligned.
|
||||
@@ -497,11 +491,7 @@ bytes, `ptr` is returned.
|
||||
|
||||
The specified alignment must be a power of 2.
|
||||
*/
|
||||
@(require_results)
|
||||
align_forward_uintptr :: proc(ptr, align: uintptr) -> uintptr {
|
||||
assert(is_power_of_two(align))
|
||||
return (ptr + align-1) & ~(align-1)
|
||||
}
|
||||
align_forward_uintptr :: runtime.align_forward_uintptr
|
||||
|
||||
/*
|
||||
Align pointer forward.
|
||||
@@ -526,10 +516,7 @@ bytes, `ptr` is returned.
|
||||
|
||||
The specified alignment must be a power of 2.
|
||||
*/
|
||||
@(require_results)
|
||||
align_forward_int :: proc(ptr, align: int) -> int {
|
||||
return int(align_forward_uintptr(uintptr(ptr), uintptr(align)))
|
||||
}
|
||||
align_forward_int :: runtime.align_forward_int
|
||||
|
||||
/*
|
||||
Align uint forward.
|
||||
@@ -540,10 +527,7 @@ bytes, `ptr` is returned.
|
||||
|
||||
The specified alignment must be a power of 2.
|
||||
*/
|
||||
@(require_results)
|
||||
align_forward_uint :: proc(ptr, align: uint) -> uint {
|
||||
return uint(align_forward_uintptr(uintptr(ptr), uintptr(align)))
|
||||
}
|
||||
align_forward_uint :: runtime.align_forward_uint
|
||||
|
||||
/*
|
||||
Align uintptr backwards.
|
||||
@@ -626,32 +610,6 @@ reinterpret_copy :: proc "contextless" ($T: typeid, ptr: rawptr) -> (value: T) {
|
||||
return
|
||||
}
|
||||
|
||||
/*
|
||||
Dynamic array with a fixed capacity buffer.
|
||||
|
||||
This type represents dynamic arrays with a fixed-size backing buffer. Upon
|
||||
allocating memory beyond reaching the maximum capacity, allocations from fixed
|
||||
byte buffers return `nil` and no error.
|
||||
*/
|
||||
Fixed_Byte_Buffer :: distinct [dynamic]byte
|
||||
|
||||
/*
|
||||
Create a fixed byte buffer from a slice.
|
||||
*/
|
||||
@(require_results)
|
||||
make_fixed_byte_buffer :: proc "contextless" (backing: []byte) -> Fixed_Byte_Buffer {
|
||||
s := transmute(Raw_Slice)backing
|
||||
d: Raw_Dynamic_Array
|
||||
d.data = s.data
|
||||
d.len = 0
|
||||
d.cap = s.len
|
||||
d.allocator = Allocator{
|
||||
procedure = nil_allocator_proc,
|
||||
data = nil,
|
||||
}
|
||||
return transmute(Fixed_Byte_Buffer)d
|
||||
}
|
||||
|
||||
/*
|
||||
General-purpose align formula.
|
||||
|
||||
|
||||
@@ -141,9 +141,9 @@ arena_alloc_unguarded :: proc(arena: ^Arena, size: uint, alignment: uint, loc :=
|
||||
|
||||
needed := mem.align_forward_uint(size, alignment)
|
||||
needed = max(needed, arena.default_commit_size)
|
||||
block_size := max(needed, arena.minimum_block_size)
|
||||
block_size := max(needed, arena.minimum_block_size) + alignment
|
||||
|
||||
new_block := memory_block_alloc(needed, block_size, alignment, {}) or_return
|
||||
new_block := memory_block_alloc(needed, block_size) or_return
|
||||
new_block.prev = arena.curr_block
|
||||
arena.curr_block = new_block
|
||||
arena.total_reserved += new_block.reserved
|
||||
|
||||
@@ -10,7 +10,14 @@ map_file :: proc{
|
||||
}
|
||||
|
||||
map_file_from_path :: proc(filename: string, flags: Map_File_Flags) -> (data: []byte, error: Map_File_Error) {
|
||||
f, err := os.open(filename, os.O_RDWR)
|
||||
open_flags : os.File_Flags
|
||||
if .Read in flags {
|
||||
open_flags += {.Read}
|
||||
}
|
||||
if .Write in flags {
|
||||
open_flags += {.Write}
|
||||
}
|
||||
f, err := os.open(filename, open_flags)
|
||||
if err != nil {
|
||||
return nil, .Open_Failure
|
||||
}
|
||||
@@ -37,4 +44,4 @@ unmap_file :: proc(data: []byte) {
|
||||
if raw_data(data) != nil {
|
||||
_unmap_file(data)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user