diff --git a/core/crypto/_fiat/README.md b/core/crypto/_fiat/README.md new file mode 100644 index 000000000..cd510d442 --- /dev/null +++ b/core/crypto/_fiat/README.md @@ -0,0 +1,35 @@ +# fiat + +This package contains low level arithmetic required to implement certain +cryptographic primitives, ported from the [fiat-crypto project][1] +along with some higher-level helpers. + +## Notes + +fiat-crypto gives the choice of 3 licenses for derived works. The 1-Clause +BSD license is chosen as it is compatible with Odin's existing licensing. + +The routines are intended to be timing-safe, as long as the underlying +integer arithmetic is constant time. This is true on most systems commonly +used today, with the notable exception of WASM. + +While fiat-crypto provides both output targeting both 32-bit and 64-bit +architectures, only the 64-bit versions were used, as 32-bit architectures +are becoming increasingly uncommon and irrelevant. + +With the current Odin syntax, the Go output is trivially ported in most +cases and was used as the basis of the port. + +In the future, it would be better to auto-generate Odin either directly +by adding an appropriate code-gen backend written in Coq, or perhaps by +parsing the JSON output. + +As this is a port rather than autogenerated output, none of fiat-crypto's +formal verification guarantees apply, unless it is possible to prove binary +equivalence. + +For the most part, alterations to the base fiat-crypto generated code was +kept to a minimum, to aid auditability. This results in a somewhat +ideosyncratic style, and in some cases minor performance penalties. + +[1]: https://github.com/mit-plv/fiat-crypto diff --git a/core/crypto/_fiat/fiat.odin b/core/crypto/_fiat/fiat.odin new file mode 100644 index 000000000..ae9727149 --- /dev/null +++ b/core/crypto/_fiat/fiat.odin @@ -0,0 +1,24 @@ +package fiat + +// This package provides various helpers and types common to all of the +// fiat-crypto derived backends. + +// This code only works on a two's complement system. +#assert((-1 & 3) == 3) + +u1 :: distinct u8 +i1 :: distinct i8 + +cmovznz_u64 :: #force_inline proc "contextless" (arg1: u1, arg2, arg3: u64) -> (out1: u64) { + x1 := (u64(arg1) * 0xffffffffffffffff) + x2 := ((x1 & arg3) | ((~x1) & arg2)) + out1 = x2 + return +} + +cmovznz_u32 :: #force_inline proc "contextless" (arg1: u1, arg2, arg3: u32) -> (out1: u32) { + x1 := (u32(arg1) * 0xffffffff) + x2 := ((x1 & arg3) | ((~x1) & arg2)) + out1 = x2 + return +} diff --git a/core/crypto/_fiat/field_curve25519/field.odin b/core/crypto/_fiat/field_curve25519/field.odin new file mode 100644 index 000000000..faf8ae3f7 --- /dev/null +++ b/core/crypto/_fiat/field_curve25519/field.odin @@ -0,0 +1,138 @@ +package field_curve25519 + +import "core:crypto" +import "core:mem" + +fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element { + return transmute(^Loose_Field_Element)(arg1) +} + +fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element { + return transmute(^Tight_Field_Element)(arg1) +} + +fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) { + // Ignore the unused bit by copying the input and masking the bit off + // prior to deserialization. 
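+	//
+	// The encoding is 255 bits, little-endian; the high bit of the final
+	// byte is not part of the field element and is defined to be ignored
+	// (see RFC 7748), so it is cleared before the value is handed to the
+	// fiat-crypto deserializer.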
+	tmp1: [32]byte = ---
+	copy_slice(tmp1[:], arg1[:])
+	tmp1[31] &= 127
+
+	_fe_from_bytes(out1, &tmp1)
+
+	mem.zero_explicit(&tmp1, size_of(tmp1))
+}
+
+fe_equal :: proc "contextless" (arg1, arg2: ^Tight_Field_Element) -> int {
+	tmp2: [32]byte = ---
+
+	fe_to_bytes(&tmp2, arg2)
+	ret := fe_equal_bytes(arg1, &tmp2)
+
+	mem.zero_explicit(&tmp2, size_of(tmp2))
+
+	return ret
+}
+
+fe_equal_bytes :: proc "contextless" (arg1: ^Tight_Field_Element, arg2: ^[32]byte) -> int {
+	tmp1: [32]byte = ---
+
+	fe_to_bytes(&tmp1, arg1)
+
+	ret := crypto.compare_constant_time(tmp1[:], arg2[:])
+
+	mem.zero_explicit(&tmp1, size_of(tmp1))
+
+	return ret
+}
+
+fe_carry_pow2k :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element, arg2: uint) {
+	// Special case: `arg1^(2 * 0) = 1`, though this should never happen.
+	if arg2 == 0 {
+		fe_one(out1)
+		return
+	}
+
+	fe_carry_square(out1, arg1)
+	for _ in 1..<arg2 {
+		fe_carry_square(out1, fe_relax_cast(out1))
+	}
+}
+
+fe_carry_invsqrt :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) -> int {
+	// Inverse square root taken from Monocypher.
+
+	tmp1, tmp2, tmp3: Tight_Field_Element = ---, ---, ---
+
+	// t0 = x^((p-5)/8)
+	// Can be achieved with a simple double & add ladder,
+	// but it would be slower.
+	fe_carry_pow2k(&tmp1, arg1, 1)
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 2)
+	fe_carry_mul(&tmp2, arg1, fe_relax_cast(&tmp2))
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 1)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 5)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 10)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp3, fe_relax_cast(&tmp2), 20)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp3), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 10)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 50)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp3, fe_relax_cast(&tmp2), 100)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp3), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 50)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 2)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), arg1)
+
+	// quartic = x^((p-1)/4)
+	quartic := &tmp2
+	fe_carry_square(quartic, fe_relax_cast(&tmp1))
+	fe_carry_mul(quartic, fe_relax_cast(quartic), arg1)
+
+	// Serialize quartic once to save on repeated serialization/sanitization.
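+	//
+	// Going through fe_equal would re-serialize (and re-wipe) `quartic`
+	// once per comparison; the three checks against +1, -1, and -sqrt(-1)
+	// below share this single serialized copy instead.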
+ quartic_buf: [32]byte = --- + fe_to_bytes(&quartic_buf, quartic) + check := &tmp3 + + fe_one(check) + p1 := fe_equal_bytes(check, &quartic_buf) + fe_carry_opp(check, check) + m1 := fe_equal_bytes(check, &quartic_buf) + fe_carry_opp(check, &SQRT_M1) + ms := fe_equal_bytes(check, &quartic_buf) + + // if quartic == -1 or sqrt(-1) + // then isr = x^((p-1)/4) * sqrt(-1) + // else isr = x^((p-1)/4) + fe_carry_mul(out1, fe_relax_cast(&tmp1), fe_relax_cast(&SQRT_M1)) + fe_cond_assign(out1, &tmp1, (m1|ms) ~ 1) + + mem.zero_explicit(&tmp1, size_of(tmp1)) + mem.zero_explicit(&tmp2, size_of(tmp2)) + mem.zero_explicit(&tmp3, size_of(tmp3)) + mem.zero_explicit(&quartic_buf, size_of(quartic_buf)) + + return p1 | m1 +} + +fe_carry_inv :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + tmp1: Tight_Field_Element + + fe_carry_square(&tmp1, arg1) + _ = fe_carry_invsqrt(&tmp1, fe_relax_cast(&tmp1)) + fe_carry_square(&tmp1, fe_relax_cast(&tmp1)) + fe_carry_mul(out1, fe_relax_cast(&tmp1), arg1) + + mem.zero_explicit(&tmp1, size_of(tmp1)) +} diff --git a/core/crypto/_fiat/field_curve25519/field51.odin b/core/crypto/_fiat/field_curve25519/field51.odin new file mode 100644 index 000000000..e4ca98b57 --- /dev/null +++ b/core/crypto/_fiat/field_curve25519/field51.odin @@ -0,0 +1,616 @@ +// The BSD 1-Clause License (BSD-1-Clause) +// +// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file) +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design, +// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package field_curve25519 + +// The file provides arithmetic on the field Z/(2^255-19) using +// unsaturated 64-bit integer arithmetic. It is derived primarily +// from the machine generated Golang output from the fiat-crypto project. +// +// While the base implementation is provably correct, this implementation +// makes no such claims as the port and optimizations were done by hand. +// At some point, it may be worth adding support to fiat-crypto for +// generating Odin output. +// +// TODO: +// * When fiat-crypto supports it, using a saturated 64-bit limbs +// instead of 51-bit limbs will be faster, though the gains are +// minimal unless adcx/adox/mulx are used. 
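+//
+// A field element is stored as five unsaturated limbs in radix 2^51:
+//
+//   x = x[0] + x[1]*2^51 + x[2]*2^102 + x[3]*2^153 + x[4]*2^204  (mod 2^255 - 19)
+//
+// "Tight" elements have been carried so that each limb is roughly 51 bits,
+// while "Loose" elements may hold the extra headroom produced by
+// fe_add/fe_sub/fe_opp and must pass through a carrying operation
+// (fe_carry, fe_carry_mul, fe_carry_square, ...) before serialization
+// or comparison.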
+ +import fiat "core:crypto/_fiat" +import "core:math/bits" + +Loose_Field_Element :: distinct [5]u64 +Tight_Field_Element :: distinct [5]u64 + +SQRT_M1 := Tight_Field_Element{ + 1718705420411056, + 234908883556509, + 2233514472574048, + 2117202627021982, + 765476049583133, +} + +_addcarryx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) { + x1 := ((u64(arg1) + arg2) + arg3) + x2 := (x1 & 0x7ffffffffffff) + x3 := fiat.u1((x1 >> 51)) + out1 = x2 + out2 = x3 + return +} + +_subborrowx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) { + x1 := ((i64(arg2) - i64(arg1)) - i64(arg3)) + x2 := fiat.i1((x1 >> 51)) + x3 := (u64(x1) & 0x7ffffffffffff) + out1 = x3 + out2 = (0x0 - fiat.u1(x2)) + return +} + +fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) { + x2, x1 := bits.mul_u64(arg1[4], (arg2[4] * 0x13)) + x4, x3 := bits.mul_u64(arg1[4], (arg2[3] * 0x13)) + x6, x5 := bits.mul_u64(arg1[4], (arg2[2] * 0x13)) + x8, x7 := bits.mul_u64(arg1[4], (arg2[1] * 0x13)) + x10, x9 := bits.mul_u64(arg1[3], (arg2[4] * 0x13)) + x12, x11 := bits.mul_u64(arg1[3], (arg2[3] * 0x13)) + x14, x13 := bits.mul_u64(arg1[3], (arg2[2] * 0x13)) + x16, x15 := bits.mul_u64(arg1[2], (arg2[4] * 0x13)) + x18, x17 := bits.mul_u64(arg1[2], (arg2[3] * 0x13)) + x20, x19 := bits.mul_u64(arg1[1], (arg2[4] * 0x13)) + x22, x21 := bits.mul_u64(arg1[4], arg2[0]) + x24, x23 := bits.mul_u64(arg1[3], arg2[1]) + x26, x25 := bits.mul_u64(arg1[3], arg2[0]) + x28, x27 := bits.mul_u64(arg1[2], arg2[2]) + x30, x29 := bits.mul_u64(arg1[2], arg2[1]) + x32, x31 := bits.mul_u64(arg1[2], arg2[0]) + x34, x33 := bits.mul_u64(arg1[1], arg2[3]) + x36, x35 := bits.mul_u64(arg1[1], arg2[2]) + x38, x37 := bits.mul_u64(arg1[1], arg2[1]) + x40, x39 := bits.mul_u64(arg1[1], arg2[0]) + x42, x41 := bits.mul_u64(arg1[0], arg2[4]) + x44, x43 := bits.mul_u64(arg1[0], arg2[3]) + x46, x45 := bits.mul_u64(arg1[0], arg2[2]) + x48, x47 := bits.mul_u64(arg1[0], arg2[1]) + x50, x49 := bits.mul_u64(arg1[0], arg2[0]) + x51, x52 := bits.add_u64(x13, x7, u64(0x0)) + x53, _ := bits.add_u64(x14, x8, u64(fiat.u1(x52))) + x55, x56 := bits.add_u64(x17, x51, u64(0x0)) + x57, _ := bits.add_u64(x18, x53, u64(fiat.u1(x56))) + x59, x60 := bits.add_u64(x19, x55, u64(0x0)) + x61, _ := bits.add_u64(x20, x57, u64(fiat.u1(x60))) + x63, x64 := bits.add_u64(x49, x59, u64(0x0)) + x65, _ := bits.add_u64(x50, x61, u64(fiat.u1(x64))) + x67 := ((x63 >> 51) | ((x65 << 13) & 0xffffffffffffffff)) + x68 := (x63 & 0x7ffffffffffff) + x69, x70 := bits.add_u64(x23, x21, u64(0x0)) + x71, _ := bits.add_u64(x24, x22, u64(fiat.u1(x70))) + x73, x74 := bits.add_u64(x27, x69, u64(0x0)) + x75, _ := bits.add_u64(x28, x71, u64(fiat.u1(x74))) + x77, x78 := bits.add_u64(x33, x73, u64(0x0)) + x79, _ := bits.add_u64(x34, x75, u64(fiat.u1(x78))) + x81, x82 := bits.add_u64(x41, x77, u64(0x0)) + x83, _ := bits.add_u64(x42, x79, u64(fiat.u1(x82))) + x85, x86 := bits.add_u64(x25, x1, u64(0x0)) + x87, _ := bits.add_u64(x26, x2, u64(fiat.u1(x86))) + x89, x90 := bits.add_u64(x29, x85, u64(0x0)) + x91, _ := bits.add_u64(x30, x87, u64(fiat.u1(x90))) + x93, x94 := bits.add_u64(x35, x89, u64(0x0)) + x95, _ := bits.add_u64(x36, x91, u64(fiat.u1(x94))) + x97, x98 := bits.add_u64(x43, x93, u64(0x0)) + x99, _ := bits.add_u64(x44, x95, u64(fiat.u1(x98))) + x101, x102 := bits.add_u64(x9, x3, u64(0x0)) + x103, _ := bits.add_u64(x10, x4, u64(fiat.u1(x102))) + x105, x106 := bits.add_u64(x31, x101, 
u64(0x0)) + x107, _ := bits.add_u64(x32, x103, u64(fiat.u1(x106))) + x109, x110 := bits.add_u64(x37, x105, u64(0x0)) + x111, _ := bits.add_u64(x38, x107, u64(fiat.u1(x110))) + x113, x114 := bits.add_u64(x45, x109, u64(0x0)) + x115, _ := bits.add_u64(x46, x111, u64(fiat.u1(x114))) + x117, x118 := bits.add_u64(x11, x5, u64(0x0)) + x119, _ := bits.add_u64(x12, x6, u64(fiat.u1(x118))) + x121, x122 := bits.add_u64(x15, x117, u64(0x0)) + x123, _ := bits.add_u64(x16, x119, u64(fiat.u1(x122))) + x125, x126 := bits.add_u64(x39, x121, u64(0x0)) + x127, _ := bits.add_u64(x40, x123, u64(fiat.u1(x126))) + x129, x130 := bits.add_u64(x47, x125, u64(0x0)) + x131, _ := bits.add_u64(x48, x127, u64(fiat.u1(x130))) + x133, x134 := bits.add_u64(x67, x129, u64(0x0)) + x135 := (u64(fiat.u1(x134)) + x131) + x136 := ((x133 >> 51) | ((x135 << 13) & 0xffffffffffffffff)) + x137 := (x133 & 0x7ffffffffffff) + x138, x139 := bits.add_u64(x136, x113, u64(0x0)) + x140 := (u64(fiat.u1(x139)) + x115) + x141 := ((x138 >> 51) | ((x140 << 13) & 0xffffffffffffffff)) + x142 := (x138 & 0x7ffffffffffff) + x143, x144 := bits.add_u64(x141, x97, u64(0x0)) + x145 := (u64(fiat.u1(x144)) + x99) + x146 := ((x143 >> 51) | ((x145 << 13) & 0xffffffffffffffff)) + x147 := (x143 & 0x7ffffffffffff) + x148, x149 := bits.add_u64(x146, x81, u64(0x0)) + x150 := (u64(fiat.u1(x149)) + x83) + x151 := ((x148 >> 51) | ((x150 << 13) & 0xffffffffffffffff)) + x152 := (x148 & 0x7ffffffffffff) + x153 := (x151 * 0x13) + x154 := (x68 + x153) + x155 := (x154 >> 51) + x156 := (x154 & 0x7ffffffffffff) + x157 := (x155 + x137) + x158 := fiat.u1((x157 >> 51)) + x159 := (x157 & 0x7ffffffffffff) + x160 := (u64(x158) + x142) + out1[0] = x156 + out1[1] = x159 + out1[2] = x160 + out1[3] = x147 + out1[4] = x152 +} + +fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x1 := (arg1[4] * 0x13) + x2 := (x1 * 0x2) + x3 := (arg1[4] * 0x2) + x4 := (arg1[3] * 0x13) + x5 := (x4 * 0x2) + x6 := (arg1[3] * 0x2) + x7 := (arg1[2] * 0x2) + x8 := (arg1[1] * 0x2) + x10, x9 := bits.mul_u64(arg1[4], x1) + x12, x11 := bits.mul_u64(arg1[3], x2) + x14, x13 := bits.mul_u64(arg1[3], x4) + x16, x15 := bits.mul_u64(arg1[2], x2) + x18, x17 := bits.mul_u64(arg1[2], x5) + x20, x19 := bits.mul_u64(arg1[2], arg1[2]) + x22, x21 := bits.mul_u64(arg1[1], x2) + x24, x23 := bits.mul_u64(arg1[1], x6) + x26, x25 := bits.mul_u64(arg1[1], x7) + x28, x27 := bits.mul_u64(arg1[1], arg1[1]) + x30, x29 := bits.mul_u64(arg1[0], x3) + x32, x31 := bits.mul_u64(arg1[0], x6) + x34, x33 := bits.mul_u64(arg1[0], x7) + x36, x35 := bits.mul_u64(arg1[0], x8) + x38, x37 := bits.mul_u64(arg1[0], arg1[0]) + x39, x40 := bits.add_u64(x21, x17, u64(0x0)) + x41, _ := bits.add_u64(x22, x18, u64(fiat.u1(x40))) + x43, x44 := bits.add_u64(x37, x39, u64(0x0)) + x45, _ := bits.add_u64(x38, x41, u64(fiat.u1(x44))) + x47 := ((x43 >> 51) | ((x45 << 13) & 0xffffffffffffffff)) + x48 := (x43 & 0x7ffffffffffff) + x49, x50 := bits.add_u64(x23, x19, u64(0x0)) + x51, _ := bits.add_u64(x24, x20, u64(fiat.u1(x50))) + x53, x54 := bits.add_u64(x29, x49, u64(0x0)) + x55, _ := bits.add_u64(x30, x51, u64(fiat.u1(x54))) + x57, x58 := bits.add_u64(x25, x9, u64(0x0)) + x59, _ := bits.add_u64(x26, x10, u64(fiat.u1(x58))) + x61, x62 := bits.add_u64(x31, x57, u64(0x0)) + x63, _ := bits.add_u64(x32, x59, u64(fiat.u1(x62))) + x65, x66 := bits.add_u64(x27, x11, u64(0x0)) + x67, _ := bits.add_u64(x28, x12, u64(fiat.u1(x66))) + x69, x70 := bits.add_u64(x33, x65, u64(0x0)) + x71, _ := bits.add_u64(x34, x67, u64(fiat.u1(x70))) + x73, 
x74 := bits.add_u64(x15, x13, u64(0x0)) + x75, _ := bits.add_u64(x16, x14, u64(fiat.u1(x74))) + x77, x78 := bits.add_u64(x35, x73, u64(0x0)) + x79, _ := bits.add_u64(x36, x75, u64(fiat.u1(x78))) + x81, x82 := bits.add_u64(x47, x77, u64(0x0)) + x83 := (u64(fiat.u1(x82)) + x79) + x84 := ((x81 >> 51) | ((x83 << 13) & 0xffffffffffffffff)) + x85 := (x81 & 0x7ffffffffffff) + x86, x87 := bits.add_u64(x84, x69, u64(0x0)) + x88 := (u64(fiat.u1(x87)) + x71) + x89 := ((x86 >> 51) | ((x88 << 13) & 0xffffffffffffffff)) + x90 := (x86 & 0x7ffffffffffff) + x91, x92 := bits.add_u64(x89, x61, u64(0x0)) + x93 := (u64(fiat.u1(x92)) + x63) + x94 := ((x91 >> 51) | ((x93 << 13) & 0xffffffffffffffff)) + x95 := (x91 & 0x7ffffffffffff) + x96, x97 := bits.add_u64(x94, x53, u64(0x0)) + x98 := (u64(fiat.u1(x97)) + x55) + x99 := ((x96 >> 51) | ((x98 << 13) & 0xffffffffffffffff)) + x100 := (x96 & 0x7ffffffffffff) + x101 := (x99 * 0x13) + x102 := (x48 + x101) + x103 := (x102 >> 51) + x104 := (x102 & 0x7ffffffffffff) + x105 := (x103 + x85) + x106 := fiat.u1((x105 >> 51)) + x107 := (x105 & 0x7ffffffffffff) + x108 := (u64(x106) + x90) + out1[0] = x104 + out1[1] = x107 + out1[2] = x108 + out1[3] = x95 + out1[4] = x100 +} + +fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x1 := arg1[0] + x2 := ((x1 >> 51) + arg1[1]) + x3 := ((x2 >> 51) + arg1[2]) + x4 := ((x3 >> 51) + arg1[3]) + x5 := ((x4 >> 51) + arg1[4]) + x6 := ((x1 & 0x7ffffffffffff) + ((x5 >> 51) * 0x13)) + x7 := (u64(fiat.u1((x6 >> 51))) + (x2 & 0x7ffffffffffff)) + x8 := (x6 & 0x7ffffffffffff) + x9 := (x7 & 0x7ffffffffffff) + x10 := (u64(fiat.u1((x7 >> 51))) + (x3 & 0x7ffffffffffff)) + x11 := (x4 & 0x7ffffffffffff) + x12 := (x5 & 0x7ffffffffffff) + out1[0] = x8 + out1[1] = x9 + out1[2] = x10 + out1[3] = x11 + out1[4] = x12 +} + +fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) { + x1 := (arg1[0] + arg2[0]) + x2 := (arg1[1] + arg2[1]) + x3 := (arg1[2] + arg2[2]) + x4 := (arg1[3] + arg2[3]) + x5 := (arg1[4] + arg2[4]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 +} + +fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) { + x1 := ((0xfffffffffffda + arg1[0]) - arg2[0]) + x2 := ((0xffffffffffffe + arg1[1]) - arg2[1]) + x3 := ((0xffffffffffffe + arg1[2]) - arg2[2]) + x4 := ((0xffffffffffffe + arg1[3]) - arg2[3]) + x5 := ((0xffffffffffffe + arg1[4]) - arg2[4]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 +} + +fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) { + x1 := (0xfffffffffffda - arg1[0]) + x2 := (0xffffffffffffe - arg1[1]) + x3 := (0xffffffffffffe - arg1[2]) + x4 := (0xffffffffffffe - arg1[3]) + x5 := (0xffffffffffffe - arg1[4]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 +} + +fe_cond_assign :: proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: int) { + x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0]) + x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1]) + x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2]) + x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3]) + x5 := fiat.cmovznz_u64(fiat.u1(arg2), out1[4], arg1[4]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 +} + +fe_to_bytes :: proc "contextless" (out1: ^[32]byte, arg1: ^Tight_Field_Element) { + x1, x2 := _subborrowx_u51(0x0, arg1[0], 0x7ffffffffffed) + x3, x4 := _subborrowx_u51(x2, arg1[1], 
0x7ffffffffffff) + x5, x6 := _subborrowx_u51(x4, arg1[2], 0x7ffffffffffff) + x7, x8 := _subborrowx_u51(x6, arg1[3], 0x7ffffffffffff) + x9, x10 := _subborrowx_u51(x8, arg1[4], 0x7ffffffffffff) + x11 := fiat.cmovznz_u64(x10, u64(0x0), 0xffffffffffffffff) + x12, x13 := _addcarryx_u51(0x0, x1, (x11 & 0x7ffffffffffed)) + x14, x15 := _addcarryx_u51(x13, x3, (x11 & 0x7ffffffffffff)) + x16, x17 := _addcarryx_u51(x15, x5, (x11 & 0x7ffffffffffff)) + x18, x19 := _addcarryx_u51(x17, x7, (x11 & 0x7ffffffffffff)) + x20, _ := _addcarryx_u51(x19, x9, (x11 & 0x7ffffffffffff)) + x22 := (x20 << 4) + x23 := (x18 * u64(0x2)) + x24 := (x16 << 6) + x25 := (x14 << 3) + x26 := (u8(x12) & 0xff) + x27 := (x12 >> 8) + x28 := (u8(x27) & 0xff) + x29 := (x27 >> 8) + x30 := (u8(x29) & 0xff) + x31 := (x29 >> 8) + x32 := (u8(x31) & 0xff) + x33 := (x31 >> 8) + x34 := (u8(x33) & 0xff) + x35 := (x33 >> 8) + x36 := (u8(x35) & 0xff) + x37 := u8((x35 >> 8)) + x38 := (x25 + u64(x37)) + x39 := (u8(x38) & 0xff) + x40 := (x38 >> 8) + x41 := (u8(x40) & 0xff) + x42 := (x40 >> 8) + x43 := (u8(x42) & 0xff) + x44 := (x42 >> 8) + x45 := (u8(x44) & 0xff) + x46 := (x44 >> 8) + x47 := (u8(x46) & 0xff) + x48 := (x46 >> 8) + x49 := (u8(x48) & 0xff) + x50 := u8((x48 >> 8)) + x51 := (x24 + u64(x50)) + x52 := (u8(x51) & 0xff) + x53 := (x51 >> 8) + x54 := (u8(x53) & 0xff) + x55 := (x53 >> 8) + x56 := (u8(x55) & 0xff) + x57 := (x55 >> 8) + x58 := (u8(x57) & 0xff) + x59 := (x57 >> 8) + x60 := (u8(x59) & 0xff) + x61 := (x59 >> 8) + x62 := (u8(x61) & 0xff) + x63 := (x61 >> 8) + x64 := (u8(x63) & 0xff) + x65 := fiat.u1((x63 >> 8)) + x66 := (x23 + u64(x65)) + x67 := (u8(x66) & 0xff) + x68 := (x66 >> 8) + x69 := (u8(x68) & 0xff) + x70 := (x68 >> 8) + x71 := (u8(x70) & 0xff) + x72 := (x70 >> 8) + x73 := (u8(x72) & 0xff) + x74 := (x72 >> 8) + x75 := (u8(x74) & 0xff) + x76 := (x74 >> 8) + x77 := (u8(x76) & 0xff) + x78 := u8((x76 >> 8)) + x79 := (x22 + u64(x78)) + x80 := (u8(x79) & 0xff) + x81 := (x79 >> 8) + x82 := (u8(x81) & 0xff) + x83 := (x81 >> 8) + x84 := (u8(x83) & 0xff) + x85 := (x83 >> 8) + x86 := (u8(x85) & 0xff) + x87 := (x85 >> 8) + x88 := (u8(x87) & 0xff) + x89 := (x87 >> 8) + x90 := (u8(x89) & 0xff) + x91 := u8((x89 >> 8)) + out1[0] = x26 + out1[1] = x28 + out1[2] = x30 + out1[3] = x32 + out1[4] = x34 + out1[5] = x36 + out1[6] = x39 + out1[7] = x41 + out1[8] = x43 + out1[9] = x45 + out1[10] = x47 + out1[11] = x49 + out1[12] = x52 + out1[13] = x54 + out1[14] = x56 + out1[15] = x58 + out1[16] = x60 + out1[17] = x62 + out1[18] = x64 + out1[19] = x67 + out1[20] = x69 + out1[21] = x71 + out1[22] = x73 + out1[23] = x75 + out1[24] = x77 + out1[25] = x80 + out1[26] = x82 + out1[27] = x84 + out1[28] = x86 + out1[29] = x88 + out1[30] = x90 + out1[31] = x91 +} + +_fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) { + x1 := (u64(arg1[31]) << 44) + x2 := (u64(arg1[30]) << 36) + x3 := (u64(arg1[29]) << 28) + x4 := (u64(arg1[28]) << 20) + x5 := (u64(arg1[27]) << 12) + x6 := (u64(arg1[26]) << 4) + x7 := (u64(arg1[25]) << 47) + x8 := (u64(arg1[24]) << 39) + x9 := (u64(arg1[23]) << 31) + x10 := (u64(arg1[22]) << 23) + x11 := (u64(arg1[21]) << 15) + x12 := (u64(arg1[20]) << 7) + x13 := (u64(arg1[19]) << 50) + x14 := (u64(arg1[18]) << 42) + x15 := (u64(arg1[17]) << 34) + x16 := (u64(arg1[16]) << 26) + x17 := (u64(arg1[15]) << 18) + x18 := (u64(arg1[14]) << 10) + x19 := (u64(arg1[13]) << 2) + x20 := (u64(arg1[12]) << 45) + x21 := (u64(arg1[11]) << 37) + x22 := (u64(arg1[10]) << 29) + x23 := (u64(arg1[9]) << 21) + x24 := (u64(arg1[8]) 
<< 13) + x25 := (u64(arg1[7]) << 5) + x26 := (u64(arg1[6]) << 48) + x27 := (u64(arg1[5]) << 40) + x28 := (u64(arg1[4]) << 32) + x29 := (u64(arg1[3]) << 24) + x30 := (u64(arg1[2]) << 16) + x31 := (u64(arg1[1]) << 8) + x32 := arg1[0] + x33 := (x31 + u64(x32)) + x34 := (x30 + x33) + x35 := (x29 + x34) + x36 := (x28 + x35) + x37 := (x27 + x36) + x38 := (x26 + x37) + x39 := (x38 & 0x7ffffffffffff) + x40 := u8((x38 >> 51)) + x41 := (x25 + u64(x40)) + x42 := (x24 + x41) + x43 := (x23 + x42) + x44 := (x22 + x43) + x45 := (x21 + x44) + x46 := (x20 + x45) + x47 := (x46 & 0x7ffffffffffff) + x48 := u8((x46 >> 51)) + x49 := (x19 + u64(x48)) + x50 := (x18 + x49) + x51 := (x17 + x50) + x52 := (x16 + x51) + x53 := (x15 + x52) + x54 := (x14 + x53) + x55 := (x13 + x54) + x56 := (x55 & 0x7ffffffffffff) + x57 := u8((x55 >> 51)) + x58 := (x12 + u64(x57)) + x59 := (x11 + x58) + x60 := (x10 + x59) + x61 := (x9 + x60) + x62 := (x8 + x61) + x63 := (x7 + x62) + x64 := (x63 & 0x7ffffffffffff) + x65 := u8((x63 >> 51)) + x66 := (x6 + u64(x65)) + x67 := (x5 + x66) + x68 := (x4 + x67) + x69 := (x3 + x68) + x70 := (x2 + x69) + x71 := (x1 + x70) + out1[0] = x39 + out1[1] = x47 + out1[2] = x56 + out1[3] = x64 + out1[4] = x71 +} + +fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) { + x1 := arg1[0] + x2 := arg1[1] + x3 := arg1[2] + x4 := arg1[3] + x5 := arg1[4] + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 +} + +fe_carry_scmul_121666 :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x2, x1 := bits.mul_u64(0x1db42, arg1[4]) + x4, x3 := bits.mul_u64(0x1db42, arg1[3]) + x6, x5 := bits.mul_u64(0x1db42, arg1[2]) + x8, x7 := bits.mul_u64(0x1db42, arg1[1]) + x10, x9 := bits.mul_u64(0x1db42, arg1[0]) + x11 := ((x9 >> 51) | ((x10 << 13) & 0xffffffffffffffff)) + x12 := (x9 & 0x7ffffffffffff) + x13, x14 := bits.add_u64(x11, x7, u64(0x0)) + x15 := (u64(fiat.u1(x14)) + x8) + x16 := ((x13 >> 51) | ((x15 << 13) & 0xffffffffffffffff)) + x17 := (x13 & 0x7ffffffffffff) + x18, x19 := bits.add_u64(x16, x5, u64(0x0)) + x20 := (u64(fiat.u1(x19)) + x6) + x21 := ((x18 >> 51) | ((x20 << 13) & 0xffffffffffffffff)) + x22 := (x18 & 0x7ffffffffffff) + x23, x24 := bits.add_u64(x21, x3, u64(0x0)) + x25 := (u64(fiat.u1(x24)) + x4) + x26 := ((x23 >> 51) | ((x25 << 13) & 0xffffffffffffffff)) + x27 := (x23 & 0x7ffffffffffff) + x28, x29 := bits.add_u64(x26, x1, u64(0x0)) + x30 := (u64(fiat.u1(x29)) + x2) + x31 := ((x28 >> 51) | ((x30 << 13) & 0xffffffffffffffff)) + x32 := (x28 & 0x7ffffffffffff) + x33 := (x31 * 0x13) + x34 := (x12 + x33) + x35 := fiat.u1((x34 >> 51)) + x36 := (x34 & 0x7ffffffffffff) + x37 := (u64(x35) + x17) + x38 := fiat.u1((x37 >> 51)) + x39 := (x37 & 0x7ffffffffffff) + x40 := (u64(x38) + x22) + out1[0] = x36 + out1[1] = x39 + out1[2] = x40 + out1[3] = x27 + out1[4] = x32 +} + +// The following routines were added by hand, and do not come from fiat-crypto. 
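+//
+// They are written to be branch-free on secret data; in particular,
+// fe_cond_swap uses the usual masked swap (mask = 0 - u64(bit),
+// t = (a ~ b) & mask, a ~= t, b ~= t), so whether the swap occurred
+// is not visible in the control flow.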
+ +fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) { + out1[0] = 0 + out1[1] = 0 + out1[2] = 0 + out1[3] = 0 + out1[4] = 0 +} + +fe_one :: proc "contextless" (out1: ^Tight_Field_Element) { + out1[0] = 1 + out1[1] = 0 + out1[2] = 0 + out1[3] = 0 + out1[4] = 0 +} + +fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) { + x1 := arg1[0] + x2 := arg1[1] + x3 := arg1[2] + x4 := arg1[3] + x5 := arg1[4] + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 +} + +fe_cond_swap :: proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) { + mask := -u64(arg1) + x := (out1[0] ~ out2[0]) & mask + x1, y1 := out1[0] ~ x, out2[0] ~ x + x = (out1[1] ~ out2[1]) & mask + x2, y2 := out1[1] ~ x, out2[1] ~ x + x = (out1[2] ~ out2[2]) & mask + x3, y3 := out1[2] ~ x, out2[2] ~ x + x = (out1[3] ~ out2[3]) & mask + x4, y4 := out1[3] ~ x, out2[3] ~ x + x = (out1[4] ~ out2[4]) & mask + x5, y5 := out1[4] ~ x, out2[4] ~ x + out1[0], out2[0] = x1, y1 + out1[1], out2[1] = x2, y2 + out1[2], out2[2] = x3, y3 + out1[3], out2[3] = x4, y4 + out1[4], out2[4] = x5, y5 +} diff --git a/core/crypto/_fiat/field_poly1305/field.odin b/core/crypto/_fiat/field_poly1305/field.odin new file mode 100644 index 000000000..bfb7cf1f9 --- /dev/null +++ b/core/crypto/_fiat/field_poly1305/field.odin @@ -0,0 +1,66 @@ +package field_poly1305 + +import "core:crypto/util" +import "core:mem" + +fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element { + return transmute(^Loose_Field_Element)(arg1) +} + +fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element { + return transmute(^Tight_Field_Element)(arg1) +} + +fe_from_bytes :: #force_inline proc (out1: ^Tight_Field_Element, arg1: []byte, arg2: byte, sanitize: bool = true) { + // fiat-crypto's deserialization routine effectively processes a + // single byte at a time, and wants 256-bits of input for a value + // that will be 128-bits or 129-bits. + // + // This is somewhat cumbersome to use, so at a minimum a wrapper + // makes implementing the actual MAC block processing considerably + // neater. + + assert(len(arg1) == 16) + + when ODIN_ARCH == "386" || ODIN_ARCH == "amd64" { + // While it may be unwise to do deserialization here on our + // own when fiat-crypto provides equivalent functionality, + // doing it this way provides a little under 3x performance + // improvement when optimization is enabled. + src_p := transmute(^[2]u64)(&arg1[0]) + lo := src_p[0] + hi := src_p[1] + + // This is inspired by poly1305-donna, though adjustments were + // made since a Tight_Field_Element's limbs are 44-bits, 43-bits, + // and 43-bits wide. + // + // Note: This could be transplated into fe_from_u64s, but that + // code is called once per MAC, and is non-criticial path. + hibit := u64(arg2) << 41 // arg2 << 128 + out1[0] = lo & 0xfffffffffff + out1[1] = ((lo >> 44) | (hi << 20)) & 0x7ffffffffff + out1[2] = ((hi >> 23) & 0x7ffffffffff) | hibit + } else { + tmp: [32]byte + copy_slice(tmp[0:16], arg1[:]) + tmp[16] = arg2 + + _fe_from_bytes(out1, &tmp) + if sanitize { + // This is used to deserialize `s` which is confidential. + mem.zero_explicit(&tmp, size_of(tmp)) + } + } +} + +fe_from_u64s :: proc "contextless" (out1: ^Tight_Field_Element, lo, hi: u64) { + tmp: [32]byte + util.PUT_U64_LE(tmp[0:8], lo) + util.PUT_U64_LE(tmp[8:16], hi) + + _fe_from_bytes(out1, &tmp) + + // This routine is only used to deserialize `r` which is confidential. 
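+	// Wipe the stack copy so `r` does not linger after deserialization.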
+ mem.zero_explicit(&tmp, size_of(tmp)) +} diff --git a/core/crypto/_fiat/field_poly1305/field4344.odin b/core/crypto/_fiat/field_poly1305/field4344.odin new file mode 100644 index 000000000..ba9bc2694 --- /dev/null +++ b/core/crypto/_fiat/field_poly1305/field4344.odin @@ -0,0 +1,356 @@ +// The BSD 1-Clause License (BSD-1-Clause) +// +// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file) +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design, +// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package field_poly1305 + +// This file provides arithmetic on the field Z/(2^130 - 5) using +// unsaturated 64-bit integer arithmetic. It is derived primarily +// from the machine generate Golang output from the fiat-crypto project. +// +// While the base implementation is provably correct, this implementation +// makes no such claims as the port and optimizations were done by hand. +// At some point, it may be worth adding support to fiat-crypto for +// generating Odin output. 
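+//
+// A field element is stored as three unsaturated limbs in radix 2^44/2^43/2^43:
+//
+//   x = x[0] + x[1]*2^44 + x[2]*2^87  (mod 2^130 - 5)
+//
+// As with the curve25519 field, "Loose" elements may hold the headroom
+// produced by fe_add/fe_sub/fe_opp and must be carried before
+// serialization or comparison.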
+ +import fiat "core:crypto/_fiat" +import "core:math/bits" + +Loose_Field_Element :: distinct [3]u64 +Tight_Field_Element :: distinct [3]u64 + +_addcarryx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) { + x1 := ((u64(arg1) + arg2) + arg3) + x2 := (x1 & 0xfffffffffff) + x3 := fiat.u1((x1 >> 44)) + out1 = x2 + out2 = x3 + return +} + +_subborrowx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) { + x1 := ((i64(arg2) - i64(arg1)) - i64(arg3)) + x2 := fiat.i1((x1 >> 44)) + x3 := (u64(x1) & 0xfffffffffff) + out1 = x3 + out2 = (0x0 - fiat.u1(x2)) + return +} + +_addcarryx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) { + x1 := ((u64(arg1) + arg2) + arg3) + x2 := (x1 & 0x7ffffffffff) + x3 := fiat.u1((x1 >> 43)) + out1 = x2 + out2 = x3 + return +} + +_subborrowx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) { + x1 := ((i64(arg2) - i64(arg1)) - i64(arg3)) + x2 := fiat.i1((x1 >> 43)) + x3 := (u64(x1) & 0x7ffffffffff) + out1 = x3 + out2 = (0x0 - fiat.u1(x2)) + return +} + +fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) { + x2, x1 := bits.mul_u64(arg1[2], (arg2[2] * 0x5)) + x4, x3 := bits.mul_u64(arg1[2], (arg2[1] * 0xa)) + x6, x5 := bits.mul_u64(arg1[1], (arg2[2] * 0xa)) + x8, x7 := bits.mul_u64(arg1[2], arg2[0]) + x10, x9 := bits.mul_u64(arg1[1], (arg2[1] * 0x2)) + x12, x11 := bits.mul_u64(arg1[1], arg2[0]) + x14, x13 := bits.mul_u64(arg1[0], arg2[2]) + x16, x15 := bits.mul_u64(arg1[0], arg2[1]) + x18, x17 := bits.mul_u64(arg1[0], arg2[0]) + x19, x20 := bits.add_u64(x5, x3, u64(0x0)) + x21, _ := bits.add_u64(x6, x4, u64(fiat.u1(x20))) + x23, x24 := bits.add_u64(x17, x19, u64(0x0)) + x25, _ := bits.add_u64(x18, x21, u64(fiat.u1(x24))) + x27 := ((x23 >> 44) | ((x25 << 20) & 0xffffffffffffffff)) + x28 := (x23 & 0xfffffffffff) + x29, x30 := bits.add_u64(x9, x7, u64(0x0)) + x31, _ := bits.add_u64(x10, x8, u64(fiat.u1(x30))) + x33, x34 := bits.add_u64(x13, x29, u64(0x0)) + x35, _ := bits.add_u64(x14, x31, u64(fiat.u1(x34))) + x37, x38 := bits.add_u64(x11, x1, u64(0x0)) + x39, _ := bits.add_u64(x12, x2, u64(fiat.u1(x38))) + x41, x42 := bits.add_u64(x15, x37, u64(0x0)) + x43, _ := bits.add_u64(x16, x39, u64(fiat.u1(x42))) + x45, x46 := bits.add_u64(x27, x41, u64(0x0)) + x47 := (u64(fiat.u1(x46)) + x43) + x48 := ((x45 >> 43) | ((x47 << 21) & 0xffffffffffffffff)) + x49 := (x45 & 0x7ffffffffff) + x50, x51 := bits.add_u64(x48, x33, u64(0x0)) + x52 := (u64(fiat.u1(x51)) + x35) + x53 := ((x50 >> 43) | ((x52 << 21) & 0xffffffffffffffff)) + x54 := (x50 & 0x7ffffffffff) + x55 := (x53 * 0x5) + x56 := (x28 + x55) + x57 := (x56 >> 44) + x58 := (x56 & 0xfffffffffff) + x59 := (x57 + x49) + x60 := fiat.u1((x59 >> 43)) + x61 := (x59 & 0x7ffffffffff) + x62 := (u64(x60) + x54) + out1[0] = x58 + out1[1] = x61 + out1[2] = x62 +} + +fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x1 := (arg1[2] * 0x5) + x2 := (x1 * 0x2) + x3 := (arg1[2] * 0x2) + x4 := (arg1[1] * 0x2) + x6, x5 := bits.mul_u64(arg1[2], x1) + x8, x7 := bits.mul_u64(arg1[1], (x2 * 0x2)) + x10, x9 := bits.mul_u64(arg1[1], (arg1[1] * 0x2)) + x12, x11 := bits.mul_u64(arg1[0], x3) + x14, x13 := bits.mul_u64(arg1[0], x4) + x16, x15 := bits.mul_u64(arg1[0], arg1[0]) + x17, x18 := bits.add_u64(x15, x7, u64(0x0)) + x19, _ := bits.add_u64(x16, x8, u64(fiat.u1(x18))) + x21 := 
((x17 >> 44) | ((x19 << 20) & 0xffffffffffffffff)) + x22 := (x17 & 0xfffffffffff) + x23, x24 := bits.add_u64(x11, x9, u64(0x0)) + x25, _ := bits.add_u64(x12, x10, u64(fiat.u1(x24))) + x27, x28 := bits.add_u64(x13, x5, u64(0x0)) + x29, _ := bits.add_u64(x14, x6, u64(fiat.u1(x28))) + x31, x32 := bits.add_u64(x21, x27, u64(0x0)) + x33 := (u64(fiat.u1(x32)) + x29) + x34 := ((x31 >> 43) | ((x33 << 21) & 0xffffffffffffffff)) + x35 := (x31 & 0x7ffffffffff) + x36, x37 := bits.add_u64(x34, x23, u64(0x0)) + x38 := (u64(fiat.u1(x37)) + x25) + x39 := ((x36 >> 43) | ((x38 << 21) & 0xffffffffffffffff)) + x40 := (x36 & 0x7ffffffffff) + x41 := (x39 * 0x5) + x42 := (x22 + x41) + x43 := (x42 >> 44) + x44 := (x42 & 0xfffffffffff) + x45 := (x43 + x35) + x46 := fiat.u1((x45 >> 43)) + x47 := (x45 & 0x7ffffffffff) + x48 := (u64(x46) + x40) + out1[0] = x44 + out1[1] = x47 + out1[2] = x48 +} + +fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x1 := arg1[0] + x2 := ((x1 >> 44) + arg1[1]) + x3 := ((x2 >> 43) + arg1[2]) + x4 := ((x1 & 0xfffffffffff) + ((x3 >> 43) * 0x5)) + x5 := (u64(fiat.u1((x4 >> 44))) + (x2 & 0x7ffffffffff)) + x6 := (x4 & 0xfffffffffff) + x7 := (x5 & 0x7ffffffffff) + x8 := (u64(fiat.u1((x5 >> 43))) + (x3 & 0x7ffffffffff)) + out1[0] = x6 + out1[1] = x7 + out1[2] = x8 +} + +fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) { + x1 := (arg1[0] + arg2[0]) + x2 := (arg1[1] + arg2[1]) + x3 := (arg1[2] + arg2[2]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 +} + +fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) { + x1 := ((0x1ffffffffff6 + arg1[0]) - arg2[0]) + x2 := ((0xffffffffffe + arg1[1]) - arg2[1]) + x3 := ((0xffffffffffe + arg1[2]) - arg2[2]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 +} + +fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) { + x1 := (0x1ffffffffff6 - arg1[0]) + x2 := (0xffffffffffe - arg1[1]) + x3 := (0xffffffffffe - arg1[2]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 +} + +fe_cond_assign :: proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: bool) { + x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0]) + x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1]) + x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 +} + +fe_to_bytes :: proc "contextless" (out1: ^[32]byte, arg1: ^Tight_Field_Element) { + x1, x2 := _subborrowx_u44(0x0, arg1[0], 0xffffffffffb) + x3, x4 := _subborrowx_u43(x2, arg1[1], 0x7ffffffffff) + x5, x6 := _subborrowx_u43(x4, arg1[2], 0x7ffffffffff) + x7 := fiat.cmovznz_u64(x6, u64(0x0), 0xffffffffffffffff) + x8, x9 := _addcarryx_u44(0x0, x1, (x7 & 0xffffffffffb)) + x10, x11 := _addcarryx_u43(x9, x3, (x7 & 0x7ffffffffff)) + x12, _ := _addcarryx_u43(x11, x5, (x7 & 0x7ffffffffff)) + x14 := (x12 << 7) + x15 := (x10 << 4) + x16 := (u8(x8) & 0xff) + x17 := (x8 >> 8) + x18 := (u8(x17) & 0xff) + x19 := (x17 >> 8) + x20 := (u8(x19) & 0xff) + x21 := (x19 >> 8) + x22 := (u8(x21) & 0xff) + x23 := (x21 >> 8) + x24 := (u8(x23) & 0xff) + x25 := u8((x23 >> 8)) + x26 := (x15 + u64(x25)) + x27 := (u8(x26) & 0xff) + x28 := (x26 >> 8) + x29 := (u8(x28) & 0xff) + x30 := (x28 >> 8) + x31 := (u8(x30) & 0xff) + x32 := (x30 >> 8) + x33 := (u8(x32) & 0xff) + x34 := (x32 >> 8) + x35 := (u8(x34) & 0xff) + x36 := u8((x34 >> 8)) + x37 := (x14 + u64(x36)) + x38 := (u8(x37) & 0xff) + x39 := (x37 >> 8) + x40 := (u8(x39) & 0xff) + x41 := (x39 >> 8) + x42 := (u8(x41) & 0xff) 
+ x43 := (x41 >> 8) + x44 := (u8(x43) & 0xff) + x45 := (x43 >> 8) + x46 := (u8(x45) & 0xff) + x47 := (x45 >> 8) + x48 := (u8(x47) & 0xff) + x49 := u8((x47 >> 8)) + out1[0] = x16 + out1[1] = x18 + out1[2] = x20 + out1[3] = x22 + out1[4] = x24 + out1[5] = x27 + out1[6] = x29 + out1[7] = x31 + out1[8] = x33 + out1[9] = x35 + out1[10] = x38 + out1[11] = x40 + out1[12] = x42 + out1[13] = x44 + out1[14] = x46 + out1[15] = x48 + out1[16] = x49 +} + +_fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) { + x1 := (u64(arg1[16]) << 41) + x2 := (u64(arg1[15]) << 33) + x3 := (u64(arg1[14]) << 25) + x4 := (u64(arg1[13]) << 17) + x5 := (u64(arg1[12]) << 9) + x6 := (u64(arg1[11]) * u64(0x2)) + x7 := (u64(arg1[10]) << 36) + x8 := (u64(arg1[9]) << 28) + x9 := (u64(arg1[8]) << 20) + x10 := (u64(arg1[7]) << 12) + x11 := (u64(arg1[6]) << 4) + x12 := (u64(arg1[5]) << 40) + x13 := (u64(arg1[4]) << 32) + x14 := (u64(arg1[3]) << 24) + x15 := (u64(arg1[2]) << 16) + x16 := (u64(arg1[1]) << 8) + x17 := arg1[0] + x18 := (x16 + u64(x17)) + x19 := (x15 + x18) + x20 := (x14 + x19) + x21 := (x13 + x20) + x22 := (x12 + x21) + x23 := (x22 & 0xfffffffffff) + x24 := u8((x22 >> 44)) + x25 := (x11 + u64(x24)) + x26 := (x10 + x25) + x27 := (x9 + x26) + x28 := (x8 + x27) + x29 := (x7 + x28) + x30 := (x29 & 0x7ffffffffff) + x31 := fiat.u1((x29 >> 43)) + x32 := (x6 + u64(x31)) + x33 := (x5 + x32) + x34 := (x4 + x33) + x35 := (x3 + x34) + x36 := (x2 + x35) + x37 := (x1 + x36) + out1[0] = x23 + out1[1] = x30 + out1[2] = x37 +} + +fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) { + x1 := arg1[0] + x2 := arg1[1] + x3 := arg1[2] + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 +} + +// The following routines were added by hand, and do not come from fiat-crypto. 
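+//
+// As in field_curve25519, they follow the fiat-crypto style and stay
+// branch-free on secret data (fe_cond_swap below uses the same masked
+// XOR-swap construction).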
+ +fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) { + out1[0] = 0 + out1[1] = 0 + out1[2] = 0 +} + +fe_set :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) { + x1 := arg1[0] + x2 := arg1[1] + x3 := arg1[2] + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 +} + +fe_cond_swap :: proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: bool) { + mask := -u64(arg1) + x := (out1[0] ~ out2[0]) & mask + x1, y1 := out1[0] ~ x, out2[0] ~ x + x = (out1[1] ~ out2[1]) & mask + x2, y2 := out1[1] ~ x, out2[1] ~ x + x = (out1[2] ~ out2[2]) & mask + x3, y3 := out1[2] ~ x, out2[2] ~ x + out1[0], out2[0] = x1, y1 + out1[1], out2[1] = x2, y2 + out1[2], out2[2] = x3, y3 +} diff --git a/core/crypto/chacha20/chacha20.odin b/core/crypto/chacha20/chacha20.odin new file mode 100644 index 000000000..f6f551692 --- /dev/null +++ b/core/crypto/chacha20/chacha20.odin @@ -0,0 +1,581 @@ +package chacha20 + +import "core:crypto/util" +import "core:math/bits" +import "core:mem" + +KEY_SIZE :: 32 +NONCE_SIZE :: 12 +XNONCE_SIZE :: 24 + +_MAX_CTR_IETF :: 0xffffffff + +_BLOCK_SIZE :: 64 +_STATE_SIZE_U32 :: 16 +_ROUNDS :: 20 + +_SIGMA_0 : u32 : 0x61707865 +_SIGMA_1 : u32 : 0x3320646e +_SIGMA_2 : u32 : 0x79622d32 +_SIGMA_3 : u32 : 0x6b206574 + +Context :: struct { + _s: [_STATE_SIZE_U32]u32, + + _buffer: [_BLOCK_SIZE]byte, + _off: int, + + _is_ietf_flavor: bool, + _is_initialized: bool, +} + +init :: proc (ctx: ^Context, key, nonce: []byte) { + if len(key) != KEY_SIZE { + panic("crypto/chacha20: invalid ChaCha20 key size") + } + if n_len := len(nonce); n_len != NONCE_SIZE && n_len != XNONCE_SIZE { + panic("crypto/chacha20: invalid (X)ChaCha20 nonce size") + } + + k, n := key, nonce + + // Derive the XChaCha20 subkey and sub-nonce via HChaCha20. + is_xchacha := len(nonce) == XNONCE_SIZE + if is_xchacha { + sub_key := ctx._buffer[:KEY_SIZE] + _hchacha20(sub_key, k, n) + k = sub_key + n = n[16:24] + } + + ctx._s[0] = _SIGMA_0 + ctx._s[1] = _SIGMA_1 + ctx._s[2] = _SIGMA_2 + ctx._s[3] = _SIGMA_3 + ctx._s[4] = util.U32_LE(k[0:4]) + ctx._s[5] = util.U32_LE(k[4:8]) + ctx._s[6] = util.U32_LE(k[8:12]) + ctx._s[7] = util.U32_LE(k[12:16]) + ctx._s[8] = util.U32_LE(k[16:20]) + ctx._s[9] = util.U32_LE(k[20:24]) + ctx._s[10] = util.U32_LE(k[24:28]) + ctx._s[11] = util.U32_LE(k[28:32]) + ctx._s[12] = 0 + if !is_xchacha { + ctx._s[13] = util.U32_LE(n[0:4]) + ctx._s[14] = util.U32_LE(n[4:8]) + ctx._s[15] = util.U32_LE(n[8:12]) + } else { + ctx._s[13] = 0 + ctx._s[14] = util.U32_LE(n[0:4]) + ctx._s[15] = util.U32_LE(n[4:8]) + + // The sub-key is stored in the keystream buffer. While + // this will be overwritten in most circumstances, explicitly + // clear it out early. + mem.zero_explicit(&ctx._buffer, KEY_SIZE) + } + + ctx._off = _BLOCK_SIZE + ctx._is_ietf_flavor = !is_xchacha + ctx._is_initialized = true +} + +seek :: proc (ctx: ^Context, block_nr: u64) { + assert(ctx._is_initialized) + + if ctx._is_ietf_flavor { + if block_nr > _MAX_CTR_IETF { + panic("crypto/chacha20: attempted to seek past maximum counter") + } + } else { + ctx._s[13] = u32(block_nr >> 32) + } + ctx._s[12] = u32(block_nr) + ctx._off = _BLOCK_SIZE +} + +xor_bytes :: proc (ctx: ^Context, dst, src: []byte) { + assert(ctx._is_initialized) + + // TODO: Enforcing that dst and src alias exactly or not at all + // is a good idea, though odd aliasing should be extremely uncommon. 
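+	//
+	// For now the assumption is that dst either does not overlap src at
+	// all, or overlaps it exactly (in-place operation); partially
+	// overlapping buffers will produce incorrect output.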
+ + src, dst := src, dst + if dst_len := len(dst); dst_len < len(src) { + src = src[:dst_len] + } + + for remaining := len(src); remaining > 0; { + // Process multiple blocks at once + if ctx._off == _BLOCK_SIZE { + if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 { + direct_bytes := nr_blocks * _BLOCK_SIZE + _do_blocks(ctx, dst, src, nr_blocks) + remaining -= direct_bytes + if remaining == 0 { + return + } + dst = dst[direct_bytes:] + src = src[direct_bytes:] + } + + // If there is a partial block, generate and buffer 1 block + // worth of keystream. + _do_blocks(ctx, ctx._buffer[:], nil, 1) + ctx._off = 0 + } + + // Process partial blocks from the buffered keystream. + to_xor := min(_BLOCK_SIZE - ctx._off, remaining) + buffered_keystream := ctx._buffer[ctx._off:] + for i := 0; i < to_xor; i = i + 1 { + dst[i] = buffered_keystream[i] ~ src[i] + } + ctx._off += to_xor + dst = dst[to_xor:] + src = src[to_xor:] + remaining -= to_xor + } +} + +keystream_bytes :: proc (ctx: ^Context, dst: []byte) { + assert(ctx._is_initialized) + + dst := dst + for remaining := len(dst); remaining > 0; { + // Process multiple blocks at once + if ctx._off == _BLOCK_SIZE { + if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 { + direct_bytes := nr_blocks * _BLOCK_SIZE + _do_blocks(ctx, dst, nil, nr_blocks) + remaining -= direct_bytes + if remaining == 0 { + return + } + dst = dst[direct_bytes:] + } + + // If there is a partial block, generate and buffer 1 block + // worth of keystream. + _do_blocks(ctx, ctx._buffer[:], nil, 1) + ctx._off = 0 + } + + // Process partial blocks from the buffered keystream. + to_copy := min(_BLOCK_SIZE - ctx._off, remaining) + buffered_keystream := ctx._buffer[ctx._off:] + copy(dst[:to_copy], buffered_keystream[:to_copy]) + ctx._off += to_copy + dst = dst[to_copy:] + remaining -= to_copy + } +} + +reset :: proc (ctx: ^Context) { + mem.zero_explicit(&ctx._s, size_of(ctx._s)) + mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer)) + + ctx._is_initialized = false +} + +_do_blocks :: proc (ctx: ^Context, dst, src: []byte, nr_blocks: int) { + // Enforce the maximum consumed keystream per nonce. + // + // While all modern "standard" definitions of ChaCha20 use + // the IETF 32-bit counter, for XChaCha20 most common + // implementations allow for a 64-bit counter. + // + // Honestly, the answer here is "use a MRAE primitive", but + // go with common practice in the case of XChaCha20. + if ctx._is_ietf_flavor { + if u64(ctx._s[12]) + u64(nr_blocks) > 0xffffffff { + panic("crypto/chacha20: maximum ChaCha20 keystream per nonce reached") + } + } else { + ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12]) + if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 { + panic("crypto/chacha20: maximum XChaCha20 keystream per nonce reached") + } + } + + dst, src := dst, src + x := &ctx._s + for n := 0; n < nr_blocks; n = n + 1 { + x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3 + x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + + for i := _ROUNDS; i > 0; i = i - 2 { + // Even when forcing inlining manually inlining all of + // these is decently faster. 
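+			//
+			// Each quarterround(a, b, c, d) below is:
+			//   a += b; d ~= a; d = ROTL32(d, 16)
+			//   c += d; b ~= c; b = ROTL32(b, 12)
+			//   a += b; d ~= a; d = ROTL32(d, 8)
+			//   c += d; b ~= c; b = ROTL32(b, 7)
+			// applied first to the columns and then to the diagonals
+			// of the 4x4 state.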
+ + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ~= x0 + x12 = util.ROTL32(x12, 16) + x8 += x12 + x4 ~= x8 + x4 = util.ROTL32(x4, 12) + x0 += x4 + x12 ~= x0 + x12 = util.ROTL32(x12, 8) + x8 += x12 + x4 ~= x8 + x4 = util.ROTL32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ~= x1 + x13 = util.ROTL32(x13, 16) + x9 += x13 + x5 ~= x9 + x5 = util.ROTL32(x5, 12) + x1 += x5 + x13 ~= x1 + x13 = util.ROTL32(x13, 8) + x9 += x13 + x5 ~= x9 + x5 = util.ROTL32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ~= x2 + x14 = util.ROTL32(x14, 16) + x10 += x14 + x6 ~= x10 + x6 = util.ROTL32(x6, 12) + x2 += x6 + x14 ~= x2 + x14 = util.ROTL32(x14, 8) + x10 += x14 + x6 ~= x10 + x6 = util.ROTL32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ~= x3 + x15 = util.ROTL32(x15, 16) + x11 += x15 + x7 ~= x11 + x7 = util.ROTL32(x7, 12) + x3 += x7 + x15 ~= x3 + x15 = util.ROTL32(x15, 8) + x11 += x15 + x7 ~= x11 + x7 = util.ROTL32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ~= x0 + x15 = util.ROTL32(x15, 16) + x10 += x15 + x5 ~= x10 + x5 = util.ROTL32(x5, 12) + x0 += x5 + x15 ~= x0 + x15 = util.ROTL32(x15, 8) + x10 += x15 + x5 ~= x10 + x5 = util.ROTL32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ~= x1 + x12 = util.ROTL32(x12, 16) + x11 += x12 + x6 ~= x11 + x6 = util.ROTL32(x6, 12) + x1 += x6 + x12 ~= x1 + x12 = util.ROTL32(x12, 8) + x11 += x12 + x6 ~= x11 + x6 = util.ROTL32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ~= x2 + x13 = util.ROTL32(x13, 16) + x8 += x13 + x7 ~= x8 + x7 = util.ROTL32(x7, 12) + x2 += x7 + x13 ~= x2 + x13 = util.ROTL32(x13, 8) + x8 += x13 + x7 ~= x8 + x7 = util.ROTL32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ~= x3 + x14 = util.ROTL32(x14, 16) + x9 += x14 + x4 ~= x9 + x4 = util.ROTL32(x4, 12) + x3 += x4 + x14 ~= x3 + x14 = util.ROTL32(x14, 8) + x9 += x14 + x4 ~= x9 + x4 = util.ROTL32(x4, 7) + } + + x0 += _SIGMA_0 + x1 += _SIGMA_1 + x2 += _SIGMA_2 + x3 += _SIGMA_3 + x4 += x[4] + x5 += x[5] + x6 += x[6] + x7 += x[7] + x8 += x[8] + x9 += x[9] + x10 += x[10] + x11 += x[11] + x12 += x[12] + x13 += x[13] + x14 += x[14] + x15 += x[15] + + // While the "correct" answer to getting more performance out of + // this is "use vector operations", support for that is currently + // a work in progress/to be designed. + // + // Until dedicated assembly can be written leverage the fact that + // the callers of this routine ensure that src/dst are valid. + + when ODIN_ARCH == "386" || ODIN_ARCH == "amd64" { + // util.PUT_U32_LE/util.U32_LE are not required on little-endian + // systems that also happen to not be strict about aligned + // memory access. 
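+			//
+			// The transmutes below reinterpret the byte slices as [16]u32
+			// blocks; this relies on x86 tolerating unaligned access and
+			// on the caller guaranteeing that at least _BLOCK_SIZE bytes
+			// remain in dst (and src, when present).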
+ + dst_p := transmute(^[16]u32)(&dst[0]) + if src != nil { + src_p := transmute(^[16]u32)(&src[0]) + dst_p[0] = src_p[0] ~ x0 + dst_p[1] = src_p[1] ~ x1 + dst_p[2] = src_p[2] ~ x2 + dst_p[3] = src_p[3] ~ x3 + dst_p[4] = src_p[4] ~ x4 + dst_p[5] = src_p[5] ~ x5 + dst_p[6] = src_p[6] ~ x6 + dst_p[7] = src_p[7] ~ x7 + dst_p[8] = src_p[8] ~ x8 + dst_p[9] = src_p[9] ~ x9 + dst_p[10] = src_p[10] ~ x10 + dst_p[11] = src_p[11] ~ x11 + dst_p[12] = src_p[12] ~ x12 + dst_p[13] = src_p[13] ~ x13 + dst_p[14] = src_p[14] ~ x14 + dst_p[15] = src_p[15] ~ x15 + src = src[_BLOCK_SIZE:] + } else { + dst_p[0] = x0 + dst_p[1] = x1 + dst_p[2] = x2 + dst_p[3] = x3 + dst_p[4] = x4 + dst_p[5] = x5 + dst_p[6] = x6 + dst_p[7] = x7 + dst_p[8] = x8 + dst_p[9] = x9 + dst_p[10] = x10 + dst_p[11] = x11 + dst_p[12] = x12 + dst_p[13] = x13 + dst_p[14] = x14 + dst_p[15] = x15 + } + dst = dst[_BLOCK_SIZE:] + } else { + #no_bounds_check { + if src != nil { + util.PUT_U32_LE(dst[0:4], util.U32_LE(src[0:4]) ~ x0) + util.PUT_U32_LE(dst[4:8], util.U32_LE(src[4:8]) ~ x1) + util.PUT_U32_LE(dst[8:12], util.U32_LE(src[8:12]) ~ x2) + util.PUT_U32_LE(dst[12:16], util.U32_LE(src[12:16]) ~ x3) + util.PUT_U32_LE(dst[16:20], util.U32_LE(src[16:20]) ~ x4) + util.PUT_U32_LE(dst[20:24], util.U32_LE(src[20:24]) ~ x5) + util.PUT_U32_LE(dst[24:28], util.U32_LE(src[24:28]) ~ x6) + util.PUT_U32_LE(dst[28:32], util.U32_LE(src[28:32]) ~ x7) + util.PUT_U32_LE(dst[32:36], util.U32_LE(src[32:36]) ~ x8) + util.PUT_U32_LE(dst[36:40], util.U32_LE(src[36:40]) ~ x9) + util.PUT_U32_LE(dst[40:44], util.U32_LE(src[40:44]) ~ x10) + util.PUT_U32_LE(dst[44:48], util.U32_LE(src[44:48]) ~ x11) + util.PUT_U32_LE(dst[48:52], util.U32_LE(src[48:52]) ~ x12) + util.PUT_U32_LE(dst[52:56], util.U32_LE(src[52:56]) ~ x13) + util.PUT_U32_LE(dst[56:60], util.U32_LE(src[56:60]) ~ x14) + util.PUT_U32_LE(dst[60:64], util.U32_LE(src[60:64]) ~ x15) + src = src[_BLOCK_SIZE:] + } else { + util.PUT_U32_LE(dst[0:4], x0) + util.PUT_U32_LE(dst[4:8], x1) + util.PUT_U32_LE(dst[8:12], x2) + util.PUT_U32_LE(dst[12:16], x3) + util.PUT_U32_LE(dst[16:20], x4) + util.PUT_U32_LE(dst[20:24], x5) + util.PUT_U32_LE(dst[24:28], x6) + util.PUT_U32_LE(dst[28:32], x7) + util.PUT_U32_LE(dst[32:36], x8) + util.PUT_U32_LE(dst[36:40], x9) + util.PUT_U32_LE(dst[40:44], x10) + util.PUT_U32_LE(dst[44:48], x11) + util.PUT_U32_LE(dst[48:52], x12) + util.PUT_U32_LE(dst[52:56], x13) + util.PUT_U32_LE(dst[56:60], x14) + util.PUT_U32_LE(dst[60:64], x15) + } + dst = dst[_BLOCK_SIZE:] + } + } + + // Increment the counter. Overflow checking is done upon + // entry into the routine, so a 64-bit increment safely + // covers both cases. 
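+		//
+		// For the IETF flavor, the entry check guarantees that the low
+		// 32 bits never wrap within this call, so the store to x[13]
+		// below simply writes back the unchanged nonce word.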
+ new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1 + x[12] = u32(new_ctr) + x[13] = u32(new_ctr >> 32) + } +} + +_hchacha20 :: proc (dst, key, nonce: []byte) { + x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3 + x4 := util.U32_LE(key[0:4]) + x5 := util.U32_LE(key[4:8]) + x6 := util.U32_LE(key[8:12]) + x7 := util.U32_LE(key[12:16]) + x8 := util.U32_LE(key[16:20]) + x9 := util.U32_LE(key[20:24]) + x10 := util.U32_LE(key[24:28]) + x11 := util.U32_LE(key[28:32]) + x12 := util.U32_LE(nonce[0:4]) + x13 := util.U32_LE(nonce[4:8]) + x14 := util.U32_LE(nonce[8:12]) + x15 := util.U32_LE(nonce[12:16]) + + for i := _ROUNDS; i > 0; i = i - 2 { + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ~= x0 + x12 = util.ROTL32(x12, 16) + x8 += x12 + x4 ~= x8 + x4 = util.ROTL32(x4, 12) + x0 += x4 + x12 ~= x0 + x12 = util.ROTL32(x12, 8) + x8 += x12 + x4 ~= x8 + x4 = util.ROTL32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ~= x1 + x13 = util.ROTL32(x13, 16) + x9 += x13 + x5 ~= x9 + x5 = util.ROTL32(x5, 12) + x1 += x5 + x13 ~= x1 + x13 = util.ROTL32(x13, 8) + x9 += x13 + x5 ~= x9 + x5 = util.ROTL32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ~= x2 + x14 = util.ROTL32(x14, 16) + x10 += x14 + x6 ~= x10 + x6 = util.ROTL32(x6, 12) + x2 += x6 + x14 ~= x2 + x14 = util.ROTL32(x14, 8) + x10 += x14 + x6 ~= x10 + x6 = util.ROTL32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ~= x3 + x15 = util.ROTL32(x15, 16) + x11 += x15 + x7 ~= x11 + x7 = util.ROTL32(x7, 12) + x3 += x7 + x15 ~= x3 + x15 = util.ROTL32(x15, 8) + x11 += x15 + x7 ~= x11 + x7 = util.ROTL32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ~= x0 + x15 = util.ROTL32(x15, 16) + x10 += x15 + x5 ~= x10 + x5 = util.ROTL32(x5, 12) + x0 += x5 + x15 ~= x0 + x15 = util.ROTL32(x15, 8) + x10 += x15 + x5 ~= x10 + x5 = util.ROTL32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ~= x1 + x12 = util.ROTL32(x12, 16) + x11 += x12 + x6 ~= x11 + x6 = util.ROTL32(x6, 12) + x1 += x6 + x12 ~= x1 + x12 = util.ROTL32(x12, 8) + x11 += x12 + x6 ~= x11 + x6 = util.ROTL32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ~= x2 + x13 = util.ROTL32(x13, 16) + x8 += x13 + x7 ~= x8 + x7 = util.ROTL32(x7, 12) + x2 += x7 + x13 ~= x2 + x13 = util.ROTL32(x13, 8) + x8 += x13 + x7 ~= x8 + x7 = util.ROTL32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ~= x3 + x14 = util.ROTL32(x14, 16) + x9 += x14 + x4 ~= x9 + x4 = util.ROTL32(x4, 12) + x3 += x4 + x14 ~= x3 + x14 = util.ROTL32(x14, 8) + x9 += x14 + x4 ~= x9 + x4 = util.ROTL32(x4, 7) + } + + util.PUT_U32_LE(dst[0:4], x0) + util.PUT_U32_LE(dst[4:8], x1) + util.PUT_U32_LE(dst[8:12], x2) + util.PUT_U32_LE(dst[12:16], x3) + util.PUT_U32_LE(dst[16:20], x12) + util.PUT_U32_LE(dst[20:24], x13) + util.PUT_U32_LE(dst[24:28], x14) + util.PUT_U32_LE(dst[28:32], x15) +} diff --git a/core/crypto/chacha20poly1305/chacha20poly1305.odin b/core/crypto/chacha20poly1305/chacha20poly1305.odin new file mode 100644 index 000000000..67d89df56 --- /dev/null +++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin @@ -0,0 +1,146 @@ +package chacha20poly1305 + +import "core:crypto" +import "core:crypto/chacha20" +import "core:crypto/poly1305" +import "core:crypto/util" +import "core:mem" + +KEY_SIZE :: chacha20.KEY_SIZE +NONCE_SIZE :: chacha20.NONCE_SIZE +TAG_SIZE :: poly1305.TAG_SIZE + +_P_MAX :: 64 * 0xffffffff // 64 * (2^32-1) + +_validate_common_slice_sizes :: proc (tag, key, nonce, aad, text: []byte) { + if len(tag) != TAG_SIZE { + 
panic("crypto/chacha20poly1305: invalid destination tag size") + } + if len(key) != KEY_SIZE { + panic("crypto/chacha20poly1305: invalid key size") + } + if len(nonce) != NONCE_SIZE { + panic("crypto/chacha20poly1305: invalid nonce size") + } + + #assert(size_of(int) == 8 || size_of(int) <= 4) + when size_of(int) == 8 { + // A_MAX = 2^64 - 1 due to the length field limit. + // P_MAX = 64 * (2^32 - 1) due to the IETF ChaCha20 counter limit. + // + // A_MAX is limited by size_of(int), so there is no need to + // enforce it. P_MAX only needs to be checked on 64-bit targets, + // for reasons that should be obvious. + if text_len := len(text); text_len > _P_MAX { + panic("crypto/chacha20poly1305: oversized src data") + } + } +} + +_PAD: [16]byte +_update_mac_pad16 :: #force_inline proc (ctx: ^poly1305.Context, x_len: int) { + if pad_len := 16 - (x_len & (16-1)); pad_len != 16 { + poly1305.update(ctx, _PAD[:pad_len]) + } +} + +encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) { + _validate_common_slice_sizes(tag, key, nonce, aad, plaintext) + if len(ciphertext) != len(plaintext) { + panic("crypto/chacha20poly1305: invalid destination ciphertext size") + } + + stream_ctx: chacha20.Context = --- + chacha20.init(&stream_ctx, key, nonce) + + // otk = poly1305_key_gen(key, nonce) + otk: [poly1305.KEY_SIZE]byte = --- + chacha20.keystream_bytes(&stream_ctx, otk[:]) + mac_ctx: poly1305.Context = --- + poly1305.init(&mac_ctx, otk[:]) + mem.zero_explicit(&otk, size_of(otk)) + + aad_len, ciphertext_len := len(aad), len(ciphertext) + + // There is nothing preventing aad and ciphertext from overlapping + // so auth the AAD before encrypting (slightly different from the + // RFC, since the RFC encrypts into a new buffer). + // + // mac_data = aad | pad16(aad) + poly1305.update(&mac_ctx, aad) + _update_mac_pad16(&mac_ctx, aad_len) + + // ciphertext = chacha20_encrypt(key, 1, nonce, plaintext) + chacha20.seek(&stream_ctx, 1) + chacha20.xor_bytes(&stream_ctx, ciphertext, plaintext) + chacha20.reset(&stream_ctx) // Don't need the stream context anymore. + + // mac_data |= ciphertext | pad16(ciphertext) + poly1305.update(&mac_ctx, ciphertext) + _update_mac_pad16(&mac_ctx, ciphertext_len) + + // mac_data |= num_to_8_le_bytes(aad.length) + // mac_data |= num_to_8_le_bytes(ciphertext.length) + l_buf := otk[0:16] // Reuse the scratch buffer. + util.PUT_U64_LE(l_buf[0:8], u64(aad_len)) + util.PUT_U64_LE(l_buf[8:16], u64(ciphertext_len)) + poly1305.update(&mac_ctx, l_buf) + + // tag = poly1305_mac(mac_data, otk) + poly1305.final(&mac_ctx, tag) // Implicitly sanitizes context. +} + +decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool { + _validate_common_slice_sizes(tag, key, nonce, aad, ciphertext) + if len(ciphertext) != len(plaintext) { + panic("crypto/chacha20poly1305: invalid destination plaintext size") + } + + // Note: Unlike encrypt, this can fail early, so use defer for + // sanitization rather than assuming control flow reaches certain + // points where needed. 
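+	//
+	// In particular, tag verification below can reject the input and
+	// return before any plaintext is produced, so the stream context and
+	// the one-time key are cleaned up via defer rather than at fixed
+	// points in the control flow.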
+
+ stream_ctx: chacha20.Context = ---
+ chacha20.init(&stream_ctx, key, nonce)
+
+ // otk = poly1305_key_gen(key, nonce)
+ otk: [poly1305.KEY_SIZE]byte = ---
+ chacha20.keystream_bytes(&stream_ctx, otk[:])
+ defer chacha20.reset(&stream_ctx)
+
+ mac_ctx: poly1305.Context = ---
+ poly1305.init(&mac_ctx, otk[:])
+ defer mem.zero_explicit(&otk, size_of(otk))
+
+ aad_len, ciphertext_len := len(aad), len(ciphertext)
+
+ // mac_data = aad | pad16(aad)
+ // mac_data |= ciphertext | pad16(ciphertext)
+ // mac_data |= num_to_8_le_bytes(aad.length)
+ // mac_data |= num_to_8_le_bytes(ciphertext.length)
+ poly1305.update(&mac_ctx, aad)
+ _update_mac_pad16(&mac_ctx, aad_len)
+ poly1305.update(&mac_ctx, ciphertext)
+ _update_mac_pad16(&mac_ctx, ciphertext_len)
+ l_buf := otk[0:16] // Reuse the scratch buffer.
+ util.PUT_U64_LE(l_buf[0:8], u64(aad_len))
+ util.PUT_U64_LE(l_buf[8:16], u64(ciphertext_len))
+ poly1305.update(&mac_ctx, l_buf)
+
+ // tag = poly1305_mac(mac_data, otk)
+ derived_tag := otk[0:poly1305.TAG_SIZE] // Reuse the scratch buffer again.
+ poly1305.final(&mac_ctx, derived_tag) // Implicitly sanitizes context.
+
+ // Validate the tag in constant time.
+ if crypto.compare_constant_time(tag, derived_tag) != 1 {
+ // Zero out the plaintext, as a defense in depth measure.
+ mem.zero_explicit(raw_data(plaintext), ciphertext_len)
+ return false
+ }
+
+ // plaintext = chacha20_decrypt(key, 1, nonce, ciphertext)
+ chacha20.seek(&stream_ctx, 1)
+ chacha20.xor_bytes(&stream_ctx, plaintext, ciphertext)
+
+ return true
+}
diff --git a/core/crypto/crypto.odin b/core/crypto/crypto.odin
new file mode 100644
index 000000000..35e88c5ed
--- /dev/null
+++ b/core/crypto/crypto.odin
@@ -0,0 +1,52 @@
+package crypto
+
+import "core:mem"
+
+// compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
+//
+// The execution time of this routine is constant regardless of the contents
+// of the slices being compared, as long as the length of the slices is equal.
+// If the length of the two slices is different, it will early-return 0.
+compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
+ // If the length of the slices is different, early return.
+ //
+ // This leaks the fact that the slices have a different length,
+ // but the routine is primarily intended for comparing things
+ // like MACs and password digests.
+ n := len(a)
+ if n != len(b) {
+ return 0
+ }
+
+ return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n)
+}
+
+// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by
+// a and b are equal, 0 otherwise.
+//
+// The execution time of this routine is constant regardless of the
+// contents of the memory being compared.
+compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> int {
+ x := mem.slice_ptr(a, n)
+ y := mem.slice_ptr(b, n)
+
+ v: byte
+ for i in 0..<n {
+ v |= x[i] ~ y[i]
+ }
+
+ // v == 0 iff the two buffers are equal; map that to 1, and any
+ // non-zero v to 0, without branching.
+ return int((u32(v)-1) >> 31)
+}
+
+// rand_bytes fills the dst buffer with cryptographic entropy taken from
+// the system entropy source. This routine will block if the system entropy
+// source is not ready yet. All system entropy source failures are treated
+// as catastrophic, resulting in a panic.
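+//
+// Example (sketch): fill an arbitrary caller-provided buffer, e.g.
+//
+//	seed: [32]byte = ---
+//	rand_bytes(seed[:])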
+rand_bytes :: proc (dst: []byte) { + // zero-fill the buffer first + mem.zero_explicit(raw_data(dst), len(dst)) + + _rand_bytes(dst) +} diff --git a/core/crypto/poly1305/poly1305.odin b/core/crypto/poly1305/poly1305.odin new file mode 100644 index 000000000..8986be879 --- /dev/null +++ b/core/crypto/poly1305/poly1305.odin @@ -0,0 +1,163 @@ +package poly1305 + +import "core:crypto" +import "core:crypto/util" +import field "core:crypto/_fiat/field_poly1305" +import "core:mem" + +KEY_SIZE :: 32 +TAG_SIZE :: 16 + +_BLOCK_SIZE :: 16 + +sum :: proc (dst, msg, key: []byte) { + ctx: Context = --- + + init(&ctx, key) + update(&ctx, msg) + final(&ctx, dst) +} + +verify :: proc (tag, msg, key: []byte) -> bool { + ctx: Context = --- + derived_tag: [16]byte = --- + + if len(tag) != TAG_SIZE { + panic("crypto/poly1305: invalid tag size") + } + + init(&ctx, key) + update(&ctx, msg) + final(&ctx, derived_tag[:]) + + return crypto.compare_constant_time(derived_tag[:], tag) == 1 +} + +Context :: struct { + _r: field.Tight_Field_Element, + _a: field.Tight_Field_Element, + _s: field.Tight_Field_Element, + + _buffer: [_BLOCK_SIZE]byte, + _leftover: int, + + _is_initialized: bool, +} + +init :: proc (ctx: ^Context, key: []byte) { + if len(key) != KEY_SIZE { + panic("crypto/poly1305: invalid key size") + } + + // r = le_bytes_to_num(key[0..15]) + // r = clamp(r) (r &= 0xffffffc0ffffffc0ffffffc0fffffff) + tmp_lo := util.U64_LE(key[0:8]) & 0x0ffffffc0fffffff + tmp_hi := util.U64_LE(key[8:16]) & 0xffffffc0ffffffc + field.fe_from_u64s(&ctx._r, tmp_lo, tmp_hi) + + // s = le_bytes_to_num(key[16..31]) + field.fe_from_bytes(&ctx._s, key[16:32], 0) + + // a = 0 + field.fe_zero(&ctx._a) + + // No leftover in buffer + ctx._leftover = 0 + + ctx._is_initialized = true +} + +update :: proc (ctx: ^Context, data: []byte) { + assert(ctx._is_initialized) + + msg := data + msg_len := len(data) + + // Handle leftover + if ctx._leftover > 0 { + want := min(_BLOCK_SIZE - ctx._leftover, msg_len) + copy_slice(ctx._buffer[ctx._leftover:], msg[:want]) + msg_len = msg_len - want + msg = msg[want:] + ctx._leftover = ctx._leftover + want + if ctx._leftover < _BLOCK_SIZE { + return + } + _blocks(ctx, ctx._buffer[:]) + ctx._leftover = 0 + } + + // Process full blocks + if msg_len >= _BLOCK_SIZE { + want := msg_len & (~int(_BLOCK_SIZE - 1)) + _blocks(ctx, msg[:want]) + msg = msg[want:] + msg_len = msg_len - want + } + + // Store leftover + if msg_len > 0 { + // TODO: While -donna does it this way, I'm fairly sure that + // `ctx._leftover == 0` is an invariant at this point. 
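+ // (If _leftover was non-zero on entry, the leftover branch above
+ // either returned early or flushed a full block and reset it to 0,
+ // and the full-block loop never modifies it, so indexing from
+ // _leftover is equivalent to indexing from 0 here.)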
+ copy(ctx._buffer[ctx._leftover:], msg)
+ ctx._leftover = ctx._leftover + msg_len
+ }
+}
+
+final :: proc (ctx: ^Context, dst: []byte) {
+ assert(ctx._is_initialized)
+
+ if len(dst) != TAG_SIZE {
+ panic("crypto/poly1305: invalid destination tag size")
+ }
+
+ // Process remaining block
+ if ctx._leftover > 0 {
+ ctx._buffer[ctx._leftover] = 1
+ for i := ctx._leftover + 1; i < _BLOCK_SIZE; i = i + 1 {
+ ctx._buffer[i] = 0
+ }
+ _blocks(ctx, ctx._buffer[:], true)
+ }
+
+ // a += s
+ field.fe_add(field.fe_relax_cast(&ctx._a), &ctx._a, &ctx._s) // _a unreduced
+ field.fe_carry(&ctx._a, field.fe_relax_cast(&ctx._a)) // _a reduced
+
+ // return num_to_16_le_bytes(a)
+ tmp: [32]byte = ---
+ field.fe_to_bytes(&tmp, &ctx._a)
+ copy_slice(dst, tmp[0:16])
+
+ reset(ctx)
+}
+
+reset :: proc (ctx: ^Context) {
+ mem.zero_explicit(&ctx._r, size_of(ctx._r))
+ mem.zero_explicit(&ctx._a, size_of(ctx._a))
+ mem.zero_explicit(&ctx._s, size_of(ctx._s))
+ mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
+
+ ctx._is_initialized = false
+}
+
+_blocks :: proc (ctx: ^Context, msg: []byte, final := false) {
+ n: field.Tight_Field_Element = ---
+ final_byte := byte(!final)
+
+ data := msg
+ data_len := len(data)
+ for data_len >= _BLOCK_SIZE {
+ // n = le_bytes_to_num(msg[((i-1)*16)..(i*16)] | [0x01])
+ field.fe_from_bytes(&n, data[:_BLOCK_SIZE], final_byte, false)
+
+ // a += n
+ field.fe_add(field.fe_relax_cast(&ctx._a), &ctx._a, &n) // _a unreduced
+
+ // a = (r * a) % p
+ field.fe_carry_mul(&ctx._a, field.fe_relax_cast(&ctx._a), field.fe_relax_cast(&ctx._r)) // _a reduced
+
+ data = data[_BLOCK_SIZE:]
+ data_len = data_len - _BLOCK_SIZE
+ }
+}
diff --git a/core/crypto/rand_generic.odin b/core/crypto/rand_generic.odin
new file mode 100644
index 000000000..98890b5b1
--- /dev/null
+++ b/core/crypto/rand_generic.odin
@@ -0,0 +1,7 @@
+package crypto
+
+when ODIN_OS != "linux" {
+ _rand_bytes :: proc (dst: []byte) {
+ unimplemented("crypto: rand_bytes not supported on this OS")
+ }
+}
diff --git a/core/crypto/rand_linux.odin b/core/crypto/rand_linux.odin
new file mode 100644
index 000000000..4d1183757
--- /dev/null
+++ b/core/crypto/rand_linux.odin
@@ -0,0 +1,37 @@
+package crypto
+
+import "core:fmt"
+import "core:os"
+import "core:sys/unix"
+
+_MAX_PER_CALL_BYTES :: 33554431 // 2^25 - 1
+
+_rand_bytes :: proc (dst: []byte) {
+ dst := dst
+ l := len(dst)
+
+ for l > 0 {
+ to_read := min(l, _MAX_PER_CALL_BYTES)
+ ret := unix.sys_getrandom(raw_data(dst), to_read, 0)
+ if ret < 0 {
+ switch os.Errno(-ret) {
+ case os.EINTR:
+ // Call interrupted by a signal handler, just retry the
+ // request.
+ continue
+ case os.ENOSYS:
+ // The kernel is apparently prehistoric (< 3.17 circa 2014)
+ // and does not support getrandom.
+ panic("crypto: getrandom not available in kernel")
+ case:
+ // All other failures are things that should NEVER happen
+ // unless the kernel interface changes (i.e. the Linux
+ // developers break userland).
+ panic(fmt.tprintf("crypto: getrandom failed: %d", ret)) + } + } + + l -= ret + dst = dst[ret:] + } +} diff --git a/core/crypto/x25519/x25519.odin b/core/crypto/x25519/x25519.odin new file mode 100644 index 000000000..dfc8daa47 --- /dev/null +++ b/core/crypto/x25519/x25519.odin @@ -0,0 +1,126 @@ +package x25519 + +import field "core:crypto/_fiat/field_curve25519" +import "core:mem" + +SCALAR_SIZE :: 32 +POINT_SIZE :: 32 + +_BASE_POINT: [32]byte = {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} + +_scalar_bit :: #force_inline proc "contextless" (s: ^[32]byte, i: int) -> u8 { + if i < 0 { + return 0 + } + return (s[i>>3] >> uint(i&7)) & 1 +} + +_scalarmult :: proc (out, scalar, point: ^[32]byte) { + // Montgomery pseduo-multiplication taken from Monocypher. + + // computes the scalar product + x1: field.Tight_Field_Element = --- + field.fe_from_bytes(&x1, point) + + // computes the actual scalar product (the result is in x2 and z2) + x2, x3, z2, z3: field.Tight_Field_Element = ---, ---, ---, --- + t0, t1: field.Loose_Field_Element = ---, --- + + // Montgomery ladder + // In projective coordinates, to avoid divisions: x = X / Z + // We don't care about the y coordinate, it's only 1 bit of information + field.fe_one(&x2) // "zero" point + field.fe_zero(&z2) + field.fe_set(&x3, &x1) // "one" point + field.fe_one(&z3) + + swap: int + for pos := 255-1; pos >= 0; pos = pos - 1 { + // constant time conditional swap before ladder step + b := int(_scalar_bit(scalar, pos)) + swap ~= b // xor trick avoids swapping at the end of the loop + field.fe_cond_swap(&x2, &x3, swap) + field.fe_cond_swap(&z2, &z3, swap) + swap = b // anticipates one last swap after the loop + + // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3) + // with differential addition + // + // Note: This deliberately omits reductions after add/sub operations + // if the result is only ever used as the input to a mul/square since + // the implementations of those can deal with non-reduced inputs. + // + // fe_tighten_cast is only used to store a fully reduced + // output in a Loose_Field_Element, or to provide such a + // Loose_Field_Element as a Tight_Field_Element argument. + field.fe_sub(&t0, &x3, &z3) + field.fe_sub(&t1, &x2, &z2) + field.fe_add(field.fe_relax_cast(&x2), &x2, &z2) // x2 - unreduced + field.fe_add(field.fe_relax_cast(&z2), &x3, &z3) // z2 - unreduced + field.fe_carry_mul(&z3, &t0, field.fe_relax_cast(&x2)) + field.fe_carry_mul(&z2, field.fe_relax_cast(&z2), &t1) // z2 - reduced + field.fe_carry_square(field.fe_tighten_cast(&t0), &t1) // t0 - reduced + field.fe_carry_square(field.fe_tighten_cast(&t1), field.fe_relax_cast(&x2)) // t1 - reduced + field.fe_add(field.fe_relax_cast(&x3), &z3, &z2) // x3 - unreduced + field.fe_sub(field.fe_relax_cast(&z2), &z3, &z2) // z2 - unreduced + field.fe_carry_mul(&x2, &t1, &t0) // x2 - reduced + field.fe_sub(&t1, field.fe_tighten_cast(&t1), field.fe_tighten_cast(&t0)) // safe - t1/t0 is reduced + field.fe_carry_square(&z2, field.fe_relax_cast(&z2)) // z2 - reduced + field.fe_carry_scmul_121666(&z3, &t1) + field.fe_carry_square(&x3, field.fe_relax_cast(&x3)) // x3 - reduced + field.fe_add(&t0, field.fe_tighten_cast(&t0), &z3) // safe - t0 is reduced + field.fe_carry_mul(&z3, field.fe_relax_cast(&x1), field.fe_relax_cast(&z2)) + field.fe_carry_mul(&z2, &t1, &t0) + } + // last swap is necessary to compensate for the xor trick + // Note: after this swap, P3 == P2 + P1. 
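+ // After the two swaps below, (x2 : z2) holds the X coordinate of
+ // scalar * point in projective form; the inversion and multiply
+ // that follow recover the affine x = X / Z.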
+ field.fe_cond_swap(&x2, &x3, swap) + field.fe_cond_swap(&z2, &z3, swap) + + // normalises the coordinates: x == X / Z + field.fe_carry_inv(&z2, field.fe_relax_cast(&z2)) + field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2)) + field.fe_to_bytes(out, &x2) + + mem.zero_explicit(&x1, size_of(x1)) + mem.zero_explicit(&x2, size_of(x2)) + mem.zero_explicit(&x3, size_of(x3)) + mem.zero_explicit(&z2, size_of(z2)) + mem.zero_explicit(&z3, size_of(z3)) + mem.zero_explicit(&t0, size_of(t0)) + mem.zero_explicit(&t1, size_of(t1)) +} + +scalarmult :: proc (dst, scalar, point: []byte) { + if len(scalar) != SCALAR_SIZE { + panic("crypto/x25519: invalid scalar size") + } + if len(point) != POINT_SIZE { + panic("crypto/x25519: invalid point size") + } + if len(dst) != POINT_SIZE { + panic("crypto/x25519: invalid destination point size") + } + + // "clamp" the scalar + e: [32]byte = --- + copy_slice(e[:], scalar) + e[0] &= 248 + e[31] &= 127 + e[31] |= 64 + + p: [32]byte = --- + copy_slice(p[:], point) + + d: [32]byte = --- + _scalarmult(&d, &e, &p) + copy_slice(dst, d[:]) + + mem.zero_explicit(&e, size_of(e)) + mem.zero_explicit(&d, size_of(d)) +} + +scalarmult_basepoint :: proc (dst, scalar: []byte) { + // TODO/perf: Switch to using a precomputed table. + scalarmult(dst, scalar, _BASE_POINT[:]) +} diff --git a/core/mem/virtual/virtual_linux.odin b/core/mem/virtual/virtual_linux.odin index c4dd564ee..71a56e499 100644 --- a/core/mem/virtual/virtual_linux.odin +++ b/core/mem/virtual/virtual_linux.odin @@ -4,64 +4,56 @@ package mem_virtual import "core:c" import "core:intrinsics" +import "core:sys/unix" -when ODIN_ARCH == "amd64" { - SYS_mmap :: 9 - SYS_mprotect :: 10 - SYS_munmap :: 11 - SYS_madvise :: 28 - - PROT_NONE :: 0x0 - PROT_READ :: 0x1 - PROT_WRITE :: 0x2 - PROT_EXEC :: 0x4 - PROT_GROWSDOWN :: 0x01000000 - PROT_GROWSUP :: 0x02000000 +PROT_NONE :: 0x0 +PROT_READ :: 0x1 +PROT_WRITE :: 0x2 +PROT_EXEC :: 0x4 +PROT_GROWSDOWN :: 0x01000000 +PROT_GROWSUP :: 0x02000000 - MAP_FIXED :: 0x1 - MAP_PRIVATE :: 0x2 - MAP_SHARED :: 0x4 - MAP_ANONYMOUS :: 0x20 - - MADV_NORMAL :: 0 - MADV_RANDOM :: 1 - MADV_SEQUENTIAL :: 2 - MADV_WILLNEED :: 3 - MADV_DONTNEED :: 4 - MADV_FREE :: 8 - MADV_REMOVE :: 9 - MADV_DONTFORK :: 10 - MADV_DOFORK :: 11 - MADV_MERGEABLE :: 12 - MADV_UNMERGEABLE :: 13 - MADV_HUGEPAGE :: 14 - MADV_NOHUGEPAGE :: 15 - MADV_DONTDUMP :: 16 - MADV_DODUMP :: 17 - MADV_WIPEONFORK :: 18 - MADV_KEEPONFORK :: 19 - MADV_HWPOISON :: 100 -} else { - #panic("Unsupported architecture") -} +MAP_FIXED :: 0x1 +MAP_PRIVATE :: 0x2 +MAP_SHARED :: 0x4 +MAP_ANONYMOUS :: 0x20 + +MADV_NORMAL :: 0 +MADV_RANDOM :: 1 +MADV_SEQUENTIAL :: 2 +MADV_WILLNEED :: 3 +MADV_DONTNEED :: 4 +MADV_FREE :: 8 +MADV_REMOVE :: 9 +MADV_DONTFORK :: 10 +MADV_DOFORK :: 11 +MADV_MERGEABLE :: 12 +MADV_UNMERGEABLE :: 13 +MADV_HUGEPAGE :: 14 +MADV_NOHUGEPAGE :: 15 +MADV_DONTDUMP :: 16 +MADV_DODUMP :: 17 +MADV_WIPEONFORK :: 18 +MADV_KEEPONFORK :: 19 +MADV_HWPOISON :: 100 mmap :: proc "contextless" (addr: rawptr, length: uint, prot: c.int, flags: c.int, fd: c.int, offset: uintptr) -> rawptr { - res := intrinsics.syscall(SYS_mmap, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flags), uintptr(fd), offset) + res := intrinsics.syscall(unix.SYS_mmap, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flags), uintptr(fd), offset) return rawptr(res) } munmap :: proc "contextless" (addr: rawptr, length: uint) -> c.int { - res := intrinsics.syscall(SYS_munmap, uintptr(addr), uintptr(length)) + res := 
intrinsics.syscall(unix.SYS_munmap, uintptr(addr), uintptr(length)) return c.int(res) } mprotect :: proc "contextless" (addr: rawptr, length: uint, prot: c.int) -> c.int { - res := intrinsics.syscall(SYS_mprotect, uintptr(addr), uintptr(length), uint(prot)) + res := intrinsics.syscall(unix.SYS_mprotect, uintptr(addr), uintptr(length), uint(prot)) return c.int(res) } madvise :: proc "contextless" (addr: rawptr, length: uint, advice: c.int) -> c.int { - res := intrinsics.syscall(SYS_madvise, uintptr(addr), uintptr(length), uintptr(advice)) + res := intrinsics.syscall(unix.SYS_madvise, uintptr(addr), uintptr(length), uintptr(advice)) return c.int(res) } diff --git a/core/os/os_linux.odin b/core/os/os_linux.odin index bc4717b44..260a051ce 100644 --- a/core/os/os_linux.odin +++ b/core/os/os_linux.odin @@ -8,6 +8,7 @@ import "core:strings" import "core:c" import "core:strconv" import "core:intrinsics" +import "core:sys/unix" Handle :: distinct i32 File_Time :: distinct u64 @@ -265,8 +266,6 @@ X_OK :: 1 // Test for execute permission W_OK :: 2 // Test for write permission R_OK :: 4 // Test for read permission -SYS_GETTID :: 186 - foreign libc { @(link_name="__errno_location") __errno_location :: proc() -> ^int --- @@ -594,7 +593,7 @@ exit :: proc "contextless" (code: int) -> ! { } current_thread_id :: proc "contextless" () -> int { - return cast(int)intrinsics.syscall(SYS_GETTID) + return unix.sys_gettid() } dlopen :: proc(filename: string, flags: int) -> rawptr { diff --git a/core/sync/sync2/futex_linux.odin b/core/sync/sync2/futex_linux.odin index 1bd41c7cf..fca28cace 100644 --- a/core/sync/sync2/futex_linux.odin +++ b/core/sync/sync2/futex_linux.odin @@ -5,6 +5,7 @@ package sync2 import "core:c" import "core:time" import "core:intrinsics" +import "core:sys/unix" FUTEX_WAIT :: 0 FUTEX_WAKE :: 1 @@ -34,7 +35,7 @@ get_errno :: proc(r: int) -> int { } internal_futex :: proc(f: ^Futex, op: c.int, val: u32, timeout: rawptr) -> int { - code := int(intrinsics.syscall(202, uintptr(f), uintptr(op), uintptr(val), uintptr(timeout), 0, 0)) + code := int(intrinsics.syscall(unix.SYS_futex, uintptr(f), uintptr(op), uintptr(val), uintptr(timeout), 0, 0)) return get_errno(code) } diff --git a/core/sync/sync2/primitives_linux.odin b/core/sync/sync2/primitives_linux.odin index 4c81295bd..89ed97985 100644 --- a/core/sync/sync2/primitives_linux.odin +++ b/core/sync/sync2/primitives_linux.odin @@ -2,9 +2,8 @@ //+private package sync2 -import "core:intrinsics" +import "core:sys/unix" _current_thread_id :: proc "contextless" () -> int { - SYS_GETTID :: 186 - return int(intrinsics.syscall(SYS_GETTID)) + return unix.sys_gettid() } diff --git a/core/sync/sync_linux.odin b/core/sync/sync_linux.odin index fe856df94..340437c11 100644 --- a/core/sync/sync_linux.odin +++ b/core/sync/sync_linux.odin @@ -1,11 +1,9 @@ package sync import "core:sys/unix" -import "core:intrinsics" current_thread_id :: proc "contextless" () -> int { - SYS_GETTID :: 186 - return int(intrinsics.syscall(SYS_GETTID)) + return unix.sys_gettid() } diff --git a/core/sys/unix/syscalls_linux.odin b/core/sys/unix/syscalls_linux.odin new file mode 100644 index 000000000..659eedfbb --- /dev/null +++ b/core/sys/unix/syscalls_linux.odin @@ -0,0 +1,60 @@ +package unix + +import "core:intrinsics" + +// Linux has inconsistent system call numbering across architectures, +// for largely historical reasons. This attempts to provide a unified +// Odin-side interface for system calls that are required for the core +// library to work. 
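+//
+// Usage is either via the thin wrappers at the end of this file, or by
+// passing the SYS_* constants to intrinsics.syscall directly, e.g.
+// (sketch, with `buf` being some caller-provided []byte):
+//
+//	tid := unix.sys_gettid()
+//	n := unix.sys_getrandom(raw_data(buf), len(buf), 0)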
+
+// For authoritative system call numbers, the following files in the kernel
+// source can be used:
+//
+// amd64: arch/x86/entry/syscalls/syscall_64.tbl
+// arm64: include/uapi/asm-generic/unistd.h
+// 386: arch/x86/entry/syscalls/syscall_32.tbl
+// arm: arch/arm/tools/syscall.tbl
+
+when ODIN_ARCH == "amd64" {
+ SYS_mmap : uintptr : 9
+ SYS_mprotect : uintptr : 10
+ SYS_munmap : uintptr : 11
+ SYS_madvise : uintptr : 28
+ SYS_futex : uintptr : 202
+ SYS_gettid : uintptr : 186
+ SYS_getrandom : uintptr : 318
+} else when ODIN_ARCH == "arm64" {
+ SYS_mmap : uintptr : 222
+ SYS_mprotect : uintptr : 226
+ SYS_munmap : uintptr : 215
+ SYS_madvise : uintptr : 233
+ SYS_futex : uintptr : 98
+ SYS_gettid : uintptr : 178
+ SYS_getrandom : uintptr : 278
+} else when ODIN_ARCH == "386" {
+ SYS_mmap : uintptr : 192 // 90 is "sys_old_mmap", we want mmap2
+ SYS_mprotect : uintptr : 125
+ SYS_munmap : uintptr : 91
+ SYS_madvise : uintptr : 219
+ SYS_futex : uintptr : 240
+ SYS_gettid : uintptr : 224
+ SYS_getrandom : uintptr : 355
+} else when ODIN_ARCH == "arm" {
+ SYS_mmap : uintptr : 192 // 90 is "sys_old_mmap", we want mmap2
+ SYS_mprotect : uintptr : 125
+ SYS_munmap : uintptr : 91
+ SYS_madvise : uintptr : 220
+ SYS_futex : uintptr : 240
+ SYS_gettid : uintptr : 224
+ SYS_getrandom : uintptr : 384
+} else {
+ #panic("Unsupported architecture")
+}
+
+sys_gettid :: proc "contextless" () -> int {
+ return cast(int)intrinsics.syscall(SYS_gettid)
+}
+
+sys_getrandom :: proc "contextless" (buf: ^byte, buflen: int, flags: uint) -> int {
+ return cast(int)intrinsics.syscall(SYS_getrandom, buf, cast(uintptr)(buflen), cast(uintptr)(flags))
+}
diff --git a/src/main.cpp b/src/main.cpp
index 5371393d1..99a55b2b6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -545,8 +545,8 @@ void usage(String argv0) {
 print_usage_line(1, "version print version");
 print_usage_line(1, "report print information useful to reporting a bug");
 print_usage_line(0, "");
- print_usage_line(0, "For more information of flags, apply the flag to see what is possible");
- print_usage_line(1, "-help");
+ print_usage_line(0, "For further details on a command, use -help after the command name");
+ print_usage_line(1, "e.g. 
odin build -help"); } diff --git a/tests/core/crypto/test_core_crypto.odin b/tests/core/crypto/test_core_crypto.odin index df9920552..2ad00be66 100644 --- a/tests/core/crypto/test_core_crypto.odin +++ b/tests/core/crypto/test_core_crypto.odin @@ -115,6 +115,15 @@ main :: proc() { test_haval_224(&t) test_haval_256(&t) + // "modern" crypto tests + test_chacha20(&t) + test_poly1305(&t) + test_chacha20poly1305(&t) + test_x25519(&t) + test_rand_bytes(&t) + + bench_modern(&t) + fmt.printf("%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count) } diff --git a/tests/core/crypto/test_core_crypto_modern.odin b/tests/core/crypto/test_core_crypto_modern.odin new file mode 100644 index 000000000..71adad137 --- /dev/null +++ b/tests/core/crypto/test_core_crypto_modern.odin @@ -0,0 +1,535 @@ +package test_core_crypto + +import "core:testing" +import "core:fmt" +import "core:mem" +import "core:time" +import "core:crypto" + +import "core:crypto/chacha20" +import "core:crypto/chacha20poly1305" +import "core:crypto/poly1305" +import "core:crypto/x25519" + +_digit_value :: proc(r: rune) -> int { + ri := int(r) + v: int = 16 + switch r { + case '0'..='9': v = ri-'0' + case 'a'..='z': v = ri-'a'+10 + case 'A'..='Z': v = ri-'A'+10 + } + return v +} + +_decode_hex32 :: proc(s: string) -> [32]byte{ + b: [32]byte + for i := 0; i < len(s); i = i + 2 { + hi := _digit_value(rune(s[i])) + lo := _digit_value(rune(s[i+1])) + b[i/2] = byte(hi << 4 | lo) + } + return b +} + +_PLAINTEXT_SUNSCREEN_STR := "Ladies and Gentlemen of the class of '99: If I could offer you only one tip for the future, sunscreen would be it." + +@(test) +test_chacha20 :: proc(t: ^testing.T) { + log(t, "Testing (X)ChaCha20") + + // Test cases taken from RFC 8439, and draft-irtf-cfrg-xchacha-03 + plaintext := transmute([]byte)(_PLAINTEXT_SUNSCREEN_STR) + + key := [chacha20.KEY_SIZE]byte{ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + } + + nonce := [chacha20.NONCE_SIZE]byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4a, + 0x00, 0x00, 0x00, 0x00, + } + + ciphertext := [114]byte{ + 0x6e, 0x2e, 0x35, 0x9a, 0x25, 0x68, 0xf9, 0x80, + 0x41, 0xba, 0x07, 0x28, 0xdd, 0x0d, 0x69, 0x81, + 0xe9, 0x7e, 0x7a, 0xec, 0x1d, 0x43, 0x60, 0xc2, + 0x0a, 0x27, 0xaf, 0xcc, 0xfd, 0x9f, 0xae, 0x0b, + 0xf9, 0x1b, 0x65, 0xc5, 0x52, 0x47, 0x33, 0xab, + 0x8f, 0x59, 0x3d, 0xab, 0xcd, 0x62, 0xb3, 0x57, + 0x16, 0x39, 0xd6, 0x24, 0xe6, 0x51, 0x52, 0xab, + 0x8f, 0x53, 0x0c, 0x35, 0x9f, 0x08, 0x61, 0xd8, + 0x07, 0xca, 0x0d, 0xbf, 0x50, 0x0d, 0x6a, 0x61, + 0x56, 0xa3, 0x8e, 0x08, 0x8a, 0x22, 0xb6, 0x5e, + 0x52, 0xbc, 0x51, 0x4d, 0x16, 0xcc, 0xf8, 0x06, + 0x81, 0x8c, 0xe9, 0x1a, 0xb7, 0x79, 0x37, 0x36, + 0x5a, 0xf9, 0x0b, 0xbf, 0x74, 0xa3, 0x5b, 0xe6, + 0xb4, 0x0b, 0x8e, 0xed, 0xf2, 0x78, 0x5e, 0x42, + 0x87, 0x4d, + } + ciphertext_str := hex_string(ciphertext[:]) + + derived_ciphertext: [114]byte + ctx: chacha20.Context = --- + chacha20.init(&ctx, key[:], nonce[:]) + chacha20.seek(&ctx, 1) // The test vectors start the counter at 1. 
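+ // (seek positions the keystream at the given 64-byte block, so the
+ // first keystream block used below is block 1, as in RFC 8439.)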
+ chacha20.xor_bytes(&ctx, derived_ciphertext[:], plaintext[:]) + + derived_ciphertext_str := hex_string(derived_ciphertext[:]) + expect(t, derived_ciphertext_str == ciphertext_str, fmt.tprintf("Expected %s for xor_bytes(plaintext_str), but got %s instead", ciphertext_str, derived_ciphertext_str)) + + xkey := [chacha20.KEY_SIZE]byte{ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + } + + xnonce := [chacha20.XNONCE_SIZE]byte{ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + } + + xciphertext := [114]byte{ + 0xbd, 0x6d, 0x17, 0x9d, 0x3e, 0x83, 0xd4, 0x3b, + 0x95, 0x76, 0x57, 0x94, 0x93, 0xc0, 0xe9, 0x39, + 0x57, 0x2a, 0x17, 0x00, 0x25, 0x2b, 0xfa, 0xcc, + 0xbe, 0xd2, 0x90, 0x2c, 0x21, 0x39, 0x6c, 0xbb, + 0x73, 0x1c, 0x7f, 0x1b, 0x0b, 0x4a, 0xa6, 0x44, + 0x0b, 0xf3, 0xa8, 0x2f, 0x4e, 0xda, 0x7e, 0x39, + 0xae, 0x64, 0xc6, 0x70, 0x8c, 0x54, 0xc2, 0x16, + 0xcb, 0x96, 0xb7, 0x2e, 0x12, 0x13, 0xb4, 0x52, + 0x2f, 0x8c, 0x9b, 0xa4, 0x0d, 0xb5, 0xd9, 0x45, + 0xb1, 0x1b, 0x69, 0xb9, 0x82, 0xc1, 0xbb, 0x9e, + 0x3f, 0x3f, 0xac, 0x2b, 0xc3, 0x69, 0x48, 0x8f, + 0x76, 0xb2, 0x38, 0x35, 0x65, 0xd3, 0xff, 0xf9, + 0x21, 0xf9, 0x66, 0x4c, 0x97, 0x63, 0x7d, 0xa9, + 0x76, 0x88, 0x12, 0xf6, 0x15, 0xc6, 0x8b, 0x13, + 0xb5, 0x2e, + } + xciphertext_str := hex_string(xciphertext[:]) + + chacha20.init(&ctx, xkey[:], xnonce[:]) + chacha20.seek(&ctx, 1) + chacha20.xor_bytes(&ctx, derived_ciphertext[:], plaintext[:]) + + derived_ciphertext_str = hex_string(derived_ciphertext[:]) + expect(t, derived_ciphertext_str == xciphertext_str, fmt.tprintf("Expected %s for xor_bytes(plaintext_str), but got %s instead", xciphertext_str, derived_ciphertext_str)) +} + +@(test) +test_poly1305 :: proc(t: ^testing.T) { + log(t, "Testing poly1305") + + // Test cases taken from poly1305-donna. 
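+ // (The key/msg/tag triple below appears to be the classic NaCl
+ // crypto_onetimeauth test vector, reused by poly1305-donna.)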
+ key := [poly1305.KEY_SIZE]byte{ + 0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91, + 0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25, + 0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65, + 0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80, + } + + msg := [131]byte{ + 0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73, + 0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce, + 0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4, + 0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a, + 0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b, + 0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72, + 0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2, + 0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38, + 0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a, + 0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae, + 0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea, + 0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda, + 0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde, + 0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3, + 0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6, + 0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74, + 0xe3,0x55,0xa5, + } + + tag := [poly1305.TAG_SIZE]byte{ + 0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5, + 0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9, + } + tag_str := hex_string(tag[:]) + + // Verify - oneshot + compare + ok := poly1305.verify(tag[:], msg[:], key[:]) + expect(t, ok, "oneshot verify call failed") + + // Sum - oneshot + derived_tag: [poly1305.TAG_SIZE]byte + poly1305.sum(derived_tag[:], msg[:], key[:]) + derived_tag_str := hex_string(derived_tag[:]) + expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected %s for sum(msg, key), but got %s instead", tag_str, derived_tag_str)) + + // Incremental + mem.zero(&derived_tag, size_of(derived_tag)) + ctx: poly1305.Context = --- + poly1305.init(&ctx, key[:]) + read_lengths := [11]int{32, 64, 16, 8, 4, 2, 1, 1, 1, 1, 1} + off := 0 + for read_length in read_lengths { + to_read := msg[off:off+read_length] + poly1305.update(&ctx, to_read) + off = off + read_length + } + poly1305.final(&ctx, derived_tag[:]) + derived_tag_str = hex_string(derived_tag[:]) + expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected %s for init/update/final - incremental, but got %s instead", tag_str, derived_tag_str)) +} + +@(test) +test_chacha20poly1305 :: proc(t: ^testing.T) { + log(t, "Testing chacha20poly1205") + + plaintext := transmute([]byte)(_PLAINTEXT_SUNSCREEN_STR) + + aad := [12]byte{ + 0x50, 0x51, 0x52, 0x53, 0xc0, 0xc1, 0xc2, 0xc3, + 0xc4, 0xc5, 0xc6, 0xc7, + } + + key := [chacha20poly1305.KEY_SIZE]byte{ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + } + + nonce := [chacha20poly1305.NONCE_SIZE]byte{ + 0x07, 0x00, 0x00, 0x00, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + } + + ciphertext := [114]byte{ + 0xd3, 0x1a, 0x8d, 0x34, 0x64, 0x8e, 0x60, 0xdb, + 0x7b, 0x86, 0xaf, 0xbc, 0x53, 0xef, 0x7e, 0xc2, + 0xa4, 0xad, 0xed, 0x51, 0x29, 0x6e, 0x08, 0xfe, + 0xa9, 0xe2, 0xb5, 0xa7, 0x36, 0xee, 0x62, 0xd6, + 0x3d, 0xbe, 0xa4, 0x5e, 0x8c, 0xa9, 0x67, 0x12, + 0x82, 0xfa, 0xfb, 0x69, 0xda, 0x92, 0x72, 0x8b, + 0x1a, 0x71, 0xde, 0x0a, 0x9e, 0x06, 0x0b, 0x29, + 0x05, 0xd6, 0xa5, 0xb6, 0x7e, 0xcd, 0x3b, 0x36, + 0x92, 0xdd, 0xbd, 0x7f, 0x2d, 0x77, 0x8b, 0x8c, + 0x98, 0x03, 0xae, 0xe3, 0x28, 0x09, 0x1b, 0x58, + 0xfa, 0xb3, 0x24, 0xe4, 0xfa, 0xd6, 0x75, 0x94, + 0x55, 0x85, 0x80, 0x8b, 0x48, 0x31, 0xd7, 0xbc, + 0x3f, 0xf4, 0xde, 0xf0, 0x8e, 0x4b, 0x7a, 0x9d, + 0xe5, 0x76, 0xd2, 0x65, 0x86, 0xce, 0xc6, 0x4b, + 0x61, 0x16, + } + ciphertext_str := hex_string(ciphertext[:]) + + tag := 
[chacha20poly1305.TAG_SIZE]byte{ + 0x1a, 0xe1, 0x0b, 0x59, 0x4f, 0x09, 0xe2, 0x6a, + 0x7e, 0x90, 0x2e, 0xcb, 0xd0, 0x60, 0x06, 0x91, + } + tag_str := hex_string(tag[:]) + + derived_tag: [chacha20poly1305.TAG_SIZE]byte + derived_ciphertext: [114]byte + + chacha20poly1305.encrypt(derived_ciphertext[:], derived_tag[:], key[:], nonce[:], aad[:], plaintext) + + derived_ciphertext_str := hex_string(derived_ciphertext[:]) + expect(t, derived_ciphertext_str == ciphertext_str, fmt.tprintf("Expected ciphertext %s for encrypt(aad, plaintext), but got %s instead", ciphertext_str, derived_ciphertext_str)) + + derived_tag_str := hex_string(derived_tag[:]) + expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected tag %s for encrypt(aad, plaintext), but got %s instead", tag_str, derived_tag_str)) + + derived_plaintext: [114]byte + ok := chacha20poly1305.decrypt(derived_plaintext[:], tag[:], key[:], nonce[:], aad[:], ciphertext[:]) + derived_plaintext_str := string(derived_plaintext[:]) + expect(t, ok, "Expected true for decrypt(tag, aad, ciphertext)") + expect(t, derived_plaintext_str == _PLAINTEXT_SUNSCREEN_STR, fmt.tprintf("Expected plaintext %s for decrypt(tag, aad, ciphertext), but got %s instead", _PLAINTEXT_SUNSCREEN_STR, derived_plaintext_str)) + + derived_ciphertext[0] ~= 0xa5 + ok = chacha20poly1305.decrypt(derived_plaintext[:], tag[:], key[:], nonce[:], aad[:], derived_ciphertext[:]) + expect(t, !ok, "Expected false for decrypt(tag, aad, corrupted_ciphertext)") + + aad[0] ~= 0xa5 + ok = chacha20poly1305.decrypt(derived_plaintext[:], tag[:], key[:], nonce[:], aad[:], ciphertext[:]) + expect(t, !ok, "Expected false for decrypt(tag, corrupted_aad, ciphertext)") +} + +TestECDH :: struct { + scalar: string, + point: string, + product: string, +} + +@(test) +test_x25519 :: proc(t: ^testing.T) { + log(t, "Testing X25519") + + test_vectors := [?]TestECDH { + // Test vectors from RFC 7748 + TestECDH{ + "a546e36bf0527c9d3b16154b82465edd62144c0ac1fc5a18506a2244ba449ac4", + "e6db6867583030db3594c1a424b15f7c726624ec26b3353b10a903a6d0ab1c4c", + "c3da55379de9c6908e94ea4df28d084f32eccf03491c71f754b4075577a28552", + }, + TestECDH{ + "4b66e9d4d1b4673c5ad22691957d6af5c11b6421e0ea01d42ca4169e7918ba0d", + "e5210f12786811d3f4b7959d0538ae2c31dbe7106fc03c3efc4cd549c715a493", + "95cbde9476e8907d7aade45cb4b873f88b595a68799fa152e6f8f7647aac7957", + }, + } + for v, _ in test_vectors { + scalar := _decode_hex32(v.scalar) + point := _decode_hex32(v.point) + + derived_point: [x25519.POINT_SIZE]byte + x25519.scalarmult(derived_point[:], scalar[:], point[:]) + derived_point_str := hex_string(derived_point[:]) + + expect(t, derived_point_str == v.product, fmt.tprintf("Expected %s for %s * %s, but got %s instead", v.product, v.scalar, v.point, derived_point_str)) + + // Abuse the test vectors to sanity-check the scalar-basepoint multiply. + p1, p2: [x25519.POINT_SIZE]byte + x25519.scalarmult_basepoint(p1[:], scalar[:]) + x25519.scalarmult(p2[:], scalar[:], x25519._BASE_POINT[:]) + p1_str, p2_str := hex_string(p1[:]), hex_string(p2[:]) + expect(t, p1_str == p2_str, fmt.tprintf("Expected %s for %s * basepoint, but got %s instead", p2_str, v.scalar, p1_str)) + } + + // TODO/tests: Run the wycheproof test vectors, once I figure out + // how to work with JSON. 
+} + +@(test) +test_rand_bytes :: proc(t: ^testing.T) { + log(t, "Testing rand_bytes") + + if ODIN_OS != "linux" { + log(t, "rand_bytes not supported - skipping") + return + } + + allocator := context.allocator + + buf := make([]byte, 1 << 25, allocator) + defer delete(buf) + + // Testing a CSPRNG for correctness is incredibly involved and + // beyond the scope of an implementation that offloads + // responsibility for correctness to the OS. + // + // Just attempt to randomize a sufficiently large buffer, where + // sufficiently large is: + // * Larger than the maximum getentropy request size (256 bytes). + // * Larger than the maximum getrandom request size (2^25 - 1 bytes). + // + // While theoretically non-deterministic, if this fails, chances + // are the CSPRNG is busted. + seems_ok := false + for i := 0; i < 256; i = i + 1 { + mem.zero_explicit(raw_data(buf), len(buf)) + crypto.rand_bytes(buf) + + if buf[0] != 0 && buf[len(buf)-1] != 0 { + seems_ok = true + break + } + } + + expect(t, seems_ok, "Expected to randomize the head and tail of the buffer within a handful of attempts") +} + +@(test) +bench_modern :: proc(t: ^testing.T) { + fmt.println("Starting benchmarks:") + + bench_chacha20(t) + bench_poly1305(t) + bench_chacha20poly1305(t) + bench_x25519(t) +} + +_setup_sized_buf :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) { + assert(options != nil) + + options.input = make([]u8, options.bytes, allocator) + return nil if len(options.input) == options.bytes else .Allocation_Error +} + +_teardown_sized_buf :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) { + assert(options != nil) + + delete(options.input) + return nil +} + +_benchmark_chacha20 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) { + buf := options.input + key := [chacha20.KEY_SIZE]byte{ + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + } + nonce := [chacha20.NONCE_SIZE]byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + } + + ctx: chacha20.Context = --- + chacha20.init(&ctx, key[:], nonce[:]) + + for _ in 0..=options.rounds { + chacha20.xor_bytes(&ctx, buf, buf) + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + return nil +} + +_benchmark_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) { + buf := options.input + key := [poly1305.KEY_SIZE]byte{ + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + } + + tag: [poly1305.TAG_SIZE]byte = --- + for _ in 0..=options.rounds { + poly1305.sum(tag[:], buf, key[:]) + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + //options.hash = u128(h) + return nil +} + +_benchmark_chacha20poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) { + buf := options.input + key := [chacha20.KEY_SIZE]byte{ + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 
} + nonce := [chacha20.NONCE_SIZE]byte{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + } + + tag: [chacha20poly1305.TAG_SIZE]byte = --- + + for _ in 0..=options.rounds { + chacha20poly1305.encrypt(buf,tag[:], key[:], nonce[:], nil, buf) + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + return nil +} + +benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) { + fmt.printf("\t[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n", + name, + options.rounds, + options.processed, + time.duration_nanoseconds(options.duration), + options.rounds_per_second, + options.megabytes_per_second, + ) +} + +bench_chacha20 :: proc(t: ^testing.T) { + name := "ChaCha20 64 bytes" + options := &time.Benchmark_Options{ + rounds = 1_000, + bytes = 64, + setup = _setup_sized_buf, + bench = _benchmark_chacha20, + teardown = _teardown_sized_buf, + } + + err := time.benchmark(options, context.allocator) + expect(t, err == nil, name) + benchmark_print(name, options) + + name = "ChaCha20 1024 bytes" + options.bytes = 1024 + err = time.benchmark(options, context.allocator) + expect(t, err == nil, name) + benchmark_print(name, options) + + name = "ChaCha20 65536 bytes" + options.bytes = 65536 + err = time.benchmark(options, context.allocator) + expect(t, err == nil, name) + benchmark_print(name, options) +} + +bench_poly1305 :: proc(t: ^testing.T) { + name := "Poly1305 64 zero bytes" + options := &time.Benchmark_Options{ + rounds = 1_000, + bytes = 64, + setup = _setup_sized_buf, + bench = _benchmark_poly1305, + teardown = _teardown_sized_buf, + } + + err := time.benchmark(options, context.allocator) + expect(t, err == nil, name) + benchmark_print(name, options) + + name = "Poly1305 1024 zero bytes" + options.bytes = 1024 + err = time.benchmark(options, context.allocator) + expect(t, err == nil, name) + benchmark_print(name, options) +} + +bench_chacha20poly1305 :: proc(t: ^testing.T) { + name := "chacha20poly1305 64 bytes" + options := &time.Benchmark_Options{ + rounds = 1_000, + bytes = 64, + setup = _setup_sized_buf, + bench = _benchmark_chacha20poly1305, + teardown = _teardown_sized_buf, + } + + err := time.benchmark(options, context.allocator) + expect(t, err == nil, name) + benchmark_print(name, options) + + name = "chacha20poly1305 1024 bytes" + options.bytes = 1024 + err = time.benchmark(options, context.allocator) + expect(t, err == nil, name) + benchmark_print(name, options) + + name = "chacha20poly1305 65536 bytes" + options.bytes = 65536 + err = time.benchmark(options, context.allocator) + expect(t, err == nil, name) + benchmark_print(name, options) +} + +bench_x25519 :: proc(t: ^testing.T) { + point := _decode_hex32("deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef") + scalar := _decode_hex32("cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe") + out: [x25519.POINT_SIZE]byte = --- + + iters :: 10000 + start := time.now() + for i := 0; i < iters; i = i + 1 { + x25519.scalarmult(out[:], scalar[:], point[:]) + } + elapsed := time.since(start) + + log(t, fmt.tprintf("x25519.scalarmult: ~%f us/op", time.duration_microseconds(elapsed) / iters)) +}
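
As a usage sketch for the new AEAD API (illustrative only, not part of the
patch; the procedure name and messages are made up, and the all-zero nonce is
for demonstration only, since a real caller must never reuse a nonce with the
same key):

	package aead_example

	import "core:crypto"
	import "core:crypto/chacha20poly1305"

	example_seal_open :: proc () -> bool {
		key: [chacha20poly1305.KEY_SIZE]byte
		nonce: [chacha20poly1305.NONCE_SIZE]byte
		crypto.rand_bytes(key[:])

		aad_str, msg_str := "header", "hello world"
		aad := transmute([]byte)(aad_str)
		plaintext := transmute([]byte)(msg_str)

		ciphertext := make([]byte, len(plaintext))
		defer delete(ciphertext)
		tag: [chacha20poly1305.TAG_SIZE]byte

		// Seal: writes the ciphertext and the 16 byte tag.
		chacha20poly1305.encrypt(ciphertext, tag[:], key[:], nonce[:], aad, plaintext)

		// Open: returns false (and zeroes the destination) on a bad tag.
		recovered := make([]byte, len(ciphertext))
		defer delete(recovered)
		return chacha20poly1305.decrypt(recovered, tag[:], key[:], nonce[:], aad, ciphertext)
	}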