From 1823b0cead9530aea5f6694a95f28e1ddac5efac Mon Sep 17 00:00:00 2001
From: cybermancer <cybermancer@protonmail.com>
Date: Tue, 16 Nov 2021 15:15:21 +1100
Subject: [PATCH 01/10] Improve compiler help output with regard to command
 specific help.

---
 src/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/main.cpp b/src/main.cpp
index 5371393d1..7896756d4 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -545,8 +545,8 @@ void usage(String argv0) {
 	print_usage_line(1, "version   print version");
 	print_usage_line(1, "report    print information useful to reporting a bug");
 	print_usage_line(0, "");
-	print_usage_line(0, "For more information of flags, apply the flag to see what is possible");
-	print_usage_line(1, "-help");
+	print_usage_line(0, "For further details on a command, use -help or --help after the command name");
+	print_usage_line(1, "e.g. odin build -help");
 }
 
 

From d1e76ee4f299fa2a47306c3dc8a4929abfdd4886 Mon Sep 17 00:00:00 2001
From: Yawning Angel <yawning@schwanenlied.me>
Date: Sat, 6 Nov 2021 02:36:30 +0000
Subject: [PATCH 02/10] core/crypto: Add constant-time memory comparison
 routines

Using a constant-time comparison is required when comparing things like
MACs, password digests, etc. to avoid exposing sensitive data via
trivial timing attacks.

These routines could also live under core:mem, but they are somewhat
specialized, and are likely only useful for cryptographic applications.
---
 core/crypto/crypto.odin | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 core/crypto/crypto.odin

diff --git a/core/crypto/crypto.odin b/core/crypto/crypto.odin
new file mode 100644
index 000000000..ddcc5d367
--- /dev/null
+++ b/core/crypto/crypto.odin
@@ -0,0 +1,41 @@
+package crypto
+
+import "core:mem"
+
+// compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
+//
+// The execution time of this routine is constant regardless of the contents
+// of the slices being compared, as long as the length of the slices is equal.
+// If the length of the two slices is different, it will early-return 0.
+compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
+	// If the length of the slices is different, early return.
+	//
+	// This leaks the fact that the slices have a different length,
+	// but the routine is primarily intended for comparing things
+	// like MACs and password digests.
+	n := len(a)
+	if n != len(b) {
+		return 0
+	}
+
+	return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n) // lengths match: compare all n bytes
+}
+
+// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by
+// a and b are equal, 0 otherwise.
+//
+// The execution time of this routine is constant regardless of the
+// contents of the memory being compared.
+compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> int {
+	x := mem.slice_ptr(a, n)
+	y := mem.slice_ptr(b, n)
+
+	v: byte
+	for i in 0..<n {
+		v |= x[i] ~ y[i]
+	}
+
+	// After the loop, v == 0 iff a == b.  The subtraction will underflow
+	// iff v == 0, setting the sign-bit, which gets returned.
+	return int((u32(v)-1) >> 31)
+}

From 1a7a6a9116c7d9ed0e9ced208d0373ea62ad46c3 Mon Sep 17 00:00:00 2001
From: Yawning Angel <yawning@schwanenlied.me>
Date: Sat, 6 Nov 2021 04:21:24 +0000
Subject: [PATCH 03/10] core/crypto: Add x25519

This package implements the X25519 key agreement scheme as specified in
RFC 7748, using routines taken from fiat-crypto and Monocypher.
---
 core/crypto/_fiat/README.md                   |  35 +
 core/crypto/_fiat/fiat.odin                   |  24 +
 core/crypto/_fiat/field_curve25519/field.odin | 138 ++++
 .../_fiat/field_curve25519/field51.odin       | 616 ++++++++++++++++++
 core/crypto/x25519/x25519.odin                | 126 ++++
 tests/core/crypto/test_core_crypto.odin       |   5 +
 .../core/crypto/test_core_crypto_modern.odin  |  95 +++
 7 files changed, 1039 insertions(+)
 create mode 100644 core/crypto/_fiat/README.md
 create mode 100644 core/crypto/_fiat/fiat.odin
 create mode 100644 core/crypto/_fiat/field_curve25519/field.odin
 create mode 100644 core/crypto/_fiat/field_curve25519/field51.odin
 create mode 100644 core/crypto/x25519/x25519.odin
 create mode 100644 tests/core/crypto/test_core_crypto_modern.odin

diff --git a/core/crypto/_fiat/README.md b/core/crypto/_fiat/README.md
new file mode 100644
index 000000000..cd510d442
--- /dev/null
+++ b/core/crypto/_fiat/README.md
@@ -0,0 +1,35 @@
+# fiat
+
+This package contains low level arithmetic required to implement certain
+cryptographic primitives, ported from the [fiat-crypto project][1]
+along with some higher-level helpers.
+
+## Notes
+
+fiat-crypto gives the choice of 3 licenses for derived works.  The 1-Clause
+BSD license is chosen as it is compatible with Odin's existing licensing.
+
+The routines are intended to be timing-safe, as long as the underlying
+integer arithmetic is constant time.  This is true on most systems commonly
+used today, with the notable exception of WASM.
+
+While fiat-crypto provides output targeting both 32-bit and 64-bit
+architectures, only the 64-bit versions were used, as 32-bit architectures
+are becoming increasingly uncommon and irrelevant.
+
+With the current Odin syntax, the Go output is trivially ported in most
+cases and was used as the basis of the port.
+
+In the future, it would be better to auto-generate Odin either directly
+by adding an appropriate code-gen backend written in Coq, or perhaps by
+parsing the JSON output.
+
+As this is a port rather than autogenerated output, none of fiat-crypto's
+formal verification guarantees apply, unless it is possible to prove binary
+equivalence.
+
+For the most part, alterations to the base fiat-crypto generated code were
+kept to a minimum, to aid auditability.  This results in a somewhat
+idiosyncratic style, and in some cases minor performance penalties.
+
+[1]: https://github.com/mit-plv/fiat-crypto
diff --git a/core/crypto/_fiat/fiat.odin b/core/crypto/_fiat/fiat.odin
new file mode 100644
index 000000000..ae9727149
--- /dev/null
+++ b/core/crypto/_fiat/fiat.odin
@@ -0,0 +1,24 @@
+package fiat
+
+// This package provides various helpers and types common to all of the
+// fiat-crypto derived backends.
+
+// This code only works on a two's complement system.
+#assert((-1 & 3) == 3)
+
+u1 :: distinct u8
+i1 :: distinct i8
+
+cmovznz_u64 :: #force_inline proc "contextless" (arg1: u1, arg2, arg3: u64) -> (out1: u64) {
+	x1 := (u64(arg1) * 0xffffffffffffffff) // all-ones mask when arg1 == 1, zero when arg1 == 0 (u1 is a distinct u8, so 0/1 is not enforced)
+	x2 := ((x1 & arg3) | ((~x1) & arg2)) // branch-free select: arg3 where the mask is set, arg2 elsewhere
+	out1 = x2
+	return
+}
+
+cmovznz_u32 :: #force_inline proc "contextless" (arg1: u1, arg2, arg3: u32) -> (out1: u32) {
+	x1 := (u32(arg1) * 0xffffffff) // all-ones mask when arg1 == 1, zero when arg1 == 0 (u1 is a distinct u8, so 0/1 is not enforced)
+	x2 := ((x1 & arg3) | ((~x1) & arg2)) // branch-free select: arg3 where the mask is set, arg2 elsewhere
+	out1 = x2
+	return
+}
diff --git a/core/crypto/_fiat/field_curve25519/field.odin b/core/crypto/_fiat/field_curve25519/field.odin
new file mode 100644
index 000000000..faf8ae3f7
--- /dev/null
+++ b/core/crypto/_fiat/field_curve25519/field.odin
@@ -0,0 +1,138 @@
+package field_curve25519
+
+import "core:crypto"
+import "core:mem"
+
+fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element {
+	return transmute(^Loose_Field_Element)(arg1)
+}
+
+fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element {
+	return transmute(^Tight_Field_Element)(arg1)
+}
+
+fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
+	// Ignore the unused bit by copying the input and masking the bit off
+	// prior to deserialization.
+	tmp1: [32]byte = ---
+	copy_slice(tmp1[:], arg1[:])
+	tmp1[31] &= 127
+
+	_fe_from_bytes(out1, &tmp1)
+
+	mem.zero_explicit(&tmp1, size_of(tmp1))
+}
+
+fe_equal :: proc "contextless" (arg1, arg2: ^Tight_Field_Element) -> int {
+	tmp2: [32]byte = ---
+
+	fe_to_bytes(&tmp2, arg2)
+	ret := fe_equal_bytes(arg1, &tmp2)
+
+	mem.zero_explicit(&tmp2, size_of(tmp2))
+
+	return ret
+}
+
+fe_equal_bytes :: proc "contextless" (arg1: ^Tight_Field_Element, arg2: ^[32]byte) -> int {
+	tmp1: [32]byte = ---
+
+	fe_to_bytes(&tmp1, arg1)
+
+	ret := crypto.compare_constant_time(tmp1[:], arg2[:])
+
+	mem.zero_explicit(&tmp1, size_of(tmp1))
+
+	return ret
+}
+
+// fe_carry_pow2k sets out1 = arg1^(2^arg2), i.e. arg1 squared arg2 times.
+fe_carry_pow2k :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element, arg2: uint) {
+	// Special case: `arg1^(2^0) = arg1`, so only carry; this should never happen.
+	if arg2 == 0 {
+		fe_carry(out1, arg1)
+		return
+	}
+	fe_carry_square(out1, arg1) // first squaring reads the loose input
+	for _ in 1..<arg2 { // remaining arg2-1 squarings are done in place on out1
+		fe_carry_square(out1, fe_relax_cast(out1))
+	}
+}
+
+fe_carry_opp :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	fe_opp(fe_relax_cast(out1), arg1)
+	fe_carry(out1, fe_relax_cast(out1))
+}
+
+fe_carry_invsqrt :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) -> int {
+	// Inverse square root taken from Monocypher.
+
+	tmp1, tmp2, tmp3: Tight_Field_Element = ---, ---, ---
+
+	// t0 = x^((p-5)/8)
+	// Can be achieved with a simple double & add ladder,
+	// but it would be slower.
+	fe_carry_pow2k(&tmp1, arg1, 1)
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 2)
+	fe_carry_mul(&tmp2, arg1, fe_relax_cast(&tmp2))
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 1)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 5)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 10)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp3, fe_relax_cast(&tmp2), 20)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp3), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 10)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 50)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp3, fe_relax_cast(&tmp2), 100)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp3), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 50)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 2)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), arg1)
+
+	// quartic = x^((p-1)/4)
+	quartic := &tmp2
+	fe_carry_square(quartic, fe_relax_cast(&tmp1))
+	fe_carry_mul(quartic, fe_relax_cast(quartic), arg1)
+
+	// Serialize quartic once to save on repeated serialization/sanitization.
+	quartic_buf: [32]byte = ---
+	fe_to_bytes(&quartic_buf, quartic)
+	check := &tmp3
+
+	fe_one(check)
+	p1 := fe_equal_bytes(check, &quartic_buf)
+	fe_carry_opp(check, check)
+	m1 := fe_equal_bytes(check, &quartic_buf)
+	fe_carry_opp(check, &SQRT_M1)
+	ms := fe_equal_bytes(check, &quartic_buf)
+
+	// if quartic == -1 or sqrt(-1)
+	// then  isr = x^((p-1)/4) * sqrt(-1)
+	// else  isr = x^((p-1)/4)
+	fe_carry_mul(out1, fe_relax_cast(&tmp1), fe_relax_cast(&SQRT_M1))
+	fe_cond_assign(out1, &tmp1, (m1|ms) ~ 1)
+
+	mem.zero_explicit(&tmp1, size_of(tmp1))
+	mem.zero_explicit(&tmp2, size_of(tmp2))
+	mem.zero_explicit(&tmp3, size_of(tmp3))
+	mem.zero_explicit(&quartic_buf, size_of(quartic_buf))
+
+	return p1 | m1
+}
+
+fe_carry_inv :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	tmp1: Tight_Field_Element
+
+	fe_carry_square(&tmp1, arg1)
+	_ = fe_carry_invsqrt(&tmp1, fe_relax_cast(&tmp1))
+	fe_carry_square(&tmp1, fe_relax_cast(&tmp1))
+	fe_carry_mul(out1, fe_relax_cast(&tmp1), arg1)
+
+	mem.zero_explicit(&tmp1, size_of(tmp1))
+}
diff --git a/core/crypto/_fiat/field_curve25519/field51.odin b/core/crypto/_fiat/field_curve25519/field51.odin
new file mode 100644
index 000000000..e4ca98b57
--- /dev/null
+++ b/core/crypto/_fiat/field_curve25519/field51.odin
@@ -0,0 +1,616 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     1. Redistributions of source code must retain the above copyright
+//        notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package field_curve25519
+
+// The file provides arithmetic on the field Z/(2^255-19) using
+// unsaturated 64-bit integer arithmetic.  It is derived primarily
+// from the machine generated Golang output from the fiat-crypto project.
+//
+// While the base implementation is provably correct, this implementation
+// makes no such claims as the port and optimizations were done by hand.
+// At some point, it may be worth adding support to fiat-crypto for
+// generating Odin output.
+//
+// TODO:
+//  * When fiat-crypto supports it, using saturated 64-bit limbs
+//    instead of 51-bit limbs will be faster, though the gains are
+//    minimal unless adcx/adox/mulx are used.
+
+import fiat "core:crypto/_fiat"
+import "core:math/bits"
+
+Loose_Field_Element :: distinct [5]u64
+Tight_Field_Element :: distinct [5]u64
+
+SQRT_M1 := Tight_Field_Element{
+	1718705420411056,
+	234908883556509,
+	2233514472574048,
+	2117202627021982,
+	765476049583133,
+}
+
+_addcarryx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	x1 := ((u64(arg1) + arg2) + arg3)
+	x2 := (x1 & 0x7ffffffffffff)
+	x3 := fiat.u1((x1 >> 51))
+	out1 = x2
+	out2 = x3
+	return
+}
+
+_subborrowx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	x1 := ((i64(arg2) - i64(arg1)) - i64(arg3))
+	x2 := fiat.i1((x1 >> 51))
+	x3 := (u64(x1) & 0x7ffffffffffff)
+	out1 = x3
+	out2 = (0x0 - fiat.u1(x2))
+	return
+}
+
+fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
+	x2, x1 := bits.mul_u64(arg1[4], (arg2[4] * 0x13))
+	x4, x3 := bits.mul_u64(arg1[4], (arg2[3] * 0x13))
+	x6, x5 := bits.mul_u64(arg1[4], (arg2[2] * 0x13))
+	x8, x7 := bits.mul_u64(arg1[4], (arg2[1] * 0x13))
+	x10, x9 := bits.mul_u64(arg1[3], (arg2[4] * 0x13))
+	x12, x11 := bits.mul_u64(arg1[3], (arg2[3] * 0x13))
+	x14, x13 := bits.mul_u64(arg1[3], (arg2[2] * 0x13))
+	x16, x15 := bits.mul_u64(arg1[2], (arg2[4] * 0x13))
+	x18, x17 := bits.mul_u64(arg1[2], (arg2[3] * 0x13))
+	x20, x19 := bits.mul_u64(arg1[1], (arg2[4] * 0x13))
+	x22, x21 := bits.mul_u64(arg1[4], arg2[0])
+	x24, x23 := bits.mul_u64(arg1[3], arg2[1])
+	x26, x25 := bits.mul_u64(arg1[3], arg2[0])
+	x28, x27 := bits.mul_u64(arg1[2], arg2[2])
+	x30, x29 := bits.mul_u64(arg1[2], arg2[1])
+	x32, x31 := bits.mul_u64(arg1[2], arg2[0])
+	x34, x33 := bits.mul_u64(arg1[1], arg2[3])
+	x36, x35 := bits.mul_u64(arg1[1], arg2[2])
+	x38, x37 := bits.mul_u64(arg1[1], arg2[1])
+	x40, x39 := bits.mul_u64(arg1[1], arg2[0])
+	x42, x41 := bits.mul_u64(arg1[0], arg2[4])
+	x44, x43 := bits.mul_u64(arg1[0], arg2[3])
+	x46, x45 := bits.mul_u64(arg1[0], arg2[2])
+	x48, x47 := bits.mul_u64(arg1[0], arg2[1])
+	x50, x49 := bits.mul_u64(arg1[0], arg2[0])
+	x51, x52 := bits.add_u64(x13, x7, u64(0x0))
+	x53, _ := bits.add_u64(x14, x8, u64(fiat.u1(x52)))
+	x55, x56 := bits.add_u64(x17, x51, u64(0x0))
+	x57, _ := bits.add_u64(x18, x53, u64(fiat.u1(x56)))
+	x59, x60 := bits.add_u64(x19, x55, u64(0x0))
+	x61, _ := bits.add_u64(x20, x57, u64(fiat.u1(x60)))
+	x63, x64 := bits.add_u64(x49, x59, u64(0x0))
+	x65, _ := bits.add_u64(x50, x61, u64(fiat.u1(x64)))
+	x67 := ((x63 >> 51) | ((x65 << 13) & 0xffffffffffffffff))
+	x68 := (x63 & 0x7ffffffffffff)
+	x69, x70 := bits.add_u64(x23, x21, u64(0x0))
+	x71, _ := bits.add_u64(x24, x22, u64(fiat.u1(x70)))
+	x73, x74 := bits.add_u64(x27, x69, u64(0x0))
+	x75, _ := bits.add_u64(x28, x71, u64(fiat.u1(x74)))
+	x77, x78 := bits.add_u64(x33, x73, u64(0x0))
+	x79, _ := bits.add_u64(x34, x75, u64(fiat.u1(x78)))
+	x81, x82 := bits.add_u64(x41, x77, u64(0x0))
+	x83, _ := bits.add_u64(x42, x79, u64(fiat.u1(x82)))
+	x85, x86 := bits.add_u64(x25, x1, u64(0x0))
+	x87, _ := bits.add_u64(x26, x2, u64(fiat.u1(x86)))
+	x89, x90 := bits.add_u64(x29, x85, u64(0x0))
+	x91, _ := bits.add_u64(x30, x87, u64(fiat.u1(x90)))
+	x93, x94 := bits.add_u64(x35, x89, u64(0x0))
+	x95, _ := bits.add_u64(x36, x91, u64(fiat.u1(x94)))
+	x97, x98 := bits.add_u64(x43, x93, u64(0x0))
+	x99, _ := bits.add_u64(x44, x95, u64(fiat.u1(x98)))
+	x101, x102 := bits.add_u64(x9, x3, u64(0x0))
+	x103, _ := bits.add_u64(x10, x4, u64(fiat.u1(x102)))
+	x105, x106 := bits.add_u64(x31, x101, u64(0x0))
+	x107, _ := bits.add_u64(x32, x103, u64(fiat.u1(x106)))
+	x109, x110 := bits.add_u64(x37, x105, u64(0x0))
+	x111, _ := bits.add_u64(x38, x107, u64(fiat.u1(x110)))
+	x113, x114 := bits.add_u64(x45, x109, u64(0x0))
+	x115, _ := bits.add_u64(x46, x111, u64(fiat.u1(x114)))
+	x117, x118 := bits.add_u64(x11, x5, u64(0x0))
+	x119, _ := bits.add_u64(x12, x6, u64(fiat.u1(x118)))
+	x121, x122 := bits.add_u64(x15, x117, u64(0x0))
+	x123, _ := bits.add_u64(x16, x119, u64(fiat.u1(x122)))
+	x125, x126 := bits.add_u64(x39, x121, u64(0x0))
+	x127, _ := bits.add_u64(x40, x123, u64(fiat.u1(x126)))
+	x129, x130 := bits.add_u64(x47, x125, u64(0x0))
+	x131, _ := bits.add_u64(x48, x127, u64(fiat.u1(x130)))
+	x133, x134 := bits.add_u64(x67, x129, u64(0x0))
+	x135 := (u64(fiat.u1(x134)) + x131)
+	x136 := ((x133 >> 51) | ((x135 << 13) & 0xffffffffffffffff))
+	x137 := (x133 & 0x7ffffffffffff)
+	x138, x139 := bits.add_u64(x136, x113, u64(0x0))
+	x140 := (u64(fiat.u1(x139)) + x115)
+	x141 := ((x138 >> 51) | ((x140 << 13) & 0xffffffffffffffff))
+	x142 := (x138 & 0x7ffffffffffff)
+	x143, x144 := bits.add_u64(x141, x97, u64(0x0))
+	x145 := (u64(fiat.u1(x144)) + x99)
+	x146 := ((x143 >> 51) | ((x145 << 13) & 0xffffffffffffffff))
+	x147 := (x143 & 0x7ffffffffffff)
+	x148, x149 := bits.add_u64(x146, x81, u64(0x0))
+	x150 := (u64(fiat.u1(x149)) + x83)
+	x151 := ((x148 >> 51) | ((x150 << 13) & 0xffffffffffffffff))
+	x152 := (x148 & 0x7ffffffffffff)
+	x153 := (x151 * 0x13)
+	x154 := (x68 + x153)
+	x155 := (x154 >> 51)
+	x156 := (x154 & 0x7ffffffffffff)
+	x157 := (x155 + x137)
+	x158 := fiat.u1((x157 >> 51))
+	x159 := (x157 & 0x7ffffffffffff)
+	x160 := (u64(x158) + x142)
+	out1[0] = x156
+	out1[1] = x159
+	out1[2] = x160
+	out1[3] = x147
+	out1[4] = x152
+}
+
+fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	x1 := (arg1[4] * 0x13)
+	x2 := (x1 * 0x2)
+	x3 := (arg1[4] * 0x2)
+	x4 := (arg1[3] * 0x13)
+	x5 := (x4 * 0x2)
+	x6 := (arg1[3] * 0x2)
+	x7 := (arg1[2] * 0x2)
+	x8 := (arg1[1] * 0x2)
+	x10, x9 := bits.mul_u64(arg1[4], x1)
+	x12, x11 := bits.mul_u64(arg1[3], x2)
+	x14, x13 := bits.mul_u64(arg1[3], x4)
+	x16, x15 := bits.mul_u64(arg1[2], x2)
+	x18, x17 := bits.mul_u64(arg1[2], x5)
+	x20, x19 := bits.mul_u64(arg1[2], arg1[2])
+	x22, x21 := bits.mul_u64(arg1[1], x2)
+	x24, x23 := bits.mul_u64(arg1[1], x6)
+	x26, x25 := bits.mul_u64(arg1[1], x7)
+	x28, x27 := bits.mul_u64(arg1[1], arg1[1])
+	x30, x29 := bits.mul_u64(arg1[0], x3)
+	x32, x31 := bits.mul_u64(arg1[0], x6)
+	x34, x33 := bits.mul_u64(arg1[0], x7)
+	x36, x35 := bits.mul_u64(arg1[0], x8)
+	x38, x37 := bits.mul_u64(arg1[0], arg1[0])
+	x39, x40 := bits.add_u64(x21, x17, u64(0x0))
+	x41, _ := bits.add_u64(x22, x18, u64(fiat.u1(x40)))
+	x43, x44 := bits.add_u64(x37, x39, u64(0x0))
+	x45, _ := bits.add_u64(x38, x41, u64(fiat.u1(x44)))
+	x47 := ((x43 >> 51) | ((x45 << 13) & 0xffffffffffffffff))
+	x48 := (x43 & 0x7ffffffffffff)
+	x49, x50 := bits.add_u64(x23, x19, u64(0x0))
+	x51, _ := bits.add_u64(x24, x20, u64(fiat.u1(x50)))
+	x53, x54 := bits.add_u64(x29, x49, u64(0x0))
+	x55, _ := bits.add_u64(x30, x51, u64(fiat.u1(x54)))
+	x57, x58 := bits.add_u64(x25, x9, u64(0x0))
+	x59, _ := bits.add_u64(x26, x10, u64(fiat.u1(x58)))
+	x61, x62 := bits.add_u64(x31, x57, u64(0x0))
+	x63, _ := bits.add_u64(x32, x59, u64(fiat.u1(x62)))
+	x65, x66 := bits.add_u64(x27, x11, u64(0x0))
+	x67, _ := bits.add_u64(x28, x12, u64(fiat.u1(x66)))
+	x69, x70 := bits.add_u64(x33, x65, u64(0x0))
+	x71, _ := bits.add_u64(x34, x67, u64(fiat.u1(x70)))
+	x73, x74 := bits.add_u64(x15, x13, u64(0x0))
+	x75, _ := bits.add_u64(x16, x14, u64(fiat.u1(x74)))
+	x77, x78 := bits.add_u64(x35, x73, u64(0x0))
+	x79, _ := bits.add_u64(x36, x75, u64(fiat.u1(x78)))
+	x81, x82 := bits.add_u64(x47, x77, u64(0x0))
+	x83 := (u64(fiat.u1(x82)) + x79)
+	x84 := ((x81 >> 51) | ((x83 << 13) & 0xffffffffffffffff))
+	x85 := (x81 & 0x7ffffffffffff)
+	x86, x87 := bits.add_u64(x84, x69, u64(0x0))
+	x88 := (u64(fiat.u1(x87)) + x71)
+	x89 := ((x86 >> 51) | ((x88 << 13) & 0xffffffffffffffff))
+	x90 := (x86 & 0x7ffffffffffff)
+	x91, x92 := bits.add_u64(x89, x61, u64(0x0))
+	x93 := (u64(fiat.u1(x92)) + x63)
+	x94 := ((x91 >> 51) | ((x93 << 13) & 0xffffffffffffffff))
+	x95 := (x91 & 0x7ffffffffffff)
+	x96, x97 := bits.add_u64(x94, x53, u64(0x0))
+	x98 := (u64(fiat.u1(x97)) + x55)
+	x99 := ((x96 >> 51) | ((x98 << 13) & 0xffffffffffffffff))
+	x100 := (x96 & 0x7ffffffffffff)
+	x101 := (x99 * 0x13)
+	x102 := (x48 + x101)
+	x103 := (x102 >> 51)
+	x104 := (x102 & 0x7ffffffffffff)
+	x105 := (x103 + x85)
+	x106 := fiat.u1((x105 >> 51))
+	x107 := (x105 & 0x7ffffffffffff)
+	x108 := (u64(x106) + x90)
+	out1[0] = x104
+	out1[1] = x107
+	out1[2] = x108
+	out1[3] = x95
+	out1[4] = x100
+}
+
+fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	x1 := arg1[0]
+	x2 := ((x1 >> 51) + arg1[1])
+	x3 := ((x2 >> 51) + arg1[2])
+	x4 := ((x3 >> 51) + arg1[3])
+	x5 := ((x4 >> 51) + arg1[4])
+	x6 := ((x1 & 0x7ffffffffffff) + ((x5 >> 51) * 0x13))
+	x7 := (u64(fiat.u1((x6 >> 51))) + (x2 & 0x7ffffffffffff))
+	x8 := (x6 & 0x7ffffffffffff)
+	x9 := (x7 & 0x7ffffffffffff)
+	x10 := (u64(fiat.u1((x7 >> 51))) + (x3 & 0x7ffffffffffff))
+	x11 := (x4 & 0x7ffffffffffff)
+	x12 := (x5 & 0x7ffffffffffff)
+	out1[0] = x8
+	out1[1] = x9
+	out1[2] = x10
+	out1[3] = x11
+	out1[4] = x12
+}
+
+fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	x1 := (arg1[0] + arg2[0])
+	x2 := (arg1[1] + arg2[1])
+	x3 := (arg1[2] + arg2[2])
+	x4 := (arg1[3] + arg2[3])
+	x5 := (arg1[4] + arg2[4])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}
+
+fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	x1 := ((0xfffffffffffda + arg1[0]) - arg2[0])
+	x2 := ((0xffffffffffffe + arg1[1]) - arg2[1])
+	x3 := ((0xffffffffffffe + arg1[2]) - arg2[2])
+	x4 := ((0xffffffffffffe + arg1[3]) - arg2[3])
+	x5 := ((0xffffffffffffe + arg1[4]) - arg2[4])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}
+
+fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	x1 := (0xfffffffffffda - arg1[0])
+	x2 := (0xffffffffffffe - arg1[1])
+	x3 := (0xffffffffffffe - arg1[2])
+	x4 := (0xffffffffffffe - arg1[3])
+	x5 := (0xffffffffffffe - arg1[4])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}
+
+fe_cond_assign :: proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: int) {
+	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0]) // per limb: arg1's limb when arg2 == 1, out1's own limb when arg2 == 0
+	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
+	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
+	x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3])
+	x5 := fiat.cmovznz_u64(fiat.u1(arg2), out1[4], arg1[4])
+	out1[0] = x1 // all five limbs are selected before any is written back
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}
+
+fe_to_bytes :: proc "contextless" (out1: ^[32]byte, arg1: ^Tight_Field_Element) {
+	x1, x2 := _subborrowx_u51(0x0, arg1[0], 0x7ffffffffffed)
+	x3, x4 := _subborrowx_u51(x2, arg1[1], 0x7ffffffffffff)
+	x5, x6 := _subborrowx_u51(x4, arg1[2], 0x7ffffffffffff)
+	x7, x8 := _subborrowx_u51(x6, arg1[3], 0x7ffffffffffff)
+	x9, x10 := _subborrowx_u51(x8, arg1[4], 0x7ffffffffffff)
+	x11 := fiat.cmovznz_u64(x10, u64(0x0), 0xffffffffffffffff)
+	x12, x13 := _addcarryx_u51(0x0, x1, (x11 & 0x7ffffffffffed))
+	x14, x15 := _addcarryx_u51(x13, x3, (x11 & 0x7ffffffffffff))
+	x16, x17 := _addcarryx_u51(x15, x5, (x11 & 0x7ffffffffffff))
+	x18, x19 := _addcarryx_u51(x17, x7, (x11 & 0x7ffffffffffff))
+	x20, _ := _addcarryx_u51(x19, x9, (x11 & 0x7ffffffffffff))
+	x22 := (x20 << 4)
+	x23 := (x18 * u64(0x2))
+	x24 := (x16 << 6)
+	x25 := (x14 << 3)
+	x26 := (u8(x12) & 0xff)
+	x27 := (x12 >> 8)
+	x28 := (u8(x27) & 0xff)
+	x29 := (x27 >> 8)
+	x30 := (u8(x29) & 0xff)
+	x31 := (x29 >> 8)
+	x32 := (u8(x31) & 0xff)
+	x33 := (x31 >> 8)
+	x34 := (u8(x33) & 0xff)
+	x35 := (x33 >> 8)
+	x36 := (u8(x35) & 0xff)
+	x37 := u8((x35 >> 8))
+	x38 := (x25 + u64(x37))
+	x39 := (u8(x38) & 0xff)
+	x40 := (x38 >> 8)
+	x41 := (u8(x40) & 0xff)
+	x42 := (x40 >> 8)
+	x43 := (u8(x42) & 0xff)
+	x44 := (x42 >> 8)
+	x45 := (u8(x44) & 0xff)
+	x46 := (x44 >> 8)
+	x47 := (u8(x46) & 0xff)
+	x48 := (x46 >> 8)
+	x49 := (u8(x48) & 0xff)
+	x50 := u8((x48 >> 8))
+	x51 := (x24 + u64(x50))
+	x52 := (u8(x51) & 0xff)
+	x53 := (x51 >> 8)
+	x54 := (u8(x53) & 0xff)
+	x55 := (x53 >> 8)
+	x56 := (u8(x55) & 0xff)
+	x57 := (x55 >> 8)
+	x58 := (u8(x57) & 0xff)
+	x59 := (x57 >> 8)
+	x60 := (u8(x59) & 0xff)
+	x61 := (x59 >> 8)
+	x62 := (u8(x61) & 0xff)
+	x63 := (x61 >> 8)
+	x64 := (u8(x63) & 0xff)
+	x65 := fiat.u1((x63 >> 8))
+	x66 := (x23 + u64(x65))
+	x67 := (u8(x66) & 0xff)
+	x68 := (x66 >> 8)
+	x69 := (u8(x68) & 0xff)
+	x70 := (x68 >> 8)
+	x71 := (u8(x70) & 0xff)
+	x72 := (x70 >> 8)
+	x73 := (u8(x72) & 0xff)
+	x74 := (x72 >> 8)
+	x75 := (u8(x74) & 0xff)
+	x76 := (x74 >> 8)
+	x77 := (u8(x76) & 0xff)
+	x78 := u8((x76 >> 8))
+	x79 := (x22 + u64(x78))
+	x80 := (u8(x79) & 0xff)
+	x81 := (x79 >> 8)
+	x82 := (u8(x81) & 0xff)
+	x83 := (x81 >> 8)
+	x84 := (u8(x83) & 0xff)
+	x85 := (x83 >> 8)
+	x86 := (u8(x85) & 0xff)
+	x87 := (x85 >> 8)
+	x88 := (u8(x87) & 0xff)
+	x89 := (x87 >> 8)
+	x90 := (u8(x89) & 0xff)
+	x91 := u8((x89 >> 8))
+	out1[0] = x26
+	out1[1] = x28
+	out1[2] = x30
+	out1[3] = x32
+	out1[4] = x34
+	out1[5] = x36
+	out1[6] = x39
+	out1[7] = x41
+	out1[8] = x43
+	out1[9] = x45
+	out1[10] = x47
+	out1[11] = x49
+	out1[12] = x52
+	out1[13] = x54
+	out1[14] = x56
+	out1[15] = x58
+	out1[16] = x60
+	out1[17] = x62
+	out1[18] = x64
+	out1[19] = x67
+	out1[20] = x69
+	out1[21] = x71
+	out1[22] = x73
+	out1[23] = x75
+	out1[24] = x77
+	out1[25] = x80
+	out1[26] = x82
+	out1[27] = x84
+	out1[28] = x86
+	out1[29] = x88
+	out1[30] = x90
+	out1[31] = x91
+}
+
+_fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
+	x1 := (u64(arg1[31]) << 44)
+	x2 := (u64(arg1[30]) << 36)
+	x3 := (u64(arg1[29]) << 28)
+	x4 := (u64(arg1[28]) << 20)
+	x5 := (u64(arg1[27]) << 12)
+	x6 := (u64(arg1[26]) << 4)
+	x7 := (u64(arg1[25]) << 47)
+	x8 := (u64(arg1[24]) << 39)
+	x9 := (u64(arg1[23]) << 31)
+	x10 := (u64(arg1[22]) << 23)
+	x11 := (u64(arg1[21]) << 15)
+	x12 := (u64(arg1[20]) << 7)
+	x13 := (u64(arg1[19]) << 50)
+	x14 := (u64(arg1[18]) << 42)
+	x15 := (u64(arg1[17]) << 34)
+	x16 := (u64(arg1[16]) << 26)
+	x17 := (u64(arg1[15]) << 18)
+	x18 := (u64(arg1[14]) << 10)
+	x19 := (u64(arg1[13]) << 2)
+	x20 := (u64(arg1[12]) << 45)
+	x21 := (u64(arg1[11]) << 37)
+	x22 := (u64(arg1[10]) << 29)
+	x23 := (u64(arg1[9]) << 21)
+	x24 := (u64(arg1[8]) << 13)
+	x25 := (u64(arg1[7]) << 5)
+	x26 := (u64(arg1[6]) << 48)
+	x27 := (u64(arg1[5]) << 40)
+	x28 := (u64(arg1[4]) << 32)
+	x29 := (u64(arg1[3]) << 24)
+	x30 := (u64(arg1[2]) << 16)
+	x31 := (u64(arg1[1]) << 8)
+	x32 := arg1[0]
+	x33 := (x31 + u64(x32))
+	x34 := (x30 + x33)
+	x35 := (x29 + x34)
+	x36 := (x28 + x35)
+	x37 := (x27 + x36)
+	x38 := (x26 + x37)
+	x39 := (x38 & 0x7ffffffffffff)
+	x40 := u8((x38 >> 51))
+	x41 := (x25 + u64(x40))
+	x42 := (x24 + x41)
+	x43 := (x23 + x42)
+	x44 := (x22 + x43)
+	x45 := (x21 + x44)
+	x46 := (x20 + x45)
+	x47 := (x46 & 0x7ffffffffffff)
+	x48 := u8((x46 >> 51))
+	x49 := (x19 + u64(x48))
+	x50 := (x18 + x49)
+	x51 := (x17 + x50)
+	x52 := (x16 + x51)
+	x53 := (x15 + x52)
+	x54 := (x14 + x53)
+	x55 := (x13 + x54)
+	x56 := (x55 & 0x7ffffffffffff)
+	x57 := u8((x55 >> 51))
+	x58 := (x12 + u64(x57))
+	x59 := (x11 + x58)
+	x60 := (x10 + x59)
+	x61 := (x9 + x60)
+	x62 := (x8 + x61)
+	x63 := (x7 + x62)
+	x64 := (x63 & 0x7ffffffffffff)
+	x65 := u8((x63 >> 51))
+	x66 := (x6 + u64(x65))
+	x67 := (x5 + x66)
+	x68 := (x4 + x67)
+	x69 := (x3 + x68)
+	x70 := (x2 + x69)
+	x71 := (x1 + x70)
+	out1[0] = x39
+	out1[1] = x47
+	out1[2] = x56
+	out1[3] = x64
+	out1[4] = x71
+}
+
+fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	x4 := arg1[3]
+	x5 := arg1[4]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}
+
+fe_carry_scmul_121666 :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	x2, x1 := bits.mul_u64(0x1db42, arg1[4])
+	x4, x3 := bits.mul_u64(0x1db42, arg1[3])
+	x6, x5 := bits.mul_u64(0x1db42, arg1[2])
+	x8, x7 := bits.mul_u64(0x1db42, arg1[1])
+	x10, x9 := bits.mul_u64(0x1db42, arg1[0])
+	x11 := ((x9 >> 51) | ((x10 << 13) & 0xffffffffffffffff))
+	x12 := (x9 & 0x7ffffffffffff)
+	x13, x14 := bits.add_u64(x11, x7, u64(0x0))
+	x15 := (u64(fiat.u1(x14)) + x8)
+	x16 := ((x13 >> 51) | ((x15 << 13) & 0xffffffffffffffff))
+	x17 := (x13 & 0x7ffffffffffff)
+	x18, x19 := bits.add_u64(x16, x5, u64(0x0))
+	x20 := (u64(fiat.u1(x19)) + x6)
+	x21 := ((x18 >> 51) | ((x20 << 13) & 0xffffffffffffffff))
+	x22 := (x18 & 0x7ffffffffffff)
+	x23, x24 := bits.add_u64(x21, x3, u64(0x0))
+	x25 := (u64(fiat.u1(x24)) + x4)
+	x26 := ((x23 >> 51) | ((x25 << 13) & 0xffffffffffffffff))
+	x27 := (x23 & 0x7ffffffffffff)
+	x28, x29 := bits.add_u64(x26, x1, u64(0x0))
+	x30 := (u64(fiat.u1(x29)) + x2)
+	x31 := ((x28 >> 51) | ((x30 << 13) & 0xffffffffffffffff))
+	x32 := (x28 & 0x7ffffffffffff)
+	x33 := (x31 * 0x13)
+	x34 := (x12 + x33)
+	x35 := fiat.u1((x34 >> 51))
+	x36 := (x34 & 0x7ffffffffffff)
+	x37 := (u64(x35) + x17)
+	x38 := fiat.u1((x37 >> 51))
+	x39 := (x37 & 0x7ffffffffffff)
+	x40 := (u64(x38) + x22)
+	out1[0] = x36
+	out1[1] = x39
+	out1[2] = x40
+	out1[3] = x27
+	out1[4] = x32
+}
+
+// The following routines were added by hand, and do not come from fiat-crypto.
+
+fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0] = 0
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+	out1[4] = 0
+}
+
+fe_one :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0] = 1
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+	out1[4] = 0
+}
+
+fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	x4 := arg1[3]
+	x5 := arg1[4]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}
+
+fe_cond_swap :: proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) {
+	mask := -u64(arg1) // all-ones when arg1 == 1, zero when arg1 == 0
+	x := (out1[0] ~ out2[0]) & mask // XOR difference of the two limbs if swapping, else 0
+	x1, y1 := out1[0] ~ x, out2[0] ~ x // XORing the difference into both exchanges them; XORing 0 leaves them as-is
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	x = (out1[3] ~ out2[3]) & mask
+	x4, y4 := out1[3] ~ x, out2[3] ~ x
+	x = (out1[4] ~ out2[4]) & mask
+	x5, y5 := out1[4] ~ x, out2[4] ~ x
+	out1[0], out2[0] = x1, y1 // write back all limbs only after every new value has been computed
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+	out1[3], out2[3] = x4, y4
+	out1[4], out2[4] = x5, y5
+}
diff --git a/core/crypto/x25519/x25519.odin b/core/crypto/x25519/x25519.odin
new file mode 100644
index 000000000..dfc8daa47
--- /dev/null
+++ b/core/crypto/x25519/x25519.odin
@@ -0,0 +1,126 @@
+package x25519
+
+import field "core:crypto/_fiat/field_curve25519"
+import "core:mem"
+
+SCALAR_SIZE :: 32
+POINT_SIZE :: 32
+
+_BASE_POINT: [32]byte = {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+
+// _scalar_bit returns bit i of the little-endian scalar s (0 if i < 0).
+_scalar_bit :: #force_inline proc "contextless" (s: ^[32]byte, i: int) -> u8 {
+	if i < 0 { return 0 }
+	byte_index, bit_index := i >> 3, uint(i & 7)
+	return (s[byte_index] >> bit_index) & 1
+}
+
+// _scalarmult writes scalar * point to out; scalar is used as given (no clamping is done here).
+_scalarmult :: proc (out, scalar, point: ^[32]byte) {
+	// Montgomery pseudo-multiplication taken from Monocypher.
+	// deserializes the input point into a field element
+	x1: field.Tight_Field_Element = ---
+	field.fe_from_bytes(&x1, point)
+
+	// computes the actual scalar product (the result is in x2 and z2)
+	x2, x3, z2, z3: field.Tight_Field_Element =  ---, ---, ---, ---
+	t0, t1: field.Loose_Field_Element = ---, ---
+
+	// Montgomery ladder
+	// In projective coordinates, to avoid divisions: x = X / Z
+	// We don't care about the y coordinate, it's only 1 bit of information
+	field.fe_one(&x2) // "zero" point
+	field.fe_zero(&z2)
+	field.fe_set(&x3, &x1) // "one" point
+	field.fe_one(&z3)
+
+	swap: int
+	for pos := 255-1; pos >= 0; pos = pos - 1 	{
+		// constant time conditional swap before ladder step
+		b := int(_scalar_bit(scalar, pos))
+		swap ~= b // xor trick avoids swapping at the end of the loop
+		field.fe_cond_swap(&x2, &x3, swap)
+		field.fe_cond_swap(&z2, &z3, swap)
+		swap = b // anticipates one last swap after the loop
+
+		// Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
+		// with differential addition
+		//
+		// Note: This deliberately omits reductions after add/sub operations
+		// if the result is only ever used as the input to a mul/square since
+		// the implementations of those can deal with non-reduced inputs.
+		//
+		// fe_tighten_cast is only used to store a fully reduced
+		// output in a Loose_Field_Element, or to provide such a
+		// Loose_Field_Element as a Tight_Field_Element argument.
+		field.fe_sub(&t0, &x3, &z3)
+		field.fe_sub(&t1, &x2, &z2)
+		field.fe_add(field.fe_relax_cast(&x2), &x2, &z2) // x2 - unreduced
+		field.fe_add(field.fe_relax_cast(&z2), &x3, &z3) // z2 - unreduced
+		field.fe_carry_mul(&z3, &t0, field.fe_relax_cast(&x2))
+		field.fe_carry_mul(&z2, field.fe_relax_cast(&z2), &t1) // z2 - reduced
+		field.fe_carry_square(field.fe_tighten_cast(&t0), &t1) // t0 - reduced
+		field.fe_carry_square(field.fe_tighten_cast(&t1), field.fe_relax_cast(&x2)) // t1 - reduced
+		field.fe_add(field.fe_relax_cast(&x3), &z3, &z2) // x3 - unreduced
+		field.fe_sub(field.fe_relax_cast(&z2), &z3, &z2) // z2 - unreduced
+		field.fe_carry_mul(&x2, &t1, &t0) // x2 - reduced
+		field.fe_sub(&t1, field.fe_tighten_cast(&t1), field.fe_tighten_cast(&t0)) // safe - t1/t0 is reduced
+		field.fe_carry_square(&z2, field.fe_relax_cast(&z2)) // z2 - reduced
+		field.fe_carry_scmul_121666(&z3, &t1)
+		field.fe_carry_square(&x3, field.fe_relax_cast(&x3)) // x3 - reduced
+		field.fe_add(&t0, field.fe_tighten_cast(&t0), &z3) // safe - t0 is reduced
+		field.fe_carry_mul(&z3, field.fe_relax_cast(&x1), field.fe_relax_cast(&z2))
+		field.fe_carry_mul(&z2, &t1, &t0)
+	}
+	// last swap is necessary to compensate for the xor trick
+	// Note: after this swap, P3 == P2 + P1.
+	field.fe_cond_swap(&x2, &x3, swap)
+	field.fe_cond_swap(&z2, &z3, swap)
+
+	// normalises the coordinates: x == X / Z
+	field.fe_carry_inv(&z2, field.fe_relax_cast(&z2))
+	field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2))
+	field.fe_to_bytes(out, &x2)
+
+	mem.zero_explicit(&x1, size_of(x1))
+	mem.zero_explicit(&x2, size_of(x2))
+	mem.zero_explicit(&x3, size_of(x3))
+	mem.zero_explicit(&z2, size_of(z2))
+	mem.zero_explicit(&z3, size_of(z3))
+	mem.zero_explicit(&t0, size_of(t0))
+	mem.zero_explicit(&t1, size_of(t1))
+}
+
+// scalarmult clamps a copy of the 32-byte scalar, multiplies it with the
+// 32-byte point via _scalarmult and writes the 32-byte result to dst.
+// It panics if any of the three slices has the wrong length.
+scalarmult :: proc (dst, scalar, point: []byte) {
+	if len(scalar) != SCALAR_SIZE {
+		panic("crypto/x25519: invalid scalar size")
+	}
+	if len(point) != POINT_SIZE {
+		panic("crypto/x25519: invalid point size")
+	}
+	if len(dst) != POINT_SIZE {
+		panic("crypto/x25519: invalid destination point size")
+	}
+
+	// Work on private copies so the caller's scalar and point stay untouched.
+	clamped, pt, result: [32]byte = ---, ---, ---
+	copy_slice(clamped[:], scalar)
+	copy_slice(pt[:], point)
+
+	// "clamp" the scalar: clear the low 3 bits and bit 255, set bit 254.
+	clamped[0] &= 0xf8
+	clamped[31] = (clamped[31] & 0x7f) | 0x40
+	_scalarmult(&result, &clamped, &pt)
+	copy_slice(dst, result[:])
+
+	mem.zero_explicit(&clamped, size_of(clamped))
+	mem.zero_explicit(&result, size_of(result))
+}
+
+scalarmult_basepoint :: proc (dst, scalar: []byte) {
+	// Delegates to scalarmult with _BASE_POINT. TODO/perf: Switch to using a precomputed table.
+	scalarmult(dst, scalar, _BASE_POINT[:])
+}
diff --git a/tests/core/crypto/test_core_crypto.odin b/tests/core/crypto/test_core_crypto.odin
index df9920552..768ba242f 100644
--- a/tests/core/crypto/test_core_crypto.odin
+++ b/tests/core/crypto/test_core_crypto.odin
@@ -115,6 +115,11 @@ main :: proc() {
     test_haval_224(&t)
     test_haval_256(&t)
 
+    // "modern" crypto tests
+    test_x25519(&t)
+
+    bench_modern(&t)
+
     fmt.printf("%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count)
 }
 
diff --git a/tests/core/crypto/test_core_crypto_modern.odin b/tests/core/crypto/test_core_crypto_modern.odin
new file mode 100644
index 000000000..4d7f08bb1
--- /dev/null
+++ b/tests/core/crypto/test_core_crypto_modern.odin
@@ -0,0 +1,95 @@
+package test_core_crypto
+
+import "core:testing"
+import "core:fmt"
+import "core:time"
+
+import "core:crypto/x25519"
+
+_digit_value :: proc(r: rune) -> int {
+	// '0'-'9' -> 0-9, letters of either case -> 10-35, anything else -> 16.
+	code := int(r)
+	switch {
+	case '0' <= r && r <= '9': return code - '0'
+	case 'a' <= r && r <= 'z': return code - 'a' + 10
+	case 'A' <= r && r <= 'Z': return code - 'A' + 10
+	}
+	return 16
+}
+
+_decode_hex32 :: proc(s: string) -> [32]byte{
+	b: [32]byte
+	// Each pair of hex digits (high nibble first) becomes one output byte.
+	for off := 0; off < len(s); off += 2 {
+		hi, lo := _digit_value(rune(s[off])), _digit_value(rune(s[off+1]))
+		b[off/2] = byte(hi << 4 | lo)
+	}
+	return b
+}
+
+TestECDH :: struct {
+	scalar:  string,
+	point:   string,
+	product: string, // expected hex of scalarmult(scalar, point); scalar and point are hex strings too
+}
+
+@(test)
+test_x25519 :: proc(t: ^testing.T) {
+	log(t, "Testing X25519")
+
+	// Test vectors from RFC 7748 (scalar, point, expected product).
+	test_vectors := [?]TestECDH {
+		TestECDH{
+			"a546e36bf0527c9d3b16154b82465edd62144c0ac1fc5a18506a2244ba449ac4",
+			"e6db6867583030db3594c1a424b15f7c726624ec26b3353b10a903a6d0ab1c4c",
+			"c3da55379de9c6908e94ea4df28d084f32eccf03491c71f754b4075577a28552",
+		},
+		TestECDH{
+			"4b66e9d4d1b4673c5ad22691957d6af5c11b6421e0ea01d42ca4169e7918ba0d",
+			"e5210f12786811d3f4b7959d0538ae2c31dbe7106fc03c3efc4cd549c715a493",
+			"95cbde9476e8907d7aade45cb4b873f88b595a68799fa152e6f8f7647aac7957",
+		},
+	}
+	for vec in test_vectors {
+		scalar := _decode_hex32(vec.scalar)
+		point := _decode_hex32(vec.point)
+
+		// The variable-base multiply must reproduce the expected product.
+		got: [x25519.POINT_SIZE]byte
+		x25519.scalarmult(got[:], scalar[:], point[:])
+		got_str := hex_string(got[:])
+		expect(t, got_str == vec.product, fmt.tprintf("Expected %s for %s * %s, but got %s instead", vec.product, vec.scalar, vec.point, got_str))
+
+		// Abuse the test vectors to sanity-check the scalar-basepoint multiply.
+		via_base, via_generic: [x25519.POINT_SIZE]byte
+		x25519.scalarmult_basepoint(via_base[:], scalar[:])
+		x25519.scalarmult(via_generic[:], scalar[:], x25519._BASE_POINT[:])
+		base_str, generic_str := hex_string(via_base[:]), hex_string(via_generic[:])
+		expect(t, base_str == generic_str, fmt.tprintf("Expected %s for %s * basepoint, but got %s instead", generic_str, vec.scalar, base_str))
+	}
+
+	// TODO/tests: Run the wycheproof test vectors, once I figure out
+	// how to work with JSON.
+}
+
+@(test) // also called directly from main in test_core_crypto.odin
+bench_modern :: proc(t: ^testing.T) {
+	fmt.println("Starting benchmarks:")
+
+	bench_x25519(t)
+}
+
+bench_x25519 :: proc(t: ^testing.T) {
+	point := _decode_hex32("deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef")
+	scalar := _decode_hex32("cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe")
+	result: [x25519.POINT_SIZE]byte = ---
+
+	// Time a fixed number of back-to-back multiplications and log the mean.
+	ITERATIONS :: 10000
+	began := time.now()
+	for _ in 0..<ITERATIONS {
+		x25519.scalarmult(result[:], scalar[:], point[:])
+	}
+	us_per_op := time.duration_microseconds(time.since(began)) / ITERATIONS
+	log(t, fmt.tprintf("x25519.scalarmult: ~%f us/op", us_per_op))
+}

From 64db286582dc6007317bc5c74d7f5e156f22f855 Mon Sep 17 00:00:00 2001
From: Yawning Angel <yawning@schwanenlied.me>
Date: Sat, 6 Nov 2021 07:38:04 +0000
Subject: [PATCH 04/10] core/crypto: Add poly1305

This package implements the Poly1305 MAC algorithm as specified in RFC
8439, using routines taken from fiat-crypto and poly1305-donna.
---
 core/crypto/_fiat/field_poly1305/field.odin   |  45 +++
 .../_fiat/field_poly1305/field4344.odin       | 356 ++++++++++++++++++
 core/crypto/poly1305/poly1305.odin            | 163 ++++++++
 tests/core/crypto/test_core_crypto.odin       |   1 +
 .../core/crypto/test_core_crypto_modern.odin  | 131 +++++++
 5 files changed, 696 insertions(+)
 create mode 100644 core/crypto/_fiat/field_poly1305/field.odin
 create mode 100644 core/crypto/_fiat/field_poly1305/field4344.odin
 create mode 100644 core/crypto/poly1305/poly1305.odin

diff --git a/core/crypto/_fiat/field_poly1305/field.odin b/core/crypto/_fiat/field_poly1305/field.odin
new file mode 100644
index 000000000..642949b02
--- /dev/null
+++ b/core/crypto/_fiat/field_poly1305/field.odin
@@ -0,0 +1,45 @@
+package field_poly1305
+
+import "core:crypto/util"
+import "core:mem"
+
+fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element {
+	return transmute(^Loose_Field_Element)(arg1) // same pointer, reinterpreted; nothing is copied
+}
+
+fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element { // pointer reinterpretation only; no carry is performed
+	return transmute(^Tight_Field_Element)(arg1)
+}
+
+fe_from_bytes :: proc (out1: ^Tight_Field_Element, arg1: []byte, arg2: byte, sanitize: bool = true) {
+	// fiat-crypto's deserialization routine wants 256-bits of input, but
+	// r/s are 128-bits long, and block processing works on 128-bits plus a
+	// final bit.
+	//
+	// This is more ergonomic, and while the copy is unfortunate, this avoids
+	// having to alter the fiat-crypto derived code.
+
+	assert(len(arg1) == 16)
+
+	tmp: [32]byte
+	copy_slice(tmp[0:16], arg1[:])
+	tmp[16] = arg2
+
+	_fe_from_bytes(out1, &tmp)
+
+	// Need to sanitize the temporary buffer when deserializing `s`.
+	if sanitize {
+		mem.zero_explicit(&tmp, size_of(tmp))
+	}
+}
+
+// fe_from_u64s loads the 128-bit little-endian value (hi << 64) | lo into out1.
+fe_from_u64s :: proc "contextless" (out1: ^Tight_Field_Element, lo, hi: u64) {
+	// Serialize both halves into a zero-padded 32-byte buffer, which is
+	// the input that _fe_from_bytes takes.
+	buf: [32]byte
+	util.PUT_U64_LE(buf[0:8], lo)
+	util.PUT_U64_LE(buf[8:16], hi)
+	_fe_from_bytes(out1, &buf)
+	mem.zero_explicit(&buf, size_of(buf)) // only used for `r`, which is confidential
+}
diff --git a/core/crypto/_fiat/field_poly1305/field4344.odin b/core/crypto/_fiat/field_poly1305/field4344.odin
new file mode 100644
index 000000000..ba9bc2694
--- /dev/null
+++ b/core/crypto/_fiat/field_poly1305/field4344.odin
@@ -0,0 +1,356 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     1. Redistributions of source code must retain the above copyright
+//        notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package field_poly1305
+
+// This file provides arithmetic on the field Z/(2^130 - 5) using
+// unsaturated 64-bit integer arithmetic.  It is derived primarily
+// from the machine-generated Golang output of the fiat-crypto project.
+//
+// While the base implementation is provably correct, this implementation
+// makes no such claims as the port and optimizations were done by hand.
+// At some point, it may be worth adding support to fiat-crypto for
+// generating Odin output.
+
+import fiat "core:crypto/_fiat"
+import "core:math/bits"
+
+Loose_Field_Element :: distinct [3]u64 // output type of fe_add/fe_sub/fe_opp: limbs not yet carried
+Tight_Field_Element :: distinct [3]u64 // output type of the fe_carry* routines: limbs carried (44/43/43-bit masks)
+
+// _addcarryx_u44 adds arg1 + arg2 + arg3 and splits the sum into its low
+// 44 bits (out1) and the carry above bit 43 (out2).
+_addcarryx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	sum := u64(arg1) + arg2 + arg3
+	out1 = sum & 0xfffffffffff
+	out2 = fiat.u1(sum >> 44)
+	return
+}
+
+// _subborrowx_u44 computes arg2 - arg1 - arg3 as a signed value; out1 is
+// the low 44 bits of the difference and out2 is the borrow flag.
+_subborrowx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	diff := i64(arg2) - i64(arg1) - i64(arg3)
+	out1 = u64(diff) & 0xfffffffffff
+	out2 = 0x0 - fiat.u1(fiat.i1(diff >> 44))
+	return
+}
+
+// _addcarryx_u43 adds arg1 + arg2 + arg3 and splits the sum into its low
+// 43 bits (out1) and the carry above bit 42 (out2).
+_addcarryx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	sum := u64(arg1) + arg2 + arg3
+	out1 = sum & 0x7ffffffffff
+	out2 = fiat.u1(sum >> 43)
+	return
+}
+
+// _subborrowx_u43 computes arg2 - arg1 - arg3 as a signed value; out1 is
+// the low 43 bits of the difference and out2 is the borrow flag.
+_subborrowx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	diff := i64(arg2) - i64(arg1) - i64(arg3)
+	out1 = u64(diff) & 0x7ffffffffff
+	out2 = 0x0 - fiat.u1(fiat.i1(diff >> 43))
+	return
+}
+
+fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) { // out1 = arg1 * arg2 in Z/(2^130 - 5), carried
+	x2, x1 := bits.mul_u64(arg1[2], (arg2[2] * 0x5))
+	x4, x3 := bits.mul_u64(arg1[2], (arg2[1] * 0xa))
+	x6, x5 := bits.mul_u64(arg1[1], (arg2[2] * 0xa))
+	x8, x7 := bits.mul_u64(arg1[2], arg2[0])
+	x10, x9 := bits.mul_u64(arg1[1], (arg2[1] * 0x2))
+	x12, x11 := bits.mul_u64(arg1[1], arg2[0])
+	x14, x13 := bits.mul_u64(arg1[0], arg2[2])
+	x16, x15 := bits.mul_u64(arg1[0], arg2[1])
+	x18, x17 := bits.mul_u64(arg1[0], arg2[0])
+	x19, x20 := bits.add_u64(x5, x3, u64(0x0))
+	x21, _ := bits.add_u64(x6, x4, u64(fiat.u1(x20)))
+	x23, x24 := bits.add_u64(x17, x19, u64(0x0))
+	x25, _ := bits.add_u64(x18, x21, u64(fiat.u1(x24)))
+	x27 := ((x23 >> 44) | ((x25 << 20) & 0xffffffffffffffff)) // everything above the low 44 bits of the x25:x23 pair
+	x28 := (x23 & 0xfffffffffff)
+	x29, x30 := bits.add_u64(x9, x7, u64(0x0))
+	x31, _ := bits.add_u64(x10, x8, u64(fiat.u1(x30)))
+	x33, x34 := bits.add_u64(x13, x29, u64(0x0))
+	x35, _ := bits.add_u64(x14, x31, u64(fiat.u1(x34)))
+	x37, x38 := bits.add_u64(x11, x1, u64(0x0))
+	x39, _ := bits.add_u64(x12, x2, u64(fiat.u1(x38)))
+	x41, x42 := bits.add_u64(x15, x37, u64(0x0))
+	x43, _ := bits.add_u64(x16, x39, u64(fiat.u1(x42)))
+	x45, x46 := bits.add_u64(x27, x41, u64(0x0))
+	x47 := (u64(fiat.u1(x46)) + x43)
+	x48 := ((x45 >> 43) | ((x47 << 21) & 0xffffffffffffffff))
+	x49 := (x45 & 0x7ffffffffff)
+	x50, x51 := bits.add_u64(x48, x33, u64(0x0))
+	x52 := (u64(fiat.u1(x51)) + x35)
+	x53 := ((x50 >> 43) | ((x52 << 21) & 0xffffffffffffffff))
+	x54 := (x50 & 0x7ffffffffff)
+	x55 := (x53 * 0x5) // the carry out of the top limb re-enters limb 0 multiplied by 5
+	x56 := (x28 + x55)
+	x57 := (x56 >> 44)
+	x58 := (x56 & 0xfffffffffff)
+	x59 := (x57 + x49)
+	x60 := fiat.u1((x59 >> 43))
+	x61 := (x59 & 0x7ffffffffff)
+	x62 := (u64(x60) + x54)
+	out1[0] = x58
+	out1[1] = x61
+	out1[2] = x62
+}
+
+fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { // out1 = arg1 * arg1 in Z/(2^130 - 5), carried
+	x1 := (arg1[2] * 0x5)
+	x2 := (x1 * 0x2)
+	x3 := (arg1[2] * 0x2)
+	x4 := (arg1[1] * 0x2)
+	x6, x5 := bits.mul_u64(arg1[2], x1)
+	x8, x7 := bits.mul_u64(arg1[1], (x2 * 0x2))
+	x10, x9 := bits.mul_u64(arg1[1], (arg1[1] * 0x2))
+	x12, x11 := bits.mul_u64(arg1[0], x3)
+	x14, x13 := bits.mul_u64(arg1[0], x4)
+	x16, x15 := bits.mul_u64(arg1[0], arg1[0])
+	x17, x18 := bits.add_u64(x15, x7, u64(0x0))
+	x19, _ := bits.add_u64(x16, x8, u64(fiat.u1(x18)))
+	x21 := ((x17 >> 44) | ((x19 << 20) & 0xffffffffffffffff)) // everything above the low 44 bits of the x19:x17 pair
+	x22 := (x17 & 0xfffffffffff)
+	x23, x24 := bits.add_u64(x11, x9, u64(0x0))
+	x25, _ := bits.add_u64(x12, x10, u64(fiat.u1(x24)))
+	x27, x28 := bits.add_u64(x13, x5, u64(0x0))
+	x29, _ := bits.add_u64(x14, x6, u64(fiat.u1(x28)))
+	x31, x32 := bits.add_u64(x21, x27, u64(0x0))
+	x33 := (u64(fiat.u1(x32)) + x29)
+	x34 := ((x31 >> 43) | ((x33 << 21) & 0xffffffffffffffff))
+	x35 := (x31 & 0x7ffffffffff)
+	x36, x37 := bits.add_u64(x34, x23, u64(0x0))
+	x38 := (u64(fiat.u1(x37)) + x25)
+	x39 := ((x36 >> 43) | ((x38 << 21) & 0xffffffffffffffff))
+	x40 := (x36 & 0x7ffffffffff)
+	x41 := (x39 * 0x5) // the carry out of the top limb re-enters limb 0 multiplied by 5
+	x42 := (x22 + x41)
+	x43 := (x42 >> 44)
+	x44 := (x42 & 0xfffffffffff)
+	x45 := (x43 + x35)
+	x46 := fiat.u1((x45 >> 43))
+	x47 := (x45 & 0x7ffffffffff)
+	x48 := (u64(x46) + x40)
+	out1[0] = x44
+	out1[1] = x47
+	out1[2] = x48
+}
+
+fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { // propagates carries between the 44/43/43-bit limbs
+	x1 := arg1[0]
+	x2 := ((x1 >> 44) + arg1[1]) // limb 1 plus the carry out of limb 0
+	x3 := ((x2 >> 43) + arg1[2]) // limb 2 plus the carry out of limb 1
+	x4 := ((x1 & 0xfffffffffff) + ((x3 >> 43) * 0x5)) // carry out of limb 2 re-enters limb 0 multiplied by 5
+	x5 := (u64(fiat.u1((x4 >> 44))) + (x2 & 0x7ffffffffff))
+	x6 := (x4 & 0xfffffffffff)
+	x7 := (x5 & 0x7ffffffffff)
+	x8 := (u64(fiat.u1((x5 >> 43))) + (x3 & 0x7ffffffffff))
+	out1[0] = x6
+	out1[1] = x7
+	out1[2] = x8
+}
+
+// fe_add sets out1 = arg1 + arg2 limb by limb, without carrying.
+fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	// Every sum is computed before anything is stored to out1.
+	s0 := arg1[0] + arg2[0]
+	s1 := arg1[1] + arg2[1]
+	s2 := arg1[2] + arg2[2]
+	out1[0], out1[1], out1[2] = s0, s1, s2
+}
+
+// fe_sub sets out1 = arg1 - arg2 limb by limb, without carrying.  Each limb
+// first gets a fixed bias (twice the matching limb of 2^130 - 5) added.
+fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	d0 := (0x1ffffffffff6 + arg1[0]) - arg2[0]
+	d1 := (0xffffffffffe + arg1[1]) - arg2[1]
+	d2 := (0xffffffffffe + arg1[2]) - arg2[2]
+	out1[0], out1[1], out1[2] = d0, d1, d2
+}
+
+// fe_opp sets out1 = -arg1 by subtracting each limb from a fixed constant
+// (twice the matching limb of 2^130 - 5), without carrying.
+fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	n0 := 0x1ffffffffff6 - arg1[0]
+	n1 := 0xffffffffffe - arg1[1]
+	n2 := 0xffffffffffe - arg1[2]
+	out1[0], out1[1], out1[2] = n0, n1, n2
+}
+
+// fe_cond_assign selects each limb of out1 via fiat.cmovznz_u64(arg2, out1[i], arg1[i]);
+// presumably out1 takes arg1 when arg2 is true (confirm against fiat.cmovznz_u64).
+fe_cond_assign :: proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: bool) {
+	pick := fiat.u1(arg2)
+	out1[0] = fiat.cmovznz_u64(pick, out1[0], arg1[0])
+	out1[1] = fiat.cmovznz_u64(pick, out1[1], arg1[1])
+	out1[2] = fiat.cmovznz_u64(pick, out1[2], arg1[2])
+}
+
+fe_to_bytes :: proc "contextless" (out1: ^[32]byte, arg1: ^Tight_Field_Element) { // writes out1[0..16] only; out1[17..31] are not touched
+	x1, x2 := _subborrowx_u44(0x0, arg1[0], 0xffffffffffb) // subtract the limbs of p = 2^130 - 5 ...
+	x3, x4 := _subborrowx_u43(x2, arg1[1], 0x7ffffffffff)
+	x5, x6 := _subborrowx_u43(x4, arg1[2], 0x7ffffffffff)
+	x7 := fiat.cmovznz_u64(x6, u64(0x0), 0xffffffffffffffff) // ... and build a mask from the final borrow
+	x8, x9 := _addcarryx_u44(0x0, x1, (x7 & 0xffffffffffb)) // add p back, masked by x7
+	x10, x11 := _addcarryx_u43(x9, x3, (x7 & 0x7ffffffffff))
+	x12, _ := _addcarryx_u43(x11, x5, (x7 & 0x7ffffffffff))
+	x14 := (x12 << 7) // limb 2 starts at bit 87 = byte 10, bit 7
+	x15 := (x10 << 4) // limb 1 starts at bit 44 = byte 5, bit 4
+	x16 := (u8(x8) & 0xff)
+	x17 := (x8 >> 8)
+	x18 := (u8(x17) & 0xff)
+	x19 := (x17 >> 8)
+	x20 := (u8(x19) & 0xff)
+	x21 := (x19 >> 8)
+	x22 := (u8(x21) & 0xff)
+	x23 := (x21 >> 8)
+	x24 := (u8(x23) & 0xff)
+	x25 := u8((x23 >> 8))
+	x26 := (x15 + u64(x25))
+	x27 := (u8(x26) & 0xff)
+	x28 := (x26 >> 8)
+	x29 := (u8(x28) & 0xff)
+	x30 := (x28 >> 8)
+	x31 := (u8(x30) & 0xff)
+	x32 := (x30 >> 8)
+	x33 := (u8(x32) & 0xff)
+	x34 := (x32 >> 8)
+	x35 := (u8(x34) & 0xff)
+	x36 := u8((x34 >> 8))
+	x37 := (x14 + u64(x36))
+	x38 := (u8(x37) & 0xff)
+	x39 := (x37 >> 8)
+	x40 := (u8(x39) & 0xff)
+	x41 := (x39 >> 8)
+	x42 := (u8(x41) & 0xff)
+	x43 := (x41 >> 8)
+	x44 := (u8(x43) & 0xff)
+	x45 := (x43 >> 8)
+	x46 := (u8(x45) & 0xff)
+	x47 := (x45 >> 8)
+	x48 := (u8(x47) & 0xff)
+	x49 := u8((x47 >> 8))
+	out1[0] = x16
+	out1[1] = x18
+	out1[2] = x20
+	out1[3] = x22
+	out1[4] = x24
+	out1[5] = x27
+	out1[6] = x29
+	out1[7] = x31
+	out1[8] = x33
+	out1[9] = x35
+	out1[10] = x38
+	out1[11] = x40
+	out1[12] = x42
+	out1[13] = x44
+	out1[14] = x46
+	out1[15] = x48
+	out1[16] = x49
+}
+
+_fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) { // reads arg1[0..16] as a little-endian integer; arg1[17..31] are ignored
+	x1 := (u64(arg1[16]) << 41)
+	x2 := (u64(arg1[15]) << 33)
+	x3 := (u64(arg1[14]) << 25)
+	x4 := (u64(arg1[13]) << 17)
+	x5 := (u64(arg1[12]) << 9)
+	x6 := (u64(arg1[11]) * u64(0x2))
+	x7 := (u64(arg1[10]) << 36)
+	x8 := (u64(arg1[9]) << 28)
+	x9 := (u64(arg1[8]) << 20)
+	x10 := (u64(arg1[7]) << 12)
+	x11 := (u64(arg1[6]) << 4)
+	x12 := (u64(arg1[5]) << 40)
+	x13 := (u64(arg1[4]) << 32)
+	x14 := (u64(arg1[3]) << 24)
+	x15 := (u64(arg1[2]) << 16)
+	x16 := (u64(arg1[1]) << 8)
+	x17 := arg1[0]
+	x18 := (x16 + u64(x17))
+	x19 := (x15 + x18)
+	x20 := (x14 + x19)
+	x21 := (x13 + x20)
+	x22 := (x12 + x21)
+	x23 := (x22 & 0xfffffffffff) // limb 0: low 44 bits of bytes 0..5
+	x24 := u8((x22 >> 44)) // the remaining 4 bits of byte 5 spill into limb 1
+	x25 := (x11 + u64(x24))
+	x26 := (x10 + x25)
+	x27 := (x9 + x26)
+	x28 := (x8 + x27)
+	x29 := (x7 + x28)
+	x30 := (x29 & 0x7ffffffffff) // limb 1: next 43 bits
+	x31 := fiat.u1((x29 >> 43)) // the remaining bit of byte 10 spills into limb 2
+	x32 := (x6 + u64(x31))
+	x33 := (x5 + x32)
+	x34 := (x4 + x33)
+	x35 := (x3 + x34)
+	x36 := (x2 + x35)
+	x37 := (x1 + x36)
+	out1[0] = x23
+	out1[1] = x30
+	out1[2] = x37
+}
+
+// fe_relax copies the limbs of a tight element into a loose one unchanged.
+fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	// Read all limbs before writing any of them.
+	l0, l1, l2 := arg1[0], arg1[1], arg1[2]
+	out1[0] = l0
+	out1[1] = l1
+	out1[2] = l2
+}
+
+// The following routines were added by hand, and do not come from fiat-crypto.
+
+// fe_zero sets all three limbs of out1 to 0.
+fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0], out1[1] = 0, 0
+	out1[2] = 0
+}
+
+// fe_set copies the three limbs of arg1 into out1.
+fe_set :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	// Read all limbs before writing any of them.
+	l0, l1, l2 := arg1[0], arg1[1], arg1[2]
+	out1[0] = l0
+	out1[1] = l1
+	out1[2] = l2
+}
+
+// fe_cond_swap exchanges out1 and out2 when arg1 is true and leaves both
+// unchanged otherwise, without branching on arg1.
+fe_cond_swap :: proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: bool) {
+	// false -> all-zeroes mask, true -> all-ones mask
+	mask := -u64(arg1)
+	for i in 0..<3 {
+		// diff is limb_a xor limb_b when swapping and 0 otherwise.
+		diff := (out1[i] ~ out2[i]) & mask
+		out1[i] ~= diff
+		out2[i] ~= diff
+	}
+}
diff --git a/core/crypto/poly1305/poly1305.odin b/core/crypto/poly1305/poly1305.odin
new file mode 100644
index 000000000..8986be879
--- /dev/null
+++ b/core/crypto/poly1305/poly1305.odin
@@ -0,0 +1,163 @@
+package poly1305
+
+import "core:crypto"
+import "core:crypto/util"
+import field "core:crypto/_fiat/field_poly1305"
+import "core:mem"
+
+KEY_SIZE :: 32
+TAG_SIZE :: 16
+
+_BLOCK_SIZE :: 16
+
+// sum runs init(key), update(msg) and final(dst) on a temporary Context; the tag ends up in dst.
+sum :: proc (dst, msg, key: []byte) {
+	ctx: Context = ---
+	init(&ctx, key)
+	update(&ctx, msg)
+	final(&ctx, dst)
+}
+
+// verify recomputes the Poly1305 tag of msg under key and reports whether
+// it equals tag, using crypto.compare_constant_time for the comparison.
+// Panics if tag is not TAG_SIZE bytes long.
+verify :: proc (tag, msg, key: []byte) -> bool {
+	if len(tag) != TAG_SIZE {
+		panic("crypto/poly1305: invalid tag size")
+	}
+	ctx: Context = ---
+	expected: [16]byte = ---
+	init(&ctx, key)
+	update(&ctx, msg)
+	final(&ctx, expected[:])
+	return crypto.compare_constant_time(expected[:], tag) == 1
+}
+
+Context :: struct {
+	_r: field.Tight_Field_Element, // clamped key[0..15], the per-block multiplier
+	_a: field.Tight_Field_Element, // accumulator
+	_s: field.Tight_Field_Element, // key[16..31], added to the accumulator in final
+
+	_buffer: [_BLOCK_SIZE]byte, // partial block carried over between update calls
+	_leftover: int, // number of valid bytes in _buffer
+
+	_is_initialized: bool, // set by init, cleared by reset
+}
+
+// init prepares ctx for a new MAC computation keyed with the 32-byte
+// one-time key.  Panics if key is not KEY_SIZE bytes long.
+init :: proc (ctx: ^Context, key: []byte) {
+	if len(key) != KEY_SIZE {
+		panic("crypto/poly1305: invalid key size")
+	}
+
+	// r = clamp(le_bytes_to_num(key[0..15]))
+	//   = le_bytes_to_num(key[0..15]) & 0x0ffffffc0ffffffc0ffffffc0fffffff
+	r_lo := util.U64_LE(key[0:8]) & 0x0ffffffc0fffffff
+	r_hi := util.U64_LE(key[8:16]) & 0x0ffffffc0ffffffc
+	field.fe_from_u64s(&ctx._r, r_lo, r_hi)
+
+	// s = le_bytes_to_num(key[16..31])
+	field.fe_from_bytes(&ctx._s, key[16:32], 0)
+
+	// a = 0, and nothing is buffered yet.
+	field.fe_zero(&ctx._a)
+	ctx._leftover = 0
+
+	ctx._is_initialized = true
+}
+
+// update absorbs data into the running MAC.  Input is consumed in 16-byte
+// blocks; a trailing partial block is buffered in ctx until more data or
+// final arrives.  ctx must have been set up with init.
+update :: proc (ctx: ^Context, data: []byte) {
+	assert(ctx._is_initialized)
+
+	rest := data
+
+	// Top up a previously buffered partial block first.
+	if ctx._leftover > 0 {
+		take := min(_BLOCK_SIZE - ctx._leftover, len(rest))
+		copy_slice(ctx._buffer[ctx._leftover:], rest[:take])
+		rest = rest[take:]
+		ctx._leftover += take
+		if ctx._leftover < _BLOCK_SIZE {
+			return
+		}
+		_blocks(ctx, ctx._buffer[:])
+		ctx._leftover = 0
+	}
+
+	// Process as many whole blocks as possible straight from the input.
+	if len(rest) >= _BLOCK_SIZE {
+		whole := len(rest) & (~int(_BLOCK_SIZE - 1))
+		_blocks(ctx, rest[:whole])
+		rest = rest[whole:]
+	}
+
+	// Stash whatever is left (fewer than 16 bytes) for the next call.
+	if len(rest) > 0 {
+		// TODO: While -donna does it this way, I'm fairly sure that
+		// `ctx._leftover == 0` is an invariant at this point.
+		copy(ctx._buffer[ctx._leftover:], rest)
+		ctx._leftover += len(rest)
+	}
+}
+
+// final writes the 16-byte tag to dst and resets ctx (init is needed before reuse); panics unless len(dst) == TAG_SIZE.
+final :: proc (ctx: ^Context, dst: []byte) {
+	assert(ctx._is_initialized)
+	if len(dst) != TAG_SIZE {
+		panic("crypto/poly1305: invalid destination tag size")
+	}
+
+	// Process remaining block: append the 0x01 terminator, zero-pad to 16 bytes.
+	if ctx._leftover > 0 {
+		ctx._buffer[ctx._leftover] = 1
+		for i := ctx._leftover + 1; i < _BLOCK_SIZE; i = i + 1 {
+			ctx._buffer[i] = 0
+		}
+		_blocks(ctx, ctx._buffer[:], true)
+	}
+
+	// a += s
+	field.fe_add(field.fe_relax_cast(&ctx._a), &ctx._a, &ctx._s) // _a unreduced
+	field.fe_carry(&ctx._a, field.fe_relax_cast(&ctx._a)) // _a reduced
+
+	// return num_to_16_le_bytes(a)
+	tmp: [32]byte = ---
+	field.fe_to_bytes(&tmp, &ctx._a)
+	copy_slice(dst, tmp[0:16])
+
+	reset(ctx)
+}
+
+// reset scrubs the key, accumulator and buffered bytes in ctx and marks it uninitialized.
+reset :: proc (ctx: ^Context) {
+	mem.zero_explicit(&ctx._r, size_of(ctx._r))
+	mem.zero_explicit(&ctx._a, size_of(ctx._a))
+	mem.zero_explicit(&ctx._s, size_of(ctx._s))
+	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
+	ctx._is_initialized = false // _leftover is left as-is; init sets it back to 0
+}
+
+// _blocks folds each whole 16-byte block of msg into the accumulator ctx._a.
+_blocks :: proc (ctx: ^Context, msg: []byte, final := false) {
+	n: field.Tight_Field_Element = ---
+	final_byte := byte(!final) // 17th byte of each block: 1 normally, 0 when final is true
+	data := msg
+	data_len := len(data)
+	for data_len >= _BLOCK_SIZE {
+		// n = le_bytes_to_num(msg[((i-1)*16)..(i*16)] | [0x01])
+		field.fe_from_bytes(&n, data[:_BLOCK_SIZE], final_byte, false)
+
+		// a += n
+		field.fe_add(field.fe_relax_cast(&ctx._a), &ctx._a, &n) // _a unreduced
+
+		// a = (r * a) % p
+		field.fe_carry_mul(&ctx._a, field.fe_relax_cast(&ctx._a), field.fe_relax_cast(&ctx._r)) // _a reduced
+
+		data = data[_BLOCK_SIZE:]
+		data_len = data_len - _BLOCK_SIZE
+	}
+}
diff --git a/tests/core/crypto/test_core_crypto.odin b/tests/core/crypto/test_core_crypto.odin
index 768ba242f..c27b5c6bc 100644
--- a/tests/core/crypto/test_core_crypto.odin
+++ b/tests/core/crypto/test_core_crypto.odin
@@ -116,6 +116,7 @@ main :: proc() {
     test_haval_256(&t)
 
     // "modern" crypto tests
+    test_poly1305(&t)
     test_x25519(&t)
 
     bench_modern(&t)
diff --git a/tests/core/crypto/test_core_crypto_modern.odin b/tests/core/crypto/test_core_crypto_modern.odin
index 4d7f08bb1..f4f07928e 100644
--- a/tests/core/crypto/test_core_crypto_modern.odin
+++ b/tests/core/crypto/test_core_crypto_modern.odin
@@ -4,6 +4,7 @@ import "core:testing"
 import "core:fmt"
 import "core:time"
 
+import "core:crypto/poly1305"
 import "core:crypto/x25519"
 
 _digit_value :: proc(r: rune) -> int {
@@ -27,6 +28,70 @@ _decode_hex32 :: proc(s: string) -> [32]byte{
 	return b
 }
 
+@(test)
+test_poly1305 :: proc(t: ^testing.T) {
+	log(t, "Testing poly1305")
+
+	// Test cases taken from poly1305-donna.
+	key := [poly1305.KEY_SIZE]byte{
+		0xee,0xa6,0xa7,0x25,0x1c,0x1e,0x72,0x91,
+		0x6d,0x11,0xc2,0xcb,0x21,0x4d,0x3c,0x25,
+		0x25,0x39,0x12,0x1d,0x8e,0x23,0x4e,0x65,
+		0x2d,0x65,0x1f,0xa4,0xc8,0xcf,0xf8,0x80,
+	}
+
+	msg := [131]byte{
+		0x8e,0x99,0x3b,0x9f,0x48,0x68,0x12,0x73,
+		0xc2,0x96,0x50,0xba,0x32,0xfc,0x76,0xce,
+		0x48,0x33,0x2e,0xa7,0x16,0x4d,0x96,0xa4,
+		0x47,0x6f,0xb8,0xc5,0x31,0xa1,0x18,0x6a,
+		0xc0,0xdf,0xc1,0x7c,0x98,0xdc,0xe8,0x7b,
+		0x4d,0xa7,0xf0,0x11,0xec,0x48,0xc9,0x72,
+		0x71,0xd2,0xc2,0x0f,0x9b,0x92,0x8f,0xe2,
+		0x27,0x0d,0x6f,0xb8,0x63,0xd5,0x17,0x38,
+		0xb4,0x8e,0xee,0xe3,0x14,0xa7,0xcc,0x8a,
+		0xb9,0x32,0x16,0x45,0x48,0xe5,0x26,0xae,
+		0x90,0x22,0x43,0x68,0x51,0x7a,0xcf,0xea,
+		0xbd,0x6b,0xb3,0x73,0x2b,0xc0,0xe9,0xda,
+		0x99,0x83,0x2b,0x61,0xca,0x01,0xb6,0xde,
+		0x56,0x24,0x4a,0x9e,0x88,0xd5,0xf9,0xb3,
+		0x79,0x73,0xf6,0x22,0xa4,0x3d,0x14,0xa6,
+		0x59,0x9b,0x1f,0x65,0x4c,0xb4,0x5a,0x74,
+		0xe3,0x55,0xa5,
+	}
+
+	tag := [poly1305.TAG_SIZE]byte{
+		0xf3,0xff,0xc7,0x70,0x3f,0x94,0x00,0xe5,
+		0x2a,0x7d,0xfb,0x4b,0x3d,0x33,0x05,0xd9,
+	}
+	tag_str := hex_string(tag[:])
+
+	// Verify - oneshot + compare
+	ok := poly1305.verify(tag[:], msg[:], key[:])
+	expect(t, ok, "oneshot verify call failed")
+
+	// Sum - oneshot
+	derived_tag: [poly1305.TAG_SIZE]byte
+	poly1305.sum(derived_tag[:], msg[:], key[:])
+	derived_tag_str := hex_string(derived_tag[:])
+	expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected %s for sum(msg, key), but got %s instead", tag_str, derived_tag_str))
+
+	// Incremental (the previous result is cleared first; `= {}` avoids needing core:mem in this file)
+	derived_tag = {}
+	ctx: poly1305.Context = ---
+	poly1305.init(&ctx, key[:])
+	read_lengths := [11]int{32, 64, 16, 8, 4, 2, 1, 1, 1, 1, 1}
+	off := 0
+	for read_length in read_lengths {
+		to_read := msg[off:off+read_length]
+		poly1305.update(&ctx, to_read)
+		off = off + read_length
+	}
+	poly1305.final(&ctx, derived_tag[:])
+	derived_tag_str = hex_string(derived_tag[:])
+	expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected %s for init/update/final - incremental, but got %s instead", tag_str, derived_tag_str))
+}
+
 TestECDH :: struct {
 	scalar:  string,
 	point:   string,
@@ -76,9 +141,75 @@ test_x25519 :: proc(t: ^testing.T) {
 bench_modern :: proc(t: ^testing.T) {
 	fmt.println("Starting benchmarks:")
 
+	bench_poly1305(t)
 	bench_x25519(t)
 }
 
+// _setup_poly1305 allocates the options.bytes-long input buffer for one benchmark run.
+_setup_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	assert(options != nil)
+	options.input = make([]u8, options.bytes, allocator)
+	return nil if len(options.input) == options.bytes else .Allocation_Error
+}
+
+// _teardown_poly1305 frees options.input with the allocator that _setup_poly1305 was given.
+_teardown_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	assert(options != nil)
+	delete(options.input, allocator)
+	return nil
+}
+
+// _benchmark_poly1305 computes poly1305.sum over options.input options.rounds times.
+_benchmark_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	buf := options.input
+	key := [poly1305.KEY_SIZE]byte{
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+	}
+	tag: [poly1305.TAG_SIZE]byte = ---
+	for _ in 0..<options.rounds { // half-open: exactly options.rounds iterations, matching the counts below
+		poly1305.sum(tag[:], buf, key[:])
+	}
+	options.count     = options.rounds
+	options.processed = options.rounds * options.bytes
+	//options.hash      = u128(h)
+	return nil
+}
+
+// benchmark_print writes a two-line summary of a benchmark run: rounds,
+// bytes processed and elapsed nanoseconds, then rounds/s and MiB/s.
+benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
+	elapsed_ns := time.duration_nanoseconds(options.duration)
+	// The argument order below follows the verbs in the format string.
+	fmt.printf("\t[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n",
+		name, options.rounds, options.processed, elapsed_ns,
+		options.rounds_per_second, options.megabytes_per_second,
+	)
+}
+
+// bench_poly1305 runs the Poly1305 benchmark for 64-byte and then 1024-byte inputs and prints both results.
+bench_poly1305 :: proc(t: ^testing.T) {
+	name    := "Poly1305 64 zero bytes"
+	options := &time.Benchmark_Options{
+		rounds   = 1_000,
+		bytes    = 64,
+		setup    = _setup_poly1305,
+		bench    = _benchmark_poly1305,
+		teardown = _teardown_poly1305,
+	}
+	err  := time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+
+	name = "Poly1305 1024 zero bytes"
+	options.bytes = 1024 // same options struct reused; only the input size changes
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+}
+
 bench_x25519 :: proc(t: ^testing.T) {
 	point := _decode_hex32("deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef")
 	scalar := _decode_hex32("cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe")

From 4647081f4953ad22810b3ff120d4c499f4240701 Mon Sep 17 00:00:00 2001
From: Yawning Angel <yawning@schwanenlied.me>
Date: Mon, 8 Nov 2021 04:47:42 +0000
Subject: [PATCH 05/10] core/crypto/poly1305: Triple performance on amd64 with
 -o:speed

---
 core/crypto/_fiat/field_poly1305/field.odin | 47 +++++++++++++++------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/core/crypto/_fiat/field_poly1305/field.odin b/core/crypto/_fiat/field_poly1305/field.odin
index 642949b02..bfb7cf1f9 100644
--- a/core/crypto/_fiat/field_poly1305/field.odin
+++ b/core/crypto/_fiat/field_poly1305/field.odin
@@ -11,25 +11,46 @@ fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element)
 	return transmute(^Tight_Field_Element)(arg1)
 }
 
-fe_from_bytes :: proc (out1: ^Tight_Field_Element, arg1: []byte, arg2: byte, sanitize: bool = true) {
-	// fiat-crypto's deserialization routine wants 256-bits of input, but
-	// r/s are 128-bits long, and block processing works on 128-bits plus a
-	// final bit.
+fe_from_bytes :: #force_inline proc (out1: ^Tight_Field_Element, arg1: []byte, arg2: byte, sanitize: bool = true) {
+	// fiat-crypto's deserialization routine effectively processes a
+	// single byte at a time, and wants 256-bits of input for a value
+	// that will be 128-bits or 129-bits.
 	//
-	// This is more ergonomic, and while the copy is unfortunate, this avoids
-	// having to alter the fiat-crypto derived code.
+	// This is somewhat cumbersome to use, so at a minimum a wrapper
+	// makes implementing the actual MAC block processing considerably
+	// neater.
 
 	assert(len(arg1) == 16)
 
-	tmp: [32]byte
-	copy_slice(tmp[0:16], arg1[:])
-	tmp[16] = arg2
+	when ODIN_ARCH == "386" || ODIN_ARCH == "amd64" {
+		// While it may be unwise to do deserialization here on our
+		// own when fiat-crypto provides equivalent functionality,
+		// doing it this way provides a little under 3x performance
+		// improvement when optimization is enabled.
+		src_p := transmute(^[2]u64)(&arg1[0])
+		lo := src_p[0]
+		hi := src_p[1]
 
-	_fe_from_bytes(out1, &tmp)
+		// This is inspired by poly1305-donna, though adjustments were
+		// made since a Tight_Field_Element's limbs are 44-bits, 43-bits,
+		// and 43-bits wide.
+		//
+		// Note: This could be transplanted into fe_from_u64s, but that
+		// code is called once per MAC, and is not on the critical path.
+		hibit := u64(arg2) << 41 // arg2 << 128
+		out1[0] = lo & 0xfffffffffff
+		out1[1] = ((lo >> 44) | (hi << 20)) & 0x7ffffffffff
+		out1[2] = ((hi >> 23) & 0x7ffffffffff) | hibit
+	} else {
+		tmp: [32]byte
+		copy_slice(tmp[0:16], arg1[:])
+		tmp[16] = arg2
 
-	// Need to sanitize the temporary buffer when deserializing `s`.
-	if sanitize {
-		mem.zero_explicit(&tmp, size_of(tmp))
+		_fe_from_bytes(out1, &tmp)
+		if sanitize {
+			// This is used to deserialize `s` which is confidential.
+			mem.zero_explicit(&tmp, size_of(tmp))
+		}
 	}
 }
 

From 7bed3176360df79d88e1e6022e62b985d3648579 Mon Sep 17 00:00:00 2001
From: Yawning Angel <yawning@schwanenlied.me>
Date: Sat, 6 Nov 2021 23:15:50 +0000
Subject: [PATCH 06/10] core/crypto: Add chacha20

This package implements the ChaCha20 stream cipher as specified in
RFC 8439, and the somewhat non-standard XChaCha20 variant that supports
a 192-bit nonce.

While an IETF draft for XChaCha20 standardization exists,
implementations that pre-date the draft use a 64-bit counter, instead of
the IETF-style 32-bit one.  This implementation opts for the former as
compatibility with libsodium is more important than compatibility with
an expired IETF draft.
---
 core/crypto/chacha20/chacha20.odin            | 581 ++++++++++++++++++
 tests/core/crypto/test_core_crypto.odin       |   1 +
 .../core/crypto/test_core_crypto_modern.odin  | 150 ++++-
 3 files changed, 728 insertions(+), 4 deletions(-)
 create mode 100644 core/crypto/chacha20/chacha20.odin

diff --git a/core/crypto/chacha20/chacha20.odin b/core/crypto/chacha20/chacha20.odin
new file mode 100644
index 000000000..f6f551692
--- /dev/null
+++ b/core/crypto/chacha20/chacha20.odin
@@ -0,0 +1,581 @@
+package chacha20
+
+import "core:crypto/util"
+import "core:math/bits"
+import "core:mem"
+
+KEY_SIZE :: 32
+NONCE_SIZE :: 12
+XNONCE_SIZE :: 24
+
+_MAX_CTR_IETF :: 0xffffffff
+
+_BLOCK_SIZE :: 64
+_STATE_SIZE_U32 :: 16
+_ROUNDS :: 20
+
+_SIGMA_0 : u32 : 0x61707865
+_SIGMA_1 : u32 : 0x3320646e
+_SIGMA_2 : u32 : 0x79622d32
+_SIGMA_3 : u32 : 0x6b206574
+
+Context :: struct { // (X)ChaCha20 cipher state; set up by init, wiped by reset.
+	_s: [_STATE_SIZE_U32]u32, // State words: sigma in [0..3], key in [4..11], counter and nonce in [12..15].
+
+	_buffer: [_BLOCK_SIZE]byte, // One block of buffered keystream, used to serve partial-block requests.
+	_off: int, // Offset of the next unused byte in _buffer; _BLOCK_SIZE means the buffer is empty.
+
+	_is_ietf_flavor: bool, // True for the 12-byte nonce variant (32-bit counter), false for XChaCha20.
+	_is_initialized: bool, // Set by init, cleared by reset; asserted by seek, xor_bytes and keystream_bytes.
+}
+
+init :: proc (ctx: ^Context, key, nonce: []byte) { // Keys ctx for ChaCha20 (12-byte nonce) or XChaCha20 (24-byte nonce); panics on any other size.
+	if len(key) != KEY_SIZE {
+		panic("crypto/chacha20: invalid ChaCha20 key size")
+	}
+	if n_len := len(nonce); n_len != NONCE_SIZE && n_len != XNONCE_SIZE {
+		panic("crypto/chacha20: invalid (X)ChaCha20 nonce size")
+	}
+
+	k, n := key, nonce
+
+	// Derive the XChaCha20 subkey and sub-nonce via HChaCha20.
+	is_xchacha := len(nonce) == XNONCE_SIZE
+	if is_xchacha {
+		sub_key := ctx._buffer[:KEY_SIZE] // Borrow the keystream buffer as scratch space.
+		_hchacha20(sub_key, k, n) // Reads the key and nonce[0:16].
+		k = sub_key
+		n = n[16:24] // The remaining 8 nonce bytes are loaded into the state below.
+	}
+
+	ctx._s[0] = _SIGMA_0
+	ctx._s[1] = _SIGMA_1
+	ctx._s[2] = _SIGMA_2
+	ctx._s[3] = _SIGMA_3
+	ctx._s[4] = util.U32_LE(k[0:4])
+	ctx._s[5] = util.U32_LE(k[4:8])
+	ctx._s[6] = util.U32_LE(k[8:12])
+	ctx._s[7] = util.U32_LE(k[12:16])
+	ctx._s[8] = util.U32_LE(k[16:20])
+	ctx._s[9] = util.U32_LE(k[20:24])
+	ctx._s[10] = util.U32_LE(k[24:28])
+	ctx._s[11] = util.U32_LE(k[28:32])
+	ctx._s[12] = 0 // The block counter starts at 0; use seek to change it.
+	if !is_xchacha {
+		ctx._s[13] = util.U32_LE(n[0:4])
+		ctx._s[14] = util.U32_LE(n[4:8])
+		ctx._s[15] = util.U32_LE(n[8:12])
+	} else {
+		ctx._s[13] = 0 // High word of the 64-bit block counter.
+		ctx._s[14] = util.U32_LE(n[0:4])
+		ctx._s[15] = util.U32_LE(n[4:8])
+
+		// The sub-key is stored in the keystream buffer.  While
+		// this will be overwritten in most circumstances, explicitly
+		// clear it out early.
+		mem.zero_explicit(&ctx._buffer, KEY_SIZE)
+	}
+
+	ctx._off = _BLOCK_SIZE // No keystream is buffered yet.
+	ctx._is_ietf_flavor = !is_xchacha
+	ctx._is_initialized = true
+}
+
+seek :: proc (ctx: ^Context, block_nr: u64) { // Sets the block counter to block_nr and drops any buffered keystream.
+	assert(ctx._is_initialized)
+
+	if ctx._is_ietf_flavor {
+		if block_nr > _MAX_CTR_IETF { // The IETF flavor only has a 32-bit counter word.
+			panic("crypto/chacha20: attempted to seek past maximum counter")
+		}
+	} else {
+		ctx._s[13] = u32(block_nr >> 32) // High half of the 64-bit counter.
+	}
+	ctx._s[12] = u32(block_nr) // Low 32 bits of the counter, in both flavors.
+	ctx._off = _BLOCK_SIZE // Mark the keystream buffer as empty.
+}
+
+xor_bytes :: proc (ctx: ^Context, dst, src: []byte) { // XORs src with keystream into dst, advancing the stream position.
+	assert(ctx._is_initialized)
+
+	// TODO: Enforcing that dst and src alias exactly or not at all
+	// is a good idea, though odd aliasing should be extremely uncommon.
+
+	src, dst := src, dst
+	if dst_len := len(dst); dst_len < len(src) {
+		src = src[:dst_len] // Only as many bytes as fit in dst are processed.
+	}
+
+	for remaining := len(src); remaining > 0; {
+		// Process multiple blocks at once
+		if ctx._off == _BLOCK_SIZE { // The keystream buffer is empty.
+			if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
+				direct_bytes := nr_blocks * _BLOCK_SIZE
+				_do_blocks(ctx, dst, src, nr_blocks) // Whole blocks bypass the buffer.
+				remaining -= direct_bytes
+				if remaining == 0 {
+					return
+				}
+				dst = dst[direct_bytes:]
+				src = src[direct_bytes:]
+			}
+
+			// If there is a partial block, generate and buffer 1 block
+			// worth of keystream.
+			_do_blocks(ctx, ctx._buffer[:], nil, 1) // A nil src yields raw keystream.
+			ctx._off = 0
+		}
+
+		// Process partial blocks from the buffered keystream.
+		to_xor := min(_BLOCK_SIZE - ctx._off, remaining)
+		buffered_keystream := ctx._buffer[ctx._off:]
+		for i := 0; i < to_xor; i = i + 1 {
+			dst[i] = buffered_keystream[i] ~ src[i]
+		}
+		ctx._off += to_xor // Unused buffered bytes are kept for the next call.
+		dst = dst[to_xor:]
+		src = src[to_xor:]
+		remaining -= to_xor
+	}
+}
+
+keystream_bytes :: proc (ctx: ^Context, dst: []byte) { // Fills dst with raw keystream, advancing the stream position.
+	assert(ctx._is_initialized)
+
+	dst := dst
+	for remaining := len(dst); remaining > 0; {
+		// Process multiple blocks at once
+		if ctx._off == _BLOCK_SIZE { // The keystream buffer is empty.
+			if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
+				direct_bytes := nr_blocks * _BLOCK_SIZE
+				_do_blocks(ctx, dst, nil, nr_blocks) // Whole blocks are written straight into dst.
+				remaining -= direct_bytes
+				if remaining == 0 {
+					return
+				}
+				dst = dst[direct_bytes:]
+			}
+
+			// If there is a partial block, generate and buffer 1 block
+			// worth of keystream.
+			_do_blocks(ctx, ctx._buffer[:], nil, 1)
+			ctx._off = 0
+		}
+
+		// Process partial blocks from the buffered keystream.
+		to_copy := min(_BLOCK_SIZE - ctx._off, remaining)
+		buffered_keystream := ctx._buffer[ctx._off:]
+		copy(dst[:to_copy], buffered_keystream[:to_copy])
+		ctx._off += to_copy // Unused buffered bytes are kept for the next call.
+		dst = dst[to_copy:]
+		remaining -= to_copy
+	}
+}
+
+reset :: proc (ctx: ^Context) { // Wipes the cipher state and buffered keystream; ctx needs init before reuse.
+	mem.zero_explicit(&ctx._s, size_of(ctx._s)) // The state words include the key.
+	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
+
+	ctx._is_initialized = false // Makes the asserts in seek/xor_bytes/keystream_bytes fail until init is called.
+}
+
+_do_blocks :: proc (ctx: ^Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per nonce.
+	//
+	// While all modern "standard" definitions of ChaCha20 use
+	// the IETF 32-bit counter, for XChaCha20 most common
+	// implementations allow for a 64-bit counter.
+	//
+	// Honestly, the answer here is "use an MRAE primitive", but
+	// go with common practice in the case of XChaCha20.
+	if ctx._is_ietf_flavor {
+		if u64(ctx._s[12]) + u64(nr_blocks) > 0xffffffff {
+			panic("crypto/chacha20: maximum ChaCha20 keystream per nonce reached")
+		}
+	} else {
+		ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12])
+		if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 {
+			panic("crypto/chacha20: maximum XChaCha20 keystream per nonce reached")
+		}
+	}
+
+	dst, src := dst, src
+	x := &ctx._s
+	for n := 0; n < nr_blocks; n = n + 1 {
+		x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+		for i := _ROUNDS; i > 0; i = i - 2 {
+			// Even when forcing inlining, manually inlining all of
+			// these is decently faster.
+
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ~= x0
+			x12 = util.ROTL32(x12, 16)
+			x8 += x12
+			x4 ~= x8
+			x4 = util.ROTL32(x4, 12)
+			x0 += x4
+			x12 ~= x0
+			x12 = util.ROTL32(x12, 8)
+			x8 += x12
+			x4 ~= x8
+			x4 = util.ROTL32(x4, 7)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ~= x1
+			x13 = util.ROTL32(x13, 16)
+			x9 += x13
+			x5 ~= x9
+			x5 = util.ROTL32(x5, 12)
+			x1 += x5
+			x13 ~= x1
+			x13 = util.ROTL32(x13, 8)
+			x9 += x13
+			x5 ~= x9
+			x5 = util.ROTL32(x5, 7)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ~= x2
+			x14 = util.ROTL32(x14, 16)
+			x10 += x14
+			x6 ~= x10
+			x6 = util.ROTL32(x6, 12)
+			x2 += x6
+			x14 ~= x2
+			x14 = util.ROTL32(x14, 8)
+			x10 += x14
+			x6 ~= x10
+			x6 = util.ROTL32(x6, 7)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ~= x3
+			x15 = util.ROTL32(x15, 16)
+			x11 += x15
+			x7 ~= x11
+			x7 = util.ROTL32(x7, 12)
+			x3 += x7
+			x15 ~= x3
+			x15 = util.ROTL32(x15, 8)
+			x11 += x15
+			x7 ~= x11
+			x7 = util.ROTL32(x7, 7)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ~= x0
+			x15 = util.ROTL32(x15, 16)
+			x10 += x15
+			x5 ~= x10
+			x5 = util.ROTL32(x5, 12)
+			x0 += x5
+			x15 ~= x0
+			x15 = util.ROTL32(x15, 8)
+			x10 += x15
+			x5 ~= x10
+			x5 = util.ROTL32(x5, 7)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ~= x1
+			x12 = util.ROTL32(x12, 16)
+			x11 += x12
+			x6 ~= x11
+			x6 = util.ROTL32(x6, 12)
+			x1 += x6
+			x12 ~= x1
+			x12 = util.ROTL32(x12, 8)
+			x11 += x12
+			x6 ~= x11
+			x6 = util.ROTL32(x6, 7)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+			x13 ~= x2
+			x13 = util.ROTL32(x13, 16)
+			x8 += x13
+			x7 ~= x8
+			x7 = util.ROTL32(x7, 12)
+			x2 += x7
+			x13 ~= x2
+			x13 = util.ROTL32(x13, 8)
+			x8 += x13
+			x7 ~= x8
+			x7 = util.ROTL32(x7, 7)
+
+			// quarterround(x, 3, 4, 9, 14)
+			x3 += x4
+			x14 ~= x3
+			x14 = util.ROTL32(x14, 16)
+			x9 += x14
+			x4 ~= x9
+			x4 = util.ROTL32(x4, 12)
+			x3 += x4
+			x14 ~= x3
+			x14 = util.ROTL32(x14, 8)
+			x9 += x14
+			x4 ~= x9
+			x4 = util.ROTL32(x4, 7)
+		}
+
+		x0 += _SIGMA_0
+		x1 += _SIGMA_1
+		x2 += _SIGMA_2
+		x3 += _SIGMA_3
+		x4 += x[4]
+		x5 += x[5]
+		x6 += x[6]
+		x7 += x[7]
+		x8 += x[8]
+		x9 += x[9]
+		x10 += x[10]
+		x11 += x[11]
+		x12 += x[12]
+		x13 += x[13]
+		x14 += x[14]
+		x15 += x[15]
+
+		// While the "correct" answer to getting more performance out of
+		// this is "use vector operations", support for that is currently
+		// a work in progress/to be designed.
+		//
+		// Until dedicated assembly can be written leverage the fact that
+		// the callers of this routine ensure that src/dst are valid.
+
+		when ODIN_ARCH == "386" || ODIN_ARCH == "amd64" {
+			// util.PUT_U32_LE/util.U32_LE are not required on little-endian
+			// systems that also happen to not be strict about aligned
+			// memory access.
+
+			dst_p := transmute(^[16]u32)(&dst[0])
+			if src != nil {
+				src_p := transmute(^[16]u32)(&src[0])
+				dst_p[0] = src_p[0] ~ x0
+				dst_p[1] = src_p[1] ~ x1
+				dst_p[2] = src_p[2] ~ x2
+				dst_p[3] = src_p[3] ~ x3
+				dst_p[4] = src_p[4] ~ x4
+				dst_p[5] = src_p[5] ~ x5
+				dst_p[6] = src_p[6] ~ x6
+				dst_p[7] = src_p[7] ~ x7
+				dst_p[8] = src_p[8] ~ x8
+				dst_p[9] = src_p[9] ~ x9
+				dst_p[10] = src_p[10] ~ x10
+				dst_p[11] = src_p[11] ~ x11
+				dst_p[12] = src_p[12] ~ x12
+				dst_p[13] = src_p[13] ~ x13
+				dst_p[14] = src_p[14] ~ x14
+				dst_p[15] = src_p[15] ~ x15
+				src = src[_BLOCK_SIZE:]
+			} else {
+				dst_p[0] = x0
+				dst_p[1] = x1
+				dst_p[2] = x2
+				dst_p[3] = x3
+				dst_p[4] = x4
+				dst_p[5] = x5
+				dst_p[6] = x6
+				dst_p[7] = x7
+				dst_p[8] = x8
+				dst_p[9] = x9
+				dst_p[10] = x10
+				dst_p[11] = x11
+				dst_p[12] = x12
+				dst_p[13] = x13
+				dst_p[14] = x14
+				dst_p[15] = x15
+			}
+			dst = dst[_BLOCK_SIZE:]
+		} else {
+			#no_bounds_check {
+				if src != nil {
+					util.PUT_U32_LE(dst[0:4], util.U32_LE(src[0:4]) ~ x0)
+					util.PUT_U32_LE(dst[4:8], util.U32_LE(src[4:8]) ~ x1)
+					util.PUT_U32_LE(dst[8:12], util.U32_LE(src[8:12]) ~ x2)
+					util.PUT_U32_LE(dst[12:16], util.U32_LE(src[12:16]) ~ x3)
+					util.PUT_U32_LE(dst[16:20], util.U32_LE(src[16:20]) ~ x4)
+					util.PUT_U32_LE(dst[20:24], util.U32_LE(src[20:24]) ~ x5)
+					util.PUT_U32_LE(dst[24:28], util.U32_LE(src[24:28]) ~ x6)
+					util.PUT_U32_LE(dst[28:32], util.U32_LE(src[28:32]) ~ x7)
+					util.PUT_U32_LE(dst[32:36], util.U32_LE(src[32:36]) ~ x8)
+					util.PUT_U32_LE(dst[36:40], util.U32_LE(src[36:40]) ~ x9)
+					util.PUT_U32_LE(dst[40:44], util.U32_LE(src[40:44]) ~ x10)
+					util.PUT_U32_LE(dst[44:48], util.U32_LE(src[44:48]) ~ x11)
+					util.PUT_U32_LE(dst[48:52], util.U32_LE(src[48:52]) ~ x12)
+					util.PUT_U32_LE(dst[52:56], util.U32_LE(src[52:56]) ~ x13)
+					util.PUT_U32_LE(dst[56:60], util.U32_LE(src[56:60]) ~ x14)
+					util.PUT_U32_LE(dst[60:64], util.U32_LE(src[60:64]) ~ x15)
+					src = src[_BLOCK_SIZE:]
+				} else {
+					util.PUT_U32_LE(dst[0:4], x0)
+					util.PUT_U32_LE(dst[4:8], x1)
+					util.PUT_U32_LE(dst[8:12], x2)
+					util.PUT_U32_LE(dst[12:16], x3)
+					util.PUT_U32_LE(dst[16:20], x4)
+					util.PUT_U32_LE(dst[20:24], x5)
+					util.PUT_U32_LE(dst[24:28], x6)
+					util.PUT_U32_LE(dst[28:32], x7)
+					util.PUT_U32_LE(dst[32:36], x8)
+					util.PUT_U32_LE(dst[36:40], x9)
+					util.PUT_U32_LE(dst[40:44], x10)
+					util.PUT_U32_LE(dst[44:48], x11)
+					util.PUT_U32_LE(dst[48:52], x12)
+					util.PUT_U32_LE(dst[52:56], x13)
+					util.PUT_U32_LE(dst[56:60], x14)
+					util.PUT_U32_LE(dst[60:64], x15)
+				}
+				dst = dst[_BLOCK_SIZE:]
+			}
+		}
+
+		// Increment the counter.  Overflow checking is done upon
+		// entry into the routine, so a 64-bit increment safely
+		// covers both cases.
+		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
+		x[12] = u32(new_ctr)
+		x[13] = u32(new_ctr >> 32)
+	}
+}
+
+_hchacha20 :: proc (dst, key, nonce: []byte) {
+	x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
+	x4 := util.U32_LE(key[0:4])
+	x5 := util.U32_LE(key[4:8])
+	x6 := util.U32_LE(key[8:12])
+	x7 := util.U32_LE(key[12:16])
+	x8 := util.U32_LE(key[16:20])
+	x9 := util.U32_LE(key[20:24])
+	x10 := util.U32_LE(key[24:28])
+	x11 := util.U32_LE(key[28:32])
+	x12 := util.U32_LE(nonce[0:4])
+	x13 := util.U32_LE(nonce[4:8])
+	x14 := util.U32_LE(nonce[8:12])
+	x15 := util.U32_LE(nonce[12:16])
+
+	for i := _ROUNDS; i > 0; i = i - 2 {
+		// quarterround(x, 0, 4, 8, 12)
+		x0 += x4
+		x12 ~= x0
+		x12 = util.ROTL32(x12, 16)
+		x8 += x12
+		x4 ~= x8
+		x4 = util.ROTL32(x4, 12)
+		x0 += x4
+		x12 ~= x0
+		x12 = util.ROTL32(x12, 8)
+		x8 += x12
+		x4 ~= x8
+		x4 = util.ROTL32(x4, 7)
+
+		// quarterround(x, 1, 5, 9, 13)
+		x1 += x5
+		x13 ~= x1
+		x13 = util.ROTL32(x13, 16)
+		x9 += x13
+		x5 ~= x9
+		x5 = util.ROTL32(x5, 12)
+		x1 += x5
+		x13 ~= x1
+		x13 = util.ROTL32(x13, 8)
+		x9 += x13
+		x5 ~= x9
+		x5 = util.ROTL32(x5, 7)
+
+		// quarterround(x, 2, 6, 10, 14)
+		x2 += x6
+		x14 ~= x2
+		x14 = util.ROTL32(x14, 16)
+		x10 += x14
+		x6 ~= x10
+		x6 = util.ROTL32(x6, 12)
+		x2 += x6
+		x14 ~= x2
+		x14 = util.ROTL32(x14, 8)
+		x10 += x14
+		x6 ~= x10
+		x6 = util.ROTL32(x6, 7)
+
+		// quarterround(x, 3, 7, 11, 15)
+		x3 += x7
+		x15 ~= x3
+		x15 = util.ROTL32(x15, 16)
+		x11 += x15
+		x7 ~= x11
+		x7 = util.ROTL32(x7, 12)
+		x3 += x7
+		x15 ~= x3
+		x15 = util.ROTL32(x15, 8)
+		x11 += x15
+		x7 ~= x11
+		x7 = util.ROTL32(x7, 7)
+
+		// quarterround(x, 0, 5, 10, 15)
+		x0 += x5
+		x15 ~= x0
+		x15 = util.ROTL32(x15, 16)
+		x10 += x15
+		x5 ~= x10
+		x5 = util.ROTL32(x5, 12)
+		x0 += x5
+		x15 ~= x0
+		x15 = util.ROTL32(x15, 8)
+		x10 += x15
+		x5 ~= x10
+		x5 = util.ROTL32(x5, 7)
+
+		// quarterround(x, 1, 6, 11, 12)
+		x1 += x6
+		x12 ~= x1
+		x12 = util.ROTL32(x12, 16)
+		x11 += x12
+		x6 ~= x11
+		x6 = util.ROTL32(x6, 12)
+		x1 += x6
+		x12 ~= x1
+		x12 = util.ROTL32(x12, 8)
+		x11 += x12
+		x6 ~= x11
+		x6 = util.ROTL32(x6, 7)
+
+		// quarterround(x, 2, 7, 8, 13)
+		x2 += x7
+		x13 ~= x2
+		x13 = util.ROTL32(x13, 16)
+		x8 += x13
+		x7 ~= x8
+		x7 = util.ROTL32(x7, 12)
+		x2 += x7
+		x13 ~= x2
+		x13 = util.ROTL32(x13, 8)
+		x8 += x13
+		x7 ~= x8
+		x7 = util.ROTL32(x7, 7)
+
+		// quarterround(x, 3, 4, 9, 14)
+		x3 += x4
+		x14 ~= x3
+		x14 = util.ROTL32(x14, 16)
+		x9 += x14
+		x4 ~= x9
+		x4 = util.ROTL32(x4, 12)
+		x3 += x4
+		x14 ~= x3
+		x14 = util.ROTL32(x14, 8)
+		x9 += x14
+		x4 ~= x9
+		x4 = util.ROTL32(x4, 7)
+	}
+
+	util.PUT_U32_LE(dst[0:4], x0)
+	util.PUT_U32_LE(dst[4:8], x1)
+	util.PUT_U32_LE(dst[8:12], x2)
+	util.PUT_U32_LE(dst[12:16], x3)
+	util.PUT_U32_LE(dst[16:20], x12)
+	util.PUT_U32_LE(dst[20:24], x13)
+	util.PUT_U32_LE(dst[24:28], x14)
+	util.PUT_U32_LE(dst[28:32], x15)
+}
diff --git a/tests/core/crypto/test_core_crypto.odin b/tests/core/crypto/test_core_crypto.odin
index c27b5c6bc..b73a191ad 100644
--- a/tests/core/crypto/test_core_crypto.odin
+++ b/tests/core/crypto/test_core_crypto.odin
@@ -116,6 +116,7 @@ main :: proc() {
     test_haval_256(&t)
 
     // "modern" crypto tests
+    test_chacha20(&t)
     test_poly1305(&t)
     test_x25519(&t)
 
diff --git a/tests/core/crypto/test_core_crypto_modern.odin b/tests/core/crypto/test_core_crypto_modern.odin
index f4f07928e..45ec8b339 100644
--- a/tests/core/crypto/test_core_crypto_modern.odin
+++ b/tests/core/crypto/test_core_crypto_modern.odin
@@ -2,8 +2,10 @@ package test_core_crypto
 
 import "core:testing"
 import "core:fmt"
+import "core:mem"
 import "core:time"
 
+import "core:crypto/chacha20"
 import "core:crypto/poly1305"
 import "core:crypto/x25519"
 
@@ -28,6 +30,94 @@ _decode_hex32 :: proc(s: string) -> [32]byte{
 	return b
 }
 
+@(test)
+test_chacha20 :: proc(t: ^testing.T) {
+	log(t, "Testing (X)ChaCha20")
+
+	// Test cases taken from RFC 8439, and draft-irtf-cfrg-xchacha-03
+	plaintext_str := "Ladies and Gentlemen of the class of '99: If I could offer you only one tip for the future, sunscreen would be it."
+	plaintext := transmute([]byte)(plaintext_str)
+
+	key := [chacha20.KEY_SIZE]byte{
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+	}
+
+	nonce := [chacha20.NONCE_SIZE]byte{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4a,
+		0x00, 0x00, 0x00, 0x00,
+	}
+
+	ciphertext := [114]byte{
+		0x6e, 0x2e, 0x35, 0x9a, 0x25, 0x68, 0xf9, 0x80,
+		0x41, 0xba, 0x07, 0x28, 0xdd, 0x0d, 0x69, 0x81,
+		0xe9, 0x7e, 0x7a, 0xec, 0x1d, 0x43, 0x60, 0xc2,
+		0x0a, 0x27, 0xaf, 0xcc, 0xfd, 0x9f, 0xae, 0x0b,
+		0xf9, 0x1b, 0x65, 0xc5, 0x52, 0x47, 0x33, 0xab,
+		0x8f, 0x59, 0x3d, 0xab, 0xcd, 0x62, 0xb3, 0x57,
+		0x16, 0x39, 0xd6, 0x24, 0xe6, 0x51, 0x52, 0xab,
+		0x8f, 0x53, 0x0c, 0x35, 0x9f, 0x08, 0x61, 0xd8,
+		0x07, 0xca, 0x0d, 0xbf, 0x50, 0x0d, 0x6a, 0x61,
+		0x56, 0xa3, 0x8e, 0x08, 0x8a, 0x22, 0xb6, 0x5e,
+		0x52, 0xbc, 0x51, 0x4d, 0x16, 0xcc, 0xf8, 0x06,
+		0x81, 0x8c, 0xe9, 0x1a, 0xb7, 0x79, 0x37, 0x36,
+		0x5a, 0xf9, 0x0b, 0xbf, 0x74, 0xa3, 0x5b, 0xe6,
+		0xb4, 0x0b, 0x8e, 0xed, 0xf2, 0x78, 0x5e, 0x42,
+		0x87, 0x4d,
+	}
+	ciphertext_str := hex_string(ciphertext[:])
+
+	derived_ciphertext: [114]byte
+	ctx: chacha20.Context = ---
+	chacha20.init(&ctx, key[:], nonce[:])
+	chacha20.seek(&ctx, 1) // The test vectors start the counter at 1.
+	chacha20.xor_bytes(&ctx, derived_ciphertext[:], plaintext[:])
+
+	derived_ciphertext_str := hex_string(derived_ciphertext[:])
+	expect(t, derived_ciphertext_str == ciphertext_str, fmt.tprintf("Expected %s for xor_bytes(plaintext_str), but got %s instead", ciphertext_str, derived_ciphertext_str))
+
+	xkey := [chacha20.KEY_SIZE]byte{
+		0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+		0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+		0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+	}
+
+	xnonce := [chacha20.XNONCE_SIZE]byte{
+		0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+		0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+		0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+	}
+
+	xciphertext := [114]byte{
+		0xbd, 0x6d, 0x17, 0x9d, 0x3e, 0x83, 0xd4, 0x3b,
+		0x95, 0x76, 0x57, 0x94, 0x93, 0xc0, 0xe9, 0x39,
+		0x57, 0x2a, 0x17, 0x00, 0x25, 0x2b, 0xfa, 0xcc,
+		0xbe, 0xd2, 0x90, 0x2c, 0x21, 0x39, 0x6c, 0xbb,
+		0x73, 0x1c, 0x7f, 0x1b, 0x0b, 0x4a, 0xa6, 0x44,
+		0x0b, 0xf3, 0xa8, 0x2f, 0x4e, 0xda, 0x7e, 0x39,
+		0xae, 0x64, 0xc6, 0x70, 0x8c, 0x54, 0xc2, 0x16,
+		0xcb, 0x96, 0xb7, 0x2e, 0x12, 0x13, 0xb4, 0x52,
+		0x2f, 0x8c, 0x9b, 0xa4, 0x0d, 0xb5, 0xd9, 0x45,
+		0xb1, 0x1b, 0x69, 0xb9, 0x82, 0xc1, 0xbb, 0x9e,
+		0x3f, 0x3f, 0xac, 0x2b, 0xc3, 0x69, 0x48, 0x8f,
+		0x76, 0xb2, 0x38, 0x35, 0x65, 0xd3, 0xff, 0xf9,
+		0x21, 0xf9, 0x66, 0x4c, 0x97, 0x63, 0x7d, 0xa9,
+		0x76, 0x88, 0x12, 0xf6, 0x15, 0xc6, 0x8b, 0x13,
+		0xb5, 0x2e,
+	}
+	xciphertext_str := hex_string(xciphertext[:])
+
+	chacha20.init(&ctx, xkey[:], xnonce[:])
+	chacha20.seek(&ctx, 1)
+	chacha20.xor_bytes(&ctx, derived_ciphertext[:], plaintext[:])
+
+	derived_ciphertext_str = hex_string(derived_ciphertext[:])
+	expect(t, derived_ciphertext_str == xciphertext_str, fmt.tprintf("Expected %s for xor_bytes(plaintext_str), but got %s instead", xciphertext_str, derived_ciphertext_str))
+}
+
 @(test)
 test_poly1305 :: proc(t: ^testing.T) {
 	log(t, "Testing poly1305")
@@ -141,24 +231,49 @@ test_x25519 :: proc(t: ^testing.T) {
 bench_modern :: proc(t: ^testing.T) {
 	fmt.println("Starting benchmarks:")
 
+	bench_chacha20(t)
 	bench_poly1305(t)
 	bench_x25519(t)
 }
 
-_setup_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+_setup_sized_buf :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 	assert(options != nil)
 
 	options.input = make([]u8, options.bytes, allocator)
 	return nil if len(options.input) == options.bytes else .Allocation_Error
 }
 
-_teardown_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+_teardown_sized_buf :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 	assert(options != nil)
 
 	delete(options.input)
 	return nil
 }
 
+_benchmark_chacha20 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	buf := options.input // Encrypted in place on every round.
+	key := [chacha20.KEY_SIZE]byte{
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+	}
+	nonce := [chacha20.NONCE_SIZE]byte{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00,
+	}
+
+	ctx: chacha20.Context = --- // Left uninitialized; init sets it up before use.
+	chacha20.init(&ctx, key[:], nonce[:])
+
+	for _ in 0..<options.rounds { // Half-open range: exactly `rounds` iterations, matching the totals reported below.
+		chacha20.xor_bytes(&ctx, buf, buf)
+	}
+	options.count     = options.rounds
+	options.processed = options.rounds * options.bytes
+	return nil
+}
+
 _benchmark_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 	buf := options.input
 	key := [poly1305.KEY_SIZE]byte{
@@ -189,14 +304,41 @@ benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
 	)
 }
 
+bench_chacha20 :: proc(t: ^testing.T) {
+	name    := "ChaCha20 64 bytes"
+	options := &time.Benchmark_Options{
+		rounds   = 1_000,
+		bytes    = 64,
+		setup    = _setup_sized_buf,
+		bench    = _benchmark_chacha20,
+		teardown = _teardown_sized_buf,
+	}
+
+	err  := time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+
+	name = "ChaCha20 1024 bytes"
+	options.bytes = 1024
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+
+	name = "ChaCha20 65536 bytes"
+	options.bytes = 65536
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+}
+
 bench_poly1305 :: proc(t: ^testing.T) {
 	name    := "Poly1305 64 zero bytes"
 	options := &time.Benchmark_Options{
 		rounds   = 1_000,
 		bytes    = 64,
-		setup    = _setup_poly1305,
+		setup    = _setup_sized_buf,
 		bench    = _benchmark_poly1305,
-		teardown = _teardown_poly1305,
+		teardown = _teardown_sized_buf,
 	}
 
 	err  := time.benchmark(options, context.allocator)

From 6c4c9aef618dcf3932be88ca6df65164145b7cea Mon Sep 17 00:00:00 2001
From: Yawning Angel <yawning@schwanenlied.me>
Date: Tue, 9 Nov 2021 07:22:41 +0000
Subject: [PATCH 07/10] core/crypto: Add chacha20poly1305

This package implements the chacha20poly1305 AEAD construct as specified
in RFC 8439.
---
 .../chacha20poly1305/chacha20poly1305.odin    | 146 ++++++++++++++++++
 tests/core/crypto/test_core_crypto.odin       |   1 +
 .../core/crypto/test_core_crypto_modern.odin  | 131 +++++++++++++++-
 3 files changed, 276 insertions(+), 2 deletions(-)
 create mode 100644 core/crypto/chacha20poly1305/chacha20poly1305.odin

diff --git a/core/crypto/chacha20poly1305/chacha20poly1305.odin b/core/crypto/chacha20poly1305/chacha20poly1305.odin
new file mode 100644
index 000000000..67d89df56
--- /dev/null
+++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin
@@ -0,0 +1,146 @@
+package chacha20poly1305
+
+import "core:crypto"
+import "core:crypto/chacha20"
+import "core:crypto/poly1305"
+import "core:crypto/util"
+import "core:mem"
+
+KEY_SIZE :: chacha20.KEY_SIZE
+NONCE_SIZE :: chacha20.NONCE_SIZE
+TAG_SIZE :: poly1305.TAG_SIZE
+
+_P_MAX :: 64 * 0xffffffff // 64 * (2^32-1)
+
+_validate_common_slice_sizes :: proc (tag, key, nonce, aad, text: []byte) { // Panics on a bad tag/key/nonce size or oversized text.
+	if len(tag) != TAG_SIZE {
+		panic("crypto/chacha20poly1305: invalid destination tag size")
+	}
+	if len(key) != KEY_SIZE {
+		panic("crypto/chacha20poly1305: invalid key size")
+	}
+	if len(nonce) != NONCE_SIZE {
+		panic("crypto/chacha20poly1305: invalid nonce size")
+	}
+
+	#assert(size_of(int) == 8 || size_of(int) <= 4) // Compile-time guard: int is either 64-bit (checked below) or too small to exceed _P_MAX.
+	when size_of(int) == 8 {
+		// A_MAX = 2^64 - 1 due to the length field limit.
+		// P_MAX = 64 * (2^32 - 1) due to the IETF ChaCha20 counter limit.
+		//
+		// A_MAX is limited by size_of(int), so there is no need to
+		// enforce it. P_MAX only needs to be checked on 64-bit targets,
+		// for reasons that should be obvious.
+		if text_len := len(text); text_len > _P_MAX {
+			panic("crypto/chacha20poly1305: oversized src data")
+		}
+	}
+}
+
+_PAD: [16]byte // All-zero bytes (never written in this file) used as pad16 padding.
+_update_mac_pad16 :: #force_inline proc (ctx: ^poly1305.Context, x_len: int) { // Feeds zero padding so x_len is rounded up to a multiple of 16.
+	if pad_len := 16 - (x_len & (16-1)); pad_len != 16 { // pad_len == 16 means x_len is already a multiple of 16.
+		poly1305.update(ctx, _PAD[:pad_len])
+	}
+}
+
+encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) { // Encrypts plaintext into ciphertext and writes the MAC over aad and ciphertext into tag.
+	_validate_common_slice_sizes(tag, key, nonce, aad, plaintext)
+	if len(ciphertext) != len(plaintext) {
+		panic("crypto/chacha20poly1305: invalid destination ciphertext size")
+	}
+
+	stream_ctx: chacha20.Context = ---
+	chacha20.init(&stream_ctx, key, nonce)
+
+	// otk = poly1305_key_gen(key, nonce)
+	otk: [poly1305.KEY_SIZE]byte = ---
+	chacha20.keystream_bytes(&stream_ctx, otk[:]) // Taken from keystream block 0 (init leaves the counter at 0).
+	mac_ctx: poly1305.Context = ---
+	poly1305.init(&mac_ctx, otk[:])
+	mem.zero_explicit(&otk, size_of(otk)) // Wipe this copy of the one-time key as soon as the MAC is keyed.
+
+	aad_len, ciphertext_len := len(aad), len(ciphertext)
+
+	// There is nothing preventing aad and ciphertext from overlapping
+	// so auth the AAD before encrypting (slightly different from the
+	// RFC, since the RFC encrypts into a new buffer).
+	//
+	// mac_data = aad | pad16(aad)
+	poly1305.update(&mac_ctx, aad)
+	_update_mac_pad16(&mac_ctx, aad_len)
+
+	// ciphertext = chacha20_encrypt(key, 1, nonce, plaintext)
+	chacha20.seek(&stream_ctx, 1)
+	chacha20.xor_bytes(&stream_ctx, ciphertext, plaintext)
+	chacha20.reset(&stream_ctx) // Don't need the stream context anymore.
+
+	// mac_data |= ciphertext | pad16(ciphertext)
+	poly1305.update(&mac_ctx, ciphertext)
+	_update_mac_pad16(&mac_ctx, ciphertext_len)
+
+	// mac_data |= num_to_8_le_bytes(aad.length)
+	// mac_data |= num_to_8_le_bytes(ciphertext.length)
+	l_buf := otk[0:16] // Reuse the scratch buffer.
+	util.PUT_U64_LE(l_buf[0:8], u64(aad_len))
+	util.PUT_U64_LE(l_buf[8:16], u64(ciphertext_len))
+	poly1305.update(&mac_ctx, l_buf)
+
+	// tag = poly1305_mac(mac_data, otk)
+	poly1305.final(&mac_ctx, tag) // Implicitly sanitizes context.
+}
+
+decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool { // Returns true and fills plaintext only if tag authenticates aad and ciphertext.
+	_validate_common_slice_sizes(tag, key, nonce, aad, ciphertext)
+	if len(ciphertext) != len(plaintext) {
+		panic("crypto/chacha20poly1305: invalid destination plaintext size")
+	}
+
+	// Note: Unlike encrypt, this can fail early, so use defer for
+	// sanitization rather than assuming control flow reaches certain
+	// points where needed.
+
+	stream_ctx: chacha20.Context = ---
+	chacha20.init(&stream_ctx, key, nonce)
+
+	// otk = poly1305_key_gen(key, nonce)
+	otk: [poly1305.KEY_SIZE]byte = ---
+	chacha20.keystream_bytes(&stream_ctx, otk[:]) // Taken from keystream block 0 (init leaves the counter at 0).
+	defer chacha20.reset(&stream_ctx) // Wipes the cipher state on every return path.
+
+	mac_ctx: poly1305.Context = ---
+	poly1305.init(&mac_ctx, otk[:])
+	defer mem.zero_explicit(&otk, size_of(otk)) // otk doubles as scratch below, so it is wiped on return.
+
+	aad_len, ciphertext_len := len(aad), len(ciphertext)
+
+	// mac_data = aad | pad16(aad)
+	// mac_data |= ciphertext | pad16(ciphertext)
+	// mac_data |= num_to_8_le_bytes(aad.length)
+	// mac_data |= num_to_8_le_bytes(ciphertext.length)
+	poly1305.update(&mac_ctx, aad)
+	_update_mac_pad16(&mac_ctx, aad_len)
+	poly1305.update(&mac_ctx, ciphertext)
+	_update_mac_pad16(&mac_ctx, ciphertext_len)
+	l_buf := otk[0:16] // Reuse the scratch buffer.
+	util.PUT_U64_LE(l_buf[0:8], u64(aad_len))
+	util.PUT_U64_LE(l_buf[8:16], u64(ciphertext_len))
+	poly1305.update(&mac_ctx, l_buf)
+
+	// tag = poly1305_mac(mac_data, otk)
+	derived_tag := otk[0:poly1305.TAG_SIZE] // Reuse the scratch buffer again.
+	poly1305.final(&mac_ctx, derived_tag) // Implicitly sanitizes context.
+
+	// Validate the tag in constant time.
+	if crypto.compare_constant_time(tag, derived_tag) != 1 {
+		// Zero out the plaintext, as a defense in depth measure.
+		mem.zero_explicit(raw_data(plaintext), ciphertext_len)
+		return false
+	}
+
+	// plaintext = chacha20_decrypt(key, 1, nonce, ciphertext)
+	chacha20.seek(&stream_ctx, 1) // Payload keystream starts at block 1; block 0 supplied the one-time key.
+	chacha20.xor_bytes(&stream_ctx, plaintext, ciphertext)
+
+	return true
+}
diff --git a/tests/core/crypto/test_core_crypto.odin b/tests/core/crypto/test_core_crypto.odin
index b73a191ad..731833096 100644
--- a/tests/core/crypto/test_core_crypto.odin
+++ b/tests/core/crypto/test_core_crypto.odin
@@ -118,6 +118,7 @@ main :: proc() {
     // "modern" crypto tests
     test_chacha20(&t)
     test_poly1305(&t)
+    test_chacha20poly1305(&t)
     test_x25519(&t)
 
     bench_modern(&t)
diff --git a/tests/core/crypto/test_core_crypto_modern.odin b/tests/core/crypto/test_core_crypto_modern.odin
index 45ec8b339..b3d9e47fd 100644
--- a/tests/core/crypto/test_core_crypto_modern.odin
+++ b/tests/core/crypto/test_core_crypto_modern.odin
@@ -6,6 +6,7 @@ import "core:mem"
 import "core:time"
 
 import "core:crypto/chacha20"
+import "core:crypto/chacha20poly1305"
 import "core:crypto/poly1305"
 import "core:crypto/x25519"
 
@@ -30,13 +31,14 @@ _decode_hex32 :: proc(s: string) -> [32]byte{
 	return b
 }
 
+_PLAINTEXT_SUNSCREEN_STR := "Ladies and Gentlemen of the class of '99: If I could offer you only one tip for the future, sunscreen would be it."
+
 @(test)
 test_chacha20 :: proc(t: ^testing.T) {
 	log(t, "Testing (X)ChaCha20")
 
 	// Test cases taken from RFC 8439, and draft-irtf-cfrg-xchacha-03
-	plaintext_str := "Ladies and Gentlemen of the class of '99: If I could offer you only one tip for the future, sunscreen would be it."
-	plaintext := transmute([]byte)(plaintext_str)
+	plaintext := transmute([]byte)(_PLAINTEXT_SUNSCREEN_STR)
 
 	key := [chacha20.KEY_SIZE]byte{
 		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -182,6 +184,80 @@ test_poly1305 :: proc(t: ^testing.T) {
 	expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected %s for init/update/final - incremental, but got %s instead", tag_str, derived_tag_str))
 }
 
+@(test)
+test_chacha20poly1305 :: proc(t: ^testing.T) { // Known-answer encrypt/decrypt check, plus rejection of tampered ciphertext and AAD.
+	log(t, "Testing chacha20poly1305")
+
+	plaintext := transmute([]byte)(_PLAINTEXT_SUNSCREEN_STR)
+
+	aad := [12]byte{
+		0x50, 0x51, 0x52, 0x53, 0xc0, 0xc1, 0xc2, 0xc3,
+		0xc4, 0xc5, 0xc6, 0xc7,
+	}
+
+	key := [chacha20poly1305.KEY_SIZE]byte{
+		0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+		0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+		0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+	}
+
+	nonce := [chacha20poly1305.NONCE_SIZE]byte{
+		0x07, 0x00, 0x00, 0x00,
+		0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+	}
+
+	ciphertext := [114]byte{
+		0xd3, 0x1a, 0x8d, 0x34, 0x64, 0x8e, 0x60, 0xdb,
+		0x7b, 0x86, 0xaf, 0xbc, 0x53, 0xef, 0x7e, 0xc2,
+		0xa4, 0xad, 0xed, 0x51, 0x29, 0x6e, 0x08, 0xfe,
+		0xa9, 0xe2, 0xb5, 0xa7, 0x36, 0xee, 0x62, 0xd6,
+		0x3d, 0xbe, 0xa4, 0x5e, 0x8c, 0xa9, 0x67, 0x12,
+		0x82, 0xfa, 0xfb, 0x69, 0xda, 0x92, 0x72, 0x8b,
+		0x1a, 0x71, 0xde, 0x0a, 0x9e, 0x06, 0x0b, 0x29,
+		0x05, 0xd6, 0xa5, 0xb6, 0x7e, 0xcd, 0x3b, 0x36,
+		0x92, 0xdd, 0xbd, 0x7f, 0x2d, 0x77, 0x8b, 0x8c,
+		0x98, 0x03, 0xae, 0xe3, 0x28, 0x09, 0x1b, 0x58,
+		0xfa, 0xb3, 0x24, 0xe4, 0xfa, 0xd6, 0x75, 0x94,
+		0x55, 0x85, 0x80, 0x8b, 0x48, 0x31, 0xd7, 0xbc,
+		0x3f, 0xf4, 0xde, 0xf0, 0x8e, 0x4b, 0x7a, 0x9d,
+		0xe5, 0x76, 0xd2, 0x65, 0x86, 0xce, 0xc6, 0x4b,
+		0x61, 0x16,
+	}
+	ciphertext_str := hex_string(ciphertext[:])
+
+	tag := [chacha20poly1305.TAG_SIZE]byte{
+		0x1a, 0xe1, 0x0b, 0x59, 0x4f, 0x09, 0xe2, 0x6a,
+		0x7e, 0x90, 0x2e, 0xcb, 0xd0, 0x60, 0x06, 0x91,
+	}
+	tag_str := hex_string(tag[:])
+
+	derived_tag: [chacha20poly1305.TAG_SIZE]byte
+	derived_ciphertext: [114]byte
+
+	chacha20poly1305.encrypt(derived_ciphertext[:], derived_tag[:], key[:], nonce[:], aad[:], plaintext)
+
+	derived_ciphertext_str := hex_string(derived_ciphertext[:])
+	expect(t, derived_ciphertext_str == ciphertext_str, fmt.tprintf("Expected ciphertext %s for encrypt(aad, plaintext), but got %s instead", ciphertext_str, derived_ciphertext_str))
+
+	derived_tag_str := hex_string(derived_tag[:])
+	expect(t, derived_tag_str == tag_str, fmt.tprintf("Expected tag %s for encrypt(aad, plaintext), but got %s instead", tag_str, derived_tag_str))
+
+	derived_plaintext: [114]byte
+	ok := chacha20poly1305.decrypt(derived_plaintext[:], tag[:], key[:], nonce[:], aad[:], ciphertext[:])
+	derived_plaintext_str := string(derived_plaintext[:])
+	expect(t, ok, "Expected true for decrypt(tag, aad, ciphertext)")
+	expect(t, derived_plaintext_str == _PLAINTEXT_SUNSCREEN_STR, fmt.tprintf("Expected plaintext %s for decrypt(tag, aad, ciphertext), but got %s instead", _PLAINTEXT_SUNSCREEN_STR, derived_plaintext_str))
+
+	derived_ciphertext[0] ~= 0xa5 // Flip bits in the ciphertext; decrypt is expected to reject it.
+	ok = chacha20poly1305.decrypt(derived_plaintext[:], tag[:], key[:], nonce[:], aad[:], derived_ciphertext[:])
+	expect(t, !ok, "Expected false for decrypt(tag, aad, corrupted_ciphertext)")
+
+	aad[0] ~= 0xa5 // Flip bits in the AAD; the untouched ciphertext is expected to be rejected as well.
+	ok = chacha20poly1305.decrypt(derived_plaintext[:], tag[:], key[:], nonce[:], aad[:], ciphertext[:])
+	expect(t, !ok, "Expected false for decrypt(tag, corrupted_aad, ciphertext)")
+}
+
 TestECDH :: struct {
 	scalar:  string,
 	point:   string,
@@ -233,6 +309,7 @@ bench_modern :: proc(t: ^testing.T) {
 
 	bench_chacha20(t)
 	bench_poly1305(t)
+	bench_chacha20poly1305(t)
 	bench_x25519(t)
 }
 
@@ -293,6 +370,29 @@ _benchmark_poly1305 :: proc(options: ^time.Benchmark_Options, allocator := conte
 	return nil
 }
 
+_benchmark_chacha20poly1305 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	buf := options.input // Encrypted in place: buf is passed as both destination and source.
+	key := [chacha20.KEY_SIZE]byte{
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+	}
+	nonce := [chacha20.NONCE_SIZE]byte{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00,
+	}
+
+	tag: [chacha20poly1305.TAG_SIZE]byte = ---
+
+	for _ in 0..<options.rounds { // Exactly `rounds` iterations, matching the totals recorded below.
+		chacha20poly1305.encrypt(buf, tag[:], key[:], nonce[:], nil, buf)
+	}
+	options.count     = options.rounds
+	options.processed = options.rounds * options.bytes
+	return nil
+}
+
 benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
 	fmt.printf("\t[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n",
 		name,
@@ -352,6 +452,33 @@ bench_poly1305 :: proc(t: ^testing.T) {
 	benchmark_print(name, options)
 }
 
+bench_chacha20poly1305 :: proc(t: ^testing.T) { // Runs the chacha20poly1305 benchmark at three input sizes and prints each result.
+	name    := "chacha20poly1305 64 bytes"
+	options := &time.Benchmark_Options{
+		rounds   = 1_000,
+		bytes    = 64,
+		setup    = _setup_sized_buf,
+		bench    = _benchmark_chacha20poly1305,
+		teardown = _teardown_sized_buf,
+	}
+
+	err  := time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+
+	name = "chacha20poly1305 1024 bytes" // Second pass: same options, only the input size changes.
+	options.bytes = 1024
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+
+	name = "chacha20poly1305 65536 bytes" // Third pass: same options, only the input size changes.
+	options.bytes = 65536
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	benchmark_print(name, options)
+}
+
 bench_x25519 :: proc(t: ^testing.T) {
 	point := _decode_hex32("deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef")
 	scalar := _decode_hex32("cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe")

From 61c581baeb94ac73cbb25e93af2710d12e15f25c Mon Sep 17 00:00:00 2001
From: Yawning Angel <yawning@schwanenlied.me>
Date: Fri, 12 Nov 2021 06:22:17 +0000
Subject: [PATCH 08/10] core/sys/unix: Add syscalls_linux.odin

Linux is in the unfortunate situation where the system call number is
architecture specific.  This consolidates the system call number
definitions in a single location, adds some wrappers, and hopefully
fixes the existing non-portable invocations of the syscall intrinsic.
---
 core/mem/virtual/virtual_linux.odin   | 76 ++++++++++++---------------
 core/os/os_linux.odin                 |  5 +-
 core/sync/sync2/futex_linux.odin      |  3 +-
 core/sync/sync2/primitives_linux.odin |  5 +-
 core/sync/sync_linux.odin             |  4 +-
 core/sys/unix/syscalls_linux.odin     | 60 +++++++++++++++++++++
 6 files changed, 101 insertions(+), 52 deletions(-)
 create mode 100644 core/sys/unix/syscalls_linux.odin

diff --git a/core/mem/virtual/virtual_linux.odin b/core/mem/virtual/virtual_linux.odin
index c4dd564ee..71a56e499 100644
--- a/core/mem/virtual/virtual_linux.odin
+++ b/core/mem/virtual/virtual_linux.odin
@@ -4,64 +4,56 @@ package mem_virtual
 
 import "core:c"
 import "core:intrinsics"
+import "core:sys/unix"
 
-when ODIN_ARCH == "amd64" {
-	SYS_mmap     :: 9
-	SYS_mprotect :: 10
-	SYS_munmap   :: 11
-	SYS_madvise  :: 28
-	
-	PROT_NONE  :: 0x0
-	PROT_READ  :: 0x1
-	PROT_WRITE :: 0x2
-	PROT_EXEC  :: 0x4
-	PROT_GROWSDOWN :: 0x01000000
-	PROT_GROWSUP :: 0x02000000
+PROT_NONE  :: 0x0
+PROT_READ  :: 0x1
+PROT_WRITE :: 0x2
+PROT_EXEC  :: 0x4
+PROT_GROWSDOWN :: 0x01000000
+PROT_GROWSUP :: 0x02000000
 
-	MAP_FIXED     :: 0x1
-	MAP_PRIVATE   :: 0x2
-	MAP_SHARED    :: 0x4
-	MAP_ANONYMOUS :: 0x20
-	
-	MADV_NORMAL      :: 0
-	MADV_RANDOM      :: 1
-	MADV_SEQUENTIAL  :: 2
-	MADV_WILLNEED    :: 3
-	MADV_DONTNEED    :: 4
-	MADV_FREE        :: 8
-	MADV_REMOVE      :: 9
-	MADV_DONTFORK    :: 10
-	MADV_DOFORK      :: 11
-	MADV_MERGEABLE   :: 12
-	MADV_UNMERGEABLE :: 13
-	MADV_HUGEPAGE    :: 14
-	MADV_NOHUGEPAGE  :: 15
-	MADV_DONTDUMP    :: 16
-	MADV_DODUMP      :: 17
-	MADV_WIPEONFORK  :: 18
-	MADV_KEEPONFORK  :: 19
-	MADV_HWPOISON    :: 100
-} else {
-	#panic("Unsupported architecture")
-}
+MAP_FIXED     :: 0x1
+MAP_PRIVATE   :: 0x2
+MAP_SHARED    :: 0x4
+MAP_ANONYMOUS :: 0x20
+
+MADV_NORMAL      :: 0
+MADV_RANDOM      :: 1
+MADV_SEQUENTIAL  :: 2
+MADV_WILLNEED    :: 3
+MADV_DONTNEED    :: 4
+MADV_FREE        :: 8
+MADV_REMOVE      :: 9
+MADV_DONTFORK    :: 10
+MADV_DOFORK      :: 11
+MADV_MERGEABLE   :: 12
+MADV_UNMERGEABLE :: 13
+MADV_HUGEPAGE    :: 14
+MADV_NOHUGEPAGE  :: 15
+MADV_DONTDUMP    :: 16
+MADV_DODUMP      :: 17
+MADV_WIPEONFORK  :: 18
+MADV_KEEPONFORK  :: 19
+MADV_HWPOISON    :: 100
 
 mmap :: proc "contextless" (addr: rawptr, length: uint, prot: c.int, flags: c.int, fd: c.int, offset: uintptr) -> rawptr {
-	res := intrinsics.syscall(SYS_mmap, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flags), uintptr(fd), offset)
+	res := intrinsics.syscall(unix.SYS_mmap, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flags), uintptr(fd), offset)
 	return rawptr(res)
 }
 
 munmap :: proc "contextless" (addr: rawptr, length: uint) -> c.int {
-	res := intrinsics.syscall(SYS_munmap, uintptr(addr), uintptr(length))
+	res := intrinsics.syscall(unix.SYS_munmap, uintptr(addr), uintptr(length))
 	return c.int(res)
 }
 
 mprotect :: proc "contextless" (addr: rawptr, length: uint, prot: c.int) -> c.int {
-	res := intrinsics.syscall(SYS_mprotect, uintptr(addr), uintptr(length), uint(prot))
+	res := intrinsics.syscall(unix.SYS_mprotect, uintptr(addr), uintptr(length), uintptr(prot)) // prot is widened to uintptr like the other syscall arguments in this file.
 	return c.int(res)
 }
 
 madvise :: proc "contextless" (addr: rawptr, length: uint, advice: c.int) -> c.int {
-	res := intrinsics.syscall(SYS_madvise, uintptr(addr), uintptr(length), uintptr(advice))
+	res := intrinsics.syscall(unix.SYS_madvise, uintptr(addr), uintptr(length), uintptr(advice))
 	return c.int(res)
 }
 
diff --git a/core/os/os_linux.odin b/core/os/os_linux.odin
index bc4717b44..260a051ce 100644
--- a/core/os/os_linux.odin
+++ b/core/os/os_linux.odin
@@ -8,6 +8,7 @@ import "core:strings"
 import "core:c"
 import "core:strconv"
 import "core:intrinsics"
+import "core:sys/unix"
 
 Handle    :: distinct i32
 File_Time :: distinct u64
@@ -265,8 +266,6 @@ X_OK :: 1 // Test for execute permission
 W_OK :: 2 // Test for write permission
 R_OK :: 4 // Test for read permission
 
-SYS_GETTID :: 186
-
 foreign libc {
 	@(link_name="__errno_location") __errno_location    :: proc() -> ^int ---
 
@@ -594,7 +593,7 @@ exit :: proc "contextless" (code: int) -> ! {
 }
 
 current_thread_id :: proc "contextless" () -> int {
-	return cast(int)intrinsics.syscall(SYS_GETTID)
+	return unix.sys_gettid()
 }
 
 dlopen :: proc(filename: string, flags: int) -> rawptr {
diff --git a/core/sync/sync2/futex_linux.odin b/core/sync/sync2/futex_linux.odin
index 1bd41c7cf..fca28cace 100644
--- a/core/sync/sync2/futex_linux.odin
+++ b/core/sync/sync2/futex_linux.odin
@@ -5,6 +5,7 @@ package sync2
 import "core:c"
 import "core:time"
 import "core:intrinsics"
+import "core:sys/unix"
 
 FUTEX_WAIT :: 0
 FUTEX_WAKE :: 1
@@ -34,7 +35,7 @@ get_errno :: proc(r: int) -> int {
 }
 
 internal_futex :: proc(f: ^Futex, op: c.int, val: u32, timeout: rawptr) -> int {
-	code := int(intrinsics.syscall(202, uintptr(f), uintptr(op), uintptr(val), uintptr(timeout), 0, 0))
+	code := int(intrinsics.syscall(unix.SYS_futex, uintptr(f), uintptr(op), uintptr(val), uintptr(timeout), 0, 0))
 	return get_errno(code)
 }
 
diff --git a/core/sync/sync2/primitives_linux.odin b/core/sync/sync2/primitives_linux.odin
index 4c81295bd..89ed97985 100644
--- a/core/sync/sync2/primitives_linux.odin
+++ b/core/sync/sync2/primitives_linux.odin
@@ -2,9 +2,8 @@
 //+private
 package sync2
 
-import "core:intrinsics"
+import "core:sys/unix"
 
 _current_thread_id :: proc "contextless" () -> int {
-	SYS_GETTID :: 186
-	return int(intrinsics.syscall(SYS_GETTID))
+	return unix.sys_gettid()
 }
diff --git a/core/sync/sync_linux.odin b/core/sync/sync_linux.odin
index fe856df94..340437c11 100644
--- a/core/sync/sync_linux.odin
+++ b/core/sync/sync_linux.odin
@@ -1,11 +1,9 @@
 package sync
 
 import "core:sys/unix"
-import "core:intrinsics"
 
 current_thread_id :: proc "contextless" () -> int {
-	SYS_GETTID :: 186
-	return int(intrinsics.syscall(SYS_GETTID))
+	return unix.sys_gettid()
 }
 
 
diff --git a/core/sys/unix/syscalls_linux.odin b/core/sys/unix/syscalls_linux.odin
new file mode 100644
index 000000000..659eedfbb
--- /dev/null
+++ b/core/sys/unix/syscalls_linux.odin
@@ -0,0 +1,60 @@
+package unix
+
+import "core:intrinsics"
+
+// Linux has inconsistent system call numbering across architectures,
+// for largely historical reasons.  This attempts to provide a unified
+// Odin-side interface for system calls that are required for the core
+// library to work.
+
+// For authoritative system call numbers, the following files in the kernel
+// source can be used:
+//
+//  amd64: arch/x86/entry/syscalls/syscall_64.tbl
+//  arm64: include/uapi/asm-generic/unistd.h
+//  386: arch/x86/entry/syscalls/syscall_32.tbl
+//  arm: arch/arm/tools/syscall.tbl
+
+when ODIN_ARCH == "amd64" {
+	SYS_mmap : uintptr : 9
+	SYS_mprotect : uintptr : 10
+	SYS_munmap : uintptr : 11
+	SYS_madvise : uintptr : 28
+	SYS_futex : uintptr : 202
+	SYS_gettid : uintptr : 186
+	SYS_getrandom : uintptr : 318
+} else when ODIN_ARCH == "arm64" {
+	SYS_mmap : uintptr : 222
+	SYS_mprotect : uintptr : 226
+	SYS_munmap : uintptr : 215
+	SYS_madvise : uintptr : 233
+	SYS_futex : uintptr : 98
+	SYS_gettid : uintptr : 178
+	SYS_getrandom : uintptr : 278
+} else when ODIN_ARCH == "386" {
+	SYS_mmap : uintptr : 192 // 90 is "sys_old_mmap", we want mmap2. NOTE(review): mmap2 takes its offset in 4096-byte units, not bytes - confirm callers.
+	SYS_mprotect : uintptr : 125
+	SYS_munmap : uintptr : 91
+	SYS_madvise : uintptr : 219
+	SYS_futex : uintptr : 240
+	SYS_gettid : uintptr : 224
+	SYS_getrandom : uintptr : 355
+} else when ODIN_ARCH == "arm" {
+	SYS_mmap : uintptr : 192 // 90 is "sys_old_mmap", we want mmap2. NOTE(review): mmap2 takes its offset in 4096-byte units, not bytes - confirm callers.
+	SYS_mprotect : uintptr : 125
+	SYS_munmap: uintptr : 91
+	SYS_madvise: uintptr : 220
+	SYS_futex : uintptr : 240
+	SYS_gettid : uintptr: 224
+	SYS_getrandom : uintptr : 384
+} else {
+	#panic("Unsupported architecture")
+}
+
+sys_gettid :: proc "contextless" () -> int { // Issues the gettid system call and returns its result as an int.
+	return cast(int)intrinsics.syscall(SYS_gettid)
+}
+
+sys_getrandom :: proc "contextless" (buf: ^byte, buflen: int, flags: uint) -> int { // Issues the getrandom system call; its raw return value is passed through as an int.
+	return cast(int)intrinsics.syscall(SYS_getrandom, cast(uintptr)(buf), cast(uintptr)(buflen), cast(uintptr)(flags))
+}

From 6bafa21bee56ccfbdf74f88bf7937a900a7d22d9 Mon Sep 17 00:00:00 2001
From: Yawning Angel <yawning@schwanenlied.me>
Date: Thu, 11 Nov 2021 07:59:45 +0000
Subject: [PATCH 09/10] crypto: Add rand_bytes

This adds `rand_bytes(dst: []byte)` which fills the destination buffer
with entropy from the cryptographic random number generator.  This takes
the "simple is best" approach and just directly returns the OS CSPRNG
output instead of doing anything fancy (a la OpenBSD's arc4random).
---
 core/crypto/crypto.odin                       | 11 +++++
 core/crypto/rand_generic.odin                 |  7 ++++
 core/crypto/rand_linux.odin                   | 37 +++++++++++++++++
 tests/core/crypto/test_core_crypto.odin       |  1 +
 .../core/crypto/test_core_crypto_modern.odin  | 40 +++++++++++++++++++
 5 files changed, 96 insertions(+)
 create mode 100644 core/crypto/rand_generic.odin
 create mode 100644 core/crypto/rand_linux.odin

diff --git a/core/crypto/crypto.odin b/core/crypto/crypto.odin
index ddcc5d367..35e88c5ed 100644
--- a/core/crypto/crypto.odin
+++ b/core/crypto/crypto.odin
@@ -39,3 +39,14 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i
 	// iff v == 0, setting the sign-bit, which gets returned.
 	return int((u32(v)-1) >> 31)
 }
+
+// rand_bytes fills the dst buffer with cryptographic entropy taken from
+// the system entropy source.  This routine will block if the system entropy
+// source is not ready yet.  All system entropy source failures are treated
+// as catastrophic, resulting in a panic.
+rand_bytes :: proc (dst: []byte) {
+	// Zero-fill the buffer first, before handing it to the OS-specific _rand_bytes backend.
+	mem.zero_explicit(raw_data(dst), len(dst))
+
+	_rand_bytes(dst)
+}
diff --git a/core/crypto/rand_generic.odin b/core/crypto/rand_generic.odin
new file mode 100644
index 000000000..98890b5b1
--- /dev/null
+++ b/core/crypto/rand_generic.odin
@@ -0,0 +1,7 @@
+package crypto
+
+when ODIN_OS != "linux" { // Stub compiled for every OS other than Linux.
+	_rand_bytes :: proc (dst: []byte) {
+		unimplemented("crypto: rand_bytes not supported on this OS")
+	}
+}
diff --git a/core/crypto/rand_linux.odin b/core/crypto/rand_linux.odin
new file mode 100644
index 000000000..4d1183757
--- /dev/null
+++ b/core/crypto/rand_linux.odin
@@ -0,0 +1,37 @@
+package crypto
+
+import "core:fmt"
+import "core:os"
+import "core:sys/unix"
+
+_MAX_PER_CALL_BYTES :: 33554431 // 2^25 - 1
+
+_rand_bytes :: proc (dst: []byte) { // Fills dst via getrandom, in chunks of at most _MAX_PER_CALL_BYTES; panics on any error other than EINTR.
+	dst := dst // Shadow the parameter so the slice can be advanced past filled bytes.
+	l := len(dst)
+
+	for l > 0 {
+		to_read := min(l, _MAX_PER_CALL_BYTES)
+		ret := unix.sys_getrandom(raw_data(dst), to_read, 0)
+		if ret < 0 {
+			switch os.Errno(-ret) {
+			case os.EINTR:
+				// Call interrupted by a signal handler, just retry the
+				// request.
+				continue
+			case os.ENOSYS:
+				// The kernel is apparently prehistoric (< 3.17 circa 2014)
+				// and does not support getrandom.
+				panic("crypto: getrandom not available in kernel")
+			case:
+				// All other failures are things that should NEVER happen
+				// unless the kernel interface changes (ie: the Linux
+				// developers break userland).
+				panic(fmt.tprintf("crypto: getrandom failed: %d", ret))
+			}
+		}
+
+		l -= ret // Advance by however many bytes were returned and loop for the rest.
+		dst = dst[ret:]
+	}
+}
diff --git a/tests/core/crypto/test_core_crypto.odin b/tests/core/crypto/test_core_crypto.odin
index 731833096..2ad00be66 100644
--- a/tests/core/crypto/test_core_crypto.odin
+++ b/tests/core/crypto/test_core_crypto.odin
@@ -120,6 +120,7 @@ main :: proc() {
     test_poly1305(&t)
     test_chacha20poly1305(&t)
     test_x25519(&t)
+    test_rand_bytes(&t)
 
     bench_modern(&t)
 
diff --git a/tests/core/crypto/test_core_crypto_modern.odin b/tests/core/crypto/test_core_crypto_modern.odin
index b3d9e47fd..71adad137 100644
--- a/tests/core/crypto/test_core_crypto_modern.odin
+++ b/tests/core/crypto/test_core_crypto_modern.odin
@@ -4,6 +4,7 @@ import "core:testing"
 import "core:fmt"
 import "core:mem"
 import "core:time"
+import "core:crypto"
 
 import "core:crypto/chacha20"
 import "core:crypto/chacha20poly1305"
@@ -303,6 +304,45 @@ test_x25519 :: proc(t: ^testing.T) {
     // how to work with JSON.
 }
 
+@(test)
+test_rand_bytes :: proc(t: ^testing.T) {
+	log(t, "Testing rand_bytes")
+
+	if ODIN_OS != "linux" {
+		log(t, "rand_bytes not supported - skipping")
+		return
+	}
+
+	allocator := context.allocator
+
+	buf := make([]byte, 1 << 25, allocator) // 2^25 bytes: one more than the 2^25 - 1 getrandom limit noted below.
+	defer delete(buf)
+
+	// Testing a CSPRNG for correctness is incredibly involved and
+	// beyond the scope of an implementation that offloads
+	// responsibility for correctness to the OS.
+	//
+	// Just attempt to randomize a sufficiently large buffer, where
+	// sufficiently large is:
+	//  * Larger than the maximum getentropy request size (256 bytes).
+	//  * Larger than the maximum getrandom request size (2^25 - 1 bytes).
+	//
+	// While theoretically non-deterministic, if this fails, chances
+	// are the CSPRNG is busted.
+	seems_ok := false
+	for i := 0; i < 256; i = i + 1 {
+		mem.zero_explicit(raw_data(buf), len(buf))
+		crypto.rand_bytes(buf)
+
+		if buf[0] != 0 && buf[len(buf)-1] != 0 { // Both ends non-zero means the fill reached the head and the tail.
+			seems_ok = true
+			break
+		}
+	}
+
+	expect(t, seems_ok, "Expected to randomize the head and tail of the buffer within a handful of attempts")
+}
+
 @(test)
 bench_modern :: proc(t: ^testing.T) {
 	fmt.println("Starting benchmarks:")

From e5f961b48f52f8346f00d43fea4700c8513c53c3 Mon Sep 17 00:00:00 2001
From: DYSEQTA <dyseqta@me.com>
Date: Wed, 24 Nov 2021 11:10:40 +1100
Subject: [PATCH 10/10] Removed '--help' from help string as per request.

---
 src/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.cpp b/src/main.cpp
index 7896756d4..99a55b2b6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -545,7 +545,7 @@ void usage(String argv0) {
 	print_usage_line(1, "version   print version");
 	print_usage_line(1, "report    print information useful to reporting a bug");
 	print_usage_line(0, "");
-	print_usage_line(0, "For further details on a command, use -help or --help after the command name");
+	print_usage_line(0, "For further details on a command, use -help after the command name");
 	print_usage_line(1, "e.g. odin build -help");
 }