Merge branch 'master' into parsing-package-fixes

2026-07-21 23:21:08 +00:00 · 2024-04-10 19:10:33 +02:00
parent 239d4e1076 13e459980b
commit 95a38d5a96
541 changed files with 46021 additions and 30526 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,11 +3,16 @@ on: [push, pull_request, workflow_dispatch]

 jobs:
  build_linux:
+    name: Ubuntu Build, Check, and Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
-      - name: Download LLVM, botan
-        run: sudo apt-get install llvm-11 clang-11 libbotan-2-dev botan
+      - name: Download LLVM
+        run: |
+          wget https://apt.llvm.org/llvm.sh
+          chmod +x llvm.sh
+          sudo ./llvm.sh 17
+          echo "/usr/lib/llvm-17/bin" >> $GITHUB_PATH
      - name: build odin
        run: ./build_odin.sh release
      - name: Odin version
@@ -46,6 +51,9 @@ jobs:
      - name: Odin check examples/all for Linux i386
        run: ./odin check examples/all -vet -strict-style -target:linux_i386
        timeout-minutes: 10
+      - name: Odin check examples/all for Linux arm64
+        run: ./odin check examples/all -vet -strict-style -target:linux_arm64
+        timeout-minutes: 10
      - name: Odin check examples/all for FreeBSD amd64
        run: ./odin check examples/all -vet -strict-style -target:freebsd_amd64
        timeout-minutes: 10
@@ -53,15 +61,14 @@ jobs:
        run: ./odin check examples/all -vet -strict-style -target:openbsd_amd64
        timeout-minutes: 10
  build_macOS:
+    name: MacOS Build, Check, and Test
    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v1
-      - name: Download LLVM, botan and setup PATH
+      - name: Download LLVM, and setup PATH
        run: |
-          brew install llvm@13 botan
-          echo "/usr/local/opt/llvm@13/bin" >> $GITHUB_PATH
-          TMP_PATH=$(xcrun --show-sdk-path)/user/include
-          echo "CPATH=$TMP_PATH" >> $GITHUB_ENV
+          brew install llvm@17
+          echo "/usr/local/opt/llvm@17/bin" >> $GITHUB_PATH
      - name: build odin
        run: ./build_odin.sh release
      - name: Odin version
@@ -92,13 +99,47 @@ jobs:
          cd tests/internal
          make
        timeout-minutes: 10
-      - name: Odin check examples/all for Darwin arm64
-        run: ./odin check examples/all -vet -strict-style -target:darwin_arm64
+  build_macOS_arm:
+    name: MacOS ARM Build, Check, and Test
+    runs-on: macos-14 # This is an arm/m1 runner.
+    steps:
+      - uses: actions/checkout@v1
+      - name: Download LLVM and setup PATH
+        run: |
+          brew install llvm@17
+          echo "/opt/homebrew/opt/llvm@17/bin" >> $GITHUB_PATH
+      - name: build odin
+        run: ./build_odin.sh release
+      - name: Odin version
+        run: ./odin version
+        timeout-minutes: 1
+      - name: Odin report
+        run: ./odin report
+        timeout-minutes: 1
+      - name: Odin check
+        run: ./odin check examples/demo -vet
        timeout-minutes: 10
-      - name: Odin check examples/all for Linux arm64
-        run: ./odin check examples/all -vet -strict-style -target:linux_arm64
+      - name: Odin run
+        run: ./odin run examples/demo
+        timeout-minutes: 10
+      - name: Odin run -debug
+        run: ./odin run examples/demo -debug
+        timeout-minutes: 10
+      - name: Odin check examples/all
+        run: ./odin check examples/all -strict-style
+        timeout-minutes: 10
+      - name: Core library tests
+        run: |
+          cd tests/core
+          make
+        timeout-minutes: 10
+      - name: Odin internals tests
+        run: |
+          cd tests/internal
+          make
        timeout-minutes: 10
  build_windows:
+    name: Windows Build, Check, and Test
    runs-on: windows-2022
    steps:
      - uses: actions/checkout@v1
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -7,6 +7,7 @@ on:

 jobs:
  build_windows:
+    name: Windows Build
    if: github.repository == 'odin-lang/Odin'
    runs-on: windows-2022
    steps:
@@ -29,6 +30,7 @@ jobs:
          cp LICENSE dist
          cp LLVM-C.dll dist
          cp -r shared dist
+          cp -r base dist
          cp -r core dist
          cp -r vendor dist
          cp -r bin dist
@@ -39,12 +41,17 @@ jobs:
          name: windows_artifacts
          path: dist
  build_ubuntu:
+    name: Ubuntu Build
    if: github.repository == 'odin-lang/Odin'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
      - name: (Linux) Download LLVM
-        run: sudo apt-get install llvm-11 clang-11
+        run: |
+          wget https://apt.llvm.org/llvm.sh
+          chmod +x llvm.sh
+          sudo ./llvm.sh 17
+          echo "/usr/lib/llvm-17/bin" >> $GITHUB_PATH
      - name: build odin
        run: make nightly
      - name: Odin run
@@ -56,46 +63,88 @@ jobs:
          cp LICENSE dist
          cp libLLVM* dist
          cp -r shared dist
+          cp -r base dist
          cp -r core dist
          cp -r vendor dist
          cp -r examples dist
+          # Zipping so executable permissions are retained, see https://github.com/actions/upload-artifact/issues/38
+          zip -r dist.zip dist
      - name: Upload artifact
        uses: actions/upload-artifact@v1
        with:
          name: ubuntu_artifacts
-          path: dist
+          path: dist.zip
  build_macos:
+    name: MacOS Build
    if: github.repository == 'odin-lang/Odin'
-    runs-on: macOS-latest
+    runs-on: macos-latest
    steps:
      - uses: actions/checkout@v1
      - name: Download LLVM and setup PATH
        run: |
-          brew install llvm@13
-          echo "/usr/local/opt/llvm@13/bin" >> $GITHUB_PATH
-          TMP_PATH=$(xcrun --show-sdk-path)/user/include
-          echo "CPATH=$TMP_PATH" >> $GITHUB_ENV
+          brew install llvm@17 dylibbundler
+          echo "/usr/local/opt/llvm@17/bin" >> $GITHUB_PATH
      - name: build odin
-        run: make nightly
-      - name: Odin run
-        run: ./odin run examples/demo
-      - name: Copy artifacts
+        # These -L makes the linker prioritize system libraries over LLVM libraries, this is mainly to
+        # not link with libunwind bundled with LLVM but link with libunwind on the system.
+        run: CXXFLAGS="-L/usr/lib/system -L/usr/lib" make nightly
+      - name: Bundle
        run: |
          mkdir dist
          cp odin dist
          cp LICENSE dist
          cp -r shared dist
+          cp -r base dist
          cp -r core dist
          cp -r vendor dist
          cp -r examples dist
+          dylibbundler -b -x dist/odin -d dist/libs -od -p @executable_path/libs
+          # Zipping so executable permissions are retained, see https://github.com/actions/upload-artifact/issues/38
+          zip -r dist.zip dist
+      - name: Odin run
+        run: ./dist/odin run examples/demo
      - name: Upload artifact
        uses: actions/upload-artifact@v1
        with:
          name: macos_artifacts
-          path: dist
+          path: dist.zip
+  build_macos_arm:
+    name: MacOS ARM Build
+    if: github.repository == 'odin-lang/Odin'
+    runs-on: macos-14 # ARM machine
+    steps:
+      - uses: actions/checkout@v1
+      - name: Download LLVM and setup PATH
+        run: |
+          brew install llvm@17 dylibbundler
+          echo "/opt/homebrew/opt/llvm@17/bin" >> $GITHUB_PATH
+      - name: build odin
+        # These -L makes the linker prioritize system libraries over LLVM libraries, this is mainly to
+        # not link with libunwind bundled with LLVM but link with libunwind on the system.
+        run: CXXFLAGS="-L/usr/lib/system -L/usr/lib" make nightly
+      - name: Bundle
+        run: |
+          mkdir dist
+          cp odin dist
+          cp LICENSE dist
+          cp -r shared dist
+          cp -r base dist
+          cp -r core dist
+          cp -r vendor dist
+          cp -r examples dist
+          dylibbundler -b -x dist/odin -d dist/libs -od -p @executable_path/libs
+          # Zipping so executable permissions are retained, see https://github.com/actions/upload-artifact/issues/38
+          zip -r dist.zip dist
+      - name: Odin run
+        run: ./dist/odin run examples/demo
+      - name: Upload artifact
+        uses: actions/upload-artifact@v1
+        with:
+          name: macos_arm_artifacts
+          path: dist.zip
  upload_b2:
    runs-on: [ubuntu-latest]
-    needs: [build_windows, build_macos, build_ubuntu]
+    needs: [build_windows, build_macos, build_macos_arm, build_ubuntu]
    steps:
      - uses: actions/checkout@v1
      - uses: actions/setup-python@v2
@@ -126,6 +175,11 @@ jobs:
        with:
          name: macos_artifacts

+      - name: Download macOS arm artifacts
+        uses: actions/download-artifact@v1
+        with:
+          name: macos_arm_artifacts
+
      - name: Create archives and upload
        shell: bash
        env:
@@ -140,8 +194,9 @@ jobs:
          echo Uploading artifcates to B2
          chmod +x ./ci/upload_create_nightly.sh
          ./ci/upload_create_nightly.sh "$BUCKET" windows-amd64 windows_artifacts/
-          ./ci/upload_create_nightly.sh "$BUCKET" ubuntu-amd64 ubuntu_artifacts/
-          ./ci/upload_create_nightly.sh "$BUCKET" macos-amd64 macos_artifacts/
+          ./ci/upload_create_nightly.sh "$BUCKET" ubuntu-amd64 ubuntu_artifacts/dist.zip
+          ./ci/upload_create_nightly.sh "$BUCKET" macos-amd64 macos_artifacts/dist.zip
+          ./ci/upload_create_nightly.sh "$BUCKET" macos-arm64 macos_arm_artifacts/dist.zip

          echo Deleting old artifacts in B2
          python3 ci/delete_old_binaries.py "$BUCKET" "$DAYS_TO_KEEP"
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,7 @@ tests/internal/test_map
 tests/internal/test_pow
 tests/internal/test_rtti
 tests/core/test_core_compress
+tests/core/test_core_container
 tests/core/test_core_filepath
 tests/core/test_core_fmt
 tests/core/test_core_i18n
@@ -39,7 +40,7 @@ tests/core/test_core_net
 tests/core/test_core_os_exit
 tests/core/test_core_reflect
 tests/core/test_core_strings
-tests/core/test_crypto_hash
+tests/core/test_crypto
 tests/core/test_hash
 tests/core/test_hxa
 tests/core/test_json
@@ -49,6 +50,7 @@ tests/core/test_varint
 tests/core/test_xml
 tests/core/test_core_slice
 tests/core/test_core_thread
+tests/core/test_core_runtime
 tests/vendor/vendor_botan
 # Visual Studio 2015 cache/options directory
 .vs/
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
-Copyright (c) 2016-2022 Ginger Bill. All rights reserved.
+Copyright (c) 2016-2024 Ginger Bill. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
--- a/base/builtin/builtin.odin
+++ b/base/builtin/builtin.odin
--- a/base/intrinsics/intrinsics.odin
+++ b/base/intrinsics/intrinsics.odin
@@ -5,6 +5,12 @@ package intrinsics
 // Package-Related
 is_package_imported :: proc(package_name: string) -> bool ---

+// Matrix Related Procedures
+transpose        :: proc(m: $T/matrix[$R, $C]$E)    -> matrix[C, R]E ---
+outer_product    :: proc(a: $A/[$X]$E, b: $B/[$Y]E) -> matrix[X, Y]E ---
+hadamard_product :: proc(a, b: $T/matrix[$R, $C]$E) -> T ---
+matrix_flatten   :: proc(m: $T/matrix[$R, $C]$E)    -> [R*C]E ---
+
 // Types
 soa_struct :: proc($N: int, $T: typeid) -> type/#soa[N]T

@@ -287,7 +293,9 @@ wasm_memory_size :: proc(index: uintptr)        -> int ---
 // 0 - indicates that the thread blocked and then was woken up
 // 1 - the loaded value from `ptr` did not match `expected`, the thread did not block
 // 2 - the thread blocked, but the timeout
+@(enable_target_feature="atomics")
 wasm_memory_atomic_wait32   :: proc(ptr: ^u32, expected: u32, timeout_ns: i64) -> u32 ---
+@(enable_target_feature="atomics")
 wasm_memory_atomic_notify32 :: proc(ptr: ^u32, waiters: u32) -> (waiters_woken_up: u32) ---

 // x86 Targets (i386, amd64)
--- a/base/runtime/core.odin
+++ b/base/runtime/core.odin
@@ -21,7 +21,7 @@
 //+no-instrumentation
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"

 // NOTE(bill): This must match the compiler's
 Calling_Convention :: enum u8 {
@@ -177,10 +177,22 @@ Type_Info_Matrix :: struct {
 	row_count:    int,
 	column_count: int,
 	// Total element count = column_count * elem_stride
+	layout: enum u8 {
+		Column_Major, // array of column vectors
+		Row_Major,    // array of row vectors
+	},
 }
 Type_Info_Soa_Pointer :: struct {
 	elem: ^Type_Info,
 }
+Type_Info_Bit_Field :: struct {
+	backing_type: ^Type_Info,
+	names:        []string,
+	types:        []^Type_Info,
+	bit_sizes:    []uintptr,
+	bit_offsets:  []uintptr,
+	tags:         []string,
+}

 Type_Info_Flag :: enum u8 {
 	Comparable     = 0,
@@ -223,6 +235,7 @@ Type_Info :: struct {
 		Type_Info_Relative_Multi_Pointer,
 		Type_Info_Matrix,
 		Type_Info_Soa_Pointer,
+		Type_Info_Bit_Field,
 	},
 }

@@ -256,6 +269,7 @@ Typeid_Kind :: enum u8 {
 	Relative_Multi_Pointer,
 	Matrix,
 	Soa_Pointer,
+	Bit_Field,
 }
 #assert(len(Typeid_Kind) < 32)

@@ -270,7 +284,7 @@ Typeid_Kind :: enum u8 {

 // NOTE(bill): only the ones that are needed (not all types)
 // This will be set by the compiler
-type_table: []Type_Info
+type_table: []^Type_Info

 args__: []cstring

@@ -296,6 +310,14 @@ Source_Code_Location :: struct {
 	procedure:    string,
 }

+/*
+	Used by the built-in directory `#load_directory(path: string) -> []Load_Directory_File`
+*/
+Load_Directory_File :: struct {
+	name: string,
+	data: []byte, // immutable data
+}
+
 Assertion_Failure_Proc :: #type proc(prefix, message: string, loc: Source_Code_Location) -> !

 // Allocation Stuff
@@ -575,8 +597,9 @@ type_info_core :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
 	base := info
 	loop: for {
 		#partial switch i in base.variant {
-		case Type_Info_Named:  base = i.base
-		case Type_Info_Enum:   base = i.base
+		case Type_Info_Named:     base = i.base
+		case Type_Info_Enum:      base = i.base
+		case Type_Info_Bit_Field: base = i.backing_type
 		case: break loop
 		}
 	}
@@ -591,7 +614,7 @@ __type_info_of :: proc "contextless" (id: typeid) -> ^Type_Info #no_bounds_check
 	if n < 0 || n >= len(type_table) {
 		n = 0
 	}
-	return &type_table[n]
+	return type_table[n]
 }

 when !ODIN_NO_RTTI {
--- a/base/runtime/core_builtin.odin
+++ b/base/runtime/core_builtin.odin
@@ -1,6 +1,6 @@
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"

@builtin
 Maybe :: union($T: typeid) {T}
@@ -122,7 +122,7 @@ pop :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bou
 // `pop_safe` trys to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
 // If the operation is not possible, it will return false.
@builtin
-pop_safe :: proc(array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
+pop_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
 	if len(array) == 0 {
 		return
 	}
@@ -148,7 +148,7 @@ pop_front :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #
 // `pop_front_safe` trys to return and remove the first value of dynamic array `array` and reduces the length of `array` by 1.
 // If the operation is not possible, it will return false.
@builtin
-pop_front_safe :: proc(array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
+pop_front_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
 	if len(array) == 0 {
 		return
 	}
@@ -172,7 +172,7 @@ reserve :: proc{reserve_dynamic_array, reserve_map}
@builtin
 non_zero_reserve :: proc{non_zero_reserve_dynamic_array}

-// `resize` will try to resize memory of a passed dynamic array or map to the requested element count (setting the `len`, and possibly `cap`).
+// `resize` will try to resize memory of a passed dynamic array to the requested element count (setting the `len`, and possibly `cap`).
@builtin
 resize :: proc{resize_dynamic_array}

@@ -312,6 +312,7 @@ make_dynamic_array_len :: proc($T: typeid/[dynamic]$E, #any_int len: int, alloca
@(builtin, require_results)
 make_dynamic_array_len_cap :: proc($T: typeid/[dynamic]$E, #any_int len: int, #any_int cap: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
 	make_dynamic_array_error_loc(loc, len, cap)
+	array.allocator = allocator // initialize allocator before just in case it fails to allocate any memory
 	data := mem_alloc_bytes(size_of(E)*cap, align_of(E), allocator, loc) or_return
 	s := Raw_Dynamic_Array{raw_data(data), len, cap, allocator}
 	if data == nil && size_of(E) != 0 {
@@ -446,12 +447,12 @@ _append_elem :: #force_inline proc(array: ^$T/[dynamic]$E, arg: E, should_zero:
 }

@builtin
-append_elem :: proc(array: ^$T/[dynamic]$E, arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	return _append_elem(array, arg, true, loc=loc)
 }

@builtin
-non_zero_append_elem :: proc(array: ^$T/[dynamic]$E, arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+non_zero_append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	return _append_elem(array, arg, false, loc=loc)
 }

@@ -495,12 +496,12 @@ _append_elems :: #force_inline proc(array: ^$T/[dynamic]$E, should_zero: bool, l
 }

@builtin
-append_elems :: proc(array: ^$T/[dynamic]$E, args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	return _append_elems(array, true, loc, ..args)
 }

@builtin
-non_zero_append_elems :: proc(array: ^$T/[dynamic]$E, args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+non_zero_append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	return _append_elems(array, false, loc, ..args)
 }

@@ -555,7 +556,7 @@ append_nothing :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: i


@builtin
-inject_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+inject_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	if array == nil {
 		return
 	}
@@ -573,7 +574,7 @@ inject_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, arg: E, loc := #calle
 }

@builtin
-inject_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+inject_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	if array == nil {
 		return
 	}
@@ -739,6 +740,9 @@ _resize_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, length: int,
 	a := (^Raw_Dynamic_Array)(array)

 	if length <= a.cap {
+		if should_zero && a.len < length {
+			intrinsics.mem_zero(([^]E)(a.data)[a.len:], (length-a.len)*size_of(E))
+		}
 		a.len = max(length, 0)
 		return nil
 	}
@@ -823,42 +827,25 @@ map_insert :: proc(m: ^$T/map[$K]$V, key: K, value: V, loc := #caller_location)
 	return (^V)(__dynamic_map_set_without_hash((^Raw_Map)(m), map_info(T), rawptr(&key), rawptr(&value), loc))
 }

-
-@builtin
-incl_elem :: proc(s: ^$S/bit_set[$E; $U], elem: E) {
-	s^ |= {elem}
-}
-@builtin
-incl_elems :: proc(s: ^$S/bit_set[$E; $U], elems: ..E) {
-	for elem in elems {
-		s^ |= {elem}
+// Explicitly inserts a key and value into a map `m`, the same as `map_insert`, but the return values differ.
+// - `prev_key` will return the previous pointer of a key if it exists, check `found_previous` if was previously found
+// - `value_ptr` will return the pointer of the memory where the insertion happens, and `nil` if the map failed to resize
+// - `found_previous` will be true a previous key was found
+@(builtin, require_results)
+map_upsert :: proc(m: ^$T/map[$K]$V, key: K, value: V, loc := #caller_location) -> (prev_key: K, value_ptr: ^V, found_previous: bool) {
+	key, value := key, value
+	kp, vp := __dynamic_map_set_extra_without_hash((^Raw_Map)(m), map_info(T), rawptr(&key), rawptr(&value), loc)
+	if kp != nil {
+		prev_key = (^K)(kp)^
+		found_previous = true
 	}
+	value_ptr = (^V)(vp)
+	return
 }
-@builtin
-incl_bit_set :: proc(s: ^$S/bit_set[$E; $U], other: S) {
-	s^ |= other
-}
-@builtin
-excl_elem :: proc(s: ^$S/bit_set[$E; $U], elem: E) {
-	s^ &~= {elem}
-}
-@builtin
-excl_elems :: proc(s: ^$S/bit_set[$E; $U], elems: ..E) {
-	for elem in elems {
-		s^ &~= {elem}
-	}
-}
-@builtin
-excl_bit_set :: proc(s: ^$S/bit_set[$E; $U], other: S) {
-	s^ &~= other
-}
-
-@builtin incl :: proc{incl_elem, incl_elems, incl_bit_set}
-@builtin excl :: proc{excl_elem, excl_elems, excl_bit_set}


@builtin
-card :: proc(s: $S/bit_set[$E; $U]) -> int {
+card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int {
 	when size_of(S) == 1 {
 		return int(intrinsics.count_ones(transmute(u8)s))
 	} else when size_of(S) == 2 {
--- a/base/runtime/core_builtin_soa.odin
+++ b/base/runtime/core_builtin_soa.odin
@@ -1,6 +1,6 @@
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"
 _ :: intrinsics

 /*
@@ -425,4 +425,97 @@ clear_soa_dynamic_array :: proc(array: ^$T/#soa[dynamic]$E) {
@builtin
 clear_soa :: proc{
 	clear_soa_dynamic_array,
-}
+}
+
+// Converts soa slice into a soa dynamic array without cloning or allocating memory
+@(require_results)
+into_dynamic_soa :: proc(array: $T/#soa[]$E) -> #soa[dynamic]E {
+	d: #soa[dynamic]E
+	footer := raw_soa_footer_dynamic_array(&d)
+	footer^ = {
+		cap = len(array),
+		len = 0,
+		allocator = nil_allocator(),
+	}
+
+	field_count: uintptr
+	when intrinsics.type_is_array(E) {
+		field_count = len(E)
+	} else {
+		field_count = uintptr(intrinsics.type_struct_field_count(E))
+	}
+
+	array := array
+	dynamic_data := ([^]rawptr)(&d)[:field_count]
+	slice_data   := ([^]rawptr)(&array)[:field_count]
+	copy(dynamic_data, slice_data)
+
+	return d
+}
+
+// `unordered_remove_soa` removed the element at the specified `index`. It does so by replacing the current end value
+// with the old value, and reducing the length of the dynamic array by 1.
+//
+// Note: This is an O(1) operation.
+// Note: If you the elements to remain in their order, use `ordered_remove_soa`.
+// Note: If the index is out of bounds, this procedure will panic.
+@builtin
+unordered_remove_soa :: proc(array: ^$T/#soa[dynamic]$E, index: int, loc := #caller_location) #no_bounds_check {
+	bounds_check_error_loc(loc, index, len(array))
+	if index+1 < len(array) {
+		ti := type_info_of(typeid_of(T))
+		ti = type_info_base(ti)
+		si := &ti.variant.(Type_Info_Struct)
+
+		field_count: uintptr
+		when intrinsics.type_is_array(E) {
+			field_count = len(E)
+		} else {
+			field_count = uintptr(intrinsics.type_struct_field_count(E))
+		}
+
+		data := uintptr(array)
+		for i in 0..<field_count {
+			type := si.types[i].variant.(Type_Info_Pointer).elem
+
+			offset := rawptr((^uintptr)(data)^ + uintptr(index*type.size))
+			final := rawptr((^uintptr)(data)^ + uintptr((len(array)-1)*type.size))
+			mem_copy(offset, final, type.size)
+			data += size_of(rawptr)
+		}
+	}
+	raw_soa_footer_dynamic_array(array).len -= 1
+}
+
+// `ordered_remove_soa` removed the element at the specified `index` whilst keeping the order of the other elements.
+//
+// Note: This is an O(N) operation.
+// Note: If you the elements do not have to remain in their order, prefer `unordered_remove_soa`.
+// Note: If the index is out of bounds, this procedure will panic.
+@builtin
+ordered_remove_soa :: proc(array: ^$T/#soa[dynamic]$E, index: int, loc := #caller_location) #no_bounds_check {
+	bounds_check_error_loc(loc, index, len(array))
+	if index+1 < len(array) {
+		ti := type_info_of(typeid_of(T))
+		ti = type_info_base(ti)
+		si := &ti.variant.(Type_Info_Struct)
+
+		field_count: uintptr
+		when intrinsics.type_is_array(E) {
+			field_count = len(E)
+		} else {
+			field_count = uintptr(intrinsics.type_struct_field_count(E))
+		}
+
+		data := uintptr(array)
+		for i in 0..<field_count {
+			type := si.types[i].variant.(Type_Info_Pointer).elem
+
+			offset := (^uintptr)(data)^ + uintptr(index*type.size)
+			length := type.size*(len(array) - index - 1)
+			mem_copy(rawptr(offset), rawptr(offset + uintptr(type.size)), length)
+			data += size_of(rawptr)
+		}
+	}
+	raw_soa_footer_dynamic_array(array).len -= 1
+}
--- a/base/runtime/default_allocators_arena.odin
+++ b/base/runtime/default_allocators_arena.odin
@@ -1,6 +1,6 @@
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"

 DEFAULT_ARENA_GROWING_MINIMUM_BLOCK_SIZE :: uint(DEFAULT_TEMP_ALLOCATOR_BACKING_SIZE)

--- a/base/runtime/default_allocators_general.odin
+++ b/base/runtime/default_allocators_general.odin
@@ -0,0 +1,12 @@
+package runtime
+
+when ODIN_DEFAULT_TO_NIL_ALLOCATOR {
+	default_allocator_proc :: nil_allocator_proc
+	default_allocator :: nil_allocator
+} else when ODIN_DEFAULT_TO_PANIC_ALLOCATOR {
+	default_allocator_proc :: panic_allocator_proc
+	default_allocator :: panic_allocator
+} else {
+	default_allocator :: heap_allocator
+	default_allocator_proc :: heap_allocator_proc
+}
--- a/base/runtime/default_allocators_nil.odin
+++ b/base/runtime/default_allocators_nil.odin
@@ -31,14 +31,6 @@ nil_allocator :: proc() -> Allocator {
 }


-
-when ODIN_OS == .Freestanding {
-	default_allocator_proc :: nil_allocator_proc
-	default_allocator :: nil_allocator
-}
-
-
-
 panic_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
                             size, alignment: int,
                             old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) {
--- a/base/runtime/default_temporary_allocator.odin
+++ b/base/runtime/default_temporary_allocator.odin
--- a/base/runtime/docs.odin
+++ b/base/runtime/docs.odin
@@ -44,7 +44,7 @@ memcpy
 memove


-## Procedures required by the LLVM backend
+## Procedures required by the LLVM backend if u128/i128 is used
 umodti3
 udivti3
 modti3
@@ -59,11 +59,12 @@ truncdfhf2
 gnu_h2f_ieee
 gnu_f2h_ieee
 extendhfsf2
+
+## Procedures required by the LLVM backend if f16 is used
 __ashlti3 // wasm specific
 __multi3  // wasm specific


-
 ## Required an entry point is defined (i.e. 'main')

 args__
--- a/base/runtime/dynamic_array_internal.odin
+++ b/base/runtime/dynamic_array_internal.odin
--- a/base/runtime/dynamic_map_internal.odin
+++ b/base/runtime/dynamic_map_internal.odin
@@ -1,6 +1,6 @@
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"
 _ :: intrinsics

 // High performance, cache-friendly, open-addressed Robin Hood hashing hash map
@@ -333,7 +333,7 @@ map_kvh_data_values_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^
 }


-@(private, require_results)
+@(require_results)
 map_total_allocation_size :: #force_inline proc "contextless" (capacity: uintptr, info: ^Map_Info) -> uintptr {
 	round :: #force_inline proc "contextless" (value: uintptr) -> uintptr {
 		CACHE_MASK :: MAP_CACHE_LINE_SIZE - 1
@@ -350,6 +350,12 @@ map_total_allocation_size :: #force_inline proc "contextless" (capacity: uintptr
 	return size
 }

+@(require_results)
+map_total_allocation_size_from_value :: #force_inline proc "contextless" (m: $M/map[$K]$V) -> uintptr {
+	return map_total_allocation_size(uintptr(cap(m)), map_info(M))
+}
+
+
 // The only procedure which needs access to the context is the one which allocates the map.
@(require_results)
 map_alloc_dynamic :: proc "odin" (info: ^Map_Info, log2_capacity: uintptr, allocator := context.allocator, loc := #caller_location) -> (result: Raw_Map, err: Allocator_Error) {
@@ -391,7 +397,8 @@ map_alloc_dynamic :: proc "odin" (info: ^Map_Info, log2_capacity: uintptr, alloc
 // arrays to reduce variance. This swapping can only be done with memcpy since
 // there is no type information.
 //
-// This procedure returns the address of the just inserted value.
+// This procedure returns the address of the just inserted value, and will
+// return 'nil' if there was no room to insert the entry
@(require_results)
 map_insert_hash_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, h: Map_Hash, ik: uintptr, iv: uintptr) -> (result: uintptr) {
 	h        := h
@@ -415,6 +422,11 @@ map_insert_hash_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^
 	tv := map_cell_index_dynamic(sv, info.vs, 1)

 	swap_loop: for {
+		if distance > mask {
+			// Failed to find an empty slot and prevent infinite loop
+			panic("unable to insert into a map")
+		}
+
 		element_hash := hs[pos]

 		if map_hash_is_empty(element_hash) {
@@ -841,6 +853,33 @@ __dynamic_map_get :: proc "contextless" (#no_alias m: ^Raw_Map, #no_alias info:
 	}
 }

+__dynamic_map_get_key_and_value :: proc "contextless" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, h: Map_Hash, key: rawptr) -> (key_ptr, value_ptr: rawptr) {
+	if m.len == 0 {
+		return nil, nil
+	}
+	pos := map_desired_position(m^, h)
+	distance := uintptr(0)
+	mask := (uintptr(1) << map_log2_cap(m^)) - 1
+	ks, vs, hs, _, _ := map_kvh_data_dynamic(m^, info)
+	for {
+		element_hash := hs[pos]
+		if map_hash_is_empty(element_hash) {
+			return nil, nil
+		} else if distance > map_probe_distance(m^, element_hash, pos) {
+			return nil, nil
+		} else if element_hash == h {
+			other_key := rawptr(map_cell_index_dynamic(ks, info.ks, pos))
+			if info.key_equal(key, other_key) {
+				key_ptr   = other_key
+				value_ptr = rawptr(map_cell_index_dynamic(vs, info.vs, pos))
+				return
+			}
+		}
+		pos = (pos + 1) & mask
+		distance += 1
+	}
+}
+
 // IMPORTANT: USED WITHIN THE COMPILER
 __dynamic_map_check_grow :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, loc := #caller_location) -> (err: Allocator_Error, has_grown: bool) {
 	if m.len >= map_resize_threshold(m^) {
@@ -871,9 +910,37 @@ __dynamic_map_set :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_In
 	}

 	result := map_insert_hash_dynamic(m, info, hash, uintptr(key), uintptr(value))
-	m.len += 1
+	if result != 0 {
+		m.len += 1
+	}
 	return rawptr(result)
 }
+__dynamic_map_set_extra_without_hash :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, key, value: rawptr, loc := #caller_location) -> (prev_key_ptr, value_ptr: rawptr) {
+	return __dynamic_map_set_extra(m, info, info.key_hasher(key, map_seed(m^)), key, value, loc)
+}
+
+__dynamic_map_set_extra :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, hash: Map_Hash, key, value: rawptr, loc := #caller_location) -> (prev_key_ptr, value_ptr: rawptr) {
+	if prev_key_ptr, value_ptr = __dynamic_map_get_key_and_value(m, info, hash, key); value_ptr != nil {
+		intrinsics.mem_copy_non_overlapping(value_ptr, value, info.vs.size_of_type)
+		return
+	}
+
+	hash := hash
+	err, has_grown := __dynamic_map_check_grow(m, info, loc)
+	if err != nil {
+		return nil, nil
+	}
+	if has_grown {
+		hash = info.key_hasher(key, map_seed(m^))
+	}
+
+	result := map_insert_hash_dynamic(m, info, hash, uintptr(key), uintptr(value))
+	if result != 0 {
+		m.len += 1
+	}
+	return nil, rawptr(result)
+}
+

 // IMPORTANT: USED WITHIN THE COMPILER
@(private)
--- a/base/runtime/entry_unix.odin
+++ b/base/runtime/entry_unix.odin
@@ -1,9 +1,9 @@
 //+private
-//+build linux, darwin, freebsd, openbsd
+//+build linux, darwin, freebsd, openbsd, haiku
 //+no-instrumentation
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"

 when ODIN_BUILD_MODE == .Dynamic {
 	@(link_name="_odin_entry_point", linkage="strong", require/*, link_section=".init"*/)
--- a/base/runtime/entry_unix_no_crt_amd64.asm
+++ b/base/runtime/entry_unix_no_crt_amd64.asm
--- a/base/runtime/entry_unix_no_crt_darwin_arm64.asm
+++ b/base/runtime/entry_unix_no_crt_darwin_arm64.asm
--- a/base/runtime/entry_unix_no_crt_i386.asm
+++ b/base/runtime/entry_unix_no_crt_i386.asm
--- a/base/runtime/entry_wasm.odin
+++ b/base/runtime/entry_wasm.odin
@@ -3,7 +3,7 @@
 //+no-instrumentation
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"

 when !ODIN_TEST && !ODIN_NO_ENTRY_POINT {
 	@(link_name="_start", linkage="strong", require, export)
--- a/base/runtime/entry_windows.odin
+++ b/base/runtime/entry_windows.odin
@@ -3,7 +3,7 @@
 //+no-instrumentation
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"

 when ODIN_BUILD_MODE == .Dynamic {
 	@(link_name="DllMain", linkage="strong", require)
--- a/base/runtime/error_checks.odin
+++ b/base/runtime/error_checks.odin
--- a/base/runtime/heap_allocator.odin
+++ b/base/runtime/heap_allocator.odin
@@ -0,0 +1,110 @@
+package runtime
+
+import "base:intrinsics"
+
+heap_allocator :: proc() -> Allocator {
+	return Allocator{
+		procedure = heap_allocator_proc,
+		data = nil,
+	}
+}
+
+heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
+                            size, alignment: int,
+                            old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) {
+	//
+	// NOTE(tetra, 2020-01-14): The heap doesn't respect alignment.
+	// Instead, we overallocate by `alignment + size_of(rawptr) - 1`, and insert
+	// padding. We also store the original pointer returned by heap_alloc right before
+	// the pointer we return to the user.
+	//
+
+	aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr = nil, zero_memory := true) -> ([]byte, Allocator_Error) {
+		a := max(alignment, align_of(rawptr))
+		space := size + a - 1
+
+		allocated_mem: rawptr
+		if old_ptr != nil {
+			original_old_ptr := ([^]rawptr)(old_ptr)[-1]
+			allocated_mem = heap_resize(original_old_ptr, space+size_of(rawptr))
+		} else {
+			allocated_mem = heap_alloc(space+size_of(rawptr), zero_memory)
+		}
+		aligned_mem := rawptr(([^]u8)(allocated_mem)[size_of(rawptr):])
+
+		ptr := uintptr(aligned_mem)
+		aligned_ptr := (ptr - 1 + uintptr(a)) & -uintptr(a)
+		diff := int(aligned_ptr - ptr)
+		if (size + diff) > space || allocated_mem == nil {
+			return nil, .Out_Of_Memory
+		}
+
+		aligned_mem = rawptr(aligned_ptr)
+		([^]rawptr)(aligned_mem)[-1] = allocated_mem
+
+		return byte_slice(aligned_mem, size), nil
+	}
+
+	aligned_free :: proc(p: rawptr) {
+		if p != nil {
+			heap_free(([^]rawptr)(p)[-1])
+		}
+	}
+
+	aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) {
+		if p == nil {
+			return nil, nil
+		}
+
+		new_memory = aligned_alloc(new_size, new_alignment, p, zero_memory) or_return
+
+		// NOTE: heap_resize does not zero the new memory, so we do it
+		if zero_memory && new_size > old_size {
+			new_region := raw_data(new_memory[old_size:])
+			intrinsics.mem_zero(new_region, new_size - old_size)
+		}
+		return
+	}
+
+	switch mode {
+	case .Alloc, .Alloc_Non_Zeroed:
+		return aligned_alloc(size, alignment, nil, mode == .Alloc)
+
+	case .Free:
+		aligned_free(old_memory)
+
+	case .Free_All:
+		return nil, .Mode_Not_Implemented
+
+	case .Resize, .Resize_Non_Zeroed:
+		if old_memory == nil {
+			return aligned_alloc(size, alignment, nil, mode == .Resize)
+		}
+		return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize)
+
+	case .Query_Features:
+		set := (^Allocator_Mode_Set)(old_memory)
+		if set != nil {
+			set^ = {.Alloc, .Alloc_Non_Zeroed, .Free, .Resize, .Resize_Non_Zeroed, .Query_Features}
+		}
+		return nil, nil
+
+	case .Query_Info:
+		return nil, .Mode_Not_Implemented
+	}
+
+	return nil, nil
+}
+
+
+heap_alloc :: proc(size: int, zero_memory := true) -> rawptr {
+	return _heap_alloc(size, zero_memory)
+}
+
+heap_resize :: proc(ptr: rawptr, new_size: int) -> rawptr {
+	return _heap_resize(ptr, new_size)
+}
+
+heap_free :: proc(ptr: rawptr) {
+	_heap_free(ptr)
+}
--- a/base/runtime/heap_allocator_other.odin
+++ b/base/runtime/heap_allocator_other.odin
@@ -0,0 +1,15 @@
+//+build js, wasi, freestanding, essence
+//+private
+package runtime
+
+_heap_alloc :: proc(size: int, zero_memory := true) -> rawptr {
+	unimplemented("base:runtime 'heap_alloc' procedure is not supported on this platform")
+}
+
+_heap_resize :: proc(ptr: rawptr, new_size: int) -> rawptr {
+	unimplemented("base:runtime 'heap_resize' procedure is not supported on this platform")
+}
+
+_heap_free :: proc(ptr: rawptr) {
+	unimplemented("base:runtime 'heap_free' procedure is not supported on this platform")
+}
--- a/base/runtime/heap_allocator_unix.odin
+++ b/base/runtime/heap_allocator_unix.odin
@@ -0,0 +1,38 @@
+//+build linux, darwin, freebsd, openbsd, haiku
+//+private
+package runtime
+
+when ODIN_OS == .Darwin {
+	foreign import libc "system:System.framework"
+} else {
+	foreign import libc "system:c"
+}
+
+@(default_calling_convention="c")
+foreign libc {
+	@(link_name="malloc")   _unix_malloc   :: proc(size: int) -> rawptr ---
+	@(link_name="calloc")   _unix_calloc   :: proc(num, size: int) -> rawptr ---
+	@(link_name="free")     _unix_free     :: proc(ptr: rawptr) ---
+	@(link_name="realloc")  _unix_realloc  :: proc(ptr: rawptr, size: int) -> rawptr ---
+}
+
+_heap_alloc :: proc(size: int, zero_memory := true) -> rawptr {
+	if size <= 0 {
+		return nil
+	}
+	if zero_memory {
+		return _unix_calloc(1, size)
+	} else {
+		return _unix_malloc(size)
+	}
+}
+
+_heap_resize :: proc(ptr: rawptr, new_size: int) -> rawptr {
+	// NOTE: _unix_realloc doesn't guarantee new memory will be zeroed on
+	// POSIX platforms. Ensure your caller takes this into account.
+	return _unix_realloc(ptr, new_size)
+}
+
+_heap_free :: proc(ptr: rawptr) {
+	_unix_free(ptr)
+}
--- a/base/runtime/heap_allocator_windows.odin
+++ b/base/runtime/heap_allocator_windows.odin
@@ -0,0 +1,39 @@
+package runtime
+
+foreign import kernel32 "system:Kernel32.lib"
+
+@(private="file")
+@(default_calling_convention="system")
+foreign kernel32 {
+	// NOTE(bill): The types are not using the standard names (e.g. DWORD and LPVOID) to just minimizing the dependency
+
+	// default_allocator
+	GetProcessHeap :: proc() -> rawptr ---
+	HeapAlloc      :: proc(hHeap: rawptr, dwFlags: u32, dwBytes: uint) -> rawptr ---
+	HeapReAlloc    :: proc(hHeap: rawptr, dwFlags: u32, lpMem: rawptr, dwBytes: uint) -> rawptr ---
+	HeapFree       :: proc(hHeap: rawptr, dwFlags: u32, lpMem: rawptr) -> b32 ---
+}
+
+_heap_alloc :: proc(size: int, zero_memory := true) -> rawptr {
+	HEAP_ZERO_MEMORY :: 0x00000008
+	return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY if zero_memory else 0, uint(size))
+}
+_heap_resize :: proc(ptr: rawptr, new_size: int) -> rawptr {
+	if new_size == 0 {
+		_heap_free(ptr)
+		return nil
+	}
+	if ptr == nil {
+		return _heap_alloc(new_size)
+	}
+
+	HEAP_ZERO_MEMORY :: 0x00000008
+	return HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, ptr, uint(new_size))
+}
+_heap_free :: proc(ptr: rawptr) {
+	if ptr == nil {
+		return
+	}
+	HeapFree(GetProcessHeap(), 0, ptr)
+}
+
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -1,6 +1,6 @@
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"

@(private="file")
 IS_WASM :: ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32
@@ -11,7 +11,7 @@ RUNTIME_LINKAGE :: "strong" when (
 	ODIN_BUILD_MODE == .Dynamic ||
 	!ODIN_NO_CRT) &&
 	!IS_WASM) else "internal"
-RUNTIME_REQUIRE :: !ODIN_TILDE
+RUNTIME_REQUIRE :: false // !ODIN_TILDE

@(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
@@ -22,51 +22,7 @@ byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byt
 	return ([^]byte)(data)[:max(len, 0)]
 }

-bswap_16 :: proc "contextless" (x: u16) -> u16 {
-	return x>>8 | x<<8
-}
-
-bswap_32 :: proc "contextless" (x: u32) -> u32 {
-	return x>>24 | (x>>8)&0xff00 | (x<<8)&0xff0000 | x<<24
-}
-
-bswap_64 :: proc "contextless" (x: u64) -> u64 {
-	z := x
-	z = (z & 0x00000000ffffffff) << 32 | (z & 0xffffffff00000000) >> 32
-	z = (z & 0x0000ffff0000ffff) << 16 | (z & 0xffff0000ffff0000) >> 16
-	z = (z & 0x00ff00ff00ff00ff) << 8  | (z & 0xff00ff00ff00ff00) >> 8
-	return z
-}
-
-bswap_128 :: proc "contextless" (x: u128) -> u128 {
-	z := transmute([4]u32)x
-	z[0], z[3] = bswap_32(z[3]), bswap_32(z[0])
-	z[1], z[2] = bswap_32(z[2]), bswap_32(z[1])
-	return transmute(u128)z
-}
-
-bswap_f16 :: proc "contextless" (f: f16) -> f16 {
-	x := transmute(u16)f
-	z := bswap_16(x)
-	return transmute(f16)z
-
-}
-
-bswap_f32 :: proc "contextless" (f: f32) -> f32 {
-	x := transmute(u32)f
-	z := bswap_32(x)
-	return transmute(f32)z
-
-}
-
-bswap_f64 :: proc "contextless" (f: f64) -> f64 {
-	x := transmute(u64)f
-	z := bswap_64(x)
-	return transmute(f64)z
-}
-
-
-is_power_of_two_int :: #force_inline proc(x: int) -> bool {
+is_power_of_two_int :: #force_inline proc "contextless" (x: int) -> bool {
 	if x <= 0 {
 		return false
 	}
@@ -84,7 +40,7 @@ align_forward_int :: #force_inline proc(ptr, align: int) -> int {
 	return p
 }

-is_power_of_two_uintptr :: #force_inline proc(x: uintptr) -> bool {
+is_power_of_two_uintptr :: #force_inline proc "contextless" (x: uintptr) -> bool {
 	if x <= 0 {
 		return false
 	}
@@ -608,36 +564,6 @@ string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) {
 	return r, size
 }

-
-abs_f16 :: #force_inline proc "contextless" (x: f16) -> f16 {
-	return -x if x < 0 else x
-}
-abs_f32 :: #force_inline proc "contextless" (x: f32) -> f32 {
-	return -x if x < 0 else x
-}
-abs_f64 :: #force_inline proc "contextless" (x: f64) -> f64 {
-	return -x if x < 0 else x
-}
-
-min_f16 :: #force_inline proc "contextless" (a, b: f16) -> f16 {
-	return a if a < b else b
-}
-min_f32 :: #force_inline proc "contextless" (a, b: f32) -> f32 {
-	return a if a < b else b
-}
-min_f64 :: #force_inline proc "contextless" (a, b: f64) -> f64 {
-	return a if a < b else b
-}
-max_f16 :: #force_inline proc "contextless" (a, b: f16) -> f16 {
-	return a if a > b else b
-}
-max_f32 :: #force_inline proc "contextless" (a, b: f32) -> f32 {
-	return a if a > b else b
-}
-max_f64 :: #force_inline proc "contextless" (a, b: f64) -> f64 {
-	return a if a > b else b
-}
-
 abs_complex32 :: #force_inline proc "contextless" (x: complex32) -> f16 {
 	p, q := abs(real(x)), abs(imag(x))
 	if p < q {
@@ -1036,9 +962,11 @@ udivmodti4 :: proc "c" (a, b: u128, rem: ^u128) -> u128 {
 	return udivmod128(a, b, rem)
 }

-@(link_name="__udivti3", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
-udivti3 :: proc "c" (a, b: u128) -> u128 {
-	return udivmodti4(a, b, nil)
+when !IS_WASM {
+	@(link_name="__udivti3", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
+	udivti3 :: proc "c" (a, b: u128) -> u128 {
+		return udivmodti4(a, b, nil)
+	}
 }


@@ -1108,3 +1036,25 @@ fixdfti :: proc(a: u64) -> i128 {
 	}

 }
+
+
+
+__write_bits :: proc "contextless" (dst, src: [^]byte, offset: uintptr, size: uintptr) {
+	for i in 0..<size {
+		j := offset+i
+		the_bit := byte((src[i/8]) & (1<<(i&7)) != 0)
+		b := the_bit<<(j&7)
+		dst[j/8] &~= b
+		dst[j/8] |=  b
+	}
+}
+
+__read_bits :: proc "contextless" (dst, src: [^]byte, offset: uintptr, size: uintptr) {
+	for j in 0..<size {
+		i := offset+j
+		the_bit := byte((src[i/8]) & (1<<(i&7)) != 0)
+		b := the_bit<<(j&7)
+		dst[j/8] &~= b
+		dst[j/8] |=  b
+	}
+}
--- a/base/runtime/os_specific.odin
+++ b/base/runtime/os_specific.odin
@@ -0,0 +1,7 @@
+package runtime
+
+_OS_Errno :: distinct int
+
+stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+	return _stderr_write(data)
+}
--- a/base/runtime/os_specific_bsd.odin
+++ b/base/runtime/os_specific_bsd.odin
@@ -0,0 +1,22 @@
+//+build freebsd, openbsd
+//+private
+package runtime
+
+foreign import libc "system:c"
+
+@(default_calling_convention="c")
+foreign libc {
+	@(link_name="write")
+	_unix_write :: proc(fd: i32, buf: rawptr, size: int) -> int ---
+
+	__error :: proc() -> ^i32 ---
+}
+
+_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+	ret := _unix_write(2, raw_data(data), len(data))
+	if ret < len(data) {
+		err := __error()
+		return int(ret), _OS_Errno(err^ if err != nil else 0)
+	}
+	return int(ret), 0
+}
--- a/base/runtime/os_specific_darwin.odin
+++ b/base/runtime/os_specific_darwin.odin
@@ -0,0 +1,15 @@
+//+build darwin
+//+private
+package runtime
+
+import "base:intrinsics"
+
+_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+	WRITE  :: 0x2000004
+	STDERR :: 2
+	ret := intrinsics.syscall(WRITE, STDERR, uintptr(raw_data(data)), uintptr(len(data)))
+	if ret < 0 {
+		return 0, _OS_Errno(-ret)
+	}
+	return int(ret), 0
+}
--- a/base/runtime/os_specific_freestanding.odin
+++ b/base/runtime/os_specific_freestanding.odin
@@ -1,7 +1,8 @@
 //+build freestanding
+//+private
 package runtime

 // TODO(bill): reimplement `os.write`
-_os_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	return 0, -1
 }
--- a/base/runtime/os_specific_haiku.odin
+++ b/base/runtime/os_specific_haiku.odin
@@ -0,0 +1,21 @@
+//+build haiku
+//+private
+package runtime
+
+foreign import libc "system:c"
+
+foreign libc {
+	@(link_name="write")
+	_unix_write :: proc(fd: i32, buf: rawptr, size: int) -> int ---
+
+	_errnop :: proc() -> ^i32 ---
+}
+
+_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+	ret := _unix_write(2, raw_data(data), len(data))
+	if ret < len(data) {
+		err := _errnop()
+		return int(ret), _OS_Errno(err^ if err != nil else 0)
+	}
+	return int(ret), 0
+}
--- a/base/runtime/os_specific_js.odin
+++ b/base/runtime/os_specific_js.odin
@@ -1,9 +1,10 @@
 //+build js
+//+private
 package runtime

 foreign import "odin_env"

-_os_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	foreign odin_env {
 		write :: proc "contextless" (fd: u32, p: []byte) ---
 	}
--- a/base/runtime/os_specific_linux.odin
+++ b/base/runtime/os_specific_linux.odin
@@ -0,0 +1,24 @@
+//+private
+package runtime
+
+import "base:intrinsics"
+
+_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+	when ODIN_ARCH == .amd64 {
+		SYS_write :: uintptr(1)
+	} else when ODIN_ARCH == .arm64 {
+		SYS_write :: uintptr(64)
+	} else when ODIN_ARCH == .i386 {
+		SYS_write :: uintptr(4)
+	} else when ODIN_ARCH == .arm32 {
+		SYS_write :: uintptr(4)
+	}
+
+	stderr :: 2
+
+	ret := int(intrinsics.syscall(SYS_write, uintptr(stderr), uintptr(raw_data(data)), uintptr(len(data))))
+	if ret < 0 && ret > -4096 {
+		return 0, _OS_Errno(-ret)
+	}
+	return ret, 0
+}
--- a/base/runtime/os_specific_wasi.odin
+++ b/base/runtime/os_specific_wasi.odin
@@ -1,9 +1,10 @@
 //+build wasi
+//+private
 package runtime

 import "core:sys/wasm/wasi"

-_os_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	data := (wasi.ciovec_t)(data)
 	n, err := wasi.fd_write(1, {data})
 	return int(n), _OS_Errno(err)
--- a/base/runtime/os_specific_windows.odin
+++ b/base/runtime/os_specific_windows.odin
@@ -0,0 +1,51 @@
+//+build windows
+//+private
+package runtime
+
+foreign import kernel32 "system:Kernel32.lib"
+
+@(private="file")
+@(default_calling_convention="system")
+foreign kernel32 {
+	// NOTE(bill): The types are not using the standard names (e.g. DWORD and LPVOID) to just minimizing the dependency
+
+	// stderr_write
+	GetStdHandle         :: proc(which: u32) -> rawptr ---
+	SetHandleInformation :: proc(hObject: rawptr, dwMask: u32, dwFlags: u32) -> b32 ---
+	WriteFile            :: proc(hFile: rawptr, lpBuffer: rawptr, nNumberOfBytesToWrite: u32, lpNumberOfBytesWritten: ^u32, lpOverlapped: rawptr) -> b32 ---
+	GetLastError         :: proc() -> u32 ---
+}
+
+_stderr_write :: proc "contextless" (data: []byte) -> (n: int, err: _OS_Errno) #no_bounds_check {
+	if len(data) == 0 {
+		return 0, 0
+	}
+
+	STD_ERROR_HANDLE :: ~u32(0) -12 + 1
+	HANDLE_FLAG_INHERIT :: 0x00000001
+	MAX_RW :: 1<<30
+
+	h := GetStdHandle(STD_ERROR_HANDLE)
+	when size_of(uintptr) == 8 {
+		SetHandleInformation(h, HANDLE_FLAG_INHERIT, 0)
+	}
+
+	single_write_length: u32
+	total_write: i64
+	length := i64(len(data))
+
+	for total_write < length {
+		remaining := length - total_write
+		to_write := u32(min(i32(remaining), MAX_RW))
+
+		e := WriteFile(h, &data[total_write], to_write, &single_write_length, nil)
+		if single_write_length <= 0 || !e {
+			err = _OS_Errno(GetLastError())
+			n = int(total_write)
+			return
+		}
+		total_write += i64(single_write_length)
+	}
+	n = int(total_write)
+	return
+}
--- a/base/runtime/print.odin
+++ b/base/runtime/print.odin
@@ -123,13 +123,13 @@ encode_rune :: proc "contextless" (c: rune) -> ([4]u8, int) {
 }

 print_string :: proc "contextless" (str: string) -> (n: int) {
-	n, _ = os_write(transmute([]byte)str)
+	n, _ = stderr_write(transmute([]byte)str)
 	return
 }

 print_strings :: proc "contextless" (args: ..string) -> (n: int) {
 	for str in args {
-		m, err := os_write(transmute([]byte)str)
+		m, err := stderr_write(transmute([]byte)str)
 		n += m
 		if err != 0 {
 			break
@@ -139,7 +139,7 @@ print_strings :: proc "contextless" (args: ..string) -> (n: int) {
 }

 print_byte :: proc "contextless" (b: byte) -> (n: int) {
-	n, _ = os_write([]byte{b})
+	n, _ = stderr_write([]byte{b})
 	return
 }

@@ -178,7 +178,7 @@ print_rune :: proc "contextless" (r: rune) -> int #no_bounds_check {
 	}

 	b, n := encode_rune(r)
-	m, _ := os_write(b[:n])
+	m, _ := stderr_write(b[:n])
 	return m
 }

@@ -194,7 +194,7 @@ print_u64 :: proc "contextless" (x: u64) #no_bounds_check {
 	}
 	i -= 1; a[i] = _INTEGER_DIGITS_VAR[u % b]

-	os_write(a[i:])
+	stderr_write(a[i:])
 }


@@ -216,7 +216,7 @@ print_i64 :: proc "contextless" (x: i64) #no_bounds_check {
 		i -= 1; a[i] = '-'
 	}

-	os_write(a[i:])
+	stderr_write(a[i:])
 }

 print_uint    :: proc "contextless" (x: uint)    { print_u64(u64(x)) }
@@ -459,6 +459,20 @@ print_type :: proc "contextless" (ti: ^Type_Info) {
 		}
 		print_byte(']')

+	case Type_Info_Bit_Field:
+		print_string("bit_field ")
+		print_type(info.backing_type)
+		print_string(" {")
+		for name, i in info.names {
+			if i > 0 { print_string(", ") }
+			print_string(name)
+			print_string(": ")
+			print_type(info.types[i])
+			print_string(" | ")
+			print_u64(u64(info.bit_sizes[i]))
+		}
+		print_byte('}')
+

 	case Type_Info_Simd_Vector:
 		print_string("#simd[")
--- a/base/runtime/procs.odin
+++ b/base/runtime/procs.odin
--- a/base/runtime/procs_darwin.odin
+++ b/base/runtime/procs_darwin.odin
@@ -3,7 +3,7 @@ package runtime

 foreign import "system:Foundation.framework"

-import "core:intrinsics"
+import "base:intrinsics"

 objc_id :: ^intrinsics.objc_object
 objc_Class :: ^intrinsics.objc_class
--- a/base/runtime/procs_js.odin
+++ b/base/runtime/procs_js.odin
--- a/base/runtime/procs_wasm.odin
+++ b/base/runtime/procs_wasm.odin
@@ -7,19 +7,25 @@ ti_int :: struct #raw_union {
 	all: i128,
 }

+@(private="file")
+ti_uint :: struct #raw_union {
+	using s: struct { lo, hi: u64 },
+	all: u128,
+}
+
@(link_name="__ashlti3", linkage="strong")
-__ashlti3 :: proc "contextless" (a: i128, b_: u32) -> i128 {
+__ashlti3 :: proc "contextless" (la, ha: u64, b_: u32) -> i128 {
 	bits_in_dword :: size_of(u32)*8
 	b := u32(b_)
 	
 	input, result: ti_int
-	input.all = a
+	input.lo, input.hi = la, ha
 	if b & bits_in_dword != 0 {
 		result.lo = 0
 		result.hi = input.lo << (b-bits_in_dword)
 	} else {
 		if b == 0 {
-			return a
+			return input.all
 		}
 		result.lo = input.lo<<b
 		result.hi = (input.hi<<b) | (input.lo>>(bits_in_dword-b))
@@ -29,12 +35,20 @@ __ashlti3 :: proc "contextless" (a: i128, b_: u32) -> i128 {


@(link_name="__multi3", linkage="strong")
-__multi3 :: proc "contextless" (a, b: i128) -> i128 {
+__multi3 :: proc "contextless" (la, ha, lb, hb: u64) -> i128 {
 	x, y, r: ti_int
-	
-	x.all = a
-	y.all = b
+
+	x.lo, x.hi = la, ha
+	y.lo, y.hi = lb, hb
 	r.all = i128(x.lo * y.lo) // TODO this is incorrect
 	r.hi += x.hi*y.lo + x.lo*y.hi
 	return r.all
-}
+}
+
+@(link_name="__udivti3", linkage="strong")
+udivti3 :: proc "c" (la, ha, lb, hb: u64) -> u128 {
+	a, b: ti_uint
+	a.lo, a.hi = la, ha
+	b.lo, b.hi = lb, hb
+	return udivmodti4(a.all, b.all, nil)
+}
--- a/base/runtime/procs_windows_amd64.asm
+++ b/base/runtime/procs_windows_amd64.asm
--- a/base/runtime/procs_windows_amd64.odin
+++ b/base/runtime/procs_windows_amd64.odin
--- a/base/runtime/procs_windows_i386.odin
+++ b/base/runtime/procs_windows_i386.odin
--- a/base/runtime/udivmod128.odin
+++ b/base/runtime/udivmod128.odin
@@ -1,6 +1,6 @@
 package runtime

-import "core:intrinsics"
+import "base:intrinsics"

 udivmod128 :: proc "c" (a, b: u128, rem: ^u128) -> u128 {
 	_ctz :: intrinsics.count_trailing_zeros
--- a/build_odin.sh
+++ b/build_odin.sh
@@ -56,15 +56,14 @@ fi

 case "$OS_NAME" in
 Darwin)
-	if [ "$OS_ARCH" == "arm64" ]; then
+	if [ "$OS_ARCH" = "arm64" ]; then
 		if [ $LLVM_VERSION_MAJOR -lt 13 ] || [ $LLVM_VERSION_MAJOR -gt 17 ]; then
 			error "Darwin Arm64 requires LLVM 13, 14 or 17"
 		fi
 	fi

 	CXXFLAGS="$CXXFLAGS $($LLVM_CONFIG --cxxflags --ldflags)"
-	LDFLAGS="$LDFLAGS -liconv -ldl -framework System"
-	LDFLAGS="$LDFLAGS -lLLVM-C"
+	LDFLAGS="$LDFLAGS -liconv -ldl -framework System -lLLVM"
 	;;
 FreeBSD)
 	CXXFLAGS="$CXXFLAGS $($LLVM_CONFIG --cxxflags --ldflags)"
@@ -83,6 +82,11 @@ OpenBSD)
 	LDFLAGS="$LDFLAGS -liconv"
 	LDFLAGS="$LDFLAGS $($LLVM_CONFIG --libs core native --system-libs)"
 	;;
+Haiku)
+	CXXFLAGS="$CXXFLAGS $($LLVM_CONFIG --cxxflags --ldflags) -I/system/develop/headers/private/shared -I/system/develop/headers/private/kernel"
+	LDFLAGS="$LDFLAGS -liconv"
+	LDFLAGS="$LDFLAGS $($LLVM_CONFIG --libs core native --system-libs)"
+	;;
 *)
 	error "Platform \"$OS_NAME\" unsupported"
 	;;
@@ -97,7 +101,7 @@ build_odin() {
 		EXTRAFLAGS="-O3"
 		;;
 	release-native)
-		if [ "$OS_ARCH" == "arm64" ]; then
+		if [ "$OS_ARCH" = "arm64" ]; then
 			# Use preferred flag for Arm (ie arm64 / aarch64 / etc)
 			EXTRAFLAGS="-O3 -mcpu=native"
 		else
--- a/ci/upload_create_nightly.sh
+++ b/ci/upload_create_nightly.sh
@@ -1,5 +1,7 @@
 #!/bin/bash

+set -e
+
 bucket=$1
 platform=$2
 artifact=$3
@@ -9,5 +11,15 @@ filename="odin-$platform-nightly+$now.zip"

 echo "Creating archive $filename from $artifact and uploading to $bucket"

-7z a -bd "output/$filename" -r "$artifact"
-b2 upload-file --noProgress "$bucket" "output/$filename" "nightly/$filename"
+# If this is already zipped up (done before artifact upload to keep permissions in tact), just move it.
+if [ "${artifact: -4}" == ".zip" ]
+then
+	echo "Artifact already a zip"
+	mkdir -p "output"
+	mv "$artifact" "output/$filename"
+else
+	echo "Artifact needs to be zipped"
+	7z a -bd "output/$filename" -r "$artifact"
+fi
+
+b2 upload-file --noProgress "$bucket" "output/$filename" "nightly/$filename"
--- a/core/bufio/scanner.odin
+++ b/core/bufio/scanner.odin
@@ -4,7 +4,7 @@ import "core:bytes"
 import "core:io"
 import "core:mem"
 import "core:unicode/utf8"
-import "core:intrinsics"
+import "base:intrinsics"

 // Extra errors returns by scanning procedures
 Scanner_Extra_Error :: enum i32 {
--- a/core/bufio/writer.odin
+++ b/core/bufio/writer.odin
@@ -226,7 +226,6 @@ writer_to_writer :: proc(b: ^Writer) -> (s: io.Writer) {



-@(private)
 _writer_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offset: i64, whence: io.Seek_From) -> (n: i64, err: io.Error) {
 	b := (^Writer)(stream_data)
 	#partial switch mode {
--- a/core/c/c.odin
+++ b/core/c/c.odin
@@ -1,6 +1,6 @@
 package c

-import builtin "core:builtin"
+import builtin "base:builtin"

 char           :: builtin.u8  // assuming -funsigned-char

@@ -104,3 +104,13 @@ NULL           :: rawptr(uintptr(0))
 NDEBUG         :: !ODIN_DEBUG

 CHAR_BIT :: 8
+
+// Since there are no types in C with an alignment larger than that of
+// max_align_t, which cannot be larger than sizeof(long double) as any other
+// exposed type wouldn't be valid C, the maximum alignment possible in a
+// strictly conformant C implementation is 16 on the platforms we care about.
+// The choice of 4096 bytes for storage of this type is more than enough on all
+// relevant platforms.
+va_list :: struct #align(16) {
+	_: [4096]u8,
+}
--- a/core/c/libc/complex.odin
+++ b/core/c/libc/complex.odin
@@ -67,7 +67,7 @@ foreign libc {
 	crealf  :: proc(z: complex_float) -> float ---
 }

-import builtin "core:builtin"
+import builtin "base:builtin"

 complex_float  :: distinct builtin.complex64
 complex_double :: distinct builtin.complex128
--- a/core/c/libc/errno.odin
+++ b/core/c/libc/errno.odin
@@ -80,6 +80,24 @@ when ODIN_OS == .Darwin {
 	ERANGE :: 34
 }

+when ODIN_OS == .Haiku {
+	@(private="file")
+	@(default_calling_convention="c")
+	foreign libc {
+		@(link_name="_errnop")
+		_get_errno :: proc() -> ^int ---
+	}
+
+	@(private="file")
+	B_GENERAL_ERROR_BASE :: min(i32)
+	@(private="file")
+	B_POSIX_ERROR_BASE   :: B_GENERAL_ERROR_BASE + 0x7000
+
+	EDOM   :: B_POSIX_ERROR_BASE + 16
+	EILSEQ :: B_POSIX_ERROR_BASE + 38
+	ERANGE :: B_POSIX_ERROR_BASE + 17
+}
+
 // Odin has no way to make an identifier "errno" behave as a function call to
 // read the value, or to produce an lvalue such that you can assign a different
 // error value to errno. To work around this, just expose it as a function like
--- a/core/c/libc/math.odin
+++ b/core/c/libc/math.odin
@@ -2,7 +2,7 @@ package libc

 // 7.12 Mathematics

-import "core:intrinsics"
+import "base:intrinsics"

 when ODIN_OS == .Windows {
 	foreign import libc "system:libucrt.lib"
--- a/core/c/libc/stdarg.odin
+++ b/core/c/libc/stdarg.odin
@@ -2,7 +2,9 @@ package libc

 // 7.16 Variable arguments

-import "core:intrinsics"
+import "base:intrinsics"
+
+import "core:c"

@(private="file")
@(default_calling_convention="none")
@@ -12,15 +14,7 @@ foreign _ {
 	@(link_name="llvm.va_copy")  _va_copy  :: proc(dst, src: ^i8) ---
 }

-// Since there are no types in C with an alignment larger than that of
-// max_align_t, which cannot be larger than sizeof(long double) as any other
-// exposed type wouldn't be valid C, the maximum alignment possible in a
-// strictly conformant C implementation is 16 on the platforms we care about.
-// The choice of 4096 bytes for storage of this type is more than enough on all
-// relevant platforms.
-va_list :: struct #align(16) {
-	_: [4096]u8,
-}
+va_list :: c.va_list

 va_start :: #force_inline proc(ap: ^va_list, _: any) {
 	_va_start(cast(^i8)ap)
--- a/core/c/libc/stdatomic.odin
+++ b/core/c/libc/stdatomic.odin
@@ -2,7 +2,7 @@ package libc

 // 7.17 Atomics

-import "core:intrinsics"
+import "base:intrinsics"

 ATOMIC_BOOL_LOCK_FREE     :: true
 ATOMIC_CHAR_LOCK_FREE     :: true
--- a/core/c/libc/stdio.odin
+++ b/core/c/libc/stdio.odin
@@ -163,6 +163,36 @@ when ODIN_OS == .Darwin {
 	}
 }

+when ODIN_OS == .Haiku {
+	fpos_t :: distinct i64
+	
+	_IOFBF        :: 0
+	_IOLBF        :: 1
+	_IONBF        :: 2
+
+	BUFSIZ        :: 8192
+
+	EOF           :: int(-1)
+
+	FOPEN_MAX     :: 128
+
+	FILENAME_MAX  :: 256
+
+	L_tmpnam      :: 512
+
+	SEEK_SET      :: 0
+	SEEK_CUR      :: 1
+	SEEK_END      :: 2
+
+	TMP_MAX       :: 32768
+
+	foreign libc {
+		stderr: ^FILE
+		stdin:  ^FILE
+		stdout: ^FILE
+	}
+}
+
@(default_calling_convention="c")
 foreign libc {
 	// 7.21.4 Operations on files
--- a/core/c/libc/string.odin
+++ b/core/c/libc/string.odin
@@ -1,6 +1,6 @@
 package libc

-import "core:runtime"
+import "base:runtime"

 // 7.24 String handling

--- a/core/c/libc/time.odin
+++ b/core/c/libc/time.odin
@@ -45,7 +45,7 @@ when ODIN_OS == .Windows {
 	}
 }

-when ODIN_OS == .Linux || ODIN_OS == .FreeBSD || ODIN_OS == .Darwin || ODIN_OS == .OpenBSD {
+when ODIN_OS == .Linux || ODIN_OS == .FreeBSD || ODIN_OS == .Darwin || ODIN_OS == .OpenBSD || ODIN_OS == .Haiku {
 	@(default_calling_convention="c")
 	foreign libc {
 		// 7.27.2 Time manipulation functions
--- a/core/c/libc/wctype.odin
+++ b/core/c/libc/wctype.odin
@@ -29,7 +29,11 @@ when ODIN_OS == .Windows {
 } else when ODIN_OS == .FreeBSD {
 	wctrans_t :: distinct int
 	wctype_t  :: distinct ulong
-	
+
+} else when ODIN_OS == .Haiku {
+	wctrans_t :: distinct i32
+	wctype_t  :: distinct i32
+
 }

@(default_calling_convention="c")
--- a/core/compress/common.odin
+++ b/core/compress/common.odin
@@ -12,7 +12,7 @@ package compress

 import "core:io"
 import "core:bytes"
-import "core:runtime"
+import "base:runtime"

 /*
 	These settings bound how much compression algorithms will allocate for their output buffer.
--- a/core/compress/shoco/shoco.odin
+++ b/core/compress/shoco/shoco.odin
@@ -11,7 +11,7 @@
 // package shoco is an implementation of the shoco short string compressor
 package shoco

-import "core:intrinsics"
+import "base:intrinsics"
 import "core:compress"

 Shoco_Pack :: struct {
--- a/core/container/avl/avl.odin
+++ b/core/container/avl/avl.odin
@@ -0,0 +1,678 @@
+/*
+package avl implements an AVL tree.
+
+The implementation is non-intrusive, and non-recursive.
+*/
+package container_avl
+
+import "base:intrinsics"
+import "base:runtime"
+import "core:slice"
+
+_ :: intrinsics
+_ :: runtime
+
+// Originally based on the CC0 implementation by Eric Biggers
+// See: https://github.com/ebiggers/avl_tree/
+
+// Direction specifies the traversal direction for a tree iterator.
+Direction :: enum i8 {
+	// Backward is the in-order backwards direction.
+	Backward = -1,
+	// Forward is the in-order forwards direction.
+	Forward  = 1,
+}
+
+// Ordering specifies order when inserting/finding values into the tree.
+Ordering :: slice.Ordering
+
+// Tree is an AVL tree.
+Tree :: struct($Value: typeid) {
+	// user_data is a parameter that will be passed to the on_remove
+	// callback.
+	user_data: rawptr,
+	// on_remove is an optional callback that can be called immediately
+	// after a node is removed from the tree.
+	on_remove: proc(value: Value, user_data: rawptr),
+
+	_root:           ^Node(Value),
+	_node_allocator: runtime.Allocator,
+	_cmp_fn:         proc(a, b: Value) -> Ordering,
+	_size:           int,
+}
+
+// Node is an AVL tree node.
+//
+// WARNING: It is unsafe to mutate value if the node is part of a tree
+// if doing so will alter the Node's sort position relative to other
+// elements in the tree.
+Node :: struct($Value: typeid) {
+	value: Value,
+
+	_parent:  ^Node(Value),
+	_left:    ^Node(Value),
+	_right:   ^Node(Value),
+	_balance: i8,
+}
+
+// Iterator is a tree iterator.
+//
+// WARNING: It is unsafe to modify the tree while iterating, except via
+// the iterator_remove method.
+Iterator :: struct($Value: typeid) {
+	_tree:        ^Tree(Value),
+	_cur:         ^Node(Value),
+	_next:        ^Node(Value),
+	_direction:   Direction,
+	_called_next: bool,
+}
+
+// init initializes a tree.
+init :: proc {
+	init_ordered,
+	init_cmp,
+}
+
+// init_cmp initializes a tree.
+init_cmp :: proc(
+	t: ^$T/Tree($Value),
+	cmp_fn: proc(a, b: Value) -> Ordering,
+	node_allocator := context.allocator,
+) {
+	t._root = nil
+	t._node_allocator = node_allocator
+	t._cmp_fn = cmp_fn
+	t._size = 0
+}
+
+// init_ordered initializes a tree containing ordered items, with
+// a comparison function that results in an ascending order sort.
+init_ordered :: proc(
+	t: ^$T/Tree($Value),
+	node_allocator := context.allocator,
+) where intrinsics.type_is_ordered_numeric(Value) {
+	init_cmp(t, slice.cmp_proc(Value), node_allocator)
+}
+
+// destroy de-initializes a tree.
+destroy :: proc(t: ^$T/Tree($Value), call_on_remove: bool = true) {
+	iter := iterator(t, Direction.Forward)
+	for _ in iterator_next(&iter) {
+		iterator_remove(&iter, call_on_remove)
+	}
+}
+
+// len returns the number of elements in the tree.
+len :: proc "contextless" (t: ^$T/Tree($Value)) -> int {
+	return t._size
+}
+
+// first returns the first node in the tree (in-order) or nil iff
+// the tree is empty.
+first :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
+	return tree_first_or_last_in_order(t, Direction.Backward)
+}
+
+// last returns the last element in the tree (in-order) or nil iff
+// the tree is empty.
+last :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
+	return tree_first_or_last_in_order(t, Direction.Forward)
+}
+
+// find finds the value in the tree, and returns the corresponding
+// node or nil iff the value is not present.
+find :: proc(t: ^$T/Tree($Value), value: Value) -> ^Node(Value) {
+	cur := t._root
+	descend_loop: for cur != nil {
+		switch t._cmp_fn(value, cur.value) {
+		case .Less:
+			cur = cur._left
+		case .Greater:
+			cur = cur._right
+		case .Equal:
+			break descend_loop
+		}
+	}
+
+	return cur
+}
+
+// find_or_insert attempts to insert the value into the tree, and returns
+// the node, a boolean indicating if the value was inserted, and the
+// node allocator error if relevant.  If the value is already
+// present, the existing node is returned un-altered.
+find_or_insert :: proc(
+	t: ^$T/Tree($Value),
+	value: Value,
+) -> (
+	n: ^Node(Value),
+	inserted: bool,
+	err: runtime.Allocator_Error,
+) {
+	n_ptr := &t._root
+	for n_ptr^ != nil {
+		n = n_ptr^
+		switch t._cmp_fn(value, n.value) {
+		case .Less:
+			n_ptr = &n._left
+		case .Greater:
+			n_ptr = &n._right
+		case .Equal:
+			return
+		}
+	}
+
+	parent := n
+	n = new(Node(Value), t._node_allocator) or_return
+	n.value = value
+	n._parent = parent
+	n_ptr^ = n
+	tree_rebalance_after_insert(t, n)
+
+	t._size += 1
+	inserted = true
+
+	return
+}
+
+// remove removes a node or value from the tree, and returns true iff the
+// removal was successful.  While the node's value will be left intact,
+// the node itself will be freed via the tree's node allocator.
+remove :: proc {
+	remove_value,
+	remove_node,
+}
+
+// remove_value removes a value from the tree, and returns true iff the
+// removal was successful.  While the node's value will be left intact,
+// the node itself will be freed via the tree's node allocator.
+remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = true) -> bool {
+	n := find(t, value)
+	if n == nil {
+		return false
+	}
+	return remove_node(t, n, call_on_remove)
+}
+
+// remove_node removes a node from the tree, and returns true iff the
+// removal was successful.  While the node's value will be left intact,
+// the node itself will be freed via the tree's node allocator.
+remove_node :: proc(t: ^$T/Tree($Value), node: ^Node(Value), call_on_remove: bool = true) -> bool {
+	if node._parent == node || (node._parent == nil && t._root != node) {
+		return false
+	}
+	defer {
+		if call_on_remove && t.on_remove != nil {
+			t.on_remove(node.value, t.user_data)
+		}
+		free(node, t._node_allocator)
+	}
+
+	parent: ^Node(Value)
+	left_deleted: bool
+
+	t._size -= 1
+	if node._left != nil && node._right != nil {
+		parent, left_deleted = tree_swap_with_successor(t, node)
+	} else {
+		child := node._left
+		if child == nil {
+			child = node._right
+		}
+		parent = node._parent
+		if parent != nil {
+			if node == parent._left {
+				parent._left = child
+				left_deleted = true
+			} else {
+				parent._right = child
+				left_deleted = false
+			}
+			if child != nil {
+				child._parent = parent
+			}
+		} else {
+			if child != nil {
+				child._parent = parent
+			}
+			t._root = child
+			node_reset(node)
+			return true
+		}
+	}
+
+	for {
+		if left_deleted {
+			parent = tree_handle_subtree_shrink(t, parent, +1, &left_deleted)
+		} else {
+			parent = tree_handle_subtree_shrink(t, parent, -1, &left_deleted)
+		}
+		if parent == nil {
+			break
+		}
+	}
+	node_reset(node)
+
+	return true
+}
+
+// iterator returns a tree iterator in the specified direction.
+iterator :: proc "contextless" (t: ^$T/Tree($Value), direction: Direction) -> Iterator(Value) {
+	it: Iterator(Value)
+	it._tree = transmute(^Tree(Value))t
+	it._direction = direction
+
+	iterator_first(&it)
+
+	return it
+}
+
+// iterator_from_pos returns a tree iterator in the specified direction,
+// spanning the range [pos, last] (inclusive).
+iterator_from_pos :: proc "contextless" (
+	t: ^$T/Tree($Value),
+	pos: ^Node(Value),
+	direction: Direction,
+) -> Iterator(Value) {
+	it: Iterator(Value)
+	it._tree = transmute(^Tree(Value))t
+	it._direction = direction
+	it._next = nil
+	it._called_next = false
+
+	if it._cur = pos; pos != nil {
+		it._next = node_next_or_prev_in_order(it._cur, it._direction)
+	}
+
+	return it
+}
+
+// iterator_get returns the node currently pointed to by the iterator,
+// or nil iff the node has been removed, the tree is empty, or the end
+// of the tree has been reached.
+iterator_get :: proc "contextless" (it: ^$I/Iterator($Value)) -> ^Node(Value) {
+	return it._cur
+}
+
+// iterator_remove removes the node currently pointed to by the iterator,
+// and returns true iff the removal was successful.  Semantics are the
+// same as the Tree remove.
+iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -> bool {
+	if it._cur == nil {
+		return false
+	}
+
+	ok := remove_node(it._tree, it._cur, call_on_remove)
+	if ok {
+		it._cur = nil
+	}
+
+	return ok
+}
+
+// iterator_next advances the iterator and returns the (node, true) or
+// or (nil, false) iff the end of the tree has been reached.
+//
+// Note: The first call to iterator_next will return the first node instead
+// of advancing the iterator.
+iterator_next :: proc "contextless" (it: ^$I/Iterator($Value)) -> (^Node(Value), bool) {
+	// This check is needed so that the first element gets returned from
+	// a brand-new iterator, and so that the somewhat contrived case where
+	// iterator_remove is called before the first call to iterator_next
+	// returns the correct value.
+	if !it._called_next {
+		it._called_next = true
+
+		// There can be the contrived case where iterator_remove is
+		// called before ever calling iterator_next, which needs to be
+		// handled as an actual call to next.
+		//
+		// If this happens it._cur will be nil, so only return the
+		// first value, if it._cur is valid.
+		if it._cur != nil {
+			return it._cur, true
+		}
+	}
+
+	if it._next == nil {
+		return nil, false
+	}
+
+	it._cur = it._next
+	it._next = node_next_or_prev_in_order(it._cur, it._direction)
+
+	return it._cur, true
+}
+
+@(private)
+tree_first_or_last_in_order :: proc "contextless" (
+	t: ^$T/Tree($Value),
+	direction: Direction,
+) -> ^Node(Value) {
+	first, sign := t._root, i8(direction)
+	if first != nil {
+		for {
+			tmp := node_get_child(first, +sign)
+			if tmp == nil {
+				break
+			}
+			first = tmp
+		}
+	}
+
+	return first
+}
+
+@(private)
+tree_replace_child :: proc "contextless" (
+	t: ^$T/Tree($Value),
+	parent, old_child, new_child: ^Node(Value),
+) {
+	if parent != nil {
+		if old_child == parent._left {
+			parent._left = new_child
+		} else {
+			parent._right = new_child
+		}
+	} else {
+		t._root = new_child
+	}
+}
+
+@(private)
+tree_rotate :: proc "contextless" (t: ^$T/Tree($Value), a: ^Node(Value), sign: i8) {
+	b := node_get_child(a, -sign)
+	e := node_get_child(b, +sign)
+	p := a._parent
+
+	node_set_child(a, -sign, e)
+	a._parent = b
+
+	node_set_child(b, +sign, a)
+	b._parent = p
+
+	if e != nil {
+		e._parent = a
+	}
+
+	tree_replace_child(t, p, a, b)
+}
+
+@(private)
+tree_double_rotate :: proc "contextless" (
+	t: ^$T/Tree($Value),
+	b, a: ^Node(Value),
+	sign: i8,
+) -> ^Node(Value) {
+	e := node_get_child(b, +sign)
+	f := node_get_child(e, -sign)
+	g := node_get_child(e, +sign)
+	p := a._parent
+	e_bal := e._balance
+
+	node_set_child(a, -sign, g)
+	a_bal := -e_bal
+	if sign * e_bal >= 0 {
+		a_bal = 0
+	}
+	node_set_parent_balance(a, e, a_bal)
+
+	node_set_child(b, +sign, f)
+	b_bal := -e_bal
+	if sign * e_bal <= 0 {
+		b_bal = 0
+	}
+	node_set_parent_balance(b, e, b_bal)
+
+	node_set_child(e, +sign, a)
+	node_set_child(e, -sign, b)
+	node_set_parent_balance(e, p, 0)
+
+	if g != nil {
+		g._parent = a
+	}
+
+	if f != nil {
+		f._parent = b
+	}
+
+	tree_replace_child(t, p, a, e)
+
+	return e
+}
+
+@(private)
+tree_handle_subtree_growth :: proc "contextless" (
+	t: ^$T/Tree($Value),
+	node, parent: ^Node(Value),
+	sign: i8,
+) -> bool {
+	old_balance_factor := parent._balance
+	if old_balance_factor == 0 {
+		node_adjust_balance_factor(parent, sign)
+		return false
+	}
+
+	new_balance_factor := old_balance_factor + sign
+	if new_balance_factor == 0 {
+		node_adjust_balance_factor(parent, sign)
+		return true
+	}
+
+	if sign * node._balance > 0 {
+		tree_rotate(t, parent, -sign)
+		node_adjust_balance_factor(parent, -sign)
+		node_adjust_balance_factor(node, -sign)
+	} else {
+		tree_double_rotate(t, node, parent, -sign)
+	}
+
+	return true
+}
+
+@(private)
+tree_rebalance_after_insert :: proc "contextless" (t: ^$T/Tree($Value), inserted: ^Node(Value)) {
+	node, parent := inserted, inserted._parent
+	switch {
+	case parent == nil:
+		return
+	case node == parent._left:
+		node_adjust_balance_factor(parent, -1)
+	case:
+		node_adjust_balance_factor(parent, +1)
+	}
+
+	if parent._balance == 0 {
+		return
+	}
+
+	for done := false; !done; {
+		node = parent
+		if parent = node._parent; parent == nil {
+			return
+		}
+
+		if node == parent._left {
+			done = tree_handle_subtree_growth(t, node, parent, -1)
+		} else {
+			done = tree_handle_subtree_growth(t, node, parent, +1)
+		}
+	}
+}
+
+@(private)
+tree_swap_with_successor :: proc "contextless" (
+	t: ^$T/Tree($Value),
+	x: ^Node(Value),
+) -> (
+	^Node(Value),
+	bool,
+) {
+	ret: ^Node(Value)
+	left_deleted: bool
+
+	y := x._right
+	if y._left == nil {
+		ret = y
+	} else {
+		q: ^Node(Value)
+
+		for {
+			q = y
+			if y = y._left; y._left == nil {
+				break
+			}
+		}
+
+		if q._left = y._right; q._left != nil {
+			q._left._parent = q
+		}
+		y._right = x._right
+		x._right._parent = y
+		ret = q
+		left_deleted = true
+	}
+
+	y._left = x._left
+	x._left._parent = y
+
+	y._parent = x._parent
+	y._balance = x._balance
+
+	tree_replace_child(t, x._parent, x, y)
+
+	return ret, left_deleted
+}
+
+@(private)
+tree_handle_subtree_shrink :: proc "contextless" (
+	t: ^$T/Tree($Value),
+	parent: ^Node(Value),
+	sign: i8,
+	left_deleted: ^bool,
+) -> ^Node(Value) {
+	old_balance_factor := parent._balance
+	if old_balance_factor == 0 {
+		node_adjust_balance_factor(parent, sign)
+		return nil
+	}
+
+	node: ^Node(Value)
+	new_balance_factor := old_balance_factor + sign
+	if new_balance_factor == 0 {
+		node_adjust_balance_factor(parent, sign)
+		node = parent
+	} else {
+		node = node_get_child(parent, sign)
+		if sign * node._balance >= 0 {
+			tree_rotate(t, parent, -sign)
+			if node._balance == 0 {
+				node_adjust_balance_factor(node, -sign)
+				return nil
+			}
+			node_adjust_balance_factor(parent, -sign)
+			node_adjust_balance_factor(node, -sign)
+		} else {
+			node = tree_double_rotate(t, node, parent, -sign)
+		}
+	}
+
+	parent := parent
+	if parent = node._parent; parent != nil {
+		left_deleted^ = node == parent._left
+	}
+	return parent
+}
+
+@(private)
+node_reset :: proc "contextless" (n: ^Node($Value)) {
+	// Mostly pointless as n will be deleted after this is called, but
+	// attempt to be able to catch cases of n not being in the tree.
+	n._parent = n
+	n._left = nil
+	n._right = nil
+	n._balance = 0
+}
+
+@(private)
+node_set_parent_balance :: #force_inline proc "contextless" (
+	n, parent: ^Node($Value),
+	balance: i8,
+) {
+	n._parent = parent
+	n._balance = balance
+}
+
+@(private)
+node_get_child :: #force_inline proc "contextless" (n: ^Node($Value), sign: i8) -> ^Node(Value) {
+	if sign < 0 {
+		return n._left
+	}
+	return n._right
+}
+
+@(private)
+node_next_or_prev_in_order :: proc "contextless" (
+	n: ^Node($Value),
+	direction: Direction,
+) -> ^Node(Value) {
+	next, tmp: ^Node(Value)
+	sign := i8(direction)
+
+	if next = node_get_child(n, +sign); next != nil {
+		for {
+			tmp = node_get_child(next, -sign)
+			if tmp == nil {
+				break
+			}
+			next = tmp
+		}
+	} else {
+		tmp, next = n, n._parent
+		for next != nil && tmp == node_get_child(next, +sign) {
+			tmp, next = next, next._parent
+		}
+	}
+	return next
+}
+
+@(private)
+node_set_child :: #force_inline proc "contextless" (
+	n: ^Node($Value),
+	sign: i8,
+	child: ^Node(Value),
+) {
+	if sign < 0 {
+		n._left = child
+	} else {
+		n._right = child
+	}
+}
+
+@(private)
+node_adjust_balance_factor :: #force_inline proc "contextless" (n: ^Node($Value), amount: i8) {
+	n._balance += amount
+}
+
+@(private)
+iterator_first :: proc "contextless" (it: ^Iterator($Value)) {
+	// This is private because behavior when the user manually calls
+	// iterator_first followed by iterator_next is unintuitive, since
+	// the first call to iterator_next MUST return the first node
+	// instead of advancing so that `for node in iterator_next(&next)`
+	// works as expected.
+
+	switch it._direction {
+	case .Forward:
+		it._cur = tree_first_or_last_in_order(it._tree, .Backward)
+	case .Backward:
+		it._cur = tree_first_or_last_in_order(it._tree, .Forward)
+	}
+
+	it._next = nil
+	it._called_next = false
+
+	if it._cur != nil {
+		it._next = node_next_or_prev_in_order(it._cur, it._direction)
+	}
+}
--- a/core/container/bit_array/bit_array.odin
+++ b/core/container/bit_array/bit_array.odin
@@ -1,6 +1,6 @@
 package dynamic_bit_array

-import "core:intrinsics"
+import "base:intrinsics"
 import "core:mem"

 /*
--- a/core/container/intrusive/list/intrusive_list.odin
+++ b/core/container/intrusive/list/intrusive_list.odin
@@ -1,6 +1,6 @@
 package container_intrusive_list

-import "core:intrinsics"
+import "base:intrinsics"

 // An intrusive doubly-linked list
 //
--- a/core/container/lru/lru_cache.odin
+++ b/core/container/lru/lru_cache.odin
@@ -1,7 +1,7 @@
 package container_lru

-import "core:runtime"
-import "core:intrinsics"
+import "base:runtime"
+import "base:intrinsics"
 _ :: runtime
 _ :: intrinsics

--- a/core/container/priority_queue/priority_queue.odin
+++ b/core/container/priority_queue/priority_queue.odin
@@ -1,6 +1,6 @@
 package container_priority_queue

-import "core:builtin"
+import "base:builtin"

 Priority_Queue :: struct($T: typeid) {
 	queue: [dynamic]T,
--- a/core/container/queue/queue.odin
+++ b/core/container/queue/queue.odin
@@ -1,7 +1,7 @@
 package container_queue

-import "core:builtin"
-import "core:runtime"
+import "base:builtin"
+import "base:runtime"
 _ :: runtime

 // Dynamically resizable double-ended queue/ring-buffer
--- a/core/container/small_array/small_array.odin
+++ b/core/container/small_array/small_array.odin
@@ -1,7 +1,7 @@
 package container_small_array

-import "core:builtin"
-import "core:runtime"
+import "base:builtin"
+import "base:runtime"
 _ :: runtime

 Small_Array :: struct($N: int, $T: typeid) where N >= 0 {
--- a/core/container/topological_sort/topological_sort.odin
+++ b/core/container/topological_sort/topological_sort.odin
@@ -3,8 +3,8 @@
 // map type is being used to accelerate lookups.
 package container_topological_sort

-import "core:intrinsics"
-import "core:runtime"
+import "base:intrinsics"
+import "base:runtime"
 _ :: intrinsics
 _ :: runtime

--- a/core/crypto/README.md
+++ b/core/crypto/README.md
@@ -1,84 +1,30 @@
 # crypto

-A cryptography library for the Odin language
+A cryptography library for the Odin language.

 ## Supported

-This library offers various algorithms implemented in Odin.
-Please see the chart below for some of the options.
-
-## Hashing algorithms
-
-| Algorithm                                                                                                    |                  |
-|:-------------------------------------------------------------------------------------------------------------|:-----------------|
-| [BLAKE2B](https://datatracker.ietf.org/doc/html/rfc7693)                                                     | &#10004;&#65039; |
-| [BLAKE2S](https://datatracker.ietf.org/doc/html/rfc7693)                                                     | &#10004;&#65039; |
-| [SHA-2](https://csrc.nist.gov/csrc/media/publications/fips/180/2/archive/2002-08-01/documents/fips180-2.pdf) | &#10004;&#65039; |
-| [SHA-3](https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf)                                            | &#10004;&#65039; |
-| [SHAKE](https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf)                                            | &#10004;&#65039; |
-| [SM3](https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02)                                           | &#10004;&#65039; |
-| legacy/[Keccak](https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf)                                    | &#10004;&#65039; |
-| legacy/[MD5](https://datatracker.ietf.org/doc/html/rfc1321)                                                  | &#10004;&#65039; |
-| legacy/[SHA-1](https://datatracker.ietf.org/doc/html/rfc3174)                                                | &#10004;&#65039; |
-
-#### High level API
-
-Each hash algorithm contains a procedure group named `hash`, or if the algorithm provides more than one digest size `hash_<size>`\*.
-Included in these groups are six procedures.
- `hash_string` - Hash a given string and return the computed hash. Just calls `hash_bytes` internally
- `hash_bytes` - Hash a given byte slice and return the computed hash
- `hash_string_to_buffer` - Hash a given string and put the computed hash in the second proc parameter. Just calls `hash_bytes_to_buffer` internally
- `hash_bytes_to_buffer` - Hash a given string and put the computed hash in the second proc parameter. The destination buffer has to be at least as big as the digest size of the hash
- `hash_stream` - Takes a stream from io.Stream and returns the computed hash from it
- `hash_file` - Takes a file handle and returns the computed hash from it. A second optional boolean parameter controls if the file is streamed (this is the default) or read at once (set to true)
-
-\* On some algorithms there is another part to the name, since they might offer control about additional parameters.
-For instance, `SHA-2` offers different sizes.
-Computing a 512-bit hash is therefore achieved by calling `sha2.hash_512(...)`.
-
-#### Low level API
-
-The above mentioned procedures internally call three procedures: `init`, `update` and `final`.
-You may also directly call them, if you wish.
-
-#### Example
-
-```odin
-package crypto_example
-
-// Import the desired package
-import "core:crypto/blake2b"
-
-main :: proc() {
-    input := "foo"
-
-    // Compute the hash, using the high level API
-    computed_hash := blake2b.hash(input)
-
-    // Variant that takes a destination buffer, instead of returning the computed hash
-    hash := make([]byte, sha2.DIGEST_SIZE) // @note: Destination buffer has to be at least as big as the digest size of the hash
-    blake2b.hash(input, hash[:])
-
-    // Compute the hash, using the low level API
-    ctx: blake2b.Context
-    computed_hash_low: [blake2b.DIGEST_SIZE]byte
-    blake2b.init(&ctx)
-    blake2b.update(&ctx, transmute([]byte)input)
-    blake2b.final(&ctx, computed_hash_low[:])
-}
-```
-For example uses of all available algorithms, please see the tests within `tests/core/crypto`.
+This package offers various algorithms implemented in Odin, along with
+useful helpers such as access to the system entropy source, and a
+constant-time byte comparison.

 ## Implementation considerations

 - The crypto packages are not thread-safe.
 - Best-effort is make to mitigate timing side-channels on reasonable
-  architectures. Architectures that are known to be unreasonable include
+  architectures.  Architectures that are known to be unreasonable include
  but are not limited to i386, i486, and WebAssembly.
- Some but not all of the packages attempt to santize sensitive data,
-  however this is not done consistently through the library at the moment.
-  As Thomas Pornin puts it "In general, such memory cleansing is a fool's
-  quest."
+- Implementations assume a 64-bit architecture (64-bit integer arithmetic
+  is fast, and includes add-with-carry, sub-with-borrow, and full-result
+  multiply).
+- Hardware sidechannels are explicitly out of scope for this package.
+  Notable examples include but are not limited to:
+  - Power/RF side-channels etc.
+  - Fault injection attacks etc.
+  - Hardware vulnerabilities ("apply mitigations or buy a new CPU").
+- The packages attempt to santize sensitive data, however this is, and
+  will remain a "best-effort" implementation decision.  As Thomas Pornin
+  puts it "In general, such memory cleansing is a fool's quest."
 - All of these packages have not received independent third party review.

 ## License
--- a/core/crypto/_blake2/blake2.odin
+++ b/core/crypto/_blake2/blake2.odin
@@ -11,6 +11,7 @@ package _blake2
 */

 import "core:encoding/endian"
+import "core:mem"

 BLAKE2S_BLOCK_SIZE :: 64
 BLAKE2S_SIZE :: 32
@@ -28,7 +29,6 @@ Blake2s_Context :: struct {
 	is_keyed:     bool,
 	size:         byte,
 	is_last_node: bool,
-	cfg:          Blake2_Config,

 	is_initialized: bool,
 }
@@ -44,7 +44,6 @@ Blake2b_Context :: struct {
 	is_keyed:     bool,
 	size:         byte,
 	is_last_node: bool,
-	cfg:          Blake2_Config,

 	is_initialized: bool,
 }
@@ -83,62 +82,61 @@ BLAKE2B_IV := [8]u64 {
 	0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
 }

-init :: proc(ctx: ^$T) {
+init :: proc(ctx: ^$T, cfg: ^Blake2_Config) {
 	when T == Blake2s_Context {
-		block_size :: BLAKE2S_BLOCK_SIZE
 		max_size :: BLAKE2S_SIZE
 	} else when T == Blake2b_Context {
-		block_size :: BLAKE2B_BLOCK_SIZE
 		max_size :: BLAKE2B_SIZE
 	}

-	if ctx.cfg.size > max_size {
+	if cfg.size > max_size {
 		panic("blake2: requested output size exceeeds algorithm max")
 	}

-	p := make([]byte, block_size)
-	defer delete(p)
+	// To save having to allocate a scratch buffer, use the internal
+	// data buffer (`ctx.x`), as it is exactly the correct size.
+	p := ctx.x[:]

-	p[0] = ctx.cfg.size
-	p[1] = byte(len(ctx.cfg.key))
+	p[0] = cfg.size
+	p[1] = byte(len(cfg.key))

-	if ctx.cfg.salt != nil {
+	if cfg.salt != nil {
 		when T == Blake2s_Context {
-			copy(p[16:], ctx.cfg.salt)
+			copy(p[16:], cfg.salt)
 		} else when T == Blake2b_Context {
-			copy(p[32:], ctx.cfg.salt)
+			copy(p[32:], cfg.salt)
 		}
 	}
-	if ctx.cfg.person != nil {
+	if cfg.person != nil {
 		when T == Blake2s_Context {
-			copy(p[24:], ctx.cfg.person)
+			copy(p[24:], cfg.person)
 		} else when T == Blake2b_Context {
-			copy(p[48:], ctx.cfg.person)
+			copy(p[48:], cfg.person)
 		}
 	}

-	if ctx.cfg.tree != nil {
-		p[2] = ctx.cfg.tree.(Blake2_Tree).fanout
-		p[3] = ctx.cfg.tree.(Blake2_Tree).max_depth
-		endian.unchecked_put_u32le(p[4:], ctx.cfg.tree.(Blake2_Tree).leaf_size)
+	if cfg.tree != nil {
+		p[2] = cfg.tree.(Blake2_Tree).fanout
+		p[3] = cfg.tree.(Blake2_Tree).max_depth
+		endian.unchecked_put_u32le(p[4:], cfg.tree.(Blake2_Tree).leaf_size)
 		when T == Blake2s_Context {
-			p[8] = byte(ctx.cfg.tree.(Blake2_Tree).node_offset)
-			p[9] = byte(ctx.cfg.tree.(Blake2_Tree).node_offset >> 8)
-			p[10] = byte(ctx.cfg.tree.(Blake2_Tree).node_offset >> 16)
-			p[11] = byte(ctx.cfg.tree.(Blake2_Tree).node_offset >> 24)
-			p[12] = byte(ctx.cfg.tree.(Blake2_Tree).node_offset >> 32)
-			p[13] = byte(ctx.cfg.tree.(Blake2_Tree).node_offset >> 40)
-			p[14] = ctx.cfg.tree.(Blake2_Tree).node_depth
-			p[15] = ctx.cfg.tree.(Blake2_Tree).inner_hash_size
+			p[8] = byte(cfg.tree.(Blake2_Tree).node_offset)
+			p[9] = byte(cfg.tree.(Blake2_Tree).node_offset >> 8)
+			p[10] = byte(cfg.tree.(Blake2_Tree).node_offset >> 16)
+			p[11] = byte(cfg.tree.(Blake2_Tree).node_offset >> 24)
+			p[12] = byte(cfg.tree.(Blake2_Tree).node_offset >> 32)
+			p[13] = byte(cfg.tree.(Blake2_Tree).node_offset >> 40)
+			p[14] = cfg.tree.(Blake2_Tree).node_depth
+			p[15] = cfg.tree.(Blake2_Tree).inner_hash_size
 		} else when T == Blake2b_Context {
-			endian.unchecked_put_u64le(p[8:], ctx.cfg.tree.(Blake2_Tree).node_offset)
-			p[16] = ctx.cfg.tree.(Blake2_Tree).node_depth
-			p[17] = ctx.cfg.tree.(Blake2_Tree).inner_hash_size
+			endian.unchecked_put_u64le(p[8:], cfg.tree.(Blake2_Tree).node_offset)
+			p[16] = cfg.tree.(Blake2_Tree).node_depth
+			p[17] = cfg.tree.(Blake2_Tree).inner_hash_size
 		}
 	} else {
 		p[2], p[3] = 1, 1
 	}
-	ctx.size = ctx.cfg.size
+	ctx.size = cfg.size
 	for i := 0; i < 8; i += 1 {
 		when T == Blake2s_Context {
 			ctx.h[i] = BLAKE2S_IV[i] ~ endian.unchecked_get_u32le(p[i * 4:])
@@ -147,11 +145,14 @@ init :: proc(ctx: ^$T) {
 			ctx.h[i] = BLAKE2B_IV[i] ~ endian.unchecked_get_u64le(p[i * 8:])
 		}
 	}
-	if ctx.cfg.tree != nil && ctx.cfg.tree.(Blake2_Tree).is_last_node {
+
+	mem.zero(&ctx.x, size_of(ctx.x)) // Done with the scratch space, no barrier.
+
+	if cfg.tree != nil && cfg.tree.(Blake2_Tree).is_last_node {
 		ctx.is_last_node = true
 	}
-	if len(ctx.cfg.key) > 0 {
-		copy(ctx.padded_key[:], ctx.cfg.key)
+	if len(cfg.key) > 0 {
+		copy(ctx.padded_key[:], cfg.key)
 		update(ctx, ctx.padded_key[:])
 		ctx.is_keyed = true
 	}
@@ -194,22 +195,40 @@ update :: proc(ctx: ^$T, p: []byte) {
 	ctx.nx += copy(ctx.x[ctx.nx:], p)
 }

-final :: proc(ctx: ^$T, hash: []byte) {
+final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
 	assert(ctx.is_initialized)

+	ctx := ctx
+	if finalize_clone {
+		tmp_ctx: T
+		clone(&tmp_ctx, ctx)
+		ctx = &tmp_ctx
+	}
+	defer(reset(ctx))
+
 	when T == Blake2s_Context {
-		if len(hash) < int(ctx.cfg.size) {
+		if len(hash) < int(ctx.size) {
 			panic("crypto/blake2s: invalid destination digest size")
 		}
 		blake2s_final(ctx, hash)
 	} else when T == Blake2b_Context {
-		if len(hash) < int(ctx.cfg.size) {
+		if len(hash) < int(ctx.size) {
 			panic("crypto/blake2b: invalid destination digest size")
 		}
 		blake2b_final(ctx, hash)
 	}
+}

-	ctx.is_initialized = false
+clone :: proc(ctx, other: ^$T) {
+	ctx^ = other^
+}
+
+reset :: proc(ctx: ^$T) {
+	if !ctx.is_initialized {
+		return
+	}
+
+	mem.zero_explicit(ctx, size_of(ctx^))
 }

@(private)
--- a/core/crypto/_edwards25519/edwards25519.odin
+++ b/core/crypto/_edwards25519/edwards25519.odin
@@ -0,0 +1,428 @@
+package _edwards25519
+
+/*
+This implements the edwards25519 composite-order group, primarily for
+the purpose of implementing X25519, Ed25519, and ristretto255.  Use of
+this package for other purposes is NOT RECOMMENDED.
+
+See:
+- https://eprint.iacr.org/2011/368.pdf
+- https://datatracker.ietf.org/doc/html/rfc8032
+- https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html
+*/
+
+import "base:intrinsics"
+import "core:crypto"
+import field "core:crypto/_fiat/field_curve25519"
+import "core:mem"
+
+// Group_Element is an edwards25519 group element, as extended homogenous
+// coordinates, which represents the affine point `(x, y)` as `(X, Y, Z, T)`,
+// with the relations `x = X/Z`, `y = Y/Z`, and `x * y = T/Z`.
+//
+// d = -121665/121666 = 37095705934669439343138083508754565189542113879843219016388785533085940283555
+// a = -1
+//
+// Notes:
+// - There is considerable scope for optimization, however that
+//   will not change the external API, and this is simple and reasonably
+//   performant.
+// - The API delibarately makes it hard to create arbitrary group
+//   elements that are not on the curve.
+// - The group element decoding routine takes the opinionated stance of
+//   rejecting non-canonical encodings.
+
+FE_D := field.Tight_Field_Element {
+	929955233495203,
+	466365720129213,
+	1662059464998953,
+	2033849074728123,
+	1442794654840575,
+}
+@(private)
+FE_A := field.Tight_Field_Element {
+	2251799813685228,
+	2251799813685247,
+	2251799813685247,
+	2251799813685247,
+	2251799813685247,
+}
+@(private)
+FE_D2 := field.Tight_Field_Element {
+	1859910466990425,
+	932731440258426,
+	1072319116312658,
+	1815898335770999,
+	633789495995903,
+}
+@(private)
+GE_BASEPOINT := Group_Element {
+	field.Tight_Field_Element {
+		1738742601995546,
+		1146398526822698,
+		2070867633025821,
+		562264141797630,
+		587772402128613,
+	},
+	field.Tight_Field_Element {
+		1801439850948184,
+		1351079888211148,
+		450359962737049,
+		900719925474099,
+		1801439850948198,
+	},
+	field.Tight_Field_Element{1, 0, 0, 0, 0},
+	field.Tight_Field_Element {
+		1841354044333475,
+		16398895984059,
+		755974180946558,
+		900171276175154,
+		1821297809914039,
+	},
+}
+GE_IDENTITY := Group_Element {
+	field.Tight_Field_Element{0, 0, 0, 0, 0},
+	field.Tight_Field_Element{1, 0, 0, 0, 0},
+	field.Tight_Field_Element{1, 0, 0, 0, 0},
+	field.Tight_Field_Element{0, 0, 0, 0, 0},
+}
+
+Group_Element :: struct {
+	x: field.Tight_Field_Element,
+	y: field.Tight_Field_Element,
+	z: field.Tight_Field_Element,
+	t: field.Tight_Field_Element,
+}
+
+ge_clear :: proc "contextless" (ge: ^Group_Element) {
+	mem.zero_explicit(ge, size_of(Group_Element))
+}
+
+ge_set :: proc "contextless" (ge, a: ^Group_Element) {
+	field.fe_set(&ge.x, &a.x)
+	field.fe_set(&ge.y, &a.y)
+	field.fe_set(&ge.z, &a.z)
+	field.fe_set(&ge.t, &a.t)
+}
+
+@(require_results)
+ge_set_bytes :: proc "contextless" (ge: ^Group_Element, b: []byte) -> bool {
+	if len(b) != 32 {
+		intrinsics.trap()
+	}
+	b_ := transmute(^[32]byte)(raw_data(b))
+
+	// Do the work in a scratch element, so that ge is unchanged on
+	// failure.
+	tmp: Group_Element = ---
+	defer ge_clear(&tmp)
+	field.fe_one(&tmp.z) // Z = 1
+
+	// The encoding is the y-coordinate, with the x-coordinate polarity
+	// (odd/even) encoded in the MSB.
+	field.fe_from_bytes(&tmp.y, b_) // ignores high bit
+
+	// Recover the candidate x-coordinate via the curve equation:
+	// x^2 = (y^2 - 1) / (d * y^2 + 1) (mod p)
+
+	fe_tmp := &tmp.t // Use this to store intermediaries.
+	fe_one := &tmp.z
+
+	// x = num = y^2 - 1
+	field.fe_carry_square(fe_tmp, field.fe_relax_cast(&tmp.y)) // fe_tmp = y^2
+	field.fe_carry_sub(&tmp.x, fe_tmp, fe_one)
+
+	// den = d * y^2 + 1
+	field.fe_carry_mul(fe_tmp, field.fe_relax_cast(fe_tmp), field.fe_relax_cast(&FE_D))
+	field.fe_carry_add(fe_tmp, fe_tmp, fe_one)
+
+	// x = invsqrt(den/num)
+	is_square := field.fe_carry_sqrt_ratio_m1(
+		&tmp.x,
+		field.fe_relax_cast(&tmp.x),
+		field.fe_relax_cast(fe_tmp),
+	)
+	if is_square == 0 {
+		return false
+	}
+
+	// Pick the right x-coordinate.
+	field.fe_cond_negate(&tmp.x, &tmp.x, int(b[31] >> 7))
+
+	// t = x * y
+	field.fe_carry_mul(&tmp.t, field.fe_relax_cast(&tmp.x), field.fe_relax_cast(&tmp.y))
+
+	// Reject non-canonical encodings of ge.
+	buf: [32]byte = ---
+	field.fe_to_bytes(&buf, &tmp.y)
+	buf[31] |= byte(field.fe_is_negative(&tmp.x)) << 7
+	is_canonical := crypto.compare_constant_time(b, buf[:])
+
+	ge_cond_assign(ge, &tmp, is_canonical)
+
+	mem.zero_explicit(&buf, size_of(buf))
+
+	return is_canonical == 1
+}
+
+ge_bytes :: proc "contextless" (ge: ^Group_Element, dst: []byte) {
+	if len(dst) != 32 {
+		intrinsics.trap()
+	}
+	dst_ := transmute(^[32]byte)(raw_data(dst))
+
+	// Convert the element to affine (x, y) representation.
+	x, y, z_inv: field.Tight_Field_Element = ---, ---, ---
+	field.fe_carry_inv(&z_inv, field.fe_relax_cast(&ge.z))
+	field.fe_carry_mul(&x, field.fe_relax_cast(&ge.x), field.fe_relax_cast(&z_inv))
+	field.fe_carry_mul(&y, field.fe_relax_cast(&ge.y), field.fe_relax_cast(&z_inv))
+
+	// Encode the y-coordinate.
+	field.fe_to_bytes(dst_, &y)
+
+	// Copy the least significant bit of the x-coordinate to the most
+	// significant bit of the encoded y-coordinate.
+	dst_[31] |= byte((x[0] & 1) << 7)
+
+	field.fe_clear_vec([]^field.Tight_Field_Element{&x, &y, &z_inv})
+}
+
+ge_identity :: proc "contextless" (ge: ^Group_Element) {
+	field.fe_zero(&ge.x)
+	field.fe_one(&ge.y)
+	field.fe_one(&ge.z)
+	field.fe_zero(&ge.t)
+}
+
+ge_generator :: proc "contextless" (ge: ^Group_Element) {
+	ge_set(ge, &GE_BASEPOINT)
+}
+
+@(private)
+Addend_Group_Element :: struct {
+	y2_minus_x2:  field.Loose_Field_Element, // t1
+	y2_plus_x2:   field.Loose_Field_Element, // t3
+	k_times_t2:   field.Tight_Field_Element, // t4
+	two_times_z2: field.Loose_Field_Element, // t5
+}
+
+@(private)
+ge_addend_set :: proc "contextless" (ge_a: ^Addend_Group_Element, ge: ^Group_Element) {
+	field.fe_sub(&ge_a.y2_minus_x2, &ge.y, &ge.x)
+	field.fe_add(&ge_a.y2_plus_x2, &ge.y, &ge.x)
+	field.fe_carry_mul(&ge_a.k_times_t2, field.fe_relax_cast(&FE_D2), field.fe_relax_cast(&ge.t))
+	field.fe_add(&ge_a.two_times_z2, &ge.z, &ge.z)
+}
+
+@(private)
+ge_addend_conditional_assign :: proc "contextless" (ge_a, a: ^Addend_Group_Element, ctrl: int) {
+	field.fe_cond_select(&ge_a.y2_minus_x2, &ge_a.y2_minus_x2, &a.y2_minus_x2, ctrl)
+	field.fe_cond_select(&ge_a.y2_plus_x2, &ge_a.y2_plus_x2, &a.y2_plus_x2, ctrl)
+	field.fe_cond_select(&ge_a.k_times_t2, &ge_a.k_times_t2, &a.k_times_t2, ctrl)
+	field.fe_cond_select(&ge_a.two_times_z2, &ge_a.two_times_z2, &a.two_times_z2, ctrl)
+}
+
+@(private)
+Add_Scratch :: struct {
+	A, B, C, D: field.Tight_Field_Element,
+	E, F, G, H: field.Loose_Field_Element,
+	t0, t2:     field.Loose_Field_Element,
+}
+
+ge_add :: proc "contextless" (ge, a, b: ^Group_Element) {
+	b_: Addend_Group_Element = ---
+	ge_addend_set(&b_, b)
+
+	scratch: Add_Scratch = ---
+	ge_add_addend(ge, a, &b_, &scratch)
+
+	mem.zero_explicit(&b_, size_of(Addend_Group_Element))
+	mem.zero_explicit(&scratch, size_of(Add_Scratch))
+}
+
+@(private)
+ge_add_addend :: proc "contextless" (
+	ge, a: ^Group_Element,
+	b: ^Addend_Group_Element,
+	scratch: ^Add_Scratch,
+) {
+	// https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html#addition-add-2008-hwcd-3
+	// Assumptions: k=2*d.
+	//
+	// t0 = Y1-X1
+	// t1 = Y2-X2
+	// A = t0*t1
+	// t2 = Y1+X1
+	// t3 = Y2+X2
+	// B = t2*t3
+	// t4 = k*T2
+	// C = T1*t4
+	// t5 = 2*Z2
+	// D = Z1*t5
+	// E = B-A
+	// F = D-C
+	// G = D+C
+	// H = B+A
+	// X3 = E*F
+	// Y3 = G*H
+	// T3 = E*H
+	// Z3 = F*G
+	//
+	// In order to make the scalar multiply faster, the addend is provided
+	// as a `Addend_Group_Element` with t1, t3, t4, and t5 precomputed, as
+	// it is trivially obvious that those are the only values used by the
+	// formula that are directly dependent on `b`, and are only dependent
+	// on `b` and constants.  This saves 1 sub, 2 adds, and 1 multiply,
+	// each time the intermediate representation can be reused.
+
+	A, B, C, D := &scratch.A, &scratch.B, &scratch.C, &scratch.D
+	E, F, G, H := &scratch.E, &scratch.F, &scratch.G, &scratch.H
+	t0, t2 := &scratch.t0, &scratch.t2
+
+	field.fe_sub(t0, &a.y, &a.x)
+	t1 := &b.y2_minus_x2
+	field.fe_carry_mul(A, t0, t1)
+	field.fe_add(t2, &a.y, &a.x)
+	t3 := &b.y2_plus_x2
+	field.fe_carry_mul(B, t2, t3)
+	t4 := &b.k_times_t2
+	field.fe_carry_mul(C, field.fe_relax_cast(&a.t), field.fe_relax_cast(t4))
+	t5 := &b.two_times_z2
+	field.fe_carry_mul(D, field.fe_relax_cast(&a.z), t5)
+	field.fe_sub(E, B, A)
+	field.fe_sub(F, D, C)
+	field.fe_add(G, D, C)
+	field.fe_add(H, B, A)
+	field.fe_carry_mul(&ge.x, E, F)
+	field.fe_carry_mul(&ge.y, G, H)
+	field.fe_carry_mul(&ge.t, E, H)
+	field.fe_carry_mul(&ge.z, F, G)
+}
+
+@(private)
+Double_Scratch :: struct {
+	A, B, C, D, G: field.Tight_Field_Element,
+	t0, t2, t3:    field.Tight_Field_Element,
+	E, F, H:       field.Loose_Field_Element,
+	t1:            field.Loose_Field_Element,
+}
+
+ge_double :: proc "contextless" (ge, a: ^Group_Element, scratch: ^Double_Scratch = nil) {
+	// https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html#doubling-dbl-2008-hwcd
+	//
+	// A = X1^2
+	// B = Y1^2
+	// t0 = Z1^2
+	// C = 2*t0
+	// D = a*A
+	// t1 = X1+Y1
+	// t2 = t1^2
+	// t3 = t2-A
+	// E = t3-B
+	// G = D+B
+	// F = G-C
+	// H = D-B
+	// X3 = E*F
+	// Y3 = G*H
+	// T3 = E*H
+	// Z3 = F*G
+
+	sanitize, scratch := scratch == nil, scratch
+	if sanitize {
+		tmp: Double_Scratch = ---
+		scratch = &tmp
+	}
+
+	A, B, C, D, G := &scratch.A, &scratch.B, &scratch.C, &scratch.D, &scratch.G
+	t0, t2, t3 := &scratch.t0, &scratch.t2, &scratch.t3
+	E, F, H := &scratch.E, &scratch.F, &scratch.H
+	t1 := &scratch.t1
+
+	field.fe_carry_square(A, field.fe_relax_cast(&a.x))
+	field.fe_carry_square(B, field.fe_relax_cast(&a.y))
+	field.fe_carry_square(t0, field.fe_relax_cast(&a.z))
+	field.fe_carry_add(C, t0, t0)
+	field.fe_carry_mul(D, field.fe_relax_cast(&FE_A), field.fe_relax_cast(A))
+	field.fe_add(t1, &a.x, &a.y)
+	field.fe_carry_square(t2, t1)
+	field.fe_carry_sub(t3, t2, A)
+	field.fe_sub(E, t3, B)
+	field.fe_carry_add(G, D, B)
+	field.fe_sub(F, G, C)
+	field.fe_sub(H, D, B)
+	G_ := field.fe_relax_cast(G)
+	field.fe_carry_mul(&ge.x, E, F)
+	field.fe_carry_mul(&ge.y, G_, H)
+	field.fe_carry_mul(&ge.t, E, H)
+	field.fe_carry_mul(&ge.z, F, G_)
+
+	if sanitize {
+		mem.zero_explicit(scratch, size_of(Double_Scratch))
+	}
+}
+
+ge_negate :: proc "contextless" (ge, a: ^Group_Element) {
+	field.fe_carry_opp(&ge.x, &a.x)
+	field.fe_set(&ge.y, &a.y)
+	field.fe_set(&ge.z, &a.z)
+	field.fe_carry_opp(&ge.t, &a.t)
+}
+
+ge_cond_negate :: proc "contextless" (ge, a: ^Group_Element, ctrl: int) {
+	tmp: Group_Element = ---
+	ge_negate(&tmp, a)
+	ge_cond_assign(ge, &tmp, ctrl)
+
+	ge_clear(&tmp)
+}
+
+ge_cond_assign :: proc "contextless" (ge, a: ^Group_Element, ctrl: int) {
+	field.fe_cond_assign(&ge.x, &a.x, ctrl)
+	field.fe_cond_assign(&ge.y, &a.y, ctrl)
+	field.fe_cond_assign(&ge.z, &a.z, ctrl)
+	field.fe_cond_assign(&ge.t, &a.t, ctrl)
+}
+
+ge_cond_select :: proc "contextless" (ge, a, b: ^Group_Element, ctrl: int) {
+	field.fe_cond_select(&ge.x, &a.x, &b.x, ctrl)
+	field.fe_cond_select(&ge.y, &a.y, &b.y, ctrl)
+	field.fe_cond_select(&ge.z, &a.z, &b.z, ctrl)
+	field.fe_cond_select(&ge.t, &a.t, &b.t, ctrl)
+}
+
+@(require_results)
+ge_equal :: proc "contextless" (a, b: ^Group_Element) -> int {
+	// (x, y) ?= (x', y') -> (X/Z, Y/Z) ?= (X'/Z', Y'/Z')
+	// X/Z ?= X'/Z', Y/Z ?= Y'/Z' -> X*Z' ?= X'*Z, Y*Z' ?= Y'*Z
+	ax_bz, bx_az, ay_bz, by_az: field.Tight_Field_Element = ---, ---, ---, ---
+	field.fe_carry_mul(&ax_bz, field.fe_relax_cast(&a.x), field.fe_relax_cast(&b.z))
+	field.fe_carry_mul(&bx_az, field.fe_relax_cast(&b.x), field.fe_relax_cast(&a.z))
+	field.fe_carry_mul(&ay_bz, field.fe_relax_cast(&a.y), field.fe_relax_cast(&b.z))
+	field.fe_carry_mul(&by_az, field.fe_relax_cast(&b.y), field.fe_relax_cast(&a.z))
+
+	ret := field.fe_equal(&ax_bz, &bx_az) & field.fe_equal(&ay_bz, &by_az)
+
+	field.fe_clear_vec([]^field.Tight_Field_Element{&ax_bz, &ay_bz, &bx_az, &by_az})
+
+	return ret
+}
+
+@(require_results)
+ge_is_small_order :: proc "contextless" (ge: ^Group_Element) -> bool {
+	tmp: Group_Element = ---
+	ge_double(&tmp, ge)
+	ge_double(&tmp, &tmp)
+	ge_double(&tmp, &tmp)
+	return ge_equal(&tmp, &GE_IDENTITY) == 1
+}
+
+@(require_results)
+ge_in_prime_order_subgroup_vartime :: proc "contextless" (ge: ^Group_Element) -> bool {
+	// This is currently *very* expensive.  The faster method would be
+	// something like (https://eprint.iacr.org/2022/1164.pdf), however
+	// that is a ~50% speedup, and a lot of added complexity for something
+	// that is better solved by "just use ristretto255".
+	tmp: Group_Element = ---
+	_ge_scalarmult(&tmp, ge, &SC_ELL, true)
+	return ge_equal(&tmp, &GE_IDENTITY) == 1
+}
--- a/core/crypto/_edwards25519/edwards25519_scalar.odin
+++ b/core/crypto/_edwards25519/edwards25519_scalar.odin
@@ -0,0 +1,61 @@
+package _edwards25519
+
+import "base:intrinsics"
+import field "core:crypto/_fiat/field_scalar25519"
+import "core:mem"
+
+Scalar :: field.Montgomery_Domain_Field_Element
+
+// WARNING: This is non-canonical and only to be used when checking if
+// a group element is on the prime-order subgroup.
+@(private)
+SC_ELL := field.Non_Montgomery_Domain_Field_Element {
+	field.ELL[0],
+	field.ELL[1],
+	field.ELL[2],
+	field.ELL[3],
+}
+
+sc_set_u64 :: proc "contextless" (sc: ^Scalar, i: u64) {
+	tmp := field.Non_Montgomery_Domain_Field_Element{i, 0, 0, 0}
+	field.fe_to_montgomery(sc, &tmp)
+
+	mem.zero_explicit(&tmp, size_of(tmp))
+}
+
+@(require_results)
+sc_set_bytes :: proc "contextless" (sc: ^Scalar, b: []byte) -> bool {
+	if len(b) != 32 {
+		intrinsics.trap()
+	}
+	b_ := transmute(^[32]byte)(raw_data(b))
+	return field.fe_from_bytes(sc, b_)
+}
+
+sc_set_bytes_rfc8032 :: proc "contextless" (sc: ^Scalar, b: []byte) {
+	if len(b) != 32 {
+		intrinsics.trap()
+	}
+	b_ := transmute(^[32]byte)(raw_data(b))
+	field.fe_from_bytes_rfc8032(sc, b_)
+}
+
+sc_clear :: proc "contextless" (sc: ^Scalar) {
+	mem.zero_explicit(sc, size_of(Scalar))
+}
+
+sc_set :: field.fe_set
+sc_set_bytes_wide :: field.fe_from_bytes_wide
+sc_bytes :: field.fe_to_bytes
+
+sc_zero :: field.fe_zero
+sc_one :: field.fe_one
+
+sc_add :: field.fe_add
+sc_sub :: field.fe_sub
+sc_negate :: field.fe_opp
+sc_mul :: field.fe_mul
+sc_square :: field.fe_square
+
+sc_cond_assign :: field.fe_cond_assign
+sc_equal :: field.fe_equal
--- a/core/crypto/_edwards25519/edwards25519_scalar_mul.odin
+++ b/core/crypto/_edwards25519/edwards25519_scalar_mul.odin
@@ -0,0 +1,288 @@
+package _edwards25519
+
+import field "core:crypto/_fiat/field_scalar25519"
+import "core:math/bits"
+import "core:mem"
+
+// GE_BASEPOINT_TABLE is 1 * G, ... 15 * G, in precomputed format.
+//
+// Note: When generating, the values were reduced to Tight_Field_Element
+// ranges, even though that is not required.
+@(private)
+GE_BASEPOINT_TABLE := Multiply_Table {
+	{
+		{62697248952638, 204681361388450, 631292143396476, 338455783676468, 1213667448819585},
+		{1288382639258501, 245678601348599, 269427782077623, 1462984067271730, 137412439391563},
+		{301289933810280, 1259582250014073, 1422107436869536, 796239922652654, 1953934009299142},
+		{2, 0, 0, 0, 0},
+	},
+	{
+		{1519297034332653, 1098796920435767, 1823476547744119, 808144629470969, 2110930855619772},
+		{338005982828284, 1667856962156925, 100399270107451, 1604566703601691, 1950338038771369},
+		{1920505767731247, 1443759578976892, 1659852098357048, 1484431291070208, 275018744912646},
+		{763163817085987, 2195095074806923, 2167883174351839, 1868059999999762, 911071066608705},
+	},
+	{
+		{960627541894068, 1314966688943942, 1126875971034044, 2059608312958945, 605975666152586},
+		{1714478358025626, 2209607666607510, 1600912834284834, 496072478982142, 481970031861896},
+		{851735079403194, 1088965826757164, 141569479297499, 602804610059257, 2004026468601520},
+		{197585529552380, 324719066578543, 564481854250498, 1173818332764578, 35452976395676},
+	},
+	{
+		{1152980410747203, 2196804280851952, 25745194962557, 1915167295473129, 1266299690309224},
+		{809905889679060, 979732230071345, 1509972345538142, 188492426534402, 818965583123815},
+		{997685409185036, 1451818320876327, 2126681166774509, 2000509606057528, 235432372486854},
+		{887734189279642, 1460338685162044, 877378220074262, 102436391401299, 153369156847490},
+	},
+	{
+		{2056621900836770, 1821657694132497, 1627986892909426, 1163363868678833, 1108873376459226},
+		{1187697490593623, 1066539945237335, 885654531892000, 1357534489491782, 359370291392448},
+		{1509033452137525, 1305318174298508, 613642471748944, 1987256352550234, 1044283663101541},
+		{220105720697037, 387661783287620, 328296827867762, 360035589590664, 795213236824054},
+	},
+	{
+		{1820794733038396, 1612235121681074, 757405923441402, 1094031020892801, 231025333128907},
+		{1639067873254194, 1484176557946322, 300800382144789, 1329915446659183, 1211704578730455},
+		{641900794791527, 1711751746971612, 179044712319955, 576455585963824, 1852617592509865},
+		{743549047192397, 685091042550147, 1952415336873496, 1965124675654685, 513364998442917},
+	},
+	{
+		{1004557076870448, 1762911374844520, 1330807633622723, 384072910939787, 953849032243810},
+		{2178275058221458, 257933183722891, 376684351537894, 2010189102001786, 1981824297484148},
+		{1332915663881114, 1286540505502549, 1741691283561518, 977214932156314, 1764059494778091},
+		{429702949064027, 1368332611650677, 2019867176450999, 2212258376161746, 526160996742554},
+	},
+	{
+		{2098932988258576, 2203688382075948, 2120400160059479, 1748488020948146, 1203264167282624},
+		{677131386735829, 1850249298025188, 672782146532031, 2144145693078904, 2088656272813787},
+		{1065622343976192, 1573853211848116, 223560413590068, 333846833073379, 27832122205830},
+		{1781008836504573, 917619542051793, 544322748939913, 882577394308384, 1720521246471195},
+	},
+	{
+		{660120928379860, 2081944024858618, 1878411111349191, 424587356517195, 2111317439894005},
+		{1834193977811532, 1864164086863319, 797334633289424, 150410812403062, 2085177078466389},
+		{1438117271371866, 783915531014482, 388731514584658, 292113935417795, 1945855002546714},
+		{1678140823166658, 679103239148744, 614102761596238, 1052962498997885, 1863983323810390},
+	},
+	{
+		{1690309392496233, 1116333140326275, 1377242323631039, 717196888780674, 82724646713353},
+		{1722370213432106, 74265192976253, 264239578448472, 1714909985012994, 2216984958602173},
+		{2010482366920922, 1294036471886319, 566466395005815, 1631955803657320, 1751698647538458},
+		{1073230604155753, 1159087041338551, 1664057985455483, 127472702826203, 1339591128522371},
+	},
+	{
+		{478053307175577, 2179515791720985, 21146535423512, 1831683844029536, 462805561553981},
+		{1945267486565588, 1298536818409655, 2214511796262989, 1904981051429012, 252904800782086},
+		{268945954671210, 222740425595395, 1208025911856230, 1080418823003555, 75929831922483},
+		{1884784014268948, 643868448202966, 978736549726821, 46385971089796, 1296884812292320},
+	},
+	{
+		{1861159462859103, 7077532564710, 963010365896826, 1938780006785270, 766241051941647},
+		{1778966986051906, 1713995999765361, 1394565822271816, 1366699246468722, 1213407027149475},
+		{1978989286560907, 2135084162045594, 1951565508865477, 671788336314416, 293123929458176},
+		{902608944504080, 2167765718046481, 1285718473078022, 1222562171329269, 492109027844479},
+	},
+	{
+		{1820807832746213, 1029220580458586, 1101997555432203, 1039081975563572, 202477981158221},
+		{1866134980680205, 2222325502763386, 1830284629571201, 1046966214478970, 418381946936795},
+		{1783460633291322, 1719505443254998, 1810489639976220, 877049370713018, 2187801198742619},
+		{197118243000763, 305493867565736, 518814410156522, 1656246186645170, 901894734874934},
+	},
+	{
+		{225454942125915, 478410476654509, 600524586037746, 643450007230715, 1018615928259319},
+		{1733330584845708, 881092297970296, 507039890129464, 496397090721598, 2230888519577628},
+		{690155664737246, 1010454785646677, 753170144375012, 1651277613844874, 1622648796364156},
+		{1321310321891618, 1089655277873603, 235891750867089, 815878279563688, 1709264240047556},
+	},
+	{
+		{805027036551342, 1387174275567452, 1156538511461704, 1465897486692171, 1208567094120903},
+		{2228417017817483, 202885584970535, 2182114782271881, 2077405042592934, 1029684358182774},
+		{460447547653983, 627817697755692, 524899434670834, 1228019344939427, 740684787777653},
+		{849757462467675, 447476306919899, 422618957298818, 302134659227815, 675831828440895},
+	},
+}
+
+ge_scalarmult :: proc "contextless" (ge, p: ^Group_Element, sc: ^Scalar) {
+	tmp: field.Non_Montgomery_Domain_Field_Element
+	field.fe_from_montgomery(&tmp, sc)
+
+	_ge_scalarmult(ge, p, &tmp)
+
+	mem.zero_explicit(&tmp, size_of(tmp))
+}
+
+ge_scalarmult_basepoint :: proc "contextless" (ge: ^Group_Element, sc: ^Scalar) {
+	// Something like the comb method from "Fast and compact elliptic-curve
+	// cryptography" Section 3.3, would be more performant, but more
+	// complex.
+	//
+	// - https://eprint.iacr.org/2012/309
+	ge_scalarmult(ge, &GE_BASEPOINT, sc)
+}
+
+ge_scalarmult_vartime :: proc "contextless" (ge, p: ^Group_Element, sc: ^Scalar) {
+	tmp: field.Non_Montgomery_Domain_Field_Element
+	field.fe_from_montgomery(&tmp, sc)
+
+	_ge_scalarmult(ge, p, &tmp, true)
+}
+
+ge_double_scalarmult_basepoint_vartime :: proc "contextless" (
+	ge: ^Group_Element,
+	a: ^Scalar,
+	A: ^Group_Element,
+	b: ^Scalar,
+) {
+	// Strauss-Shamir, commonly referred to as the "Shamir trick",
+	// saves half the doublings, relative to doing this the naive way.
+	//
+	// ABGLSV-Pornin (https://eprint.iacr.org/2020/454) is faster,
+	// but significantly more complex, and has incompatibilities with
+	// mixed-order group elements.
+
+	tmp_add: Add_Scratch = ---
+	tmp_addend: Addend_Group_Element = ---
+	tmp_dbl: Double_Scratch = ---
+	tmp: Group_Element = ---
+
+	A_tbl: Multiply_Table = ---
+	mul_tbl_set(&A_tbl, A, &tmp_add)
+
+	sc_a, sc_b: field.Non_Montgomery_Domain_Field_Element
+	field.fe_from_montgomery(&sc_a, a)
+	field.fe_from_montgomery(&sc_b, b)
+
+	ge_identity(&tmp)
+	for i := 31; i >= 0; i = i - 1 {
+		limb := i / 8
+		shift := uint(i & 7) * 8
+
+		limb_byte_a := sc_a[limb] >> shift
+		limb_byte_b := sc_b[limb] >> shift
+
+		hi_a, lo_a := (limb_byte_a >> 4) & 0x0f, limb_byte_a & 0x0f
+		hi_b, lo_b := (limb_byte_b >> 4) & 0x0f, limb_byte_b & 0x0f
+
+		if i != 31 {
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+		}
+		mul_tbl_add(&tmp, &A_tbl, hi_a, &tmp_add, &tmp_addend, true)
+		mul_tbl_add(&tmp, &GE_BASEPOINT_TABLE, hi_b, &tmp_add, &tmp_addend, true)
+
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		mul_tbl_add(&tmp, &A_tbl, lo_a, &tmp_add, &tmp_addend, true)
+		mul_tbl_add(&tmp, &GE_BASEPOINT_TABLE, lo_b, &tmp_add, &tmp_addend, true)
+	}
+
+	ge_set(ge, &tmp)
+}
+
+@(private)
+_ge_scalarmult :: proc "contextless" (
+	ge, p: ^Group_Element,
+	sc: ^field.Non_Montgomery_Domain_Field_Element,
+	unsafe_is_vartime := false,
+) {
+	// Do the simplest possible thing that works and provides adequate,
+	// performance, which is windowed add-then-multiply.
+
+	tmp_add: Add_Scratch = ---
+	tmp_addend: Addend_Group_Element = ---
+	tmp_dbl: Double_Scratch = ---
+	tmp: Group_Element = ---
+
+	p_tbl: Multiply_Table = ---
+	mul_tbl_set(&p_tbl, p, &tmp_add)
+
+	ge_identity(&tmp)
+	for i := 31; i >= 0; i = i - 1 {
+		limb := i / 8
+		shift := uint(i & 7) * 8
+		limb_byte := sc[limb] >> shift
+
+		hi, lo := (limb_byte >> 4) & 0x0f, limb_byte & 0x0f
+
+		if i != 31 {
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+			ge_double(&tmp, &tmp, &tmp_dbl)
+		}
+		mul_tbl_add(&tmp, &p_tbl, hi, &tmp_add, &tmp_addend, unsafe_is_vartime)
+
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		ge_double(&tmp, &tmp, &tmp_dbl)
+		mul_tbl_add(&tmp, &p_tbl, lo, &tmp_add, &tmp_addend, unsafe_is_vartime)
+	}
+
+	ge_set(ge, &tmp)
+
+	if !unsafe_is_vartime {
+		ge_clear(&tmp)
+		mem.zero_explicit(&tmp_add, size_of(Add_Scratch))
+		mem.zero_explicit(&tmp_addend, size_of(Addend_Group_Element))
+		mem.zero_explicit(&tmp_dbl, size_of(Double_Scratch))
+	}
+}
+
+@(private)
+Multiply_Table :: [15]Addend_Group_Element // 0 = inf, which is implicit.
+
+@(private)
+mul_tbl_set :: proc "contextless" (
+	tbl: ^Multiply_Table,
+	ge: ^Group_Element,
+	tmp_add: ^Add_Scratch,
+) {
+	tmp: Group_Element = ---
+	ge_set(&tmp, ge)
+
+	ge_addend_set(&tbl[0], ge)
+	for i := 1; i < 15; i = i + 1 {
+		ge_add_addend(&tmp, &tmp, &tbl[0], tmp_add)
+		ge_addend_set(&tbl[i], &tmp)
+	}
+
+	ge_clear(&tmp)
+}
+
+@(private)
+mul_tbl_add :: proc "contextless" (
+	ge: ^Group_Element,
+	tbl: ^Multiply_Table,
+	idx: u64,
+	tmp_add: ^Add_Scratch,
+	tmp_addend: ^Addend_Group_Element,
+	unsafe_is_vartime: bool,
+) {
+	// Variable time lookup, with the addition omitted entirely if idx == 0.
+	if unsafe_is_vartime {
+		// Skip adding the point at infinity.
+		if idx != 0 {
+			ge_add_addend(ge, ge, &tbl[idx - 1], tmp_add)
+		}
+		return
+	}
+
+	// Constant time lookup.
+	tmp_addend^ = {
+		// Point at infinity (0, 1, 1, 0) in precomputed form
+		{1, 0, 0, 0, 0}, // y - x
+		{1, 0, 0, 0, 0}, // y + x
+		{0, 0, 0, 0, 0}, // t * 2d
+		{2, 0, 0, 0, 0}, // z * 2
+	}
+	for i := u64(1); i < 16; i = i + 1 {
+		_, ctrl := bits.sub_u64(0, (i ~ idx), 0)
+		ge_addend_conditional_assign(tmp_addend, &tbl[i - 1], int(~ctrl) & 1)
+	}
+	ge_add_addend(ge, ge, tmp_addend, tmp_add)
+}
--- a/core/crypto/_fiat/fiat.odin
+++ b/core/crypto/_fiat/fiat.odin
@@ -9,7 +9,7 @@ package fiat
 u1 :: distinct u8
 i1 :: distinct i8

-@(optimization_mode="none")
+@(optimization_mode = "none")
 cmovznz_u64 :: proc "contextless" (arg1: u1, arg2, arg3: u64) -> (out1: u64) {
 	x1 := (u64(arg1) * 0xffffffffffffffff)
 	x2 := ((x1 & arg3) | ((~x1) & arg2))
@@ -17,7 +17,7 @@ cmovznz_u64 :: proc "contextless" (arg1: u1, arg2, arg3: u64) -> (out1: u64) {
 	return
 }

-@(optimization_mode="none")
+@(optimization_mode = "none")
 cmovznz_u32 :: proc "contextless" (arg1: u1, arg2, arg3: u32) -> (out1: u32) {
 	x1 := (u32(arg1) * 0xffffffff)
 	x2 := ((x1 & arg3) | ((~x1) & arg2))
--- a/core/crypto/_fiat/field_curve25519/field.odin
+++ b/core/crypto/_fiat/field_curve25519/field.odin
@@ -3,14 +3,32 @@ package field_curve25519
 import "core:crypto"
 import "core:mem"

-fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element {
+fe_relax_cast :: #force_inline proc "contextless" (
+	arg1: ^Tight_Field_Element,
+) -> ^Loose_Field_Element {
 	return transmute(^Loose_Field_Element)(arg1)
 }

-fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element {
+fe_tighten_cast :: #force_inline proc "contextless" (
+	arg1: ^Loose_Field_Element,
+) -> ^Tight_Field_Element {
 	return transmute(^Tight_Field_Element)(arg1)
 }

+fe_clear :: proc "contextless" (
+	arg1: $T,
+) where T == ^Tight_Field_Element || T == ^Loose_Field_Element {
+	mem.zero_explicit(arg1, size_of(arg1^))
+}
+
+fe_clear_vec :: proc "contextless" (
+	arg1: $T,
+) where T == []^Tight_Field_Element || T == []^Loose_Field_Element {
+	for fe in arg1 {
+		fe_clear(fe)
+	}
+}
+
 fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
 	// Ignore the unused bit by copying the input and masking the bit off
 	// prior to deserialization.
@@ -23,12 +41,25 @@ fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte
 	mem.zero_explicit(&tmp1, size_of(tmp1))
 }

+fe_is_negative :: proc "contextless" (arg1: ^Tight_Field_Element) -> int {
+	tmp1: [32]byte = ---
+
+	fe_to_bytes(&tmp1, arg1)
+	ret := tmp1[0] & 1
+
+	mem.zero_explicit(&tmp1, size_of(tmp1))
+
+	return int(ret)
+}
+
 fe_equal :: proc "contextless" (arg1, arg2: ^Tight_Field_Element) -> int {
-	tmp2: [32]byte = ---
+	tmp1, tmp2: [32]byte = ---, ---

+	fe_to_bytes(&tmp1, arg1)
 	fe_to_bytes(&tmp2, arg2)
-	ret := fe_equal_bytes(arg1, &tmp2)
+	ret := crypto.compare_constant_time(tmp1[:], tmp2[:])

+	mem.zero_explicit(&tmp1, size_of(tmp1))
 	mem.zero_explicit(&tmp2, size_of(tmp2))

 	return ret
@@ -46,7 +77,11 @@ fe_equal_bytes :: proc "contextless" (arg1: ^Tight_Field_Element, arg2: ^[32]byt
 	return ret
 }

-fe_carry_pow2k :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element, arg2: uint) {
+fe_carry_pow2k :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element,
+	arg2: uint,
+) {
 	// Special case: `arg1^(2 * 0) = 1`, though this should never happen.
 	if arg2 == 0 {
 		fe_one(out1)
@@ -54,27 +89,46 @@ fe_carry_pow2k :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element,
 	}

 	fe_carry_square(out1, arg1)
-	for _ in 1..<arg2 {
+	for _ in 1 ..< arg2 {
 		fe_carry_square(out1, fe_relax_cast(out1))
 	}
 }

+fe_carry_add :: #force_inline proc "contextless" (out1, arg1, arg2: ^Tight_Field_Element) {
+	fe_add(fe_relax_cast(out1), arg1, arg2)
+	fe_carry(out1, fe_relax_cast(out1))
+}
+
+fe_carry_sub :: #force_inline proc "contextless" (out1, arg1, arg2: ^Tight_Field_Element) {
+	fe_sub(fe_relax_cast(out1), arg1, arg2)
+	fe_carry(out1, fe_relax_cast(out1))
+}
+
 fe_carry_opp :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
 	fe_opp(fe_relax_cast(out1), arg1)
 	fe_carry(out1, fe_relax_cast(out1))
 }

-fe_carry_invsqrt :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) -> int {
-	// Inverse square root taken from Monocypher.
+fe_carry_abs :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	fe_cond_negate(out1, arg1, fe_is_negative(arg1))
+}

+fe_carry_sqrt_ratio_m1 :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element, // u
+	arg2: ^Loose_Field_Element, // v
+) -> int {
+	// SQRT_RATIO_M1(u, v) from RFC 9496 - 4.2, based on the inverse
+	// square root from Monocypher.
+
+	w: Tight_Field_Element = ---
+	fe_carry_mul(&w, arg1, arg2) // u * v
+
+	// r = tmp1 = u * w^((p-5)/8)
 	tmp1, tmp2, tmp3: Tight_Field_Element = ---, ---, ---
-
-	// t0 = x^((p-5)/8)
-	// Can be achieved with a simple double & add ladder,
-	// but it would be slower.
-	fe_carry_pow2k(&tmp1, arg1, 1)
+	fe_carry_pow2k(&tmp1, fe_relax_cast(&w), 1)
 	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 2)
-	fe_carry_mul(&tmp2, arg1, fe_relax_cast(&tmp2))
+	fe_carry_mul(&tmp2, fe_relax_cast(&w), fe_relax_cast(&tmp2))
 	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), fe_relax_cast(&tmp2))
 	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 1)
 	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
@@ -93,46 +147,121 @@ fe_carry_invsqrt :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element
 	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 50)
 	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
 	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 2)
-	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), arg1)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), fe_relax_cast(&w)) // w^((p-5)/8)

-	// quartic = x^((p-1)/4)
-	quartic := &tmp2
-	fe_carry_square(quartic, fe_relax_cast(&tmp1))
-	fe_carry_mul(quartic, fe_relax_cast(quartic), arg1)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), arg1) // u * w^((p-5)/8)

-	// Serialize quartic once to save on repeated serialization/sanitization.
-	quartic_buf: [32]byte = ---
-	fe_to_bytes(&quartic_buf, quartic)
-	check := &tmp3
+	// Serialize `check` once to save on repeated serialization.
+	r, check := &tmp1, &tmp2
+	b: [32]byte = ---
+	fe_carry_square(check, fe_relax_cast(r))
+	fe_carry_mul(check, fe_relax_cast(check), arg2) // check * v
+	fe_to_bytes(&b, check)

-	fe_one(check)
-	p1 := fe_equal_bytes(check, &quartic_buf)
-	fe_carry_opp(check, check)
-	m1 := fe_equal_bytes(check, &quartic_buf)
-	fe_carry_opp(check, &SQRT_M1)
-	ms := fe_equal_bytes(check, &quartic_buf)
+	u, neg_u, neg_u_i := &tmp3, &w, check
+	fe_carry(u, arg1)
+	fe_carry_opp(neg_u, u)
+	fe_carry_mul(neg_u_i, fe_relax_cast(neg_u), fe_relax_cast(&FE_SQRT_M1))

-	// if quartic == -1 or sqrt(-1)
-	// then  isr = x^((p-1)/4) * sqrt(-1)
-	// else  isr = x^((p-1)/4)
-	fe_carry_mul(out1, fe_relax_cast(&tmp1), fe_relax_cast(&SQRT_M1))
-	fe_cond_assign(out1, &tmp1, (m1|ms) ~ 1)
+	correct_sign_sqrt := fe_equal_bytes(u, &b)
+	flipped_sign_sqrt := fe_equal_bytes(neg_u, &b)
+	flipped_sign_sqrt_i := fe_equal_bytes(neg_u_i, &b)

-	mem.zero_explicit(&tmp1, size_of(tmp1))
-	mem.zero_explicit(&tmp2, size_of(tmp2))
-	mem.zero_explicit(&tmp3, size_of(tmp3))
-	mem.zero_explicit(&quartic_buf, size_of(quartic_buf))
+	r_prime := check
+	fe_carry_mul(r_prime, fe_relax_cast(r), fe_relax_cast(&FE_SQRT_M1))
+	fe_cond_assign(r, r_prime, flipped_sign_sqrt | flipped_sign_sqrt_i)

-	return p1 | m1
+	// Pick the non-negative square root.
+	fe_carry_abs(out1, r)
+
+	fe_clear_vec([]^Tight_Field_Element{&w, &tmp1, &tmp2, &tmp3})
+	mem.zero_explicit(&b, size_of(b))
+
+	return correct_sign_sqrt | flipped_sign_sqrt
 }

-fe_carry_inv :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+fe_carry_inv :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
 	tmp1: Tight_Field_Element

 	fe_carry_square(&tmp1, arg1)
-	_ = fe_carry_invsqrt(&tmp1, fe_relax_cast(&tmp1))
+	_ = fe_carry_sqrt_ratio_m1(&tmp1, fe_relax_cast(&FE_ONE), fe_relax_cast(&tmp1))
 	fe_carry_square(&tmp1, fe_relax_cast(&tmp1))
 	fe_carry_mul(out1, fe_relax_cast(&tmp1), arg1)

-	mem.zero_explicit(&tmp1, size_of(tmp1))
+	fe_clear(&tmp1)
+}
+
+fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0] = 0
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+	out1[4] = 0
+}
+
+fe_one :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0] = 1
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+	out1[4] = 0
+}
+
+fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	x4 := arg1[3]
+	x5 := arg1[4]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}
+
+@(optimization_mode = "none")
+fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) {
+	mask := (u64(arg1) * 0xffffffffffffffff)
+	x := (out1[0] ~ out2[0]) & mask
+	x1, y1 := out1[0] ~ x, out2[0] ~ x
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	x = (out1[3] ~ out2[3]) & mask
+	x4, y4 := out1[3] ~ x, out2[3] ~ x
+	x = (out1[4] ~ out2[4]) & mask
+	x5, y5 := out1[4] ~ x, out2[4] ~ x
+	out1[0], out2[0] = x1, y1
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+	out1[3], out2[3] = x4, y4
+	out1[4], out2[4] = x5, y5
+}
+
+@(optimization_mode = "none")
+fe_cond_select :: #force_no_inline proc "contextless" (
+	out1, arg1, arg2: $T,
+	arg3: int,
+) where T == ^Tight_Field_Element || T == ^Loose_Field_Element {
+	mask := (u64(arg3) * 0xffffffffffffffff)
+	x1 := ((mask & arg2[0]) | ((~mask) & arg1[0]))
+	x2 := ((mask & arg2[1]) | ((~mask) & arg1[1]))
+	x3 := ((mask & arg2[2]) | ((~mask) & arg1[2]))
+	x4 := ((mask & arg2[3]) | ((~mask) & arg1[3]))
+	x5 := ((mask & arg2[4]) | ((~mask) & arg1[4]))
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+}
+
+fe_cond_negate :: proc "contextless" (out1, arg1: ^Tight_Field_Element, ctrl: int) {
+	tmp1: Tight_Field_Element = ---
+	fe_carry_opp(&tmp1, arg1)
+	fe_cond_select(out1, arg1, &tmp1, ctrl)
+
+	fe_clear(&tmp1)
 }
--- a/core/crypto/_fiat/field_curve25519/field51.odin
+++ b/core/crypto/_fiat/field_curve25519/field51.odin
@@ -30,8 +30,6 @@ package field_curve25519
 //
 // While the base implementation is provably correct, this implementation
 // makes no such claims as the port and optimizations were done by hand.
-// At some point, it may be worth adding support to fiat-crypto for
-// generating Odin output.
 //
 // TODO:
 //  * When fiat-crypto supports it, using a saturated 64-bit limbs
@@ -44,7 +42,10 @@ import "core:math/bits"
 Loose_Field_Element :: distinct [5]u64
 Tight_Field_Element :: distinct [5]u64

-SQRT_M1 := Tight_Field_Element{
+FE_ZERO := Tight_Field_Element{0, 0, 0, 0, 0}
+FE_ONE := Tight_Field_Element{1, 0, 0, 0, 0}
+
+FE_SQRT_M1 := Tight_Field_Element {
 	1718705420411056,
 	234908883556509,
 	2233514472574048,
@@ -52,7 +53,13 @@ SQRT_M1 := Tight_Field_Element{
 	765476049583133,
 }

-_addcarryx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+_addcarryx_u51 :: #force_inline proc "contextless" (
+	arg1: fiat.u1,
+	arg2, arg3: u64,
+) -> (
+	out1: u64,
+	out2: fiat.u1,
+) {
 	x1 := ((u64(arg1) + arg2) + arg3)
 	x2 := (x1 & 0x7ffffffffffff)
 	x3 := fiat.u1((x1 >> 51))
@@ -61,7 +68,13 @@ _addcarryx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u
 	return
 }

-_subborrowx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+_subborrowx_u51 :: #force_inline proc "contextless" (
+	arg1: fiat.u1,
+	arg2, arg3: u64,
+) -> (
+	out1: u64,
+	out2: fiat.u1,
+) {
 	x1 := ((i64(arg2) - i64(arg1)) - i64(arg3))
 	x2 := fiat.i1((x1 >> 51))
 	x3 := (u64(x1) & 0x7ffffffffffff)
@@ -70,7 +83,7 @@ _subborrowx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3:
 	return
 }

-fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
+fe_carry_mul :: proc "contextless" (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
 	x2, x1 := bits.mul_u64(arg1[4], (arg2[4] * 0x13))
 	x4, x3 := bits.mul_u64(arg1[4], (arg2[3] * 0x13))
 	x6, x5 := bits.mul_u64(arg1[4], (arg2[2] * 0x13))
@@ -169,7 +182,7 @@ fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Eleme
 	out1[4] = x152
 }

-fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+fe_carry_square :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
 	x1 := (arg1[4] * 0x13)
 	x2 := (x1 * 0x2)
 	x3 := (arg1[4] * 0x2)
@@ -305,8 +318,11 @@ fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Ele
 	out1[4] = x5
 }

-@(optimization_mode="none")
-fe_cond_assign :: #force_no_inline proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: int) {
+@(optimization_mode = "none")
+fe_cond_assign :: #force_no_inline proc "contextless" (
+	out1, arg1: ^Tight_Field_Element,
+	arg2: int,
+) {
 	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0])
 	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
 	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
@@ -527,7 +543,10 @@ fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_E
 	out1[4] = x5
 }

-fe_carry_scmul_121666 :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+fe_carry_scmul_121666 :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element,
+) {
 	x2, x1 := bits.mul_u64(0x1db42, arg1[4])
 	x4, x3 := bits.mul_u64(0x1db42, arg1[3])
 	x6, x5 := bits.mul_u64(0x1db42, arg1[2])
@@ -565,54 +584,3 @@ fe_carry_scmul_121666 :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_El
 	out1[3] = x27
 	out1[4] = x32
 }
-
-// The following routines were added by hand, and do not come from fiat-crypto.
-
-fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
-	out1[0] = 0
-	out1[1] = 0
-	out1[2] = 0
-	out1[3] = 0
-	out1[4] = 0
-}
-
-fe_one :: proc "contextless" (out1: ^Tight_Field_Element) {
-	out1[0] = 1
-	out1[1] = 0
-	out1[2] = 0
-	out1[3] = 0
-	out1[4] = 0
-}
-
-fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) {
-	x1 := arg1[0]
-	x2 := arg1[1]
-	x3 := arg1[2]
-	x4 := arg1[3]
-	x5 := arg1[4]
-	out1[0] = x1
-	out1[1] = x2
-	out1[2] = x3
-	out1[3] = x4
-	out1[4] = x5
-}
-
-@(optimization_mode="none")
-fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) {
-	mask := -u64(arg1)
-	x := (out1[0] ~ out2[0]) & mask
-	x1, y1 := out1[0] ~ x, out2[0] ~ x
-	x = (out1[1] ~ out2[1]) & mask
-	x2, y2 := out1[1] ~ x, out2[1] ~ x
-	x = (out1[2] ~ out2[2]) & mask
-	x3, y3 := out1[2] ~ x, out2[2] ~ x
-	x = (out1[3] ~ out2[3]) & mask
-	x4, y4 := out1[3] ~ x, out2[3] ~ x
-	x = (out1[4] ~ out2[4]) & mask
-	x5, y5 := out1[4] ~ x, out2[4] ~ x
-	out1[0], out2[0] = x1, y1
-	out1[1], out2[1] = x2, y2
-	out1[2], out2[2] = x3, y3
-	out1[3], out2[3] = x4, y4
-	out1[4], out2[4] = x5, y5
-}
--- a/core/crypto/_fiat/field_poly1305/field.odin
+++ b/core/crypto/_fiat/field_poly1305/field.odin
@@ -1,17 +1,26 @@
 package field_poly1305

+import "base:intrinsics"
 import "core:encoding/endian"
 import "core:mem"

-fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element {
+fe_relax_cast :: #force_inline proc "contextless" (
+	arg1: ^Tight_Field_Element,
+) -> ^Loose_Field_Element {
 	return transmute(^Loose_Field_Element)(arg1)
 }

-fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element {
+fe_tighten_cast :: #force_inline proc "contextless" (
+	arg1: ^Loose_Field_Element,
+) -> ^Tight_Field_Element {
 	return transmute(^Tight_Field_Element)(arg1)
 }

-fe_from_bytes :: #force_inline proc (out1: ^Tight_Field_Element, arg1: []byte, arg2: byte) {
+fe_from_bytes :: #force_inline proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: []byte,
+	arg2: byte,
+) {
 	// fiat-crypto's deserialization routine effectively processes a
 	// single byte at a time, and wants 256-bits of input for a value
 	// that will be 128-bits or 129-bits.
@@ -20,7 +29,9 @@ fe_from_bytes :: #force_inline proc (out1: ^Tight_Field_Element, arg1: []byte, a
 	// makes implementing the actual MAC block processing considerably
 	// neater.

-	assert(len(arg1) == 16)
+	if len(arg1) != 16 {
+		intrinsics.trap()
+	}

 	// While it may be unwise to do deserialization here on our
 	// own when fiat-crypto provides equivalent functionality,
@@ -51,3 +62,35 @@ fe_from_u64s :: proc "contextless" (out1: ^Tight_Field_Element, lo, hi: u64) {
 	// This routine is only used to deserialize `r` which is confidential.
 	mem.zero_explicit(&tmp, size_of(tmp))
 }
+
+fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0] = 0
+	out1[1] = 0
+	out1[2] = 0
+}
+
+fe_set :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+}
+
+@(optimization_mode = "none")
+fe_cond_swap :: #force_no_inline proc "contextless" (
+	out1, out2: ^Tight_Field_Element,
+	arg1: bool,
+) {
+	mask := (u64(arg1) * 0xffffffffffffffff)
+	x := (out1[0] ~ out2[0]) & mask
+	x1, y1 := out1[0] ~ x, out2[0] ~ x
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	out1[0], out2[0] = x1, y1
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+}
--- a/core/crypto/_fiat/field_poly1305/field4344.odin
+++ b/core/crypto/_fiat/field_poly1305/field4344.odin
@@ -39,7 +39,13 @@ import "core:math/bits"
 Loose_Field_Element :: distinct [3]u64
 Tight_Field_Element :: distinct [3]u64

-_addcarryx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+_addcarryx_u44 :: #force_inline proc "contextless" (
+	arg1: fiat.u1,
+	arg2, arg3: u64,
+) -> (
+	out1: u64,
+	out2: fiat.u1,
+) {
 	x1 := ((u64(arg1) + arg2) + arg3)
 	x2 := (x1 & 0xfffffffffff)
 	x3 := fiat.u1((x1 >> 44))
@@ -48,7 +54,13 @@ _addcarryx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u
 	return
 }

-_subborrowx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+_subborrowx_u44 :: #force_inline proc "contextless" (
+	arg1: fiat.u1,
+	arg2, arg3: u64,
+) -> (
+	out1: u64,
+	out2: fiat.u1,
+) {
 	x1 := ((i64(arg2) - i64(arg1)) - i64(arg3))
 	x2 := fiat.i1((x1 >> 44))
 	x3 := (u64(x1) & 0xfffffffffff)
@@ -57,7 +69,13 @@ _subborrowx_u44 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3:
 	return
 }

-_addcarryx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+_addcarryx_u43 :: #force_inline proc "contextless" (
+	arg1: fiat.u1,
+	arg2, arg3: u64,
+) -> (
+	out1: u64,
+	out2: fiat.u1,
+) {
 	x1 := ((u64(arg1) + arg2) + arg3)
 	x2 := (x1 & 0x7ffffffffff)
 	x3 := fiat.u1((x1 >> 43))
@@ -66,7 +84,13 @@ _addcarryx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u
 	return
 }

-_subborrowx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+_subborrowx_u43 :: #force_inline proc "contextless" (
+	arg1: fiat.u1,
+	arg2, arg3: u64,
+) -> (
+	out1: u64,
+	out2: fiat.u1,
+) {
 	x1 := ((i64(arg2) - i64(arg1)) - i64(arg3))
 	x2 := fiat.i1((x1 >> 43))
 	x3 := (u64(x1) & 0x7ffffffffff)
@@ -75,7 +99,7 @@ _subborrowx_u43 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3:
 	return
 }

-fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
+fe_carry_mul :: proc "contextless" (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
 	x2, x1 := bits.mul_u64(arg1[2], (arg2[2] * 0x5))
 	x4, x3 := bits.mul_u64(arg1[2], (arg2[1] * 0xa))
 	x6, x5 := bits.mul_u64(arg1[1], (arg2[2] * 0xa))
@@ -120,7 +144,7 @@ fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Eleme
 	out1[2] = x62
 }

-fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+fe_carry_square :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
 	x1 := (arg1[2] * 0x5)
 	x2 := (x1 * 0x2)
 	x3 := (arg1[2] * 0x2)
@@ -201,8 +225,11 @@ fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Ele
 	out1[2] = x3
 }

-@(optimization_mode="none")
-fe_cond_assign :: #force_no_inline proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: bool) {
+@(optimization_mode = "none")
+fe_cond_assign :: #force_no_inline proc "contextless" (
+	out1, arg1: ^Tight_Field_Element,
+	arg2: bool,
+) {
 	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0])
 	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
 	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
@@ -325,34 +352,3 @@ fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_E
 	out1[1] = x2
 	out1[2] = x3
 }
-
-// The following routines were added by hand, and do not come from fiat-crypto.
-
-fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
-	out1[0] = 0
-	out1[1] = 0
-	out1[2] = 0
-}
-
-fe_set :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
-	x1 := arg1[0]
-	x2 := arg1[1]
-	x3 := arg1[2]
-	out1[0] = x1
-	out1[1] = x2
-	out1[2] = x3
-}
-
-@(optimization_mode="none")
-fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: bool) {
-	mask := -u64(arg1)
-	x := (out1[0] ~ out2[0]) & mask
-	x1, y1 := out1[0] ~ x, out2[0] ~ x
-	x = (out1[1] ~ out2[1]) & mask
-	x2, y2 := out1[1] ~ x, out2[1] ~ x
-	x = (out1[2] ~ out2[2]) & mask
-	x3, y3 := out1[2] ~ x, out2[2] ~ x
-	out1[0], out2[0] = x1, y1
-	out1[1], out2[1] = x2, y2
-	out1[2], out2[2] = x3, y3
-}
--- a/core/crypto/_fiat/field_scalar25519/field.odin
+++ b/core/crypto/_fiat/field_scalar25519/field.odin
@@ -0,0 +1,153 @@
+package field_scalar25519
+
+import "base:intrinsics"
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+@(private)
+_TWO_168 := Montgomery_Domain_Field_Element {
+	0x5b8ab432eac74798,
+	0x38afddd6de59d5d7,
+	0xa2c131b399411b7c,
+	0x6329a7ed9ce5a30,
+}
+@(private)
+_TWO_336 := Montgomery_Domain_Field_Element {
+	0xbd3d108e2b35ecc5,
+	0x5c3a3718bdf9c90b,
+	0x63aa97a331b4f2ee,
+	0x3d217f5be65cb5c,
+}
+
+fe_clear :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) {
+	mem.zero_explicit(arg1, size_of(Montgomery_Domain_Field_Element))
+}
+
+fe_from_bytes :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: ^[32]byte,
+	unsafe_assume_canonical := false,
+) -> bool {
+	tmp := Non_Montgomery_Domain_Field_Element {
+		endian.unchecked_get_u64le(arg1[0:]),
+		endian.unchecked_get_u64le(arg1[8:]),
+		endian.unchecked_get_u64le(arg1[16:]),
+		endian.unchecked_get_u64le(arg1[24:]),
+	}
+	defer mem.zero_explicit(&tmp, size_of(tmp))
+
+	// Check that tmp is in the the range [0, ELL).
+	if !unsafe_assume_canonical {
+		_, borrow := bits.sub_u64(ELL[0] - 1, tmp[0], 0)
+		_, borrow = bits.sub_u64(ELL[1], tmp[1], borrow)
+		_, borrow = bits.sub_u64(ELL[2], tmp[2], borrow)
+		_, borrow = bits.sub_u64(ELL[3], tmp[3], borrow)
+		if borrow != 0 {
+			return false
+		}
+	}
+
+	fe_to_montgomery(out1, &tmp)
+
+	return true
+}
+
+fe_from_bytes_rfc8032 :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: ^[32]byte,
+) {
+	tmp: [64]byte
+	copy(tmp[:], arg1[:])
+
+	// Apply "clamping" as in RFC 8032.
+	tmp[0] &= 248
+	tmp[31] &= 127
+	tmp[31] |= 64 // Sets the 254th bit, so the encoding is non-canonical.
+
+	fe_from_bytes_wide(out1, &tmp)
+
+	mem.zero_explicit(&tmp, size_of(tmp))
+}
+
+fe_from_bytes_wide :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: ^[64]byte,
+) {
+	tmp: Montgomery_Domain_Field_Element
+	// Use Frank Denis' trick, as documented by Filippo Valsorda
+	// at https://words.filippo.io/dispatches/wide-reduction/
+	//
+	// x = c * 2^336 + b * 2^168 + a  mod l
+	_fe_from_bytes_short(out1, arg1[:21]) // a
+
+	_fe_from_bytes_short(&tmp, arg1[21:42]) // b
+	fe_mul(&tmp, &tmp, &_TWO_168) // b * 2^168
+	fe_add(out1, out1, &tmp) // a + b * 2^168
+
+	_fe_from_bytes_short(&tmp, arg1[42:]) // c
+	fe_mul(&tmp, &tmp, &_TWO_336) // c * 2^336
+	fe_add(out1, out1, &tmp) // a + b * 2^168 + c * 2^336
+
+	fe_clear(&tmp)
+}
+
+@(private)
+_fe_from_bytes_short :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element, arg1: []byte) {
+	// INVARIANT: len(arg1) < 32.
+	if len(arg1) >= 32 {
+		intrinsics.trap()
+	}
+	tmp: [32]byte
+	copy(tmp[:], arg1)
+
+	_ = fe_from_bytes(out1, &tmp, true)
+	mem.zero_explicit(&tmp, size_of(tmp))
+}
+
+fe_to_bytes :: proc "contextless" (out1: []byte, arg1: ^Montgomery_Domain_Field_Element) {
+	if len(out1) != 32 {
+		intrinsics.trap()
+	}
+
+	tmp: Non_Montgomery_Domain_Field_Element
+	fe_from_montgomery(&tmp, arg1)
+
+	endian.unchecked_put_u64le(out1[0:], tmp[0])
+	endian.unchecked_put_u64le(out1[8:], tmp[1])
+	endian.unchecked_put_u64le(out1[16:], tmp[2])
+	endian.unchecked_put_u64le(out1[24:], tmp[3])
+
+	mem.zero_explicit(&tmp, size_of(tmp))
+}
+
+fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> int {
+	tmp: Montgomery_Domain_Field_Element
+	fe_sub(&tmp, arg1, arg2)
+
+	// This will only underflow iff arg1 == arg2, and we return the borrow,
+	// which will be 1.
+	_, borrow := bits.sub_u64(fe_non_zero(&tmp), 1, 0)
+
+	fe_clear(&tmp)
+
+	return int(borrow)
+}
+
+fe_zero :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
+	out1[0] = 0
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+}
+
+fe_set :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	x4 := arg1[3]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
--- a/core/crypto/_fiat/field_scalar25519/field64.odin
+++ b/core/crypto/_fiat/field_scalar25519/field64.odin
@@ -0,0 +1,535 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     1. Redistributions of source code must retain the above copyright
+//        notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package field_scalar25519
+
+// The file provides arithmetic on the field Z/(2^252+27742317777372353535851937790883648493)
+// using a 64-bit Montgomery form internal representation.  It is derived
+// primarily from the machine generated Golang output from the fiat-crypto
+// project.
+//
+// While the base implementation is provably correct, this implementation
+// makes no such claims as the port and optimizations were done by hand.
+
+import fiat "core:crypto/_fiat"
+import "core:math/bits"
+
+// ELL is the saturated representation of the field order, least-significant
+// limb first.
+ELL :: [4]u64{0x5812631a5cf5d3ed, 0x14def9dea2f79cd6, 0x0, 0x1000000000000000}
+
+Montgomery_Domain_Field_Element :: distinct [4]u64
+Non_Montgomery_Domain_Field_Element :: distinct [4]u64
+
+fe_mul :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	x6, x5 := bits.mul_u64(x4, arg2[3])
+	x8, x7 := bits.mul_u64(x4, arg2[2])
+	x10, x9 := bits.mul_u64(x4, arg2[1])
+	x12, x11 := bits.mul_u64(x4, arg2[0])
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	x19 := (u64(fiat.u1(x18)) + x6)
+	_, x20 := bits.mul_u64(x11, 0xd2b51da312547e1b)
+	x23, x22 := bits.mul_u64(x20, 0x1000000000000000)
+	x25, x24 := bits.mul_u64(x20, 0x14def9dea2f79cd6)
+	x27, x26 := bits.mul_u64(x20, 0x5812631a5cf5d3ed)
+	x28, x29 := bits.add_u64(x27, x24, u64(0x0))
+	x30 := (u64(fiat.u1(x29)) + x25)
+	_, x32 := bits.add_u64(x11, x26, u64(0x0))
+	x33, x34 := bits.add_u64(x13, x28, u64(fiat.u1(x32)))
+	x35, x36 := bits.add_u64(x15, x30, u64(fiat.u1(x34)))
+	x37, x38 := bits.add_u64(x17, x22, u64(fiat.u1(x36)))
+	x39, x40 := bits.add_u64(x19, x23, u64(fiat.u1(x38)))
+	x42, x41 := bits.mul_u64(x1, arg2[3])
+	x44, x43 := bits.mul_u64(x1, arg2[2])
+	x46, x45 := bits.mul_u64(x1, arg2[1])
+	x48, x47 := bits.mul_u64(x1, arg2[0])
+	x49, x50 := bits.add_u64(x48, x45, u64(0x0))
+	x51, x52 := bits.add_u64(x46, x43, u64(fiat.u1(x50)))
+	x53, x54 := bits.add_u64(x44, x41, u64(fiat.u1(x52)))
+	x55 := (u64(fiat.u1(x54)) + x42)
+	x56, x57 := bits.add_u64(x33, x47, u64(0x0))
+	x58, x59 := bits.add_u64(x35, x49, u64(fiat.u1(x57)))
+	x60, x61 := bits.add_u64(x37, x51, u64(fiat.u1(x59)))
+	x62, x63 := bits.add_u64(x39, x53, u64(fiat.u1(x61)))
+	x64, x65 := bits.add_u64(u64(fiat.u1(x40)), x55, u64(fiat.u1(x63)))
+	_, x66 := bits.mul_u64(x56, 0xd2b51da312547e1b)
+	x69, x68 := bits.mul_u64(x66, 0x1000000000000000)
+	x71, x70 := bits.mul_u64(x66, 0x14def9dea2f79cd6)
+	x73, x72 := bits.mul_u64(x66, 0x5812631a5cf5d3ed)
+	x74, x75 := bits.add_u64(x73, x70, u64(0x0))
+	x76 := (u64(fiat.u1(x75)) + x71)
+	_, x78 := bits.add_u64(x56, x72, u64(0x0))
+	x79, x80 := bits.add_u64(x58, x74, u64(fiat.u1(x78)))
+	x81, x82 := bits.add_u64(x60, x76, u64(fiat.u1(x80)))
+	x83, x84 := bits.add_u64(x62, x68, u64(fiat.u1(x82)))
+	x85, x86 := bits.add_u64(x64, x69, u64(fiat.u1(x84)))
+	x87 := (u64(fiat.u1(x86)) + u64(fiat.u1(x65)))
+	x89, x88 := bits.mul_u64(x2, arg2[3])
+	x91, x90 := bits.mul_u64(x2, arg2[2])
+	x93, x92 := bits.mul_u64(x2, arg2[1])
+	x95, x94 := bits.mul_u64(x2, arg2[0])
+	x96, x97 := bits.add_u64(x95, x92, u64(0x0))
+	x98, x99 := bits.add_u64(x93, x90, u64(fiat.u1(x97)))
+	x100, x101 := bits.add_u64(x91, x88, u64(fiat.u1(x99)))
+	x102 := (u64(fiat.u1(x101)) + x89)
+	x103, x104 := bits.add_u64(x79, x94, u64(0x0))
+	x105, x106 := bits.add_u64(x81, x96, u64(fiat.u1(x104)))
+	x107, x108 := bits.add_u64(x83, x98, u64(fiat.u1(x106)))
+	x109, x110 := bits.add_u64(x85, x100, u64(fiat.u1(x108)))
+	x111, x112 := bits.add_u64(x87, x102, u64(fiat.u1(x110)))
+	_, x113 := bits.mul_u64(x103, 0xd2b51da312547e1b)
+	x116, x115 := bits.mul_u64(x113, 0x1000000000000000)
+	x118, x117 := bits.mul_u64(x113, 0x14def9dea2f79cd6)
+	x120, x119 := bits.mul_u64(x113, 0x5812631a5cf5d3ed)
+	x121, x122 := bits.add_u64(x120, x117, u64(0x0))
+	x123 := (u64(fiat.u1(x122)) + x118)
+	_, x125 := bits.add_u64(x103, x119, u64(0x0))
+	x126, x127 := bits.add_u64(x105, x121, u64(fiat.u1(x125)))
+	x128, x129 := bits.add_u64(x107, x123, u64(fiat.u1(x127)))
+	x130, x131 := bits.add_u64(x109, x115, u64(fiat.u1(x129)))
+	x132, x133 := bits.add_u64(x111, x116, u64(fiat.u1(x131)))
+	x134 := (u64(fiat.u1(x133)) + u64(fiat.u1(x112)))
+	x136, x135 := bits.mul_u64(x3, arg2[3])
+	x138, x137 := bits.mul_u64(x3, arg2[2])
+	x140, x139 := bits.mul_u64(x3, arg2[1])
+	x142, x141 := bits.mul_u64(x3, arg2[0])
+	x143, x144 := bits.add_u64(x142, x139, u64(0x0))
+	x145, x146 := bits.add_u64(x140, x137, u64(fiat.u1(x144)))
+	x147, x148 := bits.add_u64(x138, x135, u64(fiat.u1(x146)))
+	x149 := (u64(fiat.u1(x148)) + x136)
+	x150, x151 := bits.add_u64(x126, x141, u64(0x0))
+	x152, x153 := bits.add_u64(x128, x143, u64(fiat.u1(x151)))
+	x154, x155 := bits.add_u64(x130, x145, u64(fiat.u1(x153)))
+	x156, x157 := bits.add_u64(x132, x147, u64(fiat.u1(x155)))
+	x158, x159 := bits.add_u64(x134, x149, u64(fiat.u1(x157)))
+	_, x160 := bits.mul_u64(x150, 0xd2b51da312547e1b)
+	x163, x162 := bits.mul_u64(x160, 0x1000000000000000)
+	x165, x164 := bits.mul_u64(x160, 0x14def9dea2f79cd6)
+	x167, x166 := bits.mul_u64(x160, 0x5812631a5cf5d3ed)
+	x168, x169 := bits.add_u64(x167, x164, u64(0x0))
+	x170 := (u64(fiat.u1(x169)) + x165)
+	_, x172 := bits.add_u64(x150, x166, u64(0x0))
+	x173, x174 := bits.add_u64(x152, x168, u64(fiat.u1(x172)))
+	x175, x176 := bits.add_u64(x154, x170, u64(fiat.u1(x174)))
+	x177, x178 := bits.add_u64(x156, x162, u64(fiat.u1(x176)))
+	x179, x180 := bits.add_u64(x158, x163, u64(fiat.u1(x178)))
+	x181 := (u64(fiat.u1(x180)) + u64(fiat.u1(x159)))
+	x182, x183 := bits.sub_u64(x173, 0x5812631a5cf5d3ed, u64(0x0))
+	x184, x185 := bits.sub_u64(x175, 0x14def9dea2f79cd6, u64(fiat.u1(x183)))
+	x186, x187 := bits.sub_u64(x177, u64(0x0), u64(fiat.u1(x185)))
+	x188, x189 := bits.sub_u64(x179, 0x1000000000000000, u64(fiat.u1(x187)))
+	_, x191 := bits.sub_u64(x181, u64(0x0), u64(fiat.u1(x189)))
+	x192 := fiat.cmovznz_u64(fiat.u1(x191), x182, x173)
+	x193 := fiat.cmovznz_u64(fiat.u1(x191), x184, x175)
+	x194 := fiat.cmovznz_u64(fiat.u1(x191), x186, x177)
+	x195 := fiat.cmovznz_u64(fiat.u1(x191), x188, x179)
+	out1[0] = x192
+	out1[1] = x193
+	out1[2] = x194
+	out1[3] = x195
+}
+
+fe_square :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	x6, x5 := bits.mul_u64(x4, arg1[3])
+	x8, x7 := bits.mul_u64(x4, arg1[2])
+	x10, x9 := bits.mul_u64(x4, arg1[1])
+	x12, x11 := bits.mul_u64(x4, arg1[0])
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	x19 := (u64(fiat.u1(x18)) + x6)
+	_, x20 := bits.mul_u64(x11, 0xd2b51da312547e1b)
+	x23, x22 := bits.mul_u64(x20, 0x1000000000000000)
+	x25, x24 := bits.mul_u64(x20, 0x14def9dea2f79cd6)
+	x27, x26 := bits.mul_u64(x20, 0x5812631a5cf5d3ed)
+	x28, x29 := bits.add_u64(x27, x24, u64(0x0))
+	x30 := (u64(fiat.u1(x29)) + x25)
+	_, x32 := bits.add_u64(x11, x26, u64(0x0))
+	x33, x34 := bits.add_u64(x13, x28, u64(fiat.u1(x32)))
+	x35, x36 := bits.add_u64(x15, x30, u64(fiat.u1(x34)))
+	x37, x38 := bits.add_u64(x17, x22, u64(fiat.u1(x36)))
+	x39, x40 := bits.add_u64(x19, x23, u64(fiat.u1(x38)))
+	x42, x41 := bits.mul_u64(x1, arg1[3])
+	x44, x43 := bits.mul_u64(x1, arg1[2])
+	x46, x45 := bits.mul_u64(x1, arg1[1])
+	x48, x47 := bits.mul_u64(x1, arg1[0])
+	x49, x50 := bits.add_u64(x48, x45, u64(0x0))
+	x51, x52 := bits.add_u64(x46, x43, u64(fiat.u1(x50)))
+	x53, x54 := bits.add_u64(x44, x41, u64(fiat.u1(x52)))
+	x55 := (u64(fiat.u1(x54)) + x42)
+	x56, x57 := bits.add_u64(x33, x47, u64(0x0))
+	x58, x59 := bits.add_u64(x35, x49, u64(fiat.u1(x57)))
+	x60, x61 := bits.add_u64(x37, x51, u64(fiat.u1(x59)))
+	x62, x63 := bits.add_u64(x39, x53, u64(fiat.u1(x61)))
+	x64, x65 := bits.add_u64(u64(fiat.u1(x40)), x55, u64(fiat.u1(x63)))
+	_, x66 := bits.mul_u64(x56, 0xd2b51da312547e1b)
+	x69, x68 := bits.mul_u64(x66, 0x1000000000000000)
+	x71, x70 := bits.mul_u64(x66, 0x14def9dea2f79cd6)
+	x73, x72 := bits.mul_u64(x66, 0x5812631a5cf5d3ed)
+	x74, x75 := bits.add_u64(x73, x70, u64(0x0))
+	x76 := (u64(fiat.u1(x75)) + x71)
+	_, x78 := bits.add_u64(x56, x72, u64(0x0))
+	x79, x80 := bits.add_u64(x58, x74, u64(fiat.u1(x78)))
+	x81, x82 := bits.add_u64(x60, x76, u64(fiat.u1(x80)))
+	x83, x84 := bits.add_u64(x62, x68, u64(fiat.u1(x82)))
+	x85, x86 := bits.add_u64(x64, x69, u64(fiat.u1(x84)))
+	x87 := (u64(fiat.u1(x86)) + u64(fiat.u1(x65)))
+	x89, x88 := bits.mul_u64(x2, arg1[3])
+	x91, x90 := bits.mul_u64(x2, arg1[2])
+	x93, x92 := bits.mul_u64(x2, arg1[1])
+	x95, x94 := bits.mul_u64(x2, arg1[0])
+	x96, x97 := bits.add_u64(x95, x92, u64(0x0))
+	x98, x99 := bits.add_u64(x93, x90, u64(fiat.u1(x97)))
+	x100, x101 := bits.add_u64(x91, x88, u64(fiat.u1(x99)))
+	x102 := (u64(fiat.u1(x101)) + x89)
+	x103, x104 := bits.add_u64(x79, x94, u64(0x0))
+	x105, x106 := bits.add_u64(x81, x96, u64(fiat.u1(x104)))
+	x107, x108 := bits.add_u64(x83, x98, u64(fiat.u1(x106)))
+	x109, x110 := bits.add_u64(x85, x100, u64(fiat.u1(x108)))
+	x111, x112 := bits.add_u64(x87, x102, u64(fiat.u1(x110)))
+	_, x113 := bits.mul_u64(x103, 0xd2b51da312547e1b)
+	x116, x115 := bits.mul_u64(x113, 0x1000000000000000)
+	x118, x117 := bits.mul_u64(x113, 0x14def9dea2f79cd6)
+	x120, x119 := bits.mul_u64(x113, 0x5812631a5cf5d3ed)
+	x121, x122 := bits.add_u64(x120, x117, u64(0x0))
+	x123 := (u64(fiat.u1(x122)) + x118)
+	_, x125 := bits.add_u64(x103, x119, u64(0x0))
+	x126, x127 := bits.add_u64(x105, x121, u64(fiat.u1(x125)))
+	x128, x129 := bits.add_u64(x107, x123, u64(fiat.u1(x127)))
+	x130, x131 := bits.add_u64(x109, x115, u64(fiat.u1(x129)))
+	x132, x133 := bits.add_u64(x111, x116, u64(fiat.u1(x131)))
+	x134 := (u64(fiat.u1(x133)) + u64(fiat.u1(x112)))
+	x136, x135 := bits.mul_u64(x3, arg1[3])
+	x138, x137 := bits.mul_u64(x3, arg1[2])
+	x140, x139 := bits.mul_u64(x3, arg1[1])
+	x142, x141 := bits.mul_u64(x3, arg1[0])
+	x143, x144 := bits.add_u64(x142, x139, u64(0x0))
+	x145, x146 := bits.add_u64(x140, x137, u64(fiat.u1(x144)))
+	x147, x148 := bits.add_u64(x138, x135, u64(fiat.u1(x146)))
+	x149 := (u64(fiat.u1(x148)) + x136)
+	x150, x151 := bits.add_u64(x126, x141, u64(0x0))
+	x152, x153 := bits.add_u64(x128, x143, u64(fiat.u1(x151)))
+	x154, x155 := bits.add_u64(x130, x145, u64(fiat.u1(x153)))
+	x156, x157 := bits.add_u64(x132, x147, u64(fiat.u1(x155)))
+	x158, x159 := bits.add_u64(x134, x149, u64(fiat.u1(x157)))
+	_, x160 := bits.mul_u64(x150, 0xd2b51da312547e1b)
+	x163, x162 := bits.mul_u64(x160, 0x1000000000000000)
+	x165, x164 := bits.mul_u64(x160, 0x14def9dea2f79cd6)
+	x167, x166 := bits.mul_u64(x160, 0x5812631a5cf5d3ed)
+	x168, x169 := bits.add_u64(x167, x164, u64(0x0))
+	x170 := (u64(fiat.u1(x169)) + x165)
+	_, x172 := bits.add_u64(x150, x166, u64(0x0))
+	x173, x174 := bits.add_u64(x152, x168, u64(fiat.u1(x172)))
+	x175, x176 := bits.add_u64(x154, x170, u64(fiat.u1(x174)))
+	x177, x178 := bits.add_u64(x156, x162, u64(fiat.u1(x176)))
+	x179, x180 := bits.add_u64(x158, x163, u64(fiat.u1(x178)))
+	x181 := (u64(fiat.u1(x180)) + u64(fiat.u1(x159)))
+	x182, x183 := bits.sub_u64(x173, 0x5812631a5cf5d3ed, u64(0x0))
+	x184, x185 := bits.sub_u64(x175, 0x14def9dea2f79cd6, u64(fiat.u1(x183)))
+	x186, x187 := bits.sub_u64(x177, u64(0x0), u64(fiat.u1(x185)))
+	x188, x189 := bits.sub_u64(x179, 0x1000000000000000, u64(fiat.u1(x187)))
+	_, x191 := bits.sub_u64(x181, u64(0x0), u64(fiat.u1(x189)))
+	x192 := fiat.cmovznz_u64(fiat.u1(x191), x182, x173)
+	x193 := fiat.cmovznz_u64(fiat.u1(x191), x184, x175)
+	x194 := fiat.cmovznz_u64(fiat.u1(x191), x186, x177)
+	x195 := fiat.cmovznz_u64(fiat.u1(x191), x188, x179)
+	out1[0] = x192
+	out1[1] = x193
+	out1[2] = x194
+	out1[3] = x195
+}
+
+fe_add :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	x1, x2 := bits.add_u64(arg1[0], arg2[0], u64(0x0))
+	x3, x4 := bits.add_u64(arg1[1], arg2[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.add_u64(arg1[2], arg2[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.add_u64(arg1[3], arg2[3], u64(fiat.u1(x6)))
+	x9, x10 := bits.sub_u64(x1, 0x5812631a5cf5d3ed, u64(0x0))
+	x11, x12 := bits.sub_u64(x3, 0x14def9dea2f79cd6, u64(fiat.u1(x10)))
+	x13, x14 := bits.sub_u64(x5, u64(0x0), u64(fiat.u1(x12)))
+	x15, x16 := bits.sub_u64(x7, 0x1000000000000000, u64(fiat.u1(x14)))
+	_, x18 := bits.sub_u64(u64(fiat.u1(x8)), u64(0x0), u64(fiat.u1(x16)))
+	x19 := fiat.cmovznz_u64(fiat.u1(x18), x9, x1)
+	x20 := fiat.cmovznz_u64(fiat.u1(x18), x11, x3)
+	x21 := fiat.cmovznz_u64(fiat.u1(x18), x13, x5)
+	x22 := fiat.cmovznz_u64(fiat.u1(x18), x15, x7)
+	out1[0] = x19
+	out1[1] = x20
+	out1[2] = x21
+	out1[3] = x22
+}
+
+fe_sub :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	x1, x2 := bits.sub_u64(arg1[0], arg2[0], u64(0x0))
+	x3, x4 := bits.sub_u64(arg1[1], arg2[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.sub_u64(arg1[2], arg2[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.sub_u64(arg1[3], arg2[3], u64(fiat.u1(x6)))
+	x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff)
+	x10, x11 := bits.add_u64(x1, (x9 & 0x5812631a5cf5d3ed), u64(0x0))
+	x12, x13 := bits.add_u64(x3, (x9 & 0x14def9dea2f79cd6), u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x5, u64(0x0), u64(fiat.u1(x13)))
+	x16, _ := bits.add_u64(x7, (x9 & 0x1000000000000000), u64(fiat.u1(x15)))
+	out1[0] = x10
+	out1[1] = x12
+	out1[2] = x14
+	out1[3] = x16
+}
+
+fe_opp :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	x1, x2 := bits.sub_u64(u64(0x0), arg1[0], u64(0x0))
+	x3, x4 := bits.sub_u64(u64(0x0), arg1[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.sub_u64(u64(0x0), arg1[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.sub_u64(u64(0x0), arg1[3], u64(fiat.u1(x6)))
+	x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff)
+	x10, x11 := bits.add_u64(x1, (x9 & 0x5812631a5cf5d3ed), u64(0x0))
+	x12, x13 := bits.add_u64(x3, (x9 & 0x14def9dea2f79cd6), u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x5, u64(0x0), u64(fiat.u1(x13)))
+	x16, _ := bits.add_u64(x7, (x9 & 0x1000000000000000), u64(fiat.u1(x15)))
+	out1[0] = x10
+	out1[1] = x12
+	out1[2] = x14
+	out1[3] = x16
+}
+
+fe_one :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
+	out1[0] = 0xd6ec31748d98951d
+	out1[1] = 0xc6ef5bf4737dcf70
+	out1[2] = 0xfffffffffffffffe
+	out1[3] = 0xfffffffffffffff
+}
+
+fe_non_zero :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> u64 {
+	return arg1[0] | (arg1[1] | (arg1[2] | arg1[3]))
+}
+
+@(optimization_mode = "none")
+fe_cond_assign :: #force_no_inline proc "contextless" (
+	out1, arg1: ^Montgomery_Domain_Field_Element,
+	arg2: int,
+) {
+	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0])
+	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
+	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
+	x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+
+fe_from_montgomery :: proc "contextless" (
+	out1: ^Non_Montgomery_Domain_Field_Element,
+	arg1: ^Montgomery_Domain_Field_Element,
+) {
+	x1 := arg1[0]
+	_, x2 := bits.mul_u64(x1, 0xd2b51da312547e1b)
+	x5, x4 := bits.mul_u64(x2, 0x1000000000000000)
+	x7, x6 := bits.mul_u64(x2, 0x14def9dea2f79cd6)
+	x9, x8 := bits.mul_u64(x2, 0x5812631a5cf5d3ed)
+	x10, x11 := bits.add_u64(x9, x6, u64(0x0))
+	_, x13 := bits.add_u64(x1, x8, u64(0x0))
+	x14, x15 := bits.add_u64(u64(0x0), x10, u64(fiat.u1(x13)))
+	x16, x17 := bits.add_u64(x14, arg1[1], u64(0x0))
+	_, x18 := bits.mul_u64(x16, 0xd2b51da312547e1b)
+	x21, x20 := bits.mul_u64(x18, 0x1000000000000000)
+	x23, x22 := bits.mul_u64(x18, 0x14def9dea2f79cd6)
+	x25, x24 := bits.mul_u64(x18, 0x5812631a5cf5d3ed)
+	x26, x27 := bits.add_u64(x25, x22, u64(0x0))
+	_, x29 := bits.add_u64(x16, x24, u64(0x0))
+	x30, x31 := bits.add_u64(
+		(u64(fiat.u1(x17)) + (u64(fiat.u1(x15)) + (u64(fiat.u1(x11)) + x7))),
+		x26,
+		u64(fiat.u1(x29)),
+	)
+	x32, x33 := bits.add_u64(x4, (u64(fiat.u1(x27)) + x23), u64(fiat.u1(x31)))
+	x34, x35 := bits.add_u64(x5, x20, u64(fiat.u1(x33)))
+	x36, x37 := bits.add_u64(x30, arg1[2], u64(0x0))
+	x38, x39 := bits.add_u64(x32, u64(0x0), u64(fiat.u1(x37)))
+	x40, x41 := bits.add_u64(x34, u64(0x0), u64(fiat.u1(x39)))
+	_, x42 := bits.mul_u64(x36, 0xd2b51da312547e1b)
+	x45, x44 := bits.mul_u64(x42, 0x1000000000000000)
+	x47, x46 := bits.mul_u64(x42, 0x14def9dea2f79cd6)
+	x49, x48 := bits.mul_u64(x42, 0x5812631a5cf5d3ed)
+	x50, x51 := bits.add_u64(x49, x46, u64(0x0))
+	_, x53 := bits.add_u64(x36, x48, u64(0x0))
+	x54, x55 := bits.add_u64(x38, x50, u64(fiat.u1(x53)))
+	x56, x57 := bits.add_u64(x40, (u64(fiat.u1(x51)) + x47), u64(fiat.u1(x55)))
+	x58, x59 := bits.add_u64(
+		(u64(fiat.u1(x41)) + (u64(fiat.u1(x35)) + x21)),
+		x44,
+		u64(fiat.u1(x57)),
+	)
+	x60, x61 := bits.add_u64(x54, arg1[3], u64(0x0))
+	x62, x63 := bits.add_u64(x56, u64(0x0), u64(fiat.u1(x61)))
+	x64, x65 := bits.add_u64(x58, u64(0x0), u64(fiat.u1(x63)))
+	_, x66 := bits.mul_u64(x60, 0xd2b51da312547e1b)
+	x69, x68 := bits.mul_u64(x66, 0x1000000000000000)
+	x71, x70 := bits.mul_u64(x66, 0x14def9dea2f79cd6)
+	x73, x72 := bits.mul_u64(x66, 0x5812631a5cf5d3ed)
+	x74, x75 := bits.add_u64(x73, x70, u64(0x0))
+	_, x77 := bits.add_u64(x60, x72, u64(0x0))
+	x78, x79 := bits.add_u64(x62, x74, u64(fiat.u1(x77)))
+	x80, x81 := bits.add_u64(x64, (u64(fiat.u1(x75)) + x71), u64(fiat.u1(x79)))
+	x82, x83 := bits.add_u64(
+		(u64(fiat.u1(x65)) + (u64(fiat.u1(x59)) + x45)),
+		x68,
+		u64(fiat.u1(x81)),
+	)
+	x84 := (u64(fiat.u1(x83)) + x69)
+	x85, x86 := bits.sub_u64(x78, 0x5812631a5cf5d3ed, u64(0x0))
+	x87, x88 := bits.sub_u64(x80, 0x14def9dea2f79cd6, u64(fiat.u1(x86)))
+	x89, x90 := bits.sub_u64(x82, u64(0x0), u64(fiat.u1(x88)))
+	x91, x92 := bits.sub_u64(x84, 0x1000000000000000, u64(fiat.u1(x90)))
+	_, x94 := bits.sub_u64(u64(0x0), u64(0x0), u64(fiat.u1(x92)))
+	x95 := fiat.cmovznz_u64(fiat.u1(x94), x85, x78)
+	x96 := fiat.cmovznz_u64(fiat.u1(x94), x87, x80)
+	x97 := fiat.cmovznz_u64(fiat.u1(x94), x89, x82)
+	x98 := fiat.cmovznz_u64(fiat.u1(x94), x91, x84)
+	out1[0] = x95
+	out1[1] = x96
+	out1[2] = x97
+	out1[3] = x98
+}
+
+fe_to_montgomery :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: ^Non_Montgomery_Domain_Field_Element,
+) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	x6, x5 := bits.mul_u64(x4, 0x399411b7c309a3d)
+	x8, x7 := bits.mul_u64(x4, 0xceec73d217f5be65)
+	x10, x9 := bits.mul_u64(x4, 0xd00e1ba768859347)
+	x12, x11 := bits.mul_u64(x4, 0xa40611e3449c0f01)
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	_, x19 := bits.mul_u64(x11, 0xd2b51da312547e1b)
+	x22, x21 := bits.mul_u64(x19, 0x1000000000000000)
+	x24, x23 := bits.mul_u64(x19, 0x14def9dea2f79cd6)
+	x26, x25 := bits.mul_u64(x19, 0x5812631a5cf5d3ed)
+	x27, x28 := bits.add_u64(x26, x23, u64(0x0))
+	_, x30 := bits.add_u64(x11, x25, u64(0x0))
+	x31, x32 := bits.add_u64(x13, x27, u64(fiat.u1(x30)))
+	x33, x34 := bits.add_u64(x15, (u64(fiat.u1(x28)) + x24), u64(fiat.u1(x32)))
+	x35, x36 := bits.add_u64(x17, x21, u64(fiat.u1(x34)))
+	x38, x37 := bits.mul_u64(x1, 0x399411b7c309a3d)
+	x40, x39 := bits.mul_u64(x1, 0xceec73d217f5be65)
+	x42, x41 := bits.mul_u64(x1, 0xd00e1ba768859347)
+	x44, x43 := bits.mul_u64(x1, 0xa40611e3449c0f01)
+	x45, x46 := bits.add_u64(x44, x41, u64(0x0))
+	x47, x48 := bits.add_u64(x42, x39, u64(fiat.u1(x46)))
+	x49, x50 := bits.add_u64(x40, x37, u64(fiat.u1(x48)))
+	x51, x52 := bits.add_u64(x31, x43, u64(0x0))
+	x53, x54 := bits.add_u64(x33, x45, u64(fiat.u1(x52)))
+	x55, x56 := bits.add_u64(x35, x47, u64(fiat.u1(x54)))
+	x57, x58 := bits.add_u64(
+		((u64(fiat.u1(x36)) + (u64(fiat.u1(x18)) + x6)) + x22),
+		x49,
+		u64(fiat.u1(x56)),
+	)
+	_, x59 := bits.mul_u64(x51, 0xd2b51da312547e1b)
+	x62, x61 := bits.mul_u64(x59, 0x1000000000000000)
+	x64, x63 := bits.mul_u64(x59, 0x14def9dea2f79cd6)
+	x66, x65 := bits.mul_u64(x59, 0x5812631a5cf5d3ed)
+	x67, x68 := bits.add_u64(x66, x63, u64(0x0))
+	_, x70 := bits.add_u64(x51, x65, u64(0x0))
+	x71, x72 := bits.add_u64(x53, x67, u64(fiat.u1(x70)))
+	x73, x74 := bits.add_u64(x55, (u64(fiat.u1(x68)) + x64), u64(fiat.u1(x72)))
+	x75, x76 := bits.add_u64(x57, x61, u64(fiat.u1(x74)))
+	x78, x77 := bits.mul_u64(x2, 0x399411b7c309a3d)
+	x80, x79 := bits.mul_u64(x2, 0xceec73d217f5be65)
+	x82, x81 := bits.mul_u64(x2, 0xd00e1ba768859347)
+	x84, x83 := bits.mul_u64(x2, 0xa40611e3449c0f01)
+	x85, x86 := bits.add_u64(x84, x81, u64(0x0))
+	x87, x88 := bits.add_u64(x82, x79, u64(fiat.u1(x86)))
+	x89, x90 := bits.add_u64(x80, x77, u64(fiat.u1(x88)))
+	x91, x92 := bits.add_u64(x71, x83, u64(0x0))
+	x93, x94 := bits.add_u64(x73, x85, u64(fiat.u1(x92)))
+	x95, x96 := bits.add_u64(x75, x87, u64(fiat.u1(x94)))
+	x97, x98 := bits.add_u64(
+		((u64(fiat.u1(x76)) + (u64(fiat.u1(x58)) + (u64(fiat.u1(x50)) + x38))) + x62),
+		x89,
+		u64(fiat.u1(x96)),
+	)
+	_, x99 := bits.mul_u64(x91, 0xd2b51da312547e1b)
+	x102, x101 := bits.mul_u64(x99, 0x1000000000000000)
+	x104, x103 := bits.mul_u64(x99, 0x14def9dea2f79cd6)
+	x106, x105 := bits.mul_u64(x99, 0x5812631a5cf5d3ed)
+	x107, x108 := bits.add_u64(x106, x103, u64(0x0))
+	_, x110 := bits.add_u64(x91, x105, u64(0x0))
+	x111, x112 := bits.add_u64(x93, x107, u64(fiat.u1(x110)))
+	x113, x114 := bits.add_u64(x95, (u64(fiat.u1(x108)) + x104), u64(fiat.u1(x112)))
+	x115, x116 := bits.add_u64(x97, x101, u64(fiat.u1(x114)))
+	x118, x117 := bits.mul_u64(x3, 0x399411b7c309a3d)
+	x120, x119 := bits.mul_u64(x3, 0xceec73d217f5be65)
+	x122, x121 := bits.mul_u64(x3, 0xd00e1ba768859347)
+	x124, x123 := bits.mul_u64(x3, 0xa40611e3449c0f01)
+	x125, x126 := bits.add_u64(x124, x121, u64(0x0))
+	x127, x128 := bits.add_u64(x122, x119, u64(fiat.u1(x126)))
+	x129, x130 := bits.add_u64(x120, x117, u64(fiat.u1(x128)))
+	x131, x132 := bits.add_u64(x111, x123, u64(0x0))
+	x133, x134 := bits.add_u64(x113, x125, u64(fiat.u1(x132)))
+	x135, x136 := bits.add_u64(x115, x127, u64(fiat.u1(x134)))
+	x137, x138 := bits.add_u64(
+		((u64(fiat.u1(x116)) + (u64(fiat.u1(x98)) + (u64(fiat.u1(x90)) + x78))) + x102),
+		x129,
+		u64(fiat.u1(x136)),
+	)
+	_, x139 := bits.mul_u64(x131, 0xd2b51da312547e1b)
+	x142, x141 := bits.mul_u64(x139, 0x1000000000000000)
+	x144, x143 := bits.mul_u64(x139, 0x14def9dea2f79cd6)
+	x146, x145 := bits.mul_u64(x139, 0x5812631a5cf5d3ed)
+	x147, x148 := bits.add_u64(x146, x143, u64(0x0))
+	_, x150 := bits.add_u64(x131, x145, u64(0x0))
+	x151, x152 := bits.add_u64(x133, x147, u64(fiat.u1(x150)))
+	x153, x154 := bits.add_u64(x135, (u64(fiat.u1(x148)) + x144), u64(fiat.u1(x152)))
+	x155, x156 := bits.add_u64(x137, x141, u64(fiat.u1(x154)))
+	x157 := ((u64(fiat.u1(x156)) + (u64(fiat.u1(x138)) + (u64(fiat.u1(x130)) + x118))) + x142)
+	x158, x159 := bits.sub_u64(x151, 0x5812631a5cf5d3ed, u64(0x0))
+	x160, x161 := bits.sub_u64(x153, 0x14def9dea2f79cd6, u64(fiat.u1(x159)))
+	x162, x163 := bits.sub_u64(x155, u64(0x0), u64(fiat.u1(x161)))
+	x164, x165 := bits.sub_u64(x157, 0x1000000000000000, u64(fiat.u1(x163)))
+	_, x167 := bits.sub_u64(u64(0x0), u64(0x0), u64(fiat.u1(x165)))
+	x168 := fiat.cmovznz_u64(fiat.u1(x167), x158, x151)
+	x169 := fiat.cmovznz_u64(fiat.u1(x167), x160, x153)
+	x170 := fiat.cmovznz_u64(fiat.u1(x167), x162, x155)
+	x171 := fiat.cmovznz_u64(fiat.u1(x167), x164, x157)
+	out1[0] = x168
+	out1[1] = x169
+	out1[2] = x170
+	out1[3] = x171
+}
--- a/core/crypto/_sha3/sha3.odin
+++ b/core/crypto/_sha3/sha3.odin
@@ -7,50 +7,69 @@ package _sha3
    List of contributors:
        zhibog, dotbmp:  Initial implementation.

-    Implementation of the Keccak hashing algorithm, standardized as SHA3 in <https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf>
-    To use the original Keccak padding, set the is_keccak bool to true, otherwise it will use SHA3 padding.
+    Implementation of the Keccak hashing algorithm, standardized as SHA3
+    in <https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf>.
+
+    As the only difference between the legacy Keccak and SHA3 is the domain
+    separation byte, set dsbyte to the appropriate value to pick the desired
+    algorithm.
 */

 import "core:math/bits"
+import "core:mem"

 ROUNDS :: 24

-Sha3_Context :: struct {
-	st:        struct #raw_union {
+RATE_128 :: 1344 / 8 // ONLY for SHAKE128.
+RATE_224 :: 1152 / 8
+RATE_256 :: 1088 / 8
+RATE_384 :: 832 / 8
+RATE_512 :: 576 / 8
+
+DS_KECCAK :: 0x01
+DS_SHA3 :: 0x06
+DS_SHAKE :: 0x1f
+DS_CSHAKE :: 0x04
+
+Context :: struct {
+	st:             struct #raw_union {
 		b: [200]u8,
 		q: [25]u64,
 	},
-	pt:        int,
-	rsiz:      int,
-	mdlen:     int,
-	is_keccak: bool,
-
+	pt:             int,
+	rsiz:           int,
+	mdlen:          int,
+	dsbyte:         byte,
 	is_initialized: bool,
 	is_finalized:   bool, // For SHAKE (unlimited squeeze is allowed)
 }

+@(private)
+keccakf_rndc := [?]u64 {
+	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+	0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
+}
+
+@(private)
+keccakf_rotc := [?]int {
+	1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
+	27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
+}
+
+@(private)
+keccakf_piln := [?]i32 {
+	10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
+	15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
+}
+
+@(private)
 keccakf :: proc "contextless" (st: ^[25]u64) {
-	keccakf_rndc := [?]u64 {
-		0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
-		0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
-		0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
-		0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-		0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
-		0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
-		0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
-		0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
-	}
-
-	keccakf_rotc := [?]int {
-		1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
-		27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
-	}
-
-	keccakf_piln := [?]i32 {
-		10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
-		15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
-	}
-
 	i, j, r: i32 = ---, ---, ---
 	t: u64 = ---
 	bc: [5]u64 = ---
@@ -103,81 +122,93 @@ keccakf :: proc "contextless" (st: ^[25]u64) {
 	}
 }

-init :: proc(c: ^Sha3_Context) {
+init :: proc(ctx: ^Context) {
 	for i := 0; i < 25; i += 1 {
-		c.st.q[i] = 0
+		ctx.st.q[i] = 0
 	}
-	c.rsiz = 200 - 2 * c.mdlen
-	c.pt = 0
+	ctx.rsiz = 200 - 2 * ctx.mdlen
+	ctx.pt = 0

-	c.is_initialized = true
-	c.is_finalized = false
+	ctx.is_initialized = true
+	ctx.is_finalized = false
 }

-update :: proc(c: ^Sha3_Context, data: []byte) {
-	assert(c.is_initialized)
-	assert(!c.is_finalized)
+update :: proc(ctx: ^Context, data: []byte) {
+	assert(ctx.is_initialized)
+	assert(!ctx.is_finalized)

-	j := c.pt
+	j := ctx.pt
 	for i := 0; i < len(data); i += 1 {
-		c.st.b[j] ~= data[i]
+		ctx.st.b[j] ~= data[i]
 		j += 1
-		if j >= c.rsiz {
-			keccakf(&c.st.q)
+		if j >= ctx.rsiz {
+			keccakf(&ctx.st.q)
 			j = 0
 		}
 	}
-	c.pt = j
+	ctx.pt = j
 }

-final :: proc(c: ^Sha3_Context, hash: []byte) {
-	assert(c.is_initialized)
+final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
+	assert(ctx.is_initialized)

-	if len(hash) < c.mdlen {
-		if c.is_keccak {
-			panic("crypto/keccac: invalid destination digest size")
-		}
+	if len(hash) < ctx.mdlen {
 		panic("crypto/sha3: invalid destination digest size")
 	}
-	if c.is_keccak {
-		c.st.b[c.pt] ~= 0x01
-	} else {
-		c.st.b[c.pt] ~= 0x06
-	}

-	c.st.b[c.rsiz - 1] ~= 0x80
-	keccakf(&c.st.q)
-	for i := 0; i < c.mdlen; i += 1 {
-		hash[i] = c.st.b[i]
+	ctx := ctx
+	if finalize_clone {
+		tmp_ctx: Context
+		clone(&tmp_ctx, ctx)
+		ctx = &tmp_ctx
 	}
+	defer (reset(ctx))

-	c.is_initialized = false // No more absorb, no more squeeze.
+	ctx.st.b[ctx.pt] ~= ctx.dsbyte
+
+	ctx.st.b[ctx.rsiz - 1] ~= 0x80
+	keccakf(&ctx.st.q)
+	for i := 0; i < ctx.mdlen; i += 1 {
+		hash[i] = ctx.st.b[i]
+	}
 }

-shake_xof :: proc(c: ^Sha3_Context) {
-	assert(c.is_initialized)
-	assert(!c.is_finalized)
-
-	c.st.b[c.pt] ~= 0x1F
-	c.st.b[c.rsiz - 1] ~= 0x80
-	keccakf(&c.st.q)
-	c.pt = 0
-
-	c.is_finalized = true // No more absorb, unlimited squeeze.
+clone :: proc(ctx, other: ^Context) {
+	ctx^ = other^
 }

-shake_out :: proc(c: ^Sha3_Context, hash: []byte) {
-	assert(c.is_initialized)
-	assert(c.is_finalized)
+reset :: proc(ctx: ^Context) {
+	if !ctx.is_initialized {
+		return
+	}

-	j := c.pt
+	mem.zero_explicit(ctx, size_of(ctx^))
+}
+
+shake_xof :: proc(ctx: ^Context) {
+	assert(ctx.is_initialized)
+	assert(!ctx.is_finalized)
+
+	ctx.st.b[ctx.pt] ~= ctx.dsbyte
+	ctx.st.b[ctx.rsiz - 1] ~= 0x80
+	keccakf(&ctx.st.q)
+	ctx.pt = 0
+
+	ctx.is_finalized = true // No more absorb, unlimited squeeze.
+}
+
+shake_out :: proc(ctx: ^Context, hash: []byte) {
+	assert(ctx.is_initialized)
+	assert(ctx.is_finalized)
+
+	j := ctx.pt
 	for i := 0; i < len(hash); i += 1 {
-		if j >= c.rsiz {
-			keccakf(&c.st.q)
+		if j >= ctx.rsiz {
+			keccakf(&ctx.st.q)
 			j = 0
 		}
-		hash[i] = c.st.b[j]
+		hash[i] = ctx.st.b[j]
 		j += 1
 	}
-	c.pt = j
+	ctx.pt = j
 }
--- a/core/crypto/_sha3/sp800_185.odin
+++ b/core/crypto/_sha3/sp800_185.odin
@@ -0,0 +1,145 @@
+package _sha3
+
+import "core:encoding/endian"
+import "core:math/bits"
+
+init_cshake :: proc(ctx: ^Context, n, s: []byte, sec_strength: int) {
+	ctx.mdlen = sec_strength / 8
+
+	// No domain separator is equivalent to vanilla SHAKE.
+	if len(n) == 0 && len(s) == 0 {
+		ctx.dsbyte = DS_SHAKE
+		init(ctx)
+		return
+	}
+
+	ctx.dsbyte = DS_CSHAKE
+	init(ctx)
+	bytepad(ctx, [][]byte{n, s}, rate_cshake(sec_strength))
+}
+
+final_cshake :: proc(ctx: ^Context, dst: []byte, finalize_clone: bool = false) {
+	ctx := ctx
+	if finalize_clone {
+		tmp_ctx: Context
+		clone(&tmp_ctx, ctx)
+		ctx = &tmp_ctx
+	}
+	defer reset(ctx)
+
+	encode_byte_len(ctx, len(dst), false) // right_encode
+	shake_xof(ctx)
+	shake_out(ctx, dst)
+}
+
+rate_cshake :: #force_inline proc(sec_strength: int) -> int {
+	switch sec_strength {
+	case 128:
+		return RATE_128
+	case 256:
+		return RATE_256
+	}
+
+	panic("crypto/sha3: invalid security strength")
+}
+
+// right_encode and left_encode are defined to support 0 <= x < 2^2040
+// however, the largest value we will ever need to encode is `max(int) * 8`.
+//
+// This is unfortunate as the extreme upper edge is larger than
+// `max(u64)`.  While such values are impractical at present,
+// they are possible (ie: https://arxiv.org/pdf/quant-ph/9908043.pdf).
+//
+// Thus we support 0 <= x < 2^128.
+
+@(private)
+_PAD: [RATE_128]byte // Biggest possible value of w per spec.
+
+bytepad :: proc(ctx: ^Context, x_strings: [][]byte, w: int) {
+	// 1. z = left_encode(w) || X.
+	z_hi: u64
+	z_lo := left_right_encode(ctx, 0, u64(w), true)
+	for x in x_strings {
+		// All uses of bytepad in SP 800-185 use the output from
+		// one or more encode_string values for `X`.
+		hi, lo := encode_string(ctx, x)
+
+		carry: u64
+		z_lo, carry = bits.add_u64(z_lo, lo, 0)
+		z_hi, carry = bits.add_u64(z_hi, hi, carry)
+
+		// This isn't actually possible, at least with the currently
+		// defined SP 800-185 routines.
+		if carry != 0 {
+			panic("crypto/sha3: bytepad input length overflow")
+		}
+	}
+
+	// We skip this step as we are doing a byte-oriented implementation
+	// rather than a bit oriented one.
+	//
+	// 2. while len(z) mod 8 ≠ 0:
+	//    z = z || 0
+
+	// 3. while (len(z)/8) mod w ≠ 0:
+	//    z = z || 00000000
+	z_len := u128(z_hi) << 64 | u128(z_lo)
+	z_rem := int(z_len % u128(w))
+	pad := _PAD[:w - z_rem]
+
+	// We just add the padding to the state, instead of returning z.
+	//
+	// 4. return z.
+	update(ctx, pad)
+}
+
+encode_string :: #force_inline proc(ctx: ^Context, s: []byte) -> (u64, u64) {
+	l := encode_byte_len(ctx, len(s), true) // left_encode
+	update(ctx, s)
+
+	lo, hi := bits.add_u64(l, u64(len(s)), 0)
+
+	return hi, lo
+}
+
+encode_byte_len :: #force_inline proc(ctx: ^Context, l: int, is_left: bool) -> u64 {
+	hi, lo := bits.mul_u64(u64(l), 8)
+	return left_right_encode(ctx, hi, lo, is_left)
+}
+
+@(private)
+left_right_encode :: proc(ctx: ^Context, hi, lo: u64, is_left: bool) -> u64 {
+	HI_OFFSET :: 1
+	LO_OFFSET :: HI_OFFSET + 8
+	RIGHT_OFFSET :: LO_OFFSET + 8
+	BUF_LEN :: RIGHT_OFFSET + 1
+
+	buf: [BUF_LEN]byte // prefix + largest uint + postfix
+
+	endian.unchecked_put_u64be(buf[HI_OFFSET:], hi)
+	endian.unchecked_put_u64be(buf[LO_OFFSET:], lo)
+
+	// 2. Strip leading `0x00` bytes.
+	off: int
+	for off = HI_OFFSET; off < RIGHT_OFFSET - 1; off = off + 1 {// Note: Minimum size is 1, not 0.
+		if buf[off] != 0 {
+			break
+		}
+	}
+	n := byte(RIGHT_OFFSET - off)
+
+	// 3. Prefix (left_encode) or postfix (right_encode) the length in bytes.
+	b: []byte
+	switch is_left {
+	case true:
+		buf[off - 1] = n // n | x
+		b = buf[off - 1:RIGHT_OFFSET]
+	case false:
+		buf[RIGHT_OFFSET] = n // x | n
+		b = buf[off:]
+	}
+
+	update(ctx, b)
+
+	return u64(len(b))
+}
--- a/core/crypto/blake2b/blake2b.odin
+++ b/core/crypto/blake2b/blake2b.odin
@@ -1,3 +1,10 @@
+/*
+package blake2b implements the BLAKE2b hash algorithm.
+
+See:
+- https://datatracker.ietf.org/doc/html/rfc7693
+- https://www.blake2.net
+*/
 package blake2b

 /*
@@ -6,122 +13,47 @@ package blake2b

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
-
-    Interface for the BLAKE2b hashing algorithm.
-    BLAKE2b and BLAKE2s share the implementation in the _blake2 package.
 */

-import "core:io"
-import "core:os"
-
 import "../_blake2"

-/*
-    High level API
-*/
-
+// DIGEST_SIZE is the BLAKE2b digest size in bytes.
 DIGEST_SIZE :: 64

-// hash_string will hash the given input and return the
-// computed hash
-hash_string :: proc(data: string) -> [DIGEST_SIZE]byte {
-	return hash_bytes(transmute([]byte)(data))
-}
-
-// hash_bytes will hash the given input and return the
-// computed hash
-hash_bytes :: proc(data: []byte) -> [DIGEST_SIZE]byte {
-	hash: [DIGEST_SIZE]byte
-	ctx: Context
-	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2B_SIZE
-	ctx.cfg = cfg
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash[:])
-	return hash
-}
-
-// hash_string_to_buffer will hash the given input and assign the
-// computed hash to the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_string_to_buffer :: proc(data: string, hash: []byte) {
-	hash_bytes_to_buffer(transmute([]byte)(data), hash)
-}
-
-// hash_bytes_to_buffer will hash the given input and write the
-// computed hash into the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_bytes_to_buffer :: proc(data, hash: []byte) {
-	ctx: Context
-	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2B_SIZE
-	ctx.cfg = cfg
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash)
-}
-
-// hash_stream will read the stream in chunks and compute a
-// hash from its contents
-hash_stream :: proc(s: io.Stream) -> ([DIGEST_SIZE]byte, bool) {
-	hash: [DIGEST_SIZE]byte
-	ctx: Context
-	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2B_SIZE
-	ctx.cfg = cfg
-	init(&ctx)
-
-	buf := make([]byte, 512)
-	defer delete(buf)
-
-	read := 1
-	for read > 0 {
-		read, _ = io.read(s, buf)
-		if read > 0 {
-			update(&ctx, buf[:read])
-		}
-	}
-	final(&ctx, hash[:])
-	return hash, true
-}
-
-// hash_file will read the file provided by the given handle
-// and compute a hash
-hash_file :: proc(hd: os.Handle, load_at_once := false) -> ([DIGEST_SIZE]byte, bool) {
-	if !load_at_once {
-		return hash_stream(os.stream_from_handle(hd))
-	} else {
-		if buf, ok := os.read_entire_file(hd); ok {
-			return hash_bytes(buf[:]), ok
-		}
-	}
-	return [DIGEST_SIZE]byte{}, false
-}
-
-hash :: proc {
-	hash_stream,
-	hash_file,
-	hash_bytes,
-	hash_string,
-	hash_bytes_to_buffer,
-	hash_string_to_buffer,
-}
-
-/*
-    Low level API
-*/
+// BLOCK_SIZE is the BLAKE2b block size in bytes.
+BLOCK_SIZE :: _blake2.BLAKE2B_BLOCK_SIZE

+// Context is a BLAKE2b instance.
 Context :: _blake2.Blake2b_Context

+// init initializes a Context with the default BLAKE2b config.
 init :: proc(ctx: ^Context) {
-	_blake2.init(ctx)
+	cfg: _blake2.Blake2_Config
+	cfg.size = _blake2.BLAKE2B_SIZE
+	_blake2.init(ctx, &cfg)
 }

+// update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
 	_blake2.update(ctx, data)
 }

-final :: proc(ctx: ^Context, hash: []byte) {
-	_blake2.final(ctx, hash)
+// final finalizes the Context, writes the digest to hash, and calls
+// reset on the Context.
+//
+// Iff finalize_clone is set, final will work on a copy of the Context,
+// which is useful for for calculating rolling digests.
+final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
+	_blake2.final(ctx, hash, finalize_clone)
+}
+
+// clone clones the Context other into ctx.
+clone :: proc(ctx, other: ^Context) {
+	_blake2.clone(ctx, other)
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+	_blake2.reset(ctx)
 }
--- a/core/crypto/blake2s/blake2s.odin
+++ b/core/crypto/blake2s/blake2s.odin
@@ -1,3 +1,10 @@
+/*
+package blake2s implements the BLAKE2s hash algorithm.
+
+See:
+- https://datatracker.ietf.org/doc/html/rfc7693
+- https://www.blake2.net/
+*/
 package blake2s

 /*
@@ -6,122 +13,47 @@ package blake2s

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
-
-    Interface for the BLAKE2s hashing algorithm.
-    BLAKE2s and BLAKE2b share the implementation in the _blake2 package.
 */

-import "core:io"
-import "core:os"
-
 import "../_blake2"

-/*
-    High level API
-*/
-
+// DIGEST_SIZE is the BLAKE2s digest size in bytes.
 DIGEST_SIZE :: 32

-// hash_string will hash the given input and return the
-// computed hash
-hash_string :: proc(data: string) -> [DIGEST_SIZE]byte {
-	return hash_bytes(transmute([]byte)(data))
-}
-
-// hash_bytes will hash the given input and return the
-// computed hash
-hash_bytes :: proc(data: []byte) -> [DIGEST_SIZE]byte {
-	hash: [DIGEST_SIZE]byte
-	ctx: Context
-	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2S_SIZE
-	ctx.cfg = cfg
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash[:])
-	return hash
-}
-
-// hash_string_to_buffer will hash the given input and assign the
-// computed hash to the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_string_to_buffer :: proc(data: string, hash: []byte) {
-	hash_bytes_to_buffer(transmute([]byte)(data), hash)
-}
-
-// hash_bytes_to_buffer will hash the given input and write the
-// computed hash into the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_bytes_to_buffer :: proc(data, hash: []byte) {
-	ctx: Context
-	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2S_SIZE
-	ctx.cfg = cfg
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash)
-}
-
-// hash_stream will read the stream in chunks and compute a
-// hash from its contents
-hash_stream :: proc(s: io.Stream) -> ([DIGEST_SIZE]byte, bool) {
-	hash: [DIGEST_SIZE]byte
-	ctx: Context
-	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2S_SIZE
-	ctx.cfg = cfg
-	init(&ctx)
-
-	buf := make([]byte, 512)
-	defer delete(buf)
-
-	read := 1
-	for read > 0 {
-		read, _ = io.read(s, buf)
-		if read > 0 {
-			update(&ctx, buf[:read])
-		}
-	}
-	final(&ctx, hash[:])
-	return hash, true
-}
-
-// hash_file will read the file provided by the given handle
-// and compute a hash
-hash_file :: proc(hd: os.Handle, load_at_once := false) -> ([DIGEST_SIZE]byte, bool) {
-	if !load_at_once {
-		return hash_stream(os.stream_from_handle(hd))
-	} else {
-		if buf, ok := os.read_entire_file(hd); ok {
-			return hash_bytes(buf[:]), ok
-		}
-	}
-	return [DIGEST_SIZE]byte{}, false
-}
-
-hash :: proc {
-	hash_stream,
-	hash_file,
-	hash_bytes,
-	hash_string,
-	hash_bytes_to_buffer,
-	hash_string_to_buffer,
-}
-
-/*
-    Low level API
-*/
+// BLOCK_SIZE is the BLAKE2s block size in bytes.
+BLOCK_SIZE :: _blake2.BLAKE2S_BLOCK_SIZE

+// Context is a BLAKE2s instance.
 Context :: _blake2.Blake2s_Context

+// init initializes a Context with the default BLAKE2s config.
 init :: proc(ctx: ^Context) {
-	_blake2.init(ctx)
+	cfg: _blake2.Blake2_Config
+	cfg.size = _blake2.BLAKE2S_SIZE
+	_blake2.init(ctx, &cfg)
 }

+// update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
 	_blake2.update(ctx, data)
 }

-final :: proc(ctx: ^Context, hash: []byte) {
-	_blake2.final(ctx, hash)
+// final finalizes the Context, writes the digest to hash, and calls
+// reset on the Context.
+//
+// Iff finalize_clone is set, final will work on a copy of the Context,
+// which is useful for for calculating rolling digests.
+final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
+	_blake2.final(ctx, hash, finalize_clone)
+}
+
+// clone clones the Context other into ctx.
+clone :: proc(ctx, other: ^Context) {
+	_blake2.clone(ctx, other)
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+	_blake2.reset(ctx)
 }
--- a/core/crypto/chacha20/chacha20.odin
+++ b/core/crypto/chacha20/chacha20.odin
@@ -1,11 +1,21 @@
+/*
+package chacha20 implements the ChaCha20 and XChaCha20 stream ciphers.
+
+See:
+- https://datatracker.ietf.org/doc/html/rfc8439
+- https://datatracker.ietf.org/doc/draft-irtf-cfrg-xchacha/03/
+*/
 package chacha20

 import "core:encoding/endian"
 import "core:math/bits"
 import "core:mem"

+// KEY_SIZE is the (X)ChaCha20 key size in bytes.
 KEY_SIZE :: 32
+// NONCE_SIZE is the ChaCha20 nonce size in bytes.
 NONCE_SIZE :: 12
+// XNONCE_SIZE is the XChaCha20 nonce size in bytes.
 XNONCE_SIZE :: 24

@(private)
@@ -19,25 +29,26 @@ _STATE_SIZE_U32 :: 16
 _ROUNDS :: 20

@(private)
-_SIGMA_0 : u32 : 0x61707865
+_SIGMA_0: u32 : 0x61707865
@(private)
-_SIGMA_1 : u32 : 0x3320646e
+_SIGMA_1: u32 : 0x3320646e
@(private)
-_SIGMA_2 : u32 : 0x79622d32
+_SIGMA_2: u32 : 0x79622d32
@(private)
-_SIGMA_3 : u32 : 0x6b206574
+_SIGMA_3: u32 : 0x6b206574

+// Context is a ChaCha20 or XChaCha20 instance.
 Context :: struct {
-	_s: [_STATE_SIZE_U32]u32,
-
-	_buffer: [_BLOCK_SIZE]byte,
-	_off: int,
-
+	_s:              [_STATE_SIZE_U32]u32,
+	_buffer:         [_BLOCK_SIZE]byte,
+	_off:            int,
 	_is_ietf_flavor: bool,
 	_is_initialized: bool,
 }

-init :: proc (ctx: ^Context, key, nonce: []byte) {
+// init inititializes a Context for ChaCha20 or XChaCha20 with the provided
+// key and nonce.
+init :: proc(ctx: ^Context, key, nonce: []byte) {
 	if len(key) != KEY_SIZE {
 		panic("crypto/chacha20: invalid ChaCha20 key size")
 	}
@@ -89,7 +100,8 @@ init :: proc (ctx: ^Context, key, nonce: []byte) {
 	ctx._is_initialized = true
 }

-seek :: proc (ctx: ^Context, block_nr: u64) {
+// seek seeks the (X)ChaCha20 stream counter to the specified block.
+seek :: proc(ctx: ^Context, block_nr: u64) {
 	assert(ctx._is_initialized)

 	if ctx._is_ietf_flavor {
@@ -103,7 +115,10 @@ seek :: proc (ctx: ^Context, block_nr: u64) {
 	ctx._off = _BLOCK_SIZE
 }

-xor_bytes :: proc (ctx: ^Context, dst, src: []byte) {
+// xor_bytes XORs each byte in src with bytes taken from the (X)ChaCha20
+// keystream, and writes the resulting output to dst.  Dst and src MUST
+// alias exactly or not at all.
+xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
 	assert(ctx._is_initialized)

 	// TODO: Enforcing that dst and src alias exactly or not at all
@@ -147,7 +162,8 @@ xor_bytes :: proc (ctx: ^Context, dst, src: []byte) {
 	}
 }

-keystream_bytes :: proc (ctx: ^Context, dst: []byte) {
+// keystream_bytes fills dst with the raw (X)ChaCha20 keystream output.
+keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
 	assert(ctx._is_initialized)

 	dst := dst
@@ -180,7 +196,9 @@ keystream_bytes :: proc (ctx: ^Context, dst: []byte) {
 	}
 }

-reset :: proc (ctx: ^Context) {
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
 	mem.zero_explicit(&ctx._s, size_of(ctx._s))
 	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))

@@ -188,7 +206,7 @@ reset :: proc (ctx: ^Context) {
 }

@(private)
-_do_blocks :: proc (ctx: ^Context, dst, src: []byte, nr_blocks: int) {
+_do_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) {
 	// Enforce the maximum consumed keystream per nonce.
 	//
 	// While all modern "standard" definitions of ChaCha20 use
--- a/core/crypto/chacha20poly1305/chacha20poly1305.odin
+++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin
@@ -1,3 +1,10 @@
+/*
+package chacha20poly1305 implements the AEAD_CHACHA20_POLY1305 Authenticated
+Encryption with Additional Data algorithm.
+
+See:
+- https://www.rfc-editor.org/rfc/rfc8439
+*/
 package chacha20poly1305

 import "core:crypto"
@@ -6,8 +13,11 @@ import "core:crypto/poly1305"
 import "core:encoding/endian"
 import "core:mem"

+// KEY_SIZE is the chacha20poly1305 key size in bytes.
 KEY_SIZE :: chacha20.KEY_SIZE
+// NONCE_SIZE is the chacha20poly1305 nonce size in bytes.
 NONCE_SIZE :: chacha20.NONCE_SIZE
+// TAG_SIZE is the chacha20poly1305 tag size in bytes.
 TAG_SIZE :: poly1305.TAG_SIZE

@(private)
@@ -49,6 +59,8 @@ _update_mac_pad16 :: #force_inline proc (ctx: ^poly1305.Context, x_len: int) {
 	}
 }

+// encrypt encrypts the plaintext and authenticates the aad and ciphertext,
+// with the provided key and nonce, stores the output in ciphertext and tag.
 encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) {
 	_validate_common_slice_sizes(tag, key, nonce, aad, plaintext)
 	if len(ciphertext) != len(plaintext) {
@@ -95,6 +107,11 @@ encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) {
 	poly1305.final(&mac_ctx, tag) // Implicitly sanitizes context.
 }

+// decrypt authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided key, nonce, and tag, and stores the output in plaintext,
+// returning true iff the authentication was successful.
+//
+// If authentication fails, the destination plaintext buffer will be zeroed.
 decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool {
 	_validate_common_slice_sizes(tag, key, nonce, aad, ciphertext)
 	if len(ciphertext) != len(plaintext) {
--- a/core/crypto/crypto.odin
+++ b/core/crypto/crypto.odin
@@ -1,3 +1,7 @@
+/*
+package crypto implements a selection of cryptography algorithms and useful
+helper routines.
+*/
 package crypto

 import "core:mem"
@@ -51,3 +55,9 @@ rand_bytes :: proc (dst: []byte) {

 	_rand_bytes(dst)
 }
+
+// has_rand_bytes returns true iff the target has support for accessing the
+// system entropty source.
+has_rand_bytes :: proc () -> bool {
+	return _has_rand_bytes()
+}
--- a/core/crypto/ed25519/ed25519.odin
+++ b/core/crypto/ed25519/ed25519.odin
@@ -0,0 +1,314 @@
+/*
+package ed25519 implements the Ed25519 EdDSA signature algorithm.
+
+See:
+- https://datatracker.ietf.org/doc/html/rfc8032
+- https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.186-5.pdf
+- https://eprint.iacr.org/2020/1244.pdf
+*/
+package ed25519
+
+import "core:crypto"
+import grp "core:crypto/_edwards25519"
+import "core:crypto/sha2"
+import "core:mem"
+
+// PRIVATE_KEY_SIZE is the byte-encoded private key size.
+PRIVATE_KEY_SIZE :: 32
+// PUBLIC_KEY_SIZE is the byte-encoded public key size.
+PUBLIC_KEY_SIZE :: 32
+// SIGNATURE_SIZE is the byte-encoded signature size.
+SIGNATURE_SIZE :: 64
+
+@(private)
+NONCE_SIZE :: 32
+
+// Private_Key is an Ed25519 private key.
+Private_Key :: struct {
+	// WARNING: All of the members are to be treated as internal (ie:
+	// the Private_Key structure is intended to be opaque).  There are
+	// subtle vulnerabilities that can be introduced if the internal
+	// values are allowed to be altered.
+	//
+	// See: https://github.com/MystenLabs/ed25519-unsafe-libs
+	_b:              [PRIVATE_KEY_SIZE]byte,
+	_s:              grp.Scalar,
+	_nonce:          [NONCE_SIZE]byte,
+	_pub_key:        Public_Key,
+	_is_initialized: bool,
+}
+
+// Public_Key is an Ed25519 public key.
+Public_Key :: struct {
+	// WARNING: All of the members are to be treated as internal (ie:
+	// the Public_Key structure is intended to be opaque).
+	_b:              [PUBLIC_KEY_SIZE]byte,
+	_neg_A:          grp.Group_Element,
+	_is_valid:       bool,
+	_is_initialized: bool,
+}
+
+// private_key_set_bytes decodes a byte-encoded private key, and returns
+// true iff the operation was successful.
+private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {
+	if len(b) != PRIVATE_KEY_SIZE {
+		return false
+	}
+
+	// Derive the private key.
+	ctx: sha2.Context_512 = ---
+	h_bytes: [sha2.DIGEST_SIZE_512]byte = ---
+	sha2.init_512(&ctx)
+	sha2.update(&ctx, b)
+	sha2.final(&ctx, h_bytes[:])
+
+	copy(priv_key._b[:], b)
+	copy(priv_key._nonce[:], h_bytes[32:])
+	grp.sc_set_bytes_rfc8032(&priv_key._s, h_bytes[:32])
+
+	// Derive the corresponding public key.
+	A: grp.Group_Element = ---
+	grp.ge_scalarmult_basepoint(&A, &priv_key._s)
+	grp.ge_bytes(&A, priv_key._pub_key._b[:])
+	grp.ge_negate(&priv_key._pub_key._neg_A, &A)
+	priv_key._pub_key._is_valid = !grp.ge_is_small_order(&A)
+	priv_key._pub_key._is_initialized = true
+
+	priv_key._is_initialized = true
+
+	return true
+}
+
+// private_key_bytes sets dst to byte-encoding of priv_key.
+private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
+	if !priv_key._is_initialized {
+		panic("crypto/ed25519: uninitialized private key")
+	}
+	if len(dst) != PRIVATE_KEY_SIZE {
+		panic("crypto/ed25519: invalid destination size")
+	}
+
+	copy(dst, priv_key._b[:])
+}
+
+// private_key_clear clears priv_key to the uninitialized state.
+private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
+	mem.zero_explicit(priv_key, size_of(Private_Key))
+}
+
+// sign writes the signature by priv_key over msg to sig.
+sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
+	if !priv_key._is_initialized {
+		panic("crypto/ed25519: uninitialized private key")
+	}
+	if len(sig) != SIGNATURE_SIZE {
+		panic("crypto/ed25519: invalid destination size")
+	}
+
+	// 1. Compute the hash of the private key d, H(d) = (h_0, h_1, ..., h_2b-1)
+	// using SHA-512 for Ed25519.  H(d) may be precomputed.
+	//
+	// 2. Using the second half of the digest hdigest2 = hb || ... || h2b-1,
+	// define:
+	//
+	// 2.1 For Ed25519, r = SHA-512(hdigest2 || M); Interpret r as a
+	// 64-octet little-endian integer.
+	ctx: sha2.Context_512 = ---
+	digest_bytes: [sha2.DIGEST_SIZE_512]byte = ---
+	sha2.init_512(&ctx)
+	sha2.update(&ctx, priv_key._nonce[:])
+	sha2.update(&ctx, msg)
+	sha2.final(&ctx, digest_bytes[:])
+
+	r: grp.Scalar = ---
+	grp.sc_set_bytes_wide(&r, &digest_bytes)
+
+	// 3. Compute the point [r]G. The octet string R is the encoding of
+	// the point [r]G.
+	R: grp.Group_Element = ---
+	R_bytes := sig[:32]
+	grp.ge_scalarmult_basepoint(&R, &r)
+	grp.ge_bytes(&R, R_bytes)
+
+	// 4. Derive s from H(d) as in the key pair generation algorithm.
+	// Use octet strings R, Q, and M to define:
+	//
+	// 4.1 For Ed25519, digest = SHA-512(R || Q || M).
+	// Interpret digest as a little-endian integer.
+	sha2.init_512(&ctx)
+	sha2.update(&ctx, R_bytes)
+	sha2.update(&ctx, priv_key._pub_key._b[:]) // Q in NIST terminology.
+	sha2.update(&ctx, msg)
+	sha2.final(&ctx, digest_bytes[:])
+
+	sc: grp.Scalar = --- // `digest` in NIST terminology.
+	grp.sc_set_bytes_wide(&sc, &digest_bytes)
+
+	// 5. Compute S = (r + digest × s) mod n. The octet string S is the
+	// encoding of the resultant integer.
+	grp.sc_mul(&sc, &sc, &priv_key._s)
+	grp.sc_add(&sc, &sc, &r)
+
+	// 6. Form the signature as the concatenation of the octet strings
+	// R and S.
+	grp.sc_bytes(sig[32:], &sc)
+
+	grp.sc_clear(&r)
+}
+
+// public_key_set_bytes decodes a byte-encoded public key, and returns
+// true iff the operation was successful.
+public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> bool {
+	if len(b) != PUBLIC_KEY_SIZE {
+		return false
+	}
+
+	A: grp.Group_Element = ---
+	if !grp.ge_set_bytes(&A, b) {
+		return false
+	}
+
+	copy(pub_key._b[:], b)
+	grp.ge_negate(&pub_key._neg_A, &A)
+	pub_key._is_valid = !grp.ge_is_small_order(&A)
+	pub_key._is_initialized = true
+
+	return true
+}
+
+// public_key_set_priv sets pub_key to the public component of priv_key.
+public_key_set_priv :: proc(pub_key: ^Public_Key, priv_key: ^Private_Key) {
+	if !priv_key._is_initialized {
+		panic("crypto/ed25519: uninitialized public key")
+	}
+
+	src := &priv_key._pub_key
+	copy(pub_key._b[:], src._b[:])
+	grp.ge_set(&pub_key._neg_A, &src._neg_A)
+	pub_key._is_valid = src._is_valid
+	pub_key._is_initialized = src._is_initialized
+}
+
+// public_key_bytes sets dst to byte-encoding of pub_key.
+public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
+	if !pub_key._is_initialized {
+		panic("crypto/ed25519: uninitialized public key")
+	}
+	if len(dst) != PUBLIC_KEY_SIZE {
+		panic("crypto/ed25519: invalid destination size")
+	}
+
+	copy(dst, pub_key._b[:])
+}
+
+// public_key_equal returns true iff pub_key is equal to other.
+public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool {
+	if !pub_key._is_initialized || !other._is_initialized {
+		panic("crypto/ed25519: uninitialized public key")
+	}
+
+	return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1
+}
+
+// verify returns true iff sig is a valid signature by pub_key over msg.
+//
+// The optional `allow_small_order_A` parameter will make this
+// implementation strictly compatible with FIPS 186-5, at the expense of
+// SBS-security.  Doing so is NOT recommended, and the disallowed
+// public keys all have a known discrete-log.
+verify :: proc(pub_key: ^Public_Key, msg, sig: []byte, allow_small_order_A := false) -> bool {
+	switch {
+	case !pub_key._is_initialized:
+		return false
+	case len(sig) != SIGNATURE_SIZE:
+		return false
+	}
+
+	// TLDR: Just use ristretto255.
+	//
+	// While there are two "standards" for EdDSA, existing implementations
+	// diverge (sometimes dramatically).  This implementation opts for
+	// "Algorithm 2" from "Taming the Many EdDSAs", which provides the
+	// strongest notion of security (SUF-CMA + SBS).
+	//
+	// The relevant properties are:
+	// - Reject non-canonical S.
+	// - Reject non-canonical A/R.
+	// - Reject small-order A (Extra non-standard check).
+	// - Cofactored verification equation.
+	//
+	// There are 19 possible non-canonical group element encodings of
+	// which:
+	// - 2 are small order
+	// - 10 are mixed order
+	// - 7 are not on the curve
+	//
+	// While historical implementations have been lax about enforcing
+	// that A/R are canonically encoded, that behavior is mandated by
+	// both the RFC and FIPS specification.  No valid key generation
+	// or sign implementation will ever produce non-canonically encoded
+	// public keys or signatures.
+	//
+	// There are 8 small-order group elements, 1 which is in the
+	// prime-order sub-group, and thus the probability that a properly
+	// generated A is small-order is cryptographically insignificant.
+	//
+	// While both the RFC and FIPS standard allow for either the
+	// cofactored or non-cofactored equation.  It is possible to
+	// artificially produce signatures that are valid for the former
+	// but not the latter.  This will NEVER occur with a valid sign
+	// implementation.  The choice of the latter is to be compatible
+	// with ABGLSV-Pornin, batch verification, and FROST (among other
+	// things).
+
+	s_bytes, r_bytes := sig[32:], sig[:32]
+
+	// 1. Reject the signature if S is not in the range [0, L).
+	s: grp.Scalar = ---
+	if !grp.sc_set_bytes(&s, s_bytes) {
+		return false
+	}
+
+	// 2. Reject the signature if the public key A is one of 8 small
+	// order points.
+	//
+	// As this check is optional and not part of the standard, we allow
+	// the caller to bypass it if desired.  Disabling the check makes
+	// the scheme NOT SBS-secure.
+	if !pub_key._is_valid && !allow_small_order_A {
+		return false
+	}
+
+	// 3. Reject the signature if A or R are non-canonical.
+	//
+	// Note: All initialized public keys are guaranteed to be canonical.
+	neg_R: grp.Group_Element = ---
+	if !grp.ge_set_bytes(&neg_R, r_bytes) {
+		return false
+	}
+	grp.ge_negate(&neg_R, &neg_R)
+
+	// 4. Compute the hash SHA512(R||A||M) and reduce it mod L to get a
+	// scalar h.
+	ctx: sha2.Context_512 = ---
+	h_bytes: [sha2.DIGEST_SIZE_512]byte = ---
+	sha2.init_512(&ctx)
+	sha2.update(&ctx, r_bytes)
+	sha2.update(&ctx, pub_key._b[:])
+	sha2.update(&ctx, msg)
+	sha2.final(&ctx, h_bytes[:])
+
+	h: grp.Scalar = ---
+	grp.sc_set_bytes_wide(&h, &h_bytes)
+
+	// 5. Accept if 8(s * G) - 8R - 8(h * A) = 0
+	//
+	// > first compute V = SB − R − hA and then accept if V is one of
+	// > 8 small order points (or alternatively compute 8V with 3
+	// > doublings and check against the neutral element)
+	V: grp.Group_Element = ---
+	grp.ge_double_scalarmult_basepoint_vartime(&V, &h, &pub_key._neg_A, &s)
+	grp.ge_add(&V, &V, &neg_R)
+
+	return grp.ge_is_small_order(&V)
+}
--- a/core/crypto/hash/doc.odin
+++ b/core/crypto/hash/doc.odin
@@ -0,0 +1,62 @@
+/*
+package hash provides a generic interface to the supported hash algorithms.
+
+A high-level convenience procedure group `hash` is provided to easily
+accomplish common tasks.
+- `hash_string` - Hash a given string and return the digest.
+- `hash_bytes` - Hash a given byte slice and return the digest.
+- `hash_string_to_buffer` - Hash a given string and put the digest in
+  the third parameter.  It requires that the destination buffer
+  is at least as big as the digest size.
+- `hash_bytes_to_buffer` - Hash a given string and put the computed
+  digest in the third parameter.  It requires that the destination
+  buffer is at least as big as the digest size.
+- `hash_stream` - Incrementally fully consume a `io.Stream`, and return
+  the computed digest.
+- `hash_file` - Takes a file handle and returns the computed digest.
+  A third optional boolean parameter controls if the file is streamed
+  (default), or or read at once.
+
+```odin
+package hash_example
+
+import "core:crypto/hash"
+
+main :: proc() {
+	input := "Feed the fire."
+
+	// Compute the digest, using the high level API.
+	returned_digest := hash.hash(hash.Algorithm.SHA512_256, input)
+	defer delete(returned_digest)
+
+	// Variant that takes a destination buffer, instead of returning
+	// the digest.
+	digest := make([]byte, hash.DIGEST_SIZES[hash.Algorithm.BLAKE2B]) // @note: Destination buffer has to be at least as big as the digest size of the hash.
+	defer delete(digest)
+	hash.hash(hash.Algorithm.BLAKE2B, input, digest)
+}
+```
+
+A generic low level API is provided supporting the init/update/final interface
+that is typical with cryptographic hash function implementations.
+
+```odin
+package hash_example
+
+import "core:crypto/hash"
+
+main :: proc() {
+    input := "Let the cinders burn."
+
+    // Compute the digest, using the low level API.
+    ctx: hash.Context
+    digest := make([]byte, hash.DIGEST_SIZES[hash.Algorithm.SHA3_512])
+    defer delete(digest)
+
+    hash.init(&ctx, hash.Algorithm.SHA3_512)
+    hash.update(&ctx, transmute([]byte)input)
+    hash.final(&ctx, digest)
+}
+```
+*/
+package crypto_hash
--- a/core/crypto/hash/hash.odin
+++ b/core/crypto/hash/hash.odin
@@ -0,0 +1,116 @@
+package crypto_hash
+
+/*
+    Copyright 2021 zhibog
+    Made available under the BSD-3 license.
+
+    List of contributors:
+        zhibog, dotbmp:  Initial implementation.
+*/
+
+import "core:io"
+import "core:mem"
+import "core:os"
+
+// hash_bytes will hash the given input and return the computed digest
+// in a newly allocated slice.
+hash_string :: proc(algorithm: Algorithm, data: string, allocator := context.allocator) -> []byte {
+	return hash_bytes(algorithm, transmute([]byte)(data), allocator)
+}
+
+// hash_bytes will hash the given input and return the computed digest
+// in a newly allocated slice.
+hash_bytes :: proc(algorithm: Algorithm, data: []byte, allocator := context.allocator) -> []byte {
+	dst := make([]byte, DIGEST_SIZES[algorithm], allocator)
+	hash_bytes_to_buffer(algorithm, data, dst)
+	return dst
+}
+
+// hash_string_to_buffer will hash the given input and assign the
+// computed digest to the third parameter.  It requires that the
+// destination buffer is at least as big as the digest size.
+hash_string_to_buffer :: proc(algorithm: Algorithm, data: string, hash: []byte) {
+	hash_bytes_to_buffer(algorithm, transmute([]byte)(data), hash)
+}
+
+// hash_bytes_to_buffer will hash the given input and write the
+// computed digest into the third parameter.  It requires that the
+// destination buffer is at least as big as the digest size.
+hash_bytes_to_buffer :: proc(algorithm: Algorithm, data, hash: []byte) {
+	ctx: Context
+
+	init(&ctx, algorithm)
+	update(&ctx, data)
+	final(&ctx, hash)
+}
+
+// hash_stream will incrementally fully consume a stream, and return the
+// computed digest in a newly allocated slice.
+hash_stream :: proc(
+	algorithm: Algorithm,
+	s: io.Stream,
+	allocator := context.allocator,
+) -> (
+	[]byte,
+	io.Error,
+) {
+	ctx: Context
+
+	buf: [MAX_BLOCK_SIZE * 4]byte
+	defer mem.zero_explicit(&buf, size_of(buf))
+
+	init(&ctx, algorithm)
+
+	loop: for {
+		n, err := io.read(s, buf[:])
+		if n > 0 {
+			// XXX/yawning: Can io.read return n > 0 and EOF?
+			update(&ctx, buf[:n])
+		}
+		#partial switch err {
+		case .None:
+		case .EOF:
+			break loop
+		case:
+			return nil, err
+		}
+	}
+
+	dst := make([]byte, DIGEST_SIZES[algorithm], allocator)
+	final(&ctx, dst)
+
+	return dst, io.Error.None
+}
+
+// hash_file will read the file provided by the given handle and return the
+// computed digest in a newly allocated slice.
+hash_file :: proc(
+	algorithm: Algorithm,
+	hd: os.Handle,
+	load_at_once := false,
+	allocator := context.allocator,
+) -> (
+	[]byte,
+	io.Error,
+) {
+	if !load_at_once {
+		return hash_stream(algorithm, os.stream_from_handle(hd), allocator)
+	}
+
+	buf, ok := os.read_entire_file(hd, allocator)
+	if !ok {
+		return nil, io.Error.Unknown
+	}
+	defer delete(buf, allocator)
+
+	return hash_bytes(algorithm, buf, allocator), io.Error.None
+}
+
+hash :: proc {
+	hash_stream,
+	hash_file,
+	hash_bytes,
+	hash_string,
+	hash_bytes_to_buffer,
+	hash_string_to_buffer,
+}
--- a/core/crypto/hash/low_level.odin
+++ b/core/crypto/hash/low_level.odin
@@ -0,0 +1,353 @@
+package crypto_hash
+
+import "core:crypto/blake2b"
+import "core:crypto/blake2s"
+import "core:crypto/sha2"
+import "core:crypto/sha3"
+import "core:crypto/sm3"
+import "core:crypto/legacy/keccak"
+import "core:crypto/legacy/md5"
+import "core:crypto/legacy/sha1"
+
+import "core:reflect"
+
+// MAX_DIGEST_SIZE is the maximum size digest that can be returned by any
+// of the Algorithms supported via this package.
+MAX_DIGEST_SIZE :: 64
+// MAX_BLOCK_SIZE is the maximum block size used by any of Algorithms
+// supported by this package.
+MAX_BLOCK_SIZE :: sha3.BLOCK_SIZE_224
+
+// Algorithm is the algorithm identifier associated with a given Context.
+Algorithm :: enum {
+	Invalid,
+	BLAKE2B,
+	BLAKE2S,
+	SHA224,
+	SHA256,
+	SHA384,
+	SHA512,
+	SHA512_256,
+	SHA3_224,
+	SHA3_256,
+	SHA3_384,
+	SHA3_512,
+	SM3,
+	Legacy_KECCAK_224,
+	Legacy_KECCAK_256,
+	Legacy_KECCAK_384,
+	Legacy_KECCAK_512,
+	Insecure_MD5,
+	Insecure_SHA1,
+}
+
+// ALGORITHM_NAMES is the Algorithm to algorithm name string.
+ALGORITHM_NAMES := [Algorithm]string {
+	.Invalid           = "Invalid",
+	.BLAKE2B           = "BLAKE2b",
+	.BLAKE2S           = "BLAKE2s",
+	.SHA224            = "SHA-224",
+	.SHA256            = "SHA-256",
+	.SHA384            = "SHA-384",
+	.SHA512            = "SHA-512",
+	.SHA512_256        = "SHA-512/256",
+	.SHA3_224          = "SHA3-224",
+	.SHA3_256          = "SHA3-256",
+	.SHA3_384          = "SHA3-384",
+	.SHA3_512          = "SHA3-512",
+	.SM3               = "SM3",
+	.Legacy_KECCAK_224 = "Keccak-224",
+	.Legacy_KECCAK_256 = "Keccak-256",
+	.Legacy_KECCAK_384 = "Keccak-384",
+	.Legacy_KECCAK_512 = "Keccak-512",
+	.Insecure_MD5      = "MD5",
+	.Insecure_SHA1     = "SHA-1",
+}
+
+// DIGEST_SIZES is the Algorithm to digest size in bytes.
+DIGEST_SIZES := [Algorithm]int {
+	.Invalid           = 0,
+	.BLAKE2B           = blake2b.DIGEST_SIZE,
+	.BLAKE2S           = blake2s.DIGEST_SIZE,
+	.SHA224            = sha2.DIGEST_SIZE_224,
+	.SHA256            = sha2.DIGEST_SIZE_256,
+	.SHA384            = sha2.DIGEST_SIZE_384,
+	.SHA512            = sha2.DIGEST_SIZE_512,
+	.SHA512_256        = sha2.DIGEST_SIZE_512_256,
+	.SHA3_224          = sha3.DIGEST_SIZE_224,
+	.SHA3_256          = sha3.DIGEST_SIZE_256,
+	.SHA3_384          = sha3.DIGEST_SIZE_384,
+	.SHA3_512          = sha3.DIGEST_SIZE_512,
+	.SM3               = sm3.DIGEST_SIZE,
+	.Legacy_KECCAK_224 = keccak.DIGEST_SIZE_224,
+	.Legacy_KECCAK_256 = keccak.DIGEST_SIZE_256,
+	.Legacy_KECCAK_384 = keccak.DIGEST_SIZE_384,
+	.Legacy_KECCAK_512 = keccak.DIGEST_SIZE_512,
+	.Insecure_MD5      = md5.DIGEST_SIZE,
+	.Insecure_SHA1     = sha1.DIGEST_SIZE,
+}
+
+// BLOCK_SIZES is the Algoritm to block size in bytes.
+BLOCK_SIZES := [Algorithm]int {
+	.Invalid           = 0,
+	.BLAKE2B           = blake2b.BLOCK_SIZE,
+	.BLAKE2S           = blake2s.BLOCK_SIZE,
+	.SHA224            = sha2.BLOCK_SIZE_256,
+	.SHA256            = sha2.BLOCK_SIZE_256,
+	.SHA384            = sha2.BLOCK_SIZE_512,
+	.SHA512            = sha2.BLOCK_SIZE_512,
+	.SHA512_256        = sha2.BLOCK_SIZE_512,
+	.SHA3_224          = sha3.BLOCK_SIZE_224,
+	.SHA3_256          = sha3.BLOCK_SIZE_256,
+	.SHA3_384          = sha3.BLOCK_SIZE_384,
+	.SHA3_512          = sha3.BLOCK_SIZE_512,
+	.SM3               = sm3.BLOCK_SIZE,
+	.Legacy_KECCAK_224 = keccak.BLOCK_SIZE_224,
+	.Legacy_KECCAK_256 = keccak.BLOCK_SIZE_256,
+	.Legacy_KECCAK_384 = keccak.BLOCK_SIZE_384,
+	.Legacy_KECCAK_512 = keccak.BLOCK_SIZE_512,
+	.Insecure_MD5      = md5.BLOCK_SIZE,
+	.Insecure_SHA1     = sha1.BLOCK_SIZE,
+}
+
+// Context is a concrete instantiation of a specific hash algorithm.
+Context :: struct {
+	_algo: Algorithm,
+	_impl: union {
+		blake2b.Context,
+		blake2s.Context,
+		sha2.Context_256,
+		sha2.Context_512,
+		sha3.Context,
+		sm3.Context,
+		keccak.Context,
+		md5.Context,
+		sha1.Context,
+	},
+}
+
+@(private)
+_IMPL_IDS := [Algorithm]typeid {
+	.Invalid           = nil,
+	.BLAKE2B           = typeid_of(blake2b.Context),
+	.BLAKE2S           = typeid_of(blake2s.Context),
+	.SHA224            = typeid_of(sha2.Context_256),
+	.SHA256            = typeid_of(sha2.Context_256),
+	.SHA384            = typeid_of(sha2.Context_512),
+	.SHA512            = typeid_of(sha2.Context_512),
+	.SHA512_256        = typeid_of(sha2.Context_512),
+	.SHA3_224          = typeid_of(sha3.Context),
+	.SHA3_256          = typeid_of(sha3.Context),
+	.SHA3_384          = typeid_of(sha3.Context),
+	.SHA3_512          = typeid_of(sha3.Context),
+	.SM3               = typeid_of(sm3.Context),
+	.Legacy_KECCAK_224 = typeid_of(keccak.Context),
+	.Legacy_KECCAK_256 = typeid_of(keccak.Context),
+	.Legacy_KECCAK_384 = typeid_of(keccak.Context),
+	.Legacy_KECCAK_512 = typeid_of(keccak.Context),
+	.Insecure_MD5      = typeid_of(md5.Context),
+	.Insecure_SHA1     = typeid_of(sha1.Context),
+}
+
+// init initializes a Context with a specific hash Algorithm.
+init :: proc(ctx: ^Context, algorithm: Algorithm) {
+	if ctx._impl != nil {
+		reset(ctx)
+	}
+
+	// Directly specialize the union by setting the type ID (save a copy).
+	reflect.set_union_variant_typeid(
+		ctx._impl,
+		_IMPL_IDS[algorithm],
+	)
+	switch algorithm {
+	case .BLAKE2B:
+		blake2b.init(&ctx._impl.(blake2b.Context))
+	case .BLAKE2S:
+		blake2s.init(&ctx._impl.(blake2s.Context))
+	case .SHA224:
+		sha2.init_224(&ctx._impl.(sha2.Context_256))
+	case .SHA256:
+		sha2.init_256(&ctx._impl.(sha2.Context_256))
+	case .SHA384:
+		sha2.init_384(&ctx._impl.(sha2.Context_512))
+	case .SHA512:
+		sha2.init_512(&ctx._impl.(sha2.Context_512))
+	case .SHA512_256:
+		sha2.init_512_256(&ctx._impl.(sha2.Context_512))
+	case .SHA3_224:
+		sha3.init_224(&ctx._impl.(sha3.Context))
+	case .SHA3_256:
+		sha3.init_256(&ctx._impl.(sha3.Context))
+	case .SHA3_384:
+		sha3.init_384(&ctx._impl.(sha3.Context))
+	case .SHA3_512:
+		sha3.init_512(&ctx._impl.(sha3.Context))
+	case .SM3:
+		sm3.init(&ctx._impl.(sm3.Context))
+	case .Legacy_KECCAK_224:
+		keccak.init_224(&ctx._impl.(keccak.Context))
+	case .Legacy_KECCAK_256:
+		keccak.init_256(&ctx._impl.(keccak.Context))
+	case .Legacy_KECCAK_384:
+		keccak.init_384(&ctx._impl.(keccak.Context))
+	case .Legacy_KECCAK_512:
+		keccak.init_512(&ctx._impl.(keccak.Context))
+	case .Insecure_MD5:
+		md5.init(&ctx._impl.(md5.Context))
+	case .Insecure_SHA1:
+		sha1.init(&ctx._impl.(sha1.Context))
+	case .Invalid:
+		panic("crypto/hash: uninitialized algorithm")
+	case:
+		panic("crypto/hash: invalid algorithm")
+	}
+
+	ctx._algo = algorithm
+}
+
+// update adds more data to the Context.
+update :: proc(ctx: ^Context, data: []byte) {
+	switch &impl in ctx._impl {
+	case blake2b.Context:
+		blake2b.update(&impl, data)
+	case blake2s.Context:
+		blake2s.update(&impl, data)
+	case sha2.Context_256:
+		sha2.update(&impl, data)
+	case sha2.Context_512:
+		sha2.update(&impl, data)
+	case sha3.Context:
+		sha3.update(&impl, data)
+	case sm3.Context:
+		sm3.update(&impl, data)
+	case keccak.Context:
+		keccak.update(&impl, data)
+	case md5.Context:
+		md5.update(&impl, data)
+	case sha1.Context:
+		sha1.update(&impl, data)
+	case:
+		panic("crypto/hash: uninitialized algorithm")
+	}
+}
+
+// final finalizes the Context, writes the digest to hash, and calls
+// reset on the Context.
+//
+// Iff finalize_clone is set, final will work on a copy of the Context,
+// which is useful for for calculating rolling digests.
+final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
+	switch &impl in ctx._impl {
+	case blake2b.Context:
+		blake2b.final(&impl, hash, finalize_clone)
+	case blake2s.Context:
+		blake2s.final(&impl, hash, finalize_clone)
+	case sha2.Context_256:
+		sha2.final(&impl, hash, finalize_clone)
+	case sha2.Context_512:
+		sha2.final(&impl, hash, finalize_clone)
+	case sha3.Context:
+		sha3.final(&impl, hash, finalize_clone)
+	case sm3.Context:
+		sm3.final(&impl, hash, finalize_clone)
+	case keccak.Context:
+		keccak.final(&impl, hash, finalize_clone)
+	case md5.Context:
+		md5.final(&impl, hash, finalize_clone)
+	case sha1.Context:
+		sha1.final(&impl, hash, finalize_clone)
+	case:
+		panic("crypto/hash: uninitialized algorithm")
+	}
+
+	if !finalize_clone {
+		reset(ctx)
+	}
+}
+
+// clone clones the Context other into ctx.
+clone :: proc(ctx, other: ^Context) {
+	// XXX/yawning: Maybe these cases should panic, because both cases,
+	// are probably bugs.
+	if ctx == other {
+		return
+	}
+	if ctx._impl != nil {
+		reset(ctx)
+	}
+
+	ctx._algo = other._algo
+
+	reflect.set_union_variant_typeid(
+		ctx._impl,
+		reflect.union_variant_typeid(other._impl),
+	)
+	switch &src_impl in other._impl {
+	case blake2b.Context:
+		blake2b.clone(&ctx._impl.(blake2b.Context), &src_impl)
+	case blake2s.Context:
+		blake2s.clone(&ctx._impl.(blake2s.Context), &src_impl)
+	case sha2.Context_256:
+		sha2.clone(&ctx._impl.(sha2.Context_256), &src_impl)
+	case sha2.Context_512:
+		sha2.clone(&ctx._impl.(sha2.Context_512), &src_impl)
+	case sha3.Context:
+		sha3.clone(&ctx._impl.(sha3.Context), &src_impl)
+	case sm3.Context:
+		sm3.clone(&ctx._impl.(sm3.Context), &src_impl)
+	case keccak.Context:
+		keccak.clone(&ctx._impl.(keccak.Context), &src_impl)
+	case md5.Context:
+		md5.clone(&ctx._impl.(md5.Context), &src_impl)
+	case sha1.Context:
+		sha1.clone(&ctx._impl.(sha1.Context), &src_impl)
+	case:
+		panic("crypto/hash: uninitialized algorithm")
+	}
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+	switch &impl in ctx._impl {
+	case blake2b.Context:
+		blake2b.reset(&impl)
+	case blake2s.Context:
+		blake2s.reset(&impl)
+	case sha2.Context_256:
+		sha2.reset(&impl)
+	case sha2.Context_512:
+		sha2.reset(&impl)
+	case sha3.Context:
+		sha3.reset(&impl)
+	case sm3.Context:
+		sm3.reset(&impl)
+	case keccak.Context:
+		keccak.reset(&impl)
+	case md5.Context:
+		md5.reset(&impl)
+	case sha1.Context:
+		sha1.reset(&impl)
+	case:
+	// Unlike clone, calling reset repeatedly is fine.
+	}
+
+	ctx._algo = .Invalid
+	ctx._impl = nil
+}
+
+// algorithm returns the Algorithm used by a Context instance.
+algorithm :: proc(ctx: ^Context) -> Algorithm {
+	return ctx._algo
+}
+
+// digest_size returns the digest size of a Context instance in bytes.
+digest_size :: proc(ctx: ^Context) -> int {
+	return DIGEST_SIZES[ctx._algo]
+}
+
+// block_size returns the block size of a Context instance in bytes.
+block_size :: proc(ctx: ^Context) -> int {
+	return BLOCK_SIZES[ctx._algo]
+}
--- a/core/crypto/hkdf/hkdf.odin
+++ b/core/crypto/hkdf/hkdf.odin
@@ -0,0 +1,103 @@
+/*
+package hkdf implements the HKDF HMAC-based Extract-and-Expand Key
+Derivation Function.
+
+See: https://www.rfc-editor.org/rfc/rfc5869
+*/
+package hkdf
+
+import "core:crypto/hash"
+import "core:crypto/hmac"
+import "core:mem"
+
+// extract_and_expand derives output keying material (OKM) via the
+// HKDF-Extract and HKDF-Expand algorithms, with the specified has
+// function, salt, input keying material (IKM), and optional info.
+// The dst buffer must be less-than-or-equal to 255 HMAC tags.
+extract_and_expand :: proc(algorithm: hash.Algorithm, salt, ikm, info, dst: []byte) {
+	h_len := hash.DIGEST_SIZES[algorithm]
+
+	tmp: [hash.MAX_DIGEST_SIZE]byte
+	prk := tmp[:h_len]
+	defer mem.zero_explicit(raw_data(prk), h_len)
+
+	extract(algorithm, salt, ikm, prk)
+	expand(algorithm, prk, info, dst)
+}
+
+// extract derives a pseudorandom key (PRK) via the HKDF-Extract algorithm,
+// with the specified hash function, salt, and input keying material (IKM).
+// It requires that the dst buffer be the HMAC tag size for the specified
+// hash function.
+extract :: proc(algorithm: hash.Algorithm, salt, ikm, dst: []byte) {
+	// PRK = HMAC-Hash(salt, IKM)
+	hmac.sum(algorithm, dst, ikm, salt)
+}
+
+// expand derives output keying material (OKM) via the HKDF-Expand algorithm,
+// with the specified hash function, pseudorandom key (PRK), and optional
+// info.  The dst buffer must be less-than-or-equal to 255 HMAC tags.
+expand :: proc(algorithm: hash.Algorithm, prk, info, dst: []byte) {
+	h_len := hash.DIGEST_SIZES[algorithm]
+
+	// (<= 255*HashLen)
+	dk_len := len(dst)
+	switch {
+	case dk_len == 0:
+		return
+	case dk_len > h_len * 255:
+		panic("crypto/hkdf: derived key too long")
+	case:
+	}
+
+	// The output OKM is calculated as follows:
+	//
+	// N = ceil(L/HashLen)
+	// T = T(1) | T(2) | T(3) | ... | T(N)
+	// OKM = first L octets of T
+	//
+	// where:
+	// T(0) = empty string (zero length)
+	// T(1) = HMAC-Hash(PRK, T(0) | info | 0x01)
+	// T(2) = HMAC-Hash(PRK, T(1) | info | 0x02)
+	// T(3) = HMAC-Hash(PRK, T(2) | info | 0x03)
+	// ...
+
+	n := dk_len / h_len
+	r := dk_len % h_len
+
+	base: hmac.Context
+	defer hmac.reset(&base)
+
+	hmac.init(&base, algorithm, prk)
+
+	dst_blk := dst
+	prev: []byte
+
+	for i in 1 ..= n {
+		_F(&base, prev, info, i, dst_blk[:h_len])
+
+		prev = dst_blk[:h_len]
+		dst_blk = dst_blk[h_len:]
+	}
+
+	if r > 0 {
+		tmp: [hash.MAX_DIGEST_SIZE]byte
+		blk := tmp[:h_len]
+		defer mem.zero_explicit(raw_data(blk), h_len)
+
+		_F(&base, prev, info, n + 1, blk)
+		copy(dst_blk, blk)
+	}
+}
+
+@(private)
+_F :: proc(base: ^hmac.Context, prev, info: []byte, i: int, dst_blk: []byte) {
+	prf: hmac.Context
+
+	hmac.clone(&prf, base)
+	hmac.update(&prf, prev)
+	hmac.update(&prf, info)
+	hmac.update(&prf, []byte{u8(i)})
+	hmac.final(&prf, dst_blk)
+}
--- a/core/crypto/hmac/hmac.odin
+++ b/core/crypto/hmac/hmac.odin
@@ -0,0 +1,174 @@
+/*
+package hmac implements the HMAC MAC algorithm.
+
+See:
+- https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.198-1.pdf
+*/
+package hmac
+
+import "core:crypto"
+import "core:crypto/hash"
+import "core:mem"
+
+// sum will compute the HMAC with the specified algorithm and key
+// over msg, and write the computed tag to dst.  It requires that
+// the dst buffer is the tag size.
+sum :: proc(algorithm: hash.Algorithm, dst, msg, key: []byte) {
+	ctx: Context
+
+	init(&ctx, algorithm, key)
+	update(&ctx, msg)
+	final(&ctx, dst)
+}
+
+// verify will verify the HMAC tag computed with the specified algorithm
+// and key over msg and return true iff the tag is valid.  It requires
+// that the tag is correctly sized.
+verify :: proc(algorithm: hash.Algorithm, tag, msg, key: []byte) -> bool {
+	tag_buf: [hash.MAX_DIGEST_SIZE]byte
+
+	derived_tag := tag_buf[:hash.DIGEST_SIZES[algorithm]]
+	sum(algorithm, derived_tag, msg, key)
+
+	return crypto.compare_constant_time(derived_tag, tag) == 1
+}
+
+// Context is a concrete instantiation of HMAC with a specific hash
+// algorithm.
+Context :: struct {
+	_o_hash:         hash.Context, // H(k ^ ipad) (not finalized)
+	_i_hash:         hash.Context, // H(k ^ opad) (not finalized)
+	_tag_sz:         int,
+	_is_initialized: bool,
+}
+
+// init initializes a Context with a specific hash Algorithm and key.
+init :: proc(ctx: ^Context, algorithm: hash.Algorithm, key: []byte) {
+	if ctx._is_initialized {
+		reset(ctx)
+	}
+
+	_init_hashes(ctx, algorithm, key)
+
+	ctx._tag_sz = hash.DIGEST_SIZES[algorithm]
+	ctx._is_initialized = true
+}
+
+// update adds more data to the Context.
+update :: proc(ctx: ^Context, data: []byte) {
+	assert(ctx._is_initialized)
+
+	hash.update(&ctx._i_hash, data)
+}
+
+// final finalizes the Context, writes the tag to dst, and calls
+// reset on the Context.
+final :: proc(ctx: ^Context, dst: []byte) {
+	assert(ctx._is_initialized)
+
+	defer (reset(ctx))
+
+	if len(dst) != ctx._tag_sz {
+		panic("crypto/hmac: invalid destination tag size")
+	}
+
+	hash.final(&ctx._i_hash, dst) // H((k ^ ipad) || text)
+
+	hash.update(&ctx._o_hash, dst) // H((k ^ opad) || H((k ^ ipad) || text))
+	hash.final(&ctx._o_hash, dst)
+}
+
+// clone clones the Context other into ctx.
+clone :: proc(ctx, other: ^Context) {
+	if ctx == other {
+		return
+	}
+
+	hash.clone(&ctx._o_hash, &other._o_hash)
+	hash.clone(&ctx._i_hash, &other._i_hash)
+	ctx._tag_sz = other._tag_sz
+	ctx._is_initialized = other._is_initialized
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+	if !ctx._is_initialized {
+		return
+	}
+
+	hash.reset(&ctx._o_hash)
+	hash.reset(&ctx._i_hash)
+	ctx._tag_sz = 0
+	ctx._is_initialized = false
+}
+
+// algorithm returns the Algorithm used by a Context instance.
+algorithm :: proc(ctx: ^Context) -> hash.Algorithm {
+	assert(ctx._is_initialized)
+
+	return hash.algorithm(&ctx._i_hash)
+}
+
+// tag_size returns the tag size of a Context instance in bytes.
+tag_size :: proc(ctx: ^Context) -> int {
+	assert(ctx._is_initialized)
+
+	return ctx._tag_sz
+}
+
+@(private)
+_I_PAD :: 0x36
+_O_PAD :: 0x5c
+
+@(private)
+_init_hashes :: proc(ctx: ^Context, algorithm: hash.Algorithm, key: []byte) {
+	K0_buf: [hash.MAX_BLOCK_SIZE]byte
+	kPad_buf: [hash.MAX_BLOCK_SIZE]byte
+
+	kLen := len(key)
+	B := hash.BLOCK_SIZES[algorithm]
+	K0 := K0_buf[:B]
+	defer mem.zero_explicit(raw_data(K0), B)
+
+	switch {
+	case kLen == B, kLen < B:
+		// If the length of K = B: set K0 = K.
+		//
+		// If the length of K < B: append zeros to the end of K to
+		// create a B-byte string K0 (e.g., if K is 20 bytes in
+		// length and B = 64, then K will be appended with 44 zero
+		// bytes x’00’).
+		//
+		// K0 is zero-initialized, so the copy handles both cases.
+		copy(K0, key)
+	case kLen > B:
+		// If the length of K > B: hash K to obtain an L byte string,
+		// then append (B-L) zeros to create a B-byte string K0
+		// (i.e., K0 = H(K) || 00...00).
+		tmpCtx := &ctx._o_hash // Saves allocating a hash.Context.
+		hash.init(tmpCtx, algorithm)
+		hash.update(tmpCtx, key)
+		hash.final(tmpCtx, K0)
+	}
+
+	// Initialize the hashes, and write the padded keys:
+	// - ctx._i_hash -> H(K0 ^ ipad)
+	// - ctx._o_hash -> H(K0 ^ opad)
+
+	hash.init(&ctx._o_hash, algorithm)
+	hash.init(&ctx._i_hash, algorithm)
+
+	kPad := kPad_buf[:B]
+	defer mem.zero_explicit(raw_data(kPad), B)
+
+	for v, i in K0 {
+		kPad[i] = v ~ _I_PAD
+	}
+	hash.update(&ctx._i_hash, kPad)
+
+	for v, i in K0 {
+		kPad[i] = v ~ _O_PAD
+	}
+	hash.update(&ctx._o_hash, kPad)
+}
--- a/core/crypto/kmac/kmac.odin
+++ b/core/crypto/kmac/kmac.odin
@@ -0,0 +1,116 @@
+/*
+package kmac implements the KMAC MAC algorithm.
+
+See:
+- https://nvlpubs.nist.gov/nistpubs/specialpublications/nist.sp.800-185.pdf
+*/
+package kmac
+
+import "../_sha3"
+import "core:crypto"
+import "core:crypto/shake"
+
+// MIN_KEY_SIZE_128 is the minimum key size for KMAC128 in bytes.
+MIN_KEY_SIZE_128 :: 128 / 8
+// MIN_KEY_SIZE_256 is the minimum key size for KMAC256 in bytes.
+MIN_KEY_SIZE_256 :: 256 / 8
+
+// MIN_TAG_SIZE is the absolute minimum tag size for KMAC in bytes (8.4.2).
+// Most callers SHOULD use at least 128-bits if not 256-bits for the tag
+// size.
+MIN_TAG_SIZE :: 32 / 8
+
+// sum will compute the KMAC with the specified security strength,
+// key, and domain separator over msg, and write the computed digest to
+// dst.
+sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) {
+	ctx: Context
+
+	_init_kmac(&ctx, key, domain_sep, sec_strength)
+	update(&ctx, msg)
+	final(&ctx, dst)
+}
+
+// verify will verify the KMAC tag computed with the specified security
+// strength, key and domain separator over msg and return true iff the
+// tag is valid.
+verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool {
+	derived_tag := make([]byte, len(tag), allocator)
+
+	sum(sec_strength, derived_tag, msg, key, domain_sep)
+
+	return crypto.compare_constant_time(derived_tag, tag) == 1
+}
+
+// Context is a KMAC instance.
+Context :: distinct shake.Context
+
+// init_128 initializes a Context for KMAC28.  This routine will panic if
+// the key length is less than MIN_KEY_SIZE_128.
+init_128 :: proc(ctx: ^Context, key, domain_sep: []byte) {
+	_init_kmac(ctx, key, domain_sep, 128)
+}
+
+// init_256 initializes a Context for KMAC256.  This routine will panic if
+// the key length is less than MIN_KEY_SIZE_256.
+init_256 :: proc(ctx: ^Context, key, domain_sep: []byte) {
+	_init_kmac(ctx, key, domain_sep, 256)
+}
+
+// update adds more data to the Context.
+update :: proc(ctx: ^Context, data: []byte) {
+	assert(ctx.is_initialized)
+
+	shake.write(transmute(^shake.Context)(ctx), data)
+}
+
+// final finalizes the Context, writes the tag to dst, and calls reset
+// on the Context.  This routine will panic if the dst length is less than
+// MIN_TAG_SIZE.
+final :: proc(ctx: ^Context, dst: []byte) {
+	assert(ctx.is_initialized)
+	defer reset(ctx)
+
+	if len(dst) < MIN_TAG_SIZE {
+		panic("crypto/kmac: invalid KMAC tag_size, too short")
+	}
+
+	_sha3.final_cshake(transmute(^_sha3.Context)(ctx), dst)
+}
+
+// clone clones the Context other into ctx.
+clone :: proc(ctx, other: ^Context) {
+	if ctx == other {
+		return
+	}
+
+	shake.clone(transmute(^shake.Context)(ctx), transmute(^shake.Context)(other))
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+	if !ctx.is_initialized {
+		return
+	}
+
+	shake.reset(transmute(^shake.Context)(ctx))
+}
+
+@(private)
+_init_kmac :: proc(ctx: ^Context, key, s: []byte, sec_strength: int) {
+	if ctx.is_initialized {
+		reset(ctx)
+	}
+
+	if len(key) < sec_strength / 8 {
+		panic("crypto/kmac: invalid KMAC key, too short")
+	}
+
+	ctx_ := transmute(^_sha3.Context)(ctx)
+	_sha3.init_cshake(ctx_, N_KMAC, s, sec_strength)
+	_sha3.bytepad(ctx_, [][]byte{key}, _sha3.rate_cshake(sec_strength))
+}
+
+@(private)
+N_KMAC := []byte{'K', 'M', 'A', 'C'}
--- a/core/crypto/legacy/keccak/keccak.odin
+++ b/core/crypto/legacy/keccak/keccak.odin
@@ -1,3 +1,11 @@
+/*
+package keccak implements the Keccak hash algorithm family.
+
+During the SHA-3 standardization process, the padding scheme was changed
+thus Keccac and SHA-3 produce different outputs.  Most users should use
+SHA-3 and/or SHAKE instead, however the legacy algorithm is provided for
+backward compatibility purposes.
+*/
 package keccak

 /*
@@ -6,372 +14,82 @@ package keccak

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
-
-    Interface for the Keccak hashing algorithm.
-    This is done because the padding in the SHA3 standard was changed by the NIST, resulting in a different output.
 */

-import "core:io"
-import "core:os"
-
 import "../../_sha3"

-/*
-    High level API
-*/
-
+// DIGEST_SIZE_224 is the Keccak-224 digest size.
 DIGEST_SIZE_224 :: 28
+// DIGEST_SIZE_256 is the Keccak-256 digest size.
 DIGEST_SIZE_256 :: 32
+// DIGEST_SIZE_384 is the Keccak-384 digest size.
 DIGEST_SIZE_384 :: 48
+// DIGEST_SIZE_512 is the Keccak-512 digest size.
 DIGEST_SIZE_512 :: 64

-// hash_string_224 will hash the given input and return the
-// computed hash
-hash_string_224 :: proc(data: string) -> [DIGEST_SIZE_224]byte {
-	return hash_bytes_224(transmute([]byte)(data))
-}
+// BLOCK_SIZE_224 is the Keccak-224 block size in bytes.
+BLOCK_SIZE_224 :: _sha3.RATE_224
+// BLOCK_SIZE_256 is the Keccak-256 block size in bytes.
+BLOCK_SIZE_256 :: _sha3.RATE_256
+// BLOCK_SIZE_384 is the Keccak-384 block size in bytes.
+BLOCK_SIZE_384 :: _sha3.RATE_384
+// BLOCK_SIZE_512 is the Keccak-512 block size in bytes.
+BLOCK_SIZE_512 :: _sha3.RATE_512

-// hash_bytes_224 will hash the given input and return the
-// computed hash
-hash_bytes_224 :: proc(data: []byte) -> [DIGEST_SIZE_224]byte {
-	hash: [DIGEST_SIZE_224]byte
-	ctx: Context
+// Context is a Keccak instance.
+Context :: distinct _sha3.Context
+
+// init_224 initializes a Context for Keccak-224.
+init_224 :: proc(ctx: ^Context) {
 	ctx.mdlen = DIGEST_SIZE_224
-	ctx.is_keccak = true
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash[:])
-	return hash
+	_init(ctx)
 }

-// hash_string_to_buffer_224 will hash the given input and assign the
-// computed hash to the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_string_to_buffer_224 :: proc(data: string, hash: []byte) {
-	hash_bytes_to_buffer_224(transmute([]byte)(data), hash)
-}
-
-// hash_bytes_to_buffer_224 will hash the given input and write the
-// computed hash into the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_bytes_to_buffer_224 :: proc(data, hash: []byte) {
-	ctx: Context
-	ctx.mdlen = DIGEST_SIZE_224
-	ctx.is_keccak = true
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash)
-}
-
-// hash_stream_224 will read the stream in chunks and compute a
-// hash from its contents
-hash_stream_224 :: proc(s: io.Stream) -> ([DIGEST_SIZE_224]byte, bool) {
-	hash: [DIGEST_SIZE_224]byte
-	ctx: Context
-	ctx.mdlen = DIGEST_SIZE_224
-	ctx.is_keccak = true
-	init(&ctx)
-
-	buf := make([]byte, 512)
-	defer delete(buf)
-
-	read := 1
-	for read > 0 {
-		read, _ = io.read(s, buf)
-		if read > 0 {
-			update(&ctx, buf[:read])
-		}
-	}
-	final(&ctx, hash[:])
-	return hash, true
-}
-
-// hash_file_224 will read the file provided by the given handle
-// and compute a hash
-hash_file_224 :: proc(hd: os.Handle, load_at_once := false) -> ([DIGEST_SIZE_224]byte, bool) {
-	if !load_at_once {
-		return hash_stream_224(os.stream_from_handle(hd))
-	} else {
-		if buf, ok := os.read_entire_file(hd); ok {
-			return hash_bytes_224(buf[:]), ok
-		}
-	}
-	return [DIGEST_SIZE_224]byte{}, false
-}
-
-hash_224 :: proc {
-	hash_stream_224,
-	hash_file_224,
-	hash_bytes_224,
-	hash_string_224,
-	hash_bytes_to_buffer_224,
-	hash_string_to_buffer_224,
-}
-
-// hash_string_256 will hash the given input and return the
-// computed hash
-hash_string_256 :: proc(data: string) -> [DIGEST_SIZE_256]byte {
-	return hash_bytes_256(transmute([]byte)(data))
-}
-
-// hash_bytes_256 will hash the given input and return the
-// computed hash
-hash_bytes_256 :: proc(data: []byte) -> [DIGEST_SIZE_256]byte {
-	hash: [DIGEST_SIZE_256]byte
-	ctx: Context
+// init_256 initializes a Context for Keccak-256.
+init_256 :: proc(ctx: ^Context) {
 	ctx.mdlen = DIGEST_SIZE_256
-	ctx.is_keccak = true
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash[:])
-	return hash
+	_init(ctx)
 }

-// hash_string_to_buffer_256 will hash the given input and assign the
-// computed hash to the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_string_to_buffer_256 :: proc(data: string, hash: []byte) {
-	hash_bytes_to_buffer_256(transmute([]byte)(data), hash)
-}
-
-// hash_bytes_to_buffer_256 will hash the given input and write the
-// computed hash into the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_bytes_to_buffer_256 :: proc(data, hash: []byte) {
-	ctx: Context
-	ctx.mdlen = DIGEST_SIZE_256
-	ctx.is_keccak = true
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash)
-}
-
-// hash_stream_256 will read the stream in chunks and compute a
-// hash from its contents
-hash_stream_256 :: proc(s: io.Stream) -> ([DIGEST_SIZE_256]byte, bool) {
-	hash: [DIGEST_SIZE_256]byte
-	ctx: Context
-	ctx.mdlen = DIGEST_SIZE_256
-	ctx.is_keccak = true
-	init(&ctx)
-
-	buf := make([]byte, 512)
-	defer delete(buf)
-
-	read := 1
-	for read > 0 {
-		read, _ = io.read(s, buf)
-		if read > 0 {
-			update(&ctx, buf[:read])
-		}
-	}
-	final(&ctx, hash[:])
-	return hash, true
-}
-
-// hash_file_256 will read the file provided by the given handle
-// and compute a hash
-hash_file_256 :: proc(hd: os.Handle, load_at_once := false) -> ([DIGEST_SIZE_256]byte, bool) {
-	if !load_at_once {
-		return hash_stream_256(os.stream_from_handle(hd))
-	} else {
-		if buf, ok := os.read_entire_file(hd); ok {
-			return hash_bytes_256(buf[:]), ok
-		}
-	}
-	return [DIGEST_SIZE_256]byte{}, false
-}
-
-hash_256 :: proc {
-	hash_stream_256,
-	hash_file_256,
-	hash_bytes_256,
-	hash_string_256,
-	hash_bytes_to_buffer_256,
-	hash_string_to_buffer_256,
-}
-
-// hash_string_384 will hash the given input and return the
-// computed hash
-hash_string_384 :: proc(data: string) -> [DIGEST_SIZE_384]byte {
-	return hash_bytes_384(transmute([]byte)(data))
-}
-
-// hash_bytes_384 will hash the given input and return the
-// computed hash
-hash_bytes_384 :: proc(data: []byte) -> [DIGEST_SIZE_384]byte {
-	hash: [DIGEST_SIZE_384]byte
-	ctx: Context
+// init_384 initializes a Context for Keccak-384.
+init_384 :: proc(ctx: ^Context) {
 	ctx.mdlen = DIGEST_SIZE_384
-	ctx.is_keccak = true
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash[:])
-	return hash
+	_init(ctx)
 }

-// hash_string_to_buffer_384 will hash the given input and assign the
-// computed hash to the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_string_to_buffer_384 :: proc(data: string, hash: []byte) {
-	hash_bytes_to_buffer_384(transmute([]byte)(data), hash)
-}
-
-// hash_bytes_to_buffer_384 will hash the given input and write the
-// computed hash into the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_bytes_to_buffer_384 :: proc(data, hash: []byte) {
-	ctx: Context
-	ctx.mdlen = DIGEST_SIZE_384
-	ctx.is_keccak = true
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash)
-}
-
-// hash_stream_384 will read the stream in chunks and compute a
-// hash from its contents
-hash_stream_384 :: proc(s: io.Stream) -> ([DIGEST_SIZE_384]byte, bool) {
-	hash: [DIGEST_SIZE_384]byte
-	ctx: Context
-	ctx.mdlen = DIGEST_SIZE_384
-	ctx.is_keccak = true
-	init(&ctx)
-
-	buf := make([]byte, 512)
-	defer delete(buf)
-
-	read := 1
-	for read > 0 {
-		read, _ = io.read(s, buf)
-		if read > 0 {
-			update(&ctx, buf[:read])
-		}
-	}
-	final(&ctx, hash[:])
-	return hash, true
-}
-
-// hash_file_384 will read the file provided by the given handle
-// and compute a hash
-hash_file_384 :: proc(hd: os.Handle, load_at_once := false) -> ([DIGEST_SIZE_384]byte, bool) {
-	if !load_at_once {
-		return hash_stream_384(os.stream_from_handle(hd))
-	} else {
-		if buf, ok := os.read_entire_file(hd); ok {
-			return hash_bytes_384(buf[:]), ok
-		}
-	}
-	return [DIGEST_SIZE_384]byte{}, false
-}
-
-hash_384 :: proc {
-	hash_stream_384,
-	hash_file_384,
-	hash_bytes_384,
-	hash_string_384,
-	hash_bytes_to_buffer_384,
-	hash_string_to_buffer_384,
-}
-
-// hash_string_512 will hash the given input and return the
-// computed hash
-hash_string_512 :: proc(data: string) -> [DIGEST_SIZE_512]byte {
-	return hash_bytes_512(transmute([]byte)(data))
-}
-
-// hash_bytes_512 will hash the given input and return the
-// computed hash
-hash_bytes_512 :: proc(data: []byte) -> [DIGEST_SIZE_512]byte {
-	hash: [DIGEST_SIZE_512]byte
-	ctx: Context
+// init_512 initializes a Context for Keccak-512.
+init_512 :: proc(ctx: ^Context) {
 	ctx.mdlen = DIGEST_SIZE_512
-	ctx.is_keccak = true
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash[:])
-	return hash
+	_init(ctx)
 }

-// hash_string_to_buffer_512 will hash the given input and assign the
-// computed hash to the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_string_to_buffer_512 :: proc(data: string, hash: []byte) {
-	hash_bytes_to_buffer_512(transmute([]byte)(data), hash)
-}
-
-// hash_bytes_to_buffer_512 will hash the given input and write the
-// computed hash into the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_bytes_to_buffer_512 :: proc(data, hash: []byte) {
-	ctx: Context
-	ctx.mdlen = DIGEST_SIZE_512
-	ctx.is_keccak = true
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash)
-}
-
-// hash_stream_512 will read the stream in chunks and compute a
-// hash from its contents
-hash_stream_512 :: proc(s: io.Stream) -> ([DIGEST_SIZE_512]byte, bool) {
-	hash: [DIGEST_SIZE_512]byte
-	ctx: Context
-	ctx.mdlen = DIGEST_SIZE_512
-	ctx.is_keccak = true
-	init(&ctx)
-
-	buf := make([]byte, 512)
-	defer delete(buf)
-
-	read := 1
-	for read > 0 {
-		read, _ = io.read(s, buf)
-		if read > 0 {
-			update(&ctx, buf[:read])
-		}
-	}
-	final(&ctx, hash[:])
-	return hash, true
-}
-
-// hash_file_512 will read the file provided by the given handle
-// and compute a hash
-hash_file_512 :: proc(hd: os.Handle, load_at_once := false) -> ([DIGEST_SIZE_512]byte, bool) {
-	if !load_at_once {
-		return hash_stream_512(os.stream_from_handle(hd))
-	} else {
-		if buf, ok := os.read_entire_file(hd); ok {
-			return hash_bytes_512(buf[:]), ok
-		}
-	}
-	return [DIGEST_SIZE_512]byte{}, false
-}
-
-hash_512 :: proc {
-	hash_stream_512,
-	hash_file_512,
-	hash_bytes_512,
-	hash_string_512,
-	hash_bytes_to_buffer_512,
-	hash_string_to_buffer_512,
-}
-
-/*
-    Low level API
-*/
-
-Context :: _sha3.Sha3_Context
-
-init :: proc(ctx: ^Context) {
-	ctx.is_keccak = true
-	_sha3.init(ctx)
+@(private)
+_init :: proc(ctx: ^Context) {
+	ctx.dsbyte = _sha3.DS_KECCAK
+	_sha3.init(transmute(^_sha3.Context)(ctx))
 }

+// update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
-	_sha3.update(ctx, data)
+	_sha3.update(transmute(^_sha3.Context)(ctx), data)
 }

-final :: proc(ctx: ^Context, hash: []byte) {
-	_sha3.final(ctx, hash)
+// final finalizes the Context, writes the digest to hash, and calls
+// reset on the Context.
+//
+// Iff finalize_clone is set, final will work on a copy of the Context,
+// which is useful for for calculating rolling digests.
+final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
+	_sha3.final(transmute(^_sha3.Context)(ctx), hash, finalize_clone)
+}
+
+// clone clones the Context other into ctx.
+clone :: proc(ctx, other: ^Context) {
+	_sha3.clone(transmute(^_sha3.Context)(ctx), transmute(^_sha3.Context)(other))
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+	_sha3.reset(transmute(^_sha3.Context)(ctx))
 }
--- a/core/crypto/legacy/md5/md5.odin
+++ b/core/crypto/legacy/md5/md5.odin
@@ -1,3 +1,13 @@
+/*
+package md5 implements the MD5 hash algorithm.
+
+WARNING: The MD5 algorithm is known to be insecure and should only be
+used for interoperating with legacy applications.
+
+See:
+- https://eprint.iacr.org/2005/075
+- https://datatracker.ietf.org/doc/html/rfc1321
+*/
 package md5

 /*
@@ -6,103 +16,29 @@ package md5

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
-
-    Implementation of the MD5 hashing algorithm, as defined in RFC 1321 <https://datatracker.ietf.org/doc/html/rfc1321>
 */

 import "core:encoding/endian"
-import "core:io"
 import "core:math/bits"
 import "core:mem"
-import "core:os"
-
-/*
-    High level API
-*/

+// DIGEST_SIZE is the MD5 digest size in bytes.
 DIGEST_SIZE :: 16

-// hash_string will hash the given input and return the
-// computed hash
-hash_string :: proc(data: string) -> [DIGEST_SIZE]byte {
-	return hash_bytes(transmute([]byte)(data))
+// BLOCK_SIZE is the MD5 block size in bytes.
+BLOCK_SIZE :: 64
+
+// Context is a MD5 instance.
+Context :: struct {
+	data:    [BLOCK_SIZE]byte,
+	state:   [4]u32,
+	bitlen:  u64,
+	datalen: u32,
+
+	is_initialized: bool,
 }

-// hash_bytes will hash the given input and return the
-// computed hash
-hash_bytes :: proc(data: []byte) -> [DIGEST_SIZE]byte {
-	hash: [DIGEST_SIZE]byte
-	ctx: Context
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash[:])
-	return hash
-}
-
-// hash_string_to_buffer will hash the given input and assign the
-// computed hash to the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_string_to_buffer :: proc(data: string, hash: []byte) {
-	hash_bytes_to_buffer(transmute([]byte)(data), hash)
-}
-
-// hash_bytes_to_buffer will hash the given input and write the
-// computed hash into the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_bytes_to_buffer :: proc(data, hash: []byte) {
-	ctx: Context
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash)
-}
-
-// hash_stream will read the stream in chunks and compute a
-// hash from its contents
-hash_stream :: proc(s: io.Stream) -> ([DIGEST_SIZE]byte, bool) {
-	hash: [DIGEST_SIZE]byte
-	ctx: Context
-	init(&ctx)
-
-	buf := make([]byte, 512)
-	defer delete(buf)
-
-	read := 1
-	for read > 0 {
-		read, _ = io.read(s, buf)
-		if read > 0 {
-			update(&ctx, buf[:read])
-		}
-	}
-	final(&ctx, hash[:])
-	return hash, true
-}
-
-// hash_file will read the file provided by the given handle
-// and compute a hash
-hash_file :: proc(hd: os.Handle, load_at_once := false) -> ([DIGEST_SIZE]byte, bool) {
-	if !load_at_once {
-		return hash_stream(os.stream_from_handle(hd))
-	} else {
-		if buf, ok := os.read_entire_file(hd); ok {
-			return hash_bytes(buf[:]), ok
-		}
-	}
-	return [DIGEST_SIZE]byte{}, false
-}
-
-hash :: proc {
-	hash_stream,
-	hash_file,
-	hash_bytes,
-	hash_string,
-	hash_bytes_to_buffer,
-	hash_string_to_buffer,
-}
-
-/*
-    Low level API
-*/
-
+// init initializes a Context.
 init :: proc(ctx: ^Context) {
 	ctx.state[0] = 0x67452301
 	ctx.state[1] = 0xefcdab89
@@ -115,6 +51,7 @@ init :: proc(ctx: ^Context) {
 	ctx.is_initialized = true
 }

+// update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
 	assert(ctx.is_initialized)

@@ -129,13 +66,26 @@ update :: proc(ctx: ^Context, data: []byte) {
 	}
 }

-final :: proc(ctx: ^Context, hash: []byte) {
+// final finalizes the Context, writes the digest to hash, and calls
+// reset on the Context.
+//
+// Iff finalize_clone is set, final will work on a copy of the Context,
+// which is useful for for calculating rolling digests.
+final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	assert(ctx.is_initialized)

 	if len(hash) < DIGEST_SIZE {
 		panic("crypto/md5: invalid destination digest size")
 	}

+	ctx := ctx
+	if finalize_clone {
+		tmp_ctx: Context
+		clone(&tmp_ctx, ctx)
+		ctx = &tmp_ctx
+	}
+	defer(reset(ctx))
+
 	i := ctx.datalen

 	if ctx.datalen < 56 {
@@ -163,25 +113,27 @@ final :: proc(ctx: ^Context, hash: []byte) {
 	for i = 0; i < DIGEST_SIZE / 4; i += 1 {
 		endian.unchecked_put_u32le(hash[i * 4:], ctx.state[i])
 	}
+}

-	ctx.is_initialized = false
+// clone clones the Context other into ctx.
+clone :: proc(ctx, other: ^$T) {
+	ctx^ = other^
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^$T) {
+	if !ctx.is_initialized {
+		return
+	}
+
+	mem.zero_explicit(ctx, size_of(ctx^))
 }

 /*
    MD5 implementation
 */

-BLOCK_SIZE :: 64
-
-Context :: struct {
-	data:    [BLOCK_SIZE]byte,
-	state:   [4]u32,
-	bitlen:  u64,
-	datalen: u32,
-
-	is_initialized: bool,
-}
-
 /*
    @note(zh): F, G, H and I, as mentioned in the RFC, have been inlined into FF, GG, HH
    and II respectively, instead of declaring them separately.
--- a/core/crypto/legacy/sha1/sha1.odin
+++ b/core/crypto/legacy/sha1/sha1.odin
@@ -1,3 +1,14 @@
+/*
+package sha1 implements the SHA1 hash algorithm.
+
+WARNING: The SHA1 algorithm is known to be insecure and should only be
+used for interoperating with legacy applications.
+
+See:
+- https://eprint.iacr.org/2017/190
+- https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
+- https://datatracker.ietf.org/doc/html/rfc3174
+*/
 package sha1

 /*
@@ -6,103 +17,30 @@ package sha1

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
-
-    Implementation of the SHA1 hashing algorithm, as defined in RFC 3174 <https://datatracker.ietf.org/doc/html/rfc3174>
 */

 import "core:encoding/endian"
-import "core:io"
 import "core:math/bits"
 import "core:mem"
-import "core:os"
-
-/*
-    High level API
-*/

+// DIGEST_SIZE is the SHA1 digest size in bytes.
 DIGEST_SIZE :: 20

-// hash_string will hash the given input and return the
-// computed hash
-hash_string :: proc(data: string) -> [DIGEST_SIZE]byte {
-	return hash_bytes(transmute([]byte)(data))
+// BLOCK_SIZE is the SHA1 block size in bytes.
+BLOCK_SIZE :: 64
+
+// Context is a SHA1 instance.
+Context :: struct {
+	data:    [BLOCK_SIZE]byte,
+	state:   [5]u32,
+	k:       [4]u32,
+	bitlen:  u64,
+	datalen: u32,
+
+	is_initialized: bool,
 }

-// hash_bytes will hash the given input and return the
-// computed hash
-hash_bytes :: proc(data: []byte) -> [DIGEST_SIZE]byte {
-	hash: [DIGEST_SIZE]byte
-	ctx: Context
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash[:])
-	return hash
-}
-
-// hash_string_to_buffer will hash the given input and assign the
-// computed hash to the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_string_to_buffer :: proc(data: string, hash: []byte) {
-	hash_bytes_to_buffer(transmute([]byte)(data), hash)
-}
-
-// hash_bytes_to_buffer will hash the given input and write the
-// computed hash into the second parameter.
-// It requires that the destination buffer is at least as big as the digest size
-hash_bytes_to_buffer :: proc(data, hash: []byte) {
-	ctx: Context
-	init(&ctx)
-	update(&ctx, data)
-	final(&ctx, hash)
-}
-
-// hash_stream will read the stream in chunks and compute a
-// hash from its contents
-hash_stream :: proc(s: io.Stream) -> ([DIGEST_SIZE]byte, bool) {
-	hash: [DIGEST_SIZE]byte
-	ctx: Context
-	init(&ctx)
-
-	buf := make([]byte, 512)
-	defer delete(buf)
-
-	read := 1
-	for read > 0 {
-		read, _ = io.read(s, buf)
-		if read > 0 {
-			update(&ctx, buf[:read])
-		}
-	}
-	final(&ctx, hash[:])
-	return hash, true
-}
-
-// hash_file will read the file provided by the given handle
-// and compute a hash
-hash_file :: proc(hd: os.Handle, load_at_once := false) -> ([DIGEST_SIZE]byte, bool) {
-	if !load_at_once {
-		return hash_stream(os.stream_from_handle(hd))
-	} else {
-		if buf, ok := os.read_entire_file(hd); ok {
-			return hash_bytes(buf[:]), ok
-		}
-	}
-	return [DIGEST_SIZE]byte{}, false
-}
-
-hash :: proc {
-	hash_stream,
-	hash_file,
-	hash_bytes,
-	hash_string,
-	hash_bytes_to_buffer,
-	hash_string_to_buffer,
-}
-
-/*
-    Low level API
-*/
-
+// init initializes a Context.
 init :: proc(ctx: ^Context) {
 	ctx.state[0] = 0x67452301
 	ctx.state[1] = 0xefcdab89
@@ -120,6 +58,7 @@ init :: proc(ctx: ^Context) {
 	ctx.is_initialized = true
 }

+// update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
 	assert(ctx.is_initialized)

@@ -134,13 +73,26 @@ update :: proc(ctx: ^Context, data: []byte) {
 	}
 }

-final :: proc(ctx: ^Context, hash: []byte) {
+// final finalizes the Context, writes the digest to hash, and calls
+// reset on the Context.
+//
+// Iff finalize_clone is set, final will work on a copy of the Context,
+// which is useful for for calculating rolling digests.
+final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	assert(ctx.is_initialized)

 	if len(hash) < DIGEST_SIZE {
 		panic("crypto/sha1: invalid destination digest size")
 	}

+	ctx := ctx
+	if finalize_clone {
+		tmp_ctx: Context
+		clone(&tmp_ctx, ctx)
+		ctx = &tmp_ctx
+	}
+	defer(reset(ctx))
+
 	i := ctx.datalen

 	if ctx.datalen < 56 {
@@ -168,26 +120,27 @@ final :: proc(ctx: ^Context, hash: []byte) {
 	for i = 0; i < DIGEST_SIZE / 4; i += 1 {
 		endian.unchecked_put_u32be(hash[i * 4:], ctx.state[i])
 	}
+}

-	ctx.is_initialized = false
+// clone clones the Context other into ctx.
+clone :: proc(ctx, other: ^$T) {
+	ctx^ = other^
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^$T) {
+	if !ctx.is_initialized {
+		return
+	}
+
+	mem.zero_explicit(ctx, size_of(ctx^))
 }

 /*
    SHA1 implementation
 */

-BLOCK_SIZE :: 64
-
-Context :: struct {
-	data:    [BLOCK_SIZE]byte,
-	datalen: u32,
-	bitlen:  u64,
-	state:   [5]u32,
-	k:       [4]u32,
-
-	is_initialized: bool,
-}
-
@(private)
 transform :: proc "contextless" (ctx: ^Context, data: []byte) {
 	a, b, c, d, e, i, t: u32
--- a/core/crypto/pbkdf2/pbkdf2.odin
+++ b/core/crypto/pbkdf2/pbkdf2.odin
@@ -0,0 +1,122 @@
+/*
+package pbkdf2 implements the PBKDF2 password-based key derivation function.
+
+See: https://www.rfc-editor.org/rfc/rfc2898
+*/
+package pbkdf2
+
+import "core:crypto/hash"
+import "core:crypto/hmac"
+import "core:encoding/endian"
+import "core:mem"
+
+// derive invokes PBKDF2-HMAC with the specified hash algorithm, password,
+// salt, iteration count, and outputs the derived key to dst.
+derive :: proc(
+	hmac_hash: hash.Algorithm,
+	password: []byte,
+	salt: []byte,
+	iterations: u32,
+	dst: []byte,
+) {
+	h_len := hash.DIGEST_SIZES[hmac_hash]
+
+	// 1. If dkLen > (2^32 - 1) * hLen, output "derived key too long"
+	// and stop.
+
+	dk_len := len(dst)
+	switch {
+	case dk_len == 0:
+		return
+	case u64(dk_len) > u64(max(u32)) * u64(h_len):
+		// This is so beyond anything that is practical or reasonable,
+		// so just panic instead of returning an error.
+		panic("crypto/pbkdf2: derived key too long")
+	case:
+	}
+
+	// 2. Let l be the number of hLen-octet blocks in the derived key,
+	// rounding up, and let r be the number of octets in the last block.
+
+	l := dk_len / h_len // Don't need to round up.
+	r := dk_len % h_len
+
+	// 3. For each block of the derived key apply the function F defined
+	// below to the password P, the salt S, the iteration count c, and
+	// the block index to compute the block.
+	//
+	// 4. Concatenate the blocks and extract the first dkLen octets to
+	// produce a derived key DK.
+	//
+	// 5. Output the derived key DK.
+
+	// Each iteration of F is always `PRF (P, ...)`, so instantiate the
+	// PRF, and clone since memcpy is faster than having to re-initialize
+	// HMAC repeatedly.
+
+	base: hmac.Context
+	defer hmac.reset(&base)
+
+	hmac.init(&base, hmac_hash, password)
+
+	// Process all of the blocks that will be written directly to dst.
+	dst_blk := dst
+	for i in 1 ..= l { 	// F expects i starting at 1.
+		_F(&base, salt, iterations, u32(i), dst_blk[:h_len])
+		dst_blk = dst_blk[h_len:]
+	}
+
+	// Instead of rounding l up, just proceass the one extra block iff
+	// r != 0.
+	if r > 0 {
+		tmp: [hash.MAX_DIGEST_SIZE]byte
+		blk := tmp[:h_len]
+		defer mem.zero_explicit(raw_data(blk), h_len)
+
+		_F(&base, salt, iterations, u32(l + 1), blk)
+		copy(dst_blk, blk)
+	}
+}
+
+@(private)
+_F :: proc(base: ^hmac.Context, salt: []byte, c: u32, i: u32, dst_blk: []byte) {
+	h_len := len(dst_blk)
+
+	tmp: [hash.MAX_DIGEST_SIZE]byte
+	u := tmp[:h_len]
+	defer mem.zero_explicit(raw_data(u), h_len)
+
+	// F (P, S, c, i) = U_1 \xor U_2 \xor ... \xor U_c
+	//
+	// where
+	//
+	// U_1 = PRF (P, S || INT (i)) ,
+	// U_2 = PRF (P, U_1) ,
+	// ...
+	// U_c = PRF (P, U_{c-1}) .
+	//
+	// Here, INT (i) is a four-octet encoding of the integer i, most
+	// significant octet first.
+
+	prf: hmac.Context
+
+	// U_1: PRF (P, S || INT (i))
+	hmac.clone(&prf, base)
+	hmac.update(&prf, salt)
+	endian.unchecked_put_u32be(u, i) // Use u as scratch space.
+	hmac.update(&prf, u[:4])
+	hmac.final(&prf, u)
+	copy(dst_blk, u)
+
+	// U_2 ... U_c: U_n = PRF (P, U_(n-1))
+	for _ in 1 ..< c {
+		hmac.clone(&prf, base)
+		hmac.update(&prf, u)
+		hmac.final(&prf, u)
+
+		// XOR dst_blk and u.
+		for v, i in u {
+			dst_blk[i] ~= v
+		}
+	}
+}
--- a/Show More
+++ b/Show More