Merge branch 'master' into bill/feature-using-stmt

2026-06-04 09:44:40 +00:00 · 2026-01-30 10:49:55 +00:00
parent 8b745c3909 5a21213fa5
commit 19b545e7cb
350 changed files with 41068 additions and 9020 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,7 +6,7 @@ jobs:
    name: NetBSD Build, Check, and Test
    runs-on: ubuntu-latest
    env:
-      PKGSRC_BRANCH: 2025Q2
+      PKGSRC_BRANCH: 2025Q4
    steps:
    - uses: actions/checkout@v4
    - name: Build, Check, and Test
@@ -156,18 +156,14 @@ jobs:

      - name: Check benchmarks
        run: ./odin check tests/benchmark -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -no-entry-point
+
      - name: Odin check examples/all for Linux i386
        if: matrix.os == 'ubuntu-latest'
        run: ./odin check examples/all -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -target:linux_i386
-      - name: Odin check examples/all for Linux arm64
-        if: matrix.os == 'ubuntu-latest'
-        run: ./odin check examples/all -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -target:linux_arm64
-      - name: Odin check examples/all for FreeBSD amd64
-        if: matrix.os == 'ubuntu-latest'
-        run: ./odin check examples/all -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -target:freebsd_amd64
      - name: Odin check examples/all for OpenBSD amd64
        if: matrix.os == 'ubuntu-latest'
        run: ./odin check examples/all -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -target:openbsd_amd64
+
      - name: Odin check examples/all for js_wasm32
        if: matrix.os == 'ubuntu-latest'
        run: ./odin check examples/all -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -no-entry-point -target:js_wasm32
@@ -178,12 +174,6 @@ jobs:
      - name: Odin check examples/all/sdl3 for Linux i386
        if: matrix.os == 'ubuntu-latest'
        run: ./odin check examples/all/sdl3 -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -no-entry-point -target:linux_i386
-      - name: Odin check examples/all/sdl3 for Linux arm64
-        if: matrix.os == 'ubuntu-latest'
-        run: ./odin check examples/all/sdl3 -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -no-entry-point -target:linux_arm64
-      - name: Odin check examples/all/sdl3 for FreeBSD amd64
-        if: matrix.os == 'ubuntu-latest'
-        run: ./odin check examples/all/sdl3 -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -no-entry-point -target:freebsd_amd64
      - name: Odin check examples/all/sdl3 for OpenBSD amd64
        if: matrix.os == 'ubuntu-latest'
        run: ./odin check examples/all/sdl3 -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -no-entry-point -target:openbsd_amd64
--- a/base/runtime/core.odin
+++ b/base/runtime/core.odin
@@ -122,6 +122,7 @@ Type_Info_Struct_Flag :: enum u8 {
 	raw_union   = 1,
 	all_or_none = 2,
 	align       = 3,
+	simple      = 4,
 }

 Type_Info_Struct :: struct {
--- a/base/runtime/core_builtin.odin
+++ b/base/runtime/core_builtin.odin
@@ -360,7 +360,7 @@ new_aligned :: proc($T: typeid, alignment: int, allocator := context.allocator,

@(builtin, require_results)
 new_clone :: proc(data: $T, allocator := context.allocator, loc := #caller_location) -> (t: ^T, err: Allocator_Error) #optional_allocator_error {
-	t = (^T)(raw_data(mem_alloc_bytes(size_of(T), align_of(T), allocator, loc) or_return))
+	t = (^T)(raw_data(mem_alloc_non_zeroed(size_of(T), align_of(T), allocator, loc) or_return))
 	if t != nil {
 		t^ = data
 	}
@@ -433,7 +433,6 @@ _make_dynamic_array_len_cap :: proc(array: ^Raw_Dynamic_Array, size_of_elem, ali
 	array.data = raw_data(data)
 	array.len = 0 if use_zero else len
 	array.cap = 0 if use_zero else cap
-	array.allocator = allocator
 	return
 }

--- a/base/runtime/error_checks.odin
+++ b/base/runtime/error_checks.odin
@@ -12,7 +12,7 @@ bounds_trap :: proc "contextless" () -> ! {
 }

@(no_instrumentation)
-type_assertion_trap :: proc "contextless" () -> ! {
+type_assertion_trap_contextless :: proc "contextless" () -> ! {
 	when ODIN_OS == .Windows {
 		windows_trap_type_assertion()
 	} else when ODIN_OS == .Orca {
@@ -137,20 +137,22 @@ matrix_bounds_check_error :: proc "contextless" (file: string, line, column: i32


 when ODIN_NO_RTTI {
-	type_assertion_check :: proc "contextless" (ok: bool, file: string, line, column: i32) {
+	// Type assertion check for callers that have an Odin `context` (ODIN_NO_RTTI build).
+	// Does nothing when `ok` is true. On failure the fixed message "Invalid type assertion"
+	// and the file/line/column location are handed to `context.assertion_failure_proc`,
+	// falling back to `default_assertion_failure_proc` when the context has none set.
+	type_assertion_check_with_context :: proc "odin" (ok: bool, file: string, line, column: i32) {
 		if ok {
 			return
 		}
 		@(cold, no_instrumentation)
-		handle_error :: proc "contextless" (file: string, line, column: i32) -> ! {
-			print_caller_location(Source_Code_Location{file, line, column, ""})
-			print_string(" Invalid type assertion\n")
-			type_assertion_trap()
+		handle_error :: proc "odin" (file: string, line, column: i32) -> ! {
+			p := context.assertion_failure_proc
+			if p == nil {
+				p = default_assertion_failure_proc
+			}
+			p("type assertion", "Invalid type assertion", Source_Code_Location{file, line, column, ""})
 		}
 		handle_error(file, line, column)
 	}

-	type_assertion_check2 :: proc "contextless" (ok: bool, file: string, line, column: i32) {
+	type_assertion_check_contextless :: proc "contextless" (ok: bool, file: string, line, column: i32) {
 		if ok {
 			return
 		}
@@ -158,12 +160,71 @@ when ODIN_NO_RTTI {
 		handle_error :: proc "contextless" (file: string, line, column: i32) -> ! {
 			print_caller_location(Source_Code_Location{file, line, column, ""})
 			print_string(" Invalid type assertion\n")
-			type_assertion_trap()
+			type_assertion_trap_contextless()
+		}
+		handle_error(file, line, column)
+	}
+
+	// `check2` flavour of the context-aware type assertion check (ODIN_NO_RTTI build).
+	// Returns immediately when `ok` is true; otherwise forwards the fixed message
+	// "Invalid type assertion" plus the source location to `context.assertion_failure_proc`
+	// (or to `default_assertion_failure_proc` when that field is nil).
+	type_assertion_check2_with_context :: proc "odin" (ok: bool, file: string, line, column: i32) {
+		if ok {
+			return
+		}
+		@(cold, no_instrumentation)
+		handle_error :: proc "odin" (file: string, line, column: i32) -> ! {
+			p := context.assertion_failure_proc
+			if p == nil {
+				p = default_assertion_failure_proc
+			}
+			p("type assertion", "Invalid type assertion", Source_Code_Location{file, line, column, ""})
+		}
+
+		handle_error(file, line, column)
+	}
+
+	// `check2` flavour of the type assertion check for "contextless" callers (ODIN_NO_RTTI build).
+	// No `context` is available here, so a failure is reported by printing the caller
+	// location followed by " Invalid type assertion\n" and then calling
+	// `type_assertion_trap_contextless`, which does not return.
+	type_assertion_check2_contextless :: proc "contextless" (ok: bool, file: string, line, column: i32) {
+		if ok {
+			return
+		}
+		@(cold, no_instrumentation)
+		handle_error :: proc "contextless" (file: string, line, column: i32) -> ! {
+			print_caller_location(Source_Code_Location{file, line, column, ""})
+			print_string(" Invalid type assertion\n")
+			type_assertion_trap_contextless()
 		}
 		handle_error(file, line, column)
 	}
 } else {
-	type_assertion_check :: proc "contextless" (ok: bool, file: string, line, column: i32, from, to: typeid) {
+	// Size in bytes of the stack buffer in which the context-aware type assertion
+	// handlers format their failure message before passing it to the assertion failure procedure.
+	@(private="file")
+	TYPE_ASSERTION_BUFFER_SIZE :: 1024
+
+	// Type assertion check for callers that have an Odin `context` (RTTI build).
+	// Does nothing when `ok` is true. On failure it formats
+	// "Invalid type assertion from <from> to <to>" into a fixed-size stack buffer and
+	// passes it, with the source location, to `context.assertion_failure_proc`
+	// (or `default_assertion_failure_proc` when that field is nil).
+	type_assertion_check_with_context :: proc "odin" (ok: bool, file: string, line, column: i32, from, to: typeid) {
+		if ok {
+			return
+		}
+		@(cold, no_instrumentation)
+		handle_error :: proc "odin" (file: string, line, column: i32, from, to: typeid) -> ! {
+			// Builds the message piece by piece; stops at the first write that reports a full buffer.
+			// `file`, `line` and `column` are accepted but not used in the message itself.
+			do_msg :: proc "contextless" (i: ^int, buf: []byte, file: string, line, column: i32, from, to: typeid) -> bool {
+				write_string(i, buf, "Invalid type assertion from ") or_return
+				write_typeid(i, buf, from)                           or_return
+				write_string(i, buf, " to ")                         or_return
+				write_typeid(i, buf, to)                             or_return
+				return true
+			}
+
+			buf: [TYPE_ASSERTION_BUFFER_SIZE]byte
+			i := 0
+			// The result is deliberately discarded: whatever fitted into `buf` is still reported.
+			_ = do_msg(&i, buf[:], file, line, column, from, to)
+
+			p := context.assertion_failure_proc
+			if p == nil {
+				p = default_assertion_failure_proc
+			}
+			p("type assertion", string(buf[:i]), Source_Code_Location{file, line, column, ""})
+		}
+		handle_error(file, line, column, from, to)
+	}
+
+	type_assertion_check_contextless :: proc "contextless" (ok: bool, file: string, line, column: i32, from, to: typeid) {
 		if ok {
 			return
 		}
@@ -175,47 +236,90 @@ when ODIN_NO_RTTI {
 			print_string(" to ")
 			print_typeid(to)
 			print_byte('\n')
-			type_assertion_trap()
+			type_assertion_trap_contextless()
 		}
 		handle_error(file, line, column, from, to)
 	}

-	type_assertion_check2 :: proc "contextless" (ok: bool, file: string, line, column: i32, from, to: typeid, from_data: rawptr) {
+	// Determines the typeid actually stored in a value of static type `id` located at `data`.
+	//
+	// Returns:
+	// - `id` unchanged when `id` or `data` is nil, when the base type is neither `any`
+	//   nor a union, or when a union tag lies outside the variant list
+	// - the stored typeid for an `any`
+	// - for a union, the typeid of the active variant, or nil when no variant is set
+	@(private="file")
+	type_assertion_variant_type :: proc "contextless" (id: typeid, data: rawptr) -> typeid {
+		if id == nil || data == nil {
+			return id
+		}
+		ti := type_info_base(type_info_of(id))
+		#partial switch v in ti.variant {
+		case Type_Info_Any:
+			return (^any)(data).id
+		case Type_Info_Union:
+			// Union without a tag: the payload is read as a pointer and a nil pointer
+			// means "no value". NOTE(review): presumably this is the single-pointer-variant
+			// union layout - confirm against the compiler.
+			if v.tag_type == nil {
+				if (^rawptr)(data)^ == nil {
+					return nil
+				}
+				return v.variants[0].id
+
+			}
+
+			// Tagged union: read the tag at `tag_offset` using the tag's size.
+			// Unless the union is `#no_nil`, the tag is shifted down by one so that
+			// a stored tag of 0 yields idx == -1 (reported as nil below).
+			tag_ptr := uintptr(data) + v.tag_offset
+			idx := 0
+			switch v.tag_type.size {
+			case 1:  idx = int(  (^u8)(tag_ptr)^); if !v.no_nil { idx -= 1 }
+			case 2:  idx = int( (^u16)(tag_ptr)^); if !v.no_nil { idx -= 1 }
+			case 4:  idx = int( (^u32)(tag_ptr)^); if !v.no_nil { idx -= 1 }
+			case 8:  idx = int( (^u64)(tag_ptr)^); if !v.no_nil { idx -= 1 }
+			case 16: idx = int((^u128)(tag_ptr)^); if !v.no_nil { idx -= 1 }
+			}
+			if idx < 0 {
+				return nil
+			} else if idx < len(v.variants) {
+				return v.variants[idx].id
+			}
+		}
+		return id
+	}
+
+	// `check2` flavour of the context-aware type assertion check (RTTI build).
+	// Does nothing when `ok` is true. On failure it resolves the type actually stored at
+	// `from_data` via `type_assertion_variant_type`, formats
+	// "Invalid type assertion from <from> to <to>[, actual type: <actual>]" into a
+	// fixed-size stack buffer and passes it to `context.assertion_failure_proc`
+	// (or `default_assertion_failure_proc` when that field is nil).
+	type_assertion_check2_with_context :: proc "odin" (ok: bool, file: string, line, column: i32, from, to: typeid, from_data: rawptr) {
 		if ok {
 			return
 		}

-		variant_type :: proc "contextless" (id: typeid, data: rawptr) -> typeid {
-			if id == nil || data == nil {
-				return id
-			}
-			ti := type_info_base(type_info_of(id))
-			#partial switch v in ti.variant {
-			case Type_Info_Any:
-				return (^any)(data).id
-			case Type_Info_Union:
-				tag_ptr := uintptr(data) + v.tag_offset
-				idx := 0
-				switch v.tag_type.size {
-				case 1:  idx = int((^u8)(tag_ptr)^)   - 1
-				case 2:  idx = int((^u16)(tag_ptr)^)  - 1
-				case 4:  idx = int((^u32)(tag_ptr)^)  - 1
-				case 8:  idx = int((^u64)(tag_ptr)^)  - 1
-				case 16: idx = int((^u128)(tag_ptr)^) - 1
-				}
-				if idx < 0 {
-					return nil
-				} else if idx < len(v.variants) {
-					return v.variants[idx].id
+		@(cold, no_instrumentation)
+		handle_error :: proc "odin" (file: string, line, column: i32, from, to: typeid, from_data: rawptr) -> ! {
+			// Builds the message piece by piece; stops at the first write that reports a full buffer.
+			do_msg :: proc "contextless" (i: ^int, buf: []byte, file: string, line, column: i32, from, to, actual: typeid) -> bool {
+				write_string(i, buf, "Invalid type assertion from ") or_return
+				write_typeid(i, buf, from)                           or_return
+				write_string(i, buf, " to ")                         or_return
+				write_typeid(i, buf, to)                             or_return
+				// Only mention the stored type when it adds information.
+				if actual != from {
+					write_string(i, buf, ", actual type: ") or_return
+					write_typeid(i, buf, actual)            or_return
 				}
+				return true
 			}
-			return id
+
+			actual := type_assertion_variant_type(from, from_data)
+
+			buf: [TYPE_ASSERTION_BUFFER_SIZE]byte
+			i := 0
+			// The result is deliberately discarded: whatever fitted into `buf` is still reported.
+			_ = do_msg(&i, buf[:], file, line, column, from, to, actual)
+
+			p := context.assertion_failure_proc
+			if p == nil {
+				p = default_assertion_failure_proc
+			}
+			p("type assertion", string(buf[:i]), Source_Code_Location{file, line, column, ""})
+		}
+		handle_error(file, line, column, from, to, from_data)
+	}
+
+	type_assertion_check2_contextless :: proc "contextless" (ok: bool, file: string, line, column: i32, from, to: typeid, from_data: rawptr) {
+		if ok {
+			return
 		}

 		@(cold, no_instrumentation)
 		handle_error :: proc "contextless" (file: string, line, column: i32, from, to: typeid, from_data: rawptr) -> ! {

-			actual := variant_type(from, from_data)
+			actual := type_assertion_variant_type(from, from_data)

 			print_caller_location(Source_Code_Location{file, line, column, ""})
 			print_string(" Invalid type assertion from ")
@@ -227,7 +331,7 @@ when ODIN_NO_RTTI {
 				print_typeid(actual)
 			}
 			print_byte('\n')
-			type_assertion_trap()
+			type_assertion_trap_contextless()
 		}
 		handle_error(file, line, column, from, to, from_data)
 	}
--- a/base/runtime/print.odin
+++ b/base/runtime/print.odin
@@ -280,11 +280,22 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 			print_byte('i' if info.signed else 'u')
 			print_u64(u64(8*ti.size))
 		}
+		switch info.endianness {
+		case .Platform: // nothing
+		case .Little:   print_string("le")
+		case .Big:      print_string("be")
+		}
+
 	case Type_Info_Rune:
 		print_string("rune")
 	case Type_Info_Float:
 		print_byte('f')
 		print_u64(u64(8*ti.size))
+		switch info.endianness {
+		case .Platform: // nothing
+		case .Little:   print_string("le")
+		case .Big:      print_string("be")
+		}
 	case Type_Info_Complex:
 		print_string("complex")
 		print_u64(u64(8*ti.size))
@@ -410,6 +421,7 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		if .packed      in info.flags { print_string("#packed ") }
 		if .raw_union   in info.flags { print_string("#raw_union ") }
 		if .all_or_none in info.flags { print_string("#all_or_none ") }
+		if .simple      in info.flags { print_string("#simple ") }
 		if .align in info.flags {
 			print_string("#align(")
 			print_u64(u64(ti.align))
@@ -494,6 +506,9 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		print_type(info.elem)
 		
 	case Type_Info_Matrix:
+		if info.layout == .Row_Major {
+			print_string("#row_major ")
+		}
 		print_string("matrix[")
 		print_u64(u64(info.row_count))
 		print_string(", ")
@@ -502,3 +517,419 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		print_type(info.elem)
 	}
 }
+
+
+// Copies `src` into `dst` starting at offset `i^` and advances `i^` by the number of
+// bytes actually copied.
+//
+// Returns false, writing nothing, when the buffer is already full (`i^ >= len(dst)`).
+// A write that only partially fits is truncated and still returns true.
+@(require_results)
+write_string :: proc "contextless" (i: ^int, dst: []byte, src: string) -> bool {
+	if i^ < len(dst) {
+		i^ += copy(dst[i^:], src)
+		return true
+	}
+	return false
+}
+
+
+// Stores the single byte `src` at `dst[i^]` and advances `i^` by one.
+// Returns false, writing nothing, when `i^` is already at or past the end of `dst`.
+@(require_results)
+write_byte :: proc "contextless" (i: ^int, dst: []byte, src: byte) -> bool {
+	if i^ < len(dst) {
+		dst[i^] = src
+		i^ += 1
+		return true
+	}
+	return false
+}
+
+
+// Writes `x` in base 10 into `dst` at offset `j^`, advancing `j^` via `write_string`.
+// Returns false without formatting anything when the buffer is already full.
+@(require_results)
+write_u64 :: proc "contextless" (j: ^int, dst: []byte, x: u64) -> bool {
+	if j^ < len(dst) {
+		b :: u64(10)
+		u := x
+
+		// Scratch buffer filled from the end towards the front, least significant digit first.
+		a: [129]byte
+		i := len(a)
+		for u >= b {
+			i -= 1; a[i] = _INTEGER_DIGITS_VAR[u % b]
+			u /= b
+		}
+		// Final (most significant) digit; also covers x == 0.
+		i -= 1; a[i] = _INTEGER_DIGITS_VAR[u % b]
+
+		return write_string(j, dst, string(a[i:]))
+	}
+	return false
+}
+
+// Writes `x` in base 10, with a leading '-' for negative values, into `dst` at
+// offset `j^`, advancing `j^` via `write_string`.
+// Returns false without formatting anything when the buffer is already full.
+@(require_results)
+write_i64 :: proc "contextless" (j: ^int, dst: []byte, x: i64) -> bool {
+	if j^ < len(dst) {
+		b :: u64(10)
+		u := u64(abs(x))
+		neg := x < 0
+
+		// Scratch buffer filled from the end towards the front, least significant digit first.
+		a: [129]byte
+		i := len(a)
+		for u >= b {
+			i -= 1; a[i] = _INTEGER_DIGITS_VAR[u % b]
+			u /= b
+		}
+		i -= 1; a[i] = _INTEGER_DIGITS_VAR[u % b]
+		// The sign is placed last because the buffer is built back to front.
+		if neg {
+			i -= 1; a[i] = '-'
+		}
+
+		return write_string(j, dst, string(a[i:]))
+	}
+	return false
+}
+
+
+// Writes `loc` into `buf` at offset `i^` in the format selected by ODIN_ERROR_POS_STYLE:
+// - .Default: `path(line:column)`
+// - .Unix:    `path:line:column:`
+// The `:column` part is omitted when `loc.column` is 0.
+// Returns false as soon as one of the underlying writes reports a full buffer.
+@(require_results)
+write_caller_location :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, loc: Source_Code_Location) -> bool {
+	write_string(i, buf, loc.file_path) or_return
+
+	when ODIN_ERROR_POS_STYLE == .Default {
+		write_byte(i, buf, '(')           or_return
+		write_u64 (i, buf, u64(loc.line)) or_return
+		if loc.column != 0 {
+			write_byte(i, buf, ':')             or_return
+			write_u64 (i, buf, u64(loc.column)) or_return
+		}
+		write_byte(i, buf, ')') or_return
+		return true
+	} else when ODIN_ERROR_POS_STYLE == .Unix {
+		write_byte(i, buf, ':')           or_return
+		write_u64 (i, buf, u64(loc.line)) or_return
+		if loc.column != 0 {
+			write_byte(i, buf, ':')             or_return
+			write_u64 (i, buf, u64(loc.column)) or_return
+		}
+		write_byte(i, buf, ':') or_return
+		return true
+	} else {
+		#panic("unhandled ODIN_ERROR_POS_STYLE")
+	}
+}
+
+// Writes a textual form of `id` into `buf` at offset `i^`.
+// A nil typeid is written as "nil". Without RTTI (ODIN_NO_RTTI) any other typeid is
+// written as "<unknown type>"; with RTTI the type description from `write_write_type` is used.
+// Returns false when an underlying write reports a full buffer.
+@(require_results)
+write_typeid :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, id: typeid) -> bool {
+	when ODIN_NO_RTTI {
+		if id == nil {
+			write_string(i, buf, "nil") or_return
+		} else {
+			write_string(i, buf, "<unknown type>") or_return
+		}
+	} else {
+		if id == nil {
+			write_string(i, buf, "nil") or_return
+		} else {
+			ti := type_info_of(id)
+			write_write_type(i, buf, ti) or_return
+		}
+	}
+	return true
+}
+
+
+// Writes the rune `r` into `buf` at offset `i^`.
+// Values below 0x80 are written as a single byte; everything else goes through
+// `encode_rune` and is written as the resulting byte sequence.
+//
+// Returns the number of bytes by which `i^` advanced and whether the write was accepted.
+// `written` is measured from `i^`, so it reflects any truncation done by `write_string`.
+@(require_results)
+write_rune :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, r: rune) -> (written: int, ok: bool) #no_bounds_check {
+	RUNE_SELF :: 0x80
+
+	if r < RUNE_SELF {
+		write_byte(i, buf,byte(r)) or_return
+		return 1, true
+	}
+
+	b, n := encode_rune(r)
+	prev := i^
+	write_string(i, buf, string(b[:n])) or_return
+	return i^ - prev, true
+}
+
+// Writes `r` as a single-quoted rune literal into `buf` at offset `i^`.
+// The listed control characters use their backslash escapes, other values below 32 are
+// written as `\xNN`, values <= 0 as `\x00`, and everything else is written via `write_rune`.
+// Returns false as soon as an underlying write reports a full buffer.
+@(require_results)
+write_encoded_rune :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, r: rune) -> bool {
+	write_byte(i, buf, '\'') or_return
+
+	switch r {
+	case '\a': write_string(i, buf, "\\a") or_return
+	case '\b': write_string(i, buf, "\\b") or_return
+	case '\e': write_string(i, buf, "\\e") or_return
+	case '\f': write_string(i, buf, "\\f") or_return
+	case '\n': write_string(i, buf, "\\n") or_return
+	case '\r': write_string(i, buf, "\\r") or_return
+	case '\t': write_string(i, buf, "\\t") or_return
+	case '\v': write_string(i, buf, "\\v") or_return
+	case:
+		if r <= 0 {
+			write_string(i, buf, "\\x00") or_return
+		} else if r < 32 {
+			// Two hex digits: high nibble, then low nibble.
+			n0, n1 := u8(r) >> 4, u8(r) & 0xf
+			write_string(i, buf, "\\x")                   or_return
+			write_byte  (i, buf, _INTEGER_DIGITS_VAR[n0]) or_return
+			write_byte  (i, buf, _INTEGER_DIGITS_VAR[n1]) or_return
+		} else {
+			_ = write_rune(i, buf, r) or_return
+		}
+	}
+
+	write_byte(i, buf, '\'') or_return
+	return true
+}
+
+// Writes a textual description of the type described by `ti` into `buf` at offset `i^`,
+// recursing into element, field, parameter and variant types as needed.
+// A nil `ti` is written as "nil". Named types are written by name only.
+//
+// Returns false as soon as an underlying write reports a full buffer; whatever was
+// written before that point stays in `buf`.
+// NOTE(review): this appears to be the buffer-writing counterpart of `print_type` - confirm
+// the two are meant to stay in sync.
+@(optimization_mode="favor_size")
+write_write_type :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, ti: ^Type_Info) -> bool {
+	if ti == nil {
+		write_string(i, buf, "nil") or_return
+		return true
+	}
+
+	switch info in ti.variant {
+	case Type_Info_Named:
+		write_string(i, buf, info.name) or_return
+	case Type_Info_Integer:
+		switch ti.id {
+		case int:     write_string(i, buf, "int")     or_return
+		case uint:    write_string(i, buf, "uint")    or_return
+		case uintptr: write_string(i, buf, "uintptr") or_return
+		case:
+			// Sized integer: sign prefix followed by the width in bits.
+			write_byte(i, buf, 'i' if info.signed else 'u') or_return
+			write_u64 (i, buf, u64(8*ti.size))              or_return
+		}
+		switch info.endianness {
+		case .Platform: // nothing
+		case .Little:   write_string(i, buf, "le") or_return
+		case .Big:      write_string(i, buf, "be") or_return
+		}
+
+	case Type_Info_Rune:
+		write_string(i, buf, "rune") or_return
+	case Type_Info_Float:
+		write_byte(i, buf, 'f') or_return
+		write_u64(i, buf, u64(8*ti.size)) or_return
+		switch info.endianness {
+		case .Platform: // nothing
+		case .Little:   write_string(i, buf, "le") or_return
+		case .Big:      write_string(i, buf, "be") or_return
+		}
+
+	case Type_Info_Complex:
+		write_string(i, buf, "complex")      or_return
+		write_u64   (i, buf, u64(8*ti.size)) or_return
+	case Type_Info_Quaternion:
+		write_string(i, buf, "quaternion")   or_return
+		write_u64   (i, buf, u64(8*ti.size)) or_return
+	case Type_Info_String:
+		if info.is_cstring {
+			write_byte(i, buf, 'c') or_return
+		}
+		write_string(i, buf, "string") or_return
+		switch info.encoding {
+		case .UTF_8:  /**/
+		case .UTF_16: write_string(i, buf, "16") or_return
+		}
+	case Type_Info_Boolean:
+		switch ti.id {
+		case bool: write_string(i, buf, "bool") or_return
+		case:
+			write_byte(i, buf, 'b')            or_return
+			write_u64 (i, buf, u64(8*ti.size)) or_return
+		}
+	case Type_Info_Any:
+		write_string(i, buf, "any") or_return
+	case Type_Info_Type_Id:
+		write_string(i, buf, "typeid") or_return
+
+	case Type_Info_Pointer:
+		// A pointer without element type information is written as `rawptr`.
+		if info.elem == nil {
+			write_string(i, buf, "rawptr") or_return
+		} else {
+			write_string    (i, buf, "^")       or_return
+			write_write_type(i, buf, info.elem) or_return
+		}
+	case Type_Info_Multi_Pointer:
+		write_string    (i, buf, "[^]")     or_return
+		write_write_type(i, buf, info.elem) or_return
+	case Type_Info_Soa_Pointer:
+		write_string    (i, buf, "#soa ^")  or_return
+		write_write_type(i, buf, info.elem) or_return
+	case Type_Info_Procedure:
+		write_string(i, buf, "proc") or_return
+		if info.params == nil {
+			write_string(i, buf, "()") or_return
+		} else {
+			// Parameter types only; parameter names are not written here.
+			t := info.params.variant.(Type_Info_Parameters)
+			write_byte(i, buf, '(') or_return
+			for t, j in t.types {
+				if j > 0 { write_string(i, buf, ", ") or_return }
+				write_write_type(i, buf, t) or_return
+			}
+			write_string(i, buf, ")") or_return
+		}
+		if info.results != nil {
+			write_string    (i, buf, " -> ")       or_return
+			write_write_type(i, buf, info.results) or_return
+		}
+	case Type_Info_Parameters:
+		// Parentheses are omitted when there is exactly one entry.
+		count := len(info.names)
+		if count != 1 { write_byte(i, buf, '(') or_return }
+		for name, j in info.names {
+			if j > 0 { write_string(i, buf, ", ") or_return }
+
+			t := info.types[j]
+
+			if len(name) > 0 {
+				write_string(i, buf, name) or_return
+				write_string(i, buf, ": ") or_return
+			}
+			write_write_type(i, buf, t) or_return
+		}
+		if count != 1 { write_string(i, buf, ")") or_return }
+
+	case Type_Info_Array:
+		write_byte      (i, buf, '[')             or_return
+		write_u64       (i, buf, u64(info.count)) or_return
+		write_byte      (i, buf, ']')             or_return
+		write_write_type(i, buf, info.elem)       or_return
+
+	case Type_Info_Enumerated_Array:
+		if info.is_sparse {
+			write_string(i, buf, "#sparse") or_return
+		}
+		write_byte      (i, buf, '[')        or_return
+		write_write_type(i, buf, info.index) or_return
+		write_byte      (i, buf, ']')        or_return
+		write_write_type(i, buf, info.elem)  or_return
+
+
+	case Type_Info_Dynamic_Array:
+		write_string    (i, buf, "[dynamic]") or_return
+		write_write_type(i, buf, info.elem)   or_return
+	case Type_Info_Slice:
+		write_string    (i, buf, "[]")      or_return
+		write_write_type(i, buf, info.elem) or_return
+
+	case Type_Info_Map:
+		write_string    (i, buf, "map[")     or_return
+		write_write_type(i, buf, info.key)   or_return
+		write_byte      (i, buf, ']')        or_return
+		write_write_type(i, buf, info.value) or_return
+
+	case Type_Info_Struct:
+		// #soa structs are written in their array-like form and return early,
+		// so the field list below is only emitted for ordinary structs.
+		switch info.soa_kind {
+		case .None: // Ignore
+		case .Fixed:
+			write_string    (i, buf, "#soa[")            or_return
+			write_u64       (i, buf, u64(info.soa_len))  or_return
+			write_byte      (i, buf, ']')                or_return
+			write_write_type(i, buf, info.soa_base_type) or_return
+			return true
+		case .Slice:
+			write_string    (i, buf, "#soa[]")           or_return
+			write_write_type(i, buf, info.soa_base_type) or_return
+			return true
+		case .Dynamic:
+			write_string    (i, buf, "#soa[dynamic]")    or_return
+			write_write_type(i, buf, info.soa_base_type) or_return
+			return true
+		}
+
+		write_string(i, buf, "struct ") or_return
+		if .packed      in info.flags { write_string(i, buf, "#packed ")      or_return }
+		if .raw_union   in info.flags { write_string(i, buf, "#raw_union ")   or_return }
+		if .all_or_none in info.flags { write_string(i, buf, "#all_or_none ") or_return }
+		if .simple      in info.flags { write_string(i, buf, "#simple ")      or_return }
+		if .align in info.flags {
+			write_string(i, buf, "#align(")     or_return
+			write_u64(i,    buf, u64(ti.align)) or_return
+			write_string(i, buf, ") ")          or_return
+		}
+		write_byte(i, buf, '{') or_return
+		for name, j in info.names[:info.field_count] {
+			if j > 0 { write_string(i, buf, ", ") or_return }
+			write_string    (i, buf, name)          or_return
+			write_string    (i, buf, ": ")          or_return
+			write_write_type(i, buf, info.types[j]) or_return
+		}
+		write_byte(i, buf, '}') or_return
+
+	case Type_Info_Union:
+		write_string(i, buf, "union ") or_return
+		if info.custom_align {
+			write_string(i, buf, "#align(")     or_return
+			write_u64   (i, buf, u64(ti.align)) or_return
+			write_string(i, buf, ") ")          or_return
+		}
+		if info.no_nil {
+			write_string(i, buf, "#no_nil ") or_return
+		}
+		write_byte(i, buf, '{') or_return
+		for variant, j in info.variants {
+			if j > 0 { write_string(i, buf, ", ") or_return }
+			write_write_type(i, buf, variant) or_return
+		}
+		write_string(i, buf, "}") or_return
+
+	case Type_Info_Enum:
+		write_string    (i, buf, "enum ")   or_return
+		write_write_type(i, buf, info.base) or_return
+		write_string    (i, buf, " {")      or_return
+		for name, j in info.names {
+			if j > 0 { write_string(i, buf, ", ") or_return }
+			write_string(i, buf, name) or_return
+		}
+		write_string(i, buf, "}") or_return
+
+	case Type_Info_Bit_Set:
+		write_string(i, buf, "bit_set[") or_return
+
+		// Enum element types are written by type; rune and other element types
+		// are written as a `lower..upper` range.
+		#partial switch elem in type_info_base(info.elem).variant {
+		case Type_Info_Enum:
+			write_write_type(i, buf, info.elem) or_return
+		case Type_Info_Rune:
+			write_encoded_rune(i, buf, rune(info.lower)) or_return
+			write_string      (i, buf, "..")             or_return
+			write_encoded_rune(i, buf, rune(info.upper)) or_return
+		case:
+			write_i64   (i, buf, info.lower) or_return
+			write_string(i, buf, "..")       or_return
+			write_i64   (i, buf, info.upper) or_return
+		}
+		if info.underlying != nil {
+			write_string    (i, buf, "; ")            or_return
+			write_write_type(i, buf, info.underlying) or_return
+		}
+		write_byte(i, buf, ']') or_return
+
+	case Type_Info_Bit_Field:
+		write_string    (i, buf, "bit_field ")      or_return
+		write_write_type(i, buf, info.backing_type) or_return
+		write_string    (i, buf, " {")              or_return
+		for name, j in info.names[:info.field_count] {
+			if j > 0 { write_string(i, buf, ", ") or_return }
+			write_string    (i, buf, name)                   or_return
+			write_string    (i, buf, ": ")                   or_return
+			write_write_type(i, buf, info.types[j])          or_return
+			write_string    (i, buf, " | ")                  or_return
+			write_u64       (i, buf, u64(info.bit_sizes[j])) or_return
+		}
+		write_byte(i, buf, '}') or_return
+
+
+	case Type_Info_Simd_Vector:
+		write_string    (i, buf, "#simd[")        or_return
+		write_u64       (i, buf, u64(info.count)) or_return
+		write_byte      (i, buf, ']')             or_return
+		write_write_type(i, buf, info.elem)       or_return
+
+	case Type_Info_Matrix:
+		if info.layout == .Row_Major {
+			write_string(i, buf, "#row_major ") or_return
+		}
+		write_string    (i, buf, "matrix[")              or_return
+		write_u64       (i, buf, u64(info.row_count))    or_return
+		write_string    (i, buf, ", ")                   or_return
+		write_u64       (i, buf, u64(info.column_count)) or_return
+		write_string    (i, buf, "]")                    or_return
+		write_write_type(i, buf, info.elem)              or_return
+	}
+	return true
+}
--- a/core/container/handle_map/doc.odin
+++ b/core/container/handle_map/doc.odin
@@ -0,0 +1,56 @@
+/*
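+Handle-based maps: `Static_Handle_Map` stores its items in a fixed-length array, `Dynamic_Handle_Map` stores them in an allocator-backed `xar.Array`.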
+
+Example:
+	import hm "core:container/handle_map"
+
+	Handle :: hm.Handle32
+
+	Entity :: struct {
+		handle: Handle,
+		pos:    [2]f32,
+	}
+
+	{ // static map
+		entities: hm.Static_Handle_Map(1024, Entity, Handle)
+
+		h1 := hm.add(&entities, Entity{pos = {1,  4}})
+		h2 := hm.add(&entities, Entity{pos = {9, 16}})
+
+		if e, ok := hm.get(&entities, h2); ok {
+			e.pos.x += 32
+		}
+
+		hm.remove(&entities, h1)
+
+		h3 := hm.add(&entities, Entity{pos = {6, 7}})
+
+		it := hm.iterator_make(&entities)
+		for e, h in hm.iterate(&it) {
+			e.pos += {1, 2}
+		}
+	}
+
+	{ // dynamic map
+		entities: hm.Dynamic_Handle_Map(Entity, Handle)
+		hm.dynamic_init(&entities, context.allocator)
+		defer hm.dynamic_destroy(&entities)
+
+		h1 := hm.add(&entities, Entity{pos = {1,  4}})
+		h2 := hm.add(&entities, Entity{pos = {9, 16}})
+
+		if e, ok := hm.get(&entities, h2); ok {
+			e.pos.x += 32
+		}
+
+		hm.remove(&entities, h1)
+
+		h3 := hm.add(&entities, Entity{pos = {6, 7}})
+
+		it := hm.iterator_make(&entities)
+		for e, h in hm.iterate(&it) {
+			e.pos += {1, 2}
+		}
+	}
+*/
+package container_handle_map
--- a/core/container/handle_map/dynamic_handle_map.odin
+++ b/core/container/handle_map/dynamic_handle_map.odin
@@ -0,0 +1,141 @@
+package container_handle_map
+
+import "base:runtime"
+import "base:builtin"
+import "base:intrinsics"
+@(require) import "core:container/xar"
+
+// Handle map whose storage is an allocator-backed `xar.Array`.
+//
+// Requirements checked by the `where` clauses:
+// - `Handle_Type` has fields `idx` and `gen`, both unsigned and of the same type
+// - `T` has a field `handle` of type `Handle_Type`
+Dynamic_Handle_Map :: struct($T: typeid, $Handle_Type: typeid)
+	where
+		intrinsics.type_has_field(Handle_Type, "idx"),
+		intrinsics.type_has_field(Handle_Type, "gen"),
+		intrinsics.type_is_unsigned(intrinsics.type_field_type(Handle_Type, "idx")),
+		intrinsics.type_is_unsigned(intrinsics.type_field_type(Handle_Type, "gen")),
+		intrinsics.type_field_type(Handle_Type, "idx") == intrinsics.type_field_type(Handle_Type, "gen"),
+
+		intrinsics.type_has_field (T, "handle"),
+		intrinsics.type_field_type(T, "handle") == Handle_Type {
+
+	// Item storage. Slot 0 is a zero-value sentinel created by the first `dynamic_add`,
+	// so a handle with `idx == 0` never refers to a real item.
+	items:        xar.Array(T, 4),
+	// Indices of removed slots, pushed by `dynamic_remove` and popped for reuse by `dynamic_add`.
+	unused_items: xar.Array(u32, 4),
+}
+
+// Initializes both backing arrays of `m` (`items` and `unused_items`) with `allocator`.
+dynamic_init :: proc(m: ^$D/Dynamic_Handle_Map($T, $Handle_Type), allocator: runtime.Allocator) {
+	xar.init(&m.items,        allocator)
+	xar.init(&m.unused_items, allocator)
+}
+
+// Destroys both backing arrays of `m` via `xar.destroy`.
+dynamic_destroy :: proc(m: ^$D/Dynamic_Handle_Map($T, $Handle_Type)) {
+	xar.destroy(&m.unused_items)
+	xar.destroy(&m.items)
+}
+
+// Adds `item` to the map and returns the handle that now identifies it.
+//
+// A previously removed slot is reused when one is available; its generation is bumped by
+// one so that handles issued for the old occupant no longer compare equal. Otherwise the
+// item is appended, creating the zero-value sentinel at index 0 first if the map is empty,
+// and the new slot starts at generation 1.
+//
+// `item.handle` is overwritten with the returned handle. An allocator error from
+// `xar.append` is returned through `err`.
+// NOTE(review): the slot index is narrowed to the `idx` field type with `auto_cast` and no
+// range check is visible here - confirm behaviour for small handle types such as `Handle16`.
+@(require_results)
+dynamic_add :: proc(m: ^$D/Dynamic_Handle_Map($T, $Handle_Type), item: T, loc := #caller_location) -> (handle: Handle_Type, err: runtime.Allocator_Error) {
+	if xar.len(m.unused_items) > 0 {
+		i := xar.pop(&m.unused_items)
+		ptr := xar.get_ptr_unsafe(&m.items, i)
+		// Remember the slot's generation before `item` overwrites the whole entry.
+		prev_gen := ptr.handle.gen
+		ptr^ = item
+
+		ptr.handle.idx = auto_cast i
+		ptr.handle.gen = auto_cast (prev_gen + 1)
+		return ptr.handle, nil
+	}
+
+	if xar.len(m.items) == 0 {
+		// initialize the zero-value sentinel
+		xar.append(&m.items, T{}, loc) or_return
+	}
+
+	i := xar.append(&m.items, item, loc) or_return
+
+	ptr := xar.get_ptr_unsafe(&m.items, i)
+	ptr^ = item
+
+	ptr.handle.idx = auto_cast i
+	ptr.handle.gen = 1
+	return ptr.handle, nil
+}
+
+// Looks up the item identified by `h`.
+// Returns `nil, false` when `h.idx` is 0 (the sentinel / no-handle value), lies beyond the
+// current item count, or the stored handle differs from `h` (removed or reused slot).
+@(require_results)
+dynamic_get :: proc "contextless" (m: ^$D/Dynamic_Handle_Map($T, $Handle_Type), h: Handle_Type) -> (^T, bool) #optional_ok {
+	if h.idx <= 0 || int(u32(h.idx)) >= xar.len(m.items) {
+		return nil, false
+	}
+	if e := xar.get_ptr_unsafe(&m.items, h.idx); e.handle == h {
+		return e, true
+	}
+	return nil, false
+}
+
+// Removes the item identified by `h`.
+// Returns `found == false` when `h` is out of range or no longer matches the stored handle.
+// On success the slot index is recorded in `unused_items` for later reuse and the stored
+// `handle.idx` is set to 0, which marks the slot as unused; the generation is left untouched.
+// An allocator error from recording the free slot is returned through `err`.
+dynamic_remove :: proc(m: ^$D/Dynamic_Handle_Map($T, $Handle_Type), h: Handle_Type, loc := #caller_location) -> (found: bool, err: runtime.Allocator_Error) {
+	if h.idx <= 0 || int(u32(h.idx)) >= xar.len(m.items) {
+		return false, nil
+	}
+
+	if item := xar.get_ptr(&m.items, h.idx); item.handle == h {
+		xar.append(&m.unused_items, u32(h.idx), loc) or_return
+		item.handle.idx = 0
+		return true, nil
+	}
+
+	return false, nil
+}
+
+// Reports whether `h` currently refers to a live item: its index is non-zero, within the
+// current item count, and the handle stored in that slot equals `h`.
+@(require_results)
+dynamic_is_valid :: proc "contextless" (m: ^$D/Dynamic_Handle_Map($T, $Handle_Type), h: Handle_Type) -> bool {
+	return h.idx > 0 && int(u32(h.idx)) < xar.len(m.items) && xar.get_ptr_unsafe(&m.items, h.idx).handle == h
+}
+
+// Returns the number of possibly valid items in the handle map.
+// Computed as the number of stored slots minus the recorded unused slots, minus one
+// for the sentinel at index 0; an empty map yields 0.
+@(require_results)
+dynamic_len :: proc "contextless" (m: $D/Dynamic_Handle_Map($T, $Handle_Type)) -> uint {
+	n := xar.len(m.items) - xar.len(m.unused_items)
+	return uint(n-1 if n > 0 else 0)
+}
+
+// Returns the capacity reported by `xar.cap` for the item storage, minus one for the
+// sentinel slot at index 0; a capacity of 0 yields 0.
+@(require_results)
+dynamic_cap :: proc "contextless" (m: $D/Dynamic_Handle_Map($T, $Handle_Type)) -> uint {
+	n := xar.cap(m.items)
+	return uint(n-1 if n > 0 else 0)
+}
+
+// Clears both the item storage and the list of unused slots via `xar.clear`.
+// NOTE(review): presumably this also drops the sentinel and all generation counters, so
+// handles issued before the clear could match again later - confirm against `xar.clear`.
+dynamic_clear :: proc "contextless" (m: ^$D/Dynamic_Handle_Map($T, $Handle_Type)) {
+	xar.clear(&m.items)
+	xar.clear(&m.unused_items)
+}
+
+
+// An iterator for a handle map.
+Dynamic_Handle_Map_Iterator :: struct($D: typeid) {
+	m:     ^D,  // the map being iterated
+	index: int, // index of the next slot `dynamic_iterate` will inspect
+}
+
+// Makes an iterator from a handle map.
+// Iteration starts at index 1, skipping the sentinel stored in slot 0.
+@(require_results)
+dynamic_iterator_make :: proc "contextless" (m: ^$D/Dynamic_Handle_Map($T, $Handle_Type)) -> Dynamic_Handle_Map_Iterator(D) {
+	return {m, 1}
+}
+
+/*
+	Iterate over a handle map. It will skip over unused item slots (i.e. handle.idx == 0).
+	Each call resumes at `it.index` and returns the next used item together with its handle;
+	`ok` is false once every remaining slot has been inspected.
+	Usage:
+		it := hm.dynamic_iterator_make(&the_dynamic_handle_map)
+		for item, handle in hm.iterate(&it) {
+			...
+		}
+*/
+@(require_results)
+dynamic_iterate :: proc "contextless" (it: ^$DHI/Dynamic_Handle_Map_Iterator($D/Dynamic_Handle_Map($T, $Handle_Type))) -> (val: ^T, h: Handle_Type, ok: bool) {
+	for _ in it.index..<xar.len(it.m.items) {
+		e := xar.get_ptr_unsafe(&it.m.items, it.index)
+		// Advance before returning so the next call continues after this slot.
+		it.index += 1
+
+		if e.handle.idx != 0 {
+			return e, e.handle, true
+		}
+	}
+	return
+}
--- a/core/container/handle_map/static_handle_map.odin
+++ b/core/container/handle_map/static_handle_map.odin
@@ -0,0 +1,221 @@
+package container_handle_map
+
+import "base:builtin"
+import "base:intrinsics"
+
+// Default 16-bit Handle type which can be used for handle maps which only need a maximum of 254 (1<<8 - 2) items
+Handle16 :: struct {
+	idx: u8,
+	gen: u8,
+}
+
+// Default 32-bit Handle type which can be used for handle maps which only need a maximum of 65534 (1<<16 - 2) items
+Handle32 :: struct {
+	idx: u16,
+	gen: u16,
+}
+
+// Default 64-bit Handle type which can be used for handle maps which only need a maximum of 4294967294 (1<<32 - 2) items
+Handle64 :: struct {
+	idx: u32,
+	gen: u32,
+}
+
+Static_Handle_Map :: struct($N: uint, $T: typeid, $Handle_Type: typeid)
+	where
+		0 < N, N < uint(1<<31 - 1),
+
+		intrinsics.type_has_field(Handle_Type, "idx"),
+		intrinsics.type_has_field(Handle_Type, "gen"),
+		intrinsics.type_is_unsigned(intrinsics.type_field_type(Handle_Type, "idx")),
+		intrinsics.type_is_unsigned(intrinsics.type_field_type(Handle_Type, "gen")),
+		intrinsics.type_field_type(Handle_Type, "idx") == intrinsics.type_field_type(Handle_Type, "gen"),
+
+		N < uint(max(intrinsics.type_field_type(Handle_Type, "idx"))),
+
+		intrinsics.type_has_field (T, "handle"),
+		intrinsics.type_field_type(T, "handle") == Handle_Type {
+
+	// The zero element represents a zero-value sentinel (dummy value), allowing `idx == 0` to mean "no handle".
+	// This means the capacity is actually N-1 items.
+	items: [N]T,
+
+	used_len:     u32, // Number of slots handed out so far, including the sentinel; removing an item does not decrease it
+	unused_len:   u32, // Number of removed slots awaiting reuse; used to calculate the number of valid items
+	unused_items: [N]u32, // Free-list links: for a removed slot `i`, the index of the next removed slot (0 terminates the list)
+	next_unused:  u32, // Head of the free list of removed slots; 0 means there are none
+}
+
+
+// `add` a value of type `T` to the handle map. This will return the handle of the new item and an optional boolean which is false when the map is full.
+@(require_results)
+static_add :: proc "contextless" (m: ^$H/Static_Handle_Map($N, $T, $Handle_Type), item: T) -> (handle: Handle_Type, ok: bool) #optional_ok {
+	if i := m.next_unused; i != 0 { // reuse the most recently removed slot, if there is one
+		ptr := &m.items[i]
+
+		m.next_unused = m.unused_items[i]
+		m.unused_items[i] = 0
+
+		prev_gen := ptr.handle.gen // saved because the assignment below overwrites the whole item, handle included
+		ptr^ = item
+
+		ptr.handle.idx = auto_cast i
+		ptr.handle.gen = auto_cast (prev_gen + 1)
+		m.unused_len -= 1
+		return ptr.handle, true
+	}
+
+	if m.used_len == 0 {
+		// initialize the zero-value sentinel
+		m.items[0] = {}
+		m.used_len += 1
+	}
+
+	if m.used_len == builtin.len(m.items) { // every slot has been handed out and none are free for reuse
+		return {}, false
+	}
+
+	ptr := &m.items[m.used_len]
+	ptr^ = item
+
+	ptr.handle.idx = auto_cast m.used_len
+	ptr.handle.gen = 1
+	m.used_len += 1
+	return ptr.handle, true
+}
+
+// `get` a stable pointer of type `^T` by resolving the handle `h`. If the handle is not valid, then `nil, false` is returned.
+@(require_results)
+static_get :: proc "contextless" (m: ^$H/Static_Handle_Map($N, $T, $Handle_Type), h: Handle_Type) -> (^T, bool) #optional_ok {
+	if h.idx <= 0 || u32(h.idx) >= m.used_len {
+		return nil, false
+	}
+	if e := &m.items[h.idx]; e.handle == h {
+		return e, true
+	}
+	return nil, false
+}
+
+// `remove` an item from the handle map from the handle `h`.
+static_remove :: proc "contextless" (m: ^$H/Static_Handle_Map($N, $T, $Handle_Type), h: Handle_Type) -> bool {
+	if h.idx <= 0 || u32(h.idx) >= m.used_len {
+		return false
+	}
+
+	if item := &m.items[h.idx]; item.handle == h {
+		m.unused_items[h.idx] = m.next_unused
+		m.next_unused = u32(h.idx)
+		m.unused_len += 1
+		item.handle.idx = 0
+		return true
+	}
+
+	return false
+}
+
+// Returns true when the handle `h` is valid relating to the handle map.
+@(require_results)
+static_is_valid :: proc "contextless" (m: $H/Static_Handle_Map($N, $T, $Handle_Type), h: Handle_Type) -> bool {
+	return h.idx > 0 && u32(h.idx) < m.used_len && m.items[h.idx].handle == h
+}
+
+// Returns the number of possibly valid items in the handle map.
+@(require_results)
+static_len :: proc "contextless" (m: $H/Static_Handle_Map($N, $T, $Handle_Type)) -> uint {
+	n := uint(m.used_len) - uint(m.unused_len)
+	return n-1 if n > 0 else 0
+}
+
+// Returns the capacity of the items in a handle map.
+// This is equivalent to `N-1` as the zero value is reserved for the zero-value sentinel.
+@(require_results)
+static_cap :: proc "contextless" (m: $H/Static_Handle_Map($N, $T, $Handle_Type)) -> uint {
+	// We could just return `N-1` but I am doing this for clarity
+	return builtin.len(m.items)-1
+}
+
+// `clear` the handle map by zeroing all of the memory.
+// Internally this does not do `m^ = {}` but rather uses `intrinsics.mem_zero` explicitly to improve performance.
+static_clear :: proc "contextless" (m: ^$H/Static_Handle_Map($N, $T, $Handle_Type)) {
+	intrinsics.mem_zero(m, size_of(m^))
+}
+
+// An iterator for a handle map.
+Static_Handle_Map_Iterator :: struct($H: typeid) {
+	m:     ^H,
+	index: u32,
+}
+
+// Makes an iterator from a handle map.
+@(require_results)
+static_iterator_make :: proc "contextless" (m: ^$H/Static_Handle_Map($N, $T, $Handle_Type)) -> Static_Handle_Map_Iterator(H) {
+	return {m, 1}
+}
+
+/*
+	Iterate over a handle map. It will skip over unused item slots (e.g. handle.idx == 0).
+	Usage:
+		it := hm.iterator_make(&the_handle_map)
+		for item, handle in hm.iterate(&it) {
+			...
+		}
+*/
+@(require_results)
+static_iterate :: proc "contextless" (it: ^$HI/Static_Handle_Map_Iterator($H/Static_Handle_Map($N, $T, $Handle_Type))) -> (val: ^T, h: Handle_Type, ok: bool) {
+	for _ in it.index..<it.m.used_len {
+		e := &it.m.items[it.index]
+		it.index += 1
+
+		if e.handle.idx != 0 {
+			return e, e.handle, true
+		}
+	}
+	return
+}
+
+
+
+add :: proc{
+	static_add,
+	dynamic_add,
+}
+
+get :: proc{
+	static_get,
+	dynamic_get,
+}
+
+remove :: proc{
+	static_remove,
+	dynamic_remove,
+}
+
+is_valid :: proc{
+	static_is_valid,
+	dynamic_is_valid,
+}
+
+len :: proc{
+	static_len,
+	dynamic_len,
+}
+
+cap :: proc{
+	static_cap,
+	dynamic_cap,
+}
+
+clear :: proc{
+	static_clear,
+	dynamic_clear,
+}
+
+iterator_make :: proc{
+	static_iterator_make,
+	dynamic_iterator_make,
+}
+
+iterate :: proc{
+	static_iterate,
+	dynamic_iterate,
+}
--- a/core/container/pool/pool.odin
+++ b/core/container/pool/pool.odin
@@ -0,0 +1,140 @@
+package container_pool
+
+import "base:intrinsics"
+import "base:sanitizer"
+
+import "core:mem"
+import "core:sync"
+
+_ :: sanitizer
+
+DEFAULT_BLOCK_SIZE :: _DEFAULT_BLOCK_SIZE
+
+Pool_Arena :: _Pool_Arena
+
+/*
+A thread-safe (between init and destroy) object pool backed by a growing arena, returning stable pointers.
+The element type requires an intrusive link field of type `^T`, named when calling `init`.
+
+Example:
+	Elem :: struct {
+		link: ^Elem,
+	}
+
+	p: pool.Pool(Elem)
+	err := pool.init(&p, "link") // `init` requires its result to be used; check `err` before using the pool
+*/
+Pool :: struct($T: typeid) {
+	arena:           Pool_Arena, // Backing arena that new elements are allocated from
+	num_outstanding: int,        // Incremented by `get`, decremented by `put`
+	num_ready:       int,        // Number of elements sitting on the free list
+	link_off:        uintptr,    // Byte offset of the link field within `T`
+	free_list:       ^T,         // Head of the intrusive free list of returned elements
+}
+
+@(require_results)
+init :: proc(p: ^Pool($T), $link_field: string, block_size: uint = DEFAULT_BLOCK_SIZE) -> (err: mem.Allocator_Error)
+	where intrinsics.type_has_field(T, link_field),
+	      intrinsics.type_field_type(T, link_field) == ^T {
+	p.link_off = offset_of_by_string(T, link_field)
+	return _pool_arena_init(&p.arena, block_size)
+}
+
+destroy :: proc(p: ^Pool($T)) {
+	elem := sync.atomic_exchange_explicit(&p.free_list, nil, .Acquire)
+
+	sync.atomic_store_explicit(&p.num_ready, 0, .Relaxed)
+
+	when .Address in ODIN_SANITIZER_FLAGS {
+		for ; elem != nil; elem = _get_next(p, elem) {
+			_unpoison_elem(p, elem)
+		}
+	} else {
+		_ = elem
+	}
+
+	_pool_arena_destroy(&p.arena)
+	p.arena = {}
+}
+
+@(require_results)
+get :: proc(p: ^Pool($T)) -> (elem: ^T, err: mem.Allocator_Error) #optional_allocator_error {
+	// Only count an element as outstanding when one is actually handed out, i.e. not when allocation fails.
+	defer if err == nil { sync.atomic_add_explicit(&p.num_outstanding, 1, .Relaxed) }
+	for {
+		elem = sync.atomic_load_explicit(&p.free_list, .Acquire)
+		if elem == nil { // free list is empty: allocate a fresh element from the arena
+			// NOTE: pool arena has an internal lock.
+			return new(T, _pool_arena_allocator(&p.arena))
+		}
+
+		if _, ok := sync.atomic_compare_exchange_weak_explicit(&p.free_list, elem, _get_next(p, elem), .Acquire, .Relaxed); ok {
+			_set_next(p, elem, nil)
+			_unpoison_elem(p, elem)
+			sync.atomic_sub_explicit(&p.num_ready, 1, .Relaxed)
+			return
+		}
+	}
+}
+
+put :: proc(p: ^Pool($T), elem: ^T) {
+	mem.zero_item(elem)
+	_poison_elem(p, elem)
+
+	defer sync.atomic_sub_explicit(&p.num_outstanding, 1, .Relaxed)
+	defer sync.atomic_add_explicit(&p.num_ready, 1, .Relaxed)
+
+	for {
+		head := sync.atomic_load_explicit(&p.free_list, .Relaxed)
+		_set_next(p, elem, head)
+		if _, ok := sync.atomic_compare_exchange_weak_explicit(&p.free_list, head, elem, .Release, .Relaxed); ok {
+			return
+		}
+	}
+}
+
+num_outstanding :: proc(p: ^Pool($T)) -> int {
+	return sync.atomic_load(&p.num_outstanding)
+}
+
+num_ready :: proc(p: ^Pool($T)) -> int {
+	return sync.atomic_load(&p.num_ready)
+}
+
+cap :: proc(p: ^Pool($T)) -> int {
+	return sync.atomic_load(&p.num_ready) + sync.atomic_load(&p.num_outstanding)
+}
+
+_get_next :: proc(p: ^Pool($T), elem: ^T) -> ^T {
+	return (^^T)(uintptr(elem) + p.link_off)^
+}
+
+_set_next :: proc(p: ^Pool($T), elem: ^T, next: ^T) {
+	(^^T)(uintptr(elem) + p.link_off)^ = next
+}
+
+@(disabled=.Address not_in ODIN_SANITIZER_FLAGS)
+_poison_elem :: proc(p: ^Pool($T), elem: ^T) {
+	// Poison everything except the intrusive link pointer, which the free list still reads and writes.
+	link_end := p.link_off + size_of(rawptr)
+	if p.link_off != 0 {
+		sanitizer.address_poison_rawptr(elem, int(p.link_off))
+	}
+	if tail_len := size_of(T) - link_end; tail_len != 0 {
+		tail := rawptr(uintptr(elem) + link_end)
+		sanitizer.address_poison_rawptr(tail, int(tail_len))
+	}
+}
+
+@(disabled=.Address not_in ODIN_SANITIZER_FLAGS)
+_unpoison_elem :: proc(p: ^Pool($T), elem: ^T) {
+	// Unpoison the bytes on either side of the intrusive link pointer.
+	link_end := p.link_off + size_of(rawptr)
+	if p.link_off != 0 {
+		sanitizer.address_unpoison_rawptr(elem, int(p.link_off))
+	}
+	if tail_len := size_of(T) - link_end; tail_len != 0 {
+		tail := rawptr(uintptr(elem) + link_end)
+		sanitizer.address_unpoison_rawptr(tail, int(tail_len))
+	}
+}
--- a/core/container/pool/pool_arena_others.odin
+++ b/core/container/pool/pool_arena_others.odin
@@ -0,0 +1,29 @@
+#+build !darwin
+#+build !freebsd
+#+build !openbsd
+#+build !netbsd
+#+build !linux
+#+build !windows
+#+private
+package container_pool
+
+import "base:runtime"
+
+import "core:mem"
+
+_Pool_Arena :: runtime.Arena
+
+_DEFAULT_BLOCK_SIZE :: mem.Megabyte
+
+_pool_arena_init :: proc(arena: ^Pool_Arena, block_size: uint = DEFAULT_BLOCK_SIZE) -> (err: runtime.Allocator_Error) {
+	runtime.arena_init(arena, block_size, runtime.default_allocator()) or_return
+	return
+}
+
+_pool_arena_allocator :: proc(arena: ^Pool_Arena) -> runtime.Allocator {
+	return runtime.arena_allocator(arena)
+}
+
+_pool_arena_destroy :: proc(arena: ^Pool_Arena) {
+	runtime.arena_destroy(arena)
+}
--- a/core/container/pool/pool_arena_virtual.odin
+++ b/core/container/pool/pool_arena_virtual.odin
@@ -0,0 +1,24 @@
+#+build darwin, freebsd, openbsd, netbsd, linux, windows
+package container_pool
+
+import "base:runtime"
+
+import "core:mem"
+import "core:mem/virtual"
+
+_Pool_Arena :: virtual.Arena
+
+_DEFAULT_BLOCK_SIZE :: mem.Gigabyte
+
+_pool_arena_init :: proc(arena: ^Pool_Arena, block_size: uint = DEFAULT_BLOCK_SIZE) -> (err: runtime.Allocator_Error) {
+	virtual.arena_init_growing(arena, block_size) or_return
+	return
+}
+
+_pool_arena_allocator :: proc(arena: ^Pool_Arena) -> runtime.Allocator {
+	return virtual.arena_allocator(arena)
+}
+
+_pool_arena_destroy :: proc(arena: ^Pool_Arena) {
+	virtual.arena_destroy(arena)
+}
--- a/core/container/rbtree/rbtree.odin
+++ b/core/container/rbtree/rbtree.odin
@@ -91,7 +91,7 @@ destroy :: proc(t: ^$T/Tree($Key, $Value), call_on_remove: bool = true) {
 	}
 }

-len :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> (node_count: int) {
+len :: proc "contextless" (t: $T/Tree($Key, $Value)) -> (node_count: int) {
 	return t._size
 }

@@ -108,7 +108,7 @@ last :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
 }

 // find finds the key in the tree, and returns the corresponding node, or nil iff the value is not present.
-find :: proc(t: ^$T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
+find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
 	node = t._root
 	for node != nil {
 		switch t._cmp_fn(key, node.key) {
@@ -121,7 +121,7 @@ find :: proc(t: ^$T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
 }

 // find_value finds the key in the tree, and returns the corresponding value, or nil iff the value is not present.
-find_value :: proc(t: ^$T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok {
+find_value :: proc(t: $T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok {
 	if n := find(t, key); n != nil {
 		return n.value, true
 	}
@@ -166,7 +166,7 @@ remove :: proc {
 // removal was successful.  While the node's key + value will be left intact,
 // the node itself will be freed via the tree's node allocator.
 remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) -> bool {
-	n := find(t, key)
+	n := find(t^, key)
 	if n == nil {
 		return false // Key not found, nothing to do
 	}
--- a/core/container/xar/xar.odin
+++ b/core/container/xar/xar.odin
@@ -12,7 +12,7 @@
 		import "core:container/xar"

 		example :: proc() {
-			x: xar.Xar(int, 4)
+			x: xar.Array(int, 4)
 			defer xar.destroy(&x)

 			xar.push_back(&x, 10)
@@ -38,7 +38,7 @@ MAX_SHIFT :: PLATFORM_BITS>>1
 /*
 	An Exponential Array with stable element addresses.

-	Unlike `[dynamic]T` which reallocates and moves elements when growing, `Xar`
+	Unlike `[dynamic]T` which reallocates and moves elements when growing, `Array`
 	allocates separate chunks of exponentially increasing size. This guarantees
 	that pointers to elements remain valid for the lifetime of the container.

@@ -66,11 +66,11 @@ MAX_SHIFT :: PLATFORM_BITS>>1

 		example :: proc() {
 			// Xar with initial chunk size of 16 (1 << 4)
-			x: xar.Xar(My_Struct, 4)
+			x: xar.Array(My_Struct, 4)
 			defer xar.destroy(&x)
 		}
 */
-Xar :: struct($T: typeid, $SHIFT: uint) where 0 < SHIFT, SHIFT <= MAX_SHIFT {
+Array :: struct($T: typeid, $SHIFT: uint) where 0 < SHIFT, SHIFT <= MAX_SHIFT {
 	chunks:    [(1 << (_LOG2_PLATFORM_BITS - intrinsics.constant_log2(SHIFT))) + 1][^]T,
 	len:       int,
 	allocator: mem.Allocator,
@@ -84,7 +84,7 @@ Initializes an exponential array with the given allocator.
 - `x`: Pointer to the exponential array to initialize
 - `allocator`: Allocator to use for chunk allocations (defaults to context.allocator)
 */
-init :: proc(x: ^$X/Xar($T, $SHIFT), allocator := context.allocator) {
+init :: proc(x: ^$X/Array($T, $SHIFT), allocator := context.allocator) {
 	x^ = {allocator = allocator}
 }

@@ -94,7 +94,7 @@ Frees all allocated chunks and resets the exponential array.
 **Inputs**
 - `x`: Pointer to the exponential array to destroy
 */
-destroy :: proc(x: ^$X/Xar($T, $SHIFT)) {
+destroy :: proc(x: ^$X/Array($T, $SHIFT)) {
 	#reverse for c, i in x.chunks {
 		if c != nil {
 			n := 1 << (SHIFT + uint(i if i > 0 else 1) - 1)
@@ -109,19 +109,19 @@ destroy :: proc(x: ^$X/Xar($T, $SHIFT)) {
 Resets the array's length to zero without freeing memory.
 Allocated chunks are retained for reuse.
 */
-clear :: proc(x: ^$X/Xar($T, $SHIFT)) {
+clear :: proc "contextless" (x: ^$X/Array($T, $SHIFT)) {
 	x.len = 0
 }

 // Returns the length of the exponential-array
@(require_results)
-len :: proc(x: $X/Xar($T, $SHIFT)) -> int {
+len :: proc "contextless" (x: $X/Array($T, $SHIFT)) -> int {
 	return x.len
 }

 // Returns the number of allocated elements
@(require_results)
-cap :: proc(x: $X/Xar($T, $SHIFT)) -> int {
+cap :: proc "contextless" (x: $X/Array($T, $SHIFT)) -> int {
 	#reverse for c, i in x.chunks {
 		if c != nil {
 			return 1 << (SHIFT + uint(i if i > 0 else 1))
@@ -132,7 +132,7 @@ cap :: proc(x: $X/Xar($T, $SHIFT)) -> int {

 // Internal: computes chunk index, element index within chunk, and chunk capacity for a given index.
@(require_results)
-_meta_get :: #force_inline proc($SHIFT: uint, index: uint) -> (chunk_idx, elem_idx, chunk_cap: uint) {
+_meta_get :: #force_inline proc "contextless" ($SHIFT: uint, index: uint) -> (chunk_idx, elem_idx, chunk_cap: uint) {
 	elem_idx = index
 	chunk_cap = uint(1) << SHIFT
 	chunk_idx = 0
@@ -161,7 +161,7 @@ Get a copy of the element at the specified index.
 - a copy of the element
 */
@(require_results)
-get :: proc(x: ^$X/Xar($T, $SHIFT), #any_int index: int, loc := #caller_location) -> (val: T) #no_bounds_check {
+get :: proc(x: ^$X/Array($T, $SHIFT), #any_int index: int, loc := #caller_location) -> (val: T) #no_bounds_check {
 	runtime.bounds_check_error_loc(loc, index, x.len)
 	chunk_idx, elem_idx, _ := _meta_get(SHIFT, uint(index))
 	return x.chunks[chunk_idx][elem_idx]
@@ -185,7 +185,7 @@ Example:
 	import "core:container/xar"

 	get_ptr_example :: proc() {
-		x: xar.Xar(int, 4)
+		x: xar.Array(int, 4)
 		defer xar.destroy(&x)

 		xar.push_back(&x, 100)
@@ -200,12 +200,19 @@ Example:
 	}
 */
@(require_results)
-get_ptr :: proc(x: ^$X/Xar($T, $SHIFT), #any_int index: int, loc := #caller_location) -> (val: ^T) #no_bounds_check {
+get_ptr :: proc(x: ^$X/Array($T, $SHIFT), #any_int index: int, loc := #caller_location) -> (val: ^T) #no_bounds_check {
 	runtime.bounds_check_error_loc(loc, index, x.len)
 	chunk_idx, elem_idx, _ := _meta_get(SHIFT, uint(index))
 	return &x.chunks[chunk_idx][elem_idx]
 }

+// No bounds checking
+@(require_results)
+get_ptr_unsafe :: proc "contextless" (x: ^$X/Array($T, $SHIFT), #any_int index: int) -> (val: ^T) #no_bounds_check {
+	chunk_idx, elem_idx, _ := _meta_get(SHIFT, uint(index))
+	return &x.chunks[chunk_idx][elem_idx]
+}
+
 /*
 Set the element at the specified index to the given value.

@@ -214,7 +221,7 @@ Set the element at the specified index to the given value.
 - `index`: Position of the element (0-indexed)
 - `value`: The value to set
 */
-set :: proc(x: ^$X/Xar($T, $SHIFT), #any_int index: int, value: T, loc := #caller_location) #no_bounds_check {
+set :: proc(x: ^$X/Array($T, $SHIFT), #any_int index: int, value: T, loc := #caller_location) #no_bounds_check {
 	runtime.bounds_check_error_loc(loc, index, x.len)
 	chunk_idx, elem_idx, _ := _meta_get(SHIFT, uint(index))
 	x.chunks[chunk_idx][elem_idx] = value
@@ -240,7 +247,7 @@ Example:
 	import "core:container/xar"

 	push_back_example :: proc() {
-		x: xar.Xar(string, 4)
+		x: xar.Array(string, 4)
 		defer xar.destroy(&x)

 		xar.push_back(&x, "hello")
@@ -250,7 +257,7 @@ Example:
 		fmt.println(xar.get(&x, 1))  // world
 	}
 */
-push_back_elem :: proc(x: ^$X/Xar($T, $SHIFT), value: T, loc := #caller_location) -> (n: int, err: mem.Allocator_Error) {
+push_back_elem :: proc(x: ^$X/Array($T, $SHIFT), value: T, loc := #caller_location) -> (n: int, err: mem.Allocator_Error) {
 	if x.allocator.procedure == nil {
 		// to minic `[dynamic]T` behaviour
 		x.allocator = context.allocator
@@ -277,7 +284,7 @@ Append multiple elements to the end of the exponential array.
 - number of elements successfully added
 - allocation error if chunk allocation failed (partial append possible)
 */
-push_back_elems :: proc(x: ^$X/Xar($T, $SHIFT), values: ..T, loc := #caller_location) -> (n: int, err: mem.Allocator_Error) {
+push_back_elems :: proc(x: ^$X/Array($T, $SHIFT), values: ..T, loc := #caller_location) -> (n: int, err: mem.Allocator_Error) {
 	for value in values {
 		n += push_back_elem(x, value, loc) or_return
 	}
@@ -303,7 +310,7 @@ Example:
 	import "core:container/xar"

 	push_back_and_get_ptr_example :: proc() {
-		x: xar.Xar(My_Struct, 4)
+		x: xar.Array(My_Struct, 4)
 		defer xar.destroy(&x)

 		ptr := xar.push_back_elem_and_get_ptr(&x, My_Struct{}) or_else panic("alloc failed")
@@ -311,7 +318,7 @@ Example:
 	}
 */
@(require_results)
-push_back_elem_and_get_ptr :: proc(x: ^$X/Xar($T, $SHIFT), value: T, loc := #caller_location) -> (ptr: ^T, err: mem.Allocator_Error) {
+push_back_elem_and_get_ptr :: proc(x: ^$X/Array($T, $SHIFT), value: T, loc := #caller_location) -> (ptr: ^T, err: mem.Allocator_Error) {
 	if x.allocator.procedure == nil {
 		// to minic `[dynamic]T` behaviour
 		x.allocator = context.allocator
@@ -323,7 +330,6 @@ push_back_elem_and_get_ptr :: proc(x: ^$X/Xar($T, $SHIFT), value: T, loc := #cal
 	}
 	x.chunks[chunk_idx][elem_idx] = value
 	x.len += 1
-	n = 1
 	ptr = &x.chunks[chunk_idx][elem_idx]
 	return
 }
@@ -331,7 +337,7 @@ push_back_elem_and_get_ptr :: proc(x: ^$X/Xar($T, $SHIFT), value: T, loc := #cal
 // `pop` will remove and return the end value of an exponential array `x` and reduces the length of the array by 1.
 //
 // Note: If the exponential array has no elements (`xar.len(x) == 0`), this procedure will panic.
-pop :: proc(x: ^$X/Xar($T, $SHIFT), loc := #caller_location) -> (val: T) {
+pop :: proc(x: ^$X/Array($T, $SHIFT), loc := #caller_location) -> (val: T) {
 	assert(x.len > 0, loc=loc)
 	index := uint(x.len-1)
 	chunk_idx, elem_idx, _ := _meta_get(SHIFT, index)
@@ -342,7 +348,7 @@ pop :: proc(x: ^$X/Xar($T, $SHIFT), loc := #caller_location) -> (val: T) {
 // `pop_safe` trys to remove and return the end value of dynamic array `x` and reduces the length of the array by 1.
 // If the operation is not possible, it will return false.
@(require_results)
-pop_safe :: proc(x: ^$X/Xar($T, $SHIFT)) -> (val: T, ok: bool) {
+pop_safe :: proc(x: ^$X/Array($T, $SHIFT)) -> (val: T, ok: bool) {
 	if x.len == 0 {
 		return
 	}
@@ -370,7 +376,7 @@ pop_safe :: proc(x: ^$X/Xar($T, $SHIFT)) -> (val: T, ok: bool) {
 		import "core:encoding/xar"

 		unordered_remove_example :: proc() {
-			x: xar.Xar(int, 4)
+			x: xar.Array(int, 4)
 			defer xar.destroy(&x)

 			xar.push_back(&x, 10)
@@ -384,7 +390,7 @@ pop_safe :: proc(x: ^$X/Xar($T, $SHIFT)) -> (val: T, ok: bool) {
 			fmt.println(xar.get(&x, 1))  // 20
 		}
 */
-unordered_remove :: proc(x: ^$X/Xar($T, $SHIFT), #any_int index: int, loc := #caller_location) {
+unordered_remove :: proc(x: ^$X/Array($T, $SHIFT), #any_int index: int, loc := #caller_location) {
 	runtime.bounds_check_error_loc(loc, index, x.len)
 	n := x.len-1
 	if index != n {
@@ -403,7 +409,7 @@ Fields:
 - `idx`: Current iteration index
 */
 Iterator :: struct($T: typeid, $SHIFT: uint) {
-	xar: ^Xar(T, SHIFT),
+	xar: ^Array(T, SHIFT),
 	idx: int,
 }

@@ -418,10 +424,11 @@ Create an iterator for traversing the exponential array.

 Example:

-	import "lib:xar"
+	import "core:container/xar"
+	import "core:fmt"

-	iteration_example :: proc() {
-		x: xar.Xar(int, 4)
+	iterator_example :: proc() {
+		x: xar.Array(int, 4)
 		defer xar.destroy(&x)

 		xar.push_back(&x, 10)
@@ -440,7 +447,7 @@ Output:
 	20
 	30
 */
-iterator :: proc(xar: ^$X/Xar($T, $SHIFT)) -> Iterator(T, SHIFT) {
+iterator :: proc(xar: ^$X/Array($T, $SHIFT)) -> Iterator(T, SHIFT) {
 	return {xar = auto_cast xar, idx = 0}
 }

--- a/core/crypto/_chacha20/ref/chacha20_ref.odin
+++ b/core/crypto/_chacha20/ref/chacha20_ref.odin
@@ -4,133 +4,68 @@ import "core:crypto/_chacha20"
 import "core:encoding/endian"
 import "core:math/bits"

+// quarter_round applies one ChaCha quarter round (add, xor, rotate left by 16/12/8/7) to `a, b, c, d`.
+// At least with LLVM21, #force_inline produces identical perf to manual inlining.
+@(private)
+quarter_round :: #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
+	a, b, c, d := a, b, c, d
+
+	a += b
+	d ~= a
+	d = bits.rotate_left32(d, 16)
+
+	c += d
+	b ~= c
+	b = bits.rotate_left32(b, 12)
+
+	a += b
+	d ~= a
+	d = bits.rotate_left32(d, 8)
+
+	c += d
+	b ~= c
+	b = bits.rotate_left32(b, 7)
+
+	return a, b, c, d
+}
+
 stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
 	// Enforce the maximum consumed keystream per IV.
 	_chacha20.check_counter_limit(ctx, nr_blocks)

 	dst, src := dst, src
 	x := &ctx._s
+
+
+	// Filippo Valsorda made the observation that only one of the column
+	// rounds depends on the counter (s12), so it is worth precomputing
+	// the others and reusing them across multiple blocks.  As far as I
+	// know, only Go's chacha implementation does this.
+
+	p1, p5, p9, p13 := quarter_round(_chacha20.SIGMA_1, x[5], x[9], x[13])
+	p2, p6, p10, p14 := quarter_round(_chacha20.SIGMA_2, x[6], x[10], x[14])
+	p3, p7, p11, p15 := quarter_round(_chacha20.SIGMA_3, x[7], x[11], x[15])
+
 	for n := 0; n < nr_blocks; n = n + 1 {
-		x0, x1, x2, x3 :=
-			_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
-		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 :=
-			x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+		// First column round that depends on the counter
+		p0, p4, p8, p12 := quarter_round(_chacha20.SIGMA_0, x[4], x[8], x[12])

-		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
-			// Even when forcing inlining manually inlining all of
-			// these is decently faster.
+		// First diagonal round
+		x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
+		x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
+		x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
+		x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)

-			// quarterround(x, 0, 4, 8, 12)
-			x0 += x4
-			x12 ~= x0
-			x12 = bits.rotate_left32(x12, 16)
-			x8 += x12
-			x4 ~= x8
-			x4 = bits.rotate_left32(x4, 12)
-			x0 += x4
-			x12 ~= x0
-			x12 = bits.rotate_left32(x12, 8)
-			x8 += x12
-			x4 ~= x8
-			x4 = bits.rotate_left32(x4, 7)
+		for i := _chacha20.ROUNDS - 2; i > 0; i = i - 2 {
+			x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
+			x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
+			x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
+			x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)

-			// quarterround(x, 1, 5, 9, 13)
-			x1 += x5
-			x13 ~= x1
-			x13 = bits.rotate_left32(x13, 16)
-			x9 += x13
-			x5 ~= x9
-			x5 = bits.rotate_left32(x5, 12)
-			x1 += x5
-			x13 ~= x1
-			x13 = bits.rotate_left32(x13, 8)
-			x9 += x13
-			x5 ~= x9
-			x5 = bits.rotate_left32(x5, 7)
-
-			// quarterround(x, 2, 6, 10, 14)
-			x2 += x6
-			x14 ~= x2
-			x14 = bits.rotate_left32(x14, 16)
-			x10 += x14
-			x6 ~= x10
-			x6 = bits.rotate_left32(x6, 12)
-			x2 += x6
-			x14 ~= x2
-			x14 = bits.rotate_left32(x14, 8)
-			x10 += x14
-			x6 ~= x10
-			x6 = bits.rotate_left32(x6, 7)
-
-			// quarterround(x, 3, 7, 11, 15)
-			x3 += x7
-			x15 ~= x3
-			x15 = bits.rotate_left32(x15, 16)
-			x11 += x15
-			x7 ~= x11
-			x7 = bits.rotate_left32(x7, 12)
-			x3 += x7
-			x15 ~= x3
-			x15 = bits.rotate_left32(x15, 8)
-			x11 += x15
-			x7 ~= x11
-			x7 = bits.rotate_left32(x7, 7)
-
-			// quarterround(x, 0, 5, 10, 15)
-			x0 += x5
-			x15 ~= x0
-			x15 = bits.rotate_left32(x15, 16)
-			x10 += x15
-			x5 ~= x10
-			x5 = bits.rotate_left32(x5, 12)
-			x0 += x5
-			x15 ~= x0
-			x15 = bits.rotate_left32(x15, 8)
-			x10 += x15
-			x5 ~= x10
-			x5 = bits.rotate_left32(x5, 7)
-
-			// quarterround(x, 1, 6, 11, 12)
-			x1 += x6
-			x12 ~= x1
-			x12 = bits.rotate_left32(x12, 16)
-			x11 += x12
-			x6 ~= x11
-			x6 = bits.rotate_left32(x6, 12)
-			x1 += x6
-			x12 ~= x1
-			x12 = bits.rotate_left32(x12, 8)
-			x11 += x12
-			x6 ~= x11
-			x6 = bits.rotate_left32(x6, 7)
-
-			// quarterround(x, 2, 7, 8, 13)
-			x2 += x7
-			x13 ~= x2
-			x13 = bits.rotate_left32(x13, 16)
-			x8 += x13
-			x7 ~= x8
-			x7 = bits.rotate_left32(x7, 12)
-			x2 += x7
-			x13 ~= x2
-			x13 = bits.rotate_left32(x13, 8)
-			x8 += x13
-			x7 ~= x8
-			x7 = bits.rotate_left32(x7, 7)
-
-			// quarterround(x, 3, 4, 9, 14)
-			x3 += x4
-			x14 ~= x3
-			x14 = bits.rotate_left32(x14, 16)
-			x9 += x14
-			x4 ~= x9
-			x4 = bits.rotate_left32(x4, 12)
-			x3 += x4
-			x14 ~= x3
-			x14 = bits.rotate_left32(x14, 8)
-			x9 += x14
-			x4 ~= x9
-			x4 = bits.rotate_left32(x4, 7)
+			x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
+			x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
+			x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
+			x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
 		}

 		x0 += _chacha20.SIGMA_0
@@ -236,117 +171,15 @@ hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
 	x15 := endian.unchecked_get_u32le(iv[12:16])

 	for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
-		// quarterround(x, 0, 4, 8, 12)
-		x0 += x4
-		x12 ~= x0
-		x12 = bits.rotate_left32(x12, 16)
-		x8 += x12
-		x4 ~= x8
-		x4 = bits.rotate_left32(x4, 12)
-		x0 += x4
-		x12 ~= x0
-		x12 = bits.rotate_left32(x12, 8)
-		x8 += x12
-		x4 ~= x8
-		x4 = bits.rotate_left32(x4, 7)
+		x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
+		x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
+		x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
+		x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)

-		// quarterround(x, 1, 5, 9, 13)
-		x1 += x5
-		x13 ~= x1
-		x13 = bits.rotate_left32(x13, 16)
-		x9 += x13
-		x5 ~= x9
-		x5 = bits.rotate_left32(x5, 12)
-		x1 += x5
-		x13 ~= x1
-		x13 = bits.rotate_left32(x13, 8)
-		x9 += x13
-		x5 ~= x9
-		x5 = bits.rotate_left32(x5, 7)
-
-		// quarterround(x, 2, 6, 10, 14)
-		x2 += x6
-		x14 ~= x2
-		x14 = bits.rotate_left32(x14, 16)
-		x10 += x14
-		x6 ~= x10
-		x6 = bits.rotate_left32(x6, 12)
-		x2 += x6
-		x14 ~= x2
-		x14 = bits.rotate_left32(x14, 8)
-		x10 += x14
-		x6 ~= x10
-		x6 = bits.rotate_left32(x6, 7)
-
-		// quarterround(x, 3, 7, 11, 15)
-		x3 += x7
-		x15 ~= x3
-		x15 = bits.rotate_left32(x15, 16)
-		x11 += x15
-		x7 ~= x11
-		x7 = bits.rotate_left32(x7, 12)
-		x3 += x7
-		x15 ~= x3
-		x15 = bits.rotate_left32(x15, 8)
-		x11 += x15
-		x7 ~= x11
-		x7 = bits.rotate_left32(x7, 7)
-
-		// quarterround(x, 0, 5, 10, 15)
-		x0 += x5
-		x15 ~= x0
-		x15 = bits.rotate_left32(x15, 16)
-		x10 += x15
-		x5 ~= x10
-		x5 = bits.rotate_left32(x5, 12)
-		x0 += x5
-		x15 ~= x0
-		x15 = bits.rotate_left32(x15, 8)
-		x10 += x15
-		x5 ~= x10
-		x5 = bits.rotate_left32(x5, 7)
-
-		// quarterround(x, 1, 6, 11, 12)
-		x1 += x6
-		x12 ~= x1
-		x12 = bits.rotate_left32(x12, 16)
-		x11 += x12
-		x6 ~= x11
-		x6 = bits.rotate_left32(x6, 12)
-		x1 += x6
-		x12 ~= x1
-		x12 = bits.rotate_left32(x12, 8)
-		x11 += x12
-		x6 ~= x11
-		x6 = bits.rotate_left32(x6, 7)
-
-		// quarterround(x, 2, 7, 8, 13)
-		x2 += x7
-		x13 ~= x2
-		x13 = bits.rotate_left32(x13, 16)
-		x8 += x13
-		x7 ~= x8
-		x7 = bits.rotate_left32(x7, 12)
-		x2 += x7
-		x13 ~= x2
-		x13 = bits.rotate_left32(x13, 8)
-		x8 += x13
-		x7 ~= x8
-		x7 = bits.rotate_left32(x7, 7)
-
-		// quarterround(x, 3, 4, 9, 14)
-		x3 += x4
-		x14 ~= x3
-		x14 = bits.rotate_left32(x14, 16)
-		x9 += x14
-		x4 ~= x9
-		x4 = bits.rotate_left32(x4, 12)
-		x3 += x4
-		x14 ~= x3
-		x14 = bits.rotate_left32(x14, 8)
-		x9 += x14
-		x4 ~= x9
-		x4 = bits.rotate_left32(x4, 7)
+		x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
+		x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
+		x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
+		x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
 	}

 	endian.unchecked_put_u32le(dst[0:4], x0)
--- a/core/crypto/_edwards25519/edwards25519.odin
+++ b/core/crypto/_edwards25519/edwards25519.odin
@@ -195,7 +195,6 @@ ge_generator :: proc "contextless" (ge: ^Group_Element) {
 	ge_set(ge, &GE_BASEPOINT)
 }

-@(private)
 Addend_Group_Element :: struct {
 	y2_minus_x2:  field.Loose_Field_Element, // t1
 	y2_plus_x2:   field.Loose_Field_Element, // t3
@@ -203,7 +202,6 @@ Addend_Group_Element :: struct {
 	two_times_z2: field.Loose_Field_Element, // t5
 }

-@(private)
 ge_addend_set :: proc "contextless" (ge_a: ^Addend_Group_Element, ge: ^Group_Element) {
 	field.fe_sub(&ge_a.y2_minus_x2, &ge.y, &ge.x)
 	field.fe_add(&ge_a.y2_plus_x2, &ge.y, &ge.x)
@@ -420,6 +418,6 @@ ge_in_prime_order_subgroup_vartime :: proc "contextless" (ge: ^Group_Element) ->
 	// that is a ~50% speedup, and a lot of added complexity for something
 	// that is better solved by "just use ristretto255".
 	tmp: Group_Element = ---
-	_ge_scalarmult(&tmp, ge, &SC_ELL, true)
+	ge_scalarmult_raw(&tmp, ge, &SC_ELL, true)
 	return ge_equal(&tmp, &GE_IDENTITY) == 1
 }
--- a/core/crypto/_edwards25519/edwards25519_scalar_mul.odin
+++ b/core/crypto/_edwards25519/edwards25519_scalar_mul.odin
@@ -1,130 +1,24 @@
 package _edwards25519

+import "core:crypto"
 import field "core:crypto/_fiat/field_scalar25519"
-import "core:math/bits"
+import subtle "core:crypto/_subtle"
 import "core:mem"

-// GE_BASEPOINT_TABLE is 1 * G, ... 15 * G, in precomputed format.
-//
-// Note: When generating, the values were reduced to Tight_Field_Element
-// ranges, even though that is not required.
-@(private)
-GE_BASEPOINT_TABLE := Multiply_Table {
-	{
-		{62697248952638, 204681361388450, 631292143396476, 338455783676468, 1213667448819585},
-		{1288382639258501, 245678601348599, 269427782077623, 1462984067271730, 137412439391563},
-		{301289933810280, 1259582250014073, 1422107436869536, 796239922652654, 1953934009299142},
-		{2, 0, 0, 0, 0},
-	},
-	{
-		{1519297034332653, 1098796920435767, 1823476547744119, 808144629470969, 2110930855619772},
-		{338005982828284, 1667856962156925, 100399270107451, 1604566703601691, 1950338038771369},
-		{1920505767731247, 1443759578976892, 1659852098357048, 1484431291070208, 275018744912646},
-		{763163817085987, 2195095074806923, 2167883174351839, 1868059999999762, 911071066608705},
-	},
-	{
-		{960627541894068, 1314966688943942, 1126875971034044, 2059608312958945, 605975666152586},
-		{1714478358025626, 2209607666607510, 1600912834284834, 496072478982142, 481970031861896},
-		{851735079403194, 1088965826757164, 141569479297499, 602804610059257, 2004026468601520},
-		{197585529552380, 324719066578543, 564481854250498, 1173818332764578, 35452976395676},
-	},
-	{
-		{1152980410747203, 2196804280851952, 25745194962557, 1915167295473129, 1266299690309224},
-		{809905889679060, 979732230071345, 1509972345538142, 188492426534402, 818965583123815},
-		{997685409185036, 1451818320876327, 2126681166774509, 2000509606057528, 235432372486854},
-		{887734189279642, 1460338685162044, 877378220074262, 102436391401299, 153369156847490},
-	},
-	{
-		{2056621900836770, 1821657694132497, 1627986892909426, 1163363868678833, 1108873376459226},
-		{1187697490593623, 1066539945237335, 885654531892000, 1357534489491782, 359370291392448},
-		{1509033452137525, 1305318174298508, 613642471748944, 1987256352550234, 1044283663101541},
-		{220105720697037, 387661783287620, 328296827867762, 360035589590664, 795213236824054},
-	},
-	{
-		{1820794733038396, 1612235121681074, 757405923441402, 1094031020892801, 231025333128907},
-		{1639067873254194, 1484176557946322, 300800382144789, 1329915446659183, 1211704578730455},
-		{641900794791527, 1711751746971612, 179044712319955, 576455585963824, 1852617592509865},
-		{743549047192397, 685091042550147, 1952415336873496, 1965124675654685, 513364998442917},
-	},
-	{
-		{1004557076870448, 1762911374844520, 1330807633622723, 384072910939787, 953849032243810},
-		{2178275058221458, 257933183722891, 376684351537894, 2010189102001786, 1981824297484148},
-		{1332915663881114, 1286540505502549, 1741691283561518, 977214932156314, 1764059494778091},
-		{429702949064027, 1368332611650677, 2019867176450999, 2212258376161746, 526160996742554},
-	},
-	{
-		{2098932988258576, 2203688382075948, 2120400160059479, 1748488020948146, 1203264167282624},
-		{677131386735829, 1850249298025188, 672782146532031, 2144145693078904, 2088656272813787},
-		{1065622343976192, 1573853211848116, 223560413590068, 333846833073379, 27832122205830},
-		{1781008836504573, 917619542051793, 544322748939913, 882577394308384, 1720521246471195},
-	},
-	{
-		{660120928379860, 2081944024858618, 1878411111349191, 424587356517195, 2111317439894005},
-		{1834193977811532, 1864164086863319, 797334633289424, 150410812403062, 2085177078466389},
-		{1438117271371866, 783915531014482, 388731514584658, 292113935417795, 1945855002546714},
-		{1678140823166658, 679103239148744, 614102761596238, 1052962498997885, 1863983323810390},
-	},
-	{
-		{1690309392496233, 1116333140326275, 1377242323631039, 717196888780674, 82724646713353},
-		{1722370213432106, 74265192976253, 264239578448472, 1714909985012994, 2216984958602173},
-		{2010482366920922, 1294036471886319, 566466395005815, 1631955803657320, 1751698647538458},
-		{1073230604155753, 1159087041338551, 1664057985455483, 127472702826203, 1339591128522371},
-	},
-	{
-		{478053307175577, 2179515791720985, 21146535423512, 1831683844029536, 462805561553981},
-		{1945267486565588, 1298536818409655, 2214511796262989, 1904981051429012, 252904800782086},
-		{268945954671210, 222740425595395, 1208025911856230, 1080418823003555, 75929831922483},
-		{1884784014268948, 643868448202966, 978736549726821, 46385971089796, 1296884812292320},
-	},
-	{
-		{1861159462859103, 7077532564710, 963010365896826, 1938780006785270, 766241051941647},
-		{1778966986051906, 1713995999765361, 1394565822271816, 1366699246468722, 1213407027149475},
-		{1978989286560907, 2135084162045594, 1951565508865477, 671788336314416, 293123929458176},
-		{902608944504080, 2167765718046481, 1285718473078022, 1222562171329269, 492109027844479},
-	},
-	{
-		{1820807832746213, 1029220580458586, 1101997555432203, 1039081975563572, 202477981158221},
-		{1866134980680205, 2222325502763386, 1830284629571201, 1046966214478970, 418381946936795},
-		{1783460633291322, 1719505443254998, 1810489639976220, 877049370713018, 2187801198742619},
-		{197118243000763, 305493867565736, 518814410156522, 1656246186645170, 901894734874934},
-	},
-	{
-		{225454942125915, 478410476654509, 600524586037746, 643450007230715, 1018615928259319},
-		{1733330584845708, 881092297970296, 507039890129464, 496397090721598, 2230888519577628},
-		{690155664737246, 1010454785646677, 753170144375012, 1651277613844874, 1622648796364156},
-		{1321310321891618, 1089655277873603, 235891750867089, 815878279563688, 1709264240047556},
-	},
-	{
-		{805027036551342, 1387174275567452, 1156538511461704, 1465897486692171, 1208567094120903},
-		{2228417017817483, 202885584970535, 2182114782271881, 2077405042592934, 1029684358182774},
-		{460447547653983, 627817697755692, 524899434670834, 1228019344939427, 740684787777653},
-		{849757462467675, 447476306919899, 422618957298818, 302134659227815, 675831828440895},
-	},
-}
-
 ge_scalarmult :: proc "contextless" (ge, p: ^Group_Element, sc: ^Scalar) {
 	tmp: field.Non_Montgomery_Domain_Field_Element
 	field.fe_from_montgomery(&tmp, sc)

-	_ge_scalarmult(ge, p, &tmp)
+	ge_scalarmult_raw(ge, p, &tmp)

 	mem.zero_explicit(&tmp, size_of(tmp))
 }

-ge_scalarmult_basepoint :: proc "contextless" (ge: ^Group_Element, sc: ^Scalar) {
-	// Something like the comb method from "Fast and compact elliptic-curve
-	// cryptography" Section 3.3, would be more performant, but more
-	// complex.
-	//
-	// - https://eprint.iacr.org/2012/309
-	ge_scalarmult(ge, &GE_BASEPOINT, sc)
-}
-
 ge_scalarmult_vartime :: proc "contextless" (ge, p: ^Group_Element, sc: ^Scalar) {
 	tmp: field.Non_Montgomery_Domain_Field_Element
 	field.fe_from_montgomery(&tmp, sc)

-	_ge_scalarmult(ge, p, &tmp, true)
+	ge_scalarmult_raw(ge, p, &tmp, true)
 }

 ge_double_scalarmult_basepoint_vartime :: proc "contextless" (
@@ -147,6 +41,12 @@ ge_double_scalarmult_basepoint_vartime :: proc "contextless" (

 	A_tbl: Multiply_Table = ---
 	mul_tbl_set(&A_tbl, A, &tmp_add)
+	when crypto.COMPACT_IMPLS == true {
+		G_tbl: Multiply_Table = ---
+		mul_tbl_set(&G_tbl, &GE_BASEPOINT, &tmp_add)
+	} else {
+		tmp_bp_addend: Basepoint_Addend_Group_Element = ---
+	}

 	sc_a, sc_b: field.Non_Montgomery_Domain_Field_Element
 	field.fe_from_montgomery(&sc_a, a)
@@ -170,21 +70,28 @@ ge_double_scalarmult_basepoint_vartime :: proc "contextless" (
 			ge_double(&tmp, &tmp, &tmp_dbl)
 		}
 		mul_tbl_add(&tmp, &A_tbl, hi_a, &tmp_add, &tmp_addend, true)
-		mul_tbl_add(&tmp, &GE_BASEPOINT_TABLE, hi_b, &tmp_add, &tmp_addend, true)
+		when crypto.COMPACT_IMPLS == true {
+			mul_tbl_add(&tmp, &G_tbl, hi_b, &tmp_add, &tmp_addend, true)
+		} else {
+			mul_bp_tbl_add(&tmp, GE_BASEPOINT_TABLE, hi_b, &tmp_add, &tmp_bp_addend, true)
+		}

 		ge_double(&tmp, &tmp, &tmp_dbl)
 		ge_double(&tmp, &tmp, &tmp_dbl)
 		ge_double(&tmp, &tmp, &tmp_dbl)
 		ge_double(&tmp, &tmp, &tmp_dbl)
 		mul_tbl_add(&tmp, &A_tbl, lo_a, &tmp_add, &tmp_addend, true)
-		mul_tbl_add(&tmp, &GE_BASEPOINT_TABLE, lo_b, &tmp_add, &tmp_addend, true)
+		when crypto.COMPACT_IMPLS == true {
+			mul_tbl_add(&tmp, &G_tbl, lo_b, &tmp_add, &tmp_addend, true)
+		} else {
+			mul_bp_tbl_add(&tmp, GE_BASEPOINT_TABLE, lo_b, &tmp_add, &tmp_bp_addend, true)
+		}
 	}

 	ge_set(ge, &tmp)
 }

-@(private)
-_ge_scalarmult :: proc "contextless" (
+ge_scalarmult_raw :: proc "contextless" (
 	ge, p: ^Group_Element,
 	sc: ^field.Non_Montgomery_Domain_Field_Element,
 	unsafe_is_vartime := false,
@@ -281,8 +188,8 @@ mul_tbl_add :: proc "contextless" (
 		{2, 0, 0, 0, 0}, // z * 2
 	}
 	for i := u64(1); i < 16; i = i + 1 {
-		_, ctrl := bits.sub_u64(0, (i ~ idx), 0)
-		ge_addend_conditional_assign(tmp_addend, &tbl[i - 1], int(~ctrl) & 1)
+		ctrl := subtle.eq(i, idx)
+		ge_addend_conditional_assign(tmp_addend, &tbl[i - 1], int(ctrl))
 	}
 	ge_add_addend(ge, ge, tmp_addend, tmp_add)
 }
--- a/core/crypto/_edwards25519/edwards25519_scalar_mul_base.odin
+++ b/core/crypto/_edwards25519/edwards25519_scalar_mul_base.odin
@@ -0,0 +1,147 @@
+package _edwards25519
+
+import "core:crypto"
+import field "core:crypto/_fiat/field_curve25519"
+import scalar "core:crypto/_fiat/field_scalar25519"
+import subtle "core:crypto/_subtle"
+import "core:mem"
+
+ge_scalarmult_basepoint :: proc "contextless" (ge: ^Group_Element, sc: ^Scalar) {
+	when crypto.COMPACT_IMPLS == true {
+		ge_scalarmult(ge, &GE_BASEPOINT, sc)
+	} else {
+		tmp_sc: scalar.Non_Montgomery_Domain_Field_Element
+		scalar.fe_from_montgomery(&tmp_sc, sc)
+
+		tmp_add: Add_Scratch = ---
+		tmp_addend: Basepoint_Addend_Group_Element = ---
+
+		ge_identity(ge)
+		for i in 0..<32 {
+			limb := i / 8
+			shift := uint(i & 7) * 8
+			limb_byte := tmp_sc[limb] >> shift
+
+			hi, lo := (limb_byte >> 4) & 0x0f, limb_byte & 0x0f
+			mul_bp_tbl_add(ge, &Gen_Multiply_Table_edwards25519_lo[i], lo, &tmp_add, &tmp_addend, false)
+			mul_bp_tbl_add(ge, &Gen_Multiply_Table_edwards25519_hi[i], hi, &tmp_add, &tmp_addend, false)
+		}
+
+		mem.zero_explicit(&tmp_sc, size_of(tmp_sc))
+		mem.zero_explicit(&tmp_add, size_of(Add_Scratch))
+		mem.zero_explicit(&tmp_addend, size_of(Basepoint_Addend_Group_Element))
+	}
+}
+
+when crypto.COMPACT_IMPLS == false {
+	@(private="file",rodata)
+	TWO_TIMES_Z2 := field.Loose_Field_Element{2, 0, 0, 0, 0}
+
+	@(private)
+	Basepoint_Addend_Group_Element :: struct {
+		y2_minus_x2:  field.Loose_Field_Element, // t1
+		y2_plus_x2:   field.Loose_Field_Element, // t3
+		k_times_t2:   field.Tight_Field_Element, // t4
+	}
+
+	@(private)
+	Basepoint_Multiply_Table :: [15]Basepoint_Addend_Group_Element
+
+	@(private)
+	ge_bp_addend_conditional_assign :: proc "contextless" (ge_a, a: ^Basepoint_Addend_Group_Element, ctrl: int) {
+		field.fe_cond_select(&ge_a.y2_minus_x2, &ge_a.y2_minus_x2, &a.y2_minus_x2, ctrl)
+		field.fe_cond_select(&ge_a.y2_plus_x2, &ge_a.y2_plus_x2, &a.y2_plus_x2, ctrl)
+		field.fe_cond_select(&ge_a.k_times_t2, &ge_a.k_times_t2, &a.k_times_t2, ctrl)
+	}
+
+	@(private)
+	ge_add_bp_addend :: proc "contextless" (
+		ge, a: ^Group_Element,
+		b: ^Basepoint_Addend_Group_Element,
+		scratch: ^Add_Scratch,
+	) {
+		// https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html#addition-add-2008-hwcd-3
+		// Assumptions: k=2*d, z = 1 (precomputation ftw)
+		//
+		// t0 = Y1-X1
+		// t1 = Y2-X2
+		// A = t0*t1
+		// t2 = Y1+X1
+		// t3 = Y2+X2
+		// B = t2*t3
+		// t4 = k*T2
+		// C = T1*t4
+		// t5 = 2*Z2
+		// D = Z1*t5
+		// E = B-A
+		// F = D-C
+		// G = D+C
+		// H = B+A
+		// X3 = E*F
+		// Y3 = G*H
+		// T3 = E*H
+		// Z3 = F*G
+		//
+		// In order to make the scalar multiply faster, the addend is provided
+		// as a `Basepoint_Addend_Group_Element` with t1, t3, and t4 precomputed,
+		// as those are the only values used by the formula that depend on `b`.
+		// Since the table entries are rescaled so that Z2 = 1, t5 = 2*Z2 is
+		// the constant `TWO_TIMES_Z2` and is not stored per entry.  This saves
+		// 1 sub, 2 adds, and 1 multiply each time the addend is reused.
+
+		A, B, C, D := &scratch.A, &scratch.B, &scratch.C, &scratch.D
+		E, F, G, H := &scratch.E, &scratch.F, &scratch.G, &scratch.H
+		t0, t2 := &scratch.t0, &scratch.t2
+
+		field.fe_sub(t0, &a.y, &a.x)
+		t1 := &b.y2_minus_x2
+		field.fe_carry_mul(A, t0, t1)
+		field.fe_add(t2, &a.y, &a.x)
+		t3 := &b.y2_plus_x2
+		field.fe_carry_mul(B, t2, t3)
+		t4 := &b.k_times_t2
+		field.fe_carry_mul(C, field.fe_relax_cast(&a.t), field.fe_relax_cast(t4))
+		field.fe_carry_mul(D, field.fe_relax_cast(&a.z), &TWO_TIMES_Z2)
+		field.fe_sub(E, B, A)
+		field.fe_sub(F, D, C)
+		field.fe_add(G, D, C)
+		field.fe_add(H, B, A)
+		field.fe_carry_mul(&ge.x, E, F)
+		field.fe_carry_mul(&ge.y, G, H)
+		field.fe_carry_mul(&ge.t, E, H)
+		field.fe_carry_mul(&ge.z, F, G)
+	}
+
+	@(private)
+	mul_bp_tbl_add :: proc "contextless" (
+		ge: ^Group_Element,
+		tbl: ^Basepoint_Multiply_Table,
+		idx: u64,
+		tmp_add: ^Add_Scratch,
+		tmp_addend: ^Basepoint_Addend_Group_Element,
+		unsafe_is_vartime: bool,
+	) {
+		// Variable time lookup, with the addition omitted entirely if idx == 0.
+		if unsafe_is_vartime {
+			// Skip adding the point at infinity.
+			if idx != 0 {
+				ge_add_bp_addend(ge, ge, &tbl[idx-1], tmp_add)
+			}
+			return
+		}
+
+		// Constant time lookup.
+		tmp_addend^ = {
+			// Point at infinity (0, 1, 1, 0) in precomputed form, note
+			// that the precomputed tables rescale so that `Z = 1`.
+			{1, 0, 0, 0, 0}, // y - x
+			{1, 0, 0, 0, 0}, // y + x
+			{0, 0, 0, 0, 0}, // t * 2d
+		}
+		for i := u64(1); i < 16; i = i + 1 {
+			ctrl := subtle.eq(i, idx)
+			ge_bp_addend_conditional_assign(tmp_addend, &tbl[i - 1], int(ctrl))
+		}
+		ge_add_bp_addend(ge, ge, tmp_addend, tmp_add)
+	}
+}
--- a/core/crypto/_edwards25519/edwards25519_table.odin
+++ b/core/crypto/_edwards25519/edwards25519_table.odin
--- a/core/crypto/_edwards25519/tools/edwards_gen_tables.odin
+++ b/core/crypto/_edwards25519/tools/edwards_gen_tables.odin
@@ -0,0 +1,134 @@
+package weistrass_tools
+
+import ed "core:crypto/_edwards25519"
+import field "core:crypto/_fiat/field_curve25519"
+import scalar "core:crypto/_fiat/field_scalar25519"
+import "core:encoding/endian"
+import "core:fmt"
+import path "core:path/filepath"
+import "core:os"
+import "core:strings"
+
+// Yes this leaks memory, fite me IRL.
+
+GENERATED :: `/*
+	------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
+*/`
+
+@(private, rodata)
+FE_D2 := field.Tight_Field_Element {
+	1859910466990425,
+	932731440258426,
+	1072319116312658,
+	1815898335770999,
+	633789495995903,
+}
+
+main :: proc() {
+	Basepoint_Addend_Group_Element :: struct {
+		y2_minus_x2:  field.Loose_Field_Element, // t1
+		y2_plus_x2:   field.Loose_Field_Element, // t3
+		k_times_t2:   field.Tight_Field_Element, // t4
+	}
+	Basepoint_Multiply_Table :: [15]Basepoint_Addend_Group_Element
+
+	ge_bp_addend_set := proc(ge_a: ^Basepoint_Addend_Group_Element, ge: ^ed.Group_Element) {
+		// We rescale so Z == 1, so T = X * Y
+		x_, y_, z_inv: field.Tight_Field_Element
+		field.fe_carry_inv(&z_inv, field.fe_relax_cast(&ge.z))
+		field.fe_carry_mul(&x_, field.fe_relax_cast(&ge.x), field.fe_relax_cast(&z_inv))
+		field.fe_carry_mul(&y_, field.fe_relax_cast(&ge.y), field.fe_relax_cast(&z_inv))
+
+		field.fe_sub(&ge_a.y2_minus_x2, &y_, &x_)
+		field.fe_add(&ge_a.y2_plus_x2, &y_, &x_)
+		field.fe_carry_mul(&ge_a.k_times_t2, field.fe_relax_cast(&x_), field.fe_relax_cast(&y_))
+		field.fe_carry_mul(&ge_a.k_times_t2, field.fe_relax_cast(&ge_a.k_times_t2), field.fe_relax_cast(&FE_D2))
+	}
+
+	Multiply_Table_hi: [32]Basepoint_Multiply_Table
+	Multiply_Table_lo: [32]Basepoint_Multiply_Table
+
+	sc_set_unchecked := proc(sc: ^scalar.Non_Montgomery_Domain_Field_Element, b: []byte) {
+		sc[0] = endian.unchecked_get_u64le(b[0:])
+		sc[1] = endian.unchecked_get_u64le(b[8:])
+		sc[2] = endian.unchecked_get_u64le(b[16:])
+		sc[3] = endian.unchecked_get_u64le(b[24:])
+	}
+
+	g, p: ed.Group_Element
+	ed.ge_generator(&g)
+
+	sc: scalar.Non_Montgomery_Domain_Field_Element
+
+	// Precompute ([1,15] << n) * G multiples of G, LSB->MSB
+	for i in 0..<32 {
+		b: [32]byte
+		for j in 1..<16 {
+			b[i] = u8(j)
+			sc_set_unchecked(&sc, b[:])
+			ed.ge_scalarmult_raw(&p, &g, &sc, true)
+			ge_bp_addend_set(&Multiply_Table_lo[i][j-1], &p)
+
+			b[i] = u8(j) << 4
+			sc_set_unchecked(&sc, b[:])
+			ed.ge_scalarmult_raw(&p, &g, &sc, true)
+			ge_bp_addend_set(&Multiply_Table_hi[i][j-1], &p)
+
+			b[i] = 0
+		}
+	}
+
+	fn := path.join({ODIN_ROOT, "core", "crypto", "_edwards25519", "edwards25519_table.odin"})
+	bld: strings.Builder
+	w := strings.to_writer(&bld)
+
+	fmt.wprintln(w, "package _edwards25519")
+	fmt.wprintln(w, "")
+	fmt.wprintln(w, GENERATED)
+	fmt.wprintln(w, "")
+	fmt.wprintln(w, "import \"core:crypto\"")
+	fmt.wprintln(w, "")
+	fmt.wprintln(w, "when crypto.COMPACT_IMPLS == false {")
+
+	fmt.wprintln(w, "\t@(private,rodata)")
+	fmt.wprintln(w, "\tGen_Multiply_Table_edwards25519_lo := [32]Basepoint_Multiply_Table {")
+	for &v in Multiply_Table_lo {
+		fmt.wprintln(w, "\t\t{")
+		for &ap in v {
+			fmt.wprintln(w, "\t\t\t{")
+
+			t1, t3, t4 := &ap.y2_minus_x2, &ap.y2_plus_x2, &ap.k_times_t2
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d, %d},\n", t1[0], t1[1], t1[2], t1[3], t1[4])
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d, %d},\n", t3[0], t3[1], t3[2], t3[3], t3[4])
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d, %d},\n", t4[0], t4[1], t4[2], t4[3], t4[4])
+
+			fmt.wprintln(w, "\t\t\t},")
+		}
+		fmt.wprintln(w, "\t\t},")
+	}
+	fmt.wprintln(w, "\t}\n")
+
+	fmt.wprintln(w, "\t@(private,rodata)")
+	fmt.wprintln(w, "\tGen_Multiply_Table_edwards25519_hi := [32]Basepoint_Multiply_Table {")
+	for &v in Multiply_Table_hi {
+		fmt.wprintln(w, "\t\t{")
+		for &ap in v {
+			fmt.wprintln(w, "\t\t\t{")
+
+			t1, t3, t4 := &ap.y2_minus_x2, &ap.y2_plus_x2, &ap.k_times_t2
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d, %d},\n", t1[0], t1[1], t1[2], t1[3], t1[4])
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d, %d},\n", t3[0], t3[1], t3[2], t3[3], t3[4])
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d, %d},\n", t4[0], t4[1], t4[2], t4[3], t4[4])
+
+			fmt.wprintln(w, "\t\t\t},")
+		}
+		fmt.wprintln(w, "\t\t},")
+	}
+	fmt.wprintln(w, "\t}\n")
+
+	fmt.wprintln(w, "\tGE_BASEPOINT_TABLE := &Gen_Multiply_Table_edwards25519_lo[0]")
+
+	fmt.wprintln(w, "}")
+
+	_ = os.write_entire_file(fn, transmute([]byte)(strings.to_string(bld)))
+}
--- a/core/crypto/_fiat/field_p256r1/field.odin
+++ b/core/crypto/_fiat/field_p256r1/field.odin
@@ -0,0 +1,346 @@
+package field_p256r1
+
+import subtle "core:crypto/_subtle"
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+fe_clear :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) {
+	mem.zero_explicit(arg1, size_of(Montgomery_Domain_Field_Element))
+}
+
+fe_clear_vec :: proc "contextless" (
+	arg1: []^Montgomery_Domain_Field_Element,
+) {
+	for fe in arg1 {
+		fe_clear(fe)
+	}
+}
+
+fe_from_bytes :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: []byte,
+	unsafe_assume_canonical := false,
+) -> bool {
+	ensure_contextless(len(arg1) == 32, "p256r1: invalid fe input buffer")
+
+	// Note: We assume the input is in big-endian.
+	tmp := Non_Montgomery_Domain_Field_Element {
+		endian.unchecked_get_u64be(arg1[24:]),
+		endian.unchecked_get_u64be(arg1[16:]),
+		endian.unchecked_get_u64be(arg1[8:]),
+		endian.unchecked_get_u64be(arg1[0:]),
+	}
+	defer mem.zero_explicit(&tmp, size_of(tmp))
+
+	// Check that tmp is in the range [0, ELL): `(ELL - 1) - tmp` borrows iff tmp >= ELL.
+	if !unsafe_assume_canonical {
+		_, borrow := bits.sub_u64(ELL[0] - 1, tmp[0], 0)
+		_, borrow = bits.sub_u64(ELL[1], tmp[1], borrow)
+		_, borrow = bits.sub_u64(ELL[2], tmp[2], borrow)
+		_, borrow = bits.sub_u64(ELL[3], tmp[3], borrow)
+		if borrow != 0 {
+			return false
+		}
+	}
+
+	fe_to_montgomery(out1, &tmp)
+
+	return true
+}
+
+fe_to_bytes :: proc "contextless" (out1: []byte, arg1: ^Montgomery_Domain_Field_Element) {
+	ensure_contextless(len(out1) == 32, "p256r1: invalid fe output buffer")
+
+	tmp: Non_Montgomery_Domain_Field_Element
+	fe_from_montgomery(&tmp, arg1)
+
+	// Note: Likewise, output in big-endian.
+	endian.unchecked_put_u64be(out1[24:], tmp[0])
+	endian.unchecked_put_u64be(out1[16:], tmp[1])
+	endian.unchecked_put_u64be(out1[8:], tmp[2])
+	endian.unchecked_put_u64be(out1[0:], tmp[3])
+
+	mem.zero_explicit(&tmp, size_of(tmp))
+}
+
+@(require_results)
+fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> int {
+	tmp: Montgomery_Domain_Field_Element
+	fe_sub(&tmp, arg1, arg2)
+
+	// The difference is zero iff arg1 == arg2.  No borrow is involved here: the
+	// result is `u64_is_zero(fe_non_zero(tmp))` (presumably 1 on equality, else 0).
+	is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))
+
+	fe_clear(&tmp)
+
+	return int(is_eq)
+}
+
+@(require_results)
+fe_is_odd :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> int {
+	tmp: Non_Montgomery_Domain_Field_Element
+	defer mem.zero_explicit(&tmp, size_of(tmp))
+
+	fe_from_montgomery(&tmp, arg1)
+	return int(tmp[0] & 1)
+}
+
+fe_pow2k :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: ^Montgomery_Domain_Field_Element,
+	arg2: uint,
+) {
+	// Sets `out1 = arg1^(2^arg2)`.  Zero squarings is `arg1^(2^0) = arg1`, not 1.
+	if arg2 == 0 {
+		fe_set(out1, arg1)
+		return
+	}
+
+	fe_square(out1, arg1)
+	for _ in 1 ..< arg2 {
+		fe_square(out1, out1)
+	}
+}
+
+fe_inv :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	// Inversion computation is derived from the addition chain:
+	//
+	//	_10     = 2*1
+	//	_11     = 1 + _10
+	//	_110    = 2*_11
+	//	_111    = 1 + _110
+	//	_111000 = _111 << 3
+	//	_111111 = _111 + _111000
+	//	x12     = _111111 << 6 + _111111
+	//	x15     = x12 << 3 + _111
+	//	x16     = 2*x15 + 1
+	//	x32     = x16 << 16 + x16
+	//	i53     = x32 << 15
+	//	x47     = x15 + i53
+	//	i263    = ((i53 << 17 + 1) << 143 + x47) << 47
+	//	return    (x47 + i263) << 2
+	//
+	// Operations: 255 squares 11 multiplies
+	//
+	// Generated by github.com/mmcloughlin/addchain v0.4.0.
+
+	// Note: Need to stash `arg1` (`xx`) in the case that `out1`/`arg1` alias,
+	// as `arg1` is used after `out1` has been altered.
+	t0, t1, xx: Montgomery_Domain_Field_Element = ---, ---, arg1^
+
+	// Step 1: z = x^0x2
+	fe_square(out1, arg1)
+
+	// Step 2: z = x^0x3
+	fe_mul(out1, &xx, out1)
+
+	// Step 3: z = x^0x6
+	fe_square(out1, out1)
+
+	// Step 4: z = x^0x7
+	fe_mul(out1, &xx, out1)
+
+	// Step 7: t0 = x^0x38
+	fe_pow2k(&t0, out1, 3)
+
+	// Step 8: t0 = x^0x3f
+	fe_mul(&t0, out1, &t0)
+
+	// Step 14: t1 = x^0xfc0
+	fe_pow2k(&t1, &t0, 6)
+
+	// Step 15: t0 = x^0xfff
+	fe_mul(&t0, &t0, &t1)
+
+	// Step 18: t0 = x^0x7ff8
+	fe_pow2k(&t0, &t0, 3)
+
+	// Step 19: z = x^0x7fff
+	fe_mul(out1, out1, &t0)
+
+	// Step 20: t0 = x^0xfffe
+	fe_square(&t0, out1)
+
+	// Step 21: t0 = x^0xffff
+	fe_mul(&t0, &xx, &t0)
+
+	// Step 37: t1 = x^0xffff0000
+	fe_pow2k(&t1, &t0, 16)
+
+	// Step 38: t0 = x^0xffffffff
+	fe_mul(&t0, &t0, &t1)
+
+	// Step 53: t0 = x^0x7fffffff8000
+	fe_pow2k(&t0, &t0, 15)
+
+	// Step 54: z = x^0x7fffffffffff
+	fe_mul(out1, out1, &t0)
+
+	// Step 71: t0 = x^0xffffffff00000000
+	fe_pow2k(&t0, &t0, 17)
+
+	// Step 72: t0 = x^0xffffffff00000001
+	fe_mul(&t0, &xx, &t0)
+
+	// Step 215: t0 = x^0x7fffffff80000000800000000000000000000000000000000000
+	fe_pow2k(&t0, &t0, 143)
+
+	// Step 216: t0 = x^0x7fffffff800000008000000000000000000000007fffffffffff
+	fe_mul(&t0, out1, &t0)
+
+	// Step 263: t0 = x^0x3fffffffc00000004000000000000000000000003fffffffffff800000000000
+	fe_pow2k(&t0, &t0, 47)
+
+	// Step 264: z = x^0x3fffffffc00000004000000000000000000000003fffffffffffffffffffffff
+	fe_mul(out1, out1, &t0)
+
+	// Step 266: z = x^0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc
+	fe_pow2k(out1, out1, 2)
+	// Final multiply (a 12th, outside the chain above): z = x^(p - 3) * x = x^(p - 2)
+	fe_mul(out1, out1, &xx)
+
+	fe_clear_vec([]^Montgomery_Domain_Field_Element{&t0, &t1, &xx})
+}
+
+@(require_results)
+fe_sqrt :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) -> int {
+	// Square root candidate can be derived via exponentiation by `(p + 1) / 4`
+	// From sage: 28948022302589062190674361737351893382521535853822578548883407827216774463488
+	//
+	// Square root computation is derived from the addition chain:
+	//
+	//	_10       = 2*1
+	//	_11       = 1 + _10
+	//	_1100     = _11 << 2
+	//	_1111     = _11 + _1100
+	//	_11110000 = _1111 << 4
+	//	_11111111 = _1111 + _11110000
+	//	x16       = _11111111 << 8 + _11111111
+	//	x32       = x16 << 16 + x16
+	//	return      ((x32 << 32 + 1) << 96 + 1) << 94
+	//
+	// Operations: 253 squares 7 multiplies
+	//
+	// Generated by github.com/mmcloughlin/addchain v0.4.0.
+
+	// Likewise this tramples over arg1, so stash another copy.
+	t0, xx: Montgomery_Domain_Field_Element = ---, arg1^
+
+	// Step 1: z = x^0x2
+	fe_square(out1, arg1)
+
+	// Step 2: z = x^0x3
+	fe_mul(out1, &xx, out1)
+
+	// Step 4: t0 = x^0xc (`_1100 = _11 << 2`: square z = x^0x3 twice, not x)
+	fe_pow2k(&t0, out1, 2)
+
+	// Step 5: z = x^0xf
+	fe_mul(out1, out1, &t0)
+
+	// Step 9: t0 = x^0xf0
+	fe_pow2k(&t0, out1, 4)
+
+	// Step 10: z = x^0xff
+	fe_mul(out1, out1, &t0)
+
+	// Step 18: t0 = x^0xff00
+	fe_pow2k(&t0, out1, 8)
+
+	// Step 19: z = x^0xffff
+	fe_mul(out1, out1, &t0)
+
+	// Step 35: t0 = x^0xffff0000
+	fe_pow2k(&t0, out1, 16)
+
+	// Step 36: z = x^0xffffffff
+	fe_mul(out1, out1, &t0)
+
+	// Step 68: z = x^0xffffffff00000000
+	fe_pow2k(out1, out1, 32)
+
+	// Step 69: z = x^0xffffffff00000001
+	fe_mul(out1, &xx, out1)
+
+	// Step 165: z = x^0xffffffff00000001000000000000000000000000
+	fe_pow2k(out1, out1, 96)
+
+	// Step 166: z = x^0xffffffff00000001000000000000000000000001
+	fe_mul(out1, &xx, out1)
+
+	// Step 260: z = x^0x3fffffffc0000000400000000000000000000000400000000000000000000000
+	fe_pow2k(out1, out1, 94)
+
+	// Ensure the candidate is a square root: returns 1 on success, else 0 with `out1` zeroed.
+	check, zero: Montgomery_Domain_Field_Element
+	fe_square(&check, out1)
+
+	is_valid := fe_equal(&check, &xx)
+	fe_cond_select(out1, &zero, out1, is_valid)
+
+	fe_clear_vec([]^Montgomery_Domain_Field_Element{&t0, &xx, &check})
+
+	return is_valid
+
+}
+
+fe_zero :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
+	out1[0] = 0
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+}
+
+fe_set :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	x4 := arg1[3]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+
+@(optimization_mode = "none")
+fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Montgomery_Domain_Field_Element, arg1: int) {
+	mask := (u64(arg1) * 0xffffffffffffffff)
+	x := (out1[0] ~ out2[0]) & mask
+	x1, y1 := out1[0] ~ x, out2[0] ~ x
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	x = (out1[3] ~ out2[3]) & mask
+	x4, y4 := out1[3] ~ x, out2[3] ~ x
+	out1[0], out2[0] = x1, y1
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+	out1[3], out2[3] = x4, y4
+}
+
+@(optimization_mode = "none")
+fe_cond_select :: #force_no_inline proc "contextless" (
+	out1, arg1, arg2: ^Montgomery_Domain_Field_Element,
+	arg3: int,
+) {
+	mask := (u64(arg3) * 0xffffffffffffffff)
+	x1 := ((mask & arg2[0]) | ((~mask) & arg1[0]))
+	x2 := ((mask & arg2[1]) | ((~mask) & arg1[1]))
+	x3 := ((mask & arg2[2]) | ((~mask) & arg1[2]))
+	x4 := ((mask & arg2[3]) | ((~mask) & arg1[3]))
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+
+fe_cond_negate :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element, ctrl: int) {
+	tmp1: Montgomery_Domain_Field_Element = ---
+	fe_opp(&tmp1, arg1)
+	fe_cond_select(out1, arg1, &tmp1, ctrl)
+
+	fe_clear(&tmp1)
+}
--- a/core/crypto/_fiat/field_p256r1/field64.odin
+++ b/core/crypto/_fiat/field_p256r1/field64.odin
@@ -0,0 +1,501 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     1. Redistributions of source code must retain the above copyright
+//        notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package field_p256r1
+
+// The file provides arithmetic on the field Z/(2^256 - 2^224 + 2^192 + 2^96 - 1)
+// using a 64-bit Montgomery form internal representation.  It is derived
+// primarily from the machine generated Golang output from the fiat-crypto
+// project.
+//
+// While the base implementation is provably correct, this implementation
+// makes no such claims as the port and optimizations were done by hand.
+//
+// WARNING: While big-endian is the common representation used for this
+// curve, the fiat output uses least-significant-limb first.
+
+import fiat "core:crypto/_fiat"
+import "core:math/bits"
+
+// ELL is the saturated representation of the field order, least-significant
+// limb first.
+//
+// Read as a 256-bit integer the limbs are
+// 2^256 - 2^224 + 2^192 + 2^96 - 1.
+ELL :: [4]u64{0xffffffffffffffff, 0xffffffff, 0x0, 0xffffffff00000001}
+
+// Both element types are four 64-bit limbs, least-significant limb first.
+// They are `distinct` so that a value in Montgomery form cannot be passed
+// where a plain (non-Montgomery) value is expected, and vice versa,
+// without an explicit conversion.
+Montgomery_Domain_Field_Element :: distinct [4]u64
+Non_Montgomery_Domain_Field_Element :: distinct [4]u64
+
+// fe_mul multiplies two field elements held in Montgomery form and writes
+// the product, also in Montgomery form, to `out1`.
+//
+// The body is four interleaved rounds, one per limb of `arg1`: the limb is
+// multiplied by all of `arg2` and accumulated, then a multiple of the
+// modulus is added so that the lowest limb cancels and can be dropped.
+// A final conditional subtraction of the modulus follows.
+//
+// All inputs are read into locals before `out1` is written, so `out1` may
+// alias `arg1` and/or `arg2`.
+//
+// NOTE(review): inputs are presumably expected to be fully reduced
+// (less than the modulus), as in the fiat-crypto original - confirm.
+fe_mul :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	// Round 0: arg1[0] * arg2 (mul_u64 yields hi, lo).
+	x6, x5 := bits.mul_u64(x4, arg2[3])
+	x8, x7 := bits.mul_u64(x4, arg2[2])
+	x10, x9 := bits.mul_u64(x4, arg2[1])
+	x12, x11 := bits.mul_u64(x4, arg2[0])
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	x19 := (u64(fiat.u1(x18)) + x6)
+	// Reduction: add x11 * modulus (the multipliers are the non-zero limbs
+	// of ELL).  The lowest sum is discarded; only its carry is kept.
+	x21, x20 := bits.mul_u64(x11, 0xffffffff00000001)
+	x23, x22 := bits.mul_u64(x11, 0xffffffff)
+	x25, x24 := bits.mul_u64(x11, 0xffffffffffffffff)
+	x26, x27 := bits.add_u64(x25, x22, u64(0x0))
+	x28 := (u64(fiat.u1(x27)) + x23)
+	_, x30 := bits.add_u64(x11, x24, u64(0x0))
+	x31, x32 := bits.add_u64(x13, x26, u64(fiat.u1(x30)))
+	x33, x34 := bits.add_u64(x15, x28, u64(fiat.u1(x32)))
+	x35, x36 := bits.add_u64(x17, x20, u64(fiat.u1(x34)))
+	x37, x38 := bits.add_u64(x19, x21, u64(fiat.u1(x36)))
+	// Round 1: arg1[1] * arg2, accumulated onto the running value.
+	x40, x39 := bits.mul_u64(x1, arg2[3])
+	x42, x41 := bits.mul_u64(x1, arg2[2])
+	x44, x43 := bits.mul_u64(x1, arg2[1])
+	x46, x45 := bits.mul_u64(x1, arg2[0])
+	x47, x48 := bits.add_u64(x46, x43, u64(0x0))
+	x49, x50 := bits.add_u64(x44, x41, u64(fiat.u1(x48)))
+	x51, x52 := bits.add_u64(x42, x39, u64(fiat.u1(x50)))
+	x53 := (u64(fiat.u1(x52)) + x40)
+	x54, x55 := bits.add_u64(x31, x45, u64(0x0))
+	x56, x57 := bits.add_u64(x33, x47, u64(fiat.u1(x55)))
+	x58, x59 := bits.add_u64(x35, x49, u64(fiat.u1(x57)))
+	x60, x61 := bits.add_u64(x37, x51, u64(fiat.u1(x59)))
+	x62, x63 := bits.add_u64(u64(fiat.u1(x38)), x53, u64(fiat.u1(x61)))
+	// Reduction by x54 * modulus.
+	x65, x64 := bits.mul_u64(x54, 0xffffffff00000001)
+	x67, x66 := bits.mul_u64(x54, 0xffffffff)
+	x69, x68 := bits.mul_u64(x54, 0xffffffffffffffff)
+	x70, x71 := bits.add_u64(x69, x66, u64(0x0))
+	x72 := (u64(fiat.u1(x71)) + x67)
+	_, x74 := bits.add_u64(x54, x68, u64(0x0))
+	x75, x76 := bits.add_u64(x56, x70, u64(fiat.u1(x74)))
+	x77, x78 := bits.add_u64(x58, x72, u64(fiat.u1(x76)))
+	x79, x80 := bits.add_u64(x60, x64, u64(fiat.u1(x78)))
+	x81, x82 := bits.add_u64(x62, x65, u64(fiat.u1(x80)))
+	x83 := (u64(fiat.u1(x82)) + u64(fiat.u1(x63)))
+	// Round 2: arg1[2] * arg2.
+	x85, x84 := bits.mul_u64(x2, arg2[3])
+	x87, x86 := bits.mul_u64(x2, arg2[2])
+	x89, x88 := bits.mul_u64(x2, arg2[1])
+	x91, x90 := bits.mul_u64(x2, arg2[0])
+	x92, x93 := bits.add_u64(x91, x88, u64(0x0))
+	x94, x95 := bits.add_u64(x89, x86, u64(fiat.u1(x93)))
+	x96, x97 := bits.add_u64(x87, x84, u64(fiat.u1(x95)))
+	x98 := (u64(fiat.u1(x97)) + x85)
+	x99, x100 := bits.add_u64(x75, x90, u64(0x0))
+	x101, x102 := bits.add_u64(x77, x92, u64(fiat.u1(x100)))
+	x103, x104 := bits.add_u64(x79, x94, u64(fiat.u1(x102)))
+	x105, x106 := bits.add_u64(x81, x96, u64(fiat.u1(x104)))
+	x107, x108 := bits.add_u64(x83, x98, u64(fiat.u1(x106)))
+	// Reduction by x99 * modulus.
+	x110, x109 := bits.mul_u64(x99, 0xffffffff00000001)
+	x112, x111 := bits.mul_u64(x99, 0xffffffff)
+	x114, x113 := bits.mul_u64(x99, 0xffffffffffffffff)
+	x115, x116 := bits.add_u64(x114, x111, u64(0x0))
+	x117 := (u64(fiat.u1(x116)) + x112)
+	_, x119 := bits.add_u64(x99, x113, u64(0x0))
+	x120, x121 := bits.add_u64(x101, x115, u64(fiat.u1(x119)))
+	x122, x123 := bits.add_u64(x103, x117, u64(fiat.u1(x121)))
+	x124, x125 := bits.add_u64(x105, x109, u64(fiat.u1(x123)))
+	x126, x127 := bits.add_u64(x107, x110, u64(fiat.u1(x125)))
+	x128 := (u64(fiat.u1(x127)) + u64(fiat.u1(x108)))
+	// Round 3: arg1[3] * arg2.
+	x130, x129 := bits.mul_u64(x3, arg2[3])
+	x132, x131 := bits.mul_u64(x3, arg2[2])
+	x134, x133 := bits.mul_u64(x3, arg2[1])
+	x136, x135 := bits.mul_u64(x3, arg2[0])
+	x137, x138 := bits.add_u64(x136, x133, u64(0x0))
+	x139, x140 := bits.add_u64(x134, x131, u64(fiat.u1(x138)))
+	x141, x142 := bits.add_u64(x132, x129, u64(fiat.u1(x140)))
+	x143 := (u64(fiat.u1(x142)) + x130)
+	x144, x145 := bits.add_u64(x120, x135, u64(0x0))
+	x146, x147 := bits.add_u64(x122, x137, u64(fiat.u1(x145)))
+	x148, x149 := bits.add_u64(x124, x139, u64(fiat.u1(x147)))
+	x150, x151 := bits.add_u64(x126, x141, u64(fiat.u1(x149)))
+	x152, x153 := bits.add_u64(x128, x143, u64(fiat.u1(x151)))
+	// Reduction by x144 * modulus.
+	x155, x154 := bits.mul_u64(x144, 0xffffffff00000001)
+	x157, x156 := bits.mul_u64(x144, 0xffffffff)
+	x159, x158 := bits.mul_u64(x144, 0xffffffffffffffff)
+	x160, x161 := bits.add_u64(x159, x156, u64(0x0))
+	x162 := (u64(fiat.u1(x161)) + x157)
+	_, x164 := bits.add_u64(x144, x158, u64(0x0))
+	x165, x166 := bits.add_u64(x146, x160, u64(fiat.u1(x164)))
+	x167, x168 := bits.add_u64(x148, x162, u64(fiat.u1(x166)))
+	x169, x170 := bits.add_u64(x150, x154, u64(fiat.u1(x168)))
+	x171, x172 := bits.add_u64(x152, x155, u64(fiat.u1(x170)))
+	x173 := (u64(fiat.u1(x172)) + u64(fiat.u1(x153)))
+	// Final step: subtract the modulus (limbs of ELL) and keep either the
+	// difference or the unsubtracted value depending on the last borrow.
+	// NOTE(review): cmovznz_u64 presumably returns its second argument when
+	// the flag is 0 and its third otherwise - confirm in core:crypto/_fiat.
+	x174, x175 := bits.sub_u64(x165, 0xffffffffffffffff, u64(0x0))
+	x176, x177 := bits.sub_u64(x167, 0xffffffff, u64(fiat.u1(x175)))
+	x178, x179 := bits.sub_u64(x169, u64(0x0), u64(fiat.u1(x177)))
+	x180, x181 := bits.sub_u64(x171, 0xffffffff00000001, u64(fiat.u1(x179)))
+	_, x183 := bits.sub_u64(x173, u64(0x0), u64(fiat.u1(x181)))
+	x184 := fiat.cmovznz_u64(fiat.u1(x183), x174, x165)
+	x185 := fiat.cmovznz_u64(fiat.u1(x183), x176, x167)
+	x186 := fiat.cmovznz_u64(fiat.u1(x183), x178, x169)
+	x187 := fiat.cmovznz_u64(fiat.u1(x183), x180, x171)
+	out1[0] = x184
+	out1[1] = x185
+	out1[2] = x186
+	out1[3] = x187
+}
+
+// fe_square squares a field element held in Montgomery form and writes the
+// result, also in Montgomery form, to `out1`.
+//
+// The body has the same shape as `fe_mul` with both operands equal to
+// `arg1`: four rounds of "multiply one limb by the whole operand,
+// accumulate, add a multiple of the modulus so the lowest limb cancels",
+// followed by one conditional subtraction of the modulus.
+//
+// `arg1` is only read before `out1` is written, so `out1` may alias it.
+//
+// NOTE(review): the input is presumably expected to be fully reduced
+// (less than the modulus), as in the fiat-crypto original - confirm.
+fe_square :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	// Round 0: arg1[0] * arg1 (mul_u64 yields hi, lo).
+	x6, x5 := bits.mul_u64(x4, arg1[3])
+	x8, x7 := bits.mul_u64(x4, arg1[2])
+	x10, x9 := bits.mul_u64(x4, arg1[1])
+	x12, x11 := bits.mul_u64(x4, arg1[0])
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	x19 := (u64(fiat.u1(x18)) + x6)
+	// Reduction: add x11 * modulus (the multipliers are the non-zero limbs
+	// of ELL).  The lowest sum is discarded; only its carry is kept.
+	x21, x20 := bits.mul_u64(x11, 0xffffffff00000001)
+	x23, x22 := bits.mul_u64(x11, 0xffffffff)
+	x25, x24 := bits.mul_u64(x11, 0xffffffffffffffff)
+	x26, x27 := bits.add_u64(x25, x22, u64(0x0))
+	x28 := (u64(fiat.u1(x27)) + x23)
+	_, x30 := bits.add_u64(x11, x24, u64(0x0))
+	x31, x32 := bits.add_u64(x13, x26, u64(fiat.u1(x30)))
+	x33, x34 := bits.add_u64(x15, x28, u64(fiat.u1(x32)))
+	x35, x36 := bits.add_u64(x17, x20, u64(fiat.u1(x34)))
+	x37, x38 := bits.add_u64(x19, x21, u64(fiat.u1(x36)))
+	// Round 1: arg1[1] * arg1, accumulated onto the running value.
+	x40, x39 := bits.mul_u64(x1, arg1[3])
+	x42, x41 := bits.mul_u64(x1, arg1[2])
+	x44, x43 := bits.mul_u64(x1, arg1[1])
+	x46, x45 := bits.mul_u64(x1, arg1[0])
+	x47, x48 := bits.add_u64(x46, x43, u64(0x0))
+	x49, x50 := bits.add_u64(x44, x41, u64(fiat.u1(x48)))
+	x51, x52 := bits.add_u64(x42, x39, u64(fiat.u1(x50)))
+	x53 := (u64(fiat.u1(x52)) + x40)
+	x54, x55 := bits.add_u64(x31, x45, u64(0x0))
+	x56, x57 := bits.add_u64(x33, x47, u64(fiat.u1(x55)))
+	x58, x59 := bits.add_u64(x35, x49, u64(fiat.u1(x57)))
+	x60, x61 := bits.add_u64(x37, x51, u64(fiat.u1(x59)))
+	x62, x63 := bits.add_u64(u64(fiat.u1(x38)), x53, u64(fiat.u1(x61)))
+	// Reduction by x54 * modulus.
+	x65, x64 := bits.mul_u64(x54, 0xffffffff00000001)
+	x67, x66 := bits.mul_u64(x54, 0xffffffff)
+	x69, x68 := bits.mul_u64(x54, 0xffffffffffffffff)
+	x70, x71 := bits.add_u64(x69, x66, u64(0x0))
+	x72 := (u64(fiat.u1(x71)) + x67)
+	_, x74 := bits.add_u64(x54, x68, u64(0x0))
+	x75, x76 := bits.add_u64(x56, x70, u64(fiat.u1(x74)))
+	x77, x78 := bits.add_u64(x58, x72, u64(fiat.u1(x76)))
+	x79, x80 := bits.add_u64(x60, x64, u64(fiat.u1(x78)))
+	x81, x82 := bits.add_u64(x62, x65, u64(fiat.u1(x80)))
+	x83 := (u64(fiat.u1(x82)) + u64(fiat.u1(x63)))
+	// Round 2: arg1[2] * arg1.
+	x85, x84 := bits.mul_u64(x2, arg1[3])
+	x87, x86 := bits.mul_u64(x2, arg1[2])
+	x89, x88 := bits.mul_u64(x2, arg1[1])
+	x91, x90 := bits.mul_u64(x2, arg1[0])
+	x92, x93 := bits.add_u64(x91, x88, u64(0x0))
+	x94, x95 := bits.add_u64(x89, x86, u64(fiat.u1(x93)))
+	x96, x97 := bits.add_u64(x87, x84, u64(fiat.u1(x95)))
+	x98 := (u64(fiat.u1(x97)) + x85)
+	x99, x100 := bits.add_u64(x75, x90, u64(0x0))
+	x101, x102 := bits.add_u64(x77, x92, u64(fiat.u1(x100)))
+	x103, x104 := bits.add_u64(x79, x94, u64(fiat.u1(x102)))
+	x105, x106 := bits.add_u64(x81, x96, u64(fiat.u1(x104)))
+	x107, x108 := bits.add_u64(x83, x98, u64(fiat.u1(x106)))
+	// Reduction by x99 * modulus.
+	x110, x109 := bits.mul_u64(x99, 0xffffffff00000001)
+	x112, x111 := bits.mul_u64(x99, 0xffffffff)
+	x114, x113 := bits.mul_u64(x99, 0xffffffffffffffff)
+	x115, x116 := bits.add_u64(x114, x111, u64(0x0))
+	x117 := (u64(fiat.u1(x116)) + x112)
+	_, x119 := bits.add_u64(x99, x113, u64(0x0))
+	x120, x121 := bits.add_u64(x101, x115, u64(fiat.u1(x119)))
+	x122, x123 := bits.add_u64(x103, x117, u64(fiat.u1(x121)))
+	x124, x125 := bits.add_u64(x105, x109, u64(fiat.u1(x123)))
+	x126, x127 := bits.add_u64(x107, x110, u64(fiat.u1(x125)))
+	x128 := (u64(fiat.u1(x127)) + u64(fiat.u1(x108)))
+	// Round 3: arg1[3] * arg1.
+	x130, x129 := bits.mul_u64(x3, arg1[3])
+	x132, x131 := bits.mul_u64(x3, arg1[2])
+	x134, x133 := bits.mul_u64(x3, arg1[1])
+	x136, x135 := bits.mul_u64(x3, arg1[0])
+	x137, x138 := bits.add_u64(x136, x133, u64(0x0))
+	x139, x140 := bits.add_u64(x134, x131, u64(fiat.u1(x138)))
+	x141, x142 := bits.add_u64(x132, x129, u64(fiat.u1(x140)))
+	x143 := (u64(fiat.u1(x142)) + x130)
+	x144, x145 := bits.add_u64(x120, x135, u64(0x0))
+	x146, x147 := bits.add_u64(x122, x137, u64(fiat.u1(x145)))
+	x148, x149 := bits.add_u64(x124, x139, u64(fiat.u1(x147)))
+	x150, x151 := bits.add_u64(x126, x141, u64(fiat.u1(x149)))
+	x152, x153 := bits.add_u64(x128, x143, u64(fiat.u1(x151)))
+	// Reduction by x144 * modulus.
+	x155, x154 := bits.mul_u64(x144, 0xffffffff00000001)
+	x157, x156 := bits.mul_u64(x144, 0xffffffff)
+	x159, x158 := bits.mul_u64(x144, 0xffffffffffffffff)
+	x160, x161 := bits.add_u64(x159, x156, u64(0x0))
+	x162 := (u64(fiat.u1(x161)) + x157)
+	_, x164 := bits.add_u64(x144, x158, u64(0x0))
+	x165, x166 := bits.add_u64(x146, x160, u64(fiat.u1(x164)))
+	x167, x168 := bits.add_u64(x148, x162, u64(fiat.u1(x166)))
+	x169, x170 := bits.add_u64(x150, x154, u64(fiat.u1(x168)))
+	x171, x172 := bits.add_u64(x152, x155, u64(fiat.u1(x170)))
+	x173 := (u64(fiat.u1(x172)) + u64(fiat.u1(x153)))
+	// Final step: subtract the modulus (limbs of ELL) and keep either the
+	// difference or the unsubtracted value depending on the last borrow.
+	// NOTE(review): cmovznz_u64 presumably returns its second argument when
+	// the flag is 0 and its third otherwise - confirm in core:crypto/_fiat.
+	x174, x175 := bits.sub_u64(x165, 0xffffffffffffffff, u64(0x0))
+	x176, x177 := bits.sub_u64(x167, 0xffffffff, u64(fiat.u1(x175)))
+	x178, x179 := bits.sub_u64(x169, u64(0x0), u64(fiat.u1(x177)))
+	x180, x181 := bits.sub_u64(x171, 0xffffffff00000001, u64(fiat.u1(x179)))
+	_, x183 := bits.sub_u64(x173, u64(0x0), u64(fiat.u1(x181)))
+	x184 := fiat.cmovznz_u64(fiat.u1(x183), x174, x165)
+	x185 := fiat.cmovznz_u64(fiat.u1(x183), x176, x167)
+	x186 := fiat.cmovznz_u64(fiat.u1(x183), x178, x169)
+	x187 := fiat.cmovznz_u64(fiat.u1(x183), x180, x171)
+	out1[0] = x184
+	out1[1] = x185
+	out1[2] = x186
+	out1[3] = x187
+}
+
+// fe_add writes `arg1 + arg2` to `out1`, followed by one conditional
+// subtraction of the modulus.
+//
+// The limbs are first added with carry propagation; the modulus (limbs of
+// ELL) is then subtracted from that sum, and the final borrow decides
+// whether the subtracted or the unsubtracted value is stored.
+//
+// `out1` is written only after all inputs have been consumed, so it may
+// alias `arg1` and/or `arg2`.
+//
+// NOTE(review): a single subtraction only yields a fully reduced result
+// if both inputs are already below the modulus - confirm that callers
+// guarantee this.
+fe_add :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	// 4-limb addition; x8 is the carry out of the top limb.
+	x1, x2 := bits.add_u64(arg1[0], arg2[0], u64(0x0))
+	x3, x4 := bits.add_u64(arg1[1], arg2[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.add_u64(arg1[2], arg2[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.add_u64(arg1[3], arg2[3], u64(fiat.u1(x6)))
+	// Trial subtraction of the modulus; x18 is the borrow out of the
+	// 5-limb value (carry x8 included).
+	x9, x10 := bits.sub_u64(x1, 0xffffffffffffffff, u64(0x0))
+	x11, x12 := bits.sub_u64(x3, 0xffffffff, u64(fiat.u1(x10)))
+	x13, x14 := bits.sub_u64(x5, u64(0x0), u64(fiat.u1(x12)))
+	x15, x16 := bits.sub_u64(x7, 0xffffffff00000001, u64(fiat.u1(x14)))
+	_, x18 := bits.sub_u64(u64(fiat.u1(x8)), u64(0x0), u64(fiat.u1(x16)))
+	// NOTE(review): cmovznz_u64 presumably returns its second argument when
+	// the flag is 0 and its third otherwise - confirm in core:crypto/_fiat.
+	x19 := fiat.cmovznz_u64(fiat.u1(x18), x9, x1)
+	x20 := fiat.cmovznz_u64(fiat.u1(x18), x11, x3)
+	x21 := fiat.cmovznz_u64(fiat.u1(x18), x13, x5)
+	x22 := fiat.cmovznz_u64(fiat.u1(x18), x15, x7)
+	out1[0] = x19
+	out1[1] = x20
+	out1[2] = x21
+	out1[3] = x22
+}
+
+// fe_sub writes `arg1 - arg2` to `out1`, adding the modulus back when the
+// limb-wise subtraction borrows.
+//
+// `out1` is written only after all inputs have been consumed, so it may
+// alias `arg1` and/or `arg2`.
+//
+// NOTE(review): a single corrective addition only yields a fully reduced
+// result if both inputs are already below the modulus - confirm that
+// callers guarantee this.
+fe_sub :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) {
+	// 4-limb subtraction; x8 is the borrow out of the top limb.
+	x1, x2 := bits.sub_u64(arg1[0], arg2[0], u64(0x0))
+	x3, x4 := bits.sub_u64(arg1[1], arg2[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.sub_u64(arg1[2], arg2[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.sub_u64(arg1[3], arg2[3], u64(fiat.u1(x6)))
+	// x9 is used as a mask over the modulus limbs below.
+	// NOTE(review): presumably 0 when there was no borrow and all ones when
+	// there was - depends on cmovznz_u64 argument order, confirm.
+	x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff)
+	// Add (modulus & mask); the masked constants are the limbs of ELL.
+	x10, x11 := bits.add_u64(x1, x9, u64(0x0))
+	x12, x13 := bits.add_u64(x3, (x9 & 0xffffffff), u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x5, u64(0x0), u64(fiat.u1(x13)))
+	x16, _ := bits.add_u64(x7, (x9 & 0xffffffff00000001), u64(fiat.u1(x15)))
+	out1[0] = x10
+	out1[1] = x12
+	out1[2] = x14
+	out1[3] = x16
+}
+
+// fe_opp writes the additive inverse of `arg1` to `out1`: it computes
+// `0 - arg1` limb by limb and adds the modulus back when that subtraction
+// borrows.
+//
+// `out1` is written only after `arg1` has been consumed, so the two may
+// alias.
+//
+// NOTE(review): a zero input produces no borrow, so the result stays
+// zero; inputs are presumably expected to be below the modulus - confirm.
+fe_opp :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	// 0 - arg1; x8 is the borrow out of the top limb.
+	x1, x2 := bits.sub_u64(u64(0x0), arg1[0], u64(0x0))
+	x3, x4 := bits.sub_u64(u64(0x0), arg1[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.sub_u64(u64(0x0), arg1[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.sub_u64(u64(0x0), arg1[3], u64(fiat.u1(x6)))
+	// x9 is used as a mask over the modulus limbs below.
+	// NOTE(review): presumably 0 when there was no borrow and all ones when
+	// there was - depends on cmovznz_u64 argument order, confirm.
+	x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff)
+	// Add (modulus & mask); the masked constants are the limbs of ELL.
+	x10, x11 := bits.add_u64(x1, x9, u64(0x0))
+	x12, x13 := bits.add_u64(x3, (x9 & 0xffffffff), u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x5, u64(0x0), u64(fiat.u1(x13)))
+	x16, _ := bits.add_u64(x7, (x9 & 0xffffffff00000001), u64(fiat.u1(x15)))
+	out1[0] = x10
+	out1[1] = x12
+	out1[2] = x14
+	out1[3] = x16
+}
+
+// fe_one sets `out1` to the constant whose limbs (least-significant first)
+// encode 2^224 - 2^192 - 2^96 + 1, i.e. 2^256 minus the field order.
+//
+// NOTE(review): this is presumably the Montgomery form of 1 (R = 2^256);
+// confirm against the fiat-crypto `SetOne` output.
+fe_one :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
+	out1^ = Montgomery_Domain_Field_Element{
+		0x1,
+		0xffffffff00000000,
+		0xffffffffffffffff,
+		0xfffffffe,
+	}
+}
+
+// fe_non_zero returns the bitwise OR of all four limbs of `arg1`: the
+// result is 0 exactly when every limb is 0, and some non-zero value
+// otherwise (not necessarily 1).
+fe_non_zero :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> u64 {
+	acc := arg1[3]
+	acc |= arg1[2]
+	acc |= arg1[1]
+	acc |= arg1[0]
+	return acc
+}
+
+// fe_cond_assign conditionally overwrites `out1` with `arg1`, controlled
+// by `arg2`, using `fiat.cmovznz_u64` on each limb instead of a branch.
+//
+// NOTE(review): presumably `out1` is kept when `arg2` is 0 and replaced by
+// `arg1` when `arg2` is 1 - this depends on the argument order of
+// cmovznz_u64 and on how `fiat.u1` treats other values; confirm in
+// core:crypto/_fiat.
+// NOTE(review): optimization is disabled and inlining forbidden,
+// presumably so the compiler cannot introduce a data-dependent branch -
+// confirm.
+@(optimization_mode = "none")
+fe_cond_assign :: #force_no_inline proc "contextless" (
+	out1, arg1: ^Montgomery_Domain_Field_Element,
+	arg2: int,
+) {
+	// All four selections are made before `out1` is modified.
+	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0])
+	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
+	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
+	x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3])
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+
+// fe_from_montgomery converts `arg1` from Montgomery form to the plain
+// (non-Montgomery) representation and stores it in `out1`.
+//
+// The body performs four reduction rounds, one per input limb: each round
+// adds a multiple of the modulus (multipliers are the non-zero limbs of
+// ELL) so that the lowest limb cancels and is dropped, then folds in the
+// next limb of `arg1`.  One conditional subtraction of the modulus
+// finishes the conversion.
+//
+// NOTE(review): the input is presumably expected to be below the modulus,
+// as in the fiat-crypto original - confirm.
+fe_from_montgomery :: proc "contextless" (
+	out1: ^Non_Montgomery_Domain_Field_Element,
+	arg1: ^Montgomery_Domain_Field_Element,
+) {
+	// Round 0: reduce arg1[0], then add arg1[1].
+	x1 := arg1[0]
+	x3, x2 := bits.mul_u64(x1, 0xffffffff00000001)
+	x5, x4 := bits.mul_u64(x1, 0xffffffff)
+	x7, x6 := bits.mul_u64(x1, 0xffffffffffffffff)
+	x8, x9 := bits.add_u64(x7, x4, u64(0x0))
+	_, x11 := bits.add_u64(x1, x6, u64(0x0))
+	x12, x13 := bits.add_u64(u64(0x0), x8, u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x12, arg1[1], u64(0x0))
+	// Round 1: reduce x14, then add arg1[2].
+	x17, x16 := bits.mul_u64(x14, 0xffffffff00000001)
+	x19, x18 := bits.mul_u64(x14, 0xffffffff)
+	x21, x20 := bits.mul_u64(x14, 0xffffffffffffffff)
+	x22, x23 := bits.add_u64(x21, x18, u64(0x0))
+	_, x25 := bits.add_u64(x14, x20, u64(0x0))
+	x26, x27 := bits.add_u64((u64(fiat.u1(x15)) + (u64(fiat.u1(x13)) + (u64(fiat.u1(x9)) + x5))), x22, u64(fiat.u1(x25)))
+	x28, x29 := bits.add_u64(x2, (u64(fiat.u1(x23)) + x19), u64(fiat.u1(x27)))
+	x30, x31 := bits.add_u64(x3, x16, u64(fiat.u1(x29)))
+	x32, x33 := bits.add_u64(x26, arg1[2], u64(0x0))
+	x34, x35 := bits.add_u64(x28, u64(0x0), u64(fiat.u1(x33)))
+	x36, x37 := bits.add_u64(x30, u64(0x0), u64(fiat.u1(x35)))
+	// Round 2: reduce x32, then add arg1[3].
+	x39, x38 := bits.mul_u64(x32, 0xffffffff00000001)
+	x41, x40 := bits.mul_u64(x32, 0xffffffff)
+	x43, x42 := bits.mul_u64(x32, 0xffffffffffffffff)
+	x44, x45 := bits.add_u64(x43, x40, u64(0x0))
+	_, x47 := bits.add_u64(x32, x42, u64(0x0))
+	x48, x49 := bits.add_u64(x34, x44, u64(fiat.u1(x47)))
+	x50, x51 := bits.add_u64(x36, (u64(fiat.u1(x45)) + x41), u64(fiat.u1(x49)))
+	x52, x53 := bits.add_u64((u64(fiat.u1(x37)) + (u64(fiat.u1(x31)) + x17)), x38, u64(fiat.u1(x51)))
+	x54, x55 := bits.add_u64(x48, arg1[3], u64(0x0))
+	x56, x57 := bits.add_u64(x50, u64(0x0), u64(fiat.u1(x55)))
+	x58, x59 := bits.add_u64(x52, u64(0x0), u64(fiat.u1(x57)))
+	// Round 3: reduce x54.
+	x61, x60 := bits.mul_u64(x54, 0xffffffff00000001)
+	x63, x62 := bits.mul_u64(x54, 0xffffffff)
+	x65, x64 := bits.mul_u64(x54, 0xffffffffffffffff)
+	x66, x67 := bits.add_u64(x65, x62, u64(0x0))
+	_, x69 := bits.add_u64(x54, x64, u64(0x0))
+	x70, x71 := bits.add_u64(x56, x66, u64(fiat.u1(x69)))
+	x72, x73 := bits.add_u64(x58, (u64(fiat.u1(x67)) + x63), u64(fiat.u1(x71)))
+	x74, x75 := bits.add_u64((u64(fiat.u1(x59)) + (u64(fiat.u1(x53)) + x39)), x60, u64(fiat.u1(x73)))
+	x76 := (u64(fiat.u1(x75)) + x61)
+	// Final step: subtract the modulus (limbs of ELL) and keep either the
+	// difference or the unsubtracted value depending on the last borrow.
+	// NOTE(review): cmovznz_u64 presumably returns its second argument when
+	// the flag is 0 and its third otherwise - confirm in core:crypto/_fiat.
+	x77, x78 := bits.sub_u64(x70, 0xffffffffffffffff, u64(0x0))
+	x79, x80 := bits.sub_u64(x72, 0xffffffff, u64(fiat.u1(x78)))
+	x81, x82 := bits.sub_u64(x74, u64(0x0), u64(fiat.u1(x80)))
+	x83, x84 := bits.sub_u64(x76, 0xffffffff00000001, u64(fiat.u1(x82)))
+	_, x86 := bits.sub_u64(u64(0x0), u64(0x0), u64(fiat.u1(x84)))
+	x87 := fiat.cmovznz_u64(fiat.u1(x86), x77, x70)
+	x88 := fiat.cmovznz_u64(fiat.u1(x86), x79, x72)
+	x89 := fiat.cmovznz_u64(fiat.u1(x86), x81, x74)
+	x90 := fiat.cmovznz_u64(fiat.u1(x86), x83, x76)
+	out1[0] = x87
+	out1[1] = x88
+	out1[2] = x89
+	out1[3] = x90
+}
+
+// fe_to_montgomery converts `arg1` from the plain (non-Montgomery)
+// representation to Montgomery form and stores it in `out1`.
+//
+// The body has the shape of a Montgomery multiplication of `arg1` by a
+// fixed 4-limb constant: each round multiplies one limb of `arg1` by the
+// constant limbs 0x3, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x4fffffffd
+// (least-significant first), accumulates, and then adds a multiple of the
+// modulus so the lowest limb cancels.  One conditional subtraction of the
+// modulus finishes the conversion.
+//
+// NOTE(review): the fixed constant is presumably R^2 mod p with
+// R = 2^256 - confirm against the fiat-crypto original.
+fe_to_montgomery :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: ^Non_Montgomery_Domain_Field_Element,
+) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0]
+	// Round 0: arg1[0] * constant (mul_u64 yields hi, lo).
+	x6, x5 := bits.mul_u64(x4, 0x4fffffffd)
+	x8, x7 := bits.mul_u64(x4, 0xfffffffffffffffe)
+	x10, x9 := bits.mul_u64(x4, 0xfffffffbffffffff)
+	x12, x11 := bits.mul_u64(x4, 0x3)
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	// Reduction by x11 * modulus; the lowest sum is discarded.
+	x20, x19 := bits.mul_u64(x11, 0xffffffff00000001)
+	x22, x21 := bits.mul_u64(x11, 0xffffffff)
+	x24, x23 := bits.mul_u64(x11, 0xffffffffffffffff)
+	x25, x26 := bits.add_u64(x24, x21, u64(0x0))
+	_, x28 := bits.add_u64(x11, x23, u64(0x0))
+	x29, x30 := bits.add_u64(x13, x25, u64(fiat.u1(x28)))
+	x31, x32 := bits.add_u64(x15, (u64(fiat.u1(x26)) + x22), u64(fiat.u1(x30)))
+	x33, x34 := bits.add_u64(x17, x19, u64(fiat.u1(x32)))
+	x35, x36 := bits.add_u64((u64(fiat.u1(x18)) + x6), x20, u64(fiat.u1(x34)))
+	// Round 1: arg1[1] * constant.
+	x38, x37 := bits.mul_u64(x1, 0x4fffffffd)
+	x40, x39 := bits.mul_u64(x1, 0xfffffffffffffffe)
+	x42, x41 := bits.mul_u64(x1, 0xfffffffbffffffff)
+	x44, x43 := bits.mul_u64(x1, 0x3)
+	x45, x46 := bits.add_u64(x44, x41, u64(0x0))
+	x47, x48 := bits.add_u64(x42, x39, u64(fiat.u1(x46)))
+	x49, x50 := bits.add_u64(x40, x37, u64(fiat.u1(x48)))
+	x51, x52 := bits.add_u64(x29, x43, u64(0x0))
+	x53, x54 := bits.add_u64(x31, x45, u64(fiat.u1(x52)))
+	x55, x56 := bits.add_u64(x33, x47, u64(fiat.u1(x54)))
+	x57, x58 := bits.add_u64(x35, x49, u64(fiat.u1(x56)))
+	// Reduction by x51 * modulus.
+	x60, x59 := bits.mul_u64(x51, 0xffffffff00000001)
+	x62, x61 := bits.mul_u64(x51, 0xffffffff)
+	x64, x63 := bits.mul_u64(x51, 0xffffffffffffffff)
+	x65, x66 := bits.add_u64(x64, x61, u64(0x0))
+	_, x68 := bits.add_u64(x51, x63, u64(0x0))
+	x69, x70 := bits.add_u64(x53, x65, u64(fiat.u1(x68)))
+	x71, x72 := bits.add_u64(x55, (u64(fiat.u1(x66)) + x62), u64(fiat.u1(x70)))
+	x73, x74 := bits.add_u64(x57, x59, u64(fiat.u1(x72)))
+	x75, x76 := bits.add_u64(((u64(fiat.u1(x58)) + u64(fiat.u1(x36))) + (u64(fiat.u1(x50)) + x38)), x60, u64(fiat.u1(x74)))
+	// Round 2: arg1[2] * constant.
+	x78, x77 := bits.mul_u64(x2, 0x4fffffffd)
+	x80, x79 := bits.mul_u64(x2, 0xfffffffffffffffe)
+	x82, x81 := bits.mul_u64(x2, 0xfffffffbffffffff)
+	x84, x83 := bits.mul_u64(x2, 0x3)
+	x85, x86 := bits.add_u64(x84, x81, u64(0x0))
+	x87, x88 := bits.add_u64(x82, x79, u64(fiat.u1(x86)))
+	x89, x90 := bits.add_u64(x80, x77, u64(fiat.u1(x88)))
+	x91, x92 := bits.add_u64(x69, x83, u64(0x0))
+	x93, x94 := bits.add_u64(x71, x85, u64(fiat.u1(x92)))
+	x95, x96 := bits.add_u64(x73, x87, u64(fiat.u1(x94)))
+	x97, x98 := bits.add_u64(x75, x89, u64(fiat.u1(x96)))
+	// Reduction by x91 * modulus.
+	x100, x99 := bits.mul_u64(x91, 0xffffffff00000001)
+	x102, x101 := bits.mul_u64(x91, 0xffffffff)
+	x104, x103 := bits.mul_u64(x91, 0xffffffffffffffff)
+	x105, x106 := bits.add_u64(x104, x101, u64(0x0))
+	_, x108 := bits.add_u64(x91, x103, u64(0x0))
+	x109, x110 := bits.add_u64(x93, x105, u64(fiat.u1(x108)))
+	x111, x112 := bits.add_u64(x95, (u64(fiat.u1(x106)) + x102), u64(fiat.u1(x110)))
+	x113, x114 := bits.add_u64(x97, x99, u64(fiat.u1(x112)))
+	x115, x116 := bits.add_u64(((u64(fiat.u1(x98)) + u64(fiat.u1(x76))) + (u64(fiat.u1(x90)) + x78)), x100, u64(fiat.u1(x114)))
+	// Round 3: arg1[3] * constant.
+	x118, x117 := bits.mul_u64(x3, 0x4fffffffd)
+	x120, x119 := bits.mul_u64(x3, 0xfffffffffffffffe)
+	x122, x121 := bits.mul_u64(x3, 0xfffffffbffffffff)
+	x124, x123 := bits.mul_u64(x3, 0x3)
+	x125, x126 := bits.add_u64(x124, x121, u64(0x0))
+	x127, x128 := bits.add_u64(x122, x119, u64(fiat.u1(x126)))
+	x129, x130 := bits.add_u64(x120, x117, u64(fiat.u1(x128)))
+	x131, x132 := bits.add_u64(x109, x123, u64(0x0))
+	x133, x134 := bits.add_u64(x111, x125, u64(fiat.u1(x132)))
+	x135, x136 := bits.add_u64(x113, x127, u64(fiat.u1(x134)))
+	x137, x138 := bits.add_u64(x115, x129, u64(fiat.u1(x136)))
+	// Reduction by x131 * modulus.
+	x140, x139 := bits.mul_u64(x131, 0xffffffff00000001)
+	x142, x141 := bits.mul_u64(x131, 0xffffffff)
+	x144, x143 := bits.mul_u64(x131, 0xffffffffffffffff)
+	x145, x146 := bits.add_u64(x144, x141, u64(0x0))
+	_, x148 := bits.add_u64(x131, x143, u64(0x0))
+	x149, x150 := bits.add_u64(x133, x145, u64(fiat.u1(x148)))
+	x151, x152 := bits.add_u64(x135, (u64(fiat.u1(x146)) + x142), u64(fiat.u1(x150)))
+	x153, x154 := bits.add_u64(x137, x139, u64(fiat.u1(x152)))
+	x155, x156 := bits.add_u64(((u64(fiat.u1(x138)) + u64(fiat.u1(x116))) + (u64(fiat.u1(x130)) + x118)), x140, u64(fiat.u1(x154)))
+	// Final step: subtract the modulus (limbs of ELL) and keep either the
+	// difference or the unsubtracted value depending on the last borrow.
+	// NOTE(review): cmovznz_u64 presumably returns its second argument when
+	// the flag is 0 and its third otherwise - confirm in core:crypto/_fiat.
+	x157, x158 := bits.sub_u64(x149, 0xffffffffffffffff, u64(0x0))
+	x159, x160 := bits.sub_u64(x151, 0xffffffff, u64(fiat.u1(x158)))
+	x161, x162 := bits.sub_u64(x153, u64(0x0), u64(fiat.u1(x160)))
+	x163, x164 := bits.sub_u64(x155, 0xffffffff00000001, u64(fiat.u1(x162)))
+	_, x166 := bits.sub_u64(u64(fiat.u1(x156)), u64(0x0), u64(fiat.u1(x164)))
+	x167 := fiat.cmovznz_u64(fiat.u1(x166), x157, x149)
+	x168 := fiat.cmovznz_u64(fiat.u1(x166), x159, x151)
+	x169 := fiat.cmovznz_u64(fiat.u1(x166), x161, x153)
+	x170 := fiat.cmovznz_u64(fiat.u1(x166), x163, x155)
+	out1[0] = x167
+	out1[1] = x168
+	out1[2] = x169
+	out1[3] = x170
+}
--- a/core/crypto/_fiat/field_scalar25519/field.odin
+++ b/core/crypto/_fiat/field_scalar25519/field.odin
@@ -1,5 +1,6 @@
 package field_scalar25519

+import subtle "core:crypto/_subtle"
 import "core:encoding/endian"
 import "core:math/bits"
 import "core:mem"
@@ -121,13 +122,11 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
 	tmp: Montgomery_Domain_Field_Element
 	fe_sub(&tmp, arg1, arg2)

-	// This will only underflow iff arg1 == arg2, and we return the borrow,
-	// which will be 1.
-	_, borrow := bits.sub_u64(fe_non_zero(&tmp), 1, 0)
+	is_eq := subtle.eq(fe_non_zero(&tmp), 0)

 	fe_clear(&tmp)

-	return int(borrow)
+	return int(is_eq)
 }

 fe_zero :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
--- a/core/crypto/_fiat/field_scalarp256r1/field.odin
+++ b/core/crypto/_fiat/field_scalarp256r1/field.odin
@@ -0,0 +1,210 @@
+package field_scalarp256r1
+
+import subtle "core:crypto/_subtle"
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+// TWO_192 and TWO_384 are the multipliers applied by `fe_from_bytes` to
+// the `b` and `c` chunks of a wide input ("a+b*2^192+c*2^384").  Limbs are
+// given in decimal, least-significant limb first.
+//
+// NOTE(review): both are presumably 2^192 and 2^384 reduced modulo the
+// group order and stored in Montgomery form (they are passed straight to
+// `fe_mul`) - confirm by regenerating the values.
+@(private, rodata)
+TWO_192 := Montgomery_Domain_Field_Element{
+	2482910415990817935,
+	2879494685571067143,
+	8732918506673730078,
+	85565669603516024,
+}
+@(private, rodata)
+TWO_384 := Montgomery_Domain_Field_Element{
+	2127524300190691059,
+	17014302137236182484,
+	16604910261202196099,
+	3621421107472562910,
+}
+// 2^384 % p (From sage)
+// 0x431905529c0166ce652e96b7ccca0a99679b73e19ad16947f01cf013fc632551
+// NOTE(review): `p` above presumably means the modulus of this scalar
+// field (the group order), not the curve's base-field prime - confirm.
+
+// fe_clear wipes the field element behind `arg1` with
+// `mem.zero_explicit`, covering the full size of the element.
+fe_clear :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) {
+	mem.zero_explicit(arg1, size_of(arg1^))
+}
+
+// fe_clear_vec calls `fe_clear` on every element pointer in `arg1`, in
+// slice order.  An empty slice is a no-op.
+fe_clear_vec :: proc "contextless" (
+	arg1: []^Montgomery_Domain_Field_Element,
+) {
+	for i := 0; i < len(arg1); i += 1 {
+		fe_clear(arg1[i])
+	}
+}
+
+// fe_from_bytes sets `out1` to the big-endian integer encoded by `arg1`,
+// reduced modulo the field order, and reports whether the input was NOT
+// a canonical encoding.
+//
+// Inputs:
+// - out1: destination field element (Montgomery form).
+// - arg1: big-endian bytes, at most 64 bytes long.  Longer inputs abort
+//   via `ensure_contextless`.
+//
+// Returns:
+// - false if `arg1` is shorter than 32 bytes, or is exactly 32 bytes and
+//   passes `fe_is_canonical`.
+// - true otherwise: a 32-byte value that fails `fe_is_canonical`, or any
+//   input longer than 32 bytes regardless of its numeric value.
+fe_from_bytes :: proc "contextless" (
+	out1: ^Montgomery_Domain_Field_Element,
+	arg1: []byte,
+) -> bool {
+	// The *input* length is what must be bounded: the wide-reduction path
+	// below copies `arg1` into a 64-byte buffer at offset `64-len(arg1)`.
+	// (`len(out1)` is the limb count of the output and is always 4.)
+	ensure_contextless(len(arg1) <= 64, "p256r1: invalid scalar input buffer")
+
+	is_canonical := false
+	s_len := len(arg1)
+	switch {
+	case s_len < 32:
+		// No way this can be greater than the order.
+		fe_unchecked_set(out1, arg1)
+		is_canonical = true
+	case s_len == 32:
+		// It is quite likely that a reduction mod p is required,
+		// as the order of the curve is sufficiently smaller than
+		// 2^256-1, so just check if we actually needed to reduce
+		// and do the reduction anyway, so that things that require
+		// canonical scalars can reject non-canonical encodings.
+		is_canonical = fe_is_canonical(arg1)
+		fallthrough
+	case:
+		// Use Frank Denis' trick, as documented by Filippo Valsorda
+		// at https://words.filippo.io/dispatches/wide-reduction/
+		//
+		// "I represent the value as a+b*2^192+c*2^384"
+		//
+		// Note: Omitting the `c` computation is fine as, reduction
+		// being length dependent provides no useful timing information.
+
+		// Zero extend to 512-bits.
+		src_512: [64]byte
+		copy(src_512[64-s_len:], arg1)
+		defer mem.zero_explicit(&src_512, size_of(src_512))
+
+		fe_unchecked_set(out1, src_512[40:]) // a
+		b: Montgomery_Domain_Field_Element
+		fe_unchecked_set(&b, src_512[16:40]) // b
+
+		fe_mul(&b, &b, &TWO_192)
+		fe_add(out1, out1, &b)
+		if s_len >= 48 {
+			c: Montgomery_Domain_Field_Element
+			fe_unchecked_set(&c, src_512[:16]) // c
+			fe_mul(&c, &c, &TWO_384)
+			fe_add(out1, out1, &c)
+
+			fe_clear(&c)
+		}
+
+		fe_clear(&b)
+	}
+
+	// Note the inversion: the result is "was not canonical".
+	return !is_canonical
+}
+
+// fe_is_canonical reports whether the 32-byte big-endian value in `arg1`
+// is strictly less than ELL.  It computes `(ELL - 1) - arg1` limb by limb
+// and returns true when the subtraction ends without a borrow.
+//
+// The reads are unchecked beyond the slice expressions, so the caller
+// must pass at least 32 bytes.
+@(private)
+fe_is_canonical :: proc "contextless" (arg1: []byte) -> bool {
+	// Big-endian input: the trailing 8 bytes are the least-significant limb.
+	limb0 := endian.unchecked_get_u64be(arg1[24:])
+	limb1 := endian.unchecked_get_u64be(arg1[16:])
+	limb2 := endian.unchecked_get_u64be(arg1[8:])
+	limb3 := endian.unchecked_get_u64be(arg1[0:])
+
+	underflow: u64
+	_, underflow = bits.sub_u64(ELL[0] - 1, limb0, 0)
+	_, underflow = bits.sub_u64(ELL[1], limb1, underflow)
+	_, underflow = bits.sub_u64(ELL[2], limb2, underflow)
+	_, underflow = bits.sub_u64(ELL[3], limb3, underflow)
+
+	return underflow == 0
+}
+
+// fe_unchecked_set loads up to 32 big-endian bytes from `arg1`,
+// left-pads them with zeros to 32 bytes, and stores the value in `out1`
+// in Montgomery form.  No range check against the field order is made.
+// Both temporaries are wiped before returning.
+@(private)
+fe_unchecked_set :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element, arg1: []byte) {
+	padded: [32]byte
+	copy(padded[32-len(arg1):], arg1)
+
+	// Limb 0 is the least-significant one, i.e. the last 8 bytes.
+	plain := Non_Montgomery_Domain_Field_Element {
+		endian.unchecked_get_u64be(padded[24:]),
+		endian.unchecked_get_u64be(padded[16:]),
+		endian.unchecked_get_u64be(padded[8:]),
+		endian.unchecked_get_u64be(padded[0:]),
+	}
+
+	fe_to_montgomery(out1, &plain)
+
+	mem.zero_explicit(&plain, size_of(plain))
+	mem.zero_explicit(&padded, size_of(padded))
+}
+
+// fe_to_bytes converts `arg1` out of Montgomery form and writes it to
+// `out1` as 32 big-endian bytes.  `out1` must be exactly 32 bytes long,
+// otherwise `ensure_contextless` aborts.  The intermediate plain value is
+// wiped before returning.
+fe_to_bytes :: proc "contextless" (out1: []byte, arg1: ^Montgomery_Domain_Field_Element) {
+	ensure_contextless(len(out1) == 32, "p256r1: invalid scalar output buffer")
+
+	plain: Non_Montgomery_Domain_Field_Element
+	defer mem.zero_explicit(&plain, size_of(plain))
+
+	fe_from_montgomery(&plain, arg1)
+
+	// Most-significant limb goes first in the big-endian output.
+	endian.unchecked_put_u64be(out1[0:], plain[3])
+	endian.unchecked_put_u64be(out1[8:], plain[2])
+	endian.unchecked_put_u64be(out1[16:], plain[1])
+	endian.unchecked_put_u64be(out1[24:], plain[0])
+}
+
+// fe_equal returns `subtle.u64_is_zero` of the OR-ed limbs of
+// `arg1 - arg2`, converted to int.  The difference is wiped before
+// returning.
+//
+// NOTE(review): presumably 1 when the elements are equal and 0 otherwise;
+// this depends on `subtle.u64_is_zero` - confirm.
+fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) -> int {
+	diff: Montgomery_Domain_Field_Element
+	defer fe_clear(&diff)
+
+	fe_sub(&diff, arg1, arg2)
+
+	return int(subtle.u64_is_zero(fe_non_zero(&diff)))
+}
+
+// fe_is_odd returns the lowest bit (0 or 1) of `arg1` after converting it
+// out of Montgomery form.  The converted value is wiped before returning.
+fe_is_odd :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> int {
+	plain: Non_Montgomery_Domain_Field_Element
+	fe_from_montgomery(&plain, arg1)
+
+	parity := int(plain[0] & 1)
+	mem.zero_explicit(&plain, size_of(plain))
+
+	return parity
+}
+
+// fe_zero overwrites `out1` with the all-zero element (every limb is 0).
+fe_zero :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
+	out1^ = Montgomery_Domain_Field_Element{0, 0, 0, 0}
+}
+
+// fe_set copies all four limbs of `arg1` into `out1`.  The source is read
+// in full before the destination is written, so `out1` may alias `arg1`.
+fe_set :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) {
+	limbs := arg1^
+	out1^ = limbs
+}
+
+// fe_cond_swap exchanges the contents of `out1` and `out2` when `arg1` is
+// 1, and leaves both unchanged when `arg1` is 0.  The decision is applied
+// through a bit mask rather than a branch on `arg1`.
+//
+// NOTE(review): any value of `arg1` other than 0 or 1 yields a partial
+// mask and a scrambled result; callers presumably only pass 0 or 1 -
+// confirm.
+// NOTE(review): optimization is disabled and inlining forbidden,
+// presumably so the compiler cannot turn the masked arithmetic back into
+// a data-dependent branch - confirm.
+@(optimization_mode = "none")
+fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Montgomery_Domain_Field_Element, arg1: int) {
+	// 0 -> 0x0000000000000000, 1 -> 0xffffffffffffffff.
+	mask := (u64(arg1) * 0xffffffffffffffff)
+	// Per limb: `x` is the XOR difference of the two limbs when swapping
+	// and 0 otherwise, so XORing it into both limbs either exchanges them
+	// or leaves them as they were.
+	x := (out1[0] ~ out2[0]) & mask
+	x1, y1 := out1[0] ~ x, out2[0] ~ x
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	x = (out1[3] ~ out2[3]) & mask
+	x4, y4 := out1[3] ~ x, out2[3] ~ x
+	// All results are computed before anything is stored.
+	out1[0], out2[0] = x1, y1
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+	out1[3], out2[3] = x4, y4
+}
+
+// fe_cond_select stores `arg1` in `out1` when `arg3` is 0 and `arg2` when
+// `arg3` is 1.  The choice is made with a bit mask rather than a branch.
+//
+// All four result limbs are computed before `out1` is written, so `out1`
+// may alias either input.
+//
+// NOTE(review): values of `arg3` other than 0 or 1 blend the two inputs
+// bitwise; callers presumably only pass 0 or 1 - confirm.
+@(optimization_mode = "none")
+fe_cond_select :: #force_no_inline proc "contextless" (
+	out1, arg1, arg2: ^Montgomery_Domain_Field_Element,
+	arg3: int,
+) {
+	// 0 -> all zero bits (keep arg1), 1 -> all one bits (keep arg2).
+	mask := (u64(arg3) * 0xffffffffffffffff)
+	x1 := ((mask & arg2[0]) | ((~mask) & arg1[0]))
+	x2 := ((mask & arg2[1]) | ((~mask) & arg1[1]))
+	x3 := ((mask & arg2[2]) | ((~mask) & arg1[2]))
+	x4 := ((mask & arg2[3]) | ((~mask) & arg1[3]))
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+
+// fe_cond_negate stores `fe_opp(arg1)` in `out1` when `ctrl` is 1 and a
+// copy of `arg1` when `ctrl` is 0.  The negation is always computed and
+// the result is picked with `fe_cond_select`; the scratch value is wiped
+// before returning.
+fe_cond_negate :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element, ctrl: int) {
+	negated: Montgomery_Domain_Field_Element = ---
+	defer fe_clear(&negated)
+
+	fe_opp(&negated, arg1)
+	fe_cond_select(out1, arg1, &negated, ctrl)
+}
--- a/core/crypto/_fiat/field_scalarp256r1/field64.odin
+++ b/core/crypto/_fiat/field_scalarp256r1/field64.odin
@@ -0,0 +1,569 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     1. Redistributions of source code must retain the above copyright
+//        notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package field_scalarp256r1
+
+// The file provides arithmetic on the field Z/(2^256 - 2^224 + 2^192 -
+// 89188191075325690597107910205041859247) using a 64-bit Montgomery form
+// internal representation.  It is derived primarily from the machine
+// generated Golang output from the fiat-crypto project.
+//
+// While the base implementation is provably correct, this implementation
+// makes no such claims as the port and optimizations were done by hand.
+//
+// WARNING: While big-endian is the common representation used for this
+// curve, the fiat output uses least-significant-limb first.
+
+import fiat "core:crypto/_fiat"
+import "core:math/bits"
+
+// ELL is the saturated representation of the field order, least-significant
+// limb first.
+ELL :: [4]u64{0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000}
+
+Montgomery_Domain_Field_Element :: distinct [4]u64 // element in Montgomery form: four 64-bit limbs, least-significant limb first
+Non_Montgomery_Domain_Field_Element :: distinct [4]u64 // same limb layout, value outside the Montgomery domain
+
+fe_mul :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) { // out1 = arg1 * arg2 in the Montgomery domain; four rounds, one arg1 limb each, every round followed by a reduction step
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0] // limb 0 is consumed by the first round
+	x6, x5 := bits.mul_u64(x4, arg2[3]) // round 1: arg1[0] * arg2; bits.mul_u64 returns (hi, lo)
+	x8, x7 := bits.mul_u64(x4, arg2[2])
+	x10, x9 := bits.mul_u64(x4, arg2[1])
+	x12, x11 := bits.mul_u64(x4, arg2[0])
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0)) // carry chain joining the partial products
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	x19 := (u64(fiat.u1(x18)) + x6)
+	_, x20 := bits.mul_u64(x11, 0xccd1c8aaee00bc4f) // reduction factor: low limb times the Montgomery constant (-ELL^-1 mod 2^64), low 64 bits kept
+	x23, x22 := bits.mul_u64(x20, 0xffffffff00000000) // x20 * ELL; the four constants are ELL's limbs, most significant first
+	x25, x24 := bits.mul_u64(x20, 0xffffffffffffffff)
+	x27, x26 := bits.mul_u64(x20, 0xbce6faada7179e84)
+	x29, x28 := bits.mul_u64(x20, 0xf3b9cac2fc632551)
+	x30, x31 := bits.add_u64(x29, x26, u64(0x0))
+	x32, x33 := bits.add_u64(x27, x24, u64(fiat.u1(x31)))
+	x34, x35 := bits.add_u64(x25, x22, u64(fiat.u1(x33)))
+	x36 := (u64(fiat.u1(x35)) + x23)
+	_, x38 := bits.add_u64(x11, x28, u64(0x0)) // the low-limb sum is discarded; only its carry is used
+	x39, x40 := bits.add_u64(x13, x30, u64(fiat.u1(x38)))
+	x41, x42 := bits.add_u64(x15, x32, u64(fiat.u1(x40)))
+	x43, x44 := bits.add_u64(x17, x34, u64(fiat.u1(x42)))
+	x45, x46 := bits.add_u64(x19, x36, u64(fiat.u1(x44)))
+	x48, x47 := bits.mul_u64(x1, arg2[3]) // round 2: arg1[1] * arg2, accumulated onto the reduced round-1 value
+	x50, x49 := bits.mul_u64(x1, arg2[2])
+	x52, x51 := bits.mul_u64(x1, arg2[1])
+	x54, x53 := bits.mul_u64(x1, arg2[0])
+	x55, x56 := bits.add_u64(x54, x51, u64(0x0))
+	x57, x58 := bits.add_u64(x52, x49, u64(fiat.u1(x56)))
+	x59, x60 := bits.add_u64(x50, x47, u64(fiat.u1(x58)))
+	x61 := (u64(fiat.u1(x60)) + x48)
+	x62, x63 := bits.add_u64(x39, x53, u64(0x0))
+	x64, x65 := bits.add_u64(x41, x55, u64(fiat.u1(x63)))
+	x66, x67 := bits.add_u64(x43, x57, u64(fiat.u1(x65)))
+	x68, x69 := bits.add_u64(x45, x59, u64(fiat.u1(x67)))
+	x70, x71 := bits.add_u64(u64(fiat.u1(x46)), x61, u64(fiat.u1(x69)))
+	_, x72 := bits.mul_u64(x62, 0xccd1c8aaee00bc4f) // reduction step for round 2
+	x75, x74 := bits.mul_u64(x72, 0xffffffff00000000)
+	x77, x76 := bits.mul_u64(x72, 0xffffffffffffffff)
+	x79, x78 := bits.mul_u64(x72, 0xbce6faada7179e84)
+	x81, x80 := bits.mul_u64(x72, 0xf3b9cac2fc632551)
+	x82, x83 := bits.add_u64(x81, x78, u64(0x0))
+	x84, x85 := bits.add_u64(x79, x76, u64(fiat.u1(x83)))
+	x86, x87 := bits.add_u64(x77, x74, u64(fiat.u1(x85)))
+	x88 := (u64(fiat.u1(x87)) + x75)
+	_, x90 := bits.add_u64(x62, x80, u64(0x0))
+	x91, x92 := bits.add_u64(x64, x82, u64(fiat.u1(x90)))
+	x93, x94 := bits.add_u64(x66, x84, u64(fiat.u1(x92)))
+	x95, x96 := bits.add_u64(x68, x86, u64(fiat.u1(x94)))
+	x97, x98 := bits.add_u64(x70, x88, u64(fiat.u1(x96)))
+	x99 := (u64(fiat.u1(x98)) + u64(fiat.u1(x71)))
+	x101, x100 := bits.mul_u64(x2, arg2[3]) // round 3: arg1[2] * arg2
+	x103, x102 := bits.mul_u64(x2, arg2[2])
+	x105, x104 := bits.mul_u64(x2, arg2[1])
+	x107, x106 := bits.mul_u64(x2, arg2[0])
+	x108, x109 := bits.add_u64(x107, x104, u64(0x0))
+	x110, x111 := bits.add_u64(x105, x102, u64(fiat.u1(x109)))
+	x112, x113 := bits.add_u64(x103, x100, u64(fiat.u1(x111)))
+	x114 := (u64(fiat.u1(x113)) + x101)
+	x115, x116 := bits.add_u64(x91, x106, u64(0x0))
+	x117, x118 := bits.add_u64(x93, x108, u64(fiat.u1(x116)))
+	x119, x120 := bits.add_u64(x95, x110, u64(fiat.u1(x118)))
+	x121, x122 := bits.add_u64(x97, x112, u64(fiat.u1(x120)))
+	x123, x124 := bits.add_u64(x99, x114, u64(fiat.u1(x122)))
+	_, x125 := bits.mul_u64(x115, 0xccd1c8aaee00bc4f) // reduction step for round 3
+	x128, x127 := bits.mul_u64(x125, 0xffffffff00000000)
+	x130, x129 := bits.mul_u64(x125, 0xffffffffffffffff)
+	x132, x131 := bits.mul_u64(x125, 0xbce6faada7179e84)
+	x134, x133 := bits.mul_u64(x125, 0xf3b9cac2fc632551)
+	x135, x136 := bits.add_u64(x134, x131, u64(0x0))
+	x137, x138 := bits.add_u64(x132, x129, u64(fiat.u1(x136)))
+	x139, x140 := bits.add_u64(x130, x127, u64(fiat.u1(x138)))
+	x141 := (u64(fiat.u1(x140)) + x128)
+	_, x143 := bits.add_u64(x115, x133, u64(0x0))
+	x144, x145 := bits.add_u64(x117, x135, u64(fiat.u1(x143)))
+	x146, x147 := bits.add_u64(x119, x137, u64(fiat.u1(x145)))
+	x148, x149 := bits.add_u64(x121, x139, u64(fiat.u1(x147)))
+	x150, x151 := bits.add_u64(x123, x141, u64(fiat.u1(x149)))
+	x152 := (u64(fiat.u1(x151)) + u64(fiat.u1(x124)))
+	x154, x153 := bits.mul_u64(x3, arg2[3]) // round 4: arg1[3] * arg2
+	x156, x155 := bits.mul_u64(x3, arg2[2])
+	x158, x157 := bits.mul_u64(x3, arg2[1])
+	x160, x159 := bits.mul_u64(x3, arg2[0])
+	x161, x162 := bits.add_u64(x160, x157, u64(0x0))
+	x163, x164 := bits.add_u64(x158, x155, u64(fiat.u1(x162)))
+	x165, x166 := bits.add_u64(x156, x153, u64(fiat.u1(x164)))
+	x167 := (u64(fiat.u1(x166)) + x154)
+	x168, x169 := bits.add_u64(x144, x159, u64(0x0))
+	x170, x171 := bits.add_u64(x146, x161, u64(fiat.u1(x169)))
+	x172, x173 := bits.add_u64(x148, x163, u64(fiat.u1(x171)))
+	x174, x175 := bits.add_u64(x150, x165, u64(fiat.u1(x173)))
+	x176, x177 := bits.add_u64(x152, x167, u64(fiat.u1(x175)))
+	_, x178 := bits.mul_u64(x168, 0xccd1c8aaee00bc4f) // reduction step for round 4
+	x181, x180 := bits.mul_u64(x178, 0xffffffff00000000)
+	x183, x182 := bits.mul_u64(x178, 0xffffffffffffffff)
+	x185, x184 := bits.mul_u64(x178, 0xbce6faada7179e84)
+	x187, x186 := bits.mul_u64(x178, 0xf3b9cac2fc632551)
+	x188, x189 := bits.add_u64(x187, x184, u64(0x0))
+	x190, x191 := bits.add_u64(x185, x182, u64(fiat.u1(x189)))
+	x192, x193 := bits.add_u64(x183, x180, u64(fiat.u1(x191)))
+	x194 := (u64(fiat.u1(x193)) + x181)
+	_, x196 := bits.add_u64(x168, x186, u64(0x0))
+	x197, x198 := bits.add_u64(x170, x188, u64(fiat.u1(x196)))
+	x199, x200 := bits.add_u64(x172, x190, u64(fiat.u1(x198)))
+	x201, x202 := bits.add_u64(x174, x192, u64(fiat.u1(x200)))
+	x203, x204 := bits.add_u64(x176, x194, u64(fiat.u1(x202)))
+	x205 := (u64(fiat.u1(x204)) + u64(fiat.u1(x177)))
+	x206, x207 := bits.sub_u64(x197, 0xf3b9cac2fc632551, u64(0x0)) // trial subtraction of ELL from the five-limb result
+	x208, x209 := bits.sub_u64(x199, 0xbce6faada7179e84, u64(fiat.u1(x207)))
+	x210, x211 := bits.sub_u64(x201, 0xffffffffffffffff, u64(fiat.u1(x209)))
+	x212, x213 := bits.sub_u64(x203, 0xffffffff00000000, u64(fiat.u1(x211)))
+	_, x215 := bits.sub_u64(x205, u64(0x0), u64(fiat.u1(x213))) // x215 = 1 iff the subtraction borrowed, i.e. the value was already below ELL
+	x216 := fiat.cmovznz_u64(fiat.u1(x215), x206, x197) // branch-free select; presumably yields the 3rd argument for a nonzero flag (fiat convention) - confirm in core:crypto/_fiat
+	x217 := fiat.cmovznz_u64(fiat.u1(x215), x208, x199)
+	x218 := fiat.cmovznz_u64(fiat.u1(x215), x210, x201)
+	x219 := fiat.cmovznz_u64(fiat.u1(x215), x212, x203)
+	out1[0] = x216 // out1 is written only after all reads of arg1 and arg2
+	out1[1] = x217
+	out1[2] = x218
+	out1[3] = x219
+}
+
+fe_square :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) { // out1 = arg1 * arg1 in the Montgomery domain; same four-round structure as fe_mul with arg1 used as both operands
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0] // limb 0 is consumed by the first round
+	x6, x5 := bits.mul_u64(x4, arg1[3]) // round 1: arg1[0] * arg1; bits.mul_u64 returns (hi, lo)
+	x8, x7 := bits.mul_u64(x4, arg1[2])
+	x10, x9 := bits.mul_u64(x4, arg1[1])
+	x12, x11 := bits.mul_u64(x4, arg1[0])
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0)) // carry chain joining the partial products
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	x19 := (u64(fiat.u1(x18)) + x6)
+	_, x20 := bits.mul_u64(x11, 0xccd1c8aaee00bc4f) // reduction factor: low limb times the Montgomery constant (-ELL^-1 mod 2^64), low 64 bits kept
+	x23, x22 := bits.mul_u64(x20, 0xffffffff00000000) // x20 * ELL; the four constants are ELL's limbs, most significant first
+	x25, x24 := bits.mul_u64(x20, 0xffffffffffffffff)
+	x27, x26 := bits.mul_u64(x20, 0xbce6faada7179e84)
+	x29, x28 := bits.mul_u64(x20, 0xf3b9cac2fc632551)
+	x30, x31 := bits.add_u64(x29, x26, u64(0x0))
+	x32, x33 := bits.add_u64(x27, x24, u64(fiat.u1(x31)))
+	x34, x35 := bits.add_u64(x25, x22, u64(fiat.u1(x33)))
+	x36 := (u64(fiat.u1(x35)) + x23)
+	_, x38 := bits.add_u64(x11, x28, u64(0x0)) // the low-limb sum is discarded; only its carry is used
+	x39, x40 := bits.add_u64(x13, x30, u64(fiat.u1(x38)))
+	x41, x42 := bits.add_u64(x15, x32, u64(fiat.u1(x40)))
+	x43, x44 := bits.add_u64(x17, x34, u64(fiat.u1(x42)))
+	x45, x46 := bits.add_u64(x19, x36, u64(fiat.u1(x44)))
+	x48, x47 := bits.mul_u64(x1, arg1[3]) // round 2: arg1[1] * arg1, accumulated onto the reduced round-1 value
+	x50, x49 := bits.mul_u64(x1, arg1[2])
+	x52, x51 := bits.mul_u64(x1, arg1[1])
+	x54, x53 := bits.mul_u64(x1, arg1[0])
+	x55, x56 := bits.add_u64(x54, x51, u64(0x0))
+	x57, x58 := bits.add_u64(x52, x49, u64(fiat.u1(x56)))
+	x59, x60 := bits.add_u64(x50, x47, u64(fiat.u1(x58)))
+	x61 := (u64(fiat.u1(x60)) + x48)
+	x62, x63 := bits.add_u64(x39, x53, u64(0x0))
+	x64, x65 := bits.add_u64(x41, x55, u64(fiat.u1(x63)))
+	x66, x67 := bits.add_u64(x43, x57, u64(fiat.u1(x65)))
+	x68, x69 := bits.add_u64(x45, x59, u64(fiat.u1(x67)))
+	x70, x71 := bits.add_u64(u64(fiat.u1(x46)), x61, u64(fiat.u1(x69)))
+	_, x72 := bits.mul_u64(x62, 0xccd1c8aaee00bc4f) // reduction step for round 2
+	x75, x74 := bits.mul_u64(x72, 0xffffffff00000000)
+	x77, x76 := bits.mul_u64(x72, 0xffffffffffffffff)
+	x79, x78 := bits.mul_u64(x72, 0xbce6faada7179e84)
+	x81, x80 := bits.mul_u64(x72, 0xf3b9cac2fc632551)
+	x82, x83 := bits.add_u64(x81, x78, u64(0x0))
+	x84, x85 := bits.add_u64(x79, x76, u64(fiat.u1(x83)))
+	x86, x87 := bits.add_u64(x77, x74, u64(fiat.u1(x85)))
+	x88 := (u64(fiat.u1(x87)) + x75)
+	_, x90 := bits.add_u64(x62, x80, u64(0x0))
+	x91, x92 := bits.add_u64(x64, x82, u64(fiat.u1(x90)))
+	x93, x94 := bits.add_u64(x66, x84, u64(fiat.u1(x92)))
+	x95, x96 := bits.add_u64(x68, x86, u64(fiat.u1(x94)))
+	x97, x98 := bits.add_u64(x70, x88, u64(fiat.u1(x96)))
+	x99 := (u64(fiat.u1(x98)) + u64(fiat.u1(x71)))
+	x101, x100 := bits.mul_u64(x2, arg1[3]) // round 3: arg1[2] * arg1
+	x103, x102 := bits.mul_u64(x2, arg1[2])
+	x105, x104 := bits.mul_u64(x2, arg1[1])
+	x107, x106 := bits.mul_u64(x2, arg1[0])
+	x108, x109 := bits.add_u64(x107, x104, u64(0x0))
+	x110, x111 := bits.add_u64(x105, x102, u64(fiat.u1(x109)))
+	x112, x113 := bits.add_u64(x103, x100, u64(fiat.u1(x111)))
+	x114 := (u64(fiat.u1(x113)) + x101)
+	x115, x116 := bits.add_u64(x91, x106, u64(0x0))
+	x117, x118 := bits.add_u64(x93, x108, u64(fiat.u1(x116)))
+	x119, x120 := bits.add_u64(x95, x110, u64(fiat.u1(x118)))
+	x121, x122 := bits.add_u64(x97, x112, u64(fiat.u1(x120)))
+	x123, x124 := bits.add_u64(x99, x114, u64(fiat.u1(x122)))
+	_, x125 := bits.mul_u64(x115, 0xccd1c8aaee00bc4f) // reduction step for round 3
+	x128, x127 := bits.mul_u64(x125, 0xffffffff00000000)
+	x130, x129 := bits.mul_u64(x125, 0xffffffffffffffff)
+	x132, x131 := bits.mul_u64(x125, 0xbce6faada7179e84)
+	x134, x133 := bits.mul_u64(x125, 0xf3b9cac2fc632551)
+	x135, x136 := bits.add_u64(x134, x131, u64(0x0))
+	x137, x138 := bits.add_u64(x132, x129, u64(fiat.u1(x136)))
+	x139, x140 := bits.add_u64(x130, x127, u64(fiat.u1(x138)))
+	x141 := (u64(fiat.u1(x140)) + x128)
+	_, x143 := bits.add_u64(x115, x133, u64(0x0))
+	x144, x145 := bits.add_u64(x117, x135, u64(fiat.u1(x143)))
+	x146, x147 := bits.add_u64(x119, x137, u64(fiat.u1(x145)))
+	x148, x149 := bits.add_u64(x121, x139, u64(fiat.u1(x147)))
+	x150, x151 := bits.add_u64(x123, x141, u64(fiat.u1(x149)))
+	x152 := (u64(fiat.u1(x151)) + u64(fiat.u1(x124)))
+	x154, x153 := bits.mul_u64(x3, arg1[3]) // round 4: arg1[3] * arg1
+	x156, x155 := bits.mul_u64(x3, arg1[2])
+	x158, x157 := bits.mul_u64(x3, arg1[1])
+	x160, x159 := bits.mul_u64(x3, arg1[0])
+	x161, x162 := bits.add_u64(x160, x157, u64(0x0))
+	x163, x164 := bits.add_u64(x158, x155, u64(fiat.u1(x162)))
+	x165, x166 := bits.add_u64(x156, x153, u64(fiat.u1(x164)))
+	x167 := (u64(fiat.u1(x166)) + x154)
+	x168, x169 := bits.add_u64(x144, x159, u64(0x0))
+	x170, x171 := bits.add_u64(x146, x161, u64(fiat.u1(x169)))
+	x172, x173 := bits.add_u64(x148, x163, u64(fiat.u1(x171)))
+	x174, x175 := bits.add_u64(x150, x165, u64(fiat.u1(x173)))
+	x176, x177 := bits.add_u64(x152, x167, u64(fiat.u1(x175)))
+	_, x178 := bits.mul_u64(x168, 0xccd1c8aaee00bc4f) // reduction step for round 4
+	x181, x180 := bits.mul_u64(x178, 0xffffffff00000000)
+	x183, x182 := bits.mul_u64(x178, 0xffffffffffffffff)
+	x185, x184 := bits.mul_u64(x178, 0xbce6faada7179e84)
+	x187, x186 := bits.mul_u64(x178, 0xf3b9cac2fc632551)
+	x188, x189 := bits.add_u64(x187, x184, u64(0x0))
+	x190, x191 := bits.add_u64(x185, x182, u64(fiat.u1(x189)))
+	x192, x193 := bits.add_u64(x183, x180, u64(fiat.u1(x191)))
+	x194 := (u64(fiat.u1(x193)) + x181)
+	_, x196 := bits.add_u64(x168, x186, u64(0x0))
+	x197, x198 := bits.add_u64(x170, x188, u64(fiat.u1(x196)))
+	x199, x200 := bits.add_u64(x172, x190, u64(fiat.u1(x198)))
+	x201, x202 := bits.add_u64(x174, x192, u64(fiat.u1(x200)))
+	x203, x204 := bits.add_u64(x176, x194, u64(fiat.u1(x202)))
+	x205 := (u64(fiat.u1(x204)) + u64(fiat.u1(x177)))
+	x206, x207 := bits.sub_u64(x197, 0xf3b9cac2fc632551, u64(0x0)) // trial subtraction of ELL from the five-limb result
+	x208, x209 := bits.sub_u64(x199, 0xbce6faada7179e84, u64(fiat.u1(x207)))
+	x210, x211 := bits.sub_u64(x201, 0xffffffffffffffff, u64(fiat.u1(x209)))
+	x212, x213 := bits.sub_u64(x203, 0xffffffff00000000, u64(fiat.u1(x211)))
+	_, x215 := bits.sub_u64(x205, u64(0x0), u64(fiat.u1(x213))) // x215 = 1 iff the subtraction borrowed, i.e. the value was already below ELL
+	x216 := fiat.cmovznz_u64(fiat.u1(x215), x206, x197) // branch-free select; presumably yields the 3rd argument for a nonzero flag (fiat convention) - confirm in core:crypto/_fiat
+	x217 := fiat.cmovznz_u64(fiat.u1(x215), x208, x199)
+	x218 := fiat.cmovznz_u64(fiat.u1(x215), x210, x201)
+	x219 := fiat.cmovznz_u64(fiat.u1(x215), x212, x203)
+	out1[0] = x216 // out1 is written only after all reads of arg1
+	out1[1] = x217
+	out1[2] = x218
+	out1[3] = x219
+}
+
+fe_add :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) { // out1 = arg1 + arg2, followed by one conditional subtraction of ELL
+	x1, x2 := bits.add_u64(arg1[0], arg2[0], u64(0x0)) // four-limb add with a carry chain
+	x3, x4 := bits.add_u64(arg1[1], arg2[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.add_u64(arg1[2], arg2[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.add_u64(arg1[3], arg2[3], u64(fiat.u1(x6))) // x8 is the carry out of the top limb
+	x9, x10 := bits.sub_u64(x1, 0xf3b9cac2fc632551, u64(0x0)) // trial subtraction of ELL (limbs least-significant first)
+	x11, x12 := bits.sub_u64(x3, 0xbce6faada7179e84, u64(fiat.u1(x10)))
+	x13, x14 := bits.sub_u64(x5, 0xffffffffffffffff, u64(fiat.u1(x12)))
+	x15, x16 := bits.sub_u64(x7, 0xffffffff00000000, u64(fiat.u1(x14)))
+	_, x18 := bits.sub_u64(u64(fiat.u1(x8)), u64(0x0), u64(fiat.u1(x16))) // x18 = 1 iff the sum, including the carry x8, is below ELL
+	x19 := fiat.cmovznz_u64(fiat.u1(x18), x9, x1) // branch-free select; presumably yields the 3rd argument (raw sum) for a nonzero flag - confirm in core:crypto/_fiat
+	x20 := fiat.cmovznz_u64(fiat.u1(x18), x11, x3)
+	x21 := fiat.cmovznz_u64(fiat.u1(x18), x13, x5)
+	x22 := fiat.cmovznz_u64(fiat.u1(x18), x15, x7)
+	out1[0] = x19
+	out1[1] = x20
+	out1[2] = x21
+	out1[3] = x22
+}
+
+fe_sub :: proc "contextless" (out1, arg1, arg2: ^Montgomery_Domain_Field_Element) { // out1 = arg1 - arg2, with ELL added back when the subtraction borrows
+	x1, x2 := bits.sub_u64(arg1[0], arg2[0], u64(0x0)) // four-limb subtract with a borrow chain
+	x3, x4 := bits.sub_u64(arg1[1], arg2[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.sub_u64(arg1[2], arg2[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.sub_u64(arg1[3], arg2[3], u64(fiat.u1(x6))) // x8 is the final borrow
+	x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff) // mask; presumably all ones iff x8 is set (fiat convention) - confirm in core:crypto/_fiat
+	x10, x11 := bits.add_u64(x1, (x9 & 0xf3b9cac2fc632551), u64(0x0)) // add back ELL masked by x9
+	x12, x13 := bits.add_u64(x3, (x9 & 0xbce6faada7179e84), u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x5, x9, u64(fiat.u1(x13))) // ELL limb 2 is all ones, so the mask itself is the addend
+	x16, _ := bits.add_u64(x7, (x9 & 0xffffffff00000000), u64(fiat.u1(x15))) // the final carry is intentionally dropped
+	out1[0] = x10
+	out1[1] = x12
+	out1[2] = x14
+	out1[3] = x16
+}
+
+fe_opp :: proc "contextless" (out1, arg1: ^Montgomery_Domain_Field_Element) { // out1 = 0 - arg1, with ELL added back when the subtraction borrows
+	x1, x2 := bits.sub_u64(u64(0x0), arg1[0], u64(0x0)) // four-limb 0 - arg1 with a borrow chain
+	x3, x4 := bits.sub_u64(u64(0x0), arg1[1], u64(fiat.u1(x2)))
+	x5, x6 := bits.sub_u64(u64(0x0), arg1[2], u64(fiat.u1(x4)))
+	x7, x8 := bits.sub_u64(u64(0x0), arg1[3], u64(fiat.u1(x6))) // x8 is the final borrow (set for any nonzero arg1)
+	x9 := fiat.cmovznz_u64(fiat.u1(x8), u64(0x0), 0xffffffffffffffff) // mask; presumably all ones iff x8 is set (fiat convention) - confirm in core:crypto/_fiat
+	x10, x11 := bits.add_u64(x1, (x9 & 0xf3b9cac2fc632551), u64(0x0)) // add back ELL masked by x9
+	x12, x13 := bits.add_u64(x3, (x9 & 0xbce6faada7179e84), u64(fiat.u1(x11)))
+	x14, x15 := bits.add_u64(x5, x9, u64(fiat.u1(x13))) // ELL limb 2 is all ones, so the mask itself is the addend
+	x16, _ := bits.add_u64(x7, (x9 & 0xffffffff00000000), u64(fiat.u1(x15))) // the final carry is intentionally dropped
+	out1[0] = x10
+	out1[1] = x12
+	out1[2] = x14
+	out1[3] = x16
+}
+
+fe_one :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element) {
+	// Sets out1 to one.  In the Montgomery domain that is
+	// 2^256 mod ELL (i.e. 2^256 - ELL), written here with
+	// the least-significant limb first.
+	out1^ = {0xc46353d039cdaaf, 0x4319055258e8617b, 0x0, 0xffffffff}
+}
+
+fe_non_zero :: proc "contextless" (arg1: ^Montgomery_Domain_Field_Element) -> u64 {
+	return (arg1[0] | arg1[1]) | (arg1[2] | arg1[3]) // OR of all limbs: 0 iff every limb is 0
+}
+
+@(optimization_mode = "none")
+fe_cond_assign :: #force_no_inline proc "contextless" ( // conditionally overwrites out1 with arg1, limb by limb, via fiat.cmovznz_u64
+	out1, arg1: ^Montgomery_Domain_Field_Element,
+	arg2: int, // selector, narrowed with fiat.u1; presumably 1 copies arg1 and 0 keeps out1 - confirm against fiat.cmovznz_u64
+) {
+	x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0]) // both candidates are read regardless of arg2
+	x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1])
+	x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2])
+	x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3])
+	out1[0] = x1 // out1 is always stored to, whichever value was selected
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+}
+
+fe_from_montgomery :: proc "contextless" ( // converts arg1 out of the Montgomery domain: four reduction rounds, one input limb folded in per round
+	out1: ^Non_Montgomery_Domain_Field_Element, // receives the converted value
+	arg1: ^Montgomery_Domain_Field_Element, // input; only read
+) {
+	x1 := arg1[0] // round 1 starts from limb 0
+	_, x2 := bits.mul_u64(x1, 0xccd1c8aaee00bc4f) // reduction factor: low limb times the Montgomery constant (-ELL^-1 mod 2^64), low 64 bits kept
+	x5, x4 := bits.mul_u64(x2, 0xffffffff00000000) // x2 * ELL; the four constants are ELL's limbs, most significant first
+	x7, x6 := bits.mul_u64(x2, 0xffffffffffffffff)
+	x9, x8 := bits.mul_u64(x2, 0xbce6faada7179e84)
+	x11, x10 := bits.mul_u64(x2, 0xf3b9cac2fc632551)
+	x12, x13 := bits.add_u64(x11, x8, u64(0x0))
+	x14, x15 := bits.add_u64(x9, x6, u64(fiat.u1(x13)))
+	x16, x17 := bits.add_u64(x7, x4, u64(fiat.u1(x15)))
+	_, x19 := bits.add_u64(x1, x10, u64(0x0)) // the low-limb sum is discarded; only its carry is used
+	x20, x21 := bits.add_u64(u64(0x0), x12, u64(fiat.u1(x19)))
+	x22, x23 := bits.add_u64(u64(0x0), x14, u64(fiat.u1(x21)))
+	x24, x25 := bits.add_u64(u64(0x0), x16, u64(fiat.u1(x23)))
+	x26, x27 := bits.add_u64(x20, arg1[1], u64(0x0)) // round 2: fold in limb 1
+	x28, x29 := bits.add_u64(x22, u64(0x0), u64(fiat.u1(x27)))
+	x30, x31 := bits.add_u64(x24, u64(0x0), u64(fiat.u1(x29)))
+	_, x32 := bits.mul_u64(x26, 0xccd1c8aaee00bc4f)
+	x35, x34 := bits.mul_u64(x32, 0xffffffff00000000)
+	x37, x36 := bits.mul_u64(x32, 0xffffffffffffffff)
+	x39, x38 := bits.mul_u64(x32, 0xbce6faada7179e84)
+	x41, x40 := bits.mul_u64(x32, 0xf3b9cac2fc632551)
+	x42, x43 := bits.add_u64(x41, x38, u64(0x0))
+	x44, x45 := bits.add_u64(x39, x36, u64(fiat.u1(x43)))
+	x46, x47 := bits.add_u64(x37, x34, u64(fiat.u1(x45)))
+	_, x49 := bits.add_u64(x26, x40, u64(0x0))
+	x50, x51 := bits.add_u64(x28, x42, u64(fiat.u1(x49)))
+	x52, x53 := bits.add_u64(x30, x44, u64(fiat.u1(x51)))
+	x54, x55 := bits.add_u64((u64(fiat.u1(x31)) + (u64(fiat.u1(x25)) + (u64(fiat.u1(x17)) + x5))), x46, u64(fiat.u1(x53)))
+	x56, x57 := bits.add_u64(x50, arg1[2], u64(0x0)) // round 3: fold in limb 2
+	x58, x59 := bits.add_u64(x52, u64(0x0), u64(fiat.u1(x57)))
+	x60, x61 := bits.add_u64(x54, u64(0x0), u64(fiat.u1(x59)))
+	_, x62 := bits.mul_u64(x56, 0xccd1c8aaee00bc4f)
+	x65, x64 := bits.mul_u64(x62, 0xffffffff00000000)
+	x67, x66 := bits.mul_u64(x62, 0xffffffffffffffff)
+	x69, x68 := bits.mul_u64(x62, 0xbce6faada7179e84)
+	x71, x70 := bits.mul_u64(x62, 0xf3b9cac2fc632551)
+	x72, x73 := bits.add_u64(x71, x68, u64(0x0))
+	x74, x75 := bits.add_u64(x69, x66, u64(fiat.u1(x73)))
+	x76, x77 := bits.add_u64(x67, x64, u64(fiat.u1(x75)))
+	_, x79 := bits.add_u64(x56, x70, u64(0x0))
+	x80, x81 := bits.add_u64(x58, x72, u64(fiat.u1(x79)))
+	x82, x83 := bits.add_u64(x60, x74, u64(fiat.u1(x81)))
+	x84, x85 := bits.add_u64((u64(fiat.u1(x61)) + (u64(fiat.u1(x55)) + (u64(fiat.u1(x47)) + x35))), x76, u64(fiat.u1(x83)))
+	x86, x87 := bits.add_u64(x80, arg1[3], u64(0x0)) // round 4: fold in limb 3
+	x88, x89 := bits.add_u64(x82, u64(0x0), u64(fiat.u1(x87)))
+	x90, x91 := bits.add_u64(x84, u64(0x0), u64(fiat.u1(x89)))
+	_, x92 := bits.mul_u64(x86, 0xccd1c8aaee00bc4f)
+	x95, x94 := bits.mul_u64(x92, 0xffffffff00000000)
+	x97, x96 := bits.mul_u64(x92, 0xffffffffffffffff)
+	x99, x98 := bits.mul_u64(x92, 0xbce6faada7179e84)
+	x101, x100 := bits.mul_u64(x92, 0xf3b9cac2fc632551)
+	x102, x103 := bits.add_u64(x101, x98, u64(0x0))
+	x104, x105 := bits.add_u64(x99, x96, u64(fiat.u1(x103)))
+	x106, x107 := bits.add_u64(x97, x94, u64(fiat.u1(x105)))
+	_, x109 := bits.add_u64(x86, x100, u64(0x0))
+	x110, x111 := bits.add_u64(x88, x102, u64(fiat.u1(x109)))
+	x112, x113 := bits.add_u64(x90, x104, u64(fiat.u1(x111)))
+	x114, x115 := bits.add_u64((u64(fiat.u1(x91)) + (u64(fiat.u1(x85)) + (u64(fiat.u1(x77)) + x65))), x106, u64(fiat.u1(x113)))
+	x116 := (u64(fiat.u1(x115)) + (u64(fiat.u1(x107)) + x95))
+	x117, x118 := bits.sub_u64(x110, 0xf3b9cac2fc632551, u64(0x0)) // trial subtraction of ELL
+	x119, x120 := bits.sub_u64(x112, 0xbce6faada7179e84, u64(fiat.u1(x118)))
+	x121, x122 := bits.sub_u64(x114, 0xffffffffffffffff, u64(fiat.u1(x120)))
+	x123, x124 := bits.sub_u64(x116, 0xffffffff00000000, u64(fiat.u1(x122)))
+	_, x126 := bits.sub_u64(u64(0x0), u64(0x0), u64(fiat.u1(x124))) // x126 = x124: 1 iff the subtraction borrowed (there is no fifth limb here)
+	x127 := fiat.cmovznz_u64(fiat.u1(x126), x117, x110) // branch-free select; presumably yields the 3rd argument for a nonzero flag (fiat convention) - confirm in core:crypto/_fiat
+	x128 := fiat.cmovznz_u64(fiat.u1(x126), x119, x112)
+	x129 := fiat.cmovznz_u64(fiat.u1(x126), x121, x114)
+	x130 := fiat.cmovznz_u64(fiat.u1(x126), x123, x116)
+	out1[0] = x127
+	out1[1] = x128
+	out1[2] = x129
+	out1[3] = x130
+}
+
+fe_to_montgomery :: proc "contextless" ( // converts arg1 into the Montgomery domain: a four-round Montgomery multiplication of arg1 by a fixed four-limb constant
+	out1: ^Montgomery_Domain_Field_Element, // receives the converted value
+	arg1: ^Non_Montgomery_Domain_Field_Element, // input; only read
+) {
+	x1 := arg1[1]
+	x2 := arg1[2]
+	x3 := arg1[3]
+	x4 := arg1[0] // limb 0 is consumed by the first round
+	x6, x5 := bits.mul_u64(x4, 0x66e12d94f3d95620) // round 1: arg1[0] times the conversion constant, limbs most significant first (presumably 2^512 mod ELL - TODO confirm)
+	x8, x7 := bits.mul_u64(x4, 0x2845b2392b6bec59)
+	x10, x9 := bits.mul_u64(x4, 0x4699799c49bd6fa6)
+	x12, x11 := bits.mul_u64(x4, 0x83244c95be79eea2)
+	x13, x14 := bits.add_u64(x12, x9, u64(0x0))
+	x15, x16 := bits.add_u64(x10, x7, u64(fiat.u1(x14)))
+	x17, x18 := bits.add_u64(x8, x5, u64(fiat.u1(x16)))
+	_, x19 := bits.mul_u64(x11, 0xccd1c8aaee00bc4f) // reduction factor: low limb times the Montgomery constant (-ELL^-1 mod 2^64), low 64 bits kept
+	x22, x21 := bits.mul_u64(x19, 0xffffffff00000000) // x19 * ELL; the four constants are ELL's limbs, most significant first
+	x24, x23 := bits.mul_u64(x19, 0xffffffffffffffff)
+	x26, x25 := bits.mul_u64(x19, 0xbce6faada7179e84)
+	x28, x27 := bits.mul_u64(x19, 0xf3b9cac2fc632551)
+	x29, x30 := bits.add_u64(x28, x25, u64(0x0))
+	x31, x32 := bits.add_u64(x26, x23, u64(fiat.u1(x30)))
+	x33, x34 := bits.add_u64(x24, x21, u64(fiat.u1(x32)))
+	_, x36 := bits.add_u64(x11, x27, u64(0x0)) // the low-limb sum is discarded; only its carry is used
+	x37, x38 := bits.add_u64(x13, x29, u64(fiat.u1(x36)))
+	x39, x40 := bits.add_u64(x15, x31, u64(fiat.u1(x38)))
+	x41, x42 := bits.add_u64(x17, x33, u64(fiat.u1(x40)))
+	x43, x44 := bits.add_u64((u64(fiat.u1(x18)) + x6), (u64(fiat.u1(x34)) + x22), u64(fiat.u1(x42)))
+	x46, x45 := bits.mul_u64(x1, 0x66e12d94f3d95620) // round 2: arg1[1]
+	x48, x47 := bits.mul_u64(x1, 0x2845b2392b6bec59)
+	x50, x49 := bits.mul_u64(x1, 0x4699799c49bd6fa6)
+	x52, x51 := bits.mul_u64(x1, 0x83244c95be79eea2)
+	x53, x54 := bits.add_u64(x52, x49, u64(0x0))
+	x55, x56 := bits.add_u64(x50, x47, u64(fiat.u1(x54)))
+	x57, x58 := bits.add_u64(x48, x45, u64(fiat.u1(x56)))
+	x59, x60 := bits.add_u64(x37, x51, u64(0x0))
+	x61, x62 := bits.add_u64(x39, x53, u64(fiat.u1(x60)))
+	x63, x64 := bits.add_u64(x41, x55, u64(fiat.u1(x62)))
+	x65, x66 := bits.add_u64(x43, x57, u64(fiat.u1(x64)))
+	_, x67 := bits.mul_u64(x59, 0xccd1c8aaee00bc4f)
+	x70, x69 := bits.mul_u64(x67, 0xffffffff00000000)
+	x72, x71 := bits.mul_u64(x67, 0xffffffffffffffff)
+	x74, x73 := bits.mul_u64(x67, 0xbce6faada7179e84)
+	x76, x75 := bits.mul_u64(x67, 0xf3b9cac2fc632551)
+	x77, x78 := bits.add_u64(x76, x73, u64(0x0))
+	x79, x80 := bits.add_u64(x74, x71, u64(fiat.u1(x78)))
+	x81, x82 := bits.add_u64(x72, x69, u64(fiat.u1(x80)))
+	_, x84 := bits.add_u64(x59, x75, u64(0x0))
+	x85, x86 := bits.add_u64(x61, x77, u64(fiat.u1(x84)))
+	x87, x88 := bits.add_u64(x63, x79, u64(fiat.u1(x86)))
+	x89, x90 := bits.add_u64(x65, x81, u64(fiat.u1(x88)))
+	x91, x92 := bits.add_u64(((u64(fiat.u1(x66)) + u64(fiat.u1(x44))) + (u64(fiat.u1(x58)) + x46)), (u64(fiat.u1(x82)) + x70), u64(fiat.u1(x90)))
+	x94, x93 := bits.mul_u64(x2, 0x66e12d94f3d95620) // round 3: arg1[2]
+	x96, x95 := bits.mul_u64(x2, 0x2845b2392b6bec59)
+	x98, x97 := bits.mul_u64(x2, 0x4699799c49bd6fa6)
+	x100, x99 := bits.mul_u64(x2, 0x83244c95be79eea2)
+	x101, x102 := bits.add_u64(x100, x97, u64(0x0))
+	x103, x104 := bits.add_u64(x98, x95, u64(fiat.u1(x102)))
+	x105, x106 := bits.add_u64(x96, x93, u64(fiat.u1(x104)))
+	x107, x108 := bits.add_u64(x85, x99, u64(0x0))
+	x109, x110 := bits.add_u64(x87, x101, u64(fiat.u1(x108)))
+	x111, x112 := bits.add_u64(x89, x103, u64(fiat.u1(x110)))
+	x113, x114 := bits.add_u64(x91, x105, u64(fiat.u1(x112)))
+	_, x115 := bits.mul_u64(x107, 0xccd1c8aaee00bc4f)
+	x118, x117 := bits.mul_u64(x115, 0xffffffff00000000)
+	x120, x119 := bits.mul_u64(x115, 0xffffffffffffffff)
+	x122, x121 := bits.mul_u64(x115, 0xbce6faada7179e84)
+	x124, x123 := bits.mul_u64(x115, 0xf3b9cac2fc632551)
+	x125, x126 := bits.add_u64(x124, x121, u64(0x0))
+	x127, x128 := bits.add_u64(x122, x119, u64(fiat.u1(x126)))
+	x129, x130 := bits.add_u64(x120, x117, u64(fiat.u1(x128)))
+	_, x132 := bits.add_u64(x107, x123, u64(0x0))
+	x133, x134 := bits.add_u64(x109, x125, u64(fiat.u1(x132)))
+	x135, x136 := bits.add_u64(x111, x127, u64(fiat.u1(x134)))
+	x137, x138 := bits.add_u64(x113, x129, u64(fiat.u1(x136)))
+	x139, x140 := bits.add_u64(((u64(fiat.u1(x114)) + u64(fiat.u1(x92))) + (u64(fiat.u1(x106)) + x94)), (u64(fiat.u1(x130)) + x118), u64(fiat.u1(x138)))
+	x142, x141 := bits.mul_u64(x3, 0x66e12d94f3d95620) // round 4: arg1[3]
+	x144, x143 := bits.mul_u64(x3, 0x2845b2392b6bec59)
+	x146, x145 := bits.mul_u64(x3, 0x4699799c49bd6fa6)
+	x148, x147 := bits.mul_u64(x3, 0x83244c95be79eea2)
+	x149, x150 := bits.add_u64(x148, x145, u64(0x0))
+	x151, x152 := bits.add_u64(x146, x143, u64(fiat.u1(x150)))
+	x153, x154 := bits.add_u64(x144, x141, u64(fiat.u1(x152)))
+	x155, x156 := bits.add_u64(x133, x147, u64(0x0))
+	x157, x158 := bits.add_u64(x135, x149, u64(fiat.u1(x156)))
+	x159, x160 := bits.add_u64(x137, x151, u64(fiat.u1(x158)))
+	x161, x162 := bits.add_u64(x139, x153, u64(fiat.u1(x160)))
+	_, x163 := bits.mul_u64(x155, 0xccd1c8aaee00bc4f)
+	x166, x165 := bits.mul_u64(x163, 0xffffffff00000000)
+	x168, x167 := bits.mul_u64(x163, 0xffffffffffffffff)
+	x170, x169 := bits.mul_u64(x163, 0xbce6faada7179e84)
+	x172, x171 := bits.mul_u64(x163, 0xf3b9cac2fc632551)
+	x173, x174 := bits.add_u64(x172, x169, u64(0x0))
+	x175, x176 := bits.add_u64(x170, x167, u64(fiat.u1(x174)))
+	x177, x178 := bits.add_u64(x168, x165, u64(fiat.u1(x176)))
+	_, x180 := bits.add_u64(x155, x171, u64(0x0))
+	x181, x182 := bits.add_u64(x157, x173, u64(fiat.u1(x180)))
+	x183, x184 := bits.add_u64(x159, x175, u64(fiat.u1(x182)))
+	x185, x186 := bits.add_u64(x161, x177, u64(fiat.u1(x184)))
+	x187, x188 := bits.add_u64(((u64(fiat.u1(x162)) + u64(fiat.u1(x140))) + (u64(fiat.u1(x154)) + x142)), (u64(fiat.u1(x178)) + x166), u64(fiat.u1(x186)))
+	x189, x190 := bits.sub_u64(x181, 0xf3b9cac2fc632551, u64(0x0)) // trial subtraction of ELL
+	x191, x192 := bits.sub_u64(x183, 0xbce6faada7179e84, u64(fiat.u1(x190)))
+	x193, x194 := bits.sub_u64(x185, 0xffffffffffffffff, u64(fiat.u1(x192)))
+	x195, x196 := bits.sub_u64(x187, 0xffffffff00000000, u64(fiat.u1(x194)))
+	_, x198 := bits.sub_u64(u64(fiat.u1(x188)), u64(0x0), u64(fiat.u1(x196))) // x198 = 1 iff the value, including the carry x188, is below ELL
+	x199 := fiat.cmovznz_u64(fiat.u1(x198), x189, x181) // branch-free select; presumably yields the 3rd argument for a nonzero flag (fiat convention) - confirm in core:crypto/_fiat
+	x200 := fiat.cmovznz_u64(fiat.u1(x198), x191, x183)
+	x201 := fiat.cmovznz_u64(fiat.u1(x198), x193, x185)
+	x202 := fiat.cmovznz_u64(fiat.u1(x198), x195, x187)
+	out1[0] = x199
+	out1[1] = x200
+	out1[2] = x201
+	out1[3] = x202
+}
--- a/core/crypto/_subtle/subtle.odin
+++ b/core/crypto/_subtle/subtle.odin
@@ -0,0 +1,42 @@
+/*
+Various useful bit operations in constant time.
+*/
+package _subtle
+
+import "core:math/bits"
+
+// byte_eq returns 1 iff a == b, 0 otherwise.
+@(optimization_mode="none")
+byte_eq :: proc "contextless" (a, b: byte) -> int {
+	// a ~ b is zero exactly when the bytes match.  Widened to 32 bits,
+	// subtracting 1 wraps to 0xffffffff only for zero (any other value
+	// stays below 0xff), so bit 31 of the difference is the answer.
+	diff := u32(a ~ b)
+	return int((diff - 1) >> 31)
+}
+
+// u64_eq returns 1 iff a == b, 0 otherwise.
+@(optimization_mode="none")
+u64_eq :: proc "contextless" (a, b: u64) -> u64 {
+	_, differs := bits.sub_u64(0, a ~ b, 0) // 0 - x borrows exactly when x != 0
+	return (differs ~ 1) & 1
+}
+
+eq :: proc { // explicit overload set: resolves to a member by argument type
+	byte_eq, // (byte, byte) -> int
+	u64_eq, // (u64, u64) -> u64
+}
+
+// u64_is_zero returns 1 iff a == 0, 0 otherwise.
+@(optimization_mode="none")
+u64_is_zero :: proc "contextless" (a: u64) -> u64 {
+	_, underflowed := bits.sub_u64(a, 1, 0) // a - 1 borrows only for a == 0
+	return underflowed
+}
+
+// u64_is_non_zero returns 1 iff a != 0, 0 otherwise.
+@(optimization_mode="none")
+u64_is_non_zero :: proc "contextless" (a: u64) -> u64 {
+	// Invert the is-zero flag; the mask keeps the result in {0, 1}.
+	return (~u64_is_zero(a)) & 1
+}
--- a/core/crypto/_weierstrass/fe.odin
+++ b/core/crypto/_weierstrass/fe.odin
@@ -0,0 +1,135 @@
+package _weierstrass
+
+import p256r1 "core:crypto/_fiat/field_p256r1"
+import "core:math/bits"
+
+Field_Element_p256r1 :: p256r1.Montgomery_Domain_Field_Element // this package works directly on the fiat Montgomery-domain element type
+
+FE_SIZE_P256R1 :: 32 // byte length required of an encoded coordinate (checked by the pt_set_*_bytes procedures)
+
+fe_clear :: proc { // explicit overload sets: each curve-generic name below currently has a single p256r1 member
+	p256r1.fe_clear,
+}
+
+fe_clear_vec :: proc {
+	p256r1.fe_clear_vec,
+}
+
+fe_set_bytes :: proc {
+	p256r1.fe_from_bytes, // decode from bytes; callers in this package test its boolean result
+}
+fe_bytes :: proc {
+	p256r1.fe_to_bytes, // encode to bytes
+}
+
+fe_set :: proc {
+	p256r1.fe_set,
+}
+
+fe_zero :: proc {
+	p256r1.fe_zero,
+}
+
+fe_a :: proc {
+	fe_a_p256r1, // curve constant a; defined in this file
+}
+
+fe_b :: proc {
+	fe_b_p256r1, // curve constant b; defined in this file
+}
+
+fe_gen_x :: proc {
+	fe_gen_x_p256r1, // generator x coordinate; defined in this file
+}
+
+fe_gen_y :: proc {
+	fe_gen_y_p256r1, // generator y coordinate; defined in this file
+}
+
+fe_one :: proc {
+	p256r1.fe_one,
+}
+
+fe_add :: proc {
+	p256r1.fe_add,
+}
+
+fe_sub :: proc {
+	p256r1.fe_sub,
+}
+
+fe_negate :: proc {
+	p256r1.fe_opp, // the fiat-derived code names negation "opp"
+}
+
+fe_mul :: proc {
+	p256r1.fe_mul,
+}
+
+fe_square :: proc {
+	p256r1.fe_square,
+}
+
+fe_inv :: proc {
+	p256r1.fe_inv,
+}
+
+fe_sqrt :: proc {
+	p256r1.fe_sqrt,
+}
+
+fe_equal :: proc {
+	p256r1.fe_equal,
+}
+
+fe_is_odd :: proc {
+	p256r1.fe_is_odd,
+}
+
+fe_is_zero :: proc {
+	fe_is_zero_p256r1, // local wrapper around p256r1.fe_non_zero; defined in this file
+}
+
+fe_cond_select :: proc {
+	p256r1.fe_cond_select,
+}
+
+fe_a_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) {
+	// a = 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc
+	//   = -3 mod p
+	// Stored as its Montgomery-domain limbs, least-significant limb first.
+	fe^ = {
+		0xfffffffffffffffc, 0x00000003ffffffff, 0x0000000000000000, 0xfffffffc00000004,
+	}
+}
+
+fe_b_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) { // writes the curve constant b into fe
+	// b = 0x5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b
+	fe[0] = 15608596021259845087 // NOTE(review): the limbs are not the hex above split into words; presumably its Montgomery-domain encoding, limb 0 least significant - confirm
+	fe[1] = 12461466548982526096
+	fe[2] = 16546823903870267094
+	fe[3] = 15866188208926050356
+}
+
+fe_gen_x_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) { // writes the generator's x coordinate into fe
+	// G_x = 0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296
+	fe[0] = 8784043285714375740 // NOTE(review): the limbs are not the hex above split into words; presumably its Montgomery-domain encoding, limb 0 least significant - confirm
+	fe[1] = 8483257759279461889
+	fe[2] = 8789745728267363600
+	fe[3] = 1770019616739251654
+}
+
+fe_gen_y_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) { // writes the generator's y coordinate into fe
+	// G_y = 0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5
+	fe[0] = 15992936863339206154 // NOTE(review): the limbs are not the hex above split into words; presumably its Montgomery-domain encoding, limb 0 least significant - confirm
+	fe[1] = 10037038012062884956
+	fe[2] = 15197544864945402661
+	fe[3] = 9615747158586711429
+}
+
+@(require_results)
+fe_is_zero_p256r1 :: proc "contextless" (fe: ^Field_Element_p256r1) -> int {
+	any_bits := p256r1.fe_non_zero(fe) // presumably 0 only for the zero element - see p256r1.fe_non_zero
+	_, is_zero := bits.sub_u64(any_bits, 1, 0) // any_bits - 1 borrows only when any_bits == 0
+	return int(is_zero)
+}
--- a/core/crypto/_weierstrass/point.odin
+++ b/core/crypto/_weierstrass/point.odin
@@ -0,0 +1,548 @@
+package _weierstrass
+
+/*
+This implements prime order short Weierstrass curves defined over a field
+k with char(k) != 2, 3 (`y^2 = x^3 + ax + b`), for the purpose of
+implementing ECDH and ECDSA.  Use of this package for other purposes is
+NOT RECOMMENDED.
+
+As an explicit simplicity/performance tradeoff, projective representation
+was chosen so that it is possible to use the complete addition
+formulas.
+
+See:
+- https://eprint.iacr.org/2015/1060.pdf
+- https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html
+
+WARNING: The point addition and doubling formulas are specialized for
+`a = -3`, which covers secp256r1, secp384r1, secp521r1, FRP256v1, SM2,
+and GOST 34.10.  The brainpool curves and secp256k1 are NOT SUPPORTED
+and would require slightly different formulas.
+*/
+
+// Point_p256r1 is a secp256r1 point in projective coordinates.  The
+// point at infinity is the one with `z = 0` (see pt_identity and
+// pt_is_identity).
+Point_p256r1 :: struct {
+	x: Field_Element_p256r1,
+	y: Field_Element_p256r1,
+	z: Field_Element_p256r1,
+}
+
+// pt_set_xy_bytes sets p to the affine point `(x_raw, y_raw)`, and
+// returns true iff both coordinates are exactly FE_SZ bytes, are
+// accepted by fe_set_bytes, and satisfy the curve equation.  p is only
+// written on success.
+@(require_results)
+pt_set_xy_bytes :: proc "contextless" (p: ^$T, x_raw, y_raw: []byte) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+		x, y: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&x, &y})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	if len(x_raw) != FE_SZ || len(y_raw) != FE_SZ {
+		return false
+	}
+
+	if !fe_set_bytes(&x, x_raw) {
+		return false
+	}
+	if !fe_set_bytes(&y, y_raw) {
+		return false
+	}
+	if !is_on_curve(&x, &y) {
+		return false
+	}
+
+	// The input is affine, so `Z = 1`.
+	fe_set(&p.x, &x)
+	fe_set(&p.y, &y)
+	fe_one(&p.z)
+
+	return true
+}
+
+// pt_set_x_bytes sets p to the point with x-coordinate x_raw and the
+// y-coordinate whose parity matches y_is_odd (1 for odd, 0 for even).
+// It returns true iff x_raw is FE_SZ bytes, is accepted by fe_set_bytes,
+// and fe_sqrt reports a square root of `x^3 + ax + b`.  p is only
+// written on success.
+@(require_results)
+pt_set_x_bytes :: proc "contextless" (p: ^$T, x_raw: []byte, y_is_odd: int) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+		x, y, yy, y_neg: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&x, &y, &yy, &y_neg})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	if len(x_raw) != FE_SZ {
+		return false
+	}
+
+	if !fe_set_bytes(&x, x_raw) {
+		return false
+	}
+	set_yy_candidate(&yy, &x)
+	if fe_sqrt(&y, &yy) != 1 {
+		return false
+	}
+
+	// Pick the correct y-coordinate.
+	fe_negate(&y_neg, &y)
+	// 1 iff the root that was found has the wrong parity.
+	parity_neq := (y_is_odd ~ fe_is_odd(&y)) & 1
+
+	fe_set(&p.x, &x)
+	// NOTE(review): presumably selects y_neg when parity_neq is 1 and
+	// y otherwise - confirm against fe_cond_select.
+	fe_cond_select(&p.y, &y, &y_neg, parity_neq)
+	fe_one(&p.z)
+
+	return true
+}
+
+// pt_bytes writes the affine x and y coordinates of p into x and y, and
+// returns false iff p is the point at infinity.  Either destination may
+// be empty to skip that coordinate; any length other than 0 or FE_SZ
+// panics.  Note that p itself is rescaled in place.
+@(require_results)
+pt_bytes :: proc "contextless" (x, y: []byte, p: ^$T) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	// The point at infinity has no affine representation.
+	if pt_is_identity(p) == 1 {
+		return false
+	}
+
+	// Normalize so that (x, y) can be read off directly.
+	pt_rescale(p, p)
+
+	if x_len := len(x); x_len == FE_SZ {
+		fe_bytes(x, &p.x)
+	} else if x_len != 0 {
+		panic_contextless("weierstrass: invalid x buffer")
+	}
+	if y_len := len(y); y_len == FE_SZ {
+		fe_bytes(y, &p.y)
+	} else if y_len != 0 {
+		panic_contextless("weierstrass: invalid y buffer")
+	}
+
+	return true
+}
+
+// pt_set sets `p = q`, copying all three coordinates as-is.
+pt_set :: proc "contextless" (p, q: ^$T) {
+	fe_set(&p.x, &q.x)
+	fe_set(&p.y, &q.y)
+	fe_set(&p.z, &q.z)
+}
+
+// pt_identity sets p to the point at infinity, represented as
+// `(0, 1, 0)`.
+pt_identity :: proc "contextless" (p: ^$T) {
+	fe_zero(&p.x)
+	fe_one(&p.y)
+	fe_zero(&p.z)
+}
+
+// pt_generator sets p to the curve's generator `(G_x, G_y)`, with
+// `Z = 1`.
+pt_generator :: proc "contextless" (p: ^$T) {
+	fe_gen_x(&p.x)
+	fe_gen_y(&p.y)
+	fe_one(&p.z)
+}
+
+// pt_clear clears all three coordinates of p with fe_clear.
+pt_clear :: proc "contextless" (p: ^$T) {
+	fe_clear(&p.x)
+	fe_clear(&p.y)
+	fe_clear(&p.z)
+}
+
+// pt_clear_vec clears every point referenced by arg, in order.
+pt_clear_vec :: proc "contextless" (arg: []^$T) {
+	for i in 0..<len(arg) {
+		pt_clear(arg[i])
+	}
+}
+
+pt_add :: proc "contextless" (p, a, b: ^$T) {
+	// Algorithm 4 from "Complete addition formulas for prime
+	// order elliptic curves" by Renes, Costello, and Batina.
+	//
+	// The formula is complete in that it is valid for all a and b,
+	// without exceptions or extra assumptions about the inputs.
+	//
+	// The operation costs are `12M + 2mb + 29a`.
+
+	when T == Point_p256r1 {
+		t0, t1, t2, t3, t4, b_fe: Field_Element_p256r1
+		x3, y3, z3: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&t0, &t1, &t2, &t3, &t4, &x3, &y3, &z3})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	x1, y1, z1 := &a.x, &a.y, &a.z
+	x2, y2, z2 := &b.x, &b.y, &b.z
+
+	fe_b(&b_fe)
+
+	// t0 := X1 * X2 ; t1 := Y1 * Y2 ; t2 := Z1 * Z2 ;
+	fe_mul(&t0, x1, x2)
+	fe_mul(&t1, y1, y2)
+	fe_mul(&t2, z1, z2)
+
+	// t3 := X1 + Y1 ; t4 := X2 + Y2 ; t3 := t3 * t4 ;
+	fe_add(&t3, x1, y1)
+	fe_add(&t4, x2, y2)
+	fe_mul(&t3, &t3, &t4)
+
+	// t4 := t0 + t1 ; t3 := t3 - t4 ; t4 := Y1 + Z1 ;
+	fe_add(&t4, &t0, &t1)
+	fe_sub(&t3, &t3, &t4)
+	fe_add(&t4, y1, z1)
+
+	// X3 := Y2 + Z2 ; t4 := t4 * X3 ; X3 := t1 + t2 ;
+	fe_add(&x3, y2, z2)
+	fe_mul(&t4, &t4, &x3)
+	fe_add(&x3, &t1, &t2)
+
+	// t4 := t4 - X3 ; X3 := X1 + Z1 ; Y3 := X2 + Z2 ;
+	fe_sub(&t4, &t4, &x3)
+	fe_add(&x3, x1, z1)
+	fe_add(&y3, x2, z2)
+
+	// X3 := X3 * Y3 ; Y3 := t0 + t2 ; Y3 := X3 - Y3 ;
+	fe_mul(&x3, &x3, &y3)
+	fe_add(&y3, &t0, &t2)
+	fe_sub(&y3, &x3, &y3)
+
+	// Z3 := b * t2 ; X3 := Y3 - Z3 ; Z3 := X3 + X3 ;
+	fe_mul(&z3, &b_fe, &t2)
+	fe_sub(&x3, &y3, &z3)
+	fe_add(&z3, &x3, &x3)
+
+	// X3 := X3 + Z3 ; Z3 := t1 - X3 ; X3 := t1 + X3 ;
+	fe_add(&x3, &x3, &z3)
+	fe_sub(&z3, &t1, &x3)
+	fe_add(&x3, &t1, &x3)
+
+	// Y3 := b * Y3 ; t1 := t2 + t2 ; t2 := t1 + t2 ;
+	fe_mul(&y3, &b_fe, &y3)
+	fe_add(&t1, &t2, &t2)
+	fe_add(&t2, &t1, &t2)
+
+	// Y3 := Y3 - t2 ; Y3 := Y3 - t0 ; t1 := Y3 + Y3 ;
+	fe_sub(&y3, &y3, &t2)
+	fe_sub(&y3, &y3, &t0)
+	fe_add(&t1, &y3, &y3)
+
+	// Y3 := t1 + Y3 ; t1 := t0 + t0 ; t0 := t1 + t0 ;
+	fe_add(&y3, &t1, &y3)
+	fe_add(&t1, &t0, &t0)
+	fe_add(&t0, &t1, &t0)
+
+	// t0 := t0 - t2 ; t1 := t4 * Y3 ; t2 := t0 * Y3 ;
+	fe_sub(&t0, &t0, &t2)
+	fe_mul(&t1, &t4, &y3)
+	fe_mul(&t2, &t0, &y3)
+
+	// Y3 := X3 * Z3 ; Y3 := Y3 + t2 ; X3 := t3 * X3 ;
+	fe_mul(&y3, &x3, &z3)
+	fe_add(&y3, &y3, &t2)
+	fe_mul(&x3, &t3, &x3)
+
+	// X3 := X3 - t1 ; Z3 := t4 * Z3 ; t1 := t3 * t0 ;
+	fe_sub(&x3, &x3, &t1)
+	fe_mul(&z3, &t4, &z3)
+	fe_mul(&t1, &t3, &t0)
+
+	// Z3 := Z3 + t1 ;
+	fe_add(&z3, &z3, &t1)
+
+	// return X3 , Y3 , Z3 ;
+	fe_set(&p.x, &x3)
+	fe_set(&p.y, &y3)
+	fe_set(&p.z, &z3)
+}
+
+@(private)
+pt_add_mixed :: proc "contextless" (p, a: ^$T, x2, y2: ^$U) {
+	// Algorithm 5 from "Complete addition formulas for prime
+	// order elliptic curves" by Renes, Costello, and Batina.
+	//
+	// The formula is mixed in that it assumes the z-coordinate
+	// of the addend (`Z2`) is `1`, meaning that it CAN NOT
+	// handle the addend being the point at infinity.
+	//
+	// The operation costs are `11M + 2mb + 23a` saving
+	// `1M + 6a` over `pt_add`.
+
+	when T == Point_p256r1 && U == Field_Element_p256r1 {
+		t0, t1, t2, t3, t4, b_fe: Field_Element_p256r1
+		x3, y3, z3: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&t0, &t1, &t2, &t3, &t4, &x3, &y3, &z3})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	x1, y1, z1 := &a.x, &a.y, &a.z
+
+	fe_b(&b_fe)
+
+	// t0 := X1 * X2 ; t1 := Y1 * Y2 ; t3 := X2 + Y2 ;
+	fe_mul(&t0, x1, x2)
+	fe_mul(&t1, y1, y2)
+	fe_add(&t3, x2, y2)
+
+	// t4 := X1 + Y1 ; t3 := t3 * t4 ; t4 := t0 + t1 ;
+	fe_add(&t4, x1, y1)
+	fe_mul(&t3, &t3, &t4)
+	fe_add(&t4, &t0, &t1)
+
+	// t3 := t3 − t4 ; t4 := Y2 * Z1 ; t4 := t4 + Y1 ;
+	fe_sub(&t3, &t3, &t4)
+	fe_mul(&t4, y2, z1)
+	fe_add(&t4, &t4, y1)
+
+	// Y3 := X2 * Z1 ; Y3 := Y3 + X1 ; Z3 := b * Z1 ;
+	fe_mul(&y3, x2, z1)
+	fe_add(&y3, &y3, x1)
+	fe_mul(&z3, &b_fe, z1)
+
+	// X3 := Y3 − Z3 ; Z3 := X3 + X3 ; X3 := X3 + Z3 ;
+	fe_sub(&x3, &y3, &z3)
+	fe_add(&z3, &x3, &x3)
+	fe_add(&x3, &x3, &z3)
+
+	// Z3 := t1 − X3 ; X3 := t1 + X3 ; Y3 := b * Y3 ;
+	fe_sub(&z3, &t1, &x3)
+	fe_add(&x3, &t1, &x3)
+	fe_mul(&y3, &b_fe, &y3)
+
+	// t1 := Z1 + Z1 ; t2 := t1 + Z1 ; Y3 := Y3 − t2 ;
+	fe_add(&t1, z1, z1)
+	fe_add(&t2, &t1, z1)
+	fe_sub(&y3, &y3, &t2)
+
+	// Y3 := Y3 − t0 ; t1 := Y3 + Y3 ; Y3 := t1 + Y3 ;
+	fe_sub(&y3, &y3, &t0)
+	fe_add(&t1, &y3, &y3)
+	fe_add(&y3, &t1, &y3)
+
+	// t1 := t0 + t0 ; t0 := t1 + t0 ; t0 := t0 − t2 ;
+	fe_add(&t1, &t0, &t0)
+	fe_add(&t0, &t1, &t0)
+	fe_sub(&t0, &t0, &t2)
+
+	// t1 := t4 * Y3 ; t2 := t0 * Y3 ; Y3 := X3 * Z3 ;
+	fe_mul(&t1, &t4, &y3)
+	fe_mul(&t2, &t0, &y3)
+	fe_mul(&y3, &x3, &z3)
+
+	// Y3 := Y3 + t2 ; X3 := t3 * X3 ; X3 := X3 − t1 ;
+	fe_add(&y3, &y3, &t2)
+	fe_mul(&x3, &t3, &x3)
+	fe_sub(&x3, &x3, &t1)
+
+	// Z3 := t4 * Z3 ; t1 := t3 * t0 ; Z3 := Z3 + t1 ;
+	fe_mul(&z3, &t4, &z3)
+	fe_mul(&t1, &t3, &t0)
+	fe_add(&z3, &z3, &t1)
+
+	// return X3 , Y3 , Z3 ;
+	fe_set(&p.x, &x3)
+	fe_set(&p.y, &y3)
+	fe_set(&p.z, &z3)
+}
+
+pt_double :: proc "contextless" (p, a: ^$T) {
+	// Algorithm 6 from "Complete addition formulas for prime
+	// order elliptic curves" by Renes, Costello, and Batina.
+	//
+	// The formula is complete in that it is valid for all a,
+	// without exceptions or extra assumptions about the inputs.
+	//
+	// The operation costs are `8M + 3S + 2mb + 21a`.
+
+	when T == Point_p256r1 {
+		t0, t1, t2, t3, b_fe: Field_Element_p256r1
+		x3, y3, z3: Field_Element_p256r1
+		defer fe_clear_vec([]^Field_Element_p256r1{&t0, &t1, &t2, &t3, &x3, &y3, &z3})
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	x, y, z := &a.x, &a.y, &a.z
+
+	fe_b(&b_fe)
+
+	// t0 := X ^2; t1 := Y ^2; t2 := Z ^2;
+	fe_square(&t0, x)
+	fe_square(&t1, y)
+	fe_square(&t2, z)
+
+	// t3 := X * Y ; t3 := t3 + t3 ; Z3 := X * Z ;
+	fe_mul(&t3, x, y)
+	fe_add(&t3, &t3, &t3)
+	fe_mul(&z3, x, z)
+
+	// Z3 := Z3 + Z3 ; Y3 := b * t2 ; Y3 := Y3 - Z3 ;
+	fe_add(&z3, &z3, &z3)
+	fe_mul(&y3, &b_fe, &t2)
+	fe_sub(&y3, &y3, &z3)
+
+	// X3 := Y3 + Y3 ; Y3 := X3 + Y3 ; X3 := t1 - Y3 ;
+	fe_add(&x3, &y3, &y3)
+	fe_add(&y3, &x3, &y3)
+	fe_sub(&x3, &t1, &y3)
+
+	// Y3 := t1 + Y3 ; Y3 := X3 * Y3 ; X3 := X3 * t3 ;
+	fe_add(&y3, &t1, &y3)
+	fe_mul(&y3, &x3, &y3)
+	fe_mul(&x3, &x3, &t3)
+
+	// t3 := t2 + t2 ; t2 := t2 + t3 ; Z3 := b * Z3 ;
+	fe_add(&t3, &t2, &t2)
+	fe_add(&t2, &t2, &t3)
+	fe_mul(&z3, &b_fe, &z3)
+
+	// Z3 := Z3 - t2 ; Z3 := Z3 - t0 ; t3 := Z3 + Z3 ;
+	fe_sub(&z3, &z3, &t2)
+	fe_sub(&z3, &z3, &t0)
+	fe_add(&t3, &z3, &z3)
+
+	// Z3 := Z3 + t3 ; t3 := t0 + t0 ; t0 := t3 + t0 ;
+	fe_add(&z3, &z3, &t3)
+	fe_add(&t3, &t0, &t0)
+	fe_add(&t0, &t3, &t0)
+
+	// t0 := t0 - t2 ; t0 := t0 * Z3 ; Y3 := Y3 + t0 ;
+	fe_sub(&t0, &t0, &t2)
+	fe_mul(&t0, &t0, &z3)
+	fe_add(&y3, &y3, &t0)
+
+	// t0 := Y * Z ; t0 := t0 + t0 ; Z3 := t0 * Z3 ;
+	fe_mul(&t0, y, z)
+	fe_add(&t0, &t0, &t0)
+	fe_mul(&z3, &t0, &z3)
+
+	// X3 := X3 - Z3 ; Z3 := t0 * t1 ; Z3 := Z3 + Z3 ;
+	fe_sub(&x3, &x3, &z3)
+	fe_mul(&z3, &t0, &t1)
+	fe_add(&z3, &z3, &z3)
+
+	// Z3 := Z3 + Z3 ;
+	fe_add(&z3, &z3, &z3)
+
+	// return X3 , Y3 , Z3 ;
+	fe_set(&p.x, &x3)
+	fe_set(&p.y, &y3)
+	fe_set(&p.z, &z3)
+}
+
+// pt_sub sets `p = a - b`, by negating b into a temporary and reusing
+// the complete addition formula.
+pt_sub :: proc "contextless" (p, a, b: ^$T) {
+	b_neg: T
+	pt_negate(&b_neg, b)
+	pt_add(p, a, &b_neg)
+
+	// b_neg is a whole point rather than a single field element, so
+	// it has to be wiped with the point routine.
+	pt_clear(&b_neg)
+}
+
+// pt_negate sets `p = -a`, by negating the y-coordinate and copying
+// x and z unchanged.
+pt_negate :: proc "contextless" (p, a: ^$T) {
+	fe_set(&p.x, &a.x)
+	fe_negate(&p.y, &a.y)
+	fe_set(&p.z, &a.z)
+}
+
+// pt_rescale sets p to a rescaled such that `Z = 1`, or to the canonical
+// identity `(0, 1, 0)` when a is the point at infinity.  p and a may be
+// the same point (pt_bytes calls this with `p == a`).
+pt_rescale :: proc "contextless" (p, a: ^$T) {
+	// A = 1/Z1
+	// X3 = A*X1
+	// Y3 = A*Y1
+	// Z3 = 1
+	//
+	// As per "From A to Z: Projective coordinates leakage in the wild"
+	// leaking the Z-coordinate is bad. The modular inversion algorithm
+	// used in this library is based on Fermat's Little Theorem.
+	//
+	// See: https://eprint.iacr.org/2020/432.pdf
+
+	// Record this before p is written, as p may alias a.
+	was_identity := pt_is_identity(a)
+
+	when T == Point_p256r1 {
+		z_inv: Field_Element_p256r1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	ident: T
+	fe_inv(&z_inv, &a.z)
+	fe_mul(&p.x, &a.x, &z_inv)
+	fe_mul(&p.y, &a.y, &z_inv)
+	fe_one(&p.z)
+
+	// Always build the identity, and let pt_cond_select pick it over
+	// the rescaled result iff the input was the point at infinity.
+	pt_identity(&ident)
+	pt_cond_select(p, p, &ident, was_identity)
+
+	fe_clear(&z_inv)
+}
+
+// pt_cond_select applies fe_cond_select to each coordinate with the
+// same ctrl.  The callers in this package pass a ctrl of 0 or 1, and
+// rely on p being set to b when `ctrl == 1` and to a when `ctrl == 0`.
+pt_cond_select :: proc "contextless" (p, a, b: ^$T, ctrl: int) {
+	fe_cond_select(&p.x, &a.x, &b.x, ctrl)
+	fe_cond_select(&p.y, &a.y, &b.y, ctrl)
+	fe_cond_select(&p.z, &a.z, &b.z, ctrl)
+}
+
+// pt_equal returns 1 when a and b represent the same projective point,
+// and 0 otherwise.  The comparison cross-multiplies by the
+// z-coordinates, so neither input needs to be rescaled first.
+@(require_results)
+pt_equal :: proc "contextless" (a, b: ^$T) -> int {
+	when T == Point_p256r1 {
+		x1z2, x2z1, y1z2, y2z1: Field_Element_p256r1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	// Check X1Z2 == X2Z1 && Y1Z2 == Y2Z1
+	fe_mul(&x1z2, &a.x, &b.z)
+	fe_mul(&x2z1, &b.x, &a.z)
+
+	fe_mul(&y1z2, &a.y, &b.z)
+	fe_mul(&y2z1, &b.y, &a.z)
+
+	// Bitwise AND of the two results, so both comparisons always run.
+	return fe_equal(&x1z2, &x2z1) & fe_equal(&y1z2, &y2z1)
+}
+
+// pt_is_identity returns the result of `fe_is_zero(&p.z)`, ie: 1 iff p
+// is the point at infinity (`Z = 0`), and 0 otherwise.
+@(require_results)
+pt_is_identity :: proc "contextless" (p: ^$T) -> int {
+	return fe_is_zero(&p.z)
+}
+
+// pt_is_y_odd returns the parity (as reported by fe_is_odd) of p's
+// y-coordinate after rescaling to `Z = 1`.  p itself is not modified,
+// the rescale is done on a copy that is cleared before returning.
+@(require_results)
+pt_is_y_odd :: proc "contextless" (p: ^$T) -> int {
+	tmp: T
+	defer pt_clear(&tmp)
+
+	// tmp and p are whole points, so the copy must use pt_set, and
+	// pt_rescale takes both a destination and a source.
+	pt_set(&tmp, p)
+	pt_rescale(&tmp, &tmp)
+
+	return fe_is_odd(&tmp.y)
+}
+
+// is_on_curve returns true iff the affine coordinates `(x, y)` satisfy
+// the curve equation `y^2 = x^3 + ax + b`.
+@(private)
+is_on_curve :: proc "contextless" (x, y: ^$T) -> bool {
+	maybe_yy, yy: T
+	defer fe_clear_vec([]^T{&maybe_yy, &yy})
+
+	// RHS: x^3 + ax + b
+	set_yy_candidate(&maybe_yy, x)
+
+	// LHS: y^2
+	fe_square(&yy, y)
+
+	return fe_equal(&maybe_yy, &yy) == 1
+}
+
+// set_yy_candidate sets `maybe_yy = x^3 + ax + b`, the value that `y^2`
+// must take for a point with the given x-coordinate to be on the curve.
+@(private)
+set_yy_candidate :: proc "contextless" (maybe_yy, x: ^$T) {
+	// RHS: x^3 + ax + b
+	rhs, tmp: T
+
+	// rhs = x^3
+	fe_square(&tmp, x)
+	fe_mul(&rhs, &tmp, x)
+
+	// rhs += a * x
+	fe_a(&tmp)
+	fe_mul(&tmp, &tmp, x)
+	fe_add(&rhs, &rhs, &tmp)
+
+	// maybe_yy = rhs + b
+	fe_b(&tmp)
+	fe_add(maybe_yy, &rhs, &tmp)
+
+	fe_clear(&rhs)
+}
--- a/core/crypto/_weierstrass/point_s11n_sec.odin
+++ b/core/crypto/_weierstrass/point_s11n_sec.odin
@@ -0,0 +1,95 @@
+package _weierstrass
+
+@(require) import "core:mem"
+
+@(private)
+SEC_PREFIX_IDENTITY        :: 0x00
+@(private)
+SEC_PREFIX_COMPRESSED_EVEN :: 0x02
+@(private)
+SEC_PREFIX_COMPRESSED_ODD  :: 0x03
+SEC_PREFIX_UNCOMPRESSED    :: 0x04
+
+// pt_set_sec_bytes sets p to the point decoded from the SEC encoding b,
+// and returns true iff decoding succeeded.  The accepted forms are the
+// single-byte identity (`0x00`), compressed (`0x02`/`0x03 || x`), and
+// uncompressed (`0x04 || x || y`), each with an exact length check.
+@(require_results)
+pt_set_sec_bytes :: proc "contextless" (p: ^$T, b: []byte) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	b_len := len(b)
+	if b_len < 1 {
+		return false
+	}
+
+	// The first byte selects the encoding.
+	switch b[0] {
+	case SEC_PREFIX_IDENTITY:
+		if b_len != 1 {
+			return false
+		}
+		pt_identity(p)
+		return true
+	case SEC_PREFIX_COMPRESSED_EVEN, SEC_PREFIX_COMPRESSED_ODD:
+		if b_len != 1 + FE_SZ {
+			return false
+		}
+		// 0 for the even prefix, 1 for the odd prefix.
+		y_is_odd := b[0] - SEC_PREFIX_COMPRESSED_EVEN
+		return pt_set_x_bytes(p, b[1:], int(y_is_odd))
+	case SEC_PREFIX_UNCOMPRESSED:
+		if b_len != 1 + 2 * FE_SZ {
+			return false
+		}
+		x, y := b[1:1+FE_SZ], b[1+FE_SZ:]
+		return pt_set_xy_bytes(p, x, y)
+	case:
+		return false
+	}
+}
+
+// pt_sec_bytes serializes p into b using the SEC encoding, and returns
+// true iff len(b) is exactly right for the encoding that is produced:
+// 1 byte for the point at infinity, `1 + FE_SZ` for compressed, and
+// `1 + 2 * FE_SZ` for uncompressed.
+@(require_results)
+pt_sec_bytes :: proc "contextless" (b: []byte, p: ^$T, compressed: bool) -> bool {
+	when T == Point_p256r1 {
+		FE_SZ :: FE_SIZE_P256R1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	n := len(b)
+
+	// The point at infinity always encodes to a lone prefix byte,
+	// regardless of `compressed`.
+	if pt_is_identity(p) == 1 {
+		if n != 1 {
+			return false
+		}
+		b[0] = SEC_PREFIX_IDENTITY
+		return true
+	}
+
+	x_dst, y_dst: []byte
+	y_scratch: [FE_SZ]byte
+	if compressed {
+		if n != 1 + FE_SZ {
+			return false
+		}
+		// y is only needed for its parity, so it goes to scratch space.
+		x_dst, y_dst = b[1:], y_scratch[:]
+	} else {
+		if n != 1 + 2 * FE_SZ {
+			return false
+		}
+		b[0] = SEC_PREFIX_UNCOMPRESSED
+		x_dst, y_dst = b[1:1+FE_SZ], b[1+FE_SZ:]
+	}
+	if !pt_bytes(x_dst, y_dst, p) {
+		return false
+	}
+	if compressed {
+		// Reading the parity off the serialized y avoids the
+		// redundant rescale that calling pt_is_y_odd would incur.
+		parity := byte(y_dst[FE_SZ-1] & 1)
+		b[0] = SEC_PREFIX_COMPRESSED_EVEN + parity
+		mem.zero_explicit(&y_scratch, size_of(y_scratch))
+	}
+
+	return true
+}
--- a/core/crypto/_weierstrass/sc.odin
+++ b/core/crypto/_weierstrass/sc.odin
@@ -0,0 +1,76 @@
+package _weierstrass
+
+import p256r1 "core:crypto/_fiat/field_scalarp256r1"
+import subtle "core:crypto/_subtle"
+
+Scalar_p256r1 :: p256r1.Montgomery_Domain_Field_Element
+
+SC_SIZE_P256R1 :: 32
+
+sc_clear :: proc {
+	p256r1.fe_clear,
+}
+
+sc_clear_vec :: proc {
+	p256r1.fe_clear_vec,
+}
+
+sc_set_bytes :: proc {
+	p256r1.fe_from_bytes,
+}
+sc_bytes :: proc {
+	p256r1.fe_to_bytes,
+}
+
+sc_set :: proc {
+	p256r1.fe_set,
+}
+
+sc_zero :: proc {
+	p256r1.fe_zero,
+}
+
+sc_one_p256r1 :: proc {
+	p256r1.fe_one,
+}
+
+sc_add :: proc {
+	p256r1.fe_add,
+}
+
+sc_sub :: proc {
+	p256r1.fe_sub,
+}
+
+sc_negate :: proc {
+	p256r1.fe_opp,
+}
+
+sc_mul :: proc {
+	p256r1.fe_mul,
+}
+
+sc_square :: proc {
+	p256r1.fe_square,
+}
+
+sc_cond_assign :: proc {
+	p256r1.fe_cond_assign,
+}
+
+sc_equal :: proc {
+	p256r1.fe_equal,
+}
+
+sc_is_odd :: proc {
+	p256r1.fe_is_odd,
+}
+
+sc_is_zero :: proc {
+	sc_is_zero_p256r1,
+}
+
+// sc_is_zero_p256r1 returns `subtle.u64_is_zero` applied to
+// `p256r1.fe_non_zero(fe)`, converted to an int.
+// NOTE(review): presumably this is 1 iff the scalar is zero, matching
+// how callers compare the result against 0 and 1 - confirm.
+@(require_results)
+sc_is_zero_p256r1 :: proc "contextless" (fe: ^Scalar_p256r1) -> int {
+	return int(subtle.u64_is_zero(p256r1.fe_non_zero(fe)))
+}
--- a/core/crypto/_weierstrass/scalar_mul.odin
+++ b/core/crypto/_weierstrass/scalar_mul.odin
@@ -0,0 +1,204 @@
+package _weierstrass
+
+import "core:crypto"
+import subtle "core:crypto/_subtle"
+import "core:mem"
+
+// pt_scalar_mul sets `p = sc * a`.  The scalar is serialized with
+// sc_bytes and the work is delegated to pt_scalar_mul_bytes.  Unless
+// unsafe_is_vartime is set, the serialized scalar is wiped afterwards.
+pt_scalar_mul :: proc "contextless" (
+	p, a: ^$T,
+	sc: ^$S,
+	unsafe_is_vartime: bool = false,
+) {
+	when T == Point_p256r1 && S == Scalar_p256r1 {
+		SC_SZ :: SC_SIZE_P256R1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	// Left uninitialized, as sc_bytes is handed the whole buffer.
+	b: [SC_SZ]byte = ---
+	sc_bytes(b[:], sc)
+
+	pt_scalar_mul_bytes(p, a, b[:], unsafe_is_vartime)
+
+	if !unsafe_is_vartime {
+		mem.zero_explicit(&b, size_of(b))
+	}
+}
+
+// pt_scalar_mul_bytes sets `p = sc * a`, where sc is a SC_SZ byte
+// serialized scalar (asserted).  The scalar is consumed one 4-bit window
+// at a time starting from the high nibble of `sc[0]`, with four
+// doublings between windows, so `sc[0]` is the most significant byte.
+//
+// Unless unsafe_is_vartime is set, the table lookups scan every entry,
+// and the table and temporaries are wiped before returning.
+pt_scalar_mul_bytes :: proc "contextless" (
+	p, a: ^$T,
+	sc: []byte,
+	unsafe_is_vartime: bool = false,
+) {
+	when T == Point_p256r1 {
+		p_tbl: Multiply_Table_p256r1 = ---
+		q, tmp: Point_p256r1 = ---, ---
+		SC_SZ :: SC_SIZE_P256R1
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	assert_contextless(len(sc) == SC_SZ, "weierstrass: invalid scalar size")
+	// p_tbl[i] = (i + 1) * a
+	mul_tbl_set(&p_tbl, a, unsafe_is_vartime)
+
+	pt_identity(&q)
+	for limb_byte, i in sc {
+		hi, lo := (limb_byte >> 4) & 0x0f, limb_byte & 0x0f
+
+		// q is still the identity on the first iteration, so the
+		// leading doublings can be skipped.
+		if i != 0 {
+			pt_double(&q, &q)
+			pt_double(&q, &q)
+			pt_double(&q, &q)
+			pt_double(&q, &q)
+		}
+		mul_tbl_lookup_add(&q, &tmp, &p_tbl, u64(hi), unsafe_is_vartime)
+
+		pt_double(&q, &q)
+		pt_double(&q, &q)
+		pt_double(&q, &q)
+		pt_double(&q, &q)
+		mul_tbl_lookup_add(&q, &tmp, &p_tbl, u64(lo), unsafe_is_vartime)
+	}
+
+	// The result is accumulated in q, so p may alias a.
+	pt_set(p, &q)
+
+	if !unsafe_is_vartime {
+		mem.zero_explicit(&p_tbl, size_of(p_tbl))
+		pt_clear_vec([]^T{&q, &tmp})
+	}
+}
+
+when crypto.COMPACT_IMPLS == true {
+	// pt_scalar_mul_generator sets `p = sc * G`, where G is the
+	// generator.  This variant has no precomputed tables, and simply
+	// calls pt_scalar_mul with G as the point.
+	pt_scalar_mul_generator :: proc "contextless" (
+		p: ^$T,
+		sc: ^$S,
+		unsafe_is_vartime: bool = false,
+	) {
+		g: T
+		pt_generator(&g)
+
+		pt_scalar_mul(p, &g, sc, unsafe_is_vartime)
+	}
+} else {
+	// pt_scalar_mul_generator sets `p = sc * G`, where G is the
+	// generator.  This variant uses the precomputed tables
+	// `Gen_Multiply_Table_p256r1_hi`/`_lo`: each byte of the serialized
+	// scalar has its own pair of tables (one per nibble), so the loop
+	// consists solely of table lookups and mixed additions, with no
+	// doublings.
+	pt_scalar_mul_generator :: proc "contextless" (
+		p: ^$T,
+		sc: ^$S,
+		unsafe_is_vartime: bool = false,
+	) {
+		when T == Point_p256r1 && S == Scalar_p256r1 {
+			p_tbl_hi := &Gen_Multiply_Table_p256r1_hi
+			p_tbl_lo := &Gen_Multiply_Table_p256r1_lo
+			tmp: Point_p256r1 = ---
+			SC_SZ :: SC_SIZE_P256R1
+		} else {
+			#panic("weierstrass: invalid curve")
+		}
+
+		b: [SC_SZ]byte
+		sc_bytes(b[:], sc)
+
+		pt_identity(p)
+		for limb_byte, i in b {
+			hi, lo := (limb_byte >> 4) & 0x0f, limb_byte & 0x0f
+			mul_affine_tbl_lookup_add(p, &tmp, &p_tbl_hi[i], u64(hi), unsafe_is_vartime)
+			mul_affine_tbl_lookup_add(p, &tmp, &p_tbl_lo[i], u64(lo), unsafe_is_vartime)
+		}
+
+		if !unsafe_is_vartime {
+			mem.zero_explicit(&b, size_of(b))
+			pt_clear(&tmp)
+		}
+	}
+}
+
+@(private="file")
+Multiply_Table_p256r1 :: [15]Point_p256r1
+
+// mul_tbl_set fills tbl with the first 15 multiples of point, such that
+// `tbl[i] = (i + 1) * point`.  Unless unsafe_is_vartime is set, the
+// running temporary is cleared before returning.
+@(private="file")
+mul_tbl_set :: proc "contextless"(
+	tbl: ^$T,
+	point: ^$U,
+	unsafe_is_vartime: bool,
+) {
+	when T == Multiply_Table_p256r1 && U == Point_p256r1{
+		tmp: Point_p256r1
+		pt_set(&tmp, point)
+	} else {
+		#panic("weierstrass: invalid curve")
+	}
+
+	pt_set(&tbl[0], &tmp)
+	for i in 1 ..<15 {
+		// tmp = (i + 1) * point
+		pt_add(&tmp, &tmp, point)
+		pt_set(&tbl[i], &tmp)
+	}
+
+	if !unsafe_is_vartime {
+		pt_clear(&tmp)
+	}
+}
+
+// mul_tbl_lookup_add adds the table entry selected by idx to point,
+// where idx is a 4-bit window value: 0 adds the identity, and any other
+// value adds `tbl[idx - 1]`.  tmp is caller provided scratch space.
+//
+// Unless unsafe_is_vartime is set, every table entry is visited and the
+// wanted one is picked with pt_cond_select, so that neither the memory
+// access pattern nor the control flow depends on idx.
+@(private="file")
+mul_tbl_lookup_add :: proc "contextless" (
+	point, tmp: ^$T,
+	tbl: ^$U,
+	idx: u64,
+	unsafe_is_vartime: bool,
+ ) {
+	if unsafe_is_vartime {
+		switch idx {
+		case 0:
+		case:
+			pt_add(point, point, &tbl[idx - 1])
+		}
+		return
+	}
+
+	// tmp stays the identity iff `idx == 0`.
+	pt_identity(tmp)
+	for i in u64(1)..<16 {
+		ctrl := subtle.eq(i, idx)
+		pt_cond_select(tmp, tmp, &tbl[i - 1], int(ctrl))
+	}
+
+	pt_add(point, point, tmp)
+}
+
+when crypto.COMPACT_IMPLS == false {
+	// Affine_Point_p256r1 is a point stored as `(x, y)` only, which is
+	// the entry type of the precomputed generator tables.
+	@(private)
+	Affine_Point_p256r1 :: struct {
+		x: Field_Element_p256r1,
+		y: Field_Element_p256r1,
+	}
+
+	// mul_affine_tbl_lookup_add adds the affine table entry selected by
+	// idx to point, where idx is a 4-bit window value: 0 leaves point
+	// unchanged, and any other value adds `tbl[idx - 1]` with the mixed
+	// addition formula.  tmp is caller provided scratch space.
+	//
+	// Unless unsafe_is_vartime is set, every table entry is visited and
+	// the result is picked with conditional selects rather than
+	// branches on idx.
+	@(private="file")
+	mul_affine_tbl_lookup_add :: proc "contextless" (
+		point, tmp: ^$T,
+		tbl: ^$U,
+		idx: u64,
+		unsafe_is_vartime: bool,
+	) {
+		if unsafe_is_vartime {
+			switch idx {
+			case 0:
+			case:
+				pt_add_mixed(point, point, &tbl[idx - 1].x, &tbl[idx - 1].y)
+			}
+			return
+		}
+
+		pt_identity(tmp)
+		for i in u64(1)..<16 {
+			ctrl := int(subtle.eq(i, idx))
+			fe_cond_select(&tmp.x, &tmp.x, &tbl[i - 1].x, ctrl)
+			fe_cond_select(&tmp.y, &tmp.y, &tbl[i - 1].y, ctrl)
+		}
+
+		// The mixed addition formula assumes that the addend is not
+		// the neutral element.  Do the addition regardless, and then
+		// conditionally select the right result.
+		pt_add_mixed(tmp, point, &tmp.x, &tmp.y)
+
+		// Keep the old value of point iff `idx == 0`.
+		ctrl := subtle.u64_is_non_zero(idx)
+		pt_cond_select(point, point, tmp, int(ctrl))
+	}
+}
--- a/core/crypto/_weierstrass/secp256r1_table.odin
+++ b/core/crypto/_weierstrass/secp256r1_table.odin
--- a/core/crypto/_weierstrass/tools/ecc_gen_tables.odin
+++ b/core/crypto/_weierstrass/tools/ecc_gen_tables.odin
@@ -0,0 +1,99 @@
+package weistrass_tools
+
+import secec "core:crypto/_weierstrass"
+import "core:fmt"
+import path "core:path/filepath"
+import "core:os"
+import "core:strings"
+
+// Yes this leaks memory, fite me IRL.
+
+GENERATED :: `/*
+	------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
+*/`
+
+main :: proc() {
+	gen_p256r1_tables()
+}
+
+gen_p256r1_tables :: proc() {
+	Affine_Point_p256r1 :: struct {
+		x: secec.Field_Element_p256r1,
+		y: secec.Field_Element_p256r1,
+	}
+	Multiply_Table_p256r1_hi: [32][15]Affine_Point_p256r1
+	Multiply_Table_p256r1_lo: [32][15]Affine_Point_p256r1
+
+	g, p: secec.Point_p256r1
+	secec.pt_generator(&g)
+
+	// Precompute ([1,15] << n) * G multiples of G, MSB->LSB
+	for i in 0..<32 {
+		b: [32]byte
+		for j in 1..<16 {
+			b[i] = u8(j) << 4
+			secec.pt_scalar_mul_bytes(&p, &g, b[:], true)
+			secec.pt_rescale(&p, &p)
+			secec.fe_set(&Multiply_Table_p256r1_hi[i][j-1].x, &p.x)
+			secec.fe_set(&Multiply_Table_p256r1_hi[i][j-1].y, &p.y)
+
+			b[i] = u8(j)
+			secec.pt_scalar_mul_bytes(&p, &g, b[:], true)
+			secec.pt_rescale(&p, &p)
+			secec.fe_set(&Multiply_Table_p256r1_lo[i][j-1].x, &p.x)
+			secec.fe_set(&Multiply_Table_p256r1_lo[i][j-1].y, &p.y)
+
+			b[i] = 0
+		}
+	}
+
+	fn := path.join({ODIN_ROOT, "core", "crypto", "_weierstrass", "secp256r1_table.odin"})
+	bld: strings.Builder
+	w := strings.to_writer(&bld)
+
+	fmt.wprintln(w, "package _weierstrass")
+	fmt.wprintln(w, "")
+	fmt.wprintln(w, GENERATED)
+	fmt.wprintln(w, "")
+	fmt.wprintln(w, "import \"core:crypto\"")
+	fmt.wprintln(w, "")
+	fmt.wprintln(w, "when crypto.COMPACT_IMPLS == false {")
+
+	fmt.wprintln(w, "\t@(private,rodata)")
+	fmt.wprintln(w, "\tGen_Multiply_Table_p256r1_hi := [32][15]Affine_Point_p256r1 {")
+	for &v, i in Multiply_Table_p256r1_hi {
+		fmt.wprintln(w, "\t\t{")
+		for &ap, j in v {
+			fmt.wprintln(w, "\t\t\t{")
+
+			x, y := &ap.x, &ap.y
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d},\n", x[0], x[1], x[2], x[3])
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d},\n", y[0], y[1], y[2], y[3])
+
+			fmt.wprintln(w, "\t\t\t},")
+		}
+		fmt.wprintln(w, "\t\t},")
+	}
+	fmt.wprintln(w, "\t}\n")
+
+	fmt.wprintln(w, "\t@(private,rodata)")
+	fmt.wprintln(w, "\tGen_Multiply_Table_p256r1_lo := [32][15]Affine_Point_p256r1 {")
+	for &v, i in Multiply_Table_p256r1_lo {
+		fmt.wprintln(w, "\t\t{")
+		for &ap, j in v {
+			fmt.wprintln(w, "\t\t\t{")
+
+			x, y := &ap.x, &ap.y
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d},\n", x[0], x[1], x[2], x[3])
+			fmt.wprintf(w, "\t\t\t\t{{%d, %d, %d, %d},\n", y[0], y[1], y[2], y[3])
+
+			fmt.wprintln(w, "\t\t\t},")
+		}
+		fmt.wprintln(w, "\t\t},")
+	}
+	fmt.wprintln(w, "\t}")
+
+	fmt.wprintln(w, "}")
+
+	_ = os.write_entire_file(fn, transmute([]byte)(strings.to_string(bld)))
+}
--- a/core/crypto/crypto.odin
+++ b/core/crypto/crypto.odin
@@ -2,8 +2,12 @@
 package crypto

 import "base:runtime"
+import subtle "core:crypto/_subtle"
 import "core:mem"

+// Omit large precomputed tables, trading off performance for size.
+COMPACT_IMPLS: bool : #config(ODIN_CRYPTO_COMPACT, false)
+
 // HAS_RAND_BYTES is true iff the runtime provides a cryptographic
 // entropy source.
 HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES
@@ -44,7 +48,17 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i

 	// After the loop, v == 0 iff a == b.  The subtraction will underflow
 	// iff v == 0, setting the sign-bit, which gets returned.
-	return int((u32(v)-1) >> 31)
+	return subtle.eq(0, v)
+}
+
+// is_zero_constant_time returns 1 iff b is all 0s, 0 otherwise.
+is_zero_constant_time :: proc "contextless" (b: []byte) -> int {
+	v: byte
+	for b_ in b {
+		v |= b_
+	}
+
+	return subtle.byte_eq(0, v)
 }

 // rand_bytes fills the dst buffer with cryptographic entropy taken from
--- a/core/crypto/ecdh/doc.odin
+++ b/core/crypto/ecdh/doc.odin
@@ -0,0 +1,4 @@
+/*
+A generic interface to Elliptic Curve Diffie-Hellman key exchange.
+*/
+package ecdh
--- a/core/crypto/ecdh/ecdh.odin
+++ b/core/crypto/ecdh/ecdh.odin
@@ -0,0 +1,404 @@
+package ecdh
+
+import "core:crypto"
+import secec "core:crypto/_weierstrass"
+import "core:crypto/x25519"
+import "core:crypto/x448"
+import "core:mem"
+import "core:reflect"
+
+// Note: For these primitives scalar size = point size
+@(private="file")
+X25519_Buf :: [x25519.SCALAR_SIZE]byte
+@(private="file")
+X448_Buf :: [x448.SCALAR_SIZE]byte
+
+// Curve is the curve identifier associated with a given Private_Key
+// or Public_Key.
+Curve :: enum {
+	Invalid,
+	SECP256R1,
+	X25519,
+	X448,
+}
+
+// CURVE_NAMES is the Curve to curve name string.
+CURVE_NAMES := [Curve]string {
+	.Invalid   = "Invalid",
+	.SECP256R1 = "secp256r1",
+	.X25519    = "X25519",
+	.X448      = "X448",
+}
+
+// PRIVATE_KEY_SIZES is the Curve to private key size in bytes.
+PRIVATE_KEY_SIZES := [Curve]int {
+	.Invalid   = 0,
+	.SECP256R1 = secec.SC_SIZE_P256R1,
+	.X25519    = x25519.SCALAR_SIZE,
+	.X448      = x448.SCALAR_SIZE,
+}
+
+// PUBLIC_KEY_SIZES is the Curve to public key size in bytes.
+PUBLIC_KEY_SIZES := [Curve]int {
+	.Invalid   = 0,
+	.SECP256R1 = 1 + 2 * secec.FE_SIZE_P256R1,
+	.X25519    = x25519.POINT_SIZE,
+	.X448      = x448.POINT_SIZE,
+}
+
+// SHARED_SECRET_SIZES is the Curve to shared secret size in bytes.
+SHARED_SECRET_SIZES := [Curve]int {
+	.Invalid   = 0,
+	.SECP256R1 = secec.FE_SIZE_P256R1,
+	.X25519    = x25519.POINT_SIZE,
+	.X448      = x448.POINT_SIZE,
+}
+
+@(private="file")
+_PRIV_IMPL_IDS := [Curve]typeid {
+	.Invalid           = nil,
+	.SECP256R1         = typeid_of(secec.Scalar_p256r1),
+	.X25519            = typeid_of(X25519_Buf),
+	.X448              = typeid_of(X448_Buf),
+}
+
+@(private="file")
+_PUB_IMPL_IDS := [Curve]typeid {
+	.Invalid           = nil,
+	.SECP256R1         = typeid_of(secec.Point_p256r1),
+	.X25519            = typeid_of(X25519_Buf),
+	.X448              = typeid_of(X448_Buf),
+}
+
+// Private_Key is an ECDH private key.
+Private_Key :: struct {
+	// WARNING: All of the members are to be treated as internal (ie:
+	// the Private_Key structure is intended to be opaque).
+	_curve: Curve,
+	_impl: union {
+		secec.Scalar_p256r1,
+		X25519_Buf,
+		X448_Buf,
+	},
+	_pub_key: Public_Key,
+}
+
+// Public_Key is an ECDH public key.
+Public_Key :: struct {
+	// WARNING: All of the members are to be treated as internal (ie:
+	// the Public_Key structure is intended to be opaque).
+	_curve: Curve,
+	_impl: union {
+		secec.Point_p256r1,
+		X25519_Buf,
+		X448_Buf,
+	},
+}
+
+// private_key_generate uses the system entropy source to generate a new
+// Private_Key.  This will fail iff the system entropy source is
+// missing or broken.
+private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
+	private_key_clear(priv_key)
+
+	if !crypto.HAS_RAND_BYTES {
+		return false
+	}
+
+	reflect.set_union_variant_typeid(
+		priv_key._impl,
+		_PRIV_IMPL_IDS[curve],
+	)
+
+	#partial switch curve {
+	case .SECP256R1:
+		sc := &priv_key._impl.(secec.Scalar_p256r1)
+
+		// 384-bits reduced makes the modulo bias insignificant
+		b: [48]byte = ---
+		defer (mem.zero_explicit(&b, size_of(b)))
+		for {
+			crypto.rand_bytes(b[:])
+			_ = secec.sc_set_bytes(sc, b[:])
+			if secec.sc_is_zero(sc) == 0 { // Likely
+				break
+			}
+		}
+	case .X25519:
+		sc := &priv_key._impl.(X25519_Buf)
+		crypto.rand_bytes(sc[:])
+	case .X448:
+		sc := &priv_key._impl.(X448_Buf)
+		crypto.rand_bytes(sc[:])
+	case:
+		panic("crypto/ecdh: invalid curve")
+	}
+
+	priv_key._curve = curve
+	private_key_generate_public(priv_key)
+
+	return true
+}
+
+// private_key_set_bytes decodes a byte-encoded private key, and returns
+// true iff the operation was successful.
+private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
+	private_key_clear(priv_key)
+
+	if len(b) != PRIVATE_KEY_SIZES[curve] {
+		return false
+	}
+
+	reflect.set_union_variant_typeid(
+		priv_key._impl,
+		_PRIV_IMPL_IDS[curve],
+	)
+
+	#partial switch curve {
+	case .SECP256R1:
+		sc := &priv_key._impl.(secec.Scalar_p256r1)
+		did_reduce := secec.sc_set_bytes(sc, b)
+		is_zero := secec.sc_is_zero(sc) == 1
+
+		// Reject `0` and scalars that are not less than the
+		// curve order.
+		if did_reduce || is_zero {
+			private_key_clear(priv_key)
+			return false
+		}
+	case .X25519:
+		sc := &priv_key._impl.(X25519_Buf)
+		copy(sc[:], b)
+	case .X448:
+		sc := &priv_key._impl.(X448_Buf)
+		copy(sc[:], b)
+	case:
+		panic("crypto/ecdh: invalid curve")
+	}
+
+	priv_key._curve = curve
+	private_key_generate_public(priv_key)
+
+	return true
+}
+
+@(private="file")
+private_key_generate_public :: proc(priv_key: ^Private_Key) {
+	switch &sc in priv_key._impl {
+	case secec.Scalar_p256r1:
+		pub_key: secec.Point_p256r1 = ---
+		secec.pt_scalar_mul_generator(&pub_key, &sc)
+		secec.pt_rescale(&pub_key, &pub_key)
+		priv_key._pub_key._impl = pub_key
+	case X25519_Buf:
+		pub_key: X25519_Buf = ---
+		x25519.scalarmult_basepoint(pub_key[:], sc[:])
+		priv_key._pub_key._impl = pub_key
+	case X448_Buf:
+		pub_key: X448_Buf = ---
+		x448.scalarmult_basepoint(pub_key[:], sc[:])
+		priv_key._pub_key._impl = pub_key
+	case:
+		panic("crypto/ecdh: invalid curve")
+	}
+
+	priv_key._pub_key._curve = priv_key._curve
+}
+
+// private_key_bytes sets dst to byte-encoding of priv_key.
+private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
+	ensure(priv_key._curve != .Invalid, "crypto/ecdh: uninitialized private key")
+	ensure(len(dst) == PRIVATE_KEY_SIZES[priv_key._curve], "crypto/ecdh: invalid destination size")
+
+	#partial switch priv_key._curve {
+	case .SECP256R1:
+		sc := &priv_key._impl.(secec.Scalar_p256r1)
+		secec.sc_bytes(dst, sc)
+	case .X25519:
+		sc := &priv_key._impl.(X25519_Buf)
+		copy(dst, sc[:])
+	case .X448:
+		sc := &priv_key._impl.(X448_Buf)
+		copy(dst, sc[:])
+	case:
+		panic("crypto/ecdh: invalid curve")
+	}
+}
+
+// private_key_equal returns true iff the private keys are equal,
+// in constant time.
+private_key_equal :: proc(p, q: ^Private_Key) -> bool {
+	if p._curve != q._curve {
+		return false
+	}
+
+	#partial switch p._curve {
+	case .SECP256R1:
+		sc_p, sc_q := &p._impl.(secec.Scalar_p256r1), &q._impl.(secec.Scalar_p256r1)
+		return secec.sc_equal(sc_p, sc_q) == 1
+	case .X25519:
+		b_p, b_q  := &p._impl.(X25519_Buf), &q._impl.(X25519_Buf)
+		return crypto.compare_constant_time(b_p[:], b_q[:]) == 1
+	case .X448:
+		b_p, b_q  := &p._impl.(X448_Buf), &q._impl.(X448_Buf)
+		return crypto.compare_constant_time(b_p[:], b_q[:]) == 1
+	case:
+		return false
+	}
+}
+
+// private_key_clear clears priv_key to the uninitialized state.
+private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
+	mem.zero_explicit(priv_key, size_of(Private_Key))
+}
+
+// public_key_set_bytes decodes a byte-encoded public key, and returns
+// true iff the operation was successful.
+public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
+	public_key_clear(pub_key)
+
+	if len(b) != PUBLIC_KEY_SIZES[curve] {
+		return false
+	}
+
+	reflect.set_union_variant_typeid(
+		pub_key._impl,
+		_PUB_IMPL_IDS[curve],
+	)
+
+	#partial switch curve {
+	case .SECP256R1:
+		if b[0] != secec.SEC_PREFIX_UNCOMPRESSED {
+			return false
+		}
+
+		pt := &pub_key._impl.(secec.Point_p256r1)
+		ok := secec.pt_set_sec_bytes(pt, b)
+		if !ok || secec.pt_is_identity(pt) == 1 {
+			return false
+		}
+	case .X25519:
+		pt := &pub_key._impl.(X25519_Buf)
+		copy(pt[:], b)
+	case .X448:
+		pt := &pub_key._impl.(X448_Buf)
+		copy(pt[:], b)
+	case:
+		panic("crypto/ecdh: invalid curve")
+	}
+
+	pub_key._curve = curve
+
+	return true
+}
+
+// public_key_set_priv sets pub_key to the public component of priv_key.
+public_key_set_priv :: proc(pub_key: ^Public_Key, priv_key: ^Private_Key) {
+	ensure(priv_key._curve != .Invalid, "crypto/ecdh: uninitialized private key")
+	public_key_clear(pub_key)
+	pub_key^ = priv_key._pub_key
+}
+
+// public_key_bytes sets dst to byte-encoding of pub_key.
+public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
+	ensure(pub_key._curve != .Invalid, "crypto/ecdh: uninitialized public key")
+	ensure(len(dst) == PUBLIC_KEY_SIZES[pub_key._curve], "crypto/ecdh: invalid destination size")
+
+	#partial switch pub_key._curve {
+	case .SECP256R1:
+		// Invariant: Unless the caller is manually building pub_key
+		// `Z = 1`, so we can skip the rescale.
+		pt := &pub_key._impl.(secec.Point_p256r1)
+
+		dst[0] = secec.SEC_PREFIX_UNCOMPRESSED
+		secec.fe_bytes(dst[1:1+secec.FE_SIZE_P256R1], &pt.x)
+		secec.fe_bytes(dst[1+secec.FE_SIZE_P256R1:], &pt.y)
+	case .X25519:
+		pt := &pub_key._impl.(X25519_Buf)
+		copy(dst, pt[:])
+	case .X448:
+		pt := &pub_key._impl.(X448_Buf)
+		copy(dst, pt[:])
+	case:
+		panic("crypto/ecdh: invalid curve")
+	}
+}
+
+// public_key_equal returns true iff the public keys are equal,
+// in constant time.
+public_key_equal :: proc(p, q: ^Public_Key) -> bool {
+	if p._curve != q._curve {
+		return false
+	}
+
+	#partial switch p._curve {
+	case .SECP256R1:
+		pt_p, pt_q := &p._impl.(secec.Point_p256r1), &q._impl.(secec.Point_p256r1)
+		return secec.pt_equal(pt_p, pt_q) == 1
+	case .X25519:
+		b_p, b_q  := &p._impl.(X25519_Buf), &q._impl.(X25519_Buf)
+		return crypto.compare_constant_time(b_p[:], b_q[:]) == 1
+	case .X448:
+		b_p, b_q  := &p._impl.(X448_Buf), &q._impl.(X448_Buf)
+		return crypto.compare_constant_time(b_p[:], b_q[:]) == 1
+	case:
+		panic("crypto/ecdh: invalid curve")
+	}
+}
+
+// public_key_clear clears pub_key to the uninitialized state.
+public_key_clear :: proc "contextless" (pub_key: ^Public_Key) {
+	mem.zero_explicit(pub_key, size_of(Public_Key))
+}
+
+// ecdh performs an Elliptic Curve Diffie-Hellman key exchange between
+// the Private_Key and Public_Key, writing the shared secret to dst.
+//
+// The neutral element is rejected as an error.
+@(require_results)
+ecdh :: proc(priv_key: ^Private_Key, pub_key: ^Public_Key, dst: []byte) -> bool {
+	ensure(priv_key._curve == pub_key._curve, "crypto/ecdh: curve mismatch")
+	ensure(pub_key._curve != .Invalid, "crypto/ecdh: uninitialized public key")
+	ensure(len(dst) == SHARED_SECRET_SIZES[priv_key._curve], "crypto/ecdh: invalid shared secret size")
+
+	#partial switch priv_key._curve {
+	case .SECP256R1:
+		sc, pt := &priv_key._impl.(secec.Scalar_p256r1), &pub_key._impl.(secec.Point_p256r1)
+		ss: secec.Point_p256r1
+		defer secec.pt_clear(&ss)
+
+		secec.pt_scalar_mul(&ss, pt, sc)
+		return secec.pt_bytes(dst, nil, &ss)
+	case .X25519:
+		sc, pt := &priv_key._impl.(X25519_Buf), &pub_key._impl.(X25519_Buf)
+		x25519.scalarmult(dst, sc[:], pt[:])
+	case .X448:
+		sc, pt := &priv_key._impl.(X448_Buf), &pub_key._impl.(X448_Buf)
+		x448.scalarmult(dst, sc[:], pt[:])
+	case:
+		panic("crypto/ecdh: invalid curve")
+	}
+
+	// X25519/X448: check for an all zero digest (the SECP256R1 case has already returned above).
+	return crypto.is_zero_constant_time(dst) == 0
+}
+
+// curve returns the Curve used by a Private_Key or Public_Key instance.
+curve :: proc(k: ^$T) -> Curve where(T == Private_Key || T == Public_Key) {
+	return k._curve
+}
+
+// key_size returns the key size of a Private_Key or Public_Key in bytes.
+key_size :: proc(k: ^$T) -> int where(T == Private_Key || T == Public_Key) {
+	when T == Private_Key {
+		return PRIVATE_KEY_SIZES[k._curve]
+	} else {
+		return PUBLIC_KEY_SIZES[k._curve]
+	}
+}
+
+// shared_secret_size returns the shared secret size of a key exchange
+// in bytes.
+shared_secret_size :: proc(k: ^$T) -> int  where(T == Private_Key || T == Public_Key) {
+	return SHARED_SECRET_SIZES[k._curve]
+}
--- a/core/crypto/ed25519/ed25519.odin
+++ b/core/crypto/ed25519/ed25519.odin
@@ -170,7 +170,7 @@ public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) ->

 // public_key_set_priv sets pub_key to the public component of priv_key.
 public_key_set_priv :: proc(pub_key: ^Public_Key, priv_key: ^Private_Key) {
-	ensure(priv_key._is_initialized, "crypto/ed25519: uninitialized public key")
+	ensure(priv_key._is_initialized, "crypto/ed25519: uninitialized private key")

 	src := &priv_key._pub_key
 	copy(pub_key._b[:], src._b[:])
--- a/core/crypto/x25519/x25519.odin
+++ b/core/crypto/x25519/x25519.odin
@@ -6,6 +6,8 @@ See:
 */
 package x25519

+import "core:crypto"
+import ed "core:crypto/_edwards25519"
 import field "core:crypto/_fiat/field_curve25519"
 import "core:mem"

@@ -14,8 +16,10 @@ SCALAR_SIZE :: 32
 // POINT_SIZE is the size of a X25519 point (public key/shared secret) in bytes.
 POINT_SIZE :: 32

-@(private, rodata)
-_BASE_POINT: [32]byte = {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+when crypto.COMPACT_IMPLS == true {
+	@(private,rodata)
+	_BASE_POINT: [32]byte = {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+}

@(private)
 _scalar_bit :: #force_inline proc "contextless" (s: ^[32]byte, i: int) -> u8 {
@@ -111,19 +115,44 @@ scalarmult :: proc(dst, scalar, point: []byte) {
 	e[31] &= 127
 	e[31] |= 64

-	p: [32]byte = ---
-	copy_slice(p[:], point)
-
-	d: [32]byte = ---
-	_scalarmult(&d, &e, &p)
-	copy_slice(dst, d[:])
+	p := (^[32]byte)(raw_data(point))
+	d := (^[32]byte)(raw_data(dst))
+	_scalarmult(d, &e, p)

 	mem.zero_explicit(&e, size_of(e))
-	mem.zero_explicit(&d, size_of(d))
 }

 // scalarmult_basepoint "multiplies" the provided scalar with the X25519
 // base point and writes the resulting point to dst.
 scalarmult_basepoint :: proc(dst, scalar: []byte) {
-	scalarmult(dst, scalar, _BASE_POINT[:])
+	when crypto.COMPACT_IMPLS == true {
+		scalarmult(dst, scalar, _BASE_POINT[:])
+	} else {
+		ensure(len(scalar) == SCALAR_SIZE, "crypto/x25519: invalid scalar size")
+		ensure(len(dst) == POINT_SIZE, "crypto/x25519: invalid destination point size")
+
+		sc: ed.Scalar = ---
+		ed.sc_set_bytes_rfc8032(&sc, scalar)
+
+		ge: ed.Group_Element = ---
+		ed.ge_scalarmult_basepoint(&ge, &sc)
+
+		// u = (y + z)/(z - y)
+		y_plus_z: field.Loose_Field_Element = ---
+		z_minus_y: field.Loose_Field_Element = ---
+		u: field.Tight_Field_Element = ---
+
+		field.fe_add(&y_plus_z, &ge.y, &ge.z)
+		field.fe_sub(&z_minus_y, &ge.z, &ge.y)
+		field.fe_carry_inv(&u, &z_minus_y)
+		field.fe_carry_mul(&u, &y_plus_z, field.fe_relax_cast(&u))
+
+		dst_ := (^[32]byte)(raw_data(dst))
+		field.fe_to_bytes(dst_, &u)
+
+		field.fe_clear_vec([]^field.Loose_Field_Element{&y_plus_z, &z_minus_y})
+		field.fe_clear(&u)
+		ed.sc_clear(&sc)
+		ed.ge_clear(&ge)
+	}
 }
--- a/core/encoding/base32/base32.odin
+++ b/core/encoding/base32/base32.odin
@@ -153,15 +153,15 @@ decode :: proc(
 		padding_count += 1
 	}

+	// Verify no padding in the middle
+	for i := 0; i < data_len - padding_count; i += 1 {
+		if data[i] == byte(PADDING) {
+			return nil, .Malformed_Input
+		}
+	}
+
 	// Check for proper padding and length combinations
 	if padding_count > 0 {
-		// Verify no padding in the middle
-		for i := 0; i < data_len - padding_count; i += 1 {
-			if data[i] == byte(PADDING) {
-				return nil, .Malformed_Input
-			}
-		}
-
 		content_len := data_len - padding_count
 		mod8 := content_len % 8
 		required_padding: int
--- a/core/encoding/base64/base64.odin
+++ b/core/encoding/base64/base64.odin
@@ -24,6 +24,18 @@ ENC_TABLE := [64]byte {
    '4', '5', '6', '7', '8', '9', '+', '/',
 }

+// Encoding table for Base64url variant
+ENC_URL_TABLE := [64]byte {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
+    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '-', '_',
+}
+
 PADDING :: '='

 DEC_TABLE := [256]u8 {
@@ -61,6 +73,43 @@ DEC_TABLE := [256]u8 {
     0,  0,  0,  0,  0,  0,  0,  0,
 }

+// Decoding table for Base64url variant
+DEC_URL_TABLE := [256]u8 {
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0, 62,  0,  0,
+    52, 53, 54, 55, 56, 57, 58, 59,
+    60, 61,  0,  0,  0,  0,  0,  0,
+     0,  0,  1,  2,  3,  4,  5,  6,
+     7,  8,  9, 10, 11, 12, 13, 14,
+    15, 16, 17, 18, 19, 20, 21, 22,
+    23, 24, 25,  0,  0,  0,  0, 63,
+     0, 26, 27, 28, 29, 30, 31, 32,
+    33, 34, 35, 36, 37, 38, 39, 40,
+    41, 42, 43, 44, 45, 46, 47, 48,
+    49, 50, 51,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+}
+
+
 encode :: proc(data: []byte, ENC_TBL := ENC_TABLE, allocator := context.allocator) -> (encoded: string, err: mem.Allocator_Error) #optional_allocator_error {
 	out_length := encoded_len(data)
 	if out_length == 0 {
--- a/core/encoding/entity/entity.odin
+++ b/core/encoding/entity/entity.odin
@@ -21,6 +21,7 @@ package encoding_unicode_entity
 		Jeroen van Rijn: Initial implementation.
 */

+import "base:runtime"
 import "core:unicode/utf8"
 import "core:unicode"
 import "core:strings"
@@ -141,8 +142,10 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 					write_string(&builder, entity)
 				} else {
 					if .No_Entity_Decode not_in options {
-						if decoded, ok := xml_decode_entity(entity); ok {
-							write_rune(&builder, decoded)
+						if decoded, count, ok := xml_decode_entity(entity); ok {
+							for i in 0..<count {
+								write_rune(&builder, decoded[i])
+							}
 							continue
 						}
 					}
@@ -212,17 +215,16 @@ advance :: proc(t: ^Tokenizer) -> (err: Error) {
 	}
 }

-xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
+xml_decode_entity :: proc(entity: string) -> (decoded: [2]rune, rune_count: int, ok: bool) {
 	entity := entity
-	if len(entity) == 0 { return -1, false }
+	if len(entity) == 0 { return }

-	switch entity[0] {
-	case '#':
+	if entity[0] == '#' {
 		base  := 10
 		val   := 0
 		entity = entity[1:]

-		if len(entity) == 0 { return -1, false }
+		if len(entity) == 0 { return }

 		if entity[0] == 'x' || entity[0] == 'X' {
 			base = 16
@@ -237,30 +239,275 @@ xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
 				val += int(r - '0')

 			case 'a'..='f':
-				if base == 10 { return -1, false }
+				if base == 10 { return }
 				val *= base
 				val += int(r - 'a' + 10)

 			case 'A'..='F':
-				if base == 10 { return -1, false }
+				if base == 10 { return }
 				val *= base
 				val += int(r - 'A' + 10)

 			case:
-				return -1, false
+				return
 			}

-			if val > MAX_RUNE_CODEPOINT { return -1, false }
+			if val > MAX_RUNE_CODEPOINT { return  }
 			entity = entity[1:]
 		}
-		return rune(val), true
-
-	case:
-		// Named entity.
-		return named_xml_entity_to_rune(entity)
+		return rune(val), 1, true
 	}
+	// Named entity.
+	return named_xml_entity_to_rune(entity)
 }

+
+// escape_html returns s with the five HTML-significant bytes rewritten as
+// character references: & ' < > and "
+//
+// Returns:
+// - output: s itself when nothing needed escaping, otherwise a newly allocated string.
+// - was_allocation: true only when output was allocated with `allocator`.
+@(require_results)
+escape_html :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> (output: string, was_allocation: bool) {
+	// Maps a byte to its replacement text, or "" when the byte is copied through as-is.
+	// The numeric forms &#39; and &#34; are used because they are shorter than
+	// &apos; and &quot; (NOTE: &apos; was not available until HTML 5).
+	replacement_for :: proc(c: byte) -> string {
+		switch c {
+		case '&':  return "&amp;"
+		case '\'': return "&#39;"
+		case '<':  return "&lt;"
+		case '>':  return "&gt;"
+		case '"':  return "&#34;"
+		}
+		return ""
+	}
+
+	src := transmute([]byte)s
+
+	// First pass: count the bytes the escapes add on top of len(s).
+	growth := 0
+	for c in src {
+		if rep := replacement_for(c); rep != "" {
+			growth += len(rep) - 1
+		}
+	}
+
+	// Nothing to escape: hand the input back without allocating.
+	if growth == 0 {
+		return s, false
+	}
+
+	dst, alloc_err := make([]byte, len(s) + growth, allocator, loc)
+	if alloc_err != nil {
+		// There is no error result, so a failed allocation yields ("", false).
+		return
+	}
+	was_allocation = true
+
+	// Second pass: emit either the replacement text or the original byte.
+	n := 0
+	for c in src {
+		if rep := replacement_for(c); rep != "" {
+			n += copy(dst[n:], rep)
+		} else {
+			dst[n] = c
+			n += 1
+		}
+	}
+	output = string(dst[:n])
+	return
+}
+
+
+// unescape_html replaces the character references in s (such as `&lt;` or `&#39;`)
+// with the bytes produced by unescape_entity. When s contains no '&' it is
+// returned as-is and nothing is allocated.
+@(require_results)
+unescape_html :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> (output: string, was_allocation: bool, err: runtime.Allocator_Error) {
+	// Appends the unescaped form of s to buf and returns its length in bytes.
+	// With a nil buf it only measures, which is how the first pass sizes the buffer.
+	@(require_results)
+	do_append :: proc(s: string, amp_idx: int, buf: ^[dynamic]byte) -> (n: int) {
+		s, amp_idx := s, amp_idx
+		n += len(s[:amp_idx])
+		if buf != nil { append(buf, s[:amp_idx]) }
+		s = s[amp_idx:]
+		for len(s) > 0 {
+			b, w, j := unescape_entity(s)
+			n += w
+			if buf != nil { append(buf, ..b[:w]) }
+			// unescape_entity consumes nothing (j == 0) for a lone trailing '&';
+			// step over that byte so this loop always makes progress and terminates.
+			s = s[max(j, 1):]
+
+			amp_idx = strings.index_byte(s, '&')
+			if amp_idx < 0 {
+				n += len(s)
+				if buf != nil { append(buf, s) }
+				break
+			}
+			n += amp_idx
+			if buf != nil { append(buf, s[:amp_idx]) }
+			s = s[amp_idx:]
+		}
+		return
+	}
+
+	s := s
+	amp_idx := strings.index_byte(s, '&')
+	if amp_idx < 0 {
+		return s, false, nil
+	}
+
+	// NOTE(bill): this does a two pass in order to minimize the allocations required
+	bytes_required := do_append(s, amp_idx, nil)
+	buf := make([dynamic]byte, 0, bytes_required, allocator, loc) or_return
+	was_allocation = true
+	_ = do_append(s, amp_idx, &buf)
+	assert(len(buf) == cap(buf))
+	output = string(buf[:])
+	return
+}
+
+// unescape_entity decodes the single XML/HTML character reference at the start of s.
+//
+// Returns:
+// - b, w: b[:w] is the UTF-8 encoding of the decoded entity; w stays 0 when nothing was decoded.
+// - j: How many bytes of s were consumed (0 when s is shorter than 2 bytes or lacks a leading '&').
+@(require_results)
+unescape_entity :: proc(s: string) -> (b: [8]byte, w: int, j: int) {
+	s := s
+	if len(s) < 2 {
+		return
+	}
+	if s[0] != '&' {
+		return
+	}
+	j = 1
+
+	if s[j] == '#' { // scan numbers
+		j += 1
+		if len(s) <= 3 { // remove `&#.`
+			return
+		}
+		c := s[j]
+		hex := false
+		if c == 'x' || c == 'X' {
+			hex = true
+			j += 1
+		}
+
+		x := rune(0)
+		scan_number: for j < len(s) {
+			if x > 0x10ffff { x = 0x110000 } // saturate so a long run of digits cannot wrap around into a valid rune
+			c = s[j]
+			j += 1
+			if hex {
+				switch c {
+				case '0'..='9': x = 16*x + rune(c) - '0';      continue scan_number
+				case 'a'..='f': x = 16*x + rune(c) - 'a' + 10; continue scan_number
+				case 'A'..='F': x = 16*x + rune(c) - 'A' + 10; continue scan_number
+				}
+			} else {
+				switch c {
+				case '0'..='9': x = 10*x + rune(c) - '0'; continue scan_number
+				}
+			}
+
+			// Keep the ';' to check for cases which require it and cases which might not
+			if c != ';' {
+				j -= 1
+			}
+			break scan_number
+		}
+
+		if j <= 3 { // no replacement characters found
+			return
+		}
+
+		@(static, rodata)
+		windows_1252_replacement_table := [0xa0 - 0x80]rune{ // Windows-1252 -> UTF-8
+			'\u20ac', '\u0081', '\u201a', '\u0192',
+			'\u201e', '\u2026', '\u2020', '\u2021',
+			'\u02c6', '\u2030', '\u0160', '\u2039',
+			'\u0152', '\u008d', '\u017d', '\u008f',
+			'\u0090', '\u2018', '\u2019', '\u201c',
+			'\u201d', '\u2022', '\u2013', '\u2014',
+			'\u02dc', '\u2122', '\u0161', '\u203a',
+			'\u0153', '\u009d', '\u017e', '\u0178',
+		}
+
+		switch x {
+		case 0x80..<0xa0:
+			x = windows_1252_replacement_table[x-0x80]
+		case 0, 0xd800..=0xdfff:
+			x = utf8.RUNE_ERROR
+		case:
+			if x > 0x10ffff {
+				x = utf8.RUNE_ERROR
+			}
+		}
+
+		b1, w1 := utf8.encode_rune(x)
+		w += copy(b[:], b1[:w1])
+		return
+	}
+
+	// Lookup by entity names
+	scan_ident: for j < len(s) { // scan over letters and digits
+		c := s[j]
+		j += 1
+
+		switch c {
+		case 'a'..='z', 'A'..='Z', '0'..='9':
+			continue scan_ident
+		}
+		// Keep the ';' to check for cases which require it and cases which might not
+		if c != ';' {
+			j -= 1
+		}
+		break scan_ident
+	}
+
+	entity_name := s[1:j]
+	if len(entity_name) == 0 {
+		return
+	}
+
+	if entity_name[len(entity_name)-1] == ';' {
+		entity_name = entity_name[:len(entity_name)-1]
+	}
+
+	if r2, _, ok := named_xml_entity_to_rune(entity_name); ok {
+		b1, w1 := utf8.encode_rune(r2[0])
+		w += copy(b[w:], b1[:w1])
+		if r2[1] != 0 {
+			b2, w2 := utf8.encode_rune(r2[1])
+			w += copy(b[w:], b2[:w2])
+		}
+		return
+	}
+
+	// The longest entities that do not end with a semicolon are <=6 bytes long
+	LONGEST_ENTITY_WITHOUT_SEMICOLON :: 6
+	n := min(len(entity_name)-1, LONGEST_ENTITY_WITHOUT_SEMICOLON)
+	for i := n; i > 1; i -= 1 {
+		if r2, _, ok := named_xml_entity_to_rune(entity_name[:i]); ok {
+			b1, w1 := utf8.encode_rune(r2[0])
+			w += copy(b[w:], b1[:w1])
+			if r2[1] != 0 {
+				b2, w2 := utf8.encode_rune(r2[1])
+				w += copy(b[w:], b2[:w2])
+			}
+			return b, w, 1 + i // consume only '&' plus the matched prefix; the rest of the name stays ordinary text
+		}
+	}
+	return
+}
+
+
 // Private XML helper to extract `&<stuff>;` entity.
@(private="file")
 _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
--- a/core/encoding/entity/generated.odin
+++ b/core/encoding/entity/generated.odin
--- a/core/encoding/json/marshal.odin
+++ b/core/encoding/json/marshal.odin
@@ -62,6 +62,78 @@ Marshal_Options :: struct {
 	mjson_skipped_first_braces_end: bool,
 }

+User_Marshaler :: #type proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> Marshal_Error
+
+Register_User_Marshaler_Error :: enum {
+	None,
+	No_User_Marshaler,
+	Marshaler_Previously_Found,
+}
+
+// Example User Marshaler:
+// Custom Marshaler for `int`
+// Some_Marshaler :: proc(w: io.Writer, v: any, opt: ^json.Marshal_Options) -> json.Marshal_Error {
+// 	io.write_string(w, fmt.tprintf("%b", v))
+// 	return json.Marshal_Data_Error.None
+// }
+//
+// main :: proc() {
+//	// Ensure the json._user_marshalers map is initialized
+//	json.set_user_marshalers(new(map[typeid]json.User_Marshaler))
+//	reg_err := json.register_user_marshaler(type_info_of(int).id, Some_Marshaler)
+//	assert(reg_err == .None)
+//
+//
+// 	// Use the custom marshaler
+// 	SomeType :: struct {
+// 		value: int,
+// 	}
+//
+// 	x := SomeType{42}
+// 	data, marshal_err := json.marshal(x)
+// 	assert(marshal_err == nil)
+// 	defer delete(data)
+//
+// 	fmt.println("Custom output:", string(data)) // Custom output: {"value":101010}
+// }
+
+// NOTE(Jeroen): This is a pointer to prevent accidental additions
+// it is prefixed with `_` rather than marked with a private attribute so that users can access it if necessary
+_user_marshalers: ^map[typeid]User_Marshaler
+
+// Sets user-defined marshalers for custom json marshaling of specific types
+//
+// Inputs:
+// - m: A pointer to a map of typeids to User_Marshaler procs.
+//
+// NOTE: Must be called before using register_user_marshaler.
+//
+set_user_marshalers :: proc(m: ^map[typeid]User_Marshaler) {
+	assert(_user_marshalers == nil, "set_user_marshalers must not be called more than once.")
+	_user_marshalers = m
+}
+
+// Registers a user-defined marshaler for a specific typeid
+//
+// Inputs:
+// - id: The typeid of the custom type.
+// - marshaler: The User_Marshaler function for the custom type.
+//
+// Returns: .None on success, .No_User_Marshaler when `_user_marshalers` is nil, or .Marshaler_Previously_Found when a non-nil marshaler is already registered for id.
+//
+// WARNING: set_user_marshalers must be called before using this procedure.
+//
+register_user_marshaler :: proc(id: typeid, marshaler: User_Marshaler) -> Register_User_Marshaler_Error {
+	if _user_marshalers == nil {
+		return .No_User_Marshaler
+	}
+	if prev, found := _user_marshalers[id]; found && prev != nil {
+		return .Marshaler_Previously_Found
+	}
+	_user_marshalers[id] = marshaler
+	return .None
+}
+
 marshal :: proc(v: any, opt: Marshal_Options = {}, allocator := context.allocator, loc := #caller_location) -> (data: []byte, err: Marshal_Error) {
 	b := strings.builder_make(allocator, loc)
 	defer if err != nil {
@@ -91,6 +163,13 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
 		return
 	}

+	if _user_marshalers != nil {
+		marshaler := _user_marshalers[v.id]
+		if marshaler != nil {
+			return marshaler(w, v, opt)
+		}
+	}
+
 	ti := runtime.type_info_base(type_info_of(v.id))
 	a := any{v.data, ti.id}

@@ -122,9 +201,9 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:

 	case runtime.Type_Info_Rune:
 		r := a.(rune)
-		io.write_byte(w, '"')                  or_return
-		io.write_escaped_rune(w, r, '"', true) or_return
-		io.write_byte(w, '"')                  or_return
+		io.write_byte(w, '"')                             or_return
+		io.write_escaped_rune(w, r, '"', for_json = true) or_return
+		io.write_byte(w, '"')                             or_return

 	case runtime.Type_Info_Float:
 		switch f in a {
@@ -414,6 +493,12 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:

 				opt_write_iteration(w, opt, first_iteration) or_return
 				first_iteration = false
+
+				if opt.pretty {
+					comment := reflect.struct_tag_get(reflect.Struct_Tag(info.tags[i]), "jsoncomment")
+					opt_write_comment(w, opt, &comment) or_return
+				}
+
 				if json_name != "" {
 					opt_write_key(w, opt, json_name) or_return
 				} else {
@@ -533,6 +618,26 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
 	return
 }

+// Newlines are split into multiple comment lines
+opt_write_comment :: proc(w: io.Writer, opt: ^Marshal_Options, comment: ^string) -> (err: io.Error) {
+	if comment^ == "" {
+		return nil
+	}
+
+	switch opt.spec {
+	case .JSON5, .MJSON:
+		for line in strings.split_iterator(comment, "\n") {
+			io.write_string(w, "// ") or_return
+			io.write_string(w, line) or_return
+			io.write_rune(w, '\n') or_return
+			opt_write_indentation(w, opt) or_return
+		}
+	case .JSON: return nil
+	}
+
+	return nil
+}
+
 // write key as quoted string or with optional quotes in mjson
 opt_write_key :: proc(w: io.Writer, opt: ^Marshal_Options, name: string) -> (err: io.Error)  {
 	switch opt.spec {
--- a/core/encoding/json/parser.odin
+++ b/core/encoding/json/parser.odin
@@ -38,7 +38,7 @@ parse_string :: proc(data: string, spec := DEFAULT_SPECIFICATION, parse_integers

 	switch p.spec {
 	case .JSON:
-		return parse_object(&p, loc)
+		return parse_value(&p, loc)
 	case .JSON5:
 		return parse_value(&p, loc)
 	case .SJSON:
@@ -84,7 +84,7 @@ expect_token :: proc(p: ^Parser, kind: Token_Kind) -> Error {


 parse_colon :: proc(p: ^Parser) -> (err: Error) {
-	colon_err := expect_token(p, .Colon) 
+	colon_err := expect_token(p, .Colon)
 	if colon_err == nil {
 		return nil
 	}
@@ -133,13 +133,13 @@ parse_value :: proc(p: ^Parser, loc := #caller_location) -> (value: Value, err:
 		f, _ := strconv.parse_f64(token.text)
 		value = Float(f)
 		return
-		
+
 	case .Ident:
 		if p.spec == .MJSON {
 			advance_token(p)
 			return clone_string(token.text, p.allocator, loc)
 		}
-		
+
 	case .String:
 		advance_token(p)
 		return unquote_string(token, p.spec, p.allocator, loc)
@@ -192,7 +192,7 @@ parse_array :: proc(p: ^Parser, loc := #caller_location) -> (value: Value, err:
 	for p.curr_token.kind != .Close_Bracket {
 		elem := parse_value(p, loc) or_return
 		append(&array, elem, loc)
-		
+
 		if parse_comma(p) {
 			break
 		}
@@ -278,7 +278,7 @@ parse_object_body :: proc(p: ^Parser, end_token: Token_Kind, loc := #caller_loca
 		if parse_comma(p) {
 			break
 		}
-	}	
+	}
 	return obj, .None
 }

@@ -481,4 +481,4 @@ unquote_string :: proc(token: Token, spec: Specification, allocator := context.a
 	}

 	return string(b[:w]), nil
-}
+}
--- a/core/encoding/json/types.odin
+++ b/core/encoding/json/types.odin
@@ -76,6 +76,7 @@ Error :: enum {
 	Invalid_Number,
 	String_Not_Terminated,
 	Invalid_String,
+	Invalid_Rune,


 	// Parsing Errors
--- a/core/encoding/json/unmarshal.odin
+++ b/core/encoding/json/unmarshal.odin
@@ -26,6 +26,80 @@ Unmarshal_Error :: union {
 	Unsupported_Type_Error,
 }

+User_Unmarshaler :: #type proc(p: ^Parser, v: any) -> Unmarshal_Error
+
+Register_User_Unmarshaler_Error :: enum {
+	None,
+	No_User_Unmarshaler,
+	Unmarshaler_Previously_Found,
+}
+
+// Example User Unmarshaler:
+// Custom Unmarshaler for `int`
+// Some_Unmarshaler :: proc(p: ^json.Parser, v: any) -> json.Unmarshal_Error {
+// 	token := p.curr_token.text
+// 	i, ok := strconv.parse_i64_of_base(token, 2)
+// 	if !ok {
+//		return .Invalid_Data
+//
+//	}
+//	(^int)(v.data)^ = int(i)
+//	return .None
+// }
+//
+// _main :: proc() {
+//	// Ensure the json._user_unmarshalers map is initialized
+//	json.set_user_unmarshalers(new(map[typeid]json.User_Unmarshaler))
+//	reg_err := json.register_user_unmarshaler(type_info_of(int).id, Some_Unmarshaler)
+//	assert(reg_err == .None)
+//
+//	data := `{"value":101010}`
+//	SomeType :: struct {
+//		value: int,
+//	}
+//	y: SomeType
+//
+//	unmarshal_err := json.unmarshal(transmute([]byte)data, &y)
+//	fmt.println(y, unmarshal_err)
+// }
+
+// NOTE(Jeroen): This is a pointer to prevent accidental additions
+// it is prefixed with `_` rather than marked with a private attribute so that users can access it if necessary
+_user_unmarshalers: ^map[typeid]User_Unmarshaler
+
+// Sets user-defined unmarshalers for custom json unmarshaling of specific types
+//
+// Inputs:
+// - m: A pointer to a map of typeids to User_Unmarshaler procs.
+//
+// NOTE: Must be called before using register_user_unmarshaler.
+//
+set_user_unmarshalers :: proc(m: ^map[typeid]User_Unmarshaler) {
+	assert(_user_unmarshalers == nil, "set_user_unmarshalers must not be called more than once.")
+	_user_unmarshalers = m
+}
+
+// Registers a user-defined unmarshaler for a specific typeid
+//
+// Inputs:
+// - id: The typeid of the custom type.
+// - unmarshaler: The User_Unmarshaler function for the custom type.
+//
+// Returns: A Register_User_Unmarshaler_Error value indicating the success or failure of the operation.
+//
+// WARNING: set_user_unmarshalers must be called before using this procedure.
+//
+register_user_unmarshaler :: proc(id: typeid, unmarshaler: User_Unmarshaler) -> Register_User_Unmarshaler_Error {
+	if _user_unmarshalers == nil {
+		return .No_User_Unmarshaler
+	}
+	if prev, found := _user_unmarshalers[id]; found && prev != nil {
+		return .Unmarshaler_Previously_Found
+	}
+	_user_unmarshalers[id] = unmarshaler
+	return .None
+}
+
 unmarshal_any :: proc(data: []byte, v: any, spec := DEFAULT_SPECIFICATION, allocator := context.allocator) -> Unmarshal_Error {
 	v := v
 	if v == nil || v.id == nil {
@@ -37,8 +111,10 @@ unmarshal_any :: proc(data: []byte, v: any, spec := DEFAULT_SPECIFICATION, alloc
 		return .Non_Pointer_Parameter
 	}
 	PARSE_INTEGERS :: true
-	
-	if !is_valid(data, spec, PARSE_INTEGERS) {
+
+	// If we have custom unmarshalers, we skip validation in case the custom data is not quite up to spec.
+	have_custom := _user_unmarshalers != nil && len(_user_unmarshalers) > 0
+	if !have_custom && !is_valid(data, spec, PARSE_INTEGERS) {
 		return .Invalid_Data
 	}
 	p := make_parser(data, spec, PARSE_INTEGERS, allocator)
@@ -225,6 +301,15 @@ unmarshal_string_token :: proc(p: ^Parser, val: any, token: Token, ti: ^reflect.
 		}
 		ok = true
 		return
+	case rune:
+		for rne, i in str {
+			if i > 0 {
+				dst = {}
+				return false, .Invalid_Rune
+			}
+			dst = rne
+		}
+		return true, nil
 	}
 	
 	#partial switch variant in ti.variant {
@@ -265,12 +350,18 @@ unmarshal_string_token :: proc(p: ^Parser, val: any, token: Token, ti: ^reflect.
 	return false, nil
 }

-
@(private)
 unmarshal_value :: proc(p: ^Parser, v: any) -> (err: Unmarshal_Error) {
 	UNSUPPORTED_TYPE := Unsupported_Type_Error{v.id, p.curr_token}
 	token := p.curr_token

+	if _user_unmarshalers != nil {
+		unmarshaler := _user_unmarshalers[v.id]
+		if unmarshaler != nil {
+			return unmarshaler(p, v)
+		}
+	}
+
 	v := v
 	ti := reflect.type_info_base(type_info_of(v.id))
 	if u, ok := ti.variant.(reflect.Type_Info_Union); ok && token.kind != .Null {
--- a/core/encoding/json/validator.odin
+++ b/core/encoding/json/validator.odin
@@ -5,10 +5,10 @@ import "core:mem"
 // NOTE(bill): is_valid will not check for duplicate keys
 is_valid :: proc(data: []byte, spec := DEFAULT_SPECIFICATION, parse_integers := false) -> bool {
 	p := make_parser(data, spec, parse_integers, mem.nil_allocator())
-	
+
 	switch p.spec {
 	case .JSON:
-		return validate_object(&p)
+		return validate_value(&p)
 	case .JSON5:
 		return validate_value(&p)
 	case .MJSON:
@@ -52,7 +52,7 @@ validate_object :: proc(p: ^Parser) -> bool {
 	if err := expect_token(p, .Open_Brace); err != .None {
 		return false
 	}
-	
+
 	validate_object_body(p, .Close_Brace) or_return

 	if err := expect_token(p, .Close_Brace); err != .None {
@@ -102,7 +102,7 @@ validate_value :: proc(p: ^Parser) -> bool {

 	case .Open_Bracket:
 		return validate_array(p)
-		
+
 	case .Ident:
 		if p.spec == .MJSON {
 			advance_token(p)
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -264,7 +264,7 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
 	expect(t, .Dash)
 	expect(t, .Gt)

-	return string(t.src[offset : t.offset - 1]), .None
+	return string(t.src[offset : t.offset - 3]), .None
 }

 // Skip CDATA
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -294,7 +294,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 					comment := scan_comment(t) or_return

 					if .Intern_Comments in opts.flags {
-						if len(doc.elements) == 0 {
+						if doc.element_count == 0 {
 							append(&doc.comments, comment)
 						} else {
 							el := new_element(doc)
@@ -308,6 +308,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 				case .Open_Bracket:
 					// This could be a CDATA tag part of a tag's body. Unread the `<![`
 					t.offset -= 3
+					t.read_offset = t.offset

 					// Instead of calling `parse_body` here, we could also `continue loop`
 					// and fall through to the `case:` at the bottom of the outer loop.
@@ -385,7 +386,8 @@ load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handl
 	return parse_bytes(data, options, filename, error_handler, allocator)
 }

-destroy :: proc(doc: ^Document) {
+destroy :: proc(doc: ^Document, allocator := context.allocator) {
+	context.allocator = allocator
 	if doc == nil { return }

 	for el in doc.elements {
@@ -625,4 +627,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
 	cur := doc.element_count
 	doc.element_count += 1
 	return cur
-}
+}
--- a/core/flags/constants.odin
+++ b/core/flags/constants.odin
@@ -11,8 +11,7 @@ NO_CORE_NAMED_TYPES :: #config(ODIN_CORE_FLAGS_NO_CORE_NAMED_TYPES, false)
 IMPORTING_TIME      :: #config(ODIN_CORE_FLAGS_USE_TIME, time.IS_SUPPORTED)

 // Override support for parsing `net` types.
-// TODO: Update this when the BSDs are supported.
-IMPORTING_NET       :: #config(ODIN_CORE_FLAGS_USE_NET, ODIN_OS == .Windows || ODIN_OS == .Linux || ODIN_OS == .Darwin || ODIN_OS == .FreeBSD)
+IMPORTING_NET       :: #config(ODIN_CORE_FLAGS_USE_NET, ODIN_OS == .Windows || ODIN_OS == .Linux || ODIN_OS == .Darwin || ODIN_OS == .FreeBSD || ODIN_OS == .NetBSD || ODIN_OS == .OpenBSD)

 TAG_ARGS          :: "args"
 SUBTAG_NAME       :: "name"
--- a/core/flags/errors.odin
+++ b/core/flags/errors.odin
@@ -1,5 +1,7 @@
 package flags

+import "base:runtime"
+import "core:net"
 import "core:os"

 Parse_Error_Reason :: enum {
@@ -24,6 +26,12 @@ Parse_Error :: struct {
 	message: string,
 }

+Unified_Parse_Error_Reason :: union #shared_nil {
+	Parse_Error_Reason,
+	runtime.Allocator_Error,
+	net.Parse_Endpoint_Error,
+}
+
 // Raised during parsing.
 // Provides more granular information than what just a string could hold.
 Open_File_Error :: struct {
--- a/core/flags/errors_bsd.odin
+++ b/core/flags/errors_bsd.odin
@@ -1,9 +0,0 @@
-#+build netbsd, openbsd
-package flags
-
-import "base:runtime"
-
-Unified_Parse_Error_Reason :: union #shared_nil {
-	Parse_Error_Reason,
-	runtime.Allocator_Error,
-}
--- a/core/flags/errors_nonbsd.odin
+++ b/core/flags/errors_nonbsd.odin
@@ -1,12 +0,0 @@
-#+build !netbsd
-#+build !openbsd
-package flags
-
-import "base:runtime"
-import "core:net"
-
-Unified_Parse_Error_Reason :: union #shared_nil {
-	Parse_Error_Reason,
-	runtime.Allocator_Error,
-	net.Parse_Endpoint_Error,
-}
--- a/core/flags/internal_rtti.odin
+++ b/core/flags/internal_rtti.odin
@@ -5,6 +5,7 @@ import "base:intrinsics"
 import "base:runtime"
 import "core:fmt"
 import "core:mem"
+import "core:net"
 import "core:os"
 import "core:reflect"
 import "core:strconv"
@@ -310,7 +311,18 @@ parse_and_set_pointer_by_named_type :: proc(ptr: rawptr, str: string, data_type:
 	}

 	when IMPORTING_NET {
-		if try_net_parse_workaround(data_type, str, ptr, out_error) {
+		if data_type == net.Host_Or_Endpoint {
+			addr, net_error := net.parse_hostname_or_endpoint(str)
+			if net_error != nil {
+				// We pass along `net.Error` here.
+				out_error^ = Parse_Error {
+					net_error,
+					"Invalid Host/Endpoint.",
+				}
+				return
+			}
+
+			(cast(^net.Host_Or_Endpoint)ptr)^ = addr
 			return
 		}
 	}
--- a/core/flags/internal_rtti_nonbsd.odin
+++ b/core/flags/internal_rtti_nonbsd.odin
@@ -1,32 +0,0 @@
-#+private
-#+build !netbsd
-#+build !openbsd
-package flags
-
-import "core:net"
-
-// This proc exists purely as a workaround for import restrictions.
-// Returns true if caller should return early.
-try_net_parse_workaround :: #force_inline proc (
-	data_type: typeid,
-	str: string,
-	ptr: rawptr,
-	out_error: ^Error,
-) -> bool {
-	if data_type == net.Host_Or_Endpoint {
-		addr, net_error := net.parse_hostname_or_endpoint(str)
-		if net_error != nil {
-			// We pass along `net.Error` here.
-			out_error^ = Parse_Error {
-				net_error,
-				"Invalid Host/Endpoint.",
-			}
-			return true
-		}
-
-		(cast(^net.Host_Or_Endpoint)ptr)^ = addr
-		return true
-	}
-
-	return false
-}
--- a/core/fmt/fmt.odin
+++ b/core/fmt/fmt.odin
--- a/core/image/general.odin
+++ b/core/image/general.odin
@@ -86,7 +86,7 @@ which_bytes :: proc(data: []byte) -> Which_File_Type {
 			return v
 		}
 		get16le :: #force_inline  proc(s: ^string) -> u16 {
-			v := u16(s[0]) | u16(s[1])<<16
+			v := u16(s[0]) | u16(s[1])<<8
 			s^ = s[2:]
 			return v
 		}
--- a/core/image/tga/tga.odin
+++ b/core/image/tga/tga.odin
@@ -152,7 +152,7 @@ load_from_context :: proc(ctx: ^$C, options := Options{}, allocator := context.a
 		// Intentionally blank
 	case .Uncompressed_Black_White:
 		black_white  = true
-		dest_depth   = 24
+		dest_depth   = 8 if .do_not_expand_grayscale in options else 24
 	case .Uncompressed_Color_Mapped:
 		color_mapped = true
 	case .Compressed_Color_Mapped:
@@ -161,7 +161,7 @@ load_from_context :: proc(ctx: ^$C, options := Options{}, allocator := context.a
 	case .Compressed_Black_White:
 		black_white  = true
 		rle_encoding = true
-		dest_depth   = 24
+		dest_depth   = 8 if .do_not_expand_grayscale in options else 24

 	case:
 		return nil, .Unsupported_Format
@@ -180,6 +180,9 @@ load_from_context :: proc(ctx: ^$C, options := Options{}, allocator := context.a
 	}

 	switch dest_depth {
+	case 8: // R8
+		src_channels = 1
+		dest_channels = 1
 	case 15: // B5G5R5
 		src_channels  = 2
 		dest_channels = 3
--- a/core/io/util.odin
+++ b/core/io/util.odin
@@ -21,12 +21,12 @@ write_ptr_at :: proc(w: Writer_At, p: rawptr, byte_size: int, offset: i64, n_wri
 }

 write_u64 :: proc(w: Writer, i: u64, base: int = 10, n_written: ^int = nil) -> (n: int, err: Error) {
-	buf: [32]byte
+	buf: [64]byte
 	s := strconv.write_bits(buf[:], i, base, false, 64, strconv.digits, nil)
 	return write_string(w, s, n_written)
 }
 write_i64 :: proc(w: Writer, i: i64, base: int = 10, n_written: ^int = nil) -> (n: int, err: Error) {
-	buf: [32]byte
+	buf: [65]byte
 	s := strconv.write_bits(buf[:], u64(i), base, true, 64, strconv.digits, nil)
 	return write_string(w, s, n_written)
 }
@@ -39,12 +39,12 @@ write_int :: proc(w: Writer, i: int, base: int = 10, n_written: ^int = nil) -> (
 }

 write_u128 :: proc(w: Writer, i: u128, base: int = 10, n_written: ^int = nil) -> (n: int, err: Error) {
-	buf: [39]byte
+	buf: [128]byte
 	s := strconv.write_bits_128(buf[:], i, base, false, 128, strconv.digits, nil)
 	return write_string(w, s, n_written)
 }
 write_i128 :: proc(w: Writer, i: i128, base: int = 10, n_written: ^int = nil) -> (n: int, err: Error) {
-	buf: [40]byte
+	buf: [129]byte
 	s := strconv.write_bits_128(buf[:], u128(i), base, true, 128, strconv.digits, nil)
 	return write_string(w, s, n_written)
 }
--- a/core/math/rand/rand.odin
+++ b/core/math/rand/rand.odin
@@ -260,7 +260,7 @@ Example:
 Possible Output:

 	6
-	500
+	13

 */
@(require_results)
@@ -301,7 +301,7 @@ Example:
 Possible Output:

 	6
-	500
+	13

 */
@(require_results)
@@ -342,7 +342,7 @@ Example:
 Possible Output:

 	6
-	500
+	13

 */
@(require_results)
--- a/core/math/rand/rand_pcg.odin
+++ b/core/math/rand/rand_pcg.odin
@@ -55,16 +55,20 @@ pcg_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Generator_M
 			intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r))
 		case:
 			// All other cases.
-			pos := i8(0)
-			val := u64(0)
-			for &v in p {
-				if pos == 0 {
-					val = read_u64(r)
-					pos = 8
+			n := len(p) / size_of(u64)
+			buff := ([^]u64)(raw_data(p))[:n]
+			for &e in buff {
+				intrinsics.unaligned_store(&e, read_u64(r))
+			}
+			// Handle remaining bytes
+			rem := len(p) % size_of(u64)
+			if rem > 0 {
+				val := read_u64(r)
+				tail := p[len(p) - rem:]
+				for &b in tail {
+					b = byte(val)
+					val >>= 8
 				}
-				v = byte(val)
-				val >>= 8
-				pos -= 1
 			}
 		}

--- a/core/math/rand/rand_xoshiro256.odin
+++ b/core/math/rand/rand_xoshiro256.odin
@@ -74,16 +74,20 @@ xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Gene
 			intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r))
 		case:
 			// All other cases.
-			pos := i8(0)
-			val := u64(0)
-			for &v in p {
-				if pos == 0 {
-					val = read_u64(r)
-					pos = 8
+			n := len(p) / size_of(u64)
+			buff := ([^]u64)(raw_data(p))[:n]
+			for &e in buff {
+				intrinsics.unaligned_store(&e, read_u64(r))
+			}
+			// Handle remaining bytes
+			rem := len(p) % size_of(u64)
+			if rem > 0 {
+				val := read_u64(r)
+				tail := p[len(p) - rem:]
+				for &b in tail {
+					b = byte(val)
+					val >>= 8
 				}
-				v = byte(val)
-				val >>= 8
-				pos -= 1
 			}
 		}

--- a/core/nbio/doc.odin
+++ b/core/nbio/doc.odin
@@ -0,0 +1,213 @@
+/*
+package nbio implements a non-blocking I/O and event loop abstraction layer
+over several platform-specific asynchronous I/O APIs.
+
+More examples can be found in Odin's examples repository
+at [[ examples/nbio ; https://github.com/odin-lang/examples/tree/master/nbio ]].
+
+**Event Loop**:
+
+Each thread may have at most one event loop associated with it.
+This is enforced by the package, as running multiple event loops on a single
+thread does not make sense.
+
+Event loops are reference counted and managed by the package.
+
+`acquire_thread_event_loop` and `release_thread_event_loop` can be used
+to acquire and release a reference. Acquiring must be done before any operation
+is done.
+
+The event loop progresses in ticks. A tick checks if any work is to be done,
+and based on the given timeout may block waiting for work.
+
+Ticks are typically done using the `tick`, `run`, and `run_until` procedures.
+
+Example:
+	package main
+
+	import "core:nbio"
+	import "core:time"
+	import "core:fmt"
+
+	main :: proc() {
+		err := nbio.acquire_thread_event_loop()
+		assert(err == nil)
+		defer nbio.release_thread_event_loop()
+
+		nbio.timeout(time.Second, proc(_: ^nbio.Operation) {
+			fmt.println("Hellope after 1 second!")
+		})
+
+		err = nbio.run()
+		assert(err == nil)
+	}
+
+
+**Time and timeouts**:
+
+Timeouts are intentionally *slightly inaccurate* by design.
+
+A timeout is not checked continuously, instead, it is evaluated only when
+a tick occurs. This means if a tick took a long time, your timeout may be ready
+for a bit of time already before the callback is called.
+
+The function `now` returns the current time as perceived by the event
+loop. This value is cached at least once per tick so it is fast to retrieve.
+
+Most operations also take an optional timeout when executed.
+If the timeout completes before the operation, the operation is cancelled and
+called back with a `.Timeout` error.
+
+
+**Threading**:
+
+The package has a concept of I/O threads (threads that are ticking) and worker
+threads (any other thread).
+
+An I/O thread is mostly self contained, operations are executed on it, and
+callbacks run on it.
+
+If you try to execute an operation on a thread that has no running event loop
+a panic will be executed. Instead a worker thread can execute operations onto
+a running event loop by taking its reference and executing operations with
+that reference.
+
+In this case:
+- The operation is enqueued from the worker thread
+- The I/O thread is optionally woken up from blocking for work with `wake_up`
+- The next tick, the operation is executed by the I/O thread
+- The callback is invoked on the I/O thread
+
+Example:
+	package main
+
+	import "core:nbio"
+	import "core:net"
+	import "core:thread"
+	import "core:time"
+
+	Connection :: struct {
+		loop:   ^nbio.Event_Loop,
+		socket: net.TCP_Socket,
+	}
+
+	main :: proc() {
+		workers: thread.Pool
+		thread.pool_init(&workers, context.allocator, 2)
+		thread.pool_start(&workers)
+
+		err := nbio.acquire_thread_event_loop()
+		defer nbio.release_thread_event_loop()
+		assert(err == nil)
+
+		server, listen_err := nbio.listen_tcp({nbio.IP4_Any, 1234})
+		assert(listen_err == nil)
+		nbio.accept_poly(server, &workers, on_accept)
+
+		err = nbio.run()
+		assert(err == nil)
+
+		on_accept :: proc(op: ^nbio.Operation, workers: ^thread.Pool) {
+			assert(op.accept.err == nil)
+
+			nbio.accept_poly(op.accept.socket, workers, on_accept)
+
+			thread.pool_add_task(workers, context.allocator, do_work, new_clone(Connection{
+				loop   = op.l,
+				socket = op.accept.client,
+			}))
+		}
+
+		do_work :: proc(t: thread.Task) {
+			connection := (^Connection)(t.data)
+
+			// Imagine CPU intensive work that's been offloaded to a worker thread.
+			time.sleep(time.Second * 1)
+
+			nbio.send_poly(connection.socket, {transmute([]byte)string("Hellope!\n")}, connection, on_sent, l=connection.loop)
+		}
+
+		on_sent :: proc(op: ^nbio.Operation, connection: ^Connection) {
+			assert(op.send.err == nil)
+			// Client got our message, clean up.
+			nbio.close(connection.socket)
+			free(connection)
+		}
+	}
+
+
+**Handle and socket association**:
+
+Most platforms require handles (files, sockets, etc.) to be explicitly
+associated with an event loop or configured for non-blocking/asynchronous
+operation.
+
+On some platforms (notably Windows), this requires a specific flag at open
+time (`.Non_Blocking` for `core:os`) and association may fail if the handle was
+not created correctly.
+
+For this reason, prefer `open` and `create_socket` from this package instead.
+
+`associate_handle`, `associate_file`, and `associate_socket` can be used for
+externally opened files/sockets.
+
+
+**Offsets and positional I/O**:
+
+Operations do not implicitly use or modify a handle’s internal file
+offset.
+
+Instead, operations such as `read` and `write` are *positional* and require
+an explicit offset.
+
+This avoids ambiguity and subtle bugs when multiple asynchronous operations
+are issued concurrently against the same handle.
+
+
+**Contexts and callbacks**:
+
+The `context` inside a callback is *not* the context that submitted the
+operation.
+
+Instead, the callback receives the context that was active when the event
+loop function (`tick`, `run`, etc.) was called.
+
+This is because otherwise the context would have to be copied and held onto for
+each operation.
+
+If the submitting context is required inside the callback, it must be copied
+into the operation’s user data explicitly.
+
+Example:
+	nbio.timeout_poly(time.Second, new_clone(context), proc(_: ^nbio.Operation, ctx: ^runtime.Context) {
+		context = ctx^
+		free(ctx)
+	})
+
+
+**Callback scheduling guarantees**:
+
+Callbacks are guaranteed to be invoked in a later tick, never synchronously.
+This means that the operation returned from a procedure is at least valid till
+the end of the current tick, because an operation is freed after its callback
+is called. Thus you can set user data after an execution is queued, or call
+`remove`, removing subtle "race" conditions and simplifying control flow.
+
+**Why does the design of this package use callbacks?**
+
+Callbacks are the simplest interface an event loop can reasonably expose: "Run
+this when the operation completes". This means the loop itself doesn’t need to
+know how the result is consumed (You could give control of the entire loop to
+the user, but that comes with all the problems in that).
+
+Other mechanisms can be built on top of this, such as coroutines or even a queue
+you consume at your own time.
+
+The choice of one `Operation` type and one callback type for all the operations
+is partly so it is easy to bind into Lua-like or native coroutines.
+
+Callbacks also allow multiple independent users to share the same event loop.
+A package can register its own operations, and application code can register
+others, without either seeing or handling the other’s completions.
+*/
+package nbio
--- a/core/nbio/errors.odin
+++ b/core/nbio/errors.odin
@@ -0,0 +1,84 @@
+package nbio
+
+import "base:intrinsics"
+
+import "core:reflect"
+
+Error :: intrinsics.type_merge(
+	Network_Error,
+	union #shared_nil {
+		General_Error,
+		FS_Error,
+	},
+)
+#assert(size_of(Error) == 8)
+
+// Errors regarding general usage of the event loop.
+General_Error :: enum i32 {
+	None,
+
+	Allocation_Failed = i32(PLATFORM_ERR_ALLOCATION_FAILED),
+	Unsupported       = i32(PLATFORM_ERR_UNSUPPORTED),
+}
+
+// Errors gotten from file system operations.
+FS_Error :: enum i32 {
+	None,
+	Unsupported        = i32(PLATFORM_ERR_UNSUPPORTED),
+	Allocation_Failed  = i32(PLATFORM_ERR_ALLOCATION_FAILED),
+	Timeout            = i32(PLATFORM_ERR_TIMEOUT),
+	Invalid_Argument   = i32(PLATFORM_ERR_INVALID_ARGUMENT),
+	Permission_Denied  = i32(PLATFORM_ERR_PERMISSION_DENIED),
+	EOF                = i32(PLATFORM_ERR_EOF),
+	Exists             = i32(PLATFORM_ERR_EXISTS),
+	Not_Found          = i32(PLATFORM_ERR_NOT_FOUND),
+}
+
+Platform_Error :: _Platform_Error
+
+error_string :: proc(err: Error) -> string {
+	err := err
+	variant := any{
+		id   = reflect.union_variant_typeid(err),
+		data = &err,
+	}
+	str := reflect.enum_string(variant)
+
+	if str == "" {
+		#partial switch uerr in err {
+		case FS_Error:
+			str, _ = reflect.enum_name_from_value(Platform_Error(uerr))
+		case General_Error:
+			str, _ = reflect.enum_name_from_value(Platform_Error(uerr))
+		}
+	}
+	if str == "" {
+		str = "Unknown"
+	}
+
+	return str
+}
+
+error_string_recv :: proc(recv_err: Recv_Error) -> string {
+	switch err in recv_err {
+	case TCP_Recv_Error: return error_string(err)
+	case UDP_Recv_Error: return error_string(err)
+	case:                return "Unknown"
+	}
+}
+
+error_string_send :: proc(send_err: Send_Error) -> string {
+	switch err in send_err {
+	case TCP_Send_Error: return error_string(err)
+	case UDP_Send_Error: return error_string(err)
+	case:                return "Unknown"
+	}
+}
+
+error_string_sendfile :: proc(send_err: Send_File_Error) -> string {
+	switch err in send_err {
+	case TCP_Send_Error: return error_string(err)
+	case FS_Error:       return error_string(err)
+	case:                return "Unknown"
+	}
+}
--- a/core/nbio/errors_linux.odin
+++ b/core/nbio/errors_linux.odin
@@ -0,0 +1,16 @@
+#+private
+package nbio
+
+import "core:sys/linux"
+
+PLATFORM_ERR_UNSUPPORTED       :: linux.Errno.ENOSYS
+PLATFORM_ERR_ALLOCATION_FAILED :: linux.Errno.ENOMEM
+PLATFORM_ERR_TIMEOUT           :: linux.Errno.ECANCELED
+PLATFORM_ERR_INVALID_ARGUMENT  :: linux.Errno.EINVAL
+PLATFORM_ERR_OVERFLOW          :: linux.Errno.E2BIG
+PLATFORM_ERR_NOT_FOUND         :: linux.Errno.ENOENT
+PLATFORM_ERR_EXISTS            :: linux.Errno.EEXIST
+PLATFORM_ERR_PERMISSION_DENIED :: linux.Errno.EPERM
+PLATFORM_ERR_EOF               :: -100 // There is no EOF errno, we use negative for our own error codes.
+
+_Platform_Error :: linux.Errno
--- a/core/nbio/errors_others.odin
+++ b/core/nbio/errors_others.odin
@@ -0,0 +1,20 @@
+#+build !darwin
+#+build !freebsd
+#+build !openbsd
+#+build !netbsd
+#+build !linux
+#+build !windows
+#+private
+package nbio
+
+PLATFORM_ERR_UNSUPPORTED       :: 1
+PLATFORM_ERR_ALLOCATION_FAILED :: 2
+PLATFORM_ERR_TIMEOUT           :: 3
+PLATFORM_ERR_INVALID_ARGUMENT  :: 4
+PLATFORM_ERR_OVERFLOW          :: 5
+PLATFORM_ERR_NOT_FOUND         :: 6
+PLATFORM_ERR_EXISTS            :: 7
+PLATFORM_ERR_PERMISSION_DENIED :: 8
+PLATFORM_ERR_EOF               :: 9
+
+_Platform_Error :: enum i32 {}
--- a/core/nbio/errors_posix.odin
+++ b/core/nbio/errors_posix.odin
@@ -0,0 +1,17 @@
+#+build darwin, freebsd, netbsd, openbsd
+#+private
+package nbio
+
+import "core:sys/posix"
+
+PLATFORM_ERR_UNSUPPORTED       :: posix.Errno.ENOSYS
+PLATFORM_ERR_ALLOCATION_FAILED :: posix.Errno.ENOMEM
+PLATFORM_ERR_TIMEOUT           :: posix.Errno.ECANCELED
+PLATFORM_ERR_INVALID_ARGUMENT  :: posix.Errno.EINVAL
+PLATFORM_ERR_OVERFLOW          :: posix.Errno.E2BIG
+PLATFORM_ERR_NOT_FOUND         :: posix.Errno.ENOENT
+PLATFORM_ERR_EXISTS            :: posix.Errno.EEXIST
+PLATFORM_ERR_PERMISSION_DENIED :: posix.Errno.EPERM
+PLATFORM_ERR_EOF               :: -100 // There is no EOF errno, we use negative for our own error codes.
+
+_Platform_Error :: posix.Errno
--- a/core/nbio/errors_windows.odin
+++ b/core/nbio/errors_windows.odin
@@ -0,0 +1,17 @@
+#+private
+package nbio
+
+import win "core:sys/windows"
+
+PLATFORM_ERR_UNSUPPORTED       :: win.System_Error.NOT_SUPPORTED
+
+PLATFORM_ERR_ALLOCATION_FAILED :: win.System_Error.OUTOFMEMORY
+PLATFORM_ERR_TIMEOUT           :: win.System_Error.WAIT_TIMEOUT
+PLATFORM_ERR_INVALID_ARGUMENT  :: win.System_Error.BAD_ARGUMENTS
+PLATFORM_ERR_OVERFLOW          :: win.System_Error.BUFFER_OVERFLOW
+PLATFORM_ERR_NOT_FOUND         :: win.System_Error.FILE_NOT_FOUND
+PLATFORM_ERR_EXISTS            :: win.System_Error.FILE_EXISTS
+PLATFORM_ERR_PERMISSION_DENIED :: win.System_Error.ACCESS_DENIED
+PLATFORM_ERR_EOF               :: win.System_Error.HANDLE_EOF
+
+_Platform_Error :: win.System_Error
--- a/core/nbio/impl.odin
+++ b/core/nbio/impl.odin
@@ -0,0 +1,259 @@
+#+private
+package nbio
+
+import "base:runtime"
+import "base:intrinsics"
+
+import "core:container/pool"
+import "core:net"
+import "core:strings"
+import "core:time"
+import "core:reflect"
+
+@(init, private)
+init_thread_local_cleaner :: proc "contextless" () {
+	runtime.add_thread_local_cleaner(proc() {
+		l := &_tls_event_loop
+		if l.refs > 0 {
+			l.refs = 1
+			_release_thread_event_loop()
+		}
+	})
+}
+
+@(thread_local)
+_tls_event_loop: Event_Loop
+
+// Adds a reference to this thread's event loop, initializing it on the first one; a failed init is kept in `l.err`.
+_acquire_thread_event_loop :: proc() -> General_Error {
+	l := &_tls_event_loop
+	if l.err == nil && l.refs == 0 {
+		// `&&` binds tighter than `||`: parenthesize so the Orca exclusion also applies to wasm32.
+		when (ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32) && ODIN_OS != .Orca {
+			allocator := runtime.default_wasm_allocator()
+		} else {
+			allocator := runtime.heap_allocator()
+		}
+		l.allocator = allocator
+
+		if alloc_err := mpsc_init(&l.queue, 128, l.allocator); alloc_err != nil {
+			l.err = .Allocation_Failed
+			return l.err
+		}
+		defer if l.err != nil { mpsc_destroy(&l.queue, l.allocator) }
+
+		if pool_err := pool.init(&l.operation_pool, "_pool_link"); pool_err != nil {
+			l.err = .Allocation_Failed
+			return l.err
+		}
+		defer if l.err != nil { pool.destroy(&l.operation_pool) }
+
+		l.err = _init(l, allocator)
+		l.now = time.now()
+	}
+	if l.err != nil {
+		return l.err
+	}
+
+	l.refs += 1
+	return nil
+}
+
+_release_thread_event_loop :: proc() {
+	l := &_tls_event_loop
+	if l.err != nil {
+		assert(l.refs == 0)
+		return
+	}
+
+	if l.refs > 0 {
+		l.refs -= 1
+		if l.refs == 0 {
+			mpsc_destroy(&l.queue, l.allocator)
+			pool.destroy(&l.operation_pool)
+			_destroy(l)
+			l^ = {}
+		}
+	}
+}
+
+_current_thread_event_loop :: #force_inline proc(loc := #caller_location) -> (^Event_Loop) {
+	l := &_tls_event_loop
+
+	if intrinsics.expect(l.refs == 0, false) {
+		return nil
+	}
+
+	return l
+}
+
+_tick :: proc(l: ^Event_Loop, timeout: time.Duration) -> (err: General_Error) {
+	// Receive operations queued from other threads first.
+	for {
+		op := (^Operation)(mpsc_dequeue(&l.queue))
+		if op == nil { break }
+		_exec(op)
+	}
+
+	return __tick(l, timeout)
+}
+
+_listen_tcp :: proc(
+	l: ^Event_Loop,
+	endpoint: Endpoint,
+	backlog := 1000,
+	loc := #caller_location,
+) -> (
+	socket: TCP_Socket,
+	err: Network_Error,
+) {
+	family := family_from_endpoint(endpoint)
+	socket = create_tcp_socket(family, l, loc) or_return
+	defer if err != nil { close(socket, l=l) }
+
+	net.set_option(socket, .Reuse_Address, true)
+
+	bind(socket, endpoint) or_return
+
+	_listen(socket, backlog) or_return
+	return
+}
+
+_read_entire_file :: proc(l: ^Event_Loop, path: string, user_data: rawptr, cb: Read_Entire_File_Callback, allocator := context.allocator, dir := CWD) {
+	open_poly3(path, user_data, cb, allocator, on_open, dir=dir, l=l)
+
+	on_open :: proc(op: ^Operation, user_data: rawptr, cb: Read_Entire_File_Callback, allocator: runtime.Allocator) {
+		if op.open.err != nil {
+			cb(user_data, nil, {.Open, op.open.err})
+			return
+		}
+
+		stat_poly3(op.open.handle, user_data, cb, allocator, on_stat)
+	}
+
+	on_stat :: proc(op: ^Operation, user_data: rawptr, cb: Read_Entire_File_Callback, allocator: runtime.Allocator) {
+		if op.stat.err != nil {
+			close(op.stat.handle)
+			cb(user_data, nil, {.Stat, op.stat.err})
+			return
+		}
+
+		if op.stat.type != .Regular {
+			close(op.stat.handle)
+			cb(user_data, nil, {.Stat, .Unsupported})
+			return
+		}
+
+		buf, err := make([]byte, op.stat.size, allocator)
+		if err != nil {
+			close(op.stat.handle)
+			cb(user_data, nil, {.Read, .Allocation_Failed})
+			return
+		}
+
+		read_poly3(op.stat.handle, 0, buf, user_data, cb, allocator, on_read, all=true)
+	}
+
+	on_read :: proc(op: ^Operation, user_data: rawptr, cb: Read_Entire_File_Callback, allocator: runtime.Allocator) {
+		close(op.read.handle)
+
+		if op.read.err != nil {
+			delete(op.read.buf, allocator)
+			cb(user_data, nil, {.Read, op.read.err})
+			return
+		}
+
+		assert(op.read.read == len(op.read.buf))
+		cb(user_data, op.read.buf, {})
+	}
+}
+
+NBIO_DEBUG :: #config(NBIO_DEBUG, false)
+
+Debuggable :: union {
+	Operation_Type,
+	string,
+	int,
+	time.Time,
+	time.Duration,
+}
+
+@(disabled=!NBIO_DEBUG)
+debug :: proc(contents: ..Debuggable, location := #caller_location) {
+	if context.logger.procedure == nil || .Debug < context.logger.lowest_level {
+		return
+	}
+
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+
+	b: strings.Builder
+	b.buf.allocator = context.temp_allocator
+
+	strings.write_string(&b, "[nbio] ")
+
+	for content, i in contents {
+		switch val in content {
+		case Operation_Type:
+			name, _ := reflect.enum_name_from_value(val)
+			strings.write_string(&b, name)
+		case string:
+			strings.write_string(&b, val)
+		case int:
+			strings.write_int(&b, val)
+		case time.Duration:
+			ms := time.duration_milliseconds(val)
+			strings.write_f64(&b, ms, 'f')
+			strings.write_string(&b, "ms")
+
+		case time.Time:
+			buf: [time.MIN_HMS_LEN+1]byte
+			h, m, s, ns := time.precise_clock_from_time(val)
+			buf[8] = '.'
+			buf[7] = '0' + u8(s % 10); s /= 10
+			buf[6] = '0' + u8(s)
+			buf[5] = ':'
+			buf[4] = '0' + u8(m % 10); m /= 10
+			buf[3] = '0' + u8(m)
+			buf[2] = ':'
+			buf[1] = '0' + u8(h % 10); h /= 10
+			buf[0] = '0' + u8(h)
+
+			strings.write_string(&b, string(buf[:]))
+			strings.write_int(&b, ns)
+		}
+
+		if i < len(contents)-1 {
+			strings.write_byte(&b, ' ')
+		}
+	}
+
+	context.logger.procedure(context.logger.data, .Debug, strings.to_string(b), context.logger.options, location)
+}
+
+warn :: proc(text: string, location := #caller_location) {
+	if context.logger.procedure == nil || .Warning < context.logger.lowest_level {
+		return
+	}
+
+	context.logger.procedure(context.logger.data, .Warning, text, context.logger.options, location)
+}
+
+// Trims the tail of `bufs` in place until at most MAX_RW bytes remain; `total` is the length before trimming.
+@(require_results)
+constraint_bufs_to_max_rw :: proc(bufs: [][]byte) -> (constrained: [][]byte, total: int) {
+	for buf in bufs {
+		total += len(buf)
+	}
+
+	constrained = bufs
+	for n := total; n > MAX_RW; {
+		last := &constrained[len(constrained)-1]
+		take := min(len(last), n-MAX_RW)
+		last^ = last[:len(last)-take] // `take` bytes are dropped from the tail; the rest is kept.
+		if len(last) == 0 {
+			constrained = constrained[:len(constrained)-1]
+		}
+		n -= take
+	}
+	return
+}
--- a/core/nbio/impl_linux.odin
+++ b/core/nbio/impl_linux.odin
--- a/core/nbio/impl_others.odin
+++ b/core/nbio/impl_others.odin
@@ -0,0 +1,218 @@
+#+build !darwin
+#+build !freebsd
+#+build !openbsd
+#+build !netbsd
+#+build !linux
+#+build !windows
+#+private
+package nbio
+
+import "core:container/avl"
+import "core:container/pool"
+import "core:container/queue"
+import "core:mem"
+import "core:slice"
+import "core:time"
+
+_FULLY_SUPPORTED :: false
+
+_Event_Loop :: struct {
+	completed: queue.Queue(^Operation),
+	timeouts:  avl.Tree(^Operation),
+}
+
+_Handle :: uintptr
+
+_CWD :: Handle(-100)
+
+MAX_RW :: mem.Gigabyte
+
+_Operation :: struct {
+	removed: bool,
+}
+
+_Accept :: struct {}
+
+_Close :: struct {}
+
+_Dial :: struct {}
+
+_Recv :: struct {
+	small_bufs: [1][]byte,
+}
+
+_Send :: struct {
+	small_bufs: [1][]byte,
+}
+
+_Read :: struct {}
+
+_Write :: struct {}
+
+_Timeout :: struct {
+	expires: time.Time,
+}
+
+_Poll :: struct {}
+
+_Send_File :: struct {}
+
+_Open :: struct {}
+
+_Stat :: struct {}
+
+_Splice :: struct {}
+
+_Remove :: struct {}
+
+_Link_Timeout :: struct {}
+
+_init :: proc(l: ^Event_Loop, allocator: mem.Allocator) -> (rerr: General_Error) {
+	l.completed.data.allocator = allocator
+
+	avl.init_cmp(&l.timeouts, timeouts_cmp, allocator)
+
+	return nil
+
+	timeouts_cmp :: #force_inline proc(a, b: ^Operation) -> slice.Ordering {
+		switch {
+		case a.timeout._impl.expires._nsec < b.timeout._impl.expires._nsec:
+			return .Less
+		case a.timeout._impl.expires._nsec > b.timeout._impl.expires._nsec:
+			return .Greater
+		case uintptr(a) < uintptr(b):
+			return .Less
+		case uintptr(a) > uintptr(b):
+			return .Greater
+		case:
+			assert(a == b)
+			return .Equal
+		}
+	}
+}
+
+_destroy :: proc(l: ^Event_Loop) {
+	queue.destroy(&l.completed)
+	avl.destroy(&l.timeouts, false)
+}
+
+__tick :: proc(l: ^Event_Loop, timeout: time.Duration) -> General_Error {
+	l.now = time.now()
+
+	for op in queue.pop_front_safe(&l.completed) {
+		if !op._impl.removed {
+			op.cb(op)
+		}
+		if !op.detached {
+			pool.put(&l.operation_pool, op)
+		}
+	}
+
+	iter := avl.iterator(&l.timeouts, .Forward)
+	for node in avl.iterator_next(&iter) {
+		op := node.value
+		cexpires := time.diff(l.now, op.timeout._impl.expires)
+
+		done := cexpires <= 0
+		if done {
+			op.cb(op)
+			avl.remove_node(&l.timeouts, node)
+			if !op.detached {
+				pool.put(&l.operation_pool, op)
+			}
+			continue
+		}
+
+		break
+	}
+
+	return nil
+}
+
+_create_socket :: proc(l: ^Event_Loop, family: Address_Family, protocol: Socket_Protocol) -> (socket: Any_Socket, err: Create_Socket_Error) {
+	return nil, .Network_Unreachable
+}
+
+_listen :: proc(socket: TCP_Socket, backlog := 1000) -> Listen_Error {
+	return .Network_Unreachable
+}
+
+_exec :: proc(op: ^Operation) {
+	switch op.type {
+	case .Timeout:
+		_, _, err := avl.find_or_insert(&op.l.timeouts, op)
+		if err != nil {
+			panic("nbio: allocation failure")
+		}
+		return
+	case .Accept:
+		op.accept.err = .Network_Unreachable
+	case .Close:
+		op.close.err = .Unsupported
+	case .Dial:
+		op.dial.err = Dial_Error.Network_Unreachable
+	case .Recv:
+		switch _ in op.recv.socket {
+		case TCP_Socket: op.recv.err = TCP_Recv_Error.Network_Unreachable
+		case UDP_Socket: op.recv.err = UDP_Recv_Error.Network_Unreachable
+		case:            op.recv.err = TCP_Recv_Error.Network_Unreachable
+		}
+	case .Send:
+		switch _ in op.send.socket {
+		case TCP_Socket: op.send.err = TCP_Send_Error.Network_Unreachable
+		case UDP_Socket: op.send.err = UDP_Send_Error.Network_Unreachable
+		case:            op.send.err = TCP_Send_Error.Network_Unreachable
+		}
+	case .Send_File:
+		op.sendfile.err = .Network_Unreachable
+	case .Read:
+		op.read.err = .Unsupported
+	case .Write:
+		op.write.err = .Unsupported
+	case .Poll:
+		op.poll.result = .Error
+	case .Open:
+		op.open.err = .Unsupported
+	case .Stat:
+		op.stat.err = .Unsupported
+	case .None, ._Link_Timeout, ._Remove, ._Splice:
+		fallthrough
+	case:
+		unreachable()
+	}
+
+	_, err := queue.push_back(&op.l.completed, op)
+	if err != nil {
+		panic("nbio: allocation failure")
+	}
+}
+
+_remove :: proc(target: ^Operation) {
+	#partial switch target.type {
+	case .Timeout:
+		avl.remove_value(&target.l.timeouts, target)
+		if !target.detached {
+			pool.put(&target.l.operation_pool, target)
+		}
+	case:
+		target._impl.removed = true
+	}
+}
+
+_open_sync :: proc(l: ^Event_Loop, path: string, dir: Handle, mode: File_Flags, perm: Permissions) -> (handle: Handle, err: FS_Error) {
+	return 0, FS_Error.Unsupported
+}
+
+_associate_handle :: proc(handle: uintptr, l: ^Event_Loop) -> (Handle, Association_Error) {
+	return Handle(handle), nil
+}
+
+_associate_socket :: proc(socket: Any_Socket, l: ^Event_Loop) -> Association_Error {
+	return nil
+}
+
+_wake_up :: proc(l: ^Event_Loop) {
+}
+
+_yield :: proc() {
+}
--- a/core/nbio/impl_posix.odin
+++ b/core/nbio/impl_posix.odin
--- a/core/nbio/impl_posix_darwin.odin
+++ b/core/nbio/impl_posix_darwin.odin
@@ -0,0 +1,29 @@
+#+private
+package nbio
+
+import "core:net"
+import "core:sys/posix"
+
+foreign import lib "system:System"
+
+posix_sendfile :: proc(fd: Handle, s: TCP_Socket, offset, nbytes: int) -> (sent: int, ok := true) {
+	foreign lib {
+		@(link_name="sendfile")
+		_posix_sendfile :: proc (fd, s: posix.FD, offset: posix.off_t, len: ^posix.off_t, hdtr: rawptr, flags: i32) -> posix.result ---
+	}
+
+	len := posix.off_t(nbytes)
+	if _posix_sendfile(posix.FD(fd), posix.FD(s), posix.off_t(offset), &len, nil, 0) != .OK {
+		ok = false
+	}
+	sent = int(len)
+	return
+}
+
+posix_listen_error   :: net._listen_error
+posix_accept_error   :: net._accept_error
+posix_dial_error     :: net._dial_error
+posix_tcp_send_error :: net._tcp_send_error
+posix_udp_send_error :: net._udp_send_error
+posix_tcp_recv_error :: net._tcp_recv_error
+posix_udp_recv_error :: net._udp_recv_error
--- a/core/nbio/impl_posix_freebsd.odin
+++ b/core/nbio/impl_posix_freebsd.odin
@@ -0,0 +1,52 @@
+#+private
+package nbio
+
+import "core:net"
+import "core:sys/posix"
+import "core:sys/freebsd"
+
+foreign import lib "system:c"
+
+// TODO: rewrite freebsd implementation to use `sys/freebsd` instead of `sys/posix`.
+
+// Calls FreeBSD's `sendfile`; `sent` is read back from the `sbytes` out-parameter even when the call fails.
+posix_sendfile :: proc(fd: Handle, s: TCP_Socket, offset, nbytes: int) -> (sent: int, ok := true) {
+	foreign lib {
+		@(link_name="sendfile")
+		_posix_sendfile :: proc (fd, s: posix.FD, offset: posix.off_t, nbytes: uint, hdtr: rawptr, sbytes: ^posix.off_t, flags: i32) -> posix.result ---
+	}
+
+	sbytes: posix.off_t
+	status := _posix_sendfile(posix.FD(fd), posix.FD(s), posix.off_t(offset), uint(nbytes), nil, &sbytes, 0)
+	ok      = status == .OK
+	sent    = int(sbytes)
+	return
+}
+
+posix_listen_error :: proc() -> Listen_Error {
+	return net._listen_error(freebsd.Errno(posix.errno()))
+}
+
+posix_accept_error :: proc() -> Accept_Error {
+	return net._accept_error(freebsd.Errno(posix.errno()))
+}
+
+posix_dial_error :: proc() -> Dial_Error {
+	return net._dial_error(freebsd.Errno(posix.errno()))
+}
+
+posix_tcp_send_error :: proc() -> TCP_Send_Error {
+	return net._tcp_send_error(freebsd.Errno(posix.errno()))
+}
+
+posix_udp_send_error :: proc() -> UDP_Send_Error {
+	return net._udp_send_error(freebsd.Errno(posix.errno()))
+}
+
+posix_tcp_recv_error :: proc() -> TCP_Recv_Error {
+	return net._tcp_recv_error(freebsd.Errno(posix.errno()))
+}
+
+posix_udp_recv_error :: proc() -> UDP_Recv_Error {
+	return net._udp_recv_error(freebsd.Errno(posix.errno()))
+}
--- a/core/nbio/impl_posix_netbsd.odin
+++ b/core/nbio/impl_posix_netbsd.odin
@@ -0,0 +1,12 @@
+#+private
+package nbio
+
+import "core:net"
+
+posix_listen_error   :: net._listen_error
+posix_accept_error   :: net._accept_error
+posix_dial_error     :: net._dial_error
+posix_tcp_send_error :: net._tcp_send_error
+posix_udp_send_error :: net._udp_send_error
+posix_tcp_recv_error :: net._tcp_recv_error
+posix_udp_recv_error :: net._udp_recv_error
--- a/core/nbio/impl_posix_openbsd.odin
+++ b/core/nbio/impl_posix_openbsd.odin
@@ -0,0 +1,12 @@
+#+private
+package nbio
+
+import "core:net"
+
+posix_listen_error   :: net._listen_error
+posix_accept_error   :: net._accept_error
+posix_dial_error     :: net._dial_error
+posix_tcp_send_error :: net._tcp_send_error
+posix_udp_send_error :: net._udp_send_error
+posix_tcp_recv_error :: net._tcp_recv_error
+posix_udp_recv_error :: net._udp_recv_error
--- a/core/nbio/impl_windows.odin
+++ b/core/nbio/impl_windows.odin
--- a/core/nbio/mpsc.odin
+++ b/core/nbio/mpsc.odin
@@ -0,0 +1,63 @@
+#+private
+package nbio
+
+import "base:runtime"
+
+import "core:sync"
+
+Multi_Producer_Single_Consumer :: struct {
+	count:  int, // slots currently reserved or filled; modified atomically in `mpsc_enqueue` and `mpsc_dequeue`
+	head:   int, // ever-increasing producer ticket, reduced with `mask` to a slot index
+	tail:   int, // index of the next slot to dequeue; only written in `mpsc_dequeue`
+	buffer: []rawptr, // slot storage; a `nil` entry is treated as an empty slot
+	mask:   int, // `cap - 1`, set in `mpsc_init`
+}
+
+mpsc_init :: proc(mpscq: ^Multi_Producer_Single_Consumer, cap: int, allocator: runtime.Allocator) -> runtime.Allocator_Error {
+	assert(runtime.is_power_of_two_int(cap), "cap must be a power of 2") // needed so `head & mask` (see `mpsc_enqueue`) stays a valid index
+	mpscq.buffer = make([]rawptr, cap, allocator) or_return // an allocation failure is returned to the caller
+	mpscq.mask   = cap-1
+	sync.atomic_thread_fence(.Release) // NOTE(review): presumably publishes the initialised fields before the queue is shared — confirm
+	return nil
+}
+
+mpsc_destroy :: proc(mpscq: ^Multi_Producer_Single_Consumer, allocator: runtime.Allocator) {
+	delete(mpscq.buffer, allocator)
+}
+
+mpsc_enqueue :: proc(mpscq: ^Multi_Producer_Single_Consumer, obj: rawptr) -> bool { // `obj` must not be nil: `mpsc_dequeue` treats a nil slot as empty
+	count := sync.atomic_add_explicit(&mpscq.count, 1, .Acquire) // reserve a slot; `count` is the value before the add
+	if count >= len(mpscq.buffer) {
+		sync.atomic_sub_explicit(&mpscq.count, 1, .Release) // queue is full: give the reservation back
+		return false
+	}
+
+	head := sync.atomic_add_explicit(&mpscq.head, 1, .Acquire) // claim a ticket; it is masked into a slot index below
+	assert(mpscq.buffer[head & mpscq.mask] == nil)
+	rv := sync.atomic_exchange_explicit(&mpscq.buffer[head & mpscq.mask], obj, .Release) // store `obj` where the consumer will find it
+	assert(rv == nil) // the slot must have been empty
+	return true
+}
+
+mpsc_dequeue :: proc(mpscq: ^Multi_Producer_Single_Consumer) -> rawptr { // returns nil when the slot at `tail` has not been filled
+	ret := sync.atomic_exchange_explicit(&mpscq.buffer[mpscq.tail], nil, .Acquire) // take the item and empty the slot in one step
+	if ret == nil {
+		return nil
+	}
+
+	mpscq.tail += 1 // `tail` is not accessed atomically — assumes a single consumer thread, as the type name suggests
+	if mpscq.tail >= len(mpscq.buffer) {
+		mpscq.tail = 0 // wrap around to the first slot
+	}
+	r := sync.atomic_sub_explicit(&mpscq.count, 1, .Release) // release the reservation; `r` is the value before the subtract
+	assert(r > 0)
+	return ret
+}
+
+mpsc_count :: proc(mpscq: ^Multi_Producer_Single_Consumer) -> int {
+	return sync.atomic_load_explicit(&mpscq.count, .Relaxed)
+}
+
+mpsc_cap :: proc(mpscq: ^Multi_Producer_Single_Consumer) -> int {
+	return len(mpscq.buffer)
+}
--- a/core/nbio/nbio.odin
+++ b/core/nbio/nbio.odin
@@ -0,0 +1,431 @@
+package nbio
+
+import "base:intrinsics"
+import "base:runtime"
+
+import "core:container/pool"
+import "core:net"
+import "core:time"
+
+/*
+Whether the package is fully supported on the current target. If it is not, it will still compile,
+but work in a manner where things are unimplemented.
+
+Additionally, even when `FULLY_SUPPORTED` is true, `acquire_thread_event_loop` may still return `.Unsupported`
+if the target does not support the syscalls needed for operating the package.
+*/
+FULLY_SUPPORTED :: _FULLY_SUPPORTED
+
+/*
+An event loop, one per thread, consider the fields private.
+Do not copy.
+*/
+Event_Loop :: struct /* #no_copy */ {
+	using impl:  _Event_Loop,
+	allocator:   runtime.Allocator,
+	err:         General_Error,
+	refs:        int,
+	now:         time.Time,
+
+	// Queue that is used to queue operations from another thread to be executed on this thread.
+	queue: Multi_Producer_Single_Consumer,
+
+	operation_pool: pool.Pool(Operation),
+}
+
+Handle :: _Handle
+
+// The maximum size of user arguments for an operation, can be increased at the cost of more RAM.
+MAX_USER_ARGUMENTS :: #config(NBIO_MAX_USER_ARGUMENTS, 4)
+#assert(MAX_USER_ARGUMENTS >= 4)
+
+Operation :: struct {
+	cb:              Callback,
+	user_data:       [MAX_USER_ARGUMENTS + 1]rawptr,
+	detached:        bool,
+	type:            Operation_Type,
+	using specifics: Specifics,
+
+	_impl:   _Operation `fmt:"-"`,
+	using _: struct #raw_union {
+		_pool_link: ^Operation,
+		l:          ^Event_Loop,
+	},
+}
+
+Specifics :: struct #raw_union {
+	accept:   Accept    `raw_union_tag:"type=.Accept"`,
+	close:    Close     `raw_union_tag:"type=.Close"`,
+	dial:     Dial      `raw_union_tag:"type=.Dial"`,
+	read:     Read      `raw_union_tag:"type=.Read"`,
+	recv:     Recv      `raw_union_tag:"type=.Recv"`,
+	send:     Send      `raw_union_tag:"type=.Send"`,
+	write:    Write     `raw_union_tag:"type=.Write"`,
+	timeout:  Timeout   `raw_union_tag:"type=.Timeout"`,
+	poll:     Poll      `raw_union_tag:"type=.Poll"`,
+	sendfile: Send_File `raw_union_tag:"type=.Send_File"`,
+	open:     Open      `raw_union_tag:"type=.Open"`,
+	stat:     Stat      `raw_union_tag:"type=.Stat"`,
+
+	_remove:       _Remove       `raw_union_tag:"type=._Remove"`,
+	_link_timeout: _Link_Timeout `raw_union_tag:"type=._Link_Timeout"`,
+	_splice:       _Splice       `raw_union_tag:"type=._Splice"`,
+}
+
+Operation_Type :: enum i32 {
+	None,
+	Accept,
+	Close,
+	Dial,
+	Read,
+	Recv,
+	Send,
+	Write,
+	Timeout,
+	Poll,
+	Send_File,
+	Open,
+	Stat,
+
+	_Link_Timeout,
+	_Remove,
+	_Splice,
+}
+
+Callback :: #type proc(op: ^Operation)
+
+/*
+Initialize or increment the reference counted event loop for the current thread.
+*/
+acquire_thread_event_loop :: proc() -> General_Error {
+	return _acquire_thread_event_loop()
+}
+
+/*
+Destroy or decrease the reference counted event loop for the current thread.
+*/
+release_thread_event_loop :: proc() {
+	_release_thread_event_loop()
+}
+
+current_thread_event_loop :: proc(loc := #caller_location) -> ^Event_Loop {
+	return _current_thread_event_loop(loc)
+}
+
+/*
+Each time you call this the implementation checks its state
+and calls any callbacks which are ready. You would typically call this in a loop.
+
+Blocks for up to `timeout` waiting for events if there is nothing to do.
+*/
+tick :: proc(timeout: time.Duration = NO_TIMEOUT) -> General_Error {
+	l := &_tls_event_loop
+	if l.refs == 0 { return nil }
+	return _tick(l, timeout)
+}
+
+/*
+Runs the event loop by ticking in a loop until there is no more work to be done.
+*/
+run :: proc() -> General_Error {
+	loop := &_tls_event_loop
+	if loop.refs == 0 {
+		return nil
+	}
+	// Hold our own reference while ticking.
+	acquire_thread_event_loop()
+	defer release_thread_event_loop()
+
+	for num_waiting() > 0 {
+		tick_err := _tick(loop, NO_TIMEOUT)
+		if tick_err != nil { return tick_err }
+	}
+	return nil
+}
+
+/*
+Runs the event loop by ticking in a loop until there is no more work to be done, or the flag `done` is `true`.
+*/
+run_until :: proc(done: ^bool) -> General_Error {
+	loop := &_tls_event_loop
+	if loop.refs == 0 {
+		return nil
+	}
+	// Hold our own reference while ticking.
+	acquire_thread_event_loop()
+	defer release_thread_event_loop()
+
+	for num_waiting() > 0 && !intrinsics.volatile_load(done) {
+		tick_err := _tick(loop, NO_TIMEOUT)
+		if tick_err != nil { return tick_err }
+	}
+	return nil
+}
+
+/*
+Returns the number of in-progress operations to be completed on the event loop.
+*/
+num_waiting :: proc(l: Maybe(^Event_Loop) = nil) -> int {
+	l_ := l.? or_else &_tls_event_loop
+	if l_.refs == 0 { return 0 }
+	return pool.num_outstanding(&l_.operation_pool)
+}
+
+/*
+Returns the current time (cached at most at the beginning of the current tick).
+*/
+now :: proc() -> time.Time {
+	if cached := _tls_event_loop.now; cached != {} {
+		return cached
+	}
+	return time.now()
+}
+
+/*
+Remove the given operation from the event loop. The callback of it won't be called and resources are freed.
+
+Calling `remove`:
+- Cancels the operation if it has not yet completed
+- Prevents the callback from being called
+
+Cancellation via `remove` is *final* and silent:
+- The callback will never be invoked
+- No error is delivered
+- The operation must be considered dead after removal
+
+WARN: the operation could have already been (partially or completely) completed.
+	  A send with `all` set to true could have sent a portion already.
+	  But also, a send that could be completed without blocking could have been completed.
+	  You just won't get a callback.
+
+WARN: once an operation's callback is called it can not be removed anymore (use after free).
+
+WARN: needs to be called from the thread of the event loop the target belongs to.
+
+Common use would be to cancel a timeout, remove a poll, or remove an `accept` before calling `close` on its socket.
+*/
+remove :: proc(target: ^Operation) {
+	if target == nil {
+		return
+	}
+
+	assert(target.type != .None)
+
+	if target.l != &_tls_event_loop {
+		panic("nbio.remove called on different thread")
+	}
+
+	_remove(target)
+}
+
+/*
+Creates a socket for use in `nbio` and relates it to the given event loop.
+
+Inputs:
+- family:   Should this be an IP4 or IP6 socket
+- protocol: The type of socket (TCP or UDP)
+- l:        The event loop to associate it with, defaults to the current thread's loop
+
+Returns:
+- socket: The created socket, consider `create_{udp|tcp}_socket` for a typed socket instead of the union
+- err:    A network error (`Create_Socket_Error`, or `Set_Blocking_Error`) which happened while opening
+*/
+create_socket :: proc(
+	family:   Address_Family,
+	protocol: Socket_Protocol,
+	l:        ^Event_Loop = nil,
+	loc       := #caller_location,
+) -> (
+	socket: Any_Socket,
+	err:    Create_Socket_Error,
+) {
+	return _create_socket(l if l != nil else _current_thread_event_loop(loc), family, protocol)
+}
+
+/*
+Creates a UDP socket for use in `nbio` and relates it to the given event loop.
+
+Inputs:
+- family: Should this be an IP4 or IP6 socket
+- l:      The event loop to associate it with, defaults to the current thread's loop
+
+Returns:
+- socket: The created UDP socket
+- err:    A network error (`Create_Socket_Error`, or `Set_Blocking_Error`) which happened while opening
+*/
+create_udp_socket :: proc(family: Address_Family, l: ^Event_Loop = nil, loc := #caller_location) -> (net.UDP_Socket, Create_Socket_Error) {
+	any_socket, create_err := create_socket(family, .UDP, l, loc)
+	if create_err == nil {
+		// A `.UDP` request is expected to yield the `UDP_Socket` variant; the assertion panics otherwise.
+		return any_socket.(UDP_Socket), nil
+	}
+	return -1, create_err
+}
+
+/*
+Creates a TCP socket for use in `nbio` and relates it to the given event loop.
+
+Inputs:
+- family: Should this be an IP4 or IP6 socket
+- l:      The event loop to associate it with, defaults to the current thread's loop
+
+Returns:
+- socket: The created TCP socket
+- err:    A network error (`Create_Socket_Error`, or `Set_Blocking_Error`) which happened while opening
+*/
+create_tcp_socket :: proc(family: Address_Family, l: ^Event_Loop = nil, loc := #caller_location) -> (net.TCP_Socket, Create_Socket_Error) {
+	any_socket, create_err := create_socket(family, .TCP, l, loc)
+	if create_err == nil {
+		// A `.TCP` request is expected to yield the `TCP_Socket` variant; the assertion panics otherwise.
+		return any_socket.(TCP_Socket), nil
+	}
+	return -1, create_err
+}
+
+/*
+Creates a socket, sets non-blocking mode, relates it to the given event loop, binds the socket to the given endpoint and starts listening.
+
+Inputs:
+- endpoint: Where to bind the socket to
+- backlog:  The maximum length to which the queue of pending connections may grow, before refusing connections
+- l:        The event loop to associate the socket with, defaults to the current thread's loop
+
+Returns:
+- socket: The opened, bound and listening socket
+- err:    A network error (`Create_Socket_Error`, `Bind_Error`, or `Listen_Error`) that has happened
+*/
+listen_tcp :: proc(endpoint: Endpoint, backlog := 1000, l: ^Event_Loop = nil, loc := #caller_location) -> (socket: TCP_Socket, err: net.Network_Error) {
+	assert(backlog > 0 && backlog < int(max(i32)))
+	return _listen_tcp(l if l != nil else _current_thread_event_loop(loc), endpoint, backlog)
+}
+
+/*
+Opens a file and associates it with the event loop.
+
+Inputs:
+- path: path to the file, if not absolute: relative from `dir`
+- dir:  directory that `path` is relative from (if it is relative), defaults to the current working directory
+- mode: open mode, defaults to read-only
+- perm: permissions to use when creating a file, defaults to read+write for everybody
+- l:    event loop to associate the file with, defaults to the current thread's
+
+Returns:
+- handle: The file handle
+- err:    An error if it occurred
+*/
+open_sync :: proc(path: string, dir: Handle = CWD, mode: File_Flags = {.Read}, perm := Permissions_Default_File, l: ^Event_Loop = nil, loc := #caller_location) -> (handle: Handle, err: FS_Error) {
+	return _open_sync(l if l != nil else _current_thread_event_loop(loc), path, dir, mode, perm)
+}
+
+Association_Error :: enum {
+	None,
+	// The given file/handle/socket was not opened in a mode that it can be made non-blocking afterwards.
+	//
+	// On Windows, this can happen when a file is not opened with the `FILE_FLAG_OVERLAPPED` flag.
+	// If using `core:os`, that is set when you specify the `O_NONBLOCK` flag.
+	// There is no way to add that after the fact.
+	Not_Possible_To_Associate,
+	// The given handle is not a valid handle.
+	Invalid_Handle,
+	// No network connection, or the network stack is not initialized.
+	Network_Unreachable,
+}
+
+/*
+Associate the given OS handle, not opened through this package, with the event loop.
+
+Consider using this package's `open` or `open_sync` directly instead.
+
+The handle returned is for convenience, it is actually still the same handle as given.
+Thus you should not close the given handle.
+
+On Windows, this can error when a file is not opened with the `FILE_FLAG_OVERLAPPED` flag.
+If using `core:os`, that is set when you specify the `O_NONBLOCK` flag.
+There is no way to add that after the fact.
+*/
+associate_handle :: proc(handle: uintptr, l: ^Event_Loop = nil, loc := #caller_location) -> (Handle, Association_Error) {
+	return _associate_handle(handle, l if l != nil else _current_thread_event_loop(loc))
+}
+
+/*
+Associate the given socket, not created through this package, with the event loop.
+
+Consider using this package's `create_socket` directly instead.
+*/
+associate_socket :: proc(socket: Any_Socket, l: ^Event_Loop = nil, loc := #caller_location) -> Association_Error {
+	return _associate_socket(socket, l if l != nil else _current_thread_event_loop(loc))
+}
+
+Read_Entire_File_Error :: struct {
+	operation: Operation_Type,
+	value:     FS_Error,
+}
+
+Read_Entire_File_Callback :: #type proc(user_data: rawptr, data: []byte, err: Read_Entire_File_Error)
+
+/*
+Combines multiple operations (open, stat, read, close) into one that reads an entire regular file.
+
+The error contains the `operation` that the error happened on.
+
+Inputs:
+- path:      path to the file, if not absolute: relative from `dir`
+- user_data: a pointer passed through into the callback
+- cb:        the callback to call once completed, called with the user data, file data, and an optional error
+- allocator: the allocator to allocate the file's contents onto
+- dir:       directory that `path` is relative from (if it is relative), defaults to the current working directory
+- l:         event loop to execute the operation on
+*/
+read_entire_file :: proc(path: string, user_data: rawptr, cb: Read_Entire_File_Callback, allocator := context.allocator, dir := CWD, l: ^Event_Loop = nil, loc := #caller_location) {
+	_read_entire_file(l if l != nil else _current_thread_event_loop(loc), path, user_data, cb, allocator, dir)
+}
+
+/*
+Detach an operation from the package's lifetime management.
+
+By default the operation's lifetime is managed by the package and freed after a callback is called.
+Calling this function detaches the operation from this lifetime.
+You are expected to call `reattach` to give the package back this operation.
+*/
+detach :: proc(op: ^Operation) {
+	op.detached = true
+}
+
+/*
+Reattach an operation to the package's lifetime management.
+*/
+reattach :: proc(op: ^Operation) {
+	pool.put(&op.l.operation_pool, op)
+}
+
+/*
+Execute an operation.
+
+If the operation is attached to another thread's event loop, it is queued to be executed on that event loop,
+optionally waking that loop up (from a blocking `tick`) with `trigger_wake_up`.
+*/
+exec :: proc(op: ^Operation, trigger_wake_up := true) {
+	if op.l == &_tls_event_loop {
+		_exec(op) // the operation belongs to this thread's loop: execute it directly
+	} else {
+		for !mpsc_enqueue(&op.l.queue, op) { // the target loop's queue is full: keep retrying
+			warn("operation queue on event loop filled up")
+			wake_up(op.l) // done on every failed attempt, regardless of `trigger_wake_up`
+			_yield()
+		}
+		if trigger_wake_up {
+			wake_up(op.l)
+		}
+	}
+}
+
+/*
+Wake up an event loop on another thread which may be blocking for completed operations.
+
+Commonly used with `exec` from a worker thread to have the event loop pick up that work.
+Note that by default `exec` already calls this procedure.
+*/
+wake_up :: proc(l: ^Event_Loop) {
+	if l == &_tls_event_loop {
+		return
+	}
+	_wake_up(l)
+}
--- a/core/nbio/net.odin
+++ b/core/nbio/net.odin
@@ -0,0 +1,39 @@
+package nbio
+
+import "core:net"
+
+Network_Error  :: net.Network_Error
+Accept_Error   :: net.Accept_Error
+Dial_Error     :: net.Dial_Error
+Send_Error     :: net.Send_Error
+TCP_Send_Error :: net.TCP_Send_Error
+UDP_Send_Error :: net.UDP_Send_Error
+Recv_Error     :: net.Recv_Error
+TCP_Recv_Error :: net.TCP_Recv_Error
+UDP_Recv_Error :: net.UDP_Recv_Error
+Listen_Error   :: net.Listen_Error
+Create_Socket_Error :: net.Create_Socket_Error
+
+Address_Family  :: net.Address_Family
+Socket_Protocol :: net.Socket_Protocol
+
+Address     :: net.Address
+IP4_Address :: net.IP4_Address
+IP6_Address :: net.IP6_Address
+
+Endpoint :: net.Endpoint
+
+TCP_Socket :: net.TCP_Socket
+UDP_Socket :: net.UDP_Socket
+Any_Socket :: net.Any_Socket
+
+IP4_Any      :: net.IP4_Any
+IP6_Any      :: net.IP6_Any
+IP4_Loopback :: net.IP4_Loopback
+IP6_Loopback :: net.IP6_Loopback
+
+family_from_endpoint :: net.family_from_endpoint
+bind                 :: net.bind
+bound_endpoint       :: net.bound_endpoint
+parse_endpoint       :: net.parse_endpoint
+endpoint_to_string   :: net.endpoint_to_string
--- a/core/nbio/ops.odin
+++ b/core/nbio/ops.odin
--- a/core/net/addr.odin
+++ b/core/net/addr.odin
@@ -1,4 +1,3 @@
-#+build windows, linux, darwin, freebsd
 package net

 /*
@@ -22,7 +21,6 @@ package net

 import "core:strconv"
 import "core:strings"
-import "core:fmt"

 /*
 	Expects an IPv4 address with no leading or trailing whitespace:
@@ -473,13 +471,20 @@ join_port :: proc(address_or_host: string, port: int, allocator := context.alloc
 	addr := parse_address(addr_or_host)
 	if addr == nil {
 		// hostname
-		fmt.sbprintf(&b, "%v:%v", addr_or_host, port)
+		strings.write_string(&b, addr_or_host)
+		strings.write_string(&b, ":")
+		strings.write_int(&b, port)
 	} else {
 		switch _ in addr {
 		case IP4_Address:
-			fmt.sbprintf(&b, "%v:%v", address_to_string(addr), port)
+			strings.write_string(&b, address_to_string(addr))
+			strings.write_string(&b, ":")
+			strings.write_int(&b, port)
 		case IP6_Address:
-			fmt.sbprintf(&b, "[%v]:%v", address_to_string(addr), port)
+			strings.write_string(&b, "[")
+			strings.write_string(&b, address_to_string(addr))
+			strings.write_string(&b, "]:")
+			strings.write_int(&b, port)
 		}
 	}
 	return strings.to_string(b)
@@ -495,8 +500,8 @@ map_to_ip6 :: proc(addr: Address) -> Address {
 	addr4 := addr.(IP4_Address)
 	addr4_u16 := transmute([2]u16be) addr4
 	addr6: IP6_Address
-	addr6[4] = 0xffff
-	copy(addr6[5:], addr4_u16[:])
+	addr6[5] = 0xffff
+	copy(addr6[6:], addr4_u16[:])
 	return addr6
 }

@@ -509,7 +514,13 @@ address_to_string :: proc(addr: Address, allocator := context.temp_allocator) ->
 	b := strings.builder_make(allocator)
 	switch v in addr {
 	case IP4_Address:
-		fmt.sbprintf(&b, "%v.%v.%v.%v", v[0], v[1], v[2], v[3])
+		strings.write_uint(&b, uint(v[0]))
+		strings.write_byte(&b, '.')
+		strings.write_uint(&b, uint(v[1]))
+		strings.write_byte(&b, '.')
+		strings.write_uint(&b, uint(v[2]))
+		strings.write_byte(&b, '.')
+		strings.write_uint(&b, uint(v[3]))
 	case IP6_Address:
 		// First find the longest run of zeroes.
 		Zero_Run :: struct {
@@ -563,25 +574,33 @@ address_to_string :: proc(addr: Address, allocator := context.temp_allocator) ->
 		for val, i in v {
 			if best.start == i || best.end == i {
 				// For the left and right side of the best zero run, print a `:`.
-				fmt.sbprint(&b, ":")
+				strings.write_string(&b, ":")
 			} else if i < best.start {
 				/*
 					If we haven't made it to the best run yet, print the digit.
 					Make sure we only print a `:` after the digit if it's not
 					immediately followed by the run's own leftmost `:`.
 				*/
-				fmt.sbprintf(&b, "%x", val)
+
+				buf: [32]byte
+				str := strconv.write_bits(buf[:], u64(val), 16, false, size_of(val), strconv.digits, {})
+				strings.write_string(&b, str)
+
 				if i < best.start - 1 {
-					fmt.sbprintf(&b, ":")
+					strings.write_string(&b, ":")
 				}
 			} else if i > best.end {
 				/*
 					If there are any digits after the zero run, print them.
 					But don't print the `:` at the end of the IP number.
 				*/
-				fmt.sbprintf(&b, "%x", val)
+
+				buf: [32]byte
+				str := strconv.write_bits(buf[:], u64(val), 16, false, size_of(val), strconv.digits, {})
+				strings.write_string(&b, str)
+
 				if i != 7 {
-					fmt.sbprintf(&b, ":")
+					strings.write_string(&b, ":")
 				}
 			}
 		}
@@ -598,8 +617,15 @@ endpoint_to_string :: proc(ep: Endpoint, allocator := context.temp_allocator) ->
 		s := address_to_string(ep.address, context.temp_allocator)
 		b := strings.builder_make(allocator)
 		switch a in ep.address {
-		case IP4_Address:  fmt.sbprintf(&b, "%v:%v",   s, ep.port)
-		case IP6_Address:  fmt.sbprintf(&b, "[%v]:%v", s, ep.port)
+		case IP4_Address:
+			strings.write_string(&b, s)
+			strings.write_string(&b, ":")
+			strings.write_int(&b, ep.port)
+		case IP6_Address:
+			strings.write_string(&b, "[")
+			strings.write_string(&b, s)
+			strings.write_string(&b, "]:")
+			strings.write_int(&b, ep.port)
 		}
 		return strings.to_string(b)
 	}
--- a/core/net/common.odin
+++ b/core/net/common.odin
@@ -1,4 +1,3 @@
-#+build windows, linux, darwin, freebsd
 package net

 /*
@@ -91,6 +90,7 @@ Parse_Endpoint_Error :: enum u32 {
 Resolve_Error :: enum u32 {
 	None = 0,
 	Unable_To_Resolve = 1,
+	Allocation_Failure,
 }

 DNS_Error :: enum u32 {
@@ -144,11 +144,11 @@ Address :: union {IP4_Address, IP6_Address}
 IP4_Loopback :: IP4_Address{127, 0, 0, 1}
 IP6_Loopback :: IP6_Address{0, 0, 0, 0, 0, 0, 0, 1}

-IP4_Any := IP4_Address{}
-IP6_Any := IP6_Address{}
+IP4_Any :: IP4_Address{}
+IP6_Any :: IP6_Address{}

-IP4_mDNS_Broadcast := Endpoint{address=IP4_Address{224, 0, 0, 251}, port=5353}
-IP6_mDNS_Broadcast := Endpoint{address=IP6_Address{65282, 0, 0, 0, 0, 0, 0, 251}, port = 5353}
+IP4_mDNS_Broadcast :: Endpoint{address=IP4_Address{224, 0, 0, 251}, port=5353}
+IP6_mDNS_Broadcast :: Endpoint{address=IP6_Address{65282, 0, 0, 0, 0, 0, 0, 251}, port = 5353}

 Endpoint :: struct {
 	address: Address,
--- a/core/net/dns.odin
+++ b/core/net/dns.odin
@@ -1,4 +1,3 @@
-#+build windows, linux, darwin, freebsd
 package net

 /*
@@ -22,13 +21,18 @@ package net
 		Haesbaert:       Security fixes
 */

-@(require) import "base:runtime"
+@(require)
+import "base:runtime"
+
+import "core:bufio"
+import "core:io"
+import "core:math/rand"
 import "core:mem"
 import "core:strings"
 import "core:time"
-import "core:os"
-import "core:math/rand"
-@(require) import "core:sync"
+
+@(require)
+import "core:sync"

 dns_config_initialized: sync.Once
 when ODIN_OS == .Windows {
@@ -42,20 +46,12 @@ when ODIN_OS == .Windows {
 		hosts_file  = "/etc/hosts",
 	}
 } else {
-	#panic("Please add a configuration for this OS.")
+	DEFAULT_DNS_CONFIGURATION :: DNS_Configuration{}
 }

-/*
-	Replaces environment placeholders in `dns_configuration`. Only necessary on Windows.
-	Is automatically called, once, by `get_dns_records_*`.
-*/
-@(private)
 init_dns_configuration :: proc() {
 	when ODIN_OS == .Windows {
-		runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
-		val := os.replace_environment_placeholders(dns_configuration.hosts_file, context.temp_allocator)
-		copy(dns_configuration.hosts_file_buf[:], val)
-		dns_configuration.hosts_file = string(dns_configuration.hosts_file_buf[:len(val)])
+		_init_dns_configuration() // NOTE(review): callers no longer go through sync.once_do(&dns_config_initialized, ...); confirm `_init_dns_configuration` is safe to run on every lookup and from several threads
 	}
 }

@@ -178,9 +174,7 @@ resolve_ip6 :: proc(hostname_and_maybe_port: string) -> (ep6: Endpoint, err: Net
 	See `destroy_records`.
 */
 get_dns_records_from_os :: proc(hostname: string, type: DNS_Record_Type, allocator := context.allocator) -> (records: []DNS_Record, err: DNS_Error) {
-	when ODIN_OS == .Windows {
-		sync.once_do(&dns_config_initialized, init_dns_configuration)
-	}
+	init_dns_configuration()
 	return _get_dns_records_os(hostname, type, allocator)
 }

@@ -196,51 +190,14 @@ get_dns_records_from_os :: proc(hostname: string, type: DNS_Record_Type, allocat
 	See `destroy_records`.
 */
 get_dns_records_from_nameservers :: proc(hostname: string, type: DNS_Record_Type, name_servers: []Endpoint, host_overrides: []DNS_Record, allocator := context.allocator) -> (records: []DNS_Record, err: DNS_Error) {
-	when ODIN_OS == .Windows {
-		sync.once_do(&dns_config_initialized, init_dns_configuration)
-	}
+	init_dns_configuration()
 	context.allocator = allocator

-	if type != .SRV {
-		// NOTE(tetra): 'hostname' can contain underscores when querying SRV records
-		ok := validate_hostname(hostname)
-		if !ok {
-			return nil, .Invalid_Hostname_Error
-		}
-	}
+	id := u16be(rand.uint32())
+	dns_packet_buf: [DNS_PACKET_MIN_LEN]byte = ---
+	dns_packet := make_dns_packet(dns_packet_buf[:], id, hostname, type) or_return

-	hdr := DNS_Header{
-		id = u16be(rand.uint32()),
-		is_response = false,
-		opcode = 0,
-		is_authoritative = false,
-		is_truncated = false,
-		is_recursion_desired = true,
-		is_recursion_available = false,
-		response_code = DNS_Response_Code.No_Error,
-	}
-
-	id, bits := pack_dns_header(hdr)
-	dns_hdr := [6]u16be{}
-	dns_hdr[0] = id
-	dns_hdr[1] = bits
-	dns_hdr[2] = 1
-
-	dns_query := [2]u16be{ u16be(type), 1 }
-
-	output := [(size_of(u16be) * 6) + NAME_MAX + (size_of(u16be) * 2)]u8{}
-	b := strings.builder_from_slice(output[:])
-
-	strings.write_bytes(&b, mem.slice_data_cast([]u8, dns_hdr[:]))
-	ok := encode_hostname(&b, hostname)
-	if !ok {
-		return nil, .Invalid_Hostname_Error
-	}
-	strings.write_bytes(&b, mem.slice_data_cast([]u8, dns_query[:]))
-
-	dns_packet := output[:strings.builder_len(b)]
-
-	dns_response_buf := [4096]u8{}
+	dns_response_buf: [4096]u8 = ---
 	dns_response: []u8
 	for name_server in name_servers {
 		conn, sock_err := make_unbound_udp_socket(family_from_endpoint(name_server))
@@ -283,6 +240,42 @@ get_dns_records_from_nameservers :: proc(hostname: string, type: DNS_Record_Type
 	return
 }

+DNS_PACKET_MIN_LEN :: (size_of(u16be) * 6) + NAME_MAX + (size_of(u16be) * 2)
+
+// Serializes a one-question, recursion-desired DNS query for `hostname` into `buf`; the returned packet aliases `buf`.
+make_dns_packet :: proc(buf: []byte, id: u16be, hostname: string, type: DNS_Record_Type) -> (packet: []byte, err: DNS_Error) {
+	assert(len(buf) >= DNS_PACKET_MIN_LEN)
+
+	// NOTE(tetra): 'hostname' can contain underscores when querying SRV records
+	if type != .SRV && !validate_hostname(hostname) {
+		return nil, .Invalid_Hostname_Error
+	}
+
+	hdr := DNS_Header{
+		id = id,
+		is_recursion_desired = true,
+		response_code = DNS_Response_Code.No_Error,
+	}
+	_, bits := pack_dns_header(hdr)
+
+	// Header words: id, flags, question count = 1; the remaining three counts stay zero.
+	dns_hdr := [6]u16be{}
+	dns_hdr[0] = id
+	dns_hdr[1] = bits
+	dns_hdr[2] = 1
+	// Question trailer: the record type followed by class 1.
+	dns_query := [2]u16be{ u16be(type), 1 }
+
+	b := strings.builder_from_slice(buf[:])
+	strings.write_bytes(&b, mem.slice_data_cast([]u8, dns_hdr[:]))
+	if !encode_hostname(&b, hostname) {
+		return nil, .Invalid_Hostname_Error
+	}
+	strings.write_bytes(&b, mem.slice_data_cast([]u8, dns_query[:]))
+
+	return buf[:strings.builder_len(b)], nil
+}
+
 // `records` slice is also destroyed.
 destroy_dns_records :: proc(records: []DNS_Record, allocator := context.allocator) {
 	context.allocator = allocator
@@ -364,13 +357,8 @@ unpack_dns_header :: proc(id: u16be, bits: u16be) -> (hdr: DNS_Header) {
 	return hdr
 }

-load_resolv_conf :: proc(resolv_conf_path: string, allocator := context.allocator) -> (name_servers: []Endpoint, ok: bool) {
-	context.allocator = allocator
-
-	res := os.read_entire_file_from_filename(resolv_conf_path) or_return
-	defer delete(res)
-	resolv_str := string(res)
-
+parse_resolv_conf :: proc(resolv_str: string, allocator := context.allocator) -> (name_servers: []Endpoint) {
+	resolv_str := resolv_str
 	id_str := "nameserver"
 	id_len := len(id_str)

@@ -401,41 +389,51 @@ load_resolv_conf :: proc(resolv_conf_path: string, allocator := context.allocato
 		append(&_name_servers, endpoint)
 	}

-	return _name_servers[:], true
+	return _name_servers[:]
 }

-load_hosts :: proc(hosts_file_path: string, allocator := context.allocator) -> (hosts: []DNS_Host_Entry, ok: bool) {
-	context.allocator = allocator
+parse_hosts :: proc(stream: io.Stream, allocator := context.allocator) -> (hosts: []DNS_Host_Entry, ok: bool) {
+	s := bufio.scanner_init(&{}, stream, allocator)
+	defer bufio.scanner_destroy(s)

-	res := os.read_entire_file_from_filename(hosts_file_path, allocator) or_return
-	defer delete(res)
+	resize(&s.buf, 256)

-	_hosts := make([dynamic]DNS_Host_Entry, 0, allocator)
-	hosts_str := string(res)
-	for line in strings.split_lines_iterator(&hosts_str) {
-		if len(line) == 0 || line[0] == '#' {
-			continue
+	_hosts: [dynamic]DNS_Host_Entry
+	_hosts.allocator = allocator
+	defer if !ok {
+		for host in _hosts {
+			delete(host.name, allocator)
 		}
+		delete(_hosts)
+	}

-		splits := strings.fields(line)
-		defer delete(splits)
+	for bufio.scanner_scan(s) {
+		line := bufio.scanner_text(s)

-		(len(splits) >= 2) or_continue
+		line, _, _ = strings.partition(line, "#")
+		(len(line) > 0) or_continue
+
+		ip_str := strings.fields_iterator(&line) or_continue

-		ip_str := splits[0]
 		addr := parse_address(ip_str)
-		if addr == nil {
-			continue
-		}
+		(addr != nil) or_continue

-		for hostname in splits[1:] {
-			if len(hostname) != 0 {
-				append(&_hosts, DNS_Host_Entry{hostname, addr})
-			}
+		for hostname in strings.fields_iterator(&line) {
+			(len(hostname) > 0) or_continue
+
+			clone, alloc_err := strings.clone(hostname, allocator)
+			if alloc_err != nil { return }
+
+			_, alloc_err = append(&_hosts, DNS_Host_Entry{clone, addr})
+			if alloc_err != nil { return }
 		}
 	}

-	return _hosts[:], true
+	if bufio.scanner_error(s) != nil { return }
+
+	hosts = _hosts[:]
+	ok    = true
+	return
 }

 // www.google.com -> 3www6google3com0
@@ -594,7 +592,7 @@ decode_hostname :: proc(packet: []u8, start_idx: int, allocator := context.alloc

 // Uses RFC 952 & RFC 1123
 validate_hostname :: proc(hostname: string) -> (ok: bool) {
-	if len(hostname) > 255 || len(hostname) == 0 {
+	if len(hostname) > NAME_MAX || len(hostname) == 0 {
 		return
 	}

@@ -604,7 +602,7 @@ validate_hostname :: proc(hostname: string) -> (ok: bool) {

 	_hostname := hostname
 	for label in strings.split_iterator(&_hostname, ".") {
-		if len(label) > 63 || len(label) == 0 {
+		if len(label) > LABEL_MAX || len(label) == 0 {
 			return
 		}

@@ -868,4 +866,4 @@ parse_response :: proc(response: []u8, filter: DNS_Record_Type = nil, allocator
 	xid = hdr.id

 	return _records[:], xid, true
-}
+}
--- a/core/net/dns_os.odin
+++ b/core/net/dns_os.odin
@@ -0,0 +1,24 @@
+#+build darwin, freebsd, openbsd, netbsd, linux, windows, wasi
+#+private
+package net
+
+import "core:os"
+
+// Reads the whole file at `resolv_conf_path` and returns what `parse_resolv_conf` extracts from it; `ok` is false if the read fails.
+load_resolv_conf :: proc(resolv_conf_path: string, allocator := context.allocator) -> (name_servers: []Endpoint, ok: bool) {
+	context.allocator = allocator
+
+	contents := os.read_entire_file_from_filename(resolv_conf_path) or_return
+	defer delete(contents)
+
+	return parse_resolv_conf(string(contents)), true
+}
+
+// Opens the file at `hosts_file_path` and hands it to `parse_hosts` as a stream; returns `ok == false` if it cannot be opened.
+load_hosts :: proc(hosts_file_path: string, allocator := context.allocator) -> (hosts: []DNS_Host_Entry, ok: bool) {
+	handle, open_err := os.open(hosts_file_path)
+	if open_err != nil { return }
+	defer os.close(handle)
+	return parse_hosts(os.stream_from_handle(handle), allocator)
+}
+
--- a/core/net/dns_others.odin
+++ b/core/net/dns_others.odin
@@ -0,0 +1,12 @@
+#+build !windows
+#+build !linux
+#+build !darwin
+#+build !freebsd
+#+build !netbsd
+#+build !openbsd
+package net
+
+@(private)
+_get_dns_records_os :: proc(hostname: string, type: DNS_Record_Type, allocator := context.allocator) -> (records: []DNS_Record, err: DNS_Error) {
+	return // stub for the targets left over by the build tags above: both results keep their zero values
+}
--- a/core/net/dns_unix.odin
+++ b/core/net/dns_unix.odin
@@ -1,4 +1,4 @@
-#+build linux, darwin, freebsd
+#+build linux, darwin, freebsd, openbsd, netbsd
 package net
 /*
 	Package net implements cross-platform Berkeley Sockets, DNS resolution and associated procedures.
@@ -42,14 +42,19 @@ _get_dns_records_os :: proc(hostname: string, type: DNS_Record_Type, allocator :
 	}

 	hosts, hosts_ok := load_hosts(dns_configuration.hosts_file)
-	defer delete(hosts)
 	if !hosts_ok {
 		return nil, .Invalid_Hosts_Config_Error
 	}
+	defer {
+		for h in hosts {
+			delete(h.name)
+		}
+		delete(hosts)
+	}

 	host_overrides := make([dynamic]DNS_Record)
 	for host in hosts {
-		if strings.compare(host.name, hostname) != 0 {
+		if host.name != hostname {
 			continue
 		}

@@ -79,4 +84,4 @@ _get_dns_records_os :: proc(hostname: string, type: DNS_Record_Type, allocator :
 	}

 	return get_dns_records_from_nameservers(hostname, type, name_servers, host_overrides[:])
-}
+}
--- a/core/net/dns_windows.odin
+++ b/core/net/dns_windows.odin
@@ -20,11 +20,29 @@ package net
 		Feoramund:       FreeBSD platform code
 */

-import "core:strings"
+import "base:runtime"
+
 import "core:mem"
+import "core:os"
+import "core:strings"
+import "core:sync"

 import win "core:sys/windows"

+/*
+	Replaces environment placeholders in `dns_configuration`. Only necessary on Windows.
+	Is automatically called, once, by `get_dns_records_*`. A result longer than `hosts_file_buf` is truncated to fit.
+*/
+@(private)
+_init_dns_configuration :: proc() {
+	sync.once_do(&dns_config_initialized, proc() {
+		runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+		val := os.replace_environment_placeholders(dns_configuration.hosts_file, context.temp_allocator)
+		n := copy(dns_configuration.hosts_file_buf[:], val) // `copy` returns how many bytes it actually stored
+		dns_configuration.hosts_file = string(dns_configuration.hosts_file_buf[:n]) // slice by `n`, not `len(val)`, to stay in bounds
+	})
+}
+
@(private)
 _get_dns_records_os :: proc(hostname: string, type: DNS_Record_Type, allocator := context.allocator) -> (records: []DNS_Record, err: DNS_Error) {
 	context.allocator = allocator
@@ -171,4 +189,4 @@ _get_dns_records_os :: proc(hostname: string, type: DNS_Record_Type, allocator :

 	records = recs[:]
 	return
-}
+}
--- a/core/net/errors.odin
+++ b/core/net/errors.odin
@@ -139,6 +139,11 @@ Accept_Error :: enum i32 {
 	Unknown,
 }

+Recv_Error :: union #shared_nil { // a receive error from either socket kind; `#shared_nil` makes a variant's zero value compare equal to `nil`
+	TCP_Recv_Error,
+	UDP_Recv_Error,
+}
+
 TCP_Recv_Error :: enum i32 {
 	None,
 	// No network connection, or the network stack is not initialized.
@@ -149,7 +154,8 @@ TCP_Recv_Error :: enum i32 {
 	Invalid_Argument,
 	// The socket is not connected.
 	Not_Connected,
-	// Connection was closed/broken/shutdown while receiving data.
+	// Connection was closed due to an error or shutdown.
+	// NOTE: a graceful close is indicated by a `0, nil` (0 bytes received and no error) return.
 	Connection_Closed,
 	// Timed out before being able to receive any data.
 	Timeout,
@@ -170,7 +176,8 @@ UDP_Recv_Error :: enum i32 {
 	Insufficient_Resources,
 	// Invalid socket or buffer given.
 	Invalid_Argument,
-	// "Connection" was refused by remote, or closed/broken/shutdown while receiving data.
+	// "Connection" was refused, or closed due to an error.
+	// NOTE: a graceful close is indicated by a `0, nil` (0 bytes received and no error) return.
 	Connection_Refused,
 	// Timed out before being able to receive any data.
 	Timeout,
@@ -185,6 +192,11 @@ UDP_Recv_Error :: enum i32 {
 	Unknown,
 }

+Send_Error :: union #shared_nil { // a send error from either socket kind; `#shared_nil` makes a variant's zero value compare equal to `nil`
+	TCP_Send_Error,
+	UDP_Send_Error,
+}
+
 TCP_Send_Error :: enum i32 {
 	None,
 	// No network connection, or the network stack is not initialized.
@@ -193,7 +205,7 @@ TCP_Send_Error :: enum i32 {
 	Insufficient_Resources,
 	// Invalid socket or buffer given.
 	Invalid_Argument,
-	// Connection was closed/broken/shutdown while receiving data.
+	// Connection was closed/broken/shutdown while sending data.
 	Connection_Closed,
 	// The socket is not connected.
 	Not_Connected,
--- a/core/net/errors_others.odin
+++ b/core/net/errors_others.odin
@@ -2,6 +2,8 @@
 #+build !linux
 #+build !freebsd
 #+build !windows
+#+build !netbsd
+#+build !openbsd
 package net

@(private="file", thread_local)
@@ -18,10 +20,3 @@ _last_platform_error_string :: proc() -> string {
 _set_last_platform_error :: proc(err: i32) {
 	_last_error = err
 }
-
-Parse_Endpoint_Error :: enum u32 {
-	None          = 0,
-	Bad_Port      = 1,
-	Bad_Address,
-	Bad_Hostname,
-}
--- a/core/net/errors_darwin.odin
+++ b/core/net/errors_darwin.odin
@@ -1,4 +1,4 @@
-#+build darwin
+#+build darwin, netbsd, openbsd
 package net

 /*
--- a/core/net/errors_windows.odin
+++ b/core/net/errors_windows.odin
@@ -63,7 +63,7 @@ _dial_error :: proc() -> Dial_Error {
 		return .Already_Connecting
 	case .WSAEADDRNOTAVAIL, .WSAEAFNOSUPPORT, .WSAEFAULT, .WSAENOTSOCK, .WSAEINPROGRESS, .WSAEINVAL:
 		return .Invalid_Argument
-	case .WSAECONNREFUSED:
+	case .WSAECONNREFUSED, .CONNECTION_REFUSED:
 		return .Refused
 	case .WSAEISCONN:
 		return .Already_Connected
@@ -122,7 +122,7 @@ _accept_error :: proc() -> Accept_Error {
 		return .Aborted
 	case .WSAEFAULT, .WSAEINPROGRESS, .WSAENOTSOCK:
 		return .Invalid_Argument
-	case .WSAEINTR:
+	case .WSAEINTR, .OPERATION_ABORTED:
 		return .Interrupted
 	case .WSAEINVAL:
 		return .Not_Listening
--- a/core/net/interface_others.odin
+++ b/core/net/interface_others.odin
@@ -0,0 +1,11 @@
+#+build !darwin
+#+build !linux
+#+build !freebsd
+#+build !windows
+#+build !netbsd
+#+build !openbsd
+package net
+
+_enumerate_interfaces :: proc(allocator := context.allocator) -> (interfaces: []Network_Interface, err: Interfaces_Error) {
+	return // stub: `interfaces` and `err` keep their zero values, `allocator` is unused
+}
--- a/core/net/interface_darwin.odin
+++ b/core/net/interface_darwin.odin
@@ -1,4 +1,4 @@
-#+build darwin
+#+build darwin, openbsd, netbsd
 package net

 /*
@@ -117,32 +117,47 @@ IF_Flag :: enum u32 {
 	BROADCAST,
 	DEBUG,
 	LOOPBACK,
-	POINTTOPOINT,
-	NOTRAILERS,
-	RUNNING,
-	NOARP,
-	PROMISC,
-	ALLMULTI,
-	OACTIVE,
-	SIMPLEX,
-	LINK0,
-	LINK1,
-	LINK2,
-	MULTICAST,
+	// NOTE: different order on other BSDs but we don't even need these.
+	// POINTTOPOINT,
+	// NOTRAILERS,
+	// RUNNING,
+	// NOARP,
+	// PROMISC,
+	// ALLMULTI,
+	// OACTIVE,
+	// SIMPLEX,
+	// LINK0,
+	// LINK1,
+	// LINK2,
+	// MULTICAST,
 }

@(private)
 IF_Flags :: bit_set[IF_Flag; u32]

-@(private)
-ifaddrs :: struct {
-	next:    ^ifaddrs,
-	name:    cstring,
-	flags:   IF_Flags,
-	addr:    ^posix.sockaddr,
-	netmask: ^posix.sockaddr,
-	dstaddr: ^posix.sockaddr,
-	data:    rawptr,
+when ODIN_OS == .Darwin || ODIN_OS == .OpenBSD { // NOTE(review): presumably mirrors the platform's C `struct ifaddrs` — confirm against <ifaddrs.h>
+	@(private)
+	ifaddrs :: struct {
+		next:    ^ifaddrs,
+		name:    cstring,
+		flags:   IF_Flags,
+		addr:    ^posix.sockaddr,
+		netmask: ^posix.sockaddr,
+		dstaddr: ^posix.sockaddr,
+		data:    rawptr,
+	}
+} else when ODIN_OS == .NetBSD { // same fields as the variant above, plus a trailing `addrflags`
+	@(private)
+	ifaddrs :: struct {
+		next:      ^ifaddrs,
+		name:      cstring,
+		flags:     IF_Flags,
+		addr:      ^posix.sockaddr,
+		netmask:   ^posix.sockaddr,
+		dstaddr:   ^posix.sockaddr,
+		data:      rawptr,
+		addrflags: u32, // only present in the NetBSD variant
+	}
 }

@(private)
--- a/core/net/socket.odin
+++ b/core/net/socket.odin
@@ -1,4 +1,3 @@
-#+build windows, linux, darwin, freebsd
 package net

 /*
@@ -20,6 +19,35 @@ package net
 		Feoramund:       FreeBSD platform code
 */

+Socket_Option :: enum i32 { // values come from the per-platform `_SOCKET_OPTION_*` constants; a platform file defines -1 where it has no native value
+	Broadcast                 = i32(_SOCKET_OPTION_BROADCAST),
+	Reuse_Address             = i32(_SOCKET_OPTION_REUSE_ADDRESS),
+	Keep_Alive                = i32(_SOCKET_OPTION_KEEP_ALIVE),
+	Out_Of_Bounds_Data_Inline = i32(_SOCKET_OPTION_OUT_OF_BOUNDS_DATA_INLINE),
+	Linger                    = i32(_SOCKET_OPTION_LINGER),
+	Receive_Buffer_Size       = i32(_SOCKET_OPTION_RECEIVE_BUFFER_SIZE),
+	Send_Buffer_Size          = i32(_SOCKET_OPTION_SEND_BUFFER_SIZE),
+	Receive_Timeout           = i32(_SOCKET_OPTION_RECEIVE_TIMEOUT),
+	Send_Timeout              = i32(_SOCKET_OPTION_SEND_TIMEOUT),
+
+	TCP_Nodelay               = i32(_SOCKET_OPTION_TCP_NODELAY), // -1 in the FreeBSD and fallback ("others") files
+
+	Use_Loopback              = i32(_SOCKET_OPTION_USE_LOOPBACK), // this group has a native value only in the FreeBSD file
+	Reuse_Port                = i32(_SOCKET_OPTION_REUSE_PORT),
+	No_SIGPIPE_From_EPIPE     = i32(_SOCKET_OPTION_NO_SIGPIPE_FROM_EPIPE),
+	Reuse_Port_Load_Balancing = i32(_SOCKET_OPTION_REUSE_PORT_LOAD_BALANCING),
+
+	Exclusive_Addr_Use        = i32(_SOCKET_OPTION_EXCLUSIVE_ADDR_USE), // this group has a native value only in the Windows file
+	Conditional_Accept        = i32(_SOCKET_OPTION_CONDITIONAL_ACCEPT),
+	Dont_Linger               = i32(_SOCKET_OPTION_DONT_LINGER),
+}
+
+Shutdown_Manner :: enum i32 { // values come from the per-platform `_SHUTDOWN_MANNER_*` constants
+	Receive = i32(_SHUTDOWN_MANNER_RECEIVE),
+	Send    = i32(_SHUTDOWN_MANNER_SEND),
+	Both    = i32(_SHUTDOWN_MANNER_BOTH),
+}
+
 any_socket_to_socket :: proc "contextless" (socket: Any_Socket) -> Socket {
 	switch s in socket {
 	case TCP_Socket:  return Socket(s)
@@ -193,21 +221,36 @@ close :: proc(socket: Any_Socket) {
 	_close(socket)
 }

+/*
+	Receive data into a buffer from a TCP socket.
+
+	If no error occurs, `recv_tcp` returns the number of bytes received and `buf` will contain the received data.
+	If the connection has been gracefully closed, the return value is `0, nil` (0 bytes read and no error).
+*/
 recv_tcp :: proc(socket: TCP_Socket, buf: []byte) -> (bytes_read: int, err: TCP_Recv_Error) {
 	return _recv_tcp(socket, buf)
 }

+/*
+	Receive data into a buffer from a UDP socket.
+
+	If no error occurs, `recv_udp` returns the number of bytes received and `buf` will contain the received data.
+	If the "connection" has been gracefully closed, the return value is `0, nil` (0 bytes read and no error).
+*/
 recv_udp :: proc(socket: UDP_Socket, buf: []byte) -> (bytes_read: int, remote_endpoint: Endpoint, err: UDP_Recv_Error) {
 	return _recv_udp(socket, buf)
 }

 /*
-	Receive data from into a buffer from any socket.
+	Receive data into a buffer from any socket.

 	Note: `remote_endpoint` parameter is non-nil only if the socket type is UDP. On TCP sockets it
 	will always return `nil`.

-	Errors that can be returned: `TCP_Recv_Error`, or `UDP_Recv_Error`
+	Errors that can be returned: `TCP_Recv_Error`, or `UDP_Recv_Error`.
+
+	If no error occurs, `recv_any` returns the number of bytes received and `buf` will contain the received data.
+	If the connection has been gracefully closed, the return value is `0, nil, nil` (0 bytes read and no error).
 */
 recv_any :: proc(socket: Any_Socket, buf: []byte) -> (
 	bytes_read: int,
--- a/core/net/socket_freebsd.odin
+++ b/core/net/socket_freebsd.odin
@@ -20,45 +20,35 @@ package net
 		Feoramund:       FreeBSD platform code
 */

-import "core:c"
 import "core:sys/freebsd"
 import "core:time"

 Fd :: freebsd.Fd

-Socket_Option :: enum c.int {
-	// TODO: Test and implement more socket options.
-	// DEBUG
-	Reuse_Address             = cast(c.int)freebsd.Socket_Option.REUSEADDR,
-	Keep_Alive                = cast(c.int)freebsd.Socket_Option.KEEPALIVE,
-	// DONTROUTE
-	Broadcast                 = cast(c.int)freebsd.Socket_Option.BROADCAST,
-	Use_Loopback              = cast(c.int)freebsd.Socket_Option.USELOOPBACK,
-	Linger                    = cast(c.int)freebsd.Socket_Option.LINGER,
-	Out_Of_Bounds_Data_Inline = cast(c.int)freebsd.Socket_Option.OOBINLINE,
-	Reuse_Port                = cast(c.int)freebsd.Socket_Option.REUSEPORT,
-	// TIMESTAMP
-	No_SIGPIPE_From_EPIPE     = cast(c.int)freebsd.Socket_Option.NOSIGPIPE,
-	// ACCEPTFILTER
-	// BINTIME
-	// NO_OFFLOAD
-	// NO_DDP
-	Reuse_Port_Load_Balancing = cast(c.int)freebsd.Socket_Option.REUSEPORT_LB,
-	// RERROR
+_SOCKET_OPTION_BROADCAST                 :: freebsd.Socket_Option.BROADCAST
+_SOCKET_OPTION_REUSE_ADDRESS             :: freebsd.Socket_Option.REUSEADDR
+_SOCKET_OPTION_KEEP_ALIVE                :: freebsd.Socket_Option.KEEPALIVE
+_SOCKET_OPTION_OUT_OF_BOUNDS_DATA_INLINE :: freebsd.Socket_Option.OOBINLINE
+_SOCKET_OPTION_LINGER                    :: freebsd.Socket_Option.LINGER
+_SOCKET_OPTION_RECEIVE_BUFFER_SIZE       :: freebsd.Socket_Option.RCVBUF
+_SOCKET_OPTION_SEND_BUFFER_SIZE          :: freebsd.Socket_Option.SNDBUF
+_SOCKET_OPTION_RECEIVE_TIMEOUT           :: freebsd.Socket_Option.RCVTIMEO
+_SOCKET_OPTION_SEND_TIMEOUT              :: freebsd.Socket_Option.SNDTIMEO

-	Send_Buffer_Size          = cast(c.int)freebsd.Socket_Option.SNDBUF,
-	Receive_Buffer_Size       = cast(c.int)freebsd.Socket_Option.RCVBUF,
-	// SNDLOWAT
-	// RCVLOWAT
-	Send_Timeout              = cast(c.int)freebsd.Socket_Option.SNDTIMEO,
-	Receive_Timeout           = cast(c.int)freebsd.Socket_Option.RCVTIMEO,
-}
+_SOCKET_OPTION_TCP_NODELAY :: -1

-Shutdown_Manner :: enum c.int {
-	Receive = cast(c.int)freebsd.Shutdown_Method.RD,
-	Send    = cast(c.int)freebsd.Shutdown_Method.WR,
-	Both    = cast(c.int)freebsd.Shutdown_Method.RDWR,
-}
+_SOCKET_OPTION_USE_LOOPBACK              :: freebsd.Socket_Option.USELOOPBACK
+_SOCKET_OPTION_REUSE_PORT                :: freebsd.Socket_Option.REUSEPORT
+_SOCKET_OPTION_NO_SIGPIPE_FROM_EPIPE     :: freebsd.Socket_Option.NOSIGPIPE
+_SOCKET_OPTION_REUSE_PORT_LOAD_BALANCING :: freebsd.Socket_Option.REUSEPORT_LB
+
+_SOCKET_OPTION_EXCLUSIVE_ADDR_USE :: -1 // -1 placeholders: this file gives these three options no native value
+_SOCKET_OPTION_CONDITIONAL_ACCEPT :: -1
+_SOCKET_OPTION_DONT_LINGER        :: -1
+
+_SHUTDOWN_MANNER_RECEIVE :: freebsd.Shutdown_Method.RD
+_SHUTDOWN_MANNER_SEND    :: freebsd.Shutdown_Method.WR
+_SHUTDOWN_MANNER_BOTH    :: freebsd.Shutdown_Method.RDWR

@(private)
 _create_socket :: proc(family: Address_Family, protocol: Socket_Protocol) -> (socket: Any_Socket, err: Create_Socket_Error) {
@@ -272,7 +262,7 @@ _set_option :: proc(socket: Any_Socket, option: Socket_Option, value: any, loc :
 	ptr: rawptr
 	len: freebsd.socklen_t

-	switch option {
+	#partial switch option {
 	case
 		.Reuse_Address,
 		.Keep_Alive,
@@ -344,7 +334,7 @@ _set_option :: proc(socket: Any_Socket, option: Socket_Option, value: any, loc :
 			ptr = &int_value
 			len = size_of(int_value)
 	case:
-		unimplemented("set_option() option not yet implemented", loc)
+		return .Invalid_Option
 	}

 	real_socket := any_socket_to_socket(socket)
@@ -391,7 +381,7 @@ _endpoint_to_sockaddr :: proc(ep: Endpoint) -> (sockaddr: freebsd.Socket_Address
 		}
 	case IP6_Address:
 		(cast(^freebsd.Socket_Address_Internet6)(&sockaddr))^ = {
-			len = size_of(freebsd.Socket_Address_Internet),
+			len = size_of(freebsd.Socket_Address_Internet6),
 			family = .INET6,
 			port = cast(freebsd.in_port_t)ep.port,
 			addr = transmute(freebsd.IP6_Address)addr,
--- a/core/net/socket_linux.odin
+++ b/core/net/socket_linux.odin
@@ -21,28 +21,33 @@ package net
 		Feoramund:       FreeBSD platform code
 */

-import "core:c"
 import "core:time"
 import "core:sys/linux"

-Socket_Option :: enum c.int {
-	Reuse_Address             = c.int(linux.Socket_Option.REUSEADDR),
-	Keep_Alive                = c.int(linux.Socket_Option.KEEPALIVE),
-	Out_Of_Bounds_Data_Inline = c.int(linux.Socket_Option.OOBINLINE),
-	TCP_Nodelay               = c.int(linux.Socket_TCP_Option.NODELAY),
-	Linger                    = c.int(linux.Socket_Option.LINGER),
-	Receive_Buffer_Size       = c.int(linux.Socket_Option.RCVBUF),
-	Send_Buffer_Size          = c.int(linux.Socket_Option.SNDBUF),
-	Receive_Timeout           = c.int(linux.Socket_Option.RCVTIMEO),
-	Send_Timeout              = c.int(linux.Socket_Option.SNDTIMEO),
-	Broadcast                 = c.int(linux.Socket_Option.BROADCAST),
-}
+_SOCKET_OPTION_BROADCAST                 :: linux.Socket_Option.BROADCAST
+_SOCKET_OPTION_REUSE_ADDRESS             :: linux.Socket_Option.REUSEADDR
+_SOCKET_OPTION_KEEP_ALIVE                :: linux.Socket_Option.KEEPALIVE
+_SOCKET_OPTION_OUT_OF_BOUNDS_DATA_INLINE :: linux.Socket_Option.OOBINLINE
+_SOCKET_OPTION_LINGER                    :: linux.Socket_Option.LINGER
+_SOCKET_OPTION_RECEIVE_BUFFER_SIZE       :: linux.Socket_Option.RCVBUF
+_SOCKET_OPTION_SEND_BUFFER_SIZE          :: linux.Socket_Option.SNDBUF
+_SOCKET_OPTION_RECEIVE_TIMEOUT           :: linux.Socket_Option.RCVTIMEO
+_SOCKET_OPTION_SEND_TIMEOUT              :: linux.Socket_Option.SNDTIMEO

-Shutdown_Manner :: enum c.int {
-	Receive = c.int(linux.Shutdown_How.RD),
-	Send    = c.int(linux.Shutdown_How.WR),
-	Both    = c.int(linux.Shutdown_How.RDWR),
-}
+_SOCKET_OPTION_TCP_NODELAY :: linux.Socket_TCP_Option.NODELAY
+
+_SOCKET_OPTION_USE_LOOPBACK              :: -1 // -1 placeholders: this file gives the options below no native value
+_SOCKET_OPTION_REUSE_PORT                :: -1
+_SOCKET_OPTION_NO_SIGPIPE_FROM_EPIPE     :: -1
+_SOCKET_OPTION_REUSE_PORT_LOAD_BALANCING :: -1
+
+_SOCKET_OPTION_EXCLUSIVE_ADDR_USE :: -1 // NOTE(review): `_set_option` in this file returns `.Invalid_Socket` from its default case, the other backends return `.Invalid_Option` — confirm which is intended
+_SOCKET_OPTION_CONDITIONAL_ACCEPT :: -1
+_SOCKET_OPTION_DONT_LINGER        :: -1
+
+_SHUTDOWN_MANNER_RECEIVE :: linux.Shutdown_How.RD
+_SHUTDOWN_MANNER_SEND    :: linux.Shutdown_How.WR
+_SHUTDOWN_MANNER_BOTH    :: linux.Shutdown_How.RDWR

 // Wrappers and unwrappers for system-native types

@@ -347,7 +352,7 @@ _set_option :: proc(sock: Any_Socket, option: Socket_Option, value: any, loc :=
 	int_value: i32
 	timeval_value: linux.Time_Val
 	errno: linux.Errno
-	switch option {
+	#partial switch option {
 	case
 		.Reuse_Address,
 		.Keep_Alive,
@@ -400,10 +405,14 @@ _set_option :: proc(sock: Any_Socket, option: Socket_Option, value: any, loc :=
 				panic("set_option() value must be an integer here", loc)
 			}
 			errno = linux.setsockopt(os_sock, level, int(option), &int_value)
+	case:
+		return .Invalid_Socket
 	}
+
 	if errno != .NONE {
 		return _socket_option_error(errno)
 	}
+
 	return nil
 }

--- a/core/net/socket_others.odin
+++ b/core/net/socket_others.odin
@@ -0,0 +1,105 @@
+#+build !darwin
+#+build !linux
+#+build !freebsd
+#+build !windows
+#+build !netbsd
+#+build !openbsd
+#+private
+package net
+
+_SOCKET_OPTION_BROADCAST                 :: -1 // every option and shutdown constant in this fallback file is the -1 placeholder
+_SOCKET_OPTION_REUSE_ADDRESS             :: -1
+_SOCKET_OPTION_KEEP_ALIVE                :: -1
+_SOCKET_OPTION_OUT_OF_BOUNDS_DATA_INLINE :: -1
+_SOCKET_OPTION_LINGER                    :: -1
+_SOCKET_OPTION_RECEIVE_BUFFER_SIZE       :: -1
+_SOCKET_OPTION_SEND_BUFFER_SIZE          :: -1
+_SOCKET_OPTION_RECEIVE_TIMEOUT           :: -1
+_SOCKET_OPTION_SEND_TIMEOUT              :: -1
+
+_SOCKET_OPTION_TCP_NODELAY :: -1
+
+_SOCKET_OPTION_USE_LOOPBACK              :: -1
+_SOCKET_OPTION_REUSE_PORT                :: -1
+_SOCKET_OPTION_NO_SIGPIPE_FROM_EPIPE     :: -1
+_SOCKET_OPTION_REUSE_PORT_LOAD_BALANCING :: -1
+
+_SOCKET_OPTION_EXCLUSIVE_ADDR_USE :: -1
+_SOCKET_OPTION_CONDITIONAL_ACCEPT :: -1
+_SOCKET_OPTION_DONT_LINGER        :: -1
+
+_SHUTDOWN_MANNER_RECEIVE :: -1 // all three manners share the same placeholder value here
+_SHUTDOWN_MANNER_SEND    :: -1
+_SHUTDOWN_MANNER_BOTH    :: -1
+
+_dial_tcp_from_endpoint :: proc(endpoint: Endpoint, options := DEFAULT_TCP_OPTIONS) -> (sock: TCP_Socket, err: Network_Error) {
+	err = Create_Socket_Error.Network_Unreachable // stub: always fails; `sock` keeps its zero value
+	return
+}
+
+_create_socket :: proc(family: Address_Family, protocol: Socket_Protocol) -> (sock: Any_Socket, err: Create_Socket_Error) {
+	err = .Network_Unreachable // stub: always fails; `sock` keeps its zero value
+	return
+}
+
+_bind :: proc(skt: Any_Socket, ep: Endpoint) -> (err: Bind_Error) {
+	// Stub: both arguments are ignored and the call always fails.
+	return .Network_Unreachable
+}
+
+_listen_tcp :: proc(interface_endpoint: Endpoint, backlog := 1000) -> (skt: TCP_Socket, err: Network_Error) {
+	err = Create_Socket_Error.Network_Unreachable // stub: always fails; `skt` keeps its zero value
+	return
+}
+
+_bound_endpoint :: proc(sock: Any_Socket) -> (ep: Endpoint, err: Socket_Info_Error) {
+	err = .Network_Unreachable // stub: always fails; `ep` keeps its zero value
+	return
+}
+
+_peer_endpoint :: proc(sock: Any_Socket) -> (ep: Endpoint, err: Socket_Info_Error) {
+	err = .Network_Unreachable // stub: always fails; `ep` keeps its zero value
+	return
+}
+
+_accept_tcp :: proc(sock: TCP_Socket, options := DEFAULT_TCP_OPTIONS) -> (client: TCP_Socket, source: Endpoint, err: Accept_Error) {
+	err = .Network_Unreachable // stub: always fails; `client` and `source` keep their zero values
+	return
+}
+
+_close :: proc(skt: Any_Socket) { // stub: empty body, `skt` is ignored
+}
+
+_recv_tcp :: proc(skt: TCP_Socket, buf: []byte) -> (bytes_read: int, err: TCP_Recv_Error) {
+	err = .Network_Unreachable // stub: always fails; `bytes_read` stays 0 and `buf` is not touched
+	return
+}
+
+_recv_udp :: proc(skt: UDP_Socket, buf: []byte) -> (bytes_read: int, remote_endpoint: Endpoint, err: UDP_Recv_Error) {
+	err = .Network_Unreachable // stub: always fails; `bytes_read` stays 0, `remote_endpoint` stays zero-valued
+	return
+}
+
+_send_tcp :: proc(skt: TCP_Socket, buf: []byte) -> (bytes_written: int, err: TCP_Send_Error) {
+	err = .Network_Unreachable // stub: always fails; `bytes_written` stays 0
+	return
+}
+
+_send_udp :: proc(skt: UDP_Socket, buf: []byte, to: Endpoint) -> (bytes_written: int, err: UDP_Send_Error) {
+	err = .Network_Unreachable // stub: always fails; `bytes_written` stays 0
+	return
+}
+
+_shutdown :: proc(skt: Any_Socket, manner: Shutdown_Manner) -> (err: Shutdown_Error) {
+	// Stub: both arguments are ignored and the call always fails.
+	return .Network_Unreachable
+}
+
+_set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #caller_location) -> Socket_Option_Error {
+	return .Network_Unreachable // stub: every option is rejected with the same error; no argument is inspected
+}
+
+_set_blocking :: proc(socket: Any_Socket, should_block: bool) -> (err: Set_Blocking_Error) {
+	// Stub: both arguments are ignored and the call always fails.
+	return .Network_Unreachable
+}
--- a/core/net/socket_darwin.odin
+++ b/core/net/socket_darwin.odin
@@ -1,4 +1,4 @@
-#+build darwin
+#+build darwin, netbsd, openbsd
 package net

 /*
@@ -20,28 +20,33 @@ package net
 		Feoramund:       FreeBSD platform code
 */

-import "core:c"
 import "core:sys/posix"
 import "core:time"

-Socket_Option :: enum c.int {
-	Broadcast                 = c.int(posix.Sock_Option.BROADCAST),
-	Reuse_Address             = c.int(posix.Sock_Option.REUSEADDR),
-	Keep_Alive                = c.int(posix.Sock_Option.KEEPALIVE),
-	Out_Of_Bounds_Data_Inline = c.int(posix.Sock_Option.OOBINLINE),
-	TCP_Nodelay               = c.int(posix.TCP_NODELAY),
-	Linger                    = c.int(posix.Sock_Option.LINGER),
-	Receive_Buffer_Size       = c.int(posix.Sock_Option.RCVBUF),
-	Send_Buffer_Size          = c.int(posix.Sock_Option.SNDBUF),
-	Receive_Timeout           = c.int(posix.Sock_Option.RCVTIMEO),
-	Send_Timeout              = c.int(posix.Sock_Option.SNDTIMEO),
-}
+_SOCKET_OPTION_BROADCAST                 :: posix.Sock_Option.BROADCAST
+_SOCKET_OPTION_REUSE_ADDRESS             :: posix.Sock_Option.REUSEADDR
+_SOCKET_OPTION_KEEP_ALIVE                :: posix.Sock_Option.KEEPALIVE
+_SOCKET_OPTION_OUT_OF_BOUNDS_DATA_INLINE :: posix.Sock_Option.OOBINLINE
+_SOCKET_OPTION_LINGER                    :: posix.Sock_Option.LINGER
+_SOCKET_OPTION_RECEIVE_BUFFER_SIZE       :: posix.Sock_Option.RCVBUF
+_SOCKET_OPTION_SEND_BUFFER_SIZE          :: posix.Sock_Option.SNDBUF
+_SOCKET_OPTION_RECEIVE_TIMEOUT           :: posix.Sock_Option.RCVTIMEO
+_SOCKET_OPTION_SEND_TIMEOUT              :: posix.Sock_Option.SNDTIMEO

-Shutdown_Manner :: enum c.int {
-	Receive = c.int(posix.SHUT_RD),
-	Send    = c.int(posix.SHUT_WR),
-	Both    = c.int(posix.SHUT_RDWR),
-}
+_SOCKET_OPTION_TCP_NODELAY :: posix.TCP_NODELAY
+
+_SOCKET_OPTION_USE_LOOPBACK              :: -1 // -1 placeholders: this file gives the options below no native value
+_SOCKET_OPTION_REUSE_PORT                :: -1
+_SOCKET_OPTION_NO_SIGPIPE_FROM_EPIPE     :: -1
+_SOCKET_OPTION_REUSE_PORT_LOAD_BALANCING :: -1
+
+_SOCKET_OPTION_EXCLUSIVE_ADDR_USE :: -1
+_SOCKET_OPTION_CONDITIONAL_ACCEPT :: -1
+_SOCKET_OPTION_DONT_LINGER        :: -1
+
+_SHUTDOWN_MANNER_RECEIVE :: posix.SHUT_RD
+_SHUTDOWN_MANNER_SEND    :: posix.SHUT_WR
+_SHUTDOWN_MANNER_BOTH    :: posix.SHUT_RDWR

@(private)
 _create_socket :: proc(family: Address_Family, protocol: Socket_Protocol) -> (socket: Any_Socket, err: Create_Socket_Error) {
@@ -273,7 +278,7 @@ _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #ca
 	ptr: rawptr
 	len: posix.socklen_t

-	switch option {
+	#partial switch option {
 	case
 		.Broadcast,
 		.Reuse_Address,
@@ -327,6 +332,8 @@ _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #ca
 			}
 			ptr = &int_value
 			len = size_of(int_value)
+	case:
+		return .Invalid_Option
 	}

 	skt := any_socket_to_socket(s)
--- a/core/net/socket_windows.odin
+++ b/core/net/socket_windows.odin
@@ -24,59 +24,30 @@ import "core:c"
 import win "core:sys/windows"
 import "core:time"

-Socket_Option :: enum c.int {
-	// bool: Whether the address that this socket is bound to can be reused by other sockets.
-	//       This allows you to bypass the cooldown period if a program dies while the socket is bound.
-	Reuse_Address             = win.SO_REUSEADDR,
+_SOCKET_OPTION_BROADCAST                 :: win.SO_BROADCAST
+_SOCKET_OPTION_REUSE_ADDRESS             :: win.SO_REUSEADDR
+_SOCKET_OPTION_KEEP_ALIVE                :: win.SO_KEEPALIVE
+_SOCKET_OPTION_OUT_OF_BOUNDS_DATA_INLINE :: win.SO_OOBINLINE
+_SOCKET_OPTION_LINGER                    :: win.SO_LINGER
+_SOCKET_OPTION_RECEIVE_BUFFER_SIZE       :: win.SO_RCVBUF
+_SOCKET_OPTION_SEND_BUFFER_SIZE          :: win.SO_SNDBUF
+_SOCKET_OPTION_RECEIVE_TIMEOUT           :: win.SO_RCVTIMEO
+_SOCKET_OPTION_SEND_TIMEOUT              :: win.SO_SNDTIMEO

-	// bool: Whether other programs will be inhibited from binding the same endpoint as this socket.
-	Exclusive_Addr_Use        = win.SO_EXCLUSIVEADDRUSE,
+_SOCKET_OPTION_TCP_NODELAY :: win.TCP_NODELAY

-	// bool: When true, keepalive packets will be automatically be sent for this connection. TODO: verify this understanding
-	Keep_Alive                = win.SO_KEEPALIVE, 
+_SOCKET_OPTION_USE_LOOPBACK              :: -1 // -1 placeholders: this file gives these four options no native value
+_SOCKET_OPTION_REUSE_PORT                :: -1
+_SOCKET_OPTION_NO_SIGPIPE_FROM_EPIPE     :: -1
+_SOCKET_OPTION_REUSE_PORT_LOAD_BALANCING :: -1

-	// bool: When true, client connections will immediately be sent a TCP/IP RST response, rather than being accepted.
-	Conditional_Accept        = win.SO_CONDITIONAL_ACCEPT,
+_SOCKET_OPTION_EXCLUSIVE_ADDR_USE :: win.SO_EXCLUSIVEADDRUSE
+_SOCKET_OPTION_CONDITIONAL_ACCEPT :: win.SO_CONDITIONAL_ACCEPT
+_SOCKET_OPTION_DONT_LINGER        :: win.SO_DONTLINGER

-	// bool: If true, when the socket is closed, but data is still waiting to be sent, discard that data.
-	Dont_Linger               = win.SO_DONTLINGER,
-
-	// bool: When true, 'out-of-band' data sent over the socket will be read by a normal net.recv() call, the same as normal 'in-band' data.
-	Out_Of_Bounds_Data_Inline = win.SO_OOBINLINE,   
-
-	// bool: When true, disables send-coalescing, therefore reducing latency.
-	TCP_Nodelay               = win.TCP_NODELAY, 
-
-	// win.LINGER: Customizes how long (if at all) the socket will remain open when there
-	// is some remaining data waiting to be sent, and net.close() is called.
-	Linger                    = win.SO_LINGER, 
-
-	// win.DWORD: The size, in bytes, of the OS-managed receive-buffer for this socket.
-	Receive_Buffer_Size       = win.SO_RCVBUF, 
-
-	// win.DWORD: The size, in bytes, of the OS-managed send-buffer for this socket.
-	Send_Buffer_Size          = win.SO_SNDBUF,
-
-	// win.DWORD: For blocking sockets, the time in milliseconds to wait for incoming data to be received, before giving up and returning .Timeout.
-	//            For non-blocking sockets, ignored.
-	//            Use a value of zero to potentially wait forever.
-	Receive_Timeout           = win.SO_RCVTIMEO,
-
-	// win.DWORD: For blocking sockets, the time in milliseconds to wait for outgoing data to be sent, before giving up and returning .Timeout.
-	//            For non-blocking sockets, ignored.
-	//            Use a value of zero to potentially wait forever.
-	Send_Timeout              = win.SO_SNDTIMEO,
-
-	// bool: Allow sending to, receiving from, and binding to, a broadcast address.
-	Broadcast                 = win.SO_BROADCAST, 
-}
-
-
-Shutdown_Manner :: enum c.int {
-	Receive = win.SD_RECEIVE,
-	Send    = win.SD_SEND,
-	Both    = win.SD_BOTH,
-}
+_SHUTDOWN_MANNER_RECEIVE :: win.SD_RECEIVE
+_SHUTDOWN_MANNER_SEND    :: win.SD_SEND
+_SHUTDOWN_MANNER_BOTH    :: win.SD_BOTH

@(init, private)
 ensure_winsock_initialized :: proc "contextless" () {
@@ -322,7 +293,7 @@ _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #ca
 	ptr: rawptr
 	len: c.int

-	switch option {
+	#partial switch option {
 	case
 		.Reuse_Address,
 		.Exclusive_Addr_Use,
@@ -383,6 +354,8 @@ _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #ca
 			}
 			ptr = &int_value
 			len = size_of(int_value)
+	case:
+		return .Invalid_Option
 	}

 	socket := any_socket_to_socket(s)
--- a/core/odin/ast/ast.odin
+++ b/core/odin/ast/ast.odin
@@ -17,6 +17,11 @@ Proc_Inlining :: enum u32 {
 	No_Inline = 2,
 }

+Proc_Tailing :: enum u32 { // NOTE(review): presumably records a "must tail-call" marker, by analogy with `Proc_Inlining` — confirm against the parser
+	None      = 0,
+	Must_Tail = 1,
+}
+
 Proc_Calling_Convention_Extra :: enum i32 {
 	Foreign_Block_Default,
 }
@@ -147,6 +152,7 @@ Proc_Lit :: struct {
 	body:          ^Stmt, // nil when it represents a foreign procedure
 	tags:          Proc_Tags,
 	inlining:      Proc_Inlining,
+	tailing:       Proc_Tailing,
 	where_token:   tokenizer.Token,
 	where_clauses: []^Expr,
 }
@@ -243,6 +249,7 @@ Matrix_Index_Expr :: struct {
 Call_Expr :: struct {
 	using node: Expr,
 	inlining: Proc_Inlining,
+	tailing:  Proc_Tailing,
 	expr:     ^Expr,
 	open:     tokenizer.Pos,
 	args:     []^Expr,
@@ -791,6 +798,7 @@ Struct_Type :: struct {
 	is_raw_union:    bool,
 	is_no_copy:      bool,
 	is_all_or_none:  bool,
+	is_simple:       bool,
 	fields:          ^Field_List,
 	name_count:      int,
 }
--- a/Show More
+++ b/Show More