diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index f6e50e60b..6c5200c7c 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -169,22 +169,22 @@ jobs:
         run: python -c "import sys; print(sys.version)"
 
       - name: Download Windows artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v4.1.7
         with:
           name: windows_artifacts
 
       - name: Download Ubuntu artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v4.1.7
         with:
           name: linux_artifacts
 
       - name: Download macOS artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v4.1.7
         with:
           name: macos_artifacts
 
       - name: Download macOS arm artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v4.1.7
         with:
           name: macos_arm_artifacts
 
diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index 5a510951e..45eb44307 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -476,11 +476,9 @@ last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check {
 	// worth vectorizing assuming there is a hardware vector unit, and
 	// the data size is large enough.
 	if i < SIMD_REG_SIZE_128 {
-		if i > 0 { // Handle s == nil.
-			for /**/; i >= 0; i -= 1 {
-				if s[i] == c {
-					return i
-				}
+		#reverse for ch, j in s {
+			if ch == c {
+				return j
 			}
 		}
 		return -1
diff --git a/core/debug/trace/trace_nil.odin b/core/debug/trace/trace_nil.odin
index 8611d7726..ca8bd7817 100644
--- a/core/debug/trace/trace_nil.odin
+++ b/core/debug/trace/trace_nil.odin
@@ -1,4 +1,6 @@
-//+build !windows !linux !darwin
+//+build !windows
+//+build !linux
+//+build !darwin
 package debug_trace
 
 import "base:runtime"
diff --git a/core/flags/errors_nonbsd.odin b/core/flags/errors_nonbsd.odin
index a77f12abf..e129aff74 100644
--- a/core/flags/errors_nonbsd.odin
+++ b/core/flags/errors_nonbsd.odin
@@ -1,4 +1,5 @@
-//+build !netbsd !openbsd
+//+build !netbsd
+//+build !openbsd
 package flags
 
 import "base:runtime"
diff --git a/core/flags/internal_rtti_nonbsd.odin b/core/flags/internal_rtti_nonbsd.odin
index 27fdb3b75..0044898d5 100644
--- a/core/flags/internal_rtti_nonbsd.odin
+++ b/core/flags/internal_rtti_nonbsd.odin
@@ -1,5 +1,6 @@
 //+private
-//+build !netbsd !openbsd
+//+build !netbsd
+//+build !openbsd
 package flags
 
 import "core:net"
diff --git a/core/math/math.odin b/core/math/math.odin
index f5e904da6..0e21afa67 100644
--- a/core/math/math.odin
+++ b/core/math/math.odin
@@ -444,11 +444,11 @@ bias :: proc "contextless" (t, b: $T) -> T where intrinsics.type_is_numeric(T) {
 	return t / (((1/b) - 2) * (1 - t) + 1)
 }
 @(require_results)
-gain :: proc "contextless" (t, g: $T) -> T where intrinsics.type_is_numeric(T) {
+gain :: proc "contextless" (t, g: $T) -> T where intrinsics.type_is_float(T) {
 	if t < 0.5 {
-		return bias(t*2, g)*0.5
+		return bias(t*2, g) * 0.5
 	}
-	return bias(t*2 - 1, 1 - g)*0.5 + 0.5
+	return bias(t*2 - 1, 1 - g) * 0.5 + 0.5
 }
 
 
diff --git a/core/os/os2/heap_linux.odin b/core/os/os2/heap_linux.odin
index e80bb3dee..e765c320b 100644
--- a/core/os/os2/heap_linux.odin
+++ b/core/os/os2/heap_linux.odin
@@ -1,17 +1,10 @@
 //+private
 package os2
 
-import "base:runtime"
-
 import "core:sys/linux"
 import "core:sync"
 import "core:mem"
 
-// Use the experimental custom heap allocator (over calling `malloc` etc.).
-// This is a switch because there are thread-safety problems that need to be fixed.
-// See: https://github.com/odin-lang/Odin/issues/4161
-USE_EXPERIMENTAL_ALLOCATOR :: #config(OS2_LINUX_USE_EXPERIMENTAL_ALLOCATOR, false)
-
 // NOTEs
 //
 // All allocations below DIRECT_MMAP_THRESHOLD exist inside of memory "Regions." A region
@@ -146,8 +139,6 @@ Region :: struct {
 	memory: [BLOCKS_PER_REGION]Allocation_Header,
 }
 
-when USE_EXPERIMENTAL_ALLOCATOR {
-
 _heap_allocator_proc :: proc(allocator_data: rawptr, mode: mem.Allocator_Mode,
                             size, alignment: int,
                             old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, mem.Allocator_Error) {
@@ -228,10 +219,6 @@ _heap_allocator_proc :: proc(allocator_data: rawptr, mode: mem.Allocator_Mode,
 	return nil, nil
 }
 
-} else {
-	_heap_allocator_proc :: runtime.heap_allocator_proc
-}
-
 heap_alloc :: proc(size: int) -> rawptr {
 	if size >= DIRECT_MMAP_THRESHOLD {
 		return _direct_mmap_alloc(size)
@@ -293,7 +280,8 @@ heap_alloc :: proc(size: int) -> rawptr {
 		_local_region, back_idx = _region_retrieve_with_space(blocks_needed, local_region_idx, back_idx)
 	}
 	user_ptr, used := _region_get_block(_local_region, idx, blocks_needed)
-	_local_region.hdr.free_blocks -= (used + 1)
+
+	sync.atomic_sub_explicit(&_local_region.hdr.free_blocks, used + 1, .Release)
 
 	// If this memory was ever used before, it now needs to be zero'd.
 	if idx < _local_region.hdr.last_used {
@@ -320,7 +308,7 @@ heap_resize :: proc(old_memory: rawptr, new_size: int) -> rawptr #no_bounds_chec
 
 heap_free :: proc(memory: rawptr) {
 	alloc := _get_allocation_header(memory)
-	if alloc.requested & IS_DIRECT_MMAP == IS_DIRECT_MMAP {
+	if sync.atomic_load(&alloc.requested) & IS_DIRECT_MMAP == IS_DIRECT_MMAP {
 		_direct_mmap_free(alloc)
 		return
 	}
@@ -475,25 +463,31 @@ _region_local_free :: proc(alloc: ^Allocation_Header) #no_bounds_check {
 	alloc := alloc
 	add_to_free_list := true
 
-	_local_region.hdr.free_blocks += _get_block_count(alloc^) + 1
+	idx := sync.atomic_load(&alloc.idx)
+	prev := sync.atomic_load(&alloc.prev)
+	next := sync.atomic_load(&alloc.next)
+	block_count := next - idx - 1
+	free_blocks := sync.atomic_load(&_local_region.hdr.free_blocks) + block_count + 1
+	sync.atomic_store_explicit(&_local_region.hdr.free_blocks, free_blocks, .Release)
 
 	// try to merge with prev
-	if alloc.idx > 0 && _local_region.memory[alloc.prev].free_idx != NOT_FREE {
-		_local_region.memory[alloc.prev].next = alloc.next
-		_local_region.memory[alloc.next].prev = alloc.prev
-		alloc = &_local_region.memory[alloc.prev]
+	if idx > 0 && sync.atomic_load(&_local_region.memory[prev].free_idx) != NOT_FREE {
+		sync.atomic_store_explicit(&_local_region.memory[prev].next, next, .Release)
+		_local_region.memory[next].prev = prev
+		alloc = &_local_region.memory[prev]
 		add_to_free_list = false
 	}
 
 	// try to merge with next
-	if alloc.next < BLOCKS_PER_REGION - 1 && _local_region.memory[alloc.next].free_idx != NOT_FREE {
-		old_next := alloc.next
-		alloc.next = _local_region.memory[old_next].next
-		_local_region.memory[alloc.next].prev = alloc.idx
+	if next < BLOCKS_PER_REGION - 1 && sync.atomic_load(&_local_region.memory[next].free_idx) != NOT_FREE {
+		old_next := next
+		// Re-link past `old_next`: the block after it must point back at the merged block (`alloc`), as before.
+		sync.atomic_store_explicit(&alloc.next, sync.atomic_load(&_local_region.memory[old_next].next), .Release)
+		sync.atomic_store_explicit(&_local_region.memory[sync.atomic_load(&alloc.next)].prev, sync.atomic_load(&alloc.idx), .Release)
 
 		if add_to_free_list {
-			_local_region.hdr.free_list[_local_region.memory[old_next].free_idx] = alloc.idx
-			alloc.free_idx = _local_region.memory[old_next].free_idx
+		        sync.atomic_store_explicit(&_local_region.hdr.free_list[_local_region.memory[old_next].free_idx], idx, .Release)
+		        sync.atomic_store_explicit(&alloc.free_idx, _local_region.memory[old_next].free_idx, .Release)
 		} else {
 			// NOTE: We have aleady merged with prev, and now merged with next.
 			//       Now, we are actually going to remove from the free_list.
@@ -505,10 +499,11 @@ _region_local_free :: proc(alloc: ^Allocation_Header) #no_bounds_check {
 	// This is the only place where anything is appended to the free list.
 	if add_to_free_list {
 		fl := _local_region.hdr.free_list
-		alloc.free_idx = _local_region.hdr.free_list_len
-		fl[alloc.free_idx] = alloc.idx
-		_local_region.hdr.free_list_len += 1
-		if int(_local_region.hdr.free_list_len) == len(fl) {
+		fl_len := sync.atomic_load(&_local_region.hdr.free_list_len)
+		sync.atomic_store_explicit(&alloc.free_idx, fl_len, .Release)
+		fl[alloc.free_idx] = idx
+		sync.atomic_store_explicit(&_local_region.hdr.free_list_len, fl_len + 1, .Release)
+		if int(fl_len + 1) == len(fl) {
 			free_alloc := _get_allocation_header(mem.raw_data(_local_region.hdr.free_list))
 			_region_resize(free_alloc, len(fl) * 2 * size_of(fl[0]), true)
 		}
@@ -525,8 +520,8 @@ _region_assign_free_list :: proc(region: ^Region, memory: rawptr, blocks: u16) {
 _region_retrieve_with_space :: proc(blocks: u16, local_idx: int = -1, back_idx: int = -1) -> (^Region, int) {
 	r: ^Region
 	idx: int
-	for r = global_regions; r != nil; r = r.hdr.next_region {
-		if idx == local_idx || idx < back_idx || r.hdr.free_blocks < blocks {
+	for r = sync.atomic_load(&global_regions); r != nil; r = r.hdr.next_region {
+		if idx == local_idx || idx < back_idx || sync.atomic_load(&r.hdr.free_blocks) < blocks {
 			idx += 1
 			continue
 		}
@@ -594,7 +589,7 @@ _region_segment :: proc(region: ^Region, alloc: ^Allocation_Header, blocks, new_
 
 _region_get_local_idx :: proc() -> int {
 	idx: int
-	for r := global_regions; r != nil; r = r.hdr.next_region {
+	for r := sync.atomic_load(&global_regions); r != nil; r = r.hdr.next_region {
 		if r == _local_region {
 			return idx
 		}
@@ -610,9 +605,10 @@ _region_find_and_assign_local :: proc(alloc: ^Allocation_Header) {
 		_local_region = _region_retrieve_from_addr(alloc)
 	}
 
-	// At this point, _local_region is set correctly. Spin until acquired
-	res: ^^Region
-	for res != &_local_region {
+	// At this point, _local_region is set correctly. Spin until acquired
+	res := CURRENTLY_ACTIVE
+
+	for res == CURRENTLY_ACTIVE {
 		res = sync.atomic_compare_exchange_strong_explicit(
 			&_local_region.hdr.local_addr,
 			&_local_region,
@@ -634,9 +630,9 @@ _region_contains_mem :: proc(r: ^Region, memory: rawptr) -> bool #no_bounds_chec
 _region_free_list_remove :: proc(region: ^Region, free_idx: u16) #no_bounds_check {
 	// pop, swap and update allocation hdr
 	if n := region.hdr.free_list_len - 1; free_idx != n {
-		region.hdr.free_list[free_idx] = region.hdr.free_list[n]
+		region.hdr.free_list[free_idx] = sync.atomic_load(&region.hdr.free_list[n]) 
 		alloc_idx := region.hdr.free_list[free_idx]
-		region.memory[alloc_idx].free_idx = free_idx
+		sync.atomic_store_explicit(&region.memory[alloc_idx].free_idx, free_idx, .Release)
 	}
 	region.hdr.free_list_len -= 1
 }
@@ -727,3 +723,4 @@ _get_allocation_header :: #force_inline proc(raw_mem: rawptr) -> ^Allocation_Hea
 _round_up_to_nearest :: #force_inline proc(size, round: int) -> int {
 	return (size-1) + round - (size-1) % round
 }
+
diff --git a/core/strconv/strconv.odin b/core/strconv/strconv.odin
index dce9f834a..b1155c22f 100644
--- a/core/strconv/strconv.odin
+++ b/core/strconv/strconv.odin
@@ -7,11 +7,11 @@ Parses a boolean value from the input string
 
 **Inputs**  
 - s: The input string  
-  - true: "1", "t", "T", "true", "TRUE", "True"
-  - false: "0", "f", "F", "false", "FALSE", "False"
+	- true: "1", "t", "T", "true", "TRUE", "True"
+	- false: "0", "f", "F", "false", "FALSE", "False"
 - n: An optional pointer to an int to store the length of the parsed substring (default: nil)
 
-**Returns**  
+**Returns**
 - result: The parsed boolean value (default: false)
 - ok: A boolean indicating whether the parsing was successful
 */
@@ -29,7 +29,7 @@ parse_bool :: proc(s: string, n: ^int = nil) -> (result: bool = false, ok: bool)
 /*
 Finds the integer value of the given rune
 
-**Inputs**  
+**Inputs**
 - r: The input rune to find the integer value of
 
 **Returns**   The integer value of the given rune
@@ -47,7 +47,7 @@ _digit_value :: proc(r: rune) -> int {
 /*
 Parses an integer value from the input string in the given base, without a prefix
 
-**Inputs**  
+**Inputs**
 - str: The input string to parse the integer value from
 - base: The base of the integer value to be parsed (must be between 1 and 16)
 - n: An optional pointer to an int to store the length of the parsed substring (default: nil)
@@ -65,7 +65,7 @@ Output:
 
 	-1234 false
 
-**Returns**  
+**Returns**
 - value: Parses an integer value from a string, in the given base, without a prefix.
 - ok: ok=false if no numeric value of the appropriate base could be found, or if the input string contained more than just the number.
 */
@@ -117,12 +117,12 @@ parse_i64_of_base :: proc(str: string, base: int, n: ^int = nil) -> (value: i64,
 /*
 Parses an integer value from the input string in base 10, unless there's a prefix
 
-**Inputs**  
+**Inputs**
 - str: The input string to parse the integer value from
 - n: An optional pointer to an int to store the length of the parsed substring (default: nil)
 
 Example:
-	
+
 	import "core:fmt"
 	import "core:strconv"
 	parse_i64_maybe_prefixed_example :: proc() {
@@ -132,13 +132,13 @@ Example:
 		n, ok = strconv.parse_i64_maybe_prefixed("0xeeee")
 		fmt.println(n,ok)
 	}
-	
+
 Output:
 
 	1234 true
 	61166 true
 
-**Returns**  
+**Returns**
 - value: The parsed integer value
 - ok: ok=false if a valid integer could not be found, or if the input string contained more than just the number.
 */
@@ -200,14 +200,14 @@ parse_i64 :: proc{parse_i64_maybe_prefixed, parse_i64_of_base}
 /*
 Parses an unsigned 64-bit integer value from the input string without a prefix, using the specified base
 
-**Inputs**  
+**Inputs**
 - str: The input string to parse
 - base: The base of the number system to use for parsing
-  - Must be between 1 and 16 (inclusive)
+	- Must be between 1 and 16 (inclusive)
 - n: An optional pointer to an int to store the length of the parsed substring (default: nil)
 
 Example:
-	
+
 	import "core:fmt"
 	import "core:strconv"
 	parse_u64_of_base_example :: proc() {
@@ -217,13 +217,13 @@ Example:
 		n, ok = strconv.parse_u64_of_base("5678eee",16)
 		fmt.println(n,ok)
 	}
-	
+
 Output:
 
 	1234 false
 	90672878 true
 
-**Returns**  
+**Returns**
 - value: The parsed uint64 value
 - ok: A boolean indicating whether the parsing was successful
 */
@@ -261,15 +261,15 @@ parse_u64_of_base :: proc(str: string, base: int, n: ^int = nil) -> (value: u64,
 /*
 Parses an unsigned 64-bit integer value from the input string, using the specified base or inferring the base from a prefix
 
-**Inputs**  
+**Inputs**
 - str: The input string to parse
 - base: The base of the number system to use for parsing (default: 0)
-  - If base is 0, it will be inferred based on the prefix in the input string (e.g. '0x' for hexadecimal)
-  - If base is not 0, it will be used for parsing regardless of any prefix in the input string
+	- If base is 0, it will be inferred based on the prefix in the input string (e.g. '0x' for hexadecimal)
+	- If base is not 0, it will be used for parsing regardless of any prefix in the input string
 - n: An optional pointer to an int to store the length of the parsed substring (default: nil)
 
 Example:
-	
+
 	import "core:fmt"
 	import "core:strconv"
 	parse_u64_maybe_prefixed_example :: proc() {
@@ -279,13 +279,13 @@ Example:
 		n, ok = strconv.parse_u64_maybe_prefixed("0xee")
 		fmt.println(n,ok)
 	}
-	
+
 Output:
 
 	1234 true
 	238 true
 
-**Returns**  
+**Returns**
 - value: The parsed uint64 value
 - ok: ok=false if a valid integer could not be found, if the value was negative, or if the input string contained more than just the number.
 */
@@ -336,14 +336,14 @@ parse_u64 :: proc{parse_u64_maybe_prefixed, parse_u64_of_base}
 /*
 Parses a signed integer value from the input string, using the specified base or inferring the base from a prefix
 
-**Inputs**  
+**Inputs**
 - s: The input string to parse
 - base: The base of the number system to use for parsing (default: 0)
-  - If base is 0, it will be inferred based on the prefix in the input string (e.g. '0x' for hexadecimal)
-  - If base is not 0, it will be used for parsing regardless of any prefix in the input string
+	- If base is 0, it will be inferred based on the prefix in the input string (e.g. '0x' for hexadecimal)
+	- If base is not 0, it will be used for parsing regardless of any prefix in the input string
 
 Example:
-	
+
 	import "core:fmt"
 	import "core:strconv"
 	parse_int_example :: proc() {
@@ -356,14 +356,14 @@ Example:
 		n, ok = strconv.parse_int("0xffff") // with prefix and inferred base
 		fmt.println(n,ok)
 	}
-	
+
 Output:
 
 	1234 true
 	65535 true
 	65535 true
 
-**Returns**  
+**Returns**
 - value: The parsed int value
 - ok: `false` if no appropriate value could be found, or if the input string contained more than just the number.
 */
@@ -379,11 +379,11 @@ parse_int :: proc(s: string, base := 0, n: ^int = nil) -> (value: int, ok: bool)
 /*
 Parses an unsigned integer value from the input string, using the specified base or inferring the base from a prefix
 
-**Inputs**  
+**Inputs**
 - s: The input string to parse
 - base: The base of the number system to use for parsing (default: 0, inferred)
-  - If base is 0, it will be inferred based on the prefix in the input string (e.g. '0x' for hexadecimal)
-  - If base is not 0, it will be used for parsing regardless of any prefix in the input string
+	- If base is 0, it will be inferred based on the prefix in the input string (e.g. '0x' for hexadecimal)
+	- If base is not 0, it will be used for parsing regardless of any prefix in the input string
 
 Example:
 	
@@ -1729,7 +1729,7 @@ quote_rune :: proc(buf: []byte, r: rune) -> string {
 		}
 	}
 
-	if buf == nil {
+	if buf == nil || r < 0 {
 		return ""
 	}
 
diff --git a/core/sync/chan/chan.odin b/core/sync/chan/chan.odin
index 0c98124de..53a3bff4b 100644
--- a/core/sync/chan/chan.odin
+++ b/core/sync/chan/chan.odin
@@ -421,21 +421,21 @@ raw_queue_pop :: proc "contextless" (q: ^Raw_Queue) -> (data: rawptr) {
 
 @(require_results)
 can_recv :: proc "contextless" (c: ^Raw_Chan) -> bool {
+	sync.guard(&c.mutex)
 	if is_buffered(c) {
-		return len(c) > 0
+		// The mutex is already held here: read the queue length directly, as `can_send` does.
+		return c.queue.len > 0
 	}
-	sync.guard(&c.mutex)
 	return sync.atomic_load(&c.w_waiting) > 0
 }
 
 
 @(require_results)
 can_send :: proc "contextless" (c: ^Raw_Chan) -> bool {
-	if is_buffered(c) {
-		sync.guard(&c.mutex)
-		return len(c) < cap(c)
-	}
 	sync.guard(&c.mutex)
+	if is_buffered(c) {
+		return c.queue.len < c.queue.cap
+	}
 	return sync.atomic_load(&c.r_waiting) > 0
 }
 
diff --git a/core/testing/signal_handler_other.odin b/core/testing/signal_handler_other.odin
index 6f39205c7..d6d494fa4 100644
--- a/core/testing/signal_handler_other.odin
+++ b/core/testing/signal_handler_other.odin
@@ -1,5 +1,11 @@
 //+private
-//+build !windows !linux !darwin !freebsd !openbsd !netbsd !haiku
+//+build !windows
+//+build !linux
+//+build !darwin
+//+build !freebsd
+//+build !openbsd
+//+build !netbsd
+//+build !haiku
 package testing
 
 /*
diff --git a/src/build_settings.cpp b/src/build_settings.cpp
index fe0e478c7..e86224665 100644
--- a/src/build_settings.cpp
+++ b/src/build_settings.cpp
@@ -430,6 +430,7 @@ struct BuildContext {
 	bool   json_errors;
 	bool   has_ansi_terminal_colours;
 
+	bool   fast_isel;
 	bool   ignore_lazy;
 	bool   ignore_llvm_build;
 	bool   ignore_panic;
@@ -2048,10 +2049,11 @@ gb_internal bool init_build_paths(String init_filename) {
 	gbFile      output_file_test;
 	const char* output_file_name = (const char*)output_file.text;
 	gbFileError output_test_err = gb_file_open_mode(&output_file_test, gbFileMode_Append | gbFileMode_Rw, output_file_name);
-	gb_file_close(&output_file_test);
-	gb_file_remove(output_file_name);
 
-	if (output_test_err != 0) {
+	if (output_test_err == 0) {
+		gb_file_close(&output_file_test);
+		gb_file_remove(output_file_name);
+	} else {
 		String output_file = path_to_string(ha, bc->build_paths[BuildPath_Output]);
 		defer (gb_free(ha, output_file.text));
 		gb_printf_err("No write permissions for output path: %.*s\n", LIT(output_file));
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index 910e7ffdb..888aa074d 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -5203,6 +5203,16 @@ gb_internal bool check_builtin_procedure(CheckerContext *c, Operand *operand, As
 				return false;
 			}
 
+			if (sz >= 64) {
+				if (is_type_unsigned(x.type)) {
+					add_package_dependency(c, "runtime", "umodti3", true);
+					add_package_dependency(c, "runtime", "udivti3", true);
+				} else {
+					add_package_dependency(c, "runtime", "modti3", true);
+					add_package_dependency(c, "runtime", "divti3", true);
+				}
+			}
+
 			operand->type = x.type;
 			operand->mode = Addressing_Value;
 		}
diff --git a/src/check_expr.cpp b/src/check_expr.cpp
index 6ab87da09..27ba2448e 100644
--- a/src/check_expr.cpp
+++ b/src/check_expr.cpp
@@ -3615,7 +3615,7 @@ gb_internal bool check_transmute(CheckerContext *c, Ast *node, Operand *o, Type
 		if (is_type_integer(src_t) && is_type_integer(dst_t)) {
 			if (types_have_same_internal_endian(src_t, dst_t)) {
 				ExactValue src_v = exact_value_to_integer(o->value);
-				GB_ASSERT(src_v.kind == ExactValue_Integer);
+				GB_ASSERT(src_v.kind == ExactValue_Integer || src_v.kind == ExactValue_Invalid);
 				BigInt v = src_v.value_integer;
 
 				BigInt smax = {};
diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp
index f852636a6..01ded321e 100644
--- a/src/llvm_backend.cpp
+++ b/src/llvm_backend.cpp
@@ -3081,6 +3081,13 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {
 		lbModule *m = entry.value;
 		m->target_machine = target_machine;
 		LLVMSetModuleDataLayout(m->mod, LLVMCreateTargetDataLayout(target_machine));
+
+	#if LLVM_VERSION_MAJOR >= 18
+		if (build_context.fast_isel) {
+			LLVMSetTargetMachineFastISel(m->target_machine, true);
+		}
+	#endif
+
 		array_add(&target_machines, target_machine);
 	}
 
diff --git a/src/llvm_backend_debug.cpp b/src/llvm_backend_debug.cpp
index 5d90dccea..68e1efc1c 100644
--- a/src/llvm_backend_debug.cpp
+++ b/src/llvm_backend_debug.cpp
@@ -82,13 +82,36 @@ gb_internal LLVMMetadataRef lb_debug_type_internal_proc(lbModule *m, Type *type)
 			parameter_count += 1;
 		}
 	}
-	LLVMMetadataRef *parameters = gb_alloc_array(permanent_allocator(), LLVMMetadataRef, parameter_count);
 
-	unsigned param_index = 0;
-	if (type->Proc.result_count == 0) {
-		parameters[param_index++] = nullptr;
-	} else {
-		parameters[param_index++] = lb_debug_procedure_parameters(m, type->Proc.results);
+	auto parameters = array_make<LLVMMetadataRef>(permanent_allocator(), 0, type->Proc.param_count+type->Proc.result_count+2);
+
+	array_add(&parameters, cast(LLVMMetadataRef)nullptr);
+
+	bool return_is_tuple = false;
+	if (type->Proc.result_count != 0) {
+		Type *single_ret = reduce_tuple_to_single_type(type->Proc.results);
+		if (is_type_proc(single_ret)) {
+			single_ret = t_rawptr;
+		}
+		if (is_type_tuple(single_ret) && is_calling_convention_odin(type->Proc.calling_convention)) {
+			LLVMTypeRef actual = lb_type_internal_for_procedures_raw(m, type);
+			actual = LLVMGetReturnType(actual);
+			if (actual == nullptr) {
+				// results were passed as a single pointer
+				parameters[0] = lb_debug_procedure_parameters(m, single_ret);
+			} else {
+				LLVMTypeRef possible = lb_type(m, type->Proc.results);
+				if (possible == actual) {
+					// results were returned directly
+					parameters[0] = lb_debug_procedure_parameters(m, single_ret);
+				} else {
+					// results were returned separately
+					return_is_tuple = true;
+				}
+			}
+		} else {
+			parameters[0] = lb_debug_procedure_parameters(m, single_ret);
+		}
 	}
 
 	LLVMMetadataRef file = nullptr;
@@ -98,8 +121,22 @@ gb_internal LLVMMetadataRef lb_debug_type_internal_proc(lbModule *m, Type *type)
 		if (e->kind != Entity_Variable) {
 			continue;
 		}
-		parameters[param_index] = lb_debug_procedure_parameters(m, e->type);
-		param_index += 1;
+		array_add(&parameters, lb_debug_procedure_parameters(m, e->type));
+	}
+
+
+	if (return_is_tuple) {
+		Type *results = type->Proc.results;
+		GB_ASSERT(results != nullptr && results->kind == Type_Tuple);
+		isize count = results->Tuple.variables.count;
+		parameters[0] = lb_debug_procedure_parameters(m, results->Tuple.variables[count-1]->type);
+		for (isize i = 0; i < count-1; i++) {
+			array_add(&parameters, lb_debug_procedure_parameters(m, results->Tuple.variables[i]->type));
+		}
+	}
+
+	if (type->Proc.calling_convention == ProcCC_Odin) {
+		array_add(&parameters, lb_debug_type(m, t_context_ptr));
 	}
 
 	LLVMDIFlags flags = LLVMDIFlagZero;
@@ -107,7 +144,7 @@ gb_internal LLVMMetadataRef lb_debug_type_internal_proc(lbModule *m, Type *type)
 		flags = LLVMDIFlagNoReturn;
 	}
 
-	return LLVMDIBuilderCreateSubroutineType(m->debug_builder, file, parameters, parameter_count, flags);
+	return LLVMDIBuilderCreateSubroutineType(m->debug_builder, file, parameters.data, cast(unsigned)parameters.count, flags);
 }
 
 gb_internal LLVMMetadataRef lb_debug_struct_field(lbModule *m, String const &name, Type *type, u64 offset_in_bits) {
diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp
index f6b9934ef..f20c52e88 100644
--- a/src/llvm_backend_expr.cpp
+++ b/src/llvm_backend_expr.cpp
@@ -705,31 +705,37 @@ gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type
 
 	lbAddr res = lb_add_local_generated(p, type, true);
 
-	i64 row_count = mt->Matrix.row_count;
-	i64 column_count = mt->Matrix.column_count;
-	TEMPORARY_ALLOCATOR_GUARD();
+	GB_ASSERT(type_size_of(type) == type_size_of(m.type));
 
-	auto srcs = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
-	auto dsts = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
+	lbValue m_ptr = lb_address_from_load_or_generate_local(p, m);
+	lbValue n = lb_const_int(p->module, t_int, type_size_of(type));
+	lb_mem_copy_non_overlapping(p, res.addr, m_ptr, n);
 
-	for (i64 j = 0; j < column_count; j++) {
-		for (i64 i = 0; i < row_count; i++) {
-			lbValue src = lb_emit_matrix_ev(p, m, i, j);
-			array_add(&srcs, src);
-		}
-	}
+	// i64 row_count = mt->Matrix.row_count;
+	// i64 column_count = mt->Matrix.column_count;
+	// TEMPORARY_ALLOCATOR_GUARD();
 
-	for (i64 j = 0; j < column_count; j++) {
-		for (i64 i = 0; i < row_count; i++) {
-			lbValue dst = lb_emit_array_epi(p, res.addr, i + j*row_count);
-			array_add(&dsts, dst);
-		}
-	}
+	// auto srcs = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
+	// auto dsts = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
 
-	GB_ASSERT(srcs.count == dsts.count);
-	for_array(i, srcs) {
-		lb_emit_store(p, dsts[i], srcs[i]);
-	}
+	// for (i64 j = 0; j < column_count; j++) {
+	// 	for (i64 i = 0; i < row_count; i++) {
+	// 		lbValue src = lb_emit_matrix_ev(p, m, i, j);
+	// 		array_add(&srcs, src);
+	// 	}
+	// }
+
+	// for (i64 j = 0; j < column_count; j++) {
+	// 	for (i64 i = 0; i < row_count; i++) {
+	// 		lbValue dst = lb_emit_array_epi(p, res.addr, i + j*row_count);
+	// 		array_add(&dsts, dst);
+	// 	}
+	// }
+
+	// GB_ASSERT(srcs.count == dsts.count);
+	// for_array(i, srcs) {
+	// 	lb_emit_store(p, dsts[i], srcs[i]);
+	// }
 	return lb_addr_load(p, res);
 }
 
diff --git a/src/main.cpp b/src/main.cpp
index a03126caf..0a84b2f97 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -389,6 +389,7 @@ enum BuildFlagKind {
 	BuildFlag_PrintLinkerFlags,
 
 	// internal use only
+	BuildFlag_InternalFastISel,
 	BuildFlag_InternalIgnoreLazy,
 	BuildFlag_InternalIgnoreLLVMBuild,
 	BuildFlag_InternalIgnorePanic,
@@ -594,6 +595,7 @@ gb_internal bool parse_build_flags(Array<String> args) {
 
 	add_flag(&build_flags, BuildFlag_PrintLinkerFlags,        str_lit("print-linker-flags"),        BuildFlagParam_None,    Command_build);
 
+	add_flag(&build_flags, BuildFlag_InternalFastISel,        str_lit("internal-fast-isel"),        BuildFlagParam_None,    Command_all);
 	add_flag(&build_flags, BuildFlag_InternalIgnoreLazy,      str_lit("internal-ignore-lazy"),      BuildFlagParam_None,    Command_all);
 	add_flag(&build_flags, BuildFlag_InternalIgnoreLLVMBuild, str_lit("internal-ignore-llvm-build"),BuildFlagParam_None,    Command_all);
 	add_flag(&build_flags, BuildFlag_InternalIgnorePanic,     str_lit("internal-ignore-panic"),     BuildFlagParam_None,    Command_all);
@@ -1408,6 +1410,9 @@ gb_internal bool parse_build_flags(Array<String> args) {
 							build_context.print_linker_flags = true;
 							break;
 
+						case BuildFlag_InternalFastISel:
+							build_context.fast_isel = true;
+							break;
 						case BuildFlag_InternalIgnoreLazy:
 							build_context.ignore_lazy = true;
 							break;
diff --git a/src/types.cpp b/src/types.cpp
index 63182f5c4..a9a7d6dda 100644
--- a/src/types.cpp
+++ b/src/types.cpp
@@ -1474,6 +1474,7 @@ gb_internal i64 matrix_align_of(Type *t, struct TypePath *tp) {
 	
 	Type *elem = t->Matrix.elem;
 	i64 row_count = gb_max(t->Matrix.row_count, 1);
+	i64 column_count = gb_max(t->Matrix.column_count, 1);
 
 	bool pop = type_path_push(tp, elem);
 	if (tp->failure) {
@@ -1491,7 +1492,7 @@ gb_internal i64 matrix_align_of(Type *t, struct TypePath *tp) {
 	// could be maximally aligned but as a compromise, having no padding will be
 	// beneficial to third libraries that assume no padding
 	
-	i64 total_expected_size = row_count*t->Matrix.column_count*elem_size;
+	i64 total_expected_size = row_count*column_count*elem_size;
 	// i64 min_alignment = prev_pow2(elem_align * row_count);
 	i64 min_alignment = prev_pow2(total_expected_size);
 	while (total_expected_size != 0 && (total_expected_size % min_alignment) != 0) {
@@ -1523,12 +1524,15 @@ gb_internal i64 matrix_type_stride_in_bytes(Type *t, struct TypePath *tp) {
 	i64 stride_in_bytes = 0;
 	
 	// NOTE(bill, 2021-10-25): The alignment strategy here is to have zero padding
-	// It would be better for performance to pad each column so that each column
+	// It would be better for performance to pad each column/row so that each column/row
 	// could be maximally aligned but as a compromise, having no padding will be
 	// beneficial to third libraries that assume no padding
-	i64 row_count = t->Matrix.row_count;
-	stride_in_bytes = elem_size*row_count;
-	
+
+	if (t->Matrix.is_row_major) {
+		stride_in_bytes = elem_size*t->Matrix.column_count;
+	} else {
+		stride_in_bytes = elem_size*t->Matrix.row_count;
+	}
 	t->Matrix.stride_in_bytes = stride_in_bytes;
 	return stride_in_bytes;
 }
@@ -4217,7 +4221,11 @@ gb_internal i64 type_size_of_internal(Type *t, TypePath *path) {
 	
 	case Type_Matrix: {
 		i64 stride_in_bytes = matrix_type_stride_in_bytes(t, path);
-		return stride_in_bytes * t->Matrix.column_count;
+		if (t->Matrix.is_row_major) {
+			return stride_in_bytes * t->Matrix.row_count;
+		} else {
+			return stride_in_bytes * t->Matrix.column_count;
+		}
 	}
 
 	case Type_BitField:
diff --git a/tests/core/bytes/test_core_bytes.odin b/tests/core/bytes/test_core_bytes.odin
index fb3c460aa..7f078c423 100644
--- a/tests/core/bytes/test_core_bytes.odin
+++ b/tests/core/bytes/test_core_bytes.odin
@@ -87,3 +87,11 @@ test_index_byte_zero :: proc(t: ^testing.T) {
 		}
 	}
 }
+
+@test
+test_last_index_byte_bounds :: proc(t: ^testing.T) {
+	input := "helloworld.odin."
+	assert(len(input) == 16)
+	idx := bytes.last_index_byte(transmute([]byte)(input[:len(input)-1]), '.')
+	testing.expect_value(t, idx, 10)
+}
\ No newline at end of file
diff --git a/tests/core/net/test_core_net.odin b/tests/core/net/test_core_net.odin
index f38fa11e6..8a9272882 100644
--- a/tests/core/net/test_core_net.odin
+++ b/tests/core/net/test_core_net.odin
@@ -10,7 +10,8 @@
 
 	A test suite for `core:net`
 */
-//+build !netbsd !openbsd
+//+build !netbsd
+//+build !openbsd
 package test_core_net
 
 import "core:testing"
diff --git a/tests/issues/run.bat b/tests/issues/run.bat
index 299e08791..dcea3d483 100644
--- a/tests/issues/run.bat
+++ b/tests/issues/run.bat
@@ -15,6 +15,7 @@ set COMMON=-define:ODIN_TEST_FANCY=false -file -vet -strict-style
 ..\..\..\odin test ..\test_issue_2615.odin %COMMON%  || exit /b
 ..\..\..\odin test ..\test_issue_2637.odin %COMMON%  || exit /b
 ..\..\..\odin test ..\test_issue_2666.odin %COMMON%  || exit /b
+..\..\..\odin test ..\test_issue_4210.odin %COMMON%  || exit /b
 
 @echo off
 
diff --git a/tests/issues/run.sh b/tests/issues/run.sh
index 8b4c1e7f2..c3bc00e24 100755
--- a/tests/issues/run.sh
+++ b/tests/issues/run.sh
@@ -16,6 +16,7 @@ $ODIN test ../test_issue_2466.odin $COMMON
 $ODIN test ../test_issue_2615.odin $COMMON
 $ODIN test ../test_issue_2637.odin $COMMON
 $ODIN test ../test_issue_2666.odin $COMMON
+$ODIN test ../test_issue_4210.odin $COMMON
 if [[ $($ODIN build ../test_issue_2395.odin $COMMON 2>&1 >/dev/null | grep -c "Error:") -eq 2 ]] ; then
 	echo "SUCCESSFUL 1/1"
 else
diff --git a/tests/issues/test_issue_4210.odin b/tests/issues/test_issue_4210.odin
new file mode 100644
index 000000000..f50086a4e
--- /dev/null
+++ b/tests/issues/test_issue_4210.odin
@@ -0,0 +1,85 @@
+// Tests issue #4210 https://github.com/odin-lang/Odin/issues/4210
+package test_issues
+
+import "core:testing"
+import "base:intrinsics"
+
+@test
+test_row_major_matrix :: proc(t: ^testing.T) { // Checks that #row_major matrices yield their elements row by row, both through matrix_flatten and through a raw load of their storage.
+	row_major34: #row_major matrix[3,4]int = {
+		11,12,13,14,
+		21,22,23,24,
+		31,32,33,34,
+	} // each value encodes its position as <row><column>, counted from 1
+	row_major34_expected := [?]int{11,12,13,14, 21,22,23,24, 31,32,33,34} // expected flat order: one row after another
+
+	row_major43: #row_major matrix[4,3]int = {
+		11,12,13,
+		21,22,23,
+		31,32,33,
+		41,42,43,
+	} // same encoding, with the dimensions swapped so that rows != columns in both directions
+	row_major43_expected := [?]int{11,12,13, 21,22,23, 31,32,33, 41,42,43} // expected flat order: one row after another
+
+	major34_flattened := intrinsics.matrix_flatten(row_major34) // flat array as produced by the intrinsic
+	major34_from_ptr  := intrinsics.unaligned_load((^[3 * 4]int)(&row_major34)) // the matrix storage reinterpreted as a flat array of 12 ints
+
+	for row in 0..<3 {
+		for column in 0..<4 {
+			idx := row * 4 + column // visits every flat index in 0..<12 exactly once
+			testing.expect_value(t, major34_flattened[idx], row_major34_expected[idx])
+			testing.expect_value(t, major34_from_ptr [idx], row_major34_expected[idx])
+		}
+	}
+
+	major43_flattened := intrinsics.matrix_flatten(row_major43) // flat array as produced by the intrinsic
+	major43_from_ptr  := intrinsics.unaligned_load((^[4 * 3]int)(&row_major43)) // the matrix storage reinterpreted as a flat array of 12 ints
+
+	for row in 0..<4 {
+		for column in 0..<3 {
+			idx := row * 3 + column // visits every flat index in 0..<12 exactly once
+			testing.expect_value(t, major43_flattened[idx], row_major43_expected[idx])
+			testing.expect_value(t, major43_from_ptr [idx], row_major43_expected[idx])
+		}
+	}
+}
+
+@test
+test_row_minor_matrix :: proc(t: ^testing.T) { // Checks that matrices declared without #row_major yield their elements column by column, both through matrix_flatten and through a raw load of their storage.
+	row_minor34: matrix[3,4]int = {
+		11,12,13,14,
+		21,22,23,24,
+		31,32,33,34,
+	} // each value encodes its position as <row><column>, counted from 1
+	row_minor34_expected := [?]int{11,21,31, 12,22,32, 13,23,33, 14,24,34} // expected flat order: one column after another
+
+	row_minor43: matrix[4,3]int = {
+		11,12,13,
+		21,22,23,
+		31,32,33,
+		41,42,43,
+	} // same encoding, with the dimensions swapped so that rows != columns in both directions
+	row_minor43_expected := [?]int{11,21,31,41, 12,22,32,42, 13,23,33,43} // expected flat order: one column after another
+
+	minor34_flattened := intrinsics.matrix_flatten(row_minor34) // flat array as produced by the intrinsic
+	minor34_from_ptr  := intrinsics.unaligned_load((^[3 * 4]int)(&row_minor34)) // the matrix storage reinterpreted as a flat array of 12 ints
+
+	for row in 0..<3 {
+		for column in 0..<4 {
+			idx := row * 4 + column // only a flat index over 0..<12 here; it is not a (row, column) lookup for this layout
+			testing.expect_value(t, minor34_flattened[idx], row_minor34_expected[idx])
+			testing.expect_value(t, minor34_from_ptr [idx], row_minor34_expected[idx])
+		}
+	}
+
+	minor43_flattened := intrinsics.matrix_flatten(row_minor43) // flat array as produced by the intrinsic
+	minor43_from_ptr  := intrinsics.unaligned_load((^[4 * 3]int)(&row_minor43)) // the matrix storage reinterpreted as a flat array of 12 ints
+
+	for row in 0..<4 {
+		for column in 0..<3 {
+			idx := row * 3 + column // only a flat index over 0..<12 here; it is not a (row, column) lookup for this layout
+			testing.expect_value(t, minor43_flattened[idx], row_minor43_expected[idx])
+			testing.expect_value(t, minor43_from_ptr [idx], row_minor43_expected[idx])
+		}
+	}
+}
\ No newline at end of file
diff --git a/vendor/stb/lib/darwin/stb_truetype.a b/vendor/stb/lib/darwin/stb_truetype.a
index f871693d0..b55fbe5d3 100644
Binary files a/vendor/stb/lib/darwin/stb_truetype.a and b/vendor/stb/lib/darwin/stb_truetype.a differ
diff --git a/vendor/stb/lib/stb_truetype.lib b/vendor/stb/lib/stb_truetype.lib
index d4139c707..16ecf944d 100644
Binary files a/vendor/stb/lib/stb_truetype.lib and b/vendor/stb/lib/stb_truetype.lib differ
diff --git a/vendor/stb/lib/stb_truetype_wasm.o b/vendor/stb/lib/stb_truetype_wasm.o
index 15c4fa0d5..d3380e8a2 100644
Binary files a/vendor/stb/lib/stb_truetype_wasm.o and b/vendor/stb/lib/stb_truetype_wasm.o differ
diff --git a/vendor/stb/src/Makefile b/vendor/stb/src/Makefile
index 6123a95fa..b7217d528 100644
--- a/vendor/stb/src/Makefile
+++ b/vendor/stb/src/Makefile
@@ -8,7 +8,7 @@ endif
 
 wasm:
 	mkdir -p ../lib
-	clang -c -Os --target=wasm32 -nostdlib stb_truetype_wasm.c -o ../lib/stb_truetype_wasm.o
+	$(CC) -c -Os --target=wasm32 -nostdlib stb_truetype_wasm.c -o ../lib/stb_truetype_wasm.o
 
 unix:
 	mkdir -p ../lib
diff --git a/vendor/stb/src/stb_truetype.c b/vendor/stb/src/stb_truetype.c
index e44c22c89..05c23f583 100644
--- a/vendor/stb/src/stb_truetype.c
+++ b/vendor/stb/src/stb_truetype.c
@@ -1,5 +1,2 @@
-#define STB_RECT_PACK_IMPLEMENTATION
-#include "stb_rect_pack.h"
-
 #define STB_TRUETYPE_IMPLEMENTATION
 #include "stb_truetype.h"
\ No newline at end of file
diff --git a/vendor/wgpu/wgpu.odin b/vendor/wgpu/wgpu.odin
index 691aed9ce..ae4649aed 100644
--- a/vendor/wgpu/wgpu.odin
+++ b/vendor/wgpu/wgpu.odin
@@ -27,6 +27,8 @@ when ODIN_OS == .Windows {
 		"system:advapi32.lib",
 		"system:user32.lib",
 		"system:gdi32.lib",
+		"system:ole32.lib",
+		"system:oleaut32.lib",
 	}
 } else when ODIN_OS == .Darwin {
 	@(private) ARCH :: "x86_64" when ODIN_ARCH == .amd64 else "aarch64" when ODIN_ARCH == .arm64 else #panic("unsupported WGPU Native architecture")