Merge pull request #6676 from odin-lang/bill/matrix-optimizations

General Matrix Optimizations + `@(fast_math)` attribute
2026-05-25 13:18:14 +00:00 · 2026-05-12 10:42:35 +01:00
parent 9cf809f329 724ce14677
commit e4733b9a4e
14 changed files with 476 additions and 168 deletions
--- a/base/intrinsics/intrinsics.odin
+++ b/base/intrinsics/intrinsics.odin
@@ -4,6 +4,7 @@ package intrinsics

 import "base:runtime"

+
 // Package-Related
 is_package_imported :: proc(package_name: string) -> bool ---

--- a/base/runtime/core.odin
+++ b/base/runtime/core.odin
@@ -23,6 +23,23 @@ package runtime

 import "base:intrinsics"

+/*
+Fast_Math_Flag :: enum u8 {
+	Allow_Reassoc    = 0,
+	No_NaNs          = 1,
+	No_Infs          = 2,
+	No_Signed_Zeros  = 3,
+	Allow_Reciprocal = 4,
+	Allow_Contract   = 5,
+	Approx_Func      = 6,
+}
+*/
+Fast_Math_Flag  :: intrinsics.Fast_Math_Flag
+
+// Fast_Math_Flags :: distinct bit_set[Fast_Math_Flag; u32]
+Fast_Math_Flags :: intrinsics.Fast_Math_Flags
+
+
 // NOTE(bill): This must match the compiler's
 Calling_Convention :: enum u8 {
 	Invalid     = 0,
--- a/core/math/linalg/extended.odin
+++ b/core/math/linalg/extended.odin
@@ -7,7 +7,7 @@ import "core:math"
@(require_results)
 to_radians :: proc "contextless" (degrees: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = degrees[i] * RAD_PER_DEG
 		}
 	} else {
@@ -19,7 +19,7 @@ to_radians :: proc "contextless" (degrees: $T) -> (out: T) where IS_NUMERIC(ELEM
@(require_results)
 to_degrees :: proc "contextless" (radians: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = radians[i] * DEG_PER_RAD
 		}
 	} else {
@@ -31,7 +31,7 @@ to_degrees :: proc "contextless" (radians: $T) -> (out: T) where IS_NUMERIC(ELEM
@(require_results)
 min_double :: proc "contextless" (a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = builtin.min(a[i], b[i])
 		}
 	} else {
@@ -51,7 +51,7 @@ min_single :: proc "contextless" (a: $T) -> (out: ELEM_TYPE(T)) where IS_NUMERIC
 			out = builtin.min(a[0], a[1])
 		} else {
 			out = builtin.min(a[0], a[1])
-			for i in 2..<N {
+			#no_bounds_check for i in 2..<N {
 				out = builtin.min(out, a[i])
 			}
 		}
@@ -71,7 +71,7 @@ min :: proc{min_single, min_double, min_triple}
@(require_results)
 max_double :: proc "contextless" (a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = builtin.max(a[i], b[i])
 		}
 	} else {
@@ -93,7 +93,7 @@ max_single :: proc "contextless" (a: $T) -> (out: ELEM_TYPE(T)) where IS_NUMERIC
 			out = builtin.max(a[0], a[1], a[2])
 		}else {
 			out = builtin.max(a[0], a[1])
-			for i in 2..<N {
+			#no_bounds_check for i in 2..<N {
 				out = builtin.max(out, a[i])
 			}
 		}
@@ -113,7 +113,7 @@ max :: proc{max_single, max_double, max_triple}
@(require_results)
 abs :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = auto_cast builtin.abs(a[i])
 		}
 	} else {
@@ -125,7 +125,7 @@ abs :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
@(require_results)
 sign :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = #force_inline math.sign(a[i])
 		}
 	} else {
@@ -137,7 +137,7 @@ sign :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
@(require_results)
 clamp :: proc "contextless" (x, a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = builtin.clamp(x[i], a[i], b[i])
 		}
 	} else {
@@ -155,7 +155,7 @@ saturate :: proc "contextless" (x: $T) -> T where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 lerp :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = a[i]*(1-t[i]) + b[i]*t[i]
 		}
 	} else {
@@ -166,7 +166,7 @@ lerp :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)
@(require_results)
 mix :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = a[i]*(1-t[i]) + b[i]*t[i]
 		}
 	} else {
@@ -183,7 +183,7 @@ unlerp :: proc "contextless" (a, b, x: $T) -> T where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 step :: proc "contextless" (e, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = x[i] < e[i] ? 0.0 : 1.0
 		}
 	} else {
@@ -208,7 +208,7 @@ smootherstep :: proc "contextless" (e0, e1, x: $T) -> T where IS_FLOAT(ELEM_TYPE
@(require_results)
 sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.sqrt(x[i])
 		}
 	} else {
@@ -220,7 +220,7 @@ sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 inverse_sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = 1.0/math.sqrt(x[i])
 		}
 	} else {
@@ -232,7 +232,7 @@ inverse_sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(
@(require_results)
 cos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.cos(x[i])
 		}
 	} else {
@@ -244,7 +244,7 @@ cos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 sin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.sin(x[i])
 		}
 	} else {
@@ -256,7 +256,7 @@ sin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 tan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.tan(x[i])
 		}
 	} else {
@@ -268,7 +268,7 @@ tan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 acos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.acos(x[i])
 		}
 	} else {
@@ -280,7 +280,7 @@ acos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 asin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.asin(x[i])
 		}
 	} else {
@@ -292,7 +292,7 @@ asin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 atan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.atan(x[i])
 		}
 	} else {
@@ -303,7 +303,7 @@ atan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 atan2 :: proc "contextless" (y, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.atan2(y[i], x[i])
 		}
 	} else {
@@ -316,7 +316,7 @@ atan2 :: proc "contextless" (y, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T))
@(require_results)
 ln :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.ln(x[i])
 		}
 	} else {
@@ -329,7 +329,7 @@ ln :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 log2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	INVLN2 :: 1.4426950408889634073599246810018921374266459541529859341354494069
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = INVLN2 * math.ln(x[i])
 		}
 	} else {
@@ -342,7 +342,7 @@ log2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 log10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	INVLN10 :: 0.4342944819032518276511289189166050822943970058036665661144537831
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = INVLN10 * math.ln(x[i])
 		}
 	} else {
@@ -354,7 +354,7 @@ log10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 log :: proc "contextless" (x, b: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.ln(x[i]) / math.ln(cast(ELEM_TYPE(T))b[i])
 		}
 	} else {
@@ -366,7 +366,7 @@ log :: proc "contextless" (x, b: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 exp :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.exp(x[i])
 		}
 	} else {
@@ -378,7 +378,7 @@ exp :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 exp2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.exp(LN2 * x[i])
 		}
 	} else {
@@ -390,7 +390,7 @@ exp2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 exp10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.exp(LN10 * x[i])
 		}
 	} else {
@@ -402,7 +402,7 @@ exp10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 pow :: proc "contextless" (x, e: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = math.pow(x[i], e[i])
 		}
 	} else {
@@ -425,7 +425,7 @@ floor :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
 round :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
 	when IS_ARRAY(T) {
-		for i in 0..<len(T) {
+		#no_bounds_check for i in 0..<len(T) {
 			out[i] = #force_inline math.round(x[i])
 		}
 	} else {
@@ -486,7 +486,7 @@ is_nan_single :: proc "contextless" (x: $T) -> bool where IS_FLOAT(T) {

@(require_results)
 is_nan_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]bool) where IS_FLOAT(T) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = #force_inline is_nan(x[i])
 	}
 	return
@@ -499,7 +499,7 @@ is_inf_single :: proc "contextless" (x: $T) -> bool where IS_FLOAT(T) {

@(require_results)
 is_inf_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]bool) where IS_FLOAT(T) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = #force_inline is_inf(x[i])
 	}
 	return
@@ -512,7 +512,7 @@ classify_single :: proc "contextless" (x: $T) -> math.Float_Class where IS_FLOAT

@(require_results)
 classify_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]math.Float_Class) where IS_FLOAT(T) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = #force_inline classify_single(x[i])
 	}
 	return
@@ -532,42 +532,42 @@ classify :: proc{classify_single, classify_array}

@(require_results)
 less_than_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = x[i] < y[i]
 	}
 	return
 }
@(require_results)
 less_than_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = x[i] <= y[i]
 	}
 	return
 }
@(require_results)
 greater_than_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = x[i] > y[i]
 	}
 	return
 }
@(require_results)
 greater_than_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = x[i] >= y[i]
 	}
 	return
 }
@(require_results)
 equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = x[i] == y[i]
 	}
 	return
 }
@(require_results)
 not_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
-	for i in 0..<N {
+	#no_bounds_check for i in 0..<N {
 		out[i] = x[i] != y[i]
 	}
 	return
@@ -601,7 +601,7 @@ all :: proc "contextless" (x: $A/[$N]bool) -> (out: bool) {
@(require_results)
 not :: proc "contextless" (x: $A/[$N]bool) -> (out: A) {
 	for e, i in x {
-		out[i] = !e
+		#no_bounds_check out[i] = !e
 	}
 	return
 }
--- a/core/math/linalg/general.odin
+++ b/core/math/linalg/general.odin
@@ -46,18 +46,17 @@ scalar_dot :: proc "contextless" (a, b: $T) -> T where IS_FLOAT(T), !IS_ARRAY(T)

@(require_results)
 vector_dot :: proc "contextless" (a, b: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
-	ab := a * b
 	when N == 1 {
-		return ab.x
+		return a.x*b.x
 	} else when N == 2 {
-		return ab.x + ab.y
+		return a.x*b.x + a.y*b.y
 	} else when N == 3 {
-		return ab.x + ab.y + ab.z
+		return a.x*b.x + a.y*b.y + a.z*b.z
 	} else when N == 4 {
-		return ab.x + ab.y + ab.z + ab.w
+		return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w
 	} else {
-		for elem in ab {
-			c += elem
+		#unroll for _, i in a {
+			c += a[i]*b[i]
 		}
 		return c
 	}
--- a/src/build_settings.cpp
+++ b/src/build_settings.cpp
@@ -455,6 +455,7 @@ enum IntegerDivisionByZeroKind : u8 {
 	IntegerDivisionByZero_AllBits,
 };

+
 // This stores the information for the specific architecture of this build
 struct BuildContext {
 	// Constants
--- a/src/check_decl.cpp
+++ b/src/check_decl.cpp
@@ -1480,6 +1480,8 @@ gb_internal void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
 	e->Procedure.no_sanitize_memory  = ac.no_sanitize_memory;
 	e->Procedure.no_sanitize_thread  = ac.no_sanitize_thread;

+	e->Procedure.fast_math_flags = ac.fast_math_flags;
+
 	e->deprecated_message = ac.deprecated_message;
 	e->warning_message = ac.warning_message;
 	ac.link_name = handle_link_name(ctx, e->token, ac.link_name, ac.link_prefix, ac.link_suffix);
--- a/src/check_expr.cpp
+++ b/src/check_expr.cpp
@@ -1140,6 +1140,11 @@ gb_internal void check_assignment(CheckerContext *c, Operand *operand, Type *typ
 		return;
 	}

+	if (operand->mode == Addressing_Type && is_type_typeid(type)) {
+		add_type_info_type(c, operand->type);
+		add_type_and_value(c, operand->expr, Addressing_Value, type, exact_value_typeid(operand->type));
+		return;
+	}

 	if (is_type_untyped(operand->type)) {
 		Type *target_type = type;
@@ -9122,9 +9127,11 @@ gb_internal bool check_is_operand_compound_lit_constant(CheckerContext *c, Opera
 	if (is_type_any(field_type)) {
 		return false;
 	}
-	if (field_type != nullptr && is_type_typeid(field_type) && o->mode == Addressing_Type) {
-		add_type_info_type(c, o->type);
-		return true;
+	if (field_type != nullptr && is_type_typeid(field_type)) {
+		if (o->mode == Addressing_Type) {
+			add_type_info_type(c, o->type);
+			return true;
+		}
 	}

 	Ast *expr = unparen_expr(o->expr);
@@ -10584,6 +10591,7 @@ gb_internal ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *
 		}


+
 		i64 max = 0;

 		Type *bet = base_type(elem_type);
@@ -12483,7 +12491,7 @@ gb_internal void check_multi_expr_with_type_hint(CheckerContext *c, Operand *o,
 	case Addressing_Type:
 		if (type_hint != nullptr && is_type_typeid(type_hint)) {
 			add_type_info_type(c, o->type);
-			break;
+			return;
 		}
 		error_operand_not_expression(o);
 		break;
--- a/src/checker.cpp
+++ b/src/checker.cpp
@@ -1042,14 +1042,14 @@ struct GlobalEnumValue {
 	i64 value;
 };

-gb_internal Slice<Entity *> add_global_enum_type(String const &type_name, GlobalEnumValue *values, isize value_count, Type **enum_type_ = nullptr) {
+gb_internal Slice<Entity *> add_global_enum_type(String const &type_name, GlobalEnumValue *values, isize value_count, Type **enum_type_ = nullptr, Type *backing_type = nullptr) {
 	Scope *scope = create_scope(nullptr, builtin_pkg->scope);
 	Entity *entity = alloc_entity_type_name(scope, make_token_ident(type_name), nullptr, EntityState_Resolved);

 	Type *enum_type = alloc_type_enum();
 	Type *named_type = alloc_type_named(type_name, enum_type, entity);
 	set_base_type(named_type, enum_type);
-	enum_type->Enum.base_type = t_int;
+	enum_type->Enum.base_type = backing_type ? backing_type : t_int;
 	enum_type->Enum.scope = scope;
 	entity->type = named_type;

@@ -1250,6 +1250,41 @@ gb_internal void init_universal(void) {
 		add_global_enum_constant(fields, "ODIN_ERROR_POS_STYLE", build_context.ODIN_ERROR_POS_STYLE);
 	}

+	{
+		GlobalEnumValue values[OdinFastMath_COUNT] = {};
+		for (unsigned i = 0; i < OdinFastMath_COUNT; i++) {
+			values[i] = {OdinFastMathFlag_strings[i], i};
+		}
+
+		auto fields = add_global_enum_type(str_lit("Fast_Math_Flag"), values, gb_count_of(values), &t_fast_math_flag, t_u8);
+
+		GB_ASSERT(t_fast_math_flag->kind == Type_Named);
+		scope_insert(intrinsics_pkg->scope, t_fast_math_flag->Named.type_name);
+
+		Type *bs = alloc_type_bit_set();
+		bs->BitSet.elem = t_fast_math_flag;
+		bs->BitSet.underlying = t_u32;
+		bs->BitSet.lower = 0;
+		bs->BitSet.upper = OdinFastMath_COUNT-1;
+		bs->BitSet.node = nullptr;
+
+
+		{
+			String type_name = str_lit("Fast_Math_Flags");
+
+			Scope *scope = create_scope(nullptr, builtin_pkg->scope);
+			Entity *entity = alloc_entity_type_name(scope, make_token_ident(type_name), nullptr, EntityState_Resolved);
+
+			Type *named_type = alloc_type_named(type_name, bs, entity);
+			set_base_type(named_type, bs);
+			entity->type = named_type;
+
+			t_fast_math_flags = named_type;
+
+			scope_insert(intrinsics_pkg->scope, entity);
+		}
+	}
+
 	{
 		GlobalEnumValue values[OdinAtomicMemoryOrder_COUNT] = {
 			{OdinAtomicMemoryOrder_strings[OdinAtomicMemoryOrder_relaxed], OdinAtomicMemoryOrder_relaxed},
@@ -3554,11 +3589,17 @@ gb_internal void init_preload(Checker *c) {
 	init_core_objc_c(c);
 }

-gb_internal ExactValue check_decl_attribute_value(CheckerContext *c, Ast *value) {
+gb_internal void check_expr_with_type_hint(CheckerContext *c, Operand *o, Ast *e, Type *t);
+
+gb_internal ExactValue check_decl_attribute_value(CheckerContext *c, Ast *value, Type *type_hint = nullptr) {
 	ExactValue ev = {};
 	if (value != nullptr) {
 		Operand op = {};
-		check_expr(c, &op, value);
+		if (type_hint != nullptr) {
+			check_expr_with_type_hint(c, &op, value, type_hint);
+		} else {
+			check_expr(c, &op, value);
+		}
 		if (op.mode) {
 			if (op.mode == Addressing_Constant) {
 				ev = op.value;
@@ -4126,6 +4167,18 @@ gb_internal DECL_ATTRIBUTE_PROC(proc_decl_attribute) {
 		}
 		ac->no_sanitize_thread = true;
 		return true;
+	} else if (name == "fast_math") {
+		if (value == nullptr) {
+			error(elem, "Expected a constant bit_set of type 'intrinsics.Fast_Math_Flags' for '%.*s'", LIT(name));
+		} else {
+			ExactValue ev = check_decl_attribute_value(c, value, t_fast_math_flags);
+			if (ev.kind != ExactValue_Integer) {
+				error(elem, "Expected a constant bit_set of type 'intrinsics.Fast_Math_Flags' for '%.*s'", LIT(name));
+			} else {
+				ac->fast_math_flags = exact_value_to_u64(ev);
+			}
+		}
+		return true;
 	}
 	return false;
 }
--- a/src/checker.hpp
+++ b/src/checker.hpp
@@ -163,6 +163,8 @@ struct AttributeContext {
 	String require_target_feature; // required by the target micro-architecture
 	String enable_target_feature;  // will be enabled for the procedure only

+	u64 fast_math_flags;
+
 	bool   raddbg_type_view;
 	String raddbg_type_view_string;
 };
--- a/src/entity.cpp
+++ b/src/entity.cpp
@@ -256,6 +256,9 @@ struct Entity {
 			struct GenProcsData *gen_procs;
 			BlockingMutex gen_procs_mutex;
 			ProcedureOptimizationMode optimization_mode;
+
+			u64     fast_math_flags;
+
 			bool    is_foreign                 : 1;
 			bool    is_export                  : 1;
 			bool    generated_from_polymorphic : 1;
--- a/src/llvm_backend_expr.cpp
+++ b/src/llvm_backend_expr.cpp
@@ -672,7 +672,7 @@ gb_internal lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lh
 	}
 }

-gb_internal bool lb_is_matrix_simdable(Type *t) {
+gb_internal bool lb_is_matrix_simdable(Type *t, bool ignore_layout=false) {
 	Type *mt = base_type(t);
 	GB_ASSERT(mt->kind == Type_Matrix);
 	
@@ -697,12 +697,14 @@ gb_internal bool lb_is_matrix_simdable(Type *t) {
 		// it's not aligned well enough to use the vector instructions
 		return false;
 	}
-	if ((mt->Matrix.row_count & 1) ^ (mt->Matrix.column_count & 1)) {
+	if ((mt->Matrix.row_count & 1) && (mt->Matrix.column_count & 1)) {
 		return false;
 	}
 	if (mt->Matrix.is_row_major) {
-		// TODO(bill): make #row_major matrices work with SIMD
-		return false;
+		if (!ignore_layout) {
+			// TODO(bill): make #row_major matrices work with SIMD
+			return false;
+		}
 	}
 	
 	if (elem->kind == Type_Basic) {
@@ -820,35 +822,32 @@ gb_internal lbValue lb_emit_matrix_transpose(lbProcedure *p, lbValue m, Type *ty
 		GB_PANIC("TODO: transpose with changing layout");
 	}

-	if (lb_is_matrix_simdable(mt) && lb_is_matrix_simdable(type)) {
+	if (lb_is_matrix_simdable(mt, true) && lb_is_matrix_simdable(type, true)) {
+		auto const do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef {
+			return LLVMConstInt(lb_type(p->module, t_u32), val, false);
+		};
+
 		unsigned stride = cast(unsigned)matrix_type_stride_in_elems(mt);
 		unsigned row_count    = cast(unsigned)mt->Matrix.row_count;
 		unsigned column_count = cast(unsigned)mt->Matrix.column_count;
-
-		auto rows = slice_make<LLVMValueRef>(permanent_allocator(), row_count);
-		auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), column_count);
+		unsigned other_stride = (row_count*column_count)/stride;

 		LLVMValueRef vector = lb_matrix_to_vector(p, m);
+		auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), row_count * column_count);
 		for (unsigned i = 0; i < row_count; i++) {
 			for (unsigned j = 0; j < column_count; j++) {
-				unsigned offset = stride*j + i;
-				mask_elems[j] = lb_const_int(p->module, t_u32, offset).value;
+				mask_elems[other_stride*i + j] = do_u32(p, stride*j + i);
 			}
-
-			// transpose mask
-			LLVMValueRef mask = LLVMConstVector(mask_elems.data, column_count);
-			LLVMValueRef row = llvm_basic_shuffle(p, vector, mask);
-			rows[i] = row;
 		}
+		LLVMValueRef mask = LLVMConstVector(mask_elems.data, cast(unsigned)mask_elems.count);
+		LLVMValueRef transposed_vector = llvm_basic_shuffle(p, vector, mask);
+		lbAddr res = lb_add_local_generated(p, type, false);

-		lbAddr res = lb_add_local_generated(p, type, true);
-		for_array(i, rows) {
-			LLVMValueRef row = rows[i];
-			lbValue dst_row_ptr = lb_emit_matrix_epi(p, res.addr, 0, i);
-			LLVMValueRef ptr = dst_row_ptr.value;
-			ptr = LLVMBuildPointerCast(p->builder, ptr, LLVMPointerType(LLVMTypeOf(row), 0), "");
-			LLVMBuildStore(p->builder, row, ptr);
-		}
+		LLVMValueRef res_ptr = res.addr.value;
+		res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(transposed_vector), 0), "");
+
+		LLVMValueRef store = LLVMBuildStore(p->builder, transposed_vector, res_ptr);
+		LLVMSetAlignment(store, cast(unsigned)type_align_of(type));

 		return lb_addr_load(p, res);
 	}
@@ -867,8 +866,10 @@ gb_internal lbValue lb_emit_matrix_transpose(lbProcedure *p, lbValue m, Type *ty
 	return lb_addr_load(p, res);
 }

-gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef vector, Type *type) {
-	lbAddr res = lb_add_local_generated(p, type, true);
+gb_internal lbAddr llvm_add_local_generated_from_vector(lbProcedure *p, Type *type, LLVMValueRef vector) {
+	GB_ASSERT(LLVMGetTypeKind(LLVMTypeOf(vector)) == LLVMVectorTypeKind);
+
+	lbAddr res = lb_add_local_generated(p, type, false);
 	LLVMValueRef res_ptr = res.addr.value;
 	unsigned alignment = cast(unsigned)gb_max(type_align_of(type), lb_alignof(LLVMTypeOf(vector)));
 	LLVMSetAlignment(res_ptr, alignment);
@@ -876,9 +877,16 @@ gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef v
 	res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(vector), 0), "");
 	LLVMBuildStore(p->builder, vector, res_ptr);

+	return res;
+}
+
+gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef vector, Type *type) {
+	lbAddr res = llvm_add_local_generated_from_vector(p, type, vector);
 	return lb_addr_load(p, res);
 }

+
+
 gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type) {
 	if (is_type_array(m.type)) {
 		// no-op
@@ -896,31 +904,6 @@ gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type
 	lbValue n = lb_const_int(p->module, t_int, type_size_of(type));
 	lb_mem_copy_non_overlapping(p, res.addr, m_ptr, n);

-	// i64 row_count = mt->Matrix.row_count;
-	// i64 column_count = mt->Matrix.column_count;
-	// TEMPORARY_ALLOCATOR_GUARD();
-
-	// auto srcs = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
-	// auto dsts = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
-
-	// for (i64 j = 0; j < column_count; j++) {
-	// 	for (i64 i = 0; i < row_count; i++) {
-	// 		lbValue src = lb_emit_matrix_ev(p, m, i, j);
-	// 		array_add(&srcs, src);
-	// 	}
-	// }
-
-	// for (i64 j = 0; j < column_count; j++) {
-	// 	for (i64 i = 0; i < row_count; i++) {
-	// 		lbValue dst = lb_emit_array_epi(p, res.addr, i + j*row_count);
-	// 		array_add(&dsts, dst);
-	// 	}
-	// }
-
-	// GB_ASSERT(srcs.count == dsts.count);
-	// for_array(i, srcs) {
-	// 	lb_emit_store(p, dsts[i], srcs[i]);
-	// }
 	return lb_addr_load(p, res);
 }

@@ -959,6 +942,10 @@ gb_internal lbValue lb_emit_outer_product(lbProcedure *p, lbValue a, lbValue b,
 gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, Type *type) {
 	// TODO(bill): Handle edge case for f16 types on x86(-64) platforms

+	auto const do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef {
+		return LLVMConstInt(lb_type(p->module, t_u32), val, false);
+	};
+
 	Type *xt = base_type(lhs.type);
 	Type *yt = base_type(rhs.type);

@@ -975,50 +962,183 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
 	unsigned inner         = cast(unsigned)xt->Matrix.column_count;
 	unsigned outer_columns = cast(unsigned)yt->Matrix.column_count;

-	if (!xt->Matrix.is_row_major && lb_is_matrix_simdable(xt)) {
-		unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
-		unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
+	if (lb_is_matrix_simdable(xt, true)) {
+		if (!xt->Matrix.is_row_major) { // #column_major
+			unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
+			unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);

-		auto x_rows    = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
-		auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);
+			LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
+			LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);

-		LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
-		LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);
+			if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) {
+				// square matrix calculation
+				unsigned N = outer_columns;

-		auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
-		for (unsigned i = 0; i < outer_rows; i++) {
-			for (unsigned j = 0; j < inner; j++) {
-				unsigned offset = x_stride*j + i;
-				mask_elems[j] = lb_const_int(p->module, t_u32, offset).value;
+				auto x_columns = slice_make<LLVMValueRef>(permanent_allocator(), N);
+				auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), N);
+
+				for (unsigned i = 0; i < N; i++) {
+					LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, N);
+					LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask);
+					x_columns[i] = column;
+				}
+
+				for (unsigned i = 0; i < N; i++) {
+					LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, N);
+					LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
+					y_columns[i] = column;
+				}
+
+
+				auto z_columns  = slice_make<LLVMValueRef>(permanent_allocator(), N);
+				auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), N);
+
+				for (unsigned i = 0; i < N; i++) {
+					for (unsigned j = 0; j < N; j++) {
+						LLVMValueRef mask = llvm_mask_same(p->module, j, N);
+						mask_elems[j] = llvm_basic_shuffle(p, y_columns[i], mask);
+					}
+					z_columns[i] = llvm_vector_mul_pairwise_reduce_add(p, mask_elems, x_columns);
+				}
+
+				lbAddr res = lb_add_local_generated(p, type, false);
+				LLVMValueRef dest_ptr = res.addr.value;
+
+				LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_columns[0]), 0);
+				dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, "");
+				for (unsigned i = 0; i < N; i++) {
+					LLVMValueRef indices[] = {do_u32(p, i)};
+	 				LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_columns[0]), dest_ptr, indices, 1, "");
+	 				LLVMBuildStore(p->builder, z_columns[i], dst);
+				}
+
+				return lb_addr_load(p, res);
 			}

-			// transpose mask
-			LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
-			LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
-			x_rows[i] = row;
-		}

-		for (unsigned i = 0; i < outer_columns; i++) {
-			LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner);
-			LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
-			y_columns[i] = column;
-		}
+			auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
+			auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);

-		lbAddr res = lb_add_local_generated(p, type, true);
-		for_array(i, x_rows) {
-			LLVMValueRef x_row = x_rows[i];
-			for_array(j, y_columns) {
-				LLVMValueRef y_column = y_columns[j];
-				LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
-				lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
-				LLVMBuildStore(p->builder, elem, dst.value);
+			auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
+			for (unsigned i = 0; i < outer_rows; i++) {
+				for (unsigned j = 0; j < inner; j++) {
+					unsigned offset = x_stride*j + i;
+					mask_elems[j] = do_u32(p, offset);
+				}
+
+				// transpose mask
+				LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
+				LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
+				x_rows[i] = row;
 			}
+
+			for (unsigned i = 0; i < outer_columns; i++) {
+				LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner);
+				LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
+				y_columns[i] = column;
+			}
+
+			lbAddr res = lb_add_local_generated(p, type, false);
+			for_array(i, x_rows) {
+				LLVMValueRef x_row = x_rows[i];
+				for_array(j, y_columns) {
+					LLVMValueRef y_column = y_columns[j];
+					LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
+					lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
+					LLVMBuildStore(p->builder, elem, dst.value);
+				}
+			}
+			return lb_addr_load(p, res);
+		} else { // #row_major
+			unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
+			unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
+
+			LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
+			LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);
+
+			if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) {
+				// square matrix calculation
+				unsigned N = outer_columns;
+
+				auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), N);
+				auto y_rows = slice_make<LLVMValueRef>(permanent_allocator(), N);
+
+				for (unsigned i = 0; i < N; i++) {
+					LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, N);
+					LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask);
+					x_rows[i] = column;
+				}
+
+				for (unsigned i = 0; i < N; i++) {
+					LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, N);
+					LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
+					y_rows[i] = column;
+				}
+
+
+				auto z_rows     = slice_make<LLVMValueRef>(permanent_allocator(), N);
+				auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), N);
+
+				for (unsigned i = 0; i < N; i++) {
+					for (unsigned j = 0; j < N; j++) {
+						LLVMValueRef mask = llvm_mask_same(p->module, j, N);
+						mask_elems[j] = llvm_basic_shuffle(p, x_rows[i], mask);
+					}
+					z_rows[i] = llvm_vector_mul_pairwise_reduce_add(p, mask_elems, y_rows);
+				}
+
+				lbAddr res = lb_add_local_generated(p, type, false);
+				LLVMValueRef dest_ptr = res.addr.value;
+
+				LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_rows[0]), 0);
+				dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, "");
+				for (unsigned i = 0; i < N; i++) {
+					LLVMValueRef indices[] = {do_u32(p, i)};
+	 				LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_rows[0]), dest_ptr, indices, 1, "");
+	 				LLVMBuildStore(p->builder, z_rows[i], dst);
+				}
+
+				return lb_addr_load(p, res);
+			}
+
+			auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
+			auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);
+
+			for (unsigned i = 0; i < outer_rows; i++) {
+				LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, inner);
+				LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
+				x_rows[i] = row;
+			}
+
+			auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
+			for (unsigned i = 0; i < outer_columns; i++) {
+				for (unsigned j = 0; j < inner; j++) {
+					unsigned offset = y_stride*j + i; // indexes into `y_vector`, so step by y's stride (not x's)
+					mask_elems[j] = do_u32(p, offset);
+				}
+
+				// transpose mask: gathers column `i` out of the row-major `y_vector`
+				LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
+				LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
+				y_columns[i] = column;
+			}
+
+			lbAddr res = lb_add_local_generated(p, type, false);
+			for_array(i, x_rows) {
+				LLVMValueRef x_row = x_rows[i];
+				for_array(j, y_columns) {
+					LLVMValueRef y_column = y_columns[j];
+					LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
+					lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
+					LLVMBuildStore(p->builder, elem, dst.value);
+				}
+			}
+			return lb_addr_load(p, res);
 		}
-		return lb_addr_load(p, res);
 	}

 	if (!xt->Matrix.is_row_major) {
-		lbAddr res = lb_add_local_generated(p, type, true);
+		lbAddr res = lb_add_local_generated(p, type, false);

 		auto inners = slice_make<lbValue[2]>(permanent_allocator(), inner);

@@ -1042,7 +1162,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,

 		return lb_addr_load(p, res);
 	} else {
-		lbAddr res = lb_add_local_generated(p, type, true);
+		lbAddr res = lb_add_local_generated(p, type, false);

 		auto inners = slice_make<lbValue[2]>(permanent_allocator(), inner);

@@ -1100,23 +1220,25 @@ gb_internal lbValue lb_emit_matrix_mul_vector(lbProcedure *p, lbValue lhs, lbVal
 			m_columns[column_index] = column;
 		}

-		for (unsigned row_index = 0; row_index < column_count; row_index++) {
-			LLVMValueRef value = LLVMBuildExtractValue(p->builder, rhs.value, row_index, "");
-			LLVMValueRef row = llvm_vector_broadcast(p, value, row_count);
-			v_rows[row_index] = row;
-		}
+		if (LLVMIsALoadInst(rhs.value)) {
+			LLVMValueRef rhs_ptr = LLVMGetOperand(rhs.value, 0);
+			LLVMTypeRef vector_type = LLVMVectorType(lb_type(p->module, elem), cast(unsigned)vector_count);
+			LLVMValueRef rhs_vector = LLVMBuildLoad2(p->builder, vector_type, rhs_ptr, "");
+			LLVMSetAlignment(rhs_vector, cast(unsigned)type_align_of(type));

-		GB_ASSERT(column_count > 0);
-
-		LLVMValueRef vector = nullptr;
-		for (i64 i = 0; i < column_count; i++) {
-			if (i == 0) {
-				vector = llvm_vector_mul(p, m_columns[i], v_rows[i]);
-			} else {
-				vector = llvm_vector_mul_add(p, m_columns[i], v_rows[i], vector);
+			for (unsigned i = 0; i < column_count; i++) {
+				LLVMValueRef mask = llvm_mask_same(p->module, i, row_count);
+				v_rows[i] = llvm_basic_shuffle(p, rhs_vector, mask);
+			}
+		} else {
+			for (unsigned row_index = 0; row_index < column_count; row_index++) {
+				LLVMValueRef value = LLVMBuildExtractValue(p->builder, rhs.value, row_index, "");
+				LLVMValueRef row = llvm_vector_broadcast(p, value, row_count);
+				v_rows[row_index] = row;
 			}
 		}

+		LLVMValueRef vector = llvm_vector_mul_pairwise_reduce_add(p, m_columns, v_rows);
 		return lb_matrix_cast_vector_to_type(p, vector, type);
 	}

@@ -1190,27 +1312,13 @@ gb_internal lbValue lb_emit_vector_mul_matrix(lbProcedure *p, lbValue lhs, lbVal

 		GB_ASSERT(row_count > 0);

-		LLVMValueRef vector = nullptr;
-		for (i64 i = 0; i < row_count; i++) {
-			if (i == 0) {
-				vector = llvm_vector_mul(p, v_rows[i], m_columns[i]);
-			} else {
-				vector = llvm_vector_mul_add(p, v_rows[i], m_columns[i], vector);
-			}
-		}
-
-		lbAddr res = lb_add_local_generated(p, type, true);
-		LLVMValueRef res_ptr = res.addr.value;
-		unsigned alignment = cast(unsigned)gb_max(type_align_of(type), lb_alignof(LLVMTypeOf(vector)));
-		LLVMSetAlignment(res_ptr, alignment);
-
-		res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(vector), 0), "");
-		LLVMBuildStore(p->builder, vector, res_ptr);
+		LLVMValueRef vector = llvm_vector_mul_pairwise_reduce_add(p, v_rows, m_columns);

+		lbAddr res = llvm_add_local_generated_from_vector(p, type, vector);
 		return lb_addr_load(p, res);
 	}

-	lbAddr res = lb_add_local_generated(p, type, true);
+	lbAddr res = lb_add_local_generated(p, type, false);

 	Type *vector_elem_type = base_array_type(rhs.type);

--- a/src/llvm_backend_opt.cpp
+++ b/src/llvm_backend_opt.cpp
@@ -270,6 +270,55 @@ gb_internal void lb_populate_module_pass_manager(LLVMTargetMachineRef target_mac
 	optimization of Odin programs	
 **************************************************************************/

+// Applies the fast-math flags recorded on the procedure's entity
+// (`e->Procedure.fast_math_flags`) to the floating-point instructions that have already
+// been emitted into `p->value`.
+// Does nothing when the procedure has no entity or when no flag is set.
+gb_internal void lb_run_fast_float_math_pass(lbProcedure *p) {
+	Entity *e = p->entity;
+	if (e == nullptr) {
+		return;
+	}
+	GB_ASSERT(e->kind == Entity_Procedure);
+
+	// NOTE: `OdinFastMath_*` are bit positions (0..6), not masks, so each one must be
+	// shifted before it is tested. Masking with the raw enum value would make
+	// `Allow_Reassoc` (value 0) impossible to enable and would test the wrong bits for
+	// every other flag.
+	// Assumes `fast_math_flags` stores the `Fast_Math_Flags` bit_set as-is
+	// (bit N set <=> flag N enabled) - confirm against the checker.
+	u64 fast_math_flags = e->Procedure.fast_math_flags;
+	auto const has_flag = [fast_math_flags](OdinFastMathFlag flag) -> bool {
+		return (fast_math_flags & (1ull << cast(unsigned)flag)) != 0;
+	};
+
+	LLVMFastMathFlags llvm_flags = 0;
+	if (has_flag(OdinFastMath_Allow_Reassoc))    llvm_flags |= LLVMFastMathAllowReassoc;
+	if (has_flag(OdinFastMath_No_NaNs))          llvm_flags |= LLVMFastMathNoNaNs;
+	if (has_flag(OdinFastMath_No_Infs))          llvm_flags |= LLVMFastMathNoInfs;
+	if (has_flag(OdinFastMath_No_Signed_Zeros))  llvm_flags |= LLVMFastMathNoSignedZeros;
+	if (has_flag(OdinFastMath_Allow_Reciprocal)) llvm_flags |= LLVMFastMathAllowReciprocal;
+	if (has_flag(OdinFastMath_Allow_Contract))   llvm_flags |= LLVMFastMathAllowContract;
+	if (has_flag(OdinFastMath_Approx_Func))      llvm_flags |= LLVMFastMathApproxFunc;
+
+	if (llvm_flags == 0) {
+		return;
+	}
+
+	for (LLVMBasicBlockRef block = LLVMGetFirstBasicBlock(p->value);
+	     block != nullptr;
+	     block = LLVMGetNextBasicBlock(block)) {
+		for (LLVMValueRef instr = LLVMGetFirstInstruction(block);
+		     instr != nullptr;
+		     instr = LLVMGetNextInstruction(instr))  {
+			switch (LLVMGetInstructionOpcode(instr)) {
+			case LLVMFNeg:
+			case LLVMFAdd:
+			case LLVMFSub:
+			case LLVMFMul:
+			case LLVMFDiv:
+			case LLVMFRem:
+			case LLVMFPToUI:
+			case LLVMFPToSI:
+			case LLVMUIToFP:
+			case LLVMSIToFP:
+			case LLVMFPTrunc:
+			case LLVMFPExt:
+			case LLVMFCmp:
+				// Not every opcode listed above is guaranteed to accept fast-math flags
+				// (the int<->float conversions in particular), so ask LLVM before setting
+				// them rather than writing flags onto an instruction that cannot carry them.
+				if (LLVMCanValueUseFastMathFlags(instr)) {
+					LLVMSetFastMathFlags(instr, llvm_flags);
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+}
+
 gb_internal void lb_run_remove_dead_instruction_pass(lbProcedure *p) {
 	unsigned debug_declare_id = LLVMLookupIntrinsicID("llvm.dbg.declare", 16);
 	GB_ASSERT(debug_declare_id != 0);
@@ -475,6 +524,9 @@ gb_internal void lb_run_function_pass_manager(LLVMPassManagerRef fpm, lbProcedur
 	if (p == nullptr) {
 		return;
 	}
+
+	lb_run_fast_float_math_pass(p);
+
 	// NOTE(bill): LLVMAddDCEPass doesn't seem to be exported in the official DLL's for LLVM
 	// which means we cannot rely upon it
 	// This is also useful for read the .ll for debug purposes because a lot of instructions
--- a/src/llvm_backend_utility.cpp
+++ b/src/llvm_backend_utility.cpp
@@ -2048,6 +2048,15 @@ gb_internal LLVMValueRef llvm_mask_zero(lbModule *m, unsigned count) {
 	return LLVMConstNull(LLVMVectorType(lb_type(m, t_u32), count));
 }

+// Builds a constant `count`-lane u32 vector in which every lane holds `value`.
+// Used as a shuffle mask, this selects the same source lane for all result lanes.
+gb_internal LLVMValueRef llvm_mask_same(lbModule *m, unsigned value, unsigned count) {
+	LLVMValueRef lane = lb_const_int(m, t_u32, value).value;
+	auto lanes = slice_make<LLVMValueRef>(temporary_allocator(), count);
+	for (unsigned lane_index = 0; lane_index < count; lane_index += 1) {
+		lanes[lane_index] = lane;
+	}
+	return LLVMConstVector(lanes.data, count);
+}
+
+
 #define LLVM_VECTOR_DUMMY_VALUE(type) LLVMGetUndef((type))
 // #define LLVM_VECTOR_DUMMY_VALUE(type) LLVMConstNull((type))

@@ -2221,6 +2230,30 @@ gb_internal LLVMValueRef llvm_vector_mul(lbProcedure *p, LLVMValueRef a, LLVMVal
 	return LLVMBuildFMul(p->builder, a, b, "");
 }

+// Emits a[0]*b[0] + a[1]*b[1] + ... + a[n-1]*b[n-1] and returns the resulting value.
+// The products are summed pairwise (adjacent pairs, then pairs of those sums, ...) rather
+// than as one serial chain of adds, so the number of passes grows with log2(n).
+// `a` and `b` must have the same, non-zero length.
+gb_internal LLVMValueRef llvm_vector_mul_pairwise_reduce_add(lbProcedure *p, Slice<LLVMValueRef> const &a, Slice<LLVMValueRef> const &b) {
+	GB_ASSERT(a.count == b.count);
+	// `temps[0]` is returned unconditionally below, so an empty input has no valid result
+	GB_ASSERT(a.count > 0);
+
+	auto temps = slice_make<LLVMValueRef>(temporary_allocator(), a.count);
+	for (unsigned i = 0; i < a.count; i++) {
+		temps[i] = llvm_vector_mul(p, a[i], b[i]);
+	}
+
+	// Each pass folds adjacent pairs into the front of `temps`; `k` is the live prefix length
+	unsigned k = cast(unsigned)a.count;
+	while (k > 1) {
+		unsigned half = k/2;
+		for (unsigned j = 0; j < half; j++) {
+			temps[j] = llvm_vector_add(p, temps[2*j + 0], temps[2*j + 1]);
+		}
+
+		if ((k&1) != 0) {
+			// odd one out: carry the last element into the next pass unchanged
+			temps[half] = temps[k-1];
+		}
+		k = (k+1)/2;
+	}
+
+	return temps[0];
+}
+

 gb_internal LLVMValueRef llvm_vector_dot(lbProcedure *p, LLVMValueRef a, LLVMValueRef b) {
 	return llvm_vector_reduce_add(p, llvm_vector_mul(p, a, b));
@@ -2260,6 +2293,7 @@ gb_internal LLVMValueRef llvm_vector_mul_add(lbProcedure *p, LLVMValueRef a, LLV
 	}
 }

+
 gb_internal LLVMValueRef llvm_get_inline_asm(LLVMTypeRef func_type, String const &str, String const &clobbers, bool has_side_effects=true, bool is_align_stack=false, LLVMInlineAsmDialect dialect=LLVMInlineAsmDialectATT) {
 	return LLVMGetInlineAsm(func_type,
 		cast(char *)str.text, cast(size_t)str.len,
--- a/src/types.cpp
+++ b/src/types.cpp
@@ -805,6 +805,34 @@ gb_global Type *t_atomic_memory_order = nullptr;



+// Compiler-side mirror of the `Fast_Math_Flag` enum described in base/runtime/core.odin
+// (same names, same values).
+// NOTE: the values are consecutive indices, not bit masks: they index
+// `OdinFastMathFlag_strings`, and `Fast_Math_Flags` is declared as a `bit_set` over this
+// enum, so membership in a flag set is presumably tested with `1 << flag` rather than with
+// the raw value - confirm at each use site.
+enum OdinFastMathFlag : u8 {
+	OdinFastMath_Allow_Reassoc    = 0,
+	OdinFastMath_No_NaNs          = 1,
+	OdinFastMath_No_Infs          = 2,
+	OdinFastMath_No_Signed_Zeros  = 3,
+	OdinFastMath_Allow_Reciprocal = 4,
+	OdinFastMath_Allow_Contract   = 5,
+	OdinFastMath_Approx_Func      = 6,
+
+	OdinFastMath_COUNT, // number of flags; keep last
+};
+
+// Name of each fast-math flag, indexed by `OdinFastMathFlag`.
+// The entries must stay in the same order as the enum values.
+char const *OdinFastMathFlag_strings[OdinFastMath_COUNT] = {
+	"Allow_Reassoc",
+	"No_NaNs",
+	"No_Infs",
+	"No_Signed_Zeros",
+	"Allow_Reciprocal",
+	"Allow_Contract",
+	"Approx_Func",
+};
+
+// Type handles for the fast-math flag types; null until assigned elsewhere.
+gb_global Type *t_fast_math_flag  = nullptr; // named enum
+gb_global Type *t_fast_math_flags = nullptr; // named bit_set
+
+
+
+
 gb_global RecursiveMutex g_type_mutex;

 struct TypePath;