From daa6bb1aee91bea4c081827634efa6d83315d34d Mon Sep 17 00:00:00 2001 From: gingerBill Date: Fri, 8 May 2026 14:34:44 +0100 Subject: [PATCH 1/8] Fix `check_multi_expr_with_type_hint` to use `return` rather than `break` --- src/check_expr.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/check_expr.cpp b/src/check_expr.cpp index b81052590..676cdbe9f 100644 --- a/src/check_expr.cpp +++ b/src/check_expr.cpp @@ -1140,6 +1140,11 @@ gb_internal void check_assignment(CheckerContext *c, Operand *operand, Type *typ return; } + if (operand->mode == Addressing_Type && is_type_typeid(type)) { + add_type_info_type(c, operand->type); + add_type_and_value(c, operand->expr, Addressing_Value, type, exact_value_typeid(operand->type)); + return; + } if (is_type_untyped(operand->type)) { Type *target_type = type; @@ -9122,9 +9127,11 @@ gb_internal bool check_is_operand_compound_lit_constant(CheckerContext *c, Opera if (is_type_any(field_type)) { return false; } - if (field_type != nullptr && is_type_typeid(field_type) && o->mode == Addressing_Type) { - add_type_info_type(c, o->type); - return true; + if (field_type != nullptr && is_type_typeid(field_type)) { + if (o->mode == Addressing_Type) { + add_type_info_type(c, o->type); + return true; + } } Ast *expr = unparen_expr(o->expr); @@ -10584,6 +10591,7 @@ gb_internal ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast * } + i64 max = 0; Type *bet = base_type(elem_type); @@ -12483,7 +12491,7 @@ gb_internal void check_multi_expr_with_type_hint(CheckerContext *c, Operand *o, case Addressing_Type: if (type_hint != nullptr && is_type_typeid(type_hint)) { add_type_info_type(c, o->type); - break; + return; } error_operand_not_expression(o); break; From 65ff188c1ca9d50265221ed792937359b40ce0ac Mon Sep 17 00:00:00 2001 From: gingerBill Date: Mon, 11 May 2026 11:16:27 +0100 Subject: [PATCH 2/8] Add optimization edge case for square mat * mat --- src/llvm_backend_expr.cpp | 119 
+++++++++++++++++++++++++++++++++-- src/llvm_backend_utility.cpp | 9 +++ 2 files changed, 122 insertions(+), 6 deletions(-) diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index 22530831b..598ab6d21 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -976,15 +976,122 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, unsigned outer_columns = cast(unsigned)yt->Matrix.column_count; if (!xt->Matrix.is_row_major && lb_is_matrix_simdable(xt)) { + + // if (LLVMIsALoadInst(lhs.value) && LLVMIsALoadInst(rhs.value)) { + // auto do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef { + // return LLVMConstInt(lb_type(p->module, t_u32), val, false); + // }; + + // LLVMValueRef llvm_stride = do_u32(p, inner); + // LLVMValueRef llvm_false = LLVMConstInt(lb_type(p->module, t_llvm_bool), false, false); + + // LLVMValueRef lhs_args[] = {LLVMGetOperand(lhs.value, 0), llvm_stride, llvm_false, do_u32(p, outer_rows), do_u32(p, inner)}; + // LLVMValueRef rhs_args[] = {LLVMGetOperand(rhs.value, 0), llvm_stride, llvm_false, do_u32(p, inner), do_u32(p, outer_columns)}; + // LLVMTypeRef types[] = {lb_type(p->module, elem)}; + + // LLVMValueRef lhs_loaded = lb_call_intrinsic(p, "llvm.matrix.column.major.load", lhs_args, gb_count_of(lhs_args), types, gb_count_of(types)); + // LLVMValueRef rhs_loaded = lb_call_intrinsic(p, "llvm.matrix.column.major.load", rhs_args, gb_count_of(rhs_args), types, gb_count_of(types)); + + // LLVMValueRef mul_args[] = {lhs_loaded, rhs_loaded, do_u32(p, outer_rows), do_u32(p, inner), do_u32(p, outer_columns)}; + // LLVMValueRef lhs_mul_rhs = lb_call_intrinsic(p, "llvm.matrix.multiply", mul_args, gb_count_of(mul_args), types, gb_count_of(types)); + + // lbAddr res = lb_add_local_generated(p, type, false); + + // LLVMValueRef store_args[] = {res.addr.value, lhs_mul_rhs, llvm_stride, llvm_false, do_u32(p, inner), do_u32(p, outer_columns)}; + // lb_call_intrinsic(p, 
"llvm.matrix.column.major.store", store_args, gb_count_of(store_args), types, gb_count_of(types)); + + // return lb_addr_load(p, res); + // } + + + unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt); unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt); - auto x_rows = slice_make(permanent_allocator(), outer_rows); - auto y_columns = slice_make(permanent_allocator(), outer_columns); - LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs); LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs); + if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) { + // square matrix calculation + unsigned N = outer_columns; + + auto x_columns = slice_make(permanent_allocator(), N); + auto y_columns = slice_make(permanent_allocator(), N); + + for (unsigned i = 0; i < N; i++) { + LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, inner); + LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask); + x_columns[i] = column; + } + + for (unsigned i = 0; i < N; i++) { + LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner); + LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask); + y_columns[i] = column; + } + + + auto z_columns = slice_make(permanent_allocator(), N); + + auto mask_elems = slice_make(permanent_allocator(), N); + auto temp_muls = slice_make(permanent_allocator(), N); + + for (unsigned i = 0; i < N; i++) { + for (unsigned j = 0; j < N; j++) { + LLVMValueRef mask = llvm_mask_same(p->module, j, N); + mask_elems[j] = llvm_basic_shuffle(p, y_columns[i], mask); + } + for (unsigned j = 0; j < N; j++) { + if (is_type_float(elem)) { + temp_muls[j] = LLVMBuildFMul(p->builder, mask_elems[j], x_columns[j], ""); + // LLVMSetFastMathFlags(temp_muls[j], LLVMFastMathAll); + } else { + temp_muls[j] = LLVMBuildMul(p->builder, mask_elems[j], x_columns[j], ""); + } + } + unsigned k = N; + while (k > 1) { + unsigned half = k/2; + for (unsigned j = 0; j < half; j++) { + if (is_type_float(elem)) { + temp_muls[j] = 
LLVMBuildFAdd(p->builder, temp_muls[2*j + 0], temp_muls[2*j + 1], ""); + // LLVMSetFastMathFlags(temp_muls[j], LLVMFastMathAll); + } else { + temp_muls[j] = LLVMBuildAdd(p->builder, temp_muls[2*j + 0], temp_muls[2*j + 1], ""); + } + } + + if ((k&1) != 0) { + temp_muls[half] = temp_muls[k-1]; + } + k = (k+1)/2; + } + + z_columns[i] = temp_muls[0]; + } + + auto do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef { + return LLVMConstInt(lb_type(p->module, t_u32), val, false); + }; + + lbAddr res = lb_add_local_generated(p, type, false); + LLVMValueRef dest_ptr = res.addr.value; + + LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_columns[0]), 0); + dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, ""); + for (unsigned i = 0; i < N; i++) { + LLVMValueRef indices[] = {do_u32(p, i)}; + LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_columns[0]), dest_ptr, indices, 1, ""); + LLVMBuildStore(p->builder, z_columns[i], dst); + } + + return lb_addr_load(p, res); + } + + + auto x_rows = slice_make(permanent_allocator(), outer_rows); + auto y_columns = slice_make(permanent_allocator(), outer_columns); + auto mask_elems = slice_make(permanent_allocator(), inner); for (unsigned i = 0; i < outer_rows; i++) { for (unsigned j = 0; j < inner; j++) { @@ -1004,7 +1111,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, y_columns[i] = column; } - lbAddr res = lb_add_local_generated(p, type, true); + lbAddr res = lb_add_local_generated(p, type, false); for_array(i, x_rows) { LLVMValueRef x_row = x_rows[i]; for_array(j, y_columns) { @@ -1018,7 +1125,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, } if (!xt->Matrix.is_row_major) { - lbAddr res = lb_add_local_generated(p, type, true); + lbAddr res = lb_add_local_generated(p, type, false); auto inners = slice_make(permanent_allocator(), inner); @@ -1042,7 +1149,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, 
lbValue lhs, lbValue rhs, return lb_addr_load(p, res); } else { - lbAddr res = lb_add_local_generated(p, type, true); + lbAddr res = lb_add_local_generated(p, type, false); auto inners = slice_make(permanent_allocator(), inner); diff --git a/src/llvm_backend_utility.cpp b/src/llvm_backend_utility.cpp index a04f91fbd..d101d28c2 100644 --- a/src/llvm_backend_utility.cpp +++ b/src/llvm_backend_utility.cpp @@ -2048,6 +2048,15 @@ gb_internal LLVMValueRef llvm_mask_zero(lbModule *m, unsigned count) { return LLVMConstNull(LLVMVectorType(lb_type(m, t_u32), count)); } +gb_internal LLVMValueRef llvm_mask_same(lbModule *m, unsigned value, unsigned count) { + auto iota = slice_make(temporary_allocator(), count); + for (unsigned i = 0; i < count; i++) { + iota[i] = lb_const_int(m, t_u32, value).value; + } + return LLVMConstVector(iota.data, count); +} + + #define LLVM_VECTOR_DUMMY_VALUE(type) LLVMGetUndef((type)) // #define LLVM_VECTOR_DUMMY_VALUE(type) LLVMConstNull((type)) From b0ee0bb63538a6d43e16bafad98bf97bdbd8e89a Mon Sep 17 00:00:00 2001 From: gingerBill Date: Mon, 11 May 2026 11:55:07 +0100 Subject: [PATCH 3/8] Add `@(fast_math={...})` + `intrinsics.Fast_Math_Flags` --- base/intrinsics/intrinsics.odin | 1 + base/runtime/core.odin | 17 +++++++++ src/build_settings.cpp | 1 + src/check_decl.cpp | 2 ++ src/checker.cpp | 61 ++++++++++++++++++++++++++++++--- src/checker.hpp | 2 ++ src/entity.cpp | 3 ++ src/llvm_backend_opt.cpp | 52 ++++++++++++++++++++++++++++ src/types.cpp | 28 +++++++++++++++ 9 files changed, 163 insertions(+), 4 deletions(-) diff --git a/base/intrinsics/intrinsics.odin b/base/intrinsics/intrinsics.odin index dd3b1d38e..e89036932 100644 --- a/base/intrinsics/intrinsics.odin +++ b/base/intrinsics/intrinsics.odin @@ -4,6 +4,7 @@ package intrinsics import "base:runtime" + // Package-Related is_package_imported :: proc(package_name: string) -> bool --- diff --git a/base/runtime/core.odin b/base/runtime/core.odin index d77c41dc8..c1ba7aa3e 100644 --- 
a/base/runtime/core.odin +++ b/base/runtime/core.odin @@ -23,6 +23,23 @@ package runtime import "base:intrinsics" +/* +Fast_Math_Flag :: enum u8 { + Allow_Reassoc = 0, + No_NaNs = 1, + No_Infs = 2, + No_Signed_Zeros = 3, + Allow_Reciprocal = 4, + Allow_Contract = 5, + Approx_Func = 6, +} +*/ +Fast_Math_Flag :: intrinsics.Fast_Math_Flag + +// Fast_Math_Flags :: distinct bit_set[Fast_Math_Flag; u32] +Fast_Math_Flags :: intrinsics.Fast_Math_Flags + + // NOTE(bill): This must match the compiler's Calling_Convention :: enum u8 { Invalid = 0, diff --git a/src/build_settings.cpp b/src/build_settings.cpp index 699304a18..8f8a259d3 100644 --- a/src/build_settings.cpp +++ b/src/build_settings.cpp @@ -455,6 +455,7 @@ enum IntegerDivisionByZeroKind : u8 { IntegerDivisionByZero_AllBits, }; + // This stores the information for the specify architecture of this build struct BuildContext { // Constants diff --git a/src/check_decl.cpp b/src/check_decl.cpp index 7bf1cd9bf..f571c7401 100644 --- a/src/check_decl.cpp +++ b/src/check_decl.cpp @@ -1480,6 +1480,8 @@ gb_internal void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) { e->Procedure.no_sanitize_memory = ac.no_sanitize_memory; e->Procedure.no_sanitize_thread = ac.no_sanitize_thread; + e->Procedure.fast_math_flags = ac.fast_math_flags; + e->deprecated_message = ac.deprecated_message; e->warning_message = ac.warning_message; ac.link_name = handle_link_name(ctx, e->token, ac.link_name, ac.link_prefix, ac.link_suffix); diff --git a/src/checker.cpp b/src/checker.cpp index 7d9aaf24f..1b668eff5 100644 --- a/src/checker.cpp +++ b/src/checker.cpp @@ -1042,14 +1042,14 @@ struct GlobalEnumValue { i64 value; }; -gb_internal Slice add_global_enum_type(String const &type_name, GlobalEnumValue *values, isize value_count, Type **enum_type_ = nullptr) { +gb_internal Slice add_global_enum_type(String const &type_name, GlobalEnumValue *values, isize value_count, Type **enum_type_ = nullptr, Type *backing_type = nullptr) { Scope *scope 
= create_scope(nullptr, builtin_pkg->scope); Entity *entity = alloc_entity_type_name(scope, make_token_ident(type_name), nullptr, EntityState_Resolved); Type *enum_type = alloc_type_enum(); Type *named_type = alloc_type_named(type_name, enum_type, entity); set_base_type(named_type, enum_type); - enum_type->Enum.base_type = t_int; + enum_type->Enum.base_type = backing_type ? backing_type : t_int; enum_type->Enum.scope = scope; entity->type = named_type; @@ -1250,6 +1250,41 @@ gb_internal void init_universal(void) { add_global_enum_constant(fields, "ODIN_ERROR_POS_STYLE", build_context.ODIN_ERROR_POS_STYLE); } + { + GlobalEnumValue values[OdinFastMath_COUNT] = {}; + for (unsigned i = 0; i < OdinFastMath_COUNT; i++) { + values[i] = {OdinFastMathFlag_strings[i], i}; + } + + auto fields = add_global_enum_type(str_lit("Fast_Math_Flag"), values, gb_count_of(values), &t_fast_math_flag, t_u8); + + GB_ASSERT(t_fast_math_flag->kind == Type_Named); + scope_insert(intrinsics_pkg->scope, t_fast_math_flag->Named.type_name); + + Type *bs = alloc_type_bit_set(); + bs->BitSet.elem = t_fast_math_flag; + bs->BitSet.underlying = t_u32; + bs->BitSet.lower = 0; + bs->BitSet.upper = OdinFastMath_COUNT-1; + bs->BitSet.node = nullptr; + + + { + String type_name = str_lit("Fast_Math_Flags"); + + Scope *scope = create_scope(nullptr, builtin_pkg->scope); + Entity *entity = alloc_entity_type_name(scope, make_token_ident(type_name), nullptr, EntityState_Resolved); + + Type *named_type = alloc_type_named(type_name, bs, entity); + set_base_type(named_type, bs); + entity->type = named_type; + + t_fast_math_flags = named_type; + + scope_insert(intrinsics_pkg->scope, entity); + } + } + { GlobalEnumValue values[OdinAtomicMemoryOrder_COUNT] = { {OdinAtomicMemoryOrder_strings[OdinAtomicMemoryOrder_relaxed], OdinAtomicMemoryOrder_relaxed}, @@ -3554,11 +3589,17 @@ gb_internal void init_preload(Checker *c) { init_core_objc_c(c); } -gb_internal ExactValue check_decl_attribute_value(CheckerContext *c, Ast 
*value) { +gb_internal void check_expr_with_type_hint(CheckerContext *c, Operand *o, Ast *e, Type *t); + +gb_internal ExactValue check_decl_attribute_value(CheckerContext *c, Ast *value, Type *type_hint = nullptr) { ExactValue ev = {}; if (value != nullptr) { Operand op = {}; - check_expr(c, &op, value); + if (type_hint != nullptr) { + check_expr_with_type_hint(c, &op, value, type_hint); + } else { + check_expr(c, &op, value); + } if (op.mode) { if (op.mode == Addressing_Constant) { ev = op.value; @@ -4126,6 +4167,18 @@ gb_internal DECL_ATTRIBUTE_PROC(proc_decl_attribute) { } ac->no_sanitize_thread = true; return true; + } else if (name == "fast_math") { + if (value == nullptr) { + error(elem, "Expected a constant bit_set of type 'intrinsics.Fast_Math_Flags' for '%.*s'", LIT(name)); + } else { + ExactValue ev = check_decl_attribute_value(c, value, t_fast_math_flags); + if (ev.kind != ExactValue_Integer) { + error(elem, "Expected a constant bit_set of type 'intrinsics.Fast_Math_Flags' for '%.*s'", LIT(name)); + } else { + ac->fast_math_flags = exact_value_to_u64(ev); + } + } + return true; } return false; } diff --git a/src/checker.hpp b/src/checker.hpp index 5e295dc84..cd2e580d8 100644 --- a/src/checker.hpp +++ b/src/checker.hpp @@ -163,6 +163,8 @@ struct AttributeContext { String require_target_feature; // required by the target micro-architecture String enable_target_feature; // will be enabled for the procedure only + u64 fast_math_flags; + bool raddbg_type_view; String raddbg_type_view_string; }; diff --git a/src/entity.cpp b/src/entity.cpp index 7bb6e88ca..1879e9d05 100644 --- a/src/entity.cpp +++ b/src/entity.cpp @@ -256,6 +256,9 @@ struct Entity { struct GenProcsData *gen_procs; BlockingMutex gen_procs_mutex; ProcedureOptimizationMode optimization_mode; + + u64 fast_math_flags; + bool is_foreign : 1; bool is_export : 1; bool generated_from_polymorphic : 1; diff --git a/src/llvm_backend_opt.cpp b/src/llvm_backend_opt.cpp index cb7fe1c75..47ea90703 100644 --- 
a/src/llvm_backend_opt.cpp +++ b/src/llvm_backend_opt.cpp @@ -270,6 +270,55 @@ gb_internal void lb_populate_module_pass_manager(LLVMTargetMachineRef target_mac optimization of Odin programs **************************************************************************/ +gb_internal void lb_run_fast_float_math_pass(lbProcedure *p) { /* applies the procedure's `@(fast_math)` flags to its floating-point instructions */ + Entity *e = p->entity; + if (e == nullptr) { + return; + } + GB_ASSERT(e->kind == Entity_Procedure); + + + u64 fast_math_flags = e->Procedure.fast_math_flags; /* bit_set value; the OdinFastMath_* enumerators are bit indices (0..6), not masks */ + LLVMFastMathFlags llvm_flags = 0; + if (fast_math_flags & (1ull<<OdinFastMath_Allow_Reassoc)) llvm_flags |= LLVMFastMathAllowReassoc; + if (fast_math_flags & (1ull<<OdinFastMath_No_NaNs)) llvm_flags |= LLVMFastMathNoNaNs; + if (fast_math_flags & (1ull<<OdinFastMath_No_Infs)) llvm_flags |= LLVMFastMathNoInfs; + if (fast_math_flags & (1ull<<OdinFastMath_No_Signed_Zeros)) llvm_flags |= LLVMFastMathNoSignedZeros; + if (fast_math_flags & (1ull<<OdinFastMath_Allow_Reciprocal)) llvm_flags |= LLVMFastMathAllowReciprocal; + if (fast_math_flags & (1ull<<OdinFastMath_Allow_Contract)) llvm_flags |= LLVMFastMathAllowContract; + if (fast_math_flags & (1ull<<OdinFastMath_Approx_Func)) llvm_flags |= LLVMFastMathApproxFunc; + + if (llvm_flags == 0) { + return; + } + + for (LLVMBasicBlockRef block = LLVMGetFirstBasicBlock(p->value); + block != nullptr; + block = LLVMGetNextBasicBlock(block)) { + for (LLVMValueRef instr = LLVMGetFirstInstruction(block); + instr != nullptr; + instr = LLVMGetNextInstruction(instr)) { + switch (LLVMGetInstructionOpcode(instr)) { + case LLVMFNeg: + case LLVMFAdd: + case LLVMFSub: + case LLVMFMul: + case LLVMFDiv: + case LLVMFRem: + case LLVMFPToUI: + case LLVMFPToSI: + case LLVMUIToFP: + case LLVMSIToFP: + case LLVMFPTrunc: + case LLVMFPExt: + case LLVMFCmp: + if (LLVMCanValueUseFastMathFlags(instr)) LLVMSetFastMathFlags(instr, llvm_flags); /* skip opcodes this LLVM version does not treat as fast-math operators */ + break; + } + } + } +} + gb_internal void lb_run_remove_dead_instruction_pass(lbProcedure *p) { unsigned debug_declare_id = LLVMLookupIntrinsicID("llvm.dbg.declare", 16); GB_ASSERT(debug_declare_id != 0); @@ -475,6 +524,9 @@ 
gb_internal void lb_run_function_pass_manager(LLVMPassManagerRef fpm, lbProcedur if (p == nullptr) { return; } + + lb_run_fast_float_math_pass(p); + // NOTE(bill): LLVMAddDCEPass doesn't seem to be exported in the official DLL's for LLVM // which means we cannot rely upon it // This is also useful for read the .ll for debug purposes because a lot of instructions diff --git a/src/types.cpp b/src/types.cpp index f7975838d..f87d62297 100644 --- a/src/types.cpp +++ b/src/types.cpp @@ -805,6 +805,34 @@ gb_global Type *t_atomic_memory_order = nullptr; +enum OdinFastMathFlag : u8 { + OdinFastMath_Allow_Reassoc = 0, + OdinFastMath_No_NaNs = 1, + OdinFastMath_No_Infs = 2, + OdinFastMath_No_Signed_Zeros = 3, + OdinFastMath_Allow_Reciprocal = 4, + OdinFastMath_Allow_Contract = 5, + OdinFastMath_Approx_Func = 6, + + OdinFastMath_COUNT, +}; + +char const *OdinFastMathFlag_strings[OdinFastMath_COUNT] = { + "Allow_Reassoc", + "No_NaNs", + "No_Infs", + "No_Signed_Zeros", + "Allow_Reciprocal", + "Allow_Contract", + "Approx_Func", +}; + +gb_global Type *t_fast_math_flag = nullptr; // named enum +gb_global Type *t_fast_math_flags = nullptr; // named bit_set + + + + gb_global RecursiveMutex g_type_mutex; struct TypePath; From 58e4e6be240c7f588fdec8bcbb3c68fb1b31eab2 Mon Sep 17 00:00:00 2001 From: gingerBill Date: Mon, 11 May 2026 13:14:02 +0100 Subject: [PATCH 4/8] Improve `matrix * vector` code generation --- src/llvm_backend_expr.cpp | 88 ++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 52 deletions(-) diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index 598ab6d21..7f45b89dd 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -697,7 +697,7 @@ gb_internal bool lb_is_matrix_simdable(Type *t) { // it's not aligned well enough to use the vector instructions return false; } - if ((mt->Matrix.row_count & 1) ^ (mt->Matrix.column_count & 1)) { + if ((mt->Matrix.row_count & 1) && (mt->Matrix.column_count & 1)) { return false; 
} if (mt->Matrix.is_row_major) { @@ -976,35 +976,6 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, unsigned outer_columns = cast(unsigned)yt->Matrix.column_count; if (!xt->Matrix.is_row_major && lb_is_matrix_simdable(xt)) { - - // if (LLVMIsALoadInst(lhs.value) && LLVMIsALoadInst(rhs.value)) { - // auto do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef { - // return LLVMConstInt(lb_type(p->module, t_u32), val, false); - // }; - - // LLVMValueRef llvm_stride = do_u32(p, inner); - // LLVMValueRef llvm_false = LLVMConstInt(lb_type(p->module, t_llvm_bool), false, false); - - // LLVMValueRef lhs_args[] = {LLVMGetOperand(lhs.value, 0), llvm_stride, llvm_false, do_u32(p, outer_rows), do_u32(p, inner)}; - // LLVMValueRef rhs_args[] = {LLVMGetOperand(rhs.value, 0), llvm_stride, llvm_false, do_u32(p, inner), do_u32(p, outer_columns)}; - // LLVMTypeRef types[] = {lb_type(p->module, elem)}; - - // LLVMValueRef lhs_loaded = lb_call_intrinsic(p, "llvm.matrix.column.major.load", lhs_args, gb_count_of(lhs_args), types, gb_count_of(types)); - // LLVMValueRef rhs_loaded = lb_call_intrinsic(p, "llvm.matrix.column.major.load", rhs_args, gb_count_of(rhs_args), types, gb_count_of(types)); - - // LLVMValueRef mul_args[] = {lhs_loaded, rhs_loaded, do_u32(p, outer_rows), do_u32(p, inner), do_u32(p, outer_columns)}; - // LLVMValueRef lhs_mul_rhs = lb_call_intrinsic(p, "llvm.matrix.multiply", mul_args, gb_count_of(mul_args), types, gb_count_of(types)); - - // lbAddr res = lb_add_local_generated(p, type, false); - - // LLVMValueRef store_args[] = {res.addr.value, lhs_mul_rhs, llvm_stride, llvm_false, do_u32(p, inner), do_u32(p, outer_columns)}; - // lb_call_intrinsic(p, "llvm.matrix.column.major.store", store_args, gb_count_of(store_args), types, gb_count_of(types)); - - // return lb_addr_load(p, res); - // } - - - unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt); unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt); @@ 
-1042,23 +1013,13 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, mask_elems[j] = llvm_basic_shuffle(p, y_columns[i], mask); } for (unsigned j = 0; j < N; j++) { - if (is_type_float(elem)) { - temp_muls[j] = LLVMBuildFMul(p->builder, mask_elems[j], x_columns[j], ""); - // LLVMSetFastMathFlags(temp_muls[j], LLVMFastMathAll); - } else { - temp_muls[j] = LLVMBuildMul(p->builder, mask_elems[j], x_columns[j], ""); - } + temp_muls[j] = llvm_vector_mul(p, mask_elems[j], x_columns[j]); } unsigned k = N; while (k > 1) { unsigned half = k/2; for (unsigned j = 0; j < half; j++) { - if (is_type_float(elem)) { - temp_muls[j] = LLVMBuildFAdd(p->builder, temp_muls[2*j + 0], temp_muls[2*j + 1], ""); - // LLVMSetFastMathFlags(temp_muls[j], LLVMFastMathAll); - } else { - temp_muls[j] = LLVMBuildAdd(p->builder, temp_muls[2*j + 0], temp_muls[2*j + 1], ""); - } + temp_muls[j] = llvm_vector_add(p, temp_muls[2*j + 0], temp_muls[2*j + 1]); } if ((k&1) != 0) { @@ -1207,23 +1168,46 @@ gb_internal lbValue lb_emit_matrix_mul_vector(lbProcedure *p, lbValue lhs, lbVal m_columns[column_index] = column; } - for (unsigned row_index = 0; row_index < column_count; row_index++) { - LLVMValueRef value = LLVMBuildExtractValue(p->builder, rhs.value, row_index, ""); - LLVMValueRef row = llvm_vector_broadcast(p, value, row_count); - v_rows[row_index] = row; + if (LLVMIsALoadInst(rhs.value)) { + LLVMValueRef rhs_ptr = LLVMGetOperand(rhs.value, 0); + LLVMTypeRef vector_type = LLVMVectorType(lb_type(p->module, elem), cast(unsigned)vector_count); + LLVMValueRef rhs_vector = LLVMBuildLoad2(p->builder, vector_type, rhs_ptr, ""); + LLVMSetAlignment(rhs_vector, cast(unsigned)type_align_of(type)); + + for (unsigned i = 0; i < column_count; i++) { + LLVMValueRef mask = llvm_mask_same(p->module, i, row_count); + v_rows[i] = llvm_basic_shuffle(p, rhs_vector, mask); + } + } else { + for (unsigned row_index = 0; row_index < column_count; row_index++) { + LLVMValueRef value = 
LLVMBuildExtractValue(p->builder, rhs.value, row_index, ""); + LLVMValueRef row = llvm_vector_broadcast(p, value, row_count); + v_rows[row_index] = row; + } + } + + auto temps = slice_make(permanent_allocator(), column_count); + for (unsigned i = 0; i < column_count; i++) { + temps[i] = llvm_vector_mul(p, m_columns[i], v_rows[i]); } GB_ASSERT(column_count > 0); - LLVMValueRef vector = nullptr; - for (i64 i = 0; i < column_count; i++) { - if (i == 0) { - vector = llvm_vector_mul(p, m_columns[i], v_rows[i]); - } else { - vector = llvm_vector_mul_add(p, m_columns[i], v_rows[i], vector); + unsigned k = column_count; + while (k > 1) { + unsigned half = k/2; + for (unsigned j = 0; j < half; j++) { + temps[j] = llvm_vector_add(p, temps[2*j + 0], temps[2*j + 1]); } + + if ((k&1) != 0) { + temps[half] = temps[k-1]; + } + k = (k+1)/2; } + LLVMValueRef vector = temps[0]; + return lb_matrix_cast_vector_to_type(p, vector, type); } From b752ff4bdbda17d7f7e48b002e2aedd51b11f2f5 Mon Sep 17 00:00:00 2001 From: gingerBill Date: Mon, 11 May 2026 13:28:54 +0100 Subject: [PATCH 5/8] Add a minor optimization for `row_major * row_major` --- src/llvm_backend_expr.cpp | 270 +++++++++++++++++++++-------------- src/llvm_backend_utility.cpp | 24 ++++ 2 files changed, 184 insertions(+), 110 deletions(-) diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index 7f45b89dd..ba2bea7cd 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -672,7 +672,7 @@ gb_internal lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lh } } -gb_internal bool lb_is_matrix_simdable(Type *t) { +gb_internal bool lb_is_matrix_simdable(Type *t, bool ignore_layout=false) { Type *mt = base_type(t); GB_ASSERT(mt->kind == Type_Matrix); @@ -701,8 +701,10 @@ gb_internal bool lb_is_matrix_simdable(Type *t) { return false; } if (mt->Matrix.is_row_major) { - // TODO(bill): make #row_major matrices work with SIMD - return false; + if (!ignore_layout) { + // TODO(bill): make 
#row_major matrices work with SIMD + return false; + } } if (elem->kind == Type_Basic) { @@ -959,6 +961,10 @@ gb_internal lbValue lb_emit_outer_product(lbProcedure *p, lbValue a, lbValue b, gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, Type *type) { // TODO(bill): Handle edge case for f16 types on x86(-64) platforms + auto const do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef { + return LLVMConstInt(lb_type(p->module, t_u32), val, false); + }; + Type *xt = base_type(lhs.type); Type *yt = base_type(rhs.type); @@ -975,114 +981,179 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, unsigned inner = cast(unsigned)xt->Matrix.column_count; unsigned outer_columns = cast(unsigned)yt->Matrix.column_count; - if (!xt->Matrix.is_row_major && lb_is_matrix_simdable(xt)) { - unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt); - unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt); + if (lb_is_matrix_simdable(xt, true)) { + if (!xt->Matrix.is_row_major) { // #column_major + unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt); + unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt); - LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs); - LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs); + LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs); + LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs); - if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) { - // square matrix calculation - unsigned N = outer_columns; + if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) { + // square matrix calculation + unsigned N = outer_columns; - auto x_columns = slice_make(permanent_allocator(), N); - auto y_columns = slice_make(permanent_allocator(), N); + auto x_columns = slice_make(permanent_allocator(), N); + auto y_columns = slice_make(permanent_allocator(), N); - for (unsigned i = 0; i < N; i++) { - LLVMValueRef mask = 
llvm_mask_iota(p->module, x_stride*i, inner); - LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask); - x_columns[i] = column; + for (unsigned i = 0; i < N; i++) { + LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, N); + LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask); + x_columns[i] = column; + } + + for (unsigned i = 0; i < N; i++) { + LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, N); + LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask); + y_columns[i] = column; + } + + + auto z_columns = slice_make(permanent_allocator(), N); + auto mask_elems = slice_make(permanent_allocator(), N); + + for (unsigned i = 0; i < N; i++) { + for (unsigned j = 0; j < N; j++) { + LLVMValueRef mask = llvm_mask_same(p->module, j, N); + mask_elems[j] = llvm_basic_shuffle(p, y_columns[i], mask); + } + z_columns[i] = llvm_vector_mul_pairwise_reduce_add(p, mask_elems, x_columns); + } + + lbAddr res = lb_add_local_generated(p, type, false); + LLVMValueRef dest_ptr = res.addr.value; + + LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_columns[0]), 0); + dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, ""); + for (unsigned i = 0; i < N; i++) { + LLVMValueRef indices[] = {do_u32(p, i)}; + LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_columns[0]), dest_ptr, indices, 1, ""); + LLVMBuildStore(p->builder, z_columns[i], dst); + } + + return lb_addr_load(p, res); } - for (unsigned i = 0; i < N; i++) { + + auto x_rows = slice_make(permanent_allocator(), outer_rows); + auto y_columns = slice_make(permanent_allocator(), outer_columns); + + auto mask_elems = slice_make(permanent_allocator(), inner); + for (unsigned i = 0; i < outer_rows; i++) { + for (unsigned j = 0; j < inner; j++) { + unsigned offset = x_stride*j + i; + mask_elems[j] = do_u32(p, offset); + } + + // transpose mask + LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner); + LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask); + 
x_rows[i] = row; + } + + for (unsigned i = 0; i < outer_columns; i++) { LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner); LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask); y_columns[i] = column; } - - auto z_columns = slice_make(permanent_allocator(), N); - - auto mask_elems = slice_make(permanent_allocator(), N); - auto temp_muls = slice_make(permanent_allocator(), N); - - for (unsigned i = 0; i < N; i++) { - for (unsigned j = 0; j < N; j++) { - LLVMValueRef mask = llvm_mask_same(p->module, j, N); - mask_elems[j] = llvm_basic_shuffle(p, y_columns[i], mask); + lbAddr res = lb_add_local_generated(p, type, false); + for_array(i, x_rows) { + LLVMValueRef x_row = x_rows[i]; + for_array(j, y_columns) { + LLVMValueRef y_column = y_columns[j]; + LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column); + lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j); + LLVMBuildStore(p->builder, elem, dst.value); } - for (unsigned j = 0; j < N; j++) { - temp_muls[j] = llvm_vector_mul(p, mask_elems[j], x_columns[j]); + } + return lb_addr_load(p, res); + } else { // #row_major + unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt); + unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt); + + LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs); + LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs); + + if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) { + // square matrix calculation + unsigned N = outer_columns; + + auto x_rows = slice_make(permanent_allocator(), N); + auto y_rows = slice_make(permanent_allocator(), N); + + for (unsigned i = 0; i < N; i++) { + LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, N); + LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask); + x_rows[i] = column; } - unsigned k = N; - while (k > 1) { - unsigned half = k/2; - for (unsigned j = 0; j < half; j++) { - temp_muls[j] = llvm_vector_add(p, temp_muls[2*j + 0], temp_muls[2*j + 1]); + + for (unsigned i = 0; i < 
N; i++) { + LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, N); + LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask); + y_rows[i] = column; + } + + + auto z_rows = slice_make(permanent_allocator(), N); + auto mask_elems = slice_make(permanent_allocator(), N); + + for (unsigned i = 0; i < N; i++) { + for (unsigned j = 0; j < N; j++) { + LLVMValueRef mask = llvm_mask_same(p->module, j, N); + mask_elems[j] = llvm_basic_shuffle(p, x_rows[i], mask); } - - if ((k&1) != 0) { - temp_muls[half] = temp_muls[k-1]; - } - k = (k+1)/2; + z_rows[i] = llvm_vector_mul_pairwise_reduce_add(p, mask_elems, y_rows); } - z_columns[i] = temp_muls[0]; + lbAddr res = lb_add_local_generated(p, type, false); + LLVMValueRef dest_ptr = res.addr.value; + + LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_rows[0]), 0); + dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, ""); + for (unsigned i = 0; i < N; i++) { + LLVMValueRef indices[] = {do_u32(p, i)}; + LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_rows[0]), dest_ptr, indices, 1, ""); + LLVMBuildStore(p->builder, z_rows[i], dst); + } + + return lb_addr_load(p, res); } - auto do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef { - return LLVMConstInt(lb_type(p->module, t_u32), val, false); - }; + auto x_rows = slice_make(permanent_allocator(), outer_rows); + auto y_columns = slice_make(permanent_allocator(), outer_columns); + + for (unsigned i = 0; i < outer_rows; i++) { + LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, inner); + LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask); + x_rows[i] = row; + } + + auto mask_elems = slice_make(permanent_allocator(), inner); + for (unsigned i = 0; i < outer_columns; i++) { + for (unsigned j = 0; j < inner; j++) { + unsigned offset = y_stride*j + i; /* element (j, i) of the row-major rhs: index with the rhs stride, not the lhs stride */ + mask_elems[j] = do_u32(p, offset); + } + + // transpose mask + LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner); + LLVMValueRef column = llvm_basic_shuffle(p, 
y_vector, mask); + y_columns[i] = column; + } lbAddr res = lb_add_local_generated(p, type, false); - LLVMValueRef dest_ptr = res.addr.value; - - LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_columns[0]), 0); - dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, ""); - for (unsigned i = 0; i < N; i++) { - LLVMValueRef indices[] = {do_u32(p, i)}; - LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_columns[0]), dest_ptr, indices, 1, ""); - LLVMBuildStore(p->builder, z_columns[i], dst); + for_array(i, x_rows) { + LLVMValueRef x_row = x_rows[i]; + for_array(j, y_columns) { + LLVMValueRef y_column = y_columns[j]; + LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column); + lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j); + LLVMBuildStore(p->builder, elem, dst.value); + } } - return lb_addr_load(p, res); } - - - auto x_rows = slice_make(permanent_allocator(), outer_rows); - auto y_columns = slice_make(permanent_allocator(), outer_columns); - - auto mask_elems = slice_make(permanent_allocator(), inner); - for (unsigned i = 0; i < outer_rows; i++) { - for (unsigned j = 0; j < inner; j++) { - unsigned offset = x_stride*j + i; - mask_elems[j] = lb_const_int(p->module, t_u32, offset).value; - } - - // transpose mask - LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner); - LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask); - x_rows[i] = row; - } - - for (unsigned i = 0; i < outer_columns; i++) { - LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner); - LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask); - y_columns[i] = column; - } - - lbAddr res = lb_add_local_generated(p, type, false); - for_array(i, x_rows) { - LLVMValueRef x_row = x_rows[i]; - for_array(j, y_columns) { - LLVMValueRef y_column = y_columns[j]; - LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column); - lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j); - LLVMBuildStore(p->builder, elem, dst.value); - } - } - return 
lb_addr_load(p, res); } if (!xt->Matrix.is_row_major) { @@ -1186,28 +1257,7 @@ gb_internal lbValue lb_emit_matrix_mul_vector(lbProcedure *p, lbValue lhs, lbVal } } - auto temps = slice_make(permanent_allocator(), column_count); - for (unsigned i = 0; i < column_count; i++) { - temps[i] = llvm_vector_mul(p, m_columns[i], v_rows[i]); - } - - GB_ASSERT(column_count > 0); - - unsigned k = column_count; - while (k > 1) { - unsigned half = k/2; - for (unsigned j = 0; j < half; j++) { - temps[j] = llvm_vector_add(p, temps[2*j + 0], temps[2*j + 1]); - } - - if ((k&1) != 0) { - temps[half] = temps[k-1]; - } - k = (k+1)/2; - } - - LLVMValueRef vector = temps[0]; - + LLVMValueRef vector = llvm_vector_mul_pairwise_reduce_add(p, m_columns, v_rows); return lb_matrix_cast_vector_to_type(p, vector, type); } diff --git a/src/llvm_backend_utility.cpp b/src/llvm_backend_utility.cpp index d101d28c2..25481b2ed 100644 --- a/src/llvm_backend_utility.cpp +++ b/src/llvm_backend_utility.cpp @@ -2230,6 +2230,30 @@ gb_internal LLVMValueRef llvm_vector_mul(lbProcedure *p, LLVMValueRef a, LLVMVal return LLVMBuildFMul(p->builder, a, b, ""); } +gb_internal LLVMValueRef llvm_vector_mul_pairwise_reduce_add(lbProcedure *p, Slice const &a, Slice const &b) { + GB_ASSERT(a.count == b.count); + + auto temps = slice_make(temporary_allocator(), a.count); + for (unsigned i = 0; i < a.count; i++) { + temps[i] = llvm_vector_mul(p, a[i], b[i]); + } + + unsigned k = cast(unsigned)a.count; + while (k > 1) { + unsigned half = k/2; + for (unsigned j = 0; j < half; j++) { + temps[j] = llvm_vector_add(p, temps[2*j + 0], temps[2*j + 1]); + } + + if ((k&1) != 0) { + temps[half] = temps[k-1]; + } + k = (k+1)/2; + } + + return temps[0]; +} + gb_internal LLVMValueRef llvm_vector_dot(lbProcedure *p, LLVMValueRef a, LLVMValueRef b) { return llvm_vector_reduce_add(p, llvm_vector_mul(p, a, b)); From 7d0dba1b821aa38ff55d604555208a7b57a002a1 Mon Sep 17 00:00:00 2001 From: gingerBill Date: Mon, 11 May 2026 13:41:30 +0100 
Subject: [PATCH 6/8] Improve code generation for `intrinsics.transpose` --- src/llvm_backend_expr.cpp | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index ba2bea7cd..968d27a4e 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -822,35 +822,32 @@ gb_internal lbValue lb_emit_matrix_transpose(lbProcedure *p, lbValue m, Type *ty GB_PANIC("TODO: transpose with changing layout"); } - if (lb_is_matrix_simdable(mt) && lb_is_matrix_simdable(type)) { + if (lb_is_matrix_simdable(mt, true) && lb_is_matrix_simdable(type, true)) { + auto const do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef { + return LLVMConstInt(lb_type(p->module, t_u32), val, false); + }; + unsigned stride = cast(unsigned)matrix_type_stride_in_elems(mt); unsigned row_count = cast(unsigned)mt->Matrix.row_count; unsigned column_count = cast(unsigned)mt->Matrix.column_count; - - auto rows = slice_make(permanent_allocator(), row_count); - auto mask_elems = slice_make(permanent_allocator(), column_count); + unsigned other_stride = (row_count*column_count)/stride; LLVMValueRef vector = lb_matrix_to_vector(p, m); + auto mask_elems = slice_make(permanent_allocator(), row_count * column_count); for (unsigned i = 0; i < row_count; i++) { for (unsigned j = 0; j < column_count; j++) { - unsigned offset = stride*j + i; - mask_elems[j] = lb_const_int(p->module, t_u32, offset).value; + mask_elems[other_stride*i + j] = do_u32(p, stride*j + i); } - - // transpose mask - LLVMValueRef mask = LLVMConstVector(mask_elems.data, column_count); - LLVMValueRef row = llvm_basic_shuffle(p, vector, mask); - rows[i] = row; } + LLVMValueRef mask = LLVMConstVector(mask_elems.data, cast(unsigned)mask_elems.count); + LLVMValueRef transposed_vector = llvm_basic_shuffle(p, vector, mask); + lbAddr res = lb_add_local_generated(p, type, false); - lbAddr res = lb_add_local_generated(p, type, true); - 
for_array(i, rows) { - LLVMValueRef row = rows[i]; - lbValue dst_row_ptr = lb_emit_matrix_epi(p, res.addr, 0, i); - LLVMValueRef ptr = dst_row_ptr.value; - ptr = LLVMBuildPointerCast(p->builder, ptr, LLVMPointerType(LLVMTypeOf(row), 0), ""); - LLVMBuildStore(p->builder, row, ptr); - } + LLVMValueRef res_ptr = res.addr.value; + res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(transposed_vector), 0), ""); + + LLVMValueRef store = LLVMBuildStore(p->builder, transposed_vector, res_ptr); + LLVMSetAlignment(store, cast(unsigned)type_align_of(type)); return lb_addr_load(p, res); } From 86a6f69793988ed8967c7cd91fdaf67e6821ef37 Mon Sep 17 00:00:00 2001 From: gingerBill Date: Mon, 11 May 2026 14:01:50 +0100 Subject: [PATCH 7/8] General clean up --- src/llvm_backend_expr.cpp | 58 +++++++++--------------------------- src/llvm_backend_utility.cpp | 1 + 2 files changed, 15 insertions(+), 44 deletions(-) diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index 968d27a4e..53e0d32de 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -866,8 +866,10 @@ gb_internal lbValue lb_emit_matrix_transpose(lbProcedure *p, lbValue m, Type *ty return lb_addr_load(p, res); } -gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef vector, Type *type) { - lbAddr res = lb_add_local_generated(p, type, true); +gb_internal lbAddr llvm_add_local_generated_from_vector(lbProcedure *p, Type *type, LLVMValueRef vector) { + GB_ASSERT(LLVMGetTypeKind(LLVMTypeOf(vector)) == LLVMVectorTypeKind); + + lbAddr res = lb_add_local_generated(p, type, false); LLVMValueRef res_ptr = res.addr.value; unsigned alignment = cast(unsigned)gb_max(type_align_of(type), lb_alignof(LLVMTypeOf(vector))); LLVMSetAlignment(res_ptr, alignment); @@ -875,9 +877,16 @@ gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef v res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(vector), 0), ""); 
LLVMBuildStore(p->builder, vector, res_ptr); + return res; +} + +gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef vector, Type *type) { + lbAddr res = llvm_add_local_generated_from_vector(p, type, vector); return lb_addr_load(p, res); } + + gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type) { if (is_type_array(m.type)) { // no-op @@ -895,31 +904,6 @@ gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type lbValue n = lb_const_int(p->module, t_int, type_size_of(type)); lb_mem_copy_non_overlapping(p, res.addr, m_ptr, n); - // i64 row_count = mt->Matrix.row_count; - // i64 column_count = mt->Matrix.column_count; - // TEMPORARY_ALLOCATOR_GUARD(); - - // auto srcs = array_make(temporary_allocator(), 0, row_count*column_count); - // auto dsts = array_make(temporary_allocator(), 0, row_count*column_count); - - // for (i64 j = 0; j < column_count; j++) { - // for (i64 i = 0; i < row_count; i++) { - // lbValue src = lb_emit_matrix_ev(p, m, i, j); - // array_add(&srcs, src); - // } - // } - - // for (i64 j = 0; j < column_count; j++) { - // for (i64 i = 0; i < row_count; i++) { - // lbValue dst = lb_emit_array_epi(p, res.addr, i + j*row_count); - // array_add(&dsts, dst); - // } - // } - - // GB_ASSERT(srcs.count == dsts.count); - // for_array(i, srcs) { - // lb_emit_store(p, dsts[i], srcs[i]); - // } return lb_addr_load(p, res); } @@ -1328,27 +1312,13 @@ gb_internal lbValue lb_emit_vector_mul_matrix(lbProcedure *p, lbValue lhs, lbVal GB_ASSERT(row_count > 0); - LLVMValueRef vector = nullptr; - for (i64 i = 0; i < row_count; i++) { - if (i == 0) { - vector = llvm_vector_mul(p, v_rows[i], m_columns[i]); - } else { - vector = llvm_vector_mul_add(p, v_rows[i], m_columns[i], vector); - } - } - - lbAddr res = lb_add_local_generated(p, type, true); - LLVMValueRef res_ptr = res.addr.value; - unsigned alignment = cast(unsigned)gb_max(type_align_of(type), lb_alignof(LLVMTypeOf(vector))); - 
LLVMSetAlignment(res_ptr, alignment); - - res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(vector), 0), ""); - LLVMBuildStore(p->builder, vector, res_ptr); + LLVMValueRef vector = llvm_vector_mul_pairwise_reduce_add(p, v_rows, m_columns); + lbAddr res = llvm_add_local_generated_from_vector(p, type, vector); return lb_addr_load(p, res); } - lbAddr res = lb_add_local_generated(p, type, true); + lbAddr res = lb_add_local_generated(p, type, false); Type *vector_elem_type = base_array_type(rhs.type); diff --git a/src/llvm_backend_utility.cpp b/src/llvm_backend_utility.cpp index 25481b2ed..bb3b4dadb 100644 --- a/src/llvm_backend_utility.cpp +++ b/src/llvm_backend_utility.cpp @@ -2293,6 +2293,7 @@ gb_internal LLVMValueRef llvm_vector_mul_add(lbProcedure *p, LLVMValueRef a, LLV } } + gb_internal LLVMValueRef llvm_get_inline_asm(LLVMTypeRef func_type, String const &str, String const &clobbers, bool has_side_effects=true, bool is_align_stack=false, LLVMInlineAsmDialect dialect=LLVMInlineAsmDialectATT) { return LLVMGetInlineAsm(func_type, cast(char *)str.text, cast(size_t)str.len, From 724ce14677b46473a6c5718371da1cffe4dab2ab Mon Sep 17 00:00:00 2001 From: gingerBill Date: Mon, 11 May 2026 15:44:12 +0100 Subject: [PATCH 8/8] More improvements to linalg functions --- core/math/linalg/extended.odin | 80 +++++++++++++++++----------------- core/math/linalg/general.odin | 13 +++--- 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/core/math/linalg/extended.odin b/core/math/linalg/extended.odin index 0470054c3..9e00a76d2 100644 --- a/core/math/linalg/extended.odin +++ b/core/math/linalg/extended.odin @@ -7,7 +7,7 @@ import "core:math" @(require_results) to_radians :: proc "contextless" (degrees: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. 
(out: T) where IS_NUMERIC(ELEM @(require_results) to_degrees :: proc "contextless" (radians: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_NUMERIC(ELEM @(require_results) min_double :: proc "contextless" (a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: ELEM_TYPE(T)) where IS_NUMERIC out = builtin.min(a[0], a[1]) } else { out = builtin.min(a[0], a[1]) - for i in 2.. (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: ELEM_TYPE(T)) where IS_NUMERIC out = builtin.max(a[0], a[1], a[2]) }else { out = builtin.max(a[0], a[1]) - for i in 2.. (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { @(require_results) sign :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { @(require_results) clamp :: proc "contextless" (x, a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. T where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) lerp :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T) @(require_results) mix :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. T where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) step :: proc "contextless" (e, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. T where IS_FLOAT(ELEM_TYPE @(require_results) sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) inverse_sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. 
(out: T) where IS_FLOAT(ELEM_TYPE( @(require_results) cos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) sin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) tan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) acos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) asin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) atan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) atan2 :: proc "contextless" (y, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) @(require_results) ln :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { log2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { INVLN2 :: 1.4426950408889634073599246810018921374266459541529859341354494069 when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { log10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { INVLN10 :: 0.4342944819032518276511289189166050822943970058036665661144537831 when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) log :: proc "contextless" (x, b: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. 
(out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) exp :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) exp2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) exp10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) pow :: proc "contextless" (x, e: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. (out: T) where IS_FLOAT(ELEM_TYPE(T)) { @(require_results) round :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) { when IS_ARRAY(T) { - for i in 0.. bool where IS_FLOAT(T) { @(require_results) is_nan_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]bool) where IS_FLOAT(T) { - for i in 0.. bool where IS_FLOAT(T) { @(require_results) is_inf_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]bool) where IS_FLOAT(T) { - for i in 0.. math.Float_Class where IS_FLOAT @(require_results) classify_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]math.Float_Class) where IS_FLOAT(T) { - for i in 0.. (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) { - for i in 0.. (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) { - for i in 0.. (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) { - for i in 0.. y[i] } return } @(require_results) greater_than_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) { - for i in 0..= y[i] } return } @(require_results) equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) { - for i in 0.. (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) { - for i in 0.. 
(out: bool) { @(require_results) not :: proc "contextless" (x: $A/[$N]bool) -> (out: A) { for e, i in x { - out[i] = !e + #no_bounds_check out[i] = !e } return } diff --git a/core/math/linalg/general.odin b/core/math/linalg/general.odin index 6a7ba3937..7013de244 100644 --- a/core/math/linalg/general.odin +++ b/core/math/linalg/general.odin @@ -46,18 +46,17 @@ scalar_dot :: proc "contextless" (a, b: $T) -> T where IS_FLOAT(T), !IS_ARRAY(T) @(require_results) vector_dot :: proc "contextless" (a, b: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check { - ab := a * b when N == 1 { - return ab.x + return a.x*b.x } else when N == 2 { - return ab.x + ab.y + return a.x*b.x + a.y*b.y } else when N == 3 { - return ab.x + ab.y + ab.z + return a.x*b.x + a.y*b.y + a.z*b.z } else when N == 4 { - return ab.x + ab.y + ab.z + ab.w + return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w } else { - for elem in ab { - c += elem + #unroll for _, i in a { + c += a[i]*b[i] } return c }