Merge pull request #6676 from odin-lang/bill/matrix-optimizations

General Matrix Optimizations + `@(fast_math)` attribute
This commit is contained in:
gingerBill
2026-05-12 10:42:35 +01:00
committed by GitHub
14 changed files with 476 additions and 168 deletions

View File

@@ -4,6 +4,7 @@ package intrinsics
import "base:runtime"
// Package-Related
is_package_imported :: proc(package_name: string) -> bool ---

View File

@@ -23,6 +23,23 @@ package runtime
import "base:intrinsics"
/*
Fast_Math_Flag :: enum u8 {
Allow_Reassoc = 0,
No_NaNs = 1,
No_Infs = 2,
No_Signed_Zeros = 3,
Allow_Reciprocal = 4,
Allow_Contract = 5,
Approx_Func = 6,
}
*/
Fast_Math_Flag :: intrinsics.Fast_Math_Flag
// Fast_Math_Flags :: distinct bit_set[Fast_Math_Flag; u32]
Fast_Math_Flags :: intrinsics.Fast_Math_Flags
// NOTE(bill): This must match the compiler's
Calling_Convention :: enum u8 {
Invalid = 0,

View File

@@ -7,7 +7,7 @@ import "core:math"
@(require_results)
to_radians :: proc "contextless" (degrees: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = degrees[i] * RAD_PER_DEG
}
} else {
@@ -19,7 +19,7 @@ to_radians :: proc "contextless" (degrees: $T) -> (out: T) where IS_NUMERIC(ELEM
@(require_results)
to_degrees :: proc "contextless" (radians: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = radians[i] * DEG_PER_RAD
}
} else {
@@ -31,7 +31,7 @@ to_degrees :: proc "contextless" (radians: $T) -> (out: T) where IS_NUMERIC(ELEM
@(require_results)
min_double :: proc "contextless" (a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = builtin.min(a[i], b[i])
}
} else {
@@ -51,7 +51,7 @@ min_single :: proc "contextless" (a: $T) -> (out: ELEM_TYPE(T)) where IS_NUMERIC
out = builtin.min(a[0], a[1])
} else {
out = builtin.min(a[0], a[1])
for i in 2..<N {
#no_bounds_check for i in 2..<N {
out = builtin.min(out, a[i])
}
}
@@ -71,7 +71,7 @@ min :: proc{min_single, min_double, min_triple}
@(require_results)
max_double :: proc "contextless" (a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = builtin.max(a[i], b[i])
}
} else {
@@ -93,7 +93,7 @@ max_single :: proc "contextless" (a: $T) -> (out: ELEM_TYPE(T)) where IS_NUMERIC
out = builtin.max(a[0], a[1], a[2])
}else {
out = builtin.max(a[0], a[1])
for i in 2..<N {
#no_bounds_check for i in 2..<N {
out = builtin.max(out, a[i])
}
}
@@ -113,7 +113,7 @@ max :: proc{max_single, max_double, max_triple}
@(require_results)
abs :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = auto_cast builtin.abs(a[i])
}
} else {
@@ -125,7 +125,7 @@ abs :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
@(require_results)
sign :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = #force_inline math.sign(a[i])
}
} else {
@@ -137,7 +137,7 @@ sign :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
@(require_results)
clamp :: proc "contextless" (x, a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = builtin.clamp(x[i], a[i], b[i])
}
} else {
@@ -155,7 +155,7 @@ saturate :: proc "contextless" (x: $T) -> T where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
lerp :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = a[i]*(1-t[i]) + b[i]*t[i]
}
} else {
@@ -166,7 +166,7 @@ lerp :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)
@(require_results)
mix :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = a[i]*(1-t[i]) + b[i]*t[i]
}
} else {
@@ -183,7 +183,7 @@ unlerp :: proc "contextless" (a, b, x: $T) -> T where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
step :: proc "contextless" (e, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = x[i] < e[i] ? 0.0 : 1.0
}
} else {
@@ -208,7 +208,7 @@ smootherstep :: proc "contextless" (e0, e1, x: $T) -> T where IS_FLOAT(ELEM_TYPE
@(require_results)
sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.sqrt(x[i])
}
} else {
@@ -220,7 +220,7 @@ sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
inverse_sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = 1.0/math.sqrt(x[i])
}
} else {
@@ -232,7 +232,7 @@ inverse_sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(
@(require_results)
cos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.cos(x[i])
}
} else {
@@ -244,7 +244,7 @@ cos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
sin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.sin(x[i])
}
} else {
@@ -256,7 +256,7 @@ sin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
tan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.tan(x[i])
}
} else {
@@ -268,7 +268,7 @@ tan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
acos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.acos(x[i])
}
} else {
@@ -280,7 +280,7 @@ acos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
asin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.asin(x[i])
}
} else {
@@ -292,7 +292,7 @@ asin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
atan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.atan(x[i])
}
} else {
@@ -303,7 +303,7 @@ atan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
atan2 :: proc "contextless" (y, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.atan2(y[i], x[i])
}
} else {
@@ -316,7 +316,7 @@ atan2 :: proc "contextless" (y, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T))
@(require_results)
ln :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.ln(x[i])
}
} else {
@@ -329,7 +329,7 @@ ln :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
log2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
INVLN2 :: 1.4426950408889634073599246810018921374266459541529859341354494069
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = INVLN2 * math.ln(x[i])
}
} else {
@@ -342,7 +342,7 @@ log2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
log10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
INVLN10 :: 0.4342944819032518276511289189166050822943970058036665661144537831
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = INVLN10 * math.ln(x[i])
}
} else {
@@ -354,7 +354,7 @@ log10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
log :: proc "contextless" (x, b: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.ln(x[i]) / math.ln(cast(ELEM_TYPE(T))b[i])
}
} else {
@@ -366,7 +366,7 @@ log :: proc "contextless" (x, b: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
exp :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.exp(x[i])
}
} else {
@@ -378,7 +378,7 @@ exp :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
exp2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.exp(LN2 * x[i])
}
} else {
@@ -390,7 +390,7 @@ exp2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
exp10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.exp(LN10 * x[i])
}
} else {
@@ -402,7 +402,7 @@ exp10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
pow :: proc "contextless" (x, e: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = math.pow(x[i], e[i])
}
} else {
@@ -425,7 +425,7 @@ floor :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
@(require_results)
round :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
when IS_ARRAY(T) {
for i in 0..<len(T) {
#no_bounds_check for i in 0..<len(T) {
out[i] = #force_inline math.round(x[i])
}
} else {
@@ -486,7 +486,7 @@ is_nan_single :: proc "contextless" (x: $T) -> bool where IS_FLOAT(T) {
@(require_results)
is_nan_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]bool) where IS_FLOAT(T) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = #force_inline is_nan(x[i])
}
return
@@ -499,7 +499,7 @@ is_inf_single :: proc "contextless" (x: $T) -> bool where IS_FLOAT(T) {
@(require_results)
is_inf_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]bool) where IS_FLOAT(T) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = #force_inline is_inf(x[i])
}
return
@@ -512,7 +512,7 @@ classify_single :: proc "contextless" (x: $T) -> math.Float_Class where IS_FLOAT
@(require_results)
classify_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]math.Float_Class) where IS_FLOAT(T) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = #force_inline classify_single(x[i])
}
return
@@ -532,42 +532,42 @@ classify :: proc{classify_single, classify_array}
@(require_results)
less_than_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = x[i] < y[i]
}
return
}
@(require_results)
less_than_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = x[i] <= y[i]
}
return
}
@(require_results)
greater_than_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = x[i] > y[i]
}
return
}
@(require_results)
greater_than_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = x[i] >= y[i]
}
return
}
@(require_results)
equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = x[i] == y[i]
}
return
}
@(require_results)
not_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
for i in 0..<N {
#no_bounds_check for i in 0..<N {
out[i] = x[i] != y[i]
}
return
@@ -601,7 +601,7 @@ all :: proc "contextless" (x: $A/[$N]bool) -> (out: bool) {
@(require_results)
not :: proc "contextless" (x: $A/[$N]bool) -> (out: A) {
for e, i in x {
out[i] = !e
#no_bounds_check out[i] = !e
}
return
}

View File

@@ -46,18 +46,17 @@ scalar_dot :: proc "contextless" (a, b: $T) -> T where IS_FLOAT(T), !IS_ARRAY(T)
@(require_results)
vector_dot :: proc "contextless" (a, b: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
ab := a * b
when N == 1 {
return ab.x
return a.x*b.x
} else when N == 2 {
return ab.x + ab.y
return a.x*b.x + a.y*b.y
} else when N == 3 {
return ab.x + ab.y + ab.z
return a.x*b.x + a.y*b.y + a.z*b.z
} else when N == 4 {
return ab.x + ab.y + ab.z + ab.w
return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w
} else {
for elem in ab {
c += elem
#unroll for _, i in a {
c += a[i]*b[i]
}
return c
}

View File

@@ -455,6 +455,7 @@ enum IntegerDivisionByZeroKind : u8 {
IntegerDivisionByZero_AllBits,
};
// This stores the information for the specific architecture of this build
struct BuildContext {
// Constants

View File

@@ -1480,6 +1480,8 @@ gb_internal void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
e->Procedure.no_sanitize_memory = ac.no_sanitize_memory;
e->Procedure.no_sanitize_thread = ac.no_sanitize_thread;
e->Procedure.fast_math_flags = ac.fast_math_flags;
e->deprecated_message = ac.deprecated_message;
e->warning_message = ac.warning_message;
ac.link_name = handle_link_name(ctx, e->token, ac.link_name, ac.link_prefix, ac.link_suffix);

View File

@@ -1140,6 +1140,11 @@ gb_internal void check_assignment(CheckerContext *c, Operand *operand, Type *typ
return;
}
if (operand->mode == Addressing_Type && is_type_typeid(type)) {
add_type_info_type(c, operand->type);
add_type_and_value(c, operand->expr, Addressing_Value, type, exact_value_typeid(operand->type));
return;
}
if (is_type_untyped(operand->type)) {
Type *target_type = type;
@@ -9122,9 +9127,11 @@ gb_internal bool check_is_operand_compound_lit_constant(CheckerContext *c, Opera
if (is_type_any(field_type)) {
return false;
}
if (field_type != nullptr && is_type_typeid(field_type) && o->mode == Addressing_Type) {
add_type_info_type(c, o->type);
return true;
if (field_type != nullptr && is_type_typeid(field_type)) {
if (o->mode == Addressing_Type) {
add_type_info_type(c, o->type);
return true;
}
}
Ast *expr = unparen_expr(o->expr);
@@ -10584,6 +10591,7 @@ gb_internal ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *
}
i64 max = 0;
Type *bet = base_type(elem_type);
@@ -12483,7 +12491,7 @@ gb_internal void check_multi_expr_with_type_hint(CheckerContext *c, Operand *o,
case Addressing_Type:
if (type_hint != nullptr && is_type_typeid(type_hint)) {
add_type_info_type(c, o->type);
break;
return;
}
error_operand_not_expression(o);
break;

View File

@@ -1042,14 +1042,14 @@ struct GlobalEnumValue {
i64 value;
};
gb_internal Slice<Entity *> add_global_enum_type(String const &type_name, GlobalEnumValue *values, isize value_count, Type **enum_type_ = nullptr) {
gb_internal Slice<Entity *> add_global_enum_type(String const &type_name, GlobalEnumValue *values, isize value_count, Type **enum_type_ = nullptr, Type *backing_type = nullptr) {
Scope *scope = create_scope(nullptr, builtin_pkg->scope);
Entity *entity = alloc_entity_type_name(scope, make_token_ident(type_name), nullptr, EntityState_Resolved);
Type *enum_type = alloc_type_enum();
Type *named_type = alloc_type_named(type_name, enum_type, entity);
set_base_type(named_type, enum_type);
enum_type->Enum.base_type = t_int;
enum_type->Enum.base_type = backing_type ? backing_type : t_int;
enum_type->Enum.scope = scope;
entity->type = named_type;
@@ -1250,6 +1250,41 @@ gb_internal void init_universal(void) {
add_global_enum_constant(fields, "ODIN_ERROR_POS_STYLE", build_context.ODIN_ERROR_POS_STYLE);
}
{
GlobalEnumValue values[OdinFastMath_COUNT] = {};
for (unsigned i = 0; i < OdinFastMath_COUNT; i++) {
values[i] = {OdinFastMathFlag_strings[i], i};
}
auto fields = add_global_enum_type(str_lit("Fast_Math_Flag"), values, gb_count_of(values), &t_fast_math_flag, t_u8);
GB_ASSERT(t_fast_math_flag->kind == Type_Named);
scope_insert(intrinsics_pkg->scope, t_fast_math_flag->Named.type_name);
Type *bs = alloc_type_bit_set();
bs->BitSet.elem = t_fast_math_flag;
bs->BitSet.underlying = t_u32;
bs->BitSet.lower = 0;
bs->BitSet.upper = OdinFastMath_COUNT-1;
bs->BitSet.node = nullptr;
{
String type_name = str_lit("Fast_Math_Flags");
Scope *scope = create_scope(nullptr, builtin_pkg->scope);
Entity *entity = alloc_entity_type_name(scope, make_token_ident(type_name), nullptr, EntityState_Resolved);
Type *named_type = alloc_type_named(type_name, bs, entity);
set_base_type(named_type, bs);
entity->type = named_type;
t_fast_math_flags = named_type;
scope_insert(intrinsics_pkg->scope, entity);
}
}
{
GlobalEnumValue values[OdinAtomicMemoryOrder_COUNT] = {
{OdinAtomicMemoryOrder_strings[OdinAtomicMemoryOrder_relaxed], OdinAtomicMemoryOrder_relaxed},
@@ -3554,11 +3589,17 @@ gb_internal void init_preload(Checker *c) {
init_core_objc_c(c);
}
gb_internal ExactValue check_decl_attribute_value(CheckerContext *c, Ast *value) {
gb_internal void check_expr_with_type_hint(CheckerContext *c, Operand *o, Ast *e, Type *t);
gb_internal ExactValue check_decl_attribute_value(CheckerContext *c, Ast *value, Type *type_hint = nullptr) {
ExactValue ev = {};
if (value != nullptr) {
Operand op = {};
check_expr(c, &op, value);
if (type_hint != nullptr) {
check_expr_with_type_hint(c, &op, value, type_hint);
} else {
check_expr(c, &op, value);
}
if (op.mode) {
if (op.mode == Addressing_Constant) {
ev = op.value;
@@ -4126,6 +4167,18 @@ gb_internal DECL_ATTRIBUTE_PROC(proc_decl_attribute) {
}
ac->no_sanitize_thread = true;
return true;
} else if (name == "fast_math") {
if (value == nullptr) {
error(elem, "Expected a constant bit_set of type 'intrinsics.Fast_Math_Flags' for '%.*s'", LIT(name));
} else {
ExactValue ev = check_decl_attribute_value(c, value, t_fast_math_flags);
if (ev.kind != ExactValue_Integer) {
error(elem, "Expected a constant bit_set of type 'intrinsics.Fast_Math_Flags' for '%.*s'", LIT(name));
} else {
ac->fast_math_flags = exact_value_to_u64(ev);
}
}
return true;
}
return false;
}

View File

@@ -163,6 +163,8 @@ struct AttributeContext {
String require_target_feature; // required by the target micro-architecture
String enable_target_feature; // will be enabled for the procedure only
u64 fast_math_flags;
bool raddbg_type_view;
String raddbg_type_view_string;
};

View File

@@ -256,6 +256,9 @@ struct Entity {
struct GenProcsData *gen_procs;
BlockingMutex gen_procs_mutex;
ProcedureOptimizationMode optimization_mode;
u64 fast_math_flags;
bool is_foreign : 1;
bool is_export : 1;
bool generated_from_polymorphic : 1;

View File

@@ -672,7 +672,7 @@ gb_internal lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lh
}
}
gb_internal bool lb_is_matrix_simdable(Type *t) {
gb_internal bool lb_is_matrix_simdable(Type *t, bool ignore_layout=false) {
Type *mt = base_type(t);
GB_ASSERT(mt->kind == Type_Matrix);
@@ -697,12 +697,14 @@ gb_internal bool lb_is_matrix_simdable(Type *t) {
// it's not aligned well enough to use the vector instructions
return false;
}
if ((mt->Matrix.row_count & 1) ^ (mt->Matrix.column_count & 1)) {
if ((mt->Matrix.row_count & 1) && (mt->Matrix.column_count & 1)) {
return false;
}
if (mt->Matrix.is_row_major) {
// TODO(bill): make #row_major matrices work with SIMD
return false;
if (!ignore_layout) {
// TODO(bill): make #row_major matrices work with SIMD
return false;
}
}
if (elem->kind == Type_Basic) {
@@ -820,35 +822,32 @@ gb_internal lbValue lb_emit_matrix_transpose(lbProcedure *p, lbValue m, Type *ty
GB_PANIC("TODO: transpose with changing layout");
}
if (lb_is_matrix_simdable(mt) && lb_is_matrix_simdable(type)) {
if (lb_is_matrix_simdable(mt, true) && lb_is_matrix_simdable(type, true)) {
auto const do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef {
return LLVMConstInt(lb_type(p->module, t_u32), val, false);
};
unsigned stride = cast(unsigned)matrix_type_stride_in_elems(mt);
unsigned row_count = cast(unsigned)mt->Matrix.row_count;
unsigned column_count = cast(unsigned)mt->Matrix.column_count;
auto rows = slice_make<LLVMValueRef>(permanent_allocator(), row_count);
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), column_count);
unsigned other_stride = (row_count*column_count)/stride;
LLVMValueRef vector = lb_matrix_to_vector(p, m);
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), row_count * column_count);
for (unsigned i = 0; i < row_count; i++) {
for (unsigned j = 0; j < column_count; j++) {
unsigned offset = stride*j + i;
mask_elems[j] = lb_const_int(p->module, t_u32, offset).value;
mask_elems[other_stride*i + j] = do_u32(p, stride*j + i);
}
// transpose mask
LLVMValueRef mask = LLVMConstVector(mask_elems.data, column_count);
LLVMValueRef row = llvm_basic_shuffle(p, vector, mask);
rows[i] = row;
}
LLVMValueRef mask = LLVMConstVector(mask_elems.data, cast(unsigned)mask_elems.count);
LLVMValueRef transposed_vector = llvm_basic_shuffle(p, vector, mask);
lbAddr res = lb_add_local_generated(p, type, false);
lbAddr res = lb_add_local_generated(p, type, true);
for_array(i, rows) {
LLVMValueRef row = rows[i];
lbValue dst_row_ptr = lb_emit_matrix_epi(p, res.addr, 0, i);
LLVMValueRef ptr = dst_row_ptr.value;
ptr = LLVMBuildPointerCast(p->builder, ptr, LLVMPointerType(LLVMTypeOf(row), 0), "");
LLVMBuildStore(p->builder, row, ptr);
}
LLVMValueRef res_ptr = res.addr.value;
res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(transposed_vector), 0), "");
LLVMValueRef store = LLVMBuildStore(p->builder, transposed_vector, res_ptr);
LLVMSetAlignment(store, cast(unsigned)type_align_of(type));
return lb_addr_load(p, res);
}
@@ -867,8 +866,10 @@ gb_internal lbValue lb_emit_matrix_transpose(lbProcedure *p, lbValue m, Type *ty
return lb_addr_load(p, res);
}
gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef vector, Type *type) {
lbAddr res = lb_add_local_generated(p, type, true);
gb_internal lbAddr llvm_add_local_generated_from_vector(lbProcedure *p, Type *type, LLVMValueRef vector) {
GB_ASSERT(LLVMGetTypeKind(LLVMTypeOf(vector)) == LLVMVectorTypeKind);
lbAddr res = lb_add_local_generated(p, type, false);
LLVMValueRef res_ptr = res.addr.value;
unsigned alignment = cast(unsigned)gb_max(type_align_of(type), lb_alignof(LLVMTypeOf(vector)));
LLVMSetAlignment(res_ptr, alignment);
@@ -876,9 +877,16 @@ gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef v
res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(vector), 0), "");
LLVMBuildStore(p->builder, vector, res_ptr);
return res;
}
gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef vector, Type *type) {
lbAddr res = llvm_add_local_generated_from_vector(p, type, vector);
return lb_addr_load(p, res);
}
gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type) {
if (is_type_array(m.type)) {
// no-op
@@ -896,31 +904,6 @@ gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type
lbValue n = lb_const_int(p->module, t_int, type_size_of(type));
lb_mem_copy_non_overlapping(p, res.addr, m_ptr, n);
// i64 row_count = mt->Matrix.row_count;
// i64 column_count = mt->Matrix.column_count;
// TEMPORARY_ALLOCATOR_GUARD();
// auto srcs = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
// auto dsts = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
// for (i64 j = 0; j < column_count; j++) {
// for (i64 i = 0; i < row_count; i++) {
// lbValue src = lb_emit_matrix_ev(p, m, i, j);
// array_add(&srcs, src);
// }
// }
// for (i64 j = 0; j < column_count; j++) {
// for (i64 i = 0; i < row_count; i++) {
// lbValue dst = lb_emit_array_epi(p, res.addr, i + j*row_count);
// array_add(&dsts, dst);
// }
// }
// GB_ASSERT(srcs.count == dsts.count);
// for_array(i, srcs) {
// lb_emit_store(p, dsts[i], srcs[i]);
// }
return lb_addr_load(p, res);
}
@@ -959,6 +942,10 @@ gb_internal lbValue lb_emit_outer_product(lbProcedure *p, lbValue a, lbValue b,
gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, Type *type) {
// TODO(bill): Handle edge case for f16 types on x86(-64) platforms
auto const do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef {
return LLVMConstInt(lb_type(p->module, t_u32), val, false);
};
Type *xt = base_type(lhs.type);
Type *yt = base_type(rhs.type);
@@ -975,50 +962,183 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
unsigned inner = cast(unsigned)xt->Matrix.column_count;
unsigned outer_columns = cast(unsigned)yt->Matrix.column_count;
if (!xt->Matrix.is_row_major && lb_is_matrix_simdable(xt)) {
unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
if (lb_is_matrix_simdable(xt, true)) {
if (!xt->Matrix.is_row_major) { // #column_major
unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);
LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);
LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);
if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) {
// square matrix calculation
unsigned N = outer_columns;
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
for (unsigned i = 0; i < outer_rows; i++) {
for (unsigned j = 0; j < inner; j++) {
unsigned offset = x_stride*j + i;
mask_elems[j] = lb_const_int(p->module, t_u32, offset).value;
auto x_columns = slice_make<LLVMValueRef>(permanent_allocator(), N);
auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), N);
for (unsigned i = 0; i < N; i++) {
LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, N);
LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask);
x_columns[i] = column;
}
for (unsigned i = 0; i < N; i++) {
LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, N);
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
y_columns[i] = column;
}
auto z_columns = slice_make<LLVMValueRef>(permanent_allocator(), N);
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), N);
for (unsigned i = 0; i < N; i++) {
for (unsigned j = 0; j < N; j++) {
LLVMValueRef mask = llvm_mask_same(p->module, j, N);
mask_elems[j] = llvm_basic_shuffle(p, y_columns[i], mask);
}
z_columns[i] = llvm_vector_mul_pairwise_reduce_add(p, mask_elems, x_columns);
}
lbAddr res = lb_add_local_generated(p, type, false);
LLVMValueRef dest_ptr = res.addr.value;
LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_columns[0]), 0);
dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, "");
for (unsigned i = 0; i < N; i++) {
LLVMValueRef indices[] = {do_u32(p, i)};
LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_columns[0]), dest_ptr, indices, 1, "");
LLVMBuildStore(p->builder, z_columns[i], dst);
}
return lb_addr_load(p, res);
}
// transpose mask
LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
x_rows[i] = row;
}
for (unsigned i = 0; i < outer_columns; i++) {
LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner);
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
y_columns[i] = column;
}
auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);
lbAddr res = lb_add_local_generated(p, type, true);
for_array(i, x_rows) {
LLVMValueRef x_row = x_rows[i];
for_array(j, y_columns) {
LLVMValueRef y_column = y_columns[j];
LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
LLVMBuildStore(p->builder, elem, dst.value);
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
for (unsigned i = 0; i < outer_rows; i++) {
for (unsigned j = 0; j < inner; j++) {
unsigned offset = x_stride*j + i;
mask_elems[j] = do_u32(p, offset);
}
// transpose mask
LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
x_rows[i] = row;
}
for (unsigned i = 0; i < outer_columns; i++) {
LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner);
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
y_columns[i] = column;
}
lbAddr res = lb_add_local_generated(p, type, false);
for_array(i, x_rows) {
LLVMValueRef x_row = x_rows[i];
for_array(j, y_columns) {
LLVMValueRef y_column = y_columns[j];
LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
LLVMBuildStore(p->builder, elem, dst.value);
}
}
return lb_addr_load(p, res);
} else { // #row_major
unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);
if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) {
// square matrix calculation
unsigned N = outer_columns;
auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), N);
auto y_rows = slice_make<LLVMValueRef>(permanent_allocator(), N);
for (unsigned i = 0; i < N; i++) {
LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, N);
LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask);
x_rows[i] = column;
}
for (unsigned i = 0; i < N; i++) {
LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, N);
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
y_rows[i] = column;
}
auto z_rows = slice_make<LLVMValueRef>(permanent_allocator(), N);
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), N);
for (unsigned i = 0; i < N; i++) {
for (unsigned j = 0; j < N; j++) {
LLVMValueRef mask = llvm_mask_same(p->module, j, N);
mask_elems[j] = llvm_basic_shuffle(p, x_rows[i], mask);
}
z_rows[i] = llvm_vector_mul_pairwise_reduce_add(p, mask_elems, y_rows);
}
lbAddr res = lb_add_local_generated(p, type, false);
LLVMValueRef dest_ptr = res.addr.value;
LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_rows[0]), 0);
dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, "");
for (unsigned i = 0; i < N; i++) {
LLVMValueRef indices[] = {do_u32(p, i)};
LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_rows[0]), dest_ptr, indices, 1, "");
LLVMBuildStore(p->builder, z_rows[i], dst);
}
return lb_addr_load(p, res);
}
auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);
for (unsigned i = 0; i < outer_rows; i++) {
LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, inner);
LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
x_rows[i] = row;
}
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
for (unsigned i = 0; i < outer_columns; i++) {
for (unsigned j = 0; j < inner; j++) {
unsigned offset = x_stride*j + i;
mask_elems[j] = do_u32(p, offset);
}
// transpose mask
LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
y_columns[i] = column;
}
lbAddr res = lb_add_local_generated(p, type, false);
for_array(i, x_rows) {
LLVMValueRef x_row = x_rows[i];
for_array(j, y_columns) {
LLVMValueRef y_column = y_columns[j];
LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
LLVMBuildStore(p->builder, elem, dst.value);
}
}
return lb_addr_load(p, res);
}
return lb_addr_load(p, res);
}
if (!xt->Matrix.is_row_major) {
lbAddr res = lb_add_local_generated(p, type, true);
lbAddr res = lb_add_local_generated(p, type, false);
auto inners = slice_make<lbValue[2]>(permanent_allocator(), inner);
@@ -1042,7 +1162,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
return lb_addr_load(p, res);
} else {
lbAddr res = lb_add_local_generated(p, type, true);
lbAddr res = lb_add_local_generated(p, type, false);
auto inners = slice_make<lbValue[2]>(permanent_allocator(), inner);
@@ -1100,23 +1220,25 @@ gb_internal lbValue lb_emit_matrix_mul_vector(lbProcedure *p, lbValue lhs, lbVal
m_columns[column_index] = column;
}
for (unsigned row_index = 0; row_index < column_count; row_index++) {
LLVMValueRef value = LLVMBuildExtractValue(p->builder, rhs.value, row_index, "");
LLVMValueRef row = llvm_vector_broadcast(p, value, row_count);
v_rows[row_index] = row;
}
if (LLVMIsALoadInst(rhs.value)) {
LLVMValueRef rhs_ptr = LLVMGetOperand(rhs.value, 0);
LLVMTypeRef vector_type = LLVMVectorType(lb_type(p->module, elem), cast(unsigned)vector_count);
LLVMValueRef rhs_vector = LLVMBuildLoad2(p->builder, vector_type, rhs_ptr, "");
LLVMSetAlignment(rhs_vector, cast(unsigned)type_align_of(type));
GB_ASSERT(column_count > 0);
LLVMValueRef vector = nullptr;
for (i64 i = 0; i < column_count; i++) {
if (i == 0) {
vector = llvm_vector_mul(p, m_columns[i], v_rows[i]);
} else {
vector = llvm_vector_mul_add(p, m_columns[i], v_rows[i], vector);
for (unsigned i = 0; i < column_count; i++) {
LLVMValueRef mask = llvm_mask_same(p->module, i, row_count);
v_rows[i] = llvm_basic_shuffle(p, rhs_vector, mask);
}
} else {
for (unsigned row_index = 0; row_index < column_count; row_index++) {
LLVMValueRef value = LLVMBuildExtractValue(p->builder, rhs.value, row_index, "");
LLVMValueRef row = llvm_vector_broadcast(p, value, row_count);
v_rows[row_index] = row;
}
}
LLVMValueRef vector = llvm_vector_mul_pairwise_reduce_add(p, m_columns, v_rows);
return lb_matrix_cast_vector_to_type(p, vector, type);
}
@@ -1190,27 +1312,13 @@ gb_internal lbValue lb_emit_vector_mul_matrix(lbProcedure *p, lbValue lhs, lbVal
GB_ASSERT(row_count > 0);
LLVMValueRef vector = nullptr;
for (i64 i = 0; i < row_count; i++) {
if (i == 0) {
vector = llvm_vector_mul(p, v_rows[i], m_columns[i]);
} else {
vector = llvm_vector_mul_add(p, v_rows[i], m_columns[i], vector);
}
}
lbAddr res = lb_add_local_generated(p, type, true);
LLVMValueRef res_ptr = res.addr.value;
unsigned alignment = cast(unsigned)gb_max(type_align_of(type), lb_alignof(LLVMTypeOf(vector)));
LLVMSetAlignment(res_ptr, alignment);
res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(vector), 0), "");
LLVMBuildStore(p->builder, vector, res_ptr);
LLVMValueRef vector = llvm_vector_mul_pairwise_reduce_add(p, v_rows, m_columns);
lbAddr res = llvm_add_local_generated_from_vector(p, type, vector);
return lb_addr_load(p, res);
}
lbAddr res = lb_add_local_generated(p, type, true);
lbAddr res = lb_add_local_generated(p, type, false);
Type *vector_elem_type = base_array_type(rhs.type);

View File

@@ -270,6 +270,55 @@ gb_internal void lb_populate_module_pass_manager(LLVMTargetMachineRef target_mac
optimization of Odin programs
**************************************************************************/
// Applies the fast-math flags requested for a procedure to the floating-point
// instructions that have already been emitted for it.
//
// The flags are read from `p->entity->Procedure.fast_math_flags`, translated to
// the matching `LLVMFastMath*` masks, and attached to every instruction in the
// procedure whose opcode is listed in the switch below and which LLVM reports
// as able to carry fast-math flags.
//
// Does nothing when the procedure has no entity or when no flag is requested.
gb_internal void lb_run_fast_float_math_pass(lbProcedure *p) {
	Entity *e = p->entity;
	if (e == nullptr) {
		return;
	}
	GB_ASSERT(e->kind == Entity_Procedure);

	u64 fast_math_flags = e->Procedure.fast_math_flags;

	// The `OdinFastMath_*` enumerators are bit positions (0, 1, 2, ...), not
	// masks, so each one must be shifted before it is tested. Testing the raw
	// enumerator would make `Allow_Reassoc` (value 0) impossible to enable and
	// would check the wrong bits for every other flag.
	// NOTE(review): assumes `fast_math_flags` stores flag `f` as bit `1 << f`,
	// i.e. the `bit_set[Fast_Math_Flag]` layout - confirm against the checker.
	LLVMFastMathFlags llvm_flags = 0;
	if (fast_math_flags & (1ull << OdinFastMath_Allow_Reassoc))    llvm_flags |= LLVMFastMathAllowReassoc;
	if (fast_math_flags & (1ull << OdinFastMath_No_NaNs))          llvm_flags |= LLVMFastMathNoNaNs;
	if (fast_math_flags & (1ull << OdinFastMath_No_Infs))          llvm_flags |= LLVMFastMathNoInfs;
	if (fast_math_flags & (1ull << OdinFastMath_No_Signed_Zeros))  llvm_flags |= LLVMFastMathNoSignedZeros;
	if (fast_math_flags & (1ull << OdinFastMath_Allow_Reciprocal)) llvm_flags |= LLVMFastMathAllowReciprocal;
	if (fast_math_flags & (1ull << OdinFastMath_Allow_Contract))   llvm_flags |= LLVMFastMathAllowContract;
	if (fast_math_flags & (1ull << OdinFastMath_Approx_Func))      llvm_flags |= LLVMFastMathApproxFunc;

	if (llvm_flags == 0) {
		return;
	}

	for (LLVMBasicBlockRef block = LLVMGetFirstBasicBlock(p->value);
	     block != nullptr;
	     block = LLVMGetNextBasicBlock(block)) {
		for (LLVMValueRef instr = LLVMGetFirstInstruction(block);
		     instr != nullptr;
		     instr = LLVMGetNextInstruction(instr)) {
			switch (LLVMGetInstructionOpcode(instr)) {
			case LLVMFNeg:
			case LLVMFAdd:
			case LLVMFSub:
			case LLVMFMul:
			case LLVMFDiv:
			case LLVMFRem:
			case LLVMFPToUI:
			case LLVMFPToSI:
			case LLVMUIToFP:
			case LLVMSIToFP:
			case LLVMFPTrunc:
			case LLVMFPExt:
			case LLVMFCmp:
				// `LLVMSetFastMathFlags` is only valid on instructions that
				// LLVM treats as floating-point math operators. Which of the
				// conversion opcodes qualify depends on the LLVM version, so
				// ask LLVM rather than assume.
				if (LLVMCanValueUseFastMathFlags(instr)) {
					LLVMSetFastMathFlags(instr, llvm_flags);
				}
				break;
			default:
				break;
			}
		}
	}
}
gb_internal void lb_run_remove_dead_instruction_pass(lbProcedure *p) {
unsigned debug_declare_id = LLVMLookupIntrinsicID("llvm.dbg.declare", 16);
GB_ASSERT(debug_declare_id != 0);
@@ -475,6 +524,9 @@ gb_internal void lb_run_function_pass_manager(LLVMPassManagerRef fpm, lbProcedur
if (p == nullptr) {
return;
}
lb_run_fast_float_math_pass(p);
// NOTE(bill): LLVMAddDCEPass doesn't seem to be exported in the official DLLs for LLVM
// which means we cannot rely upon it
// This is also useful for reading the .ll for debug purposes because a lot of instructions

View File

@@ -2048,6 +2048,15 @@ gb_internal LLVMValueRef llvm_mask_zero(lbModule *m, unsigned count) {
return LLVMConstNull(LLVMVectorType(lb_type(m, t_u32), count));
}
// Builds a constant vector of `count` u32 lanes in which every lane holds
// `value`. Elsewhere in this file the result is passed to `llvm_basic_shuffle`
// as the shuffle mask.
gb_internal LLVMValueRef llvm_mask_same(lbModule *m, unsigned value, unsigned count) {
	// Every lane is the same u32 constant, so build it once and reuse it.
	LLVMValueRef lane = lb_const_int(m, t_u32, value).value;

	auto lanes = slice_make<LLVMValueRef>(temporary_allocator(), count);
	for (unsigned lane_index = 0; lane_index < count; lane_index += 1) {
		lanes[lane_index] = lane;
	}
	return LLVMConstVector(lanes.data, count);
}
#define LLVM_VECTOR_DUMMY_VALUE(type) LLVMGetUndef((type))
// #define LLVM_VECTOR_DUMMY_VALUE(type) LLVMConstNull((type))
@@ -2221,6 +2230,30 @@ gb_internal LLVMValueRef llvm_vector_mul(lbProcedure *p, LLVMValueRef a, LLVMVal
return LLVMBuildFMul(p->builder, a, b, "");
}
// Computes `a[0]*b[0] + a[1]*b[1] + ... + a[n-1]*b[n-1]` for two slices of
// LLVM values and returns the value holding the sum.
//
// Each product is emitted with `llvm_vector_mul`. The products are then summed
// with `llvm_vector_add` by repeatedly adding neighbouring pairs, halving the
// number of partial sums each round, instead of accumulating left to right.
//
// Both slices must have the same, non-zero length.
gb_internal LLVMValueRef llvm_vector_mul_pairwise_reduce_add(lbProcedure *p, Slice<LLVMValueRef> const &a, Slice<LLVMValueRef> const &b) {
	GB_ASSERT(a.count == b.count);
	// An empty input has no `temps[0]` to return; fail loudly here rather than
	// index past the end of a zero-length slice.
	GB_ASSERT(a.count > 0);

	unsigned count = cast(unsigned)a.count;

	auto temps = slice_make<LLVMValueRef>(temporary_allocator(), a.count);
	for (unsigned i = 0; i < count; i++) {
		temps[i] = llvm_vector_mul(p, a[i], b[i]);
	}

	// `k` is the number of live partial sums stored in temps[0..k).
	unsigned k = count;
	while (k > 1) {
		unsigned half = k/2;
		// Writing temps[j] is safe in place: it only reads indices >= j.
		for (unsigned j = 0; j < half; j++) {
			temps[j] = llvm_vector_add(p, temps[2*j + 0], temps[2*j + 1]);
		}
		if ((k&1) != 0) {
			// Odd count: carry the unpaired last element into the next round.
			temps[half] = temps[k-1];
		}
		k = (k+1)/2;
	}
	return temps[0];
}
gb_internal LLVMValueRef llvm_vector_dot(lbProcedure *p, LLVMValueRef a, LLVMValueRef b) {
return llvm_vector_reduce_add(p, llvm_vector_mul(p, a, b));
@@ -2260,6 +2293,7 @@ gb_internal LLVMValueRef llvm_vector_mul_add(lbProcedure *p, LLVMValueRef a, LLV
}
}
gb_internal LLVMValueRef llvm_get_inline_asm(LLVMTypeRef func_type, String const &str, String const &clobbers, bool has_side_effects=true, bool is_align_stack=false, LLVMInlineAsmDialect dialect=LLVMInlineAsmDialectATT) {
return LLVMGetInlineAsm(func_type,
cast(char *)str.text, cast(size_t)str.len,

View File

@@ -805,6 +805,34 @@ gb_global Type *t_atomic_memory_order = nullptr;
// Compiler-side list of the fast-math flags. The names and values match the
// `Fast_Math_Flag` enum shown in `base:runtime` (Allow_Reassoc = 0 ... Approx_Func = 6).
//
// The enumerators are consecutive indices, not bit masks.
// NOTE(review): a set of these flags is presumably stored as `1 << flag` bits
// (the `bit_set[Fast_Math_Flag]` layout) - confirm where the set is built.
//
// `OdinFastMath_COUNT` is the number of flags and sizes the name table below.
enum OdinFastMathFlag : u8 {
OdinFastMath_Allow_Reassoc = 0,
OdinFastMath_No_NaNs = 1,
OdinFastMath_No_Infs = 2,
OdinFastMath_No_Signed_Zeros = 3,
OdinFastMath_Allow_Reciprocal = 4,
OdinFastMath_Allow_Contract = 5,
OdinFastMath_Approx_Func = 6,
OdinFastMath_COUNT,
};
// Flag names indexed by `OdinFastMathFlag`. The entries must stay in the same
// order as the enumerators above; a missing trailing entry would be left as a
// null pointer because the array is sized by `OdinFastMath_COUNT`.
char const *OdinFastMathFlag_strings[OdinFastMath_COUNT] = {
"Allow_Reassoc",
"No_NaNs",
"No_Infs",
"No_Signed_Zeros",
"Allow_Reciprocal",
"Allow_Contract",
"Approx_Func",
};
gb_global Type *t_fast_math_flag = nullptr; // named enum
gb_global Type *t_fast_math_flags = nullptr; // named bit_set
gb_global RecursiveMutex g_type_mutex;
struct TypePath;