mirror of
https://github.com/odin-lang/Odin.git
synced 2026-05-25 13:18:14 +00:00
Merge pull request #6676 from odin-lang/bill/matrix-optimizations
General Matrix Optimizations + `@(fast_math)` attribute
This commit is contained in:
@@ -4,6 +4,7 @@ package intrinsics
|
||||
|
||||
import "base:runtime"
|
||||
|
||||
|
||||
// Package-Related
|
||||
is_package_imported :: proc(package_name: string) -> bool ---
|
||||
|
||||
|
||||
@@ -23,6 +23,23 @@ package runtime
|
||||
|
||||
import "base:intrinsics"
|
||||
|
||||
/*
|
||||
Fast_Math_Flag :: enum u8 {
|
||||
Allow_Reassoc = 0,
|
||||
No_NaNs = 1,
|
||||
No_Infs = 2,
|
||||
No_Signed_Zeros = 3,
|
||||
Allow_Reciprocal = 4,
|
||||
Allow_Contract = 5,
|
||||
Approx_Func = 6,
|
||||
}
|
||||
*/
|
||||
Fast_Math_Flag :: intrinsics.Fast_Math_Flag
|
||||
|
||||
// Fast_Math_Flags :: distinct bit_set[Fast_Math_Flag; u32]
|
||||
Fast_Math_Flags :: intrinsics.Fast_Math_Flags
|
||||
|
||||
|
||||
// NOTE(bill): This must match the compiler's
|
||||
Calling_Convention :: enum u8 {
|
||||
Invalid = 0,
|
||||
|
||||
@@ -7,7 +7,7 @@ import "core:math"
|
||||
@(require_results)
|
||||
to_radians :: proc "contextless" (degrees: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = degrees[i] * RAD_PER_DEG
|
||||
}
|
||||
} else {
|
||||
@@ -19,7 +19,7 @@ to_radians :: proc "contextless" (degrees: $T) -> (out: T) where IS_NUMERIC(ELEM
|
||||
@(require_results)
|
||||
to_degrees :: proc "contextless" (radians: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = radians[i] * DEG_PER_RAD
|
||||
}
|
||||
} else {
|
||||
@@ -31,7 +31,7 @@ to_degrees :: proc "contextless" (radians: $T) -> (out: T) where IS_NUMERIC(ELEM
|
||||
@(require_results)
|
||||
min_double :: proc "contextless" (a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = builtin.min(a[i], b[i])
|
||||
}
|
||||
} else {
|
||||
@@ -51,7 +51,7 @@ min_single :: proc "contextless" (a: $T) -> (out: ELEM_TYPE(T)) where IS_NUMERIC
|
||||
out = builtin.min(a[0], a[1])
|
||||
} else {
|
||||
out = builtin.min(a[0], a[1])
|
||||
for i in 2..<N {
|
||||
#no_bounds_check for i in 2..<N {
|
||||
out = builtin.min(out, a[i])
|
||||
}
|
||||
}
|
||||
@@ -71,7 +71,7 @@ min :: proc{min_single, min_double, min_triple}
|
||||
@(require_results)
|
||||
max_double :: proc "contextless" (a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = builtin.max(a[i], b[i])
|
||||
}
|
||||
} else {
|
||||
@@ -93,7 +93,7 @@ max_single :: proc "contextless" (a: $T) -> (out: ELEM_TYPE(T)) where IS_NUMERIC
|
||||
out = builtin.max(a[0], a[1], a[2])
|
||||
}else {
|
||||
out = builtin.max(a[0], a[1])
|
||||
for i in 2..<N {
|
||||
#no_bounds_check for i in 2..<N {
|
||||
out = builtin.max(out, a[i])
|
||||
}
|
||||
}
|
||||
@@ -113,7 +113,7 @@ max :: proc{max_single, max_double, max_triple}
|
||||
@(require_results)
|
||||
abs :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = auto_cast builtin.abs(a[i])
|
||||
}
|
||||
} else {
|
||||
@@ -125,7 +125,7 @@ abs :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
sign :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = #force_inline math.sign(a[i])
|
||||
}
|
||||
} else {
|
||||
@@ -137,7 +137,7 @@ sign :: proc "contextless" (a: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
clamp :: proc "contextless" (x, a, b: $T) -> (out: T) where IS_NUMERIC(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = builtin.clamp(x[i], a[i], b[i])
|
||||
}
|
||||
} else {
|
||||
@@ -155,7 +155,7 @@ saturate :: proc "contextless" (x: $T) -> T where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
lerp :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = a[i]*(1-t[i]) + b[i]*t[i]
|
||||
}
|
||||
} else {
|
||||
@@ -166,7 +166,7 @@ lerp :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)
|
||||
@(require_results)
|
||||
mix :: proc "contextless" (a, b, t: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = a[i]*(1-t[i]) + b[i]*t[i]
|
||||
}
|
||||
} else {
|
||||
@@ -183,7 +183,7 @@ unlerp :: proc "contextless" (a, b, x: $T) -> T where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
step :: proc "contextless" (e, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = x[i] < e[i] ? 0.0 : 1.0
|
||||
}
|
||||
} else {
|
||||
@@ -208,7 +208,7 @@ smootherstep :: proc "contextless" (e0, e1, x: $T) -> T where IS_FLOAT(ELEM_TYPE
|
||||
@(require_results)
|
||||
sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.sqrt(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -220,7 +220,7 @@ sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
inverse_sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = 1.0/math.sqrt(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -232,7 +232,7 @@ inverse_sqrt :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(
|
||||
@(require_results)
|
||||
cos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.cos(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -244,7 +244,7 @@ cos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
sin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.sin(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -256,7 +256,7 @@ sin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
tan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.tan(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -268,7 +268,7 @@ tan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
acos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.acos(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -280,7 +280,7 @@ acos :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
asin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.asin(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -292,7 +292,7 @@ asin :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
atan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.atan(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -303,7 +303,7 @@ atan :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
atan2 :: proc "contextless" (y, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.atan2(y[i], x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -316,7 +316,7 @@ atan2 :: proc "contextless" (y, x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T))
|
||||
@(require_results)
|
||||
ln :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.ln(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -329,7 +329,7 @@ ln :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
log2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
INVLN2 :: 1.4426950408889634073599246810018921374266459541529859341354494069
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = INVLN2 * math.ln(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -342,7 +342,7 @@ log2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
log10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
INVLN10 :: 0.4342944819032518276511289189166050822943970058036665661144537831
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = INVLN10 * math.ln(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -354,7 +354,7 @@ log10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
log :: proc "contextless" (x, b: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.ln(x[i]) / math.ln(cast(ELEM_TYPE(T))b[i])
|
||||
}
|
||||
} else {
|
||||
@@ -366,7 +366,7 @@ log :: proc "contextless" (x, b: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
exp :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.exp(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -378,7 +378,7 @@ exp :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
exp2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.exp(LN2 * x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -390,7 +390,7 @@ exp2 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
exp10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.exp(LN10 * x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -402,7 +402,7 @@ exp10 :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
pow :: proc "contextless" (x, e: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = math.pow(x[i], e[i])
|
||||
}
|
||||
} else {
|
||||
@@ -425,7 +425,7 @@ floor :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
@(require_results)
|
||||
round :: proc "contextless" (x: $T) -> (out: T) where IS_FLOAT(ELEM_TYPE(T)) {
|
||||
when IS_ARRAY(T) {
|
||||
for i in 0..<len(T) {
|
||||
#no_bounds_check for i in 0..<len(T) {
|
||||
out[i] = #force_inline math.round(x[i])
|
||||
}
|
||||
} else {
|
||||
@@ -486,7 +486,7 @@ is_nan_single :: proc "contextless" (x: $T) -> bool where IS_FLOAT(T) {
|
||||
|
||||
@(require_results)
|
||||
is_nan_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]bool) where IS_FLOAT(T) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = #force_inline is_nan(x[i])
|
||||
}
|
||||
return
|
||||
@@ -499,7 +499,7 @@ is_inf_single :: proc "contextless" (x: $T) -> bool where IS_FLOAT(T) {
|
||||
|
||||
@(require_results)
|
||||
is_inf_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]bool) where IS_FLOAT(T) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = #force_inline is_inf(x[i])
|
||||
}
|
||||
return
|
||||
@@ -512,7 +512,7 @@ classify_single :: proc "contextless" (x: $T) -> math.Float_Class where IS_FLOAT
|
||||
|
||||
@(require_results)
|
||||
classify_array :: proc "contextless" (x: $A/[$N]$T) -> (out: [N]math.Float_Class) where IS_FLOAT(T) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = #force_inline classify_single(x[i])
|
||||
}
|
||||
return
|
||||
@@ -532,42 +532,42 @@ classify :: proc{classify_single, classify_array}
|
||||
|
||||
@(require_results)
|
||||
less_than_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = x[i] < y[i]
|
||||
}
|
||||
return
|
||||
}
|
||||
@(require_results)
|
||||
less_than_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = x[i] <= y[i]
|
||||
}
|
||||
return
|
||||
}
|
||||
@(require_results)
|
||||
greater_than_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = x[i] > y[i]
|
||||
}
|
||||
return
|
||||
}
|
||||
@(require_results)
|
||||
greater_than_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = x[i] >= y[i]
|
||||
}
|
||||
return
|
||||
}
|
||||
@(require_results)
|
||||
equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = x[i] == y[i]
|
||||
}
|
||||
return
|
||||
}
|
||||
@(require_results)
|
||||
not_equal_array :: proc "contextless" (x, y: $A/[$N]$T) -> (out: [N]bool) where IS_ARRAY(A), IS_FLOAT(ELEM_TYPE(A)) {
|
||||
for i in 0..<N {
|
||||
#no_bounds_check for i in 0..<N {
|
||||
out[i] = x[i] != y[i]
|
||||
}
|
||||
return
|
||||
@@ -601,7 +601,7 @@ all :: proc "contextless" (x: $A/[$N]bool) -> (out: bool) {
|
||||
@(require_results)
|
||||
not :: proc "contextless" (x: $A/[$N]bool) -> (out: A) {
|
||||
for e, i in x {
|
||||
out[i] = !e
|
||||
#no_bounds_check out[i] = !e
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -46,18 +46,17 @@ scalar_dot :: proc "contextless" (a, b: $T) -> T where IS_FLOAT(T), !IS_ARRAY(T)
|
||||
|
||||
@(require_results)
|
||||
vector_dot :: proc "contextless" (a, b: $T/[$N]$E) -> (c: E) where IS_NUMERIC(E) #no_bounds_check {
|
||||
ab := a * b
|
||||
when N == 1 {
|
||||
return ab.x
|
||||
return a.x*b.x
|
||||
} else when N == 2 {
|
||||
return ab.x + ab.y
|
||||
return a.x*b.x + a.y*b.y
|
||||
} else when N == 3 {
|
||||
return ab.x + ab.y + ab.z
|
||||
return a.x*b.x + a.y*b.y + a.z*b.z
|
||||
} else when N == 4 {
|
||||
return ab.x + ab.y + ab.z + ab.w
|
||||
return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w
|
||||
} else {
|
||||
for elem in ab {
|
||||
c += elem
|
||||
#unroll for _, i in a {
|
||||
c += a[i]*b[i]
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
@@ -455,6 +455,7 @@ enum IntegerDivisionByZeroKind : u8 {
|
||||
IntegerDivisionByZero_AllBits,
|
||||
};
|
||||
|
||||
|
||||
// This stores the information for the specific architecture of this build
|
||||
struct BuildContext {
|
||||
// Constants
|
||||
|
||||
@@ -1480,6 +1480,8 @@ gb_internal void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
|
||||
e->Procedure.no_sanitize_memory = ac.no_sanitize_memory;
|
||||
e->Procedure.no_sanitize_thread = ac.no_sanitize_thread;
|
||||
|
||||
e->Procedure.fast_math_flags = ac.fast_math_flags;
|
||||
|
||||
e->deprecated_message = ac.deprecated_message;
|
||||
e->warning_message = ac.warning_message;
|
||||
ac.link_name = handle_link_name(ctx, e->token, ac.link_name, ac.link_prefix, ac.link_suffix);
|
||||
|
||||
@@ -1140,6 +1140,11 @@ gb_internal void check_assignment(CheckerContext *c, Operand *operand, Type *typ
|
||||
return;
|
||||
}
|
||||
|
||||
if (operand->mode == Addressing_Type && is_type_typeid(type)) {
|
||||
add_type_info_type(c, operand->type);
|
||||
add_type_and_value(c, operand->expr, Addressing_Value, type, exact_value_typeid(operand->type));
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_type_untyped(operand->type)) {
|
||||
Type *target_type = type;
|
||||
@@ -9122,9 +9127,11 @@ gb_internal bool check_is_operand_compound_lit_constant(CheckerContext *c, Opera
|
||||
if (is_type_any(field_type)) {
|
||||
return false;
|
||||
}
|
||||
if (field_type != nullptr && is_type_typeid(field_type) && o->mode == Addressing_Type) {
|
||||
add_type_info_type(c, o->type);
|
||||
return true;
|
||||
if (field_type != nullptr && is_type_typeid(field_type)) {
|
||||
if (o->mode == Addressing_Type) {
|
||||
add_type_info_type(c, o->type);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
Ast *expr = unparen_expr(o->expr);
|
||||
@@ -10584,6 +10591,7 @@ gb_internal ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *
|
||||
}
|
||||
|
||||
|
||||
|
||||
i64 max = 0;
|
||||
|
||||
Type *bet = base_type(elem_type);
|
||||
@@ -12483,7 +12491,7 @@ gb_internal void check_multi_expr_with_type_hint(CheckerContext *c, Operand *o,
|
||||
case Addressing_Type:
|
||||
if (type_hint != nullptr && is_type_typeid(type_hint)) {
|
||||
add_type_info_type(c, o->type);
|
||||
break;
|
||||
return;
|
||||
}
|
||||
error_operand_not_expression(o);
|
||||
break;
|
||||
|
||||
@@ -1042,14 +1042,14 @@ struct GlobalEnumValue {
|
||||
i64 value;
|
||||
};
|
||||
|
||||
gb_internal Slice<Entity *> add_global_enum_type(String const &type_name, GlobalEnumValue *values, isize value_count, Type **enum_type_ = nullptr) {
|
||||
gb_internal Slice<Entity *> add_global_enum_type(String const &type_name, GlobalEnumValue *values, isize value_count, Type **enum_type_ = nullptr, Type *backing_type = nullptr) {
|
||||
Scope *scope = create_scope(nullptr, builtin_pkg->scope);
|
||||
Entity *entity = alloc_entity_type_name(scope, make_token_ident(type_name), nullptr, EntityState_Resolved);
|
||||
|
||||
Type *enum_type = alloc_type_enum();
|
||||
Type *named_type = alloc_type_named(type_name, enum_type, entity);
|
||||
set_base_type(named_type, enum_type);
|
||||
enum_type->Enum.base_type = t_int;
|
||||
enum_type->Enum.base_type = backing_type ? backing_type : t_int;
|
||||
enum_type->Enum.scope = scope;
|
||||
entity->type = named_type;
|
||||
|
||||
@@ -1250,6 +1250,41 @@ gb_internal void init_universal(void) {
|
||||
add_global_enum_constant(fields, "ODIN_ERROR_POS_STYLE", build_context.ODIN_ERROR_POS_STYLE);
|
||||
}
|
||||
|
||||
{
|
||||
GlobalEnumValue values[OdinFastMath_COUNT] = {};
|
||||
for (unsigned i = 0; i < OdinFastMath_COUNT; i++) {
|
||||
values[i] = {OdinFastMathFlag_strings[i], i};
|
||||
}
|
||||
|
||||
auto fields = add_global_enum_type(str_lit("Fast_Math_Flag"), values, gb_count_of(values), &t_fast_math_flag, t_u8);
|
||||
|
||||
GB_ASSERT(t_fast_math_flag->kind == Type_Named);
|
||||
scope_insert(intrinsics_pkg->scope, t_fast_math_flag->Named.type_name);
|
||||
|
||||
Type *bs = alloc_type_bit_set();
|
||||
bs->BitSet.elem = t_fast_math_flag;
|
||||
bs->BitSet.underlying = t_u32;
|
||||
bs->BitSet.lower = 0;
|
||||
bs->BitSet.upper = OdinFastMath_COUNT-1;
|
||||
bs->BitSet.node = nullptr;
|
||||
|
||||
|
||||
{
|
||||
String type_name = str_lit("Fast_Math_Flags");
|
||||
|
||||
Scope *scope = create_scope(nullptr, builtin_pkg->scope);
|
||||
Entity *entity = alloc_entity_type_name(scope, make_token_ident(type_name), nullptr, EntityState_Resolved);
|
||||
|
||||
Type *named_type = alloc_type_named(type_name, bs, entity);
|
||||
set_base_type(named_type, bs);
|
||||
entity->type = named_type;
|
||||
|
||||
t_fast_math_flags = named_type;
|
||||
|
||||
scope_insert(intrinsics_pkg->scope, entity);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
GlobalEnumValue values[OdinAtomicMemoryOrder_COUNT] = {
|
||||
{OdinAtomicMemoryOrder_strings[OdinAtomicMemoryOrder_relaxed], OdinAtomicMemoryOrder_relaxed},
|
||||
@@ -3554,11 +3589,17 @@ gb_internal void init_preload(Checker *c) {
|
||||
init_core_objc_c(c);
|
||||
}
|
||||
|
||||
gb_internal ExactValue check_decl_attribute_value(CheckerContext *c, Ast *value) {
|
||||
gb_internal void check_expr_with_type_hint(CheckerContext *c, Operand *o, Ast *e, Type *t);
|
||||
|
||||
gb_internal ExactValue check_decl_attribute_value(CheckerContext *c, Ast *value, Type *type_hint = nullptr) {
|
||||
ExactValue ev = {};
|
||||
if (value != nullptr) {
|
||||
Operand op = {};
|
||||
check_expr(c, &op, value);
|
||||
if (type_hint != nullptr) {
|
||||
check_expr_with_type_hint(c, &op, value, type_hint);
|
||||
} else {
|
||||
check_expr(c, &op, value);
|
||||
}
|
||||
if (op.mode) {
|
||||
if (op.mode == Addressing_Constant) {
|
||||
ev = op.value;
|
||||
@@ -4126,6 +4167,18 @@ gb_internal DECL_ATTRIBUTE_PROC(proc_decl_attribute) {
|
||||
}
|
||||
ac->no_sanitize_thread = true;
|
||||
return true;
|
||||
} else if (name == "fast_math") {
|
||||
if (value == nullptr) {
|
||||
error(elem, "Expected a constant bit_set of type 'intrinsics.Fast_Math_Flags' for '%.*s'", LIT(name));
|
||||
} else {
|
||||
ExactValue ev = check_decl_attribute_value(c, value, t_fast_math_flags);
|
||||
if (ev.kind != ExactValue_Integer) {
|
||||
error(elem, "Expected a constant bit_set of type 'intrinsics.Fast_Math_Flags' for '%.*s'", LIT(name));
|
||||
} else {
|
||||
ac->fast_math_flags = exact_value_to_u64(ev);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -163,6 +163,8 @@ struct AttributeContext {
|
||||
String require_target_feature; // required by the target micro-architecture
|
||||
String enable_target_feature; // will be enabled for the procedure only
|
||||
|
||||
u64 fast_math_flags;
|
||||
|
||||
bool raddbg_type_view;
|
||||
String raddbg_type_view_string;
|
||||
};
|
||||
|
||||
@@ -256,6 +256,9 @@ struct Entity {
|
||||
struct GenProcsData *gen_procs;
|
||||
BlockingMutex gen_procs_mutex;
|
||||
ProcedureOptimizationMode optimization_mode;
|
||||
|
||||
u64 fast_math_flags;
|
||||
|
||||
bool is_foreign : 1;
|
||||
bool is_export : 1;
|
||||
bool generated_from_polymorphic : 1;
|
||||
|
||||
@@ -672,7 +672,7 @@ gb_internal lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lh
|
||||
}
|
||||
}
|
||||
|
||||
gb_internal bool lb_is_matrix_simdable(Type *t) {
|
||||
gb_internal bool lb_is_matrix_simdable(Type *t, bool ignore_layout=false) {
|
||||
Type *mt = base_type(t);
|
||||
GB_ASSERT(mt->kind == Type_Matrix);
|
||||
|
||||
@@ -697,12 +697,14 @@ gb_internal bool lb_is_matrix_simdable(Type *t) {
|
||||
// it's not aligned well enough to use the vector instructions
|
||||
return false;
|
||||
}
|
||||
if ((mt->Matrix.row_count & 1) ^ (mt->Matrix.column_count & 1)) {
|
||||
if ((mt->Matrix.row_count & 1) && (mt->Matrix.column_count & 1)) {
|
||||
return false;
|
||||
}
|
||||
if (mt->Matrix.is_row_major) {
|
||||
// TODO(bill): make #row_major matrices work with SIMD
|
||||
return false;
|
||||
if (!ignore_layout) {
|
||||
// TODO(bill): make #row_major matrices work with SIMD
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (elem->kind == Type_Basic) {
|
||||
@@ -820,35 +822,32 @@ gb_internal lbValue lb_emit_matrix_transpose(lbProcedure *p, lbValue m, Type *ty
|
||||
GB_PANIC("TODO: transpose with changing layout");
|
||||
}
|
||||
|
||||
if (lb_is_matrix_simdable(mt) && lb_is_matrix_simdable(type)) {
|
||||
if (lb_is_matrix_simdable(mt, true) && lb_is_matrix_simdable(type, true)) {
|
||||
auto const do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef {
|
||||
return LLVMConstInt(lb_type(p->module, t_u32), val, false);
|
||||
};
|
||||
|
||||
unsigned stride = cast(unsigned)matrix_type_stride_in_elems(mt);
|
||||
unsigned row_count = cast(unsigned)mt->Matrix.row_count;
|
||||
unsigned column_count = cast(unsigned)mt->Matrix.column_count;
|
||||
|
||||
auto rows = slice_make<LLVMValueRef>(permanent_allocator(), row_count);
|
||||
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), column_count);
|
||||
unsigned other_stride = (row_count*column_count)/stride;
|
||||
|
||||
LLVMValueRef vector = lb_matrix_to_vector(p, m);
|
||||
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), row_count * column_count);
|
||||
for (unsigned i = 0; i < row_count; i++) {
|
||||
for (unsigned j = 0; j < column_count; j++) {
|
||||
unsigned offset = stride*j + i;
|
||||
mask_elems[j] = lb_const_int(p->module, t_u32, offset).value;
|
||||
mask_elems[other_stride*i + j] = do_u32(p, stride*j + i);
|
||||
}
|
||||
|
||||
// transpose mask
|
||||
LLVMValueRef mask = LLVMConstVector(mask_elems.data, column_count);
|
||||
LLVMValueRef row = llvm_basic_shuffle(p, vector, mask);
|
||||
rows[i] = row;
|
||||
}
|
||||
LLVMValueRef mask = LLVMConstVector(mask_elems.data, cast(unsigned)mask_elems.count);
|
||||
LLVMValueRef transposed_vector = llvm_basic_shuffle(p, vector, mask);
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, true);
|
||||
for_array(i, rows) {
|
||||
LLVMValueRef row = rows[i];
|
||||
lbValue dst_row_ptr = lb_emit_matrix_epi(p, res.addr, 0, i);
|
||||
LLVMValueRef ptr = dst_row_ptr.value;
|
||||
ptr = LLVMBuildPointerCast(p->builder, ptr, LLVMPointerType(LLVMTypeOf(row), 0), "");
|
||||
LLVMBuildStore(p->builder, row, ptr);
|
||||
}
|
||||
LLVMValueRef res_ptr = res.addr.value;
|
||||
res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(transposed_vector), 0), "");
|
||||
|
||||
LLVMValueRef store = LLVMBuildStore(p->builder, transposed_vector, res_ptr);
|
||||
LLVMSetAlignment(store, cast(unsigned)type_align_of(type));
|
||||
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
@@ -867,8 +866,10 @@ gb_internal lbValue lb_emit_matrix_transpose(lbProcedure *p, lbValue m, Type *ty
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
|
||||
gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef vector, Type *type) {
|
||||
lbAddr res = lb_add_local_generated(p, type, true);
|
||||
gb_internal lbAddr llvm_add_local_generated_from_vector(lbProcedure *p, Type *type, LLVMValueRef vector) {
|
||||
GB_ASSERT(LLVMGetTypeKind(LLVMTypeOf(vector)) == LLVMVectorTypeKind);
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
LLVMValueRef res_ptr = res.addr.value;
|
||||
unsigned alignment = cast(unsigned)gb_max(type_align_of(type), lb_alignof(LLVMTypeOf(vector)));
|
||||
LLVMSetAlignment(res_ptr, alignment);
|
||||
@@ -876,9 +877,16 @@ gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef v
|
||||
res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(vector), 0), "");
|
||||
LLVMBuildStore(p->builder, vector, res_ptr);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
gb_internal lbValue lb_matrix_cast_vector_to_type(lbProcedure *p, LLVMValueRef vector, Type *type) {
|
||||
lbAddr res = llvm_add_local_generated_from_vector(p, type, vector);
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
|
||||
|
||||
|
||||
gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type) {
|
||||
if (is_type_array(m.type)) {
|
||||
// no-op
|
||||
@@ -896,31 +904,6 @@ gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type
|
||||
lbValue n = lb_const_int(p->module, t_int, type_size_of(type));
|
||||
lb_mem_copy_non_overlapping(p, res.addr, m_ptr, n);
|
||||
|
||||
// i64 row_count = mt->Matrix.row_count;
|
||||
// i64 column_count = mt->Matrix.column_count;
|
||||
// TEMPORARY_ALLOCATOR_GUARD();
|
||||
|
||||
// auto srcs = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
|
||||
// auto dsts = array_make<lbValue>(temporary_allocator(), 0, row_count*column_count);
|
||||
|
||||
// for (i64 j = 0; j < column_count; j++) {
|
||||
// for (i64 i = 0; i < row_count; i++) {
|
||||
// lbValue src = lb_emit_matrix_ev(p, m, i, j);
|
||||
// array_add(&srcs, src);
|
||||
// }
|
||||
// }
|
||||
|
||||
// for (i64 j = 0; j < column_count; j++) {
|
||||
// for (i64 i = 0; i < row_count; i++) {
|
||||
// lbValue dst = lb_emit_array_epi(p, res.addr, i + j*row_count);
|
||||
// array_add(&dsts, dst);
|
||||
// }
|
||||
// }
|
||||
|
||||
// GB_ASSERT(srcs.count == dsts.count);
|
||||
// for_array(i, srcs) {
|
||||
// lb_emit_store(p, dsts[i], srcs[i]);
|
||||
// }
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
|
||||
@@ -959,6 +942,10 @@ gb_internal lbValue lb_emit_outer_product(lbProcedure *p, lbValue a, lbValue b,
|
||||
gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs, Type *type) {
|
||||
// TODO(bill): Handle edge case for f16 types on x86(-64) platforms
|
||||
|
||||
auto const do_u32 = [](lbProcedure *p, u32 val) -> LLVMValueRef {
|
||||
return LLVMConstInt(lb_type(p->module, t_u32), val, false);
|
||||
};
|
||||
|
||||
Type *xt = base_type(lhs.type);
|
||||
Type *yt = base_type(rhs.type);
|
||||
|
||||
@@ -975,50 +962,183 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
|
||||
unsigned inner = cast(unsigned)xt->Matrix.column_count;
|
||||
unsigned outer_columns = cast(unsigned)yt->Matrix.column_count;
|
||||
|
||||
if (!xt->Matrix.is_row_major && lb_is_matrix_simdable(xt)) {
|
||||
unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
|
||||
unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
|
||||
if (lb_is_matrix_simdable(xt, true)) {
|
||||
if (!xt->Matrix.is_row_major) { // #column_major
|
||||
unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
|
||||
unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
|
||||
|
||||
auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
|
||||
auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);
|
||||
LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
|
||||
LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);
|
||||
|
||||
LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
|
||||
LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);
|
||||
if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) {
|
||||
// square matrix calculation
|
||||
unsigned N = outer_columns;
|
||||
|
||||
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
|
||||
for (unsigned i = 0; i < outer_rows; i++) {
|
||||
for (unsigned j = 0; j < inner; j++) {
|
||||
unsigned offset = x_stride*j + i;
|
||||
mask_elems[j] = lb_const_int(p->module, t_u32, offset).value;
|
||||
auto x_columns = slice_make<LLVMValueRef>(permanent_allocator(), N);
|
||||
auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), N);
|
||||
|
||||
for (unsigned i = 0; i < N; i++) {
|
||||
LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, N);
|
||||
LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask);
|
||||
x_columns[i] = column;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < N; i++) {
|
||||
LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, N);
|
||||
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
|
||||
y_columns[i] = column;
|
||||
}
|
||||
|
||||
|
||||
auto z_columns = slice_make<LLVMValueRef>(permanent_allocator(), N);
|
||||
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), N);
|
||||
|
||||
for (unsigned i = 0; i < N; i++) {
|
||||
for (unsigned j = 0; j < N; j++) {
|
||||
LLVMValueRef mask = llvm_mask_same(p->module, j, N);
|
||||
mask_elems[j] = llvm_basic_shuffle(p, y_columns[i], mask);
|
||||
}
|
||||
z_columns[i] = llvm_vector_mul_pairwise_reduce_add(p, mask_elems, x_columns);
|
||||
}
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
LLVMValueRef dest_ptr = res.addr.value;
|
||||
|
||||
LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_columns[0]), 0);
|
||||
dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, "");
|
||||
for (unsigned i = 0; i < N; i++) {
|
||||
LLVMValueRef indices[] = {do_u32(p, i)};
|
||||
LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_columns[0]), dest_ptr, indices, 1, "");
|
||||
LLVMBuildStore(p->builder, z_columns[i], dst);
|
||||
}
|
||||
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
|
||||
// transpose mask
|
||||
LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
|
||||
LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
|
||||
x_rows[i] = row;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < outer_columns; i++) {
|
||||
LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner);
|
||||
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
|
||||
y_columns[i] = column;
|
||||
}
|
||||
auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
|
||||
auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, true);
|
||||
for_array(i, x_rows) {
|
||||
LLVMValueRef x_row = x_rows[i];
|
||||
for_array(j, y_columns) {
|
||||
LLVMValueRef y_column = y_columns[j];
|
||||
LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
|
||||
lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
|
||||
LLVMBuildStore(p->builder, elem, dst.value);
|
||||
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
|
||||
for (unsigned i = 0; i < outer_rows; i++) {
|
||||
for (unsigned j = 0; j < inner; j++) {
|
||||
unsigned offset = x_stride*j + i;
|
||||
mask_elems[j] = do_u32(p, offset);
|
||||
}
|
||||
|
||||
// transpose mask
|
||||
LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
|
||||
LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
|
||||
x_rows[i] = row;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < outer_columns; i++) {
|
||||
LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, inner);
|
||||
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
|
||||
y_columns[i] = column;
|
||||
}
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
for_array(i, x_rows) {
|
||||
LLVMValueRef x_row = x_rows[i];
|
||||
for_array(j, y_columns) {
|
||||
LLVMValueRef y_column = y_columns[j];
|
||||
LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
|
||||
lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
|
||||
LLVMBuildStore(p->builder, elem, dst.value);
|
||||
}
|
||||
}
|
||||
return lb_addr_load(p, res);
|
||||
} else { // #row_major
|
||||
unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
|
||||
unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
|
||||
|
||||
LLVMValueRef x_vector = lb_matrix_to_vector(p, lhs);
|
||||
LLVMValueRef y_vector = lb_matrix_to_vector(p, rhs);
|
||||
|
||||
if (outer_rows == outer_columns && outer_rows == inner && (inner & 1) == 0) {
|
||||
// square matrix calculation
|
||||
unsigned N = outer_columns;
|
||||
|
||||
auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), N);
|
||||
auto y_rows = slice_make<LLVMValueRef>(permanent_allocator(), N);
|
||||
|
||||
for (unsigned i = 0; i < N; i++) {
|
||||
LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, N);
|
||||
LLVMValueRef column = llvm_basic_shuffle(p, x_vector, mask);
|
||||
x_rows[i] = column;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < N; i++) {
|
||||
LLVMValueRef mask = llvm_mask_iota(p->module, y_stride*i, N);
|
||||
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
|
||||
y_rows[i] = column;
|
||||
}
|
||||
|
||||
|
||||
auto z_rows = slice_make<LLVMValueRef>(permanent_allocator(), N);
|
||||
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), N);
|
||||
|
||||
for (unsigned i = 0; i < N; i++) {
|
||||
for (unsigned j = 0; j < N; j++) {
|
||||
LLVMValueRef mask = llvm_mask_same(p->module, j, N);
|
||||
mask_elems[j] = llvm_basic_shuffle(p, x_rows[i], mask);
|
||||
}
|
||||
z_rows[i] = llvm_vector_mul_pairwise_reduce_add(p, mask_elems, y_rows);
|
||||
}
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
LLVMValueRef dest_ptr = res.addr.value;
|
||||
|
||||
LLVMTypeRef dest_ptr_type = LLVMPointerType(LLVMTypeOf(z_rows[0]), 0);
|
||||
dest_ptr = LLVMBuildPointerCast(p->builder, dest_ptr, dest_ptr_type, "");
|
||||
for (unsigned i = 0; i < N; i++) {
|
||||
LLVMValueRef indices[] = {do_u32(p, i)};
|
||||
LLVMValueRef dst = LLVMBuildInBoundsGEP2(p->builder, LLVMTypeOf(z_rows[0]), dest_ptr, indices, 1, "");
|
||||
LLVMBuildStore(p->builder, z_rows[i], dst);
|
||||
}
|
||||
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
|
||||
auto x_rows = slice_make<LLVMValueRef>(permanent_allocator(), outer_rows);
|
||||
auto y_columns = slice_make<LLVMValueRef>(permanent_allocator(), outer_columns);
|
||||
|
||||
for (unsigned i = 0; i < outer_rows; i++) {
|
||||
LLVMValueRef mask = llvm_mask_iota(p->module, x_stride*i, inner);
|
||||
LLVMValueRef row = llvm_basic_shuffle(p, x_vector, mask);
|
||||
x_rows[i] = row;
|
||||
}
|
||||
|
||||
auto mask_elems = slice_make<LLVMValueRef>(permanent_allocator(), inner);
|
||||
for (unsigned i = 0; i < outer_columns; i++) {
|
||||
for (unsigned j = 0; j < inner; j++) {
|
||||
unsigned offset = x_stride*j + i;
|
||||
mask_elems[j] = do_u32(p, offset);
|
||||
}
|
||||
|
||||
// transpose mask
|
||||
LLVMValueRef mask = LLVMConstVector(mask_elems.data, inner);
|
||||
LLVMValueRef column = llvm_basic_shuffle(p, y_vector, mask);
|
||||
y_columns[i] = column;
|
||||
}
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
for_array(i, x_rows) {
|
||||
LLVMValueRef x_row = x_rows[i];
|
||||
for_array(j, y_columns) {
|
||||
LLVMValueRef y_column = y_columns[j];
|
||||
LLVMValueRef elem = llvm_vector_dot(p, x_row, y_column);
|
||||
lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
|
||||
LLVMBuildStore(p->builder, elem, dst.value);
|
||||
}
|
||||
}
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
|
||||
if (!xt->Matrix.is_row_major) {
|
||||
lbAddr res = lb_add_local_generated(p, type, true);
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
|
||||
auto inners = slice_make<lbValue[2]>(permanent_allocator(), inner);
|
||||
|
||||
@@ -1042,7 +1162,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
|
||||
|
||||
return lb_addr_load(p, res);
|
||||
} else {
|
||||
lbAddr res = lb_add_local_generated(p, type, true);
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
|
||||
auto inners = slice_make<lbValue[2]>(permanent_allocator(), inner);
|
||||
|
||||
@@ -1100,23 +1220,25 @@ gb_internal lbValue lb_emit_matrix_mul_vector(lbProcedure *p, lbValue lhs, lbVal
|
||||
m_columns[column_index] = column;
|
||||
}
|
||||
|
||||
for (unsigned row_index = 0; row_index < column_count; row_index++) {
|
||||
LLVMValueRef value = LLVMBuildExtractValue(p->builder, rhs.value, row_index, "");
|
||||
LLVMValueRef row = llvm_vector_broadcast(p, value, row_count);
|
||||
v_rows[row_index] = row;
|
||||
}
|
||||
if (LLVMIsALoadInst(rhs.value)) {
|
||||
LLVMValueRef rhs_ptr = LLVMGetOperand(rhs.value, 0);
|
||||
LLVMTypeRef vector_type = LLVMVectorType(lb_type(p->module, elem), cast(unsigned)vector_count);
|
||||
LLVMValueRef rhs_vector = LLVMBuildLoad2(p->builder, vector_type, rhs_ptr, "");
|
||||
LLVMSetAlignment(rhs_vector, cast(unsigned)type_align_of(type));
|
||||
|
||||
GB_ASSERT(column_count > 0);
|
||||
|
||||
LLVMValueRef vector = nullptr;
|
||||
for (i64 i = 0; i < column_count; i++) {
|
||||
if (i == 0) {
|
||||
vector = llvm_vector_mul(p, m_columns[i], v_rows[i]);
|
||||
} else {
|
||||
vector = llvm_vector_mul_add(p, m_columns[i], v_rows[i], vector);
|
||||
for (unsigned i = 0; i < column_count; i++) {
|
||||
LLVMValueRef mask = llvm_mask_same(p->module, i, row_count);
|
||||
v_rows[i] = llvm_basic_shuffle(p, rhs_vector, mask);
|
||||
}
|
||||
} else {
|
||||
for (unsigned row_index = 0; row_index < column_count; row_index++) {
|
||||
LLVMValueRef value = LLVMBuildExtractValue(p->builder, rhs.value, row_index, "");
|
||||
LLVMValueRef row = llvm_vector_broadcast(p, value, row_count);
|
||||
v_rows[row_index] = row;
|
||||
}
|
||||
}
|
||||
|
||||
LLVMValueRef vector = llvm_vector_mul_pairwise_reduce_add(p, m_columns, v_rows);
|
||||
return lb_matrix_cast_vector_to_type(p, vector, type);
|
||||
}
|
||||
|
||||
@@ -1190,27 +1312,13 @@ gb_internal lbValue lb_emit_vector_mul_matrix(lbProcedure *p, lbValue lhs, lbVal
|
||||
|
||||
GB_ASSERT(row_count > 0);
|
||||
|
||||
LLVMValueRef vector = nullptr;
|
||||
for (i64 i = 0; i < row_count; i++) {
|
||||
if (i == 0) {
|
||||
vector = llvm_vector_mul(p, v_rows[i], m_columns[i]);
|
||||
} else {
|
||||
vector = llvm_vector_mul_add(p, v_rows[i], m_columns[i], vector);
|
||||
}
|
||||
}
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, true);
|
||||
LLVMValueRef res_ptr = res.addr.value;
|
||||
unsigned alignment = cast(unsigned)gb_max(type_align_of(type), lb_alignof(LLVMTypeOf(vector)));
|
||||
LLVMSetAlignment(res_ptr, alignment);
|
||||
|
||||
res_ptr = LLVMBuildPointerCast(p->builder, res_ptr, LLVMPointerType(LLVMTypeOf(vector), 0), "");
|
||||
LLVMBuildStore(p->builder, vector, res_ptr);
|
||||
LLVMValueRef vector = llvm_vector_mul_pairwise_reduce_add(p, v_rows, m_columns);
|
||||
|
||||
lbAddr res = llvm_add_local_generated_from_vector(p, type, vector);
|
||||
return lb_addr_load(p, res);
|
||||
}
|
||||
|
||||
lbAddr res = lb_add_local_generated(p, type, true);
|
||||
lbAddr res = lb_add_local_generated(p, type, false);
|
||||
|
||||
Type *vector_elem_type = base_array_type(rhs.type);
|
||||
|
||||
|
||||
@@ -270,6 +270,55 @@ gb_internal void lb_populate_module_pass_manager(LLVMTargetMachineRef target_mac
|
||||
optimization of Odin programs
|
||||
**************************************************************************/
|
||||
|
||||
// Applies the fast-math flags recorded on the procedure's entity to the
// floating-point instructions already emitted for `p`.
//
// Does nothing when `p` has no entity, or when none of the known flags are set.
gb_internal void lb_run_fast_float_math_pass(lbProcedure *p) {
	Entity *e = p->entity;
	if (e == nullptr) {
		return;
	}
	GB_ASSERT(e->kind == Entity_Procedure);

	u64 fast_math_flags = e->Procedure.fast_math_flags;

	// `OdinFastMath_*` are enumerator ordinals (0..6), i.e. bit positions, not masks.
	// Using them directly as masks is wrong: `flags & OdinFastMath_Allow_Reassoc` is
	// `flags & 0` and can never be true, and e.g. `flags & OdinFastMath_No_Signed_Zeros`
	// (`& 3`) would test the two lowest bits instead of bit 3.
	// NOTE(review): this assumes `fast_math_flags` holds the bit_set value with bit `i`
	// set for flag `i` -- confirm against where the checker stores it.
	auto const has_flag = [fast_math_flags](unsigned bit_index) -> bool {
		return ((fast_math_flags >> bit_index) & 1) != 0;
	};

	LLVMFastMathFlags llvm_flags = 0;
	if (has_flag(OdinFastMath_Allow_Reassoc))    llvm_flags |= LLVMFastMathAllowReassoc;
	if (has_flag(OdinFastMath_No_NaNs))          llvm_flags |= LLVMFastMathNoNaNs;
	if (has_flag(OdinFastMath_No_Infs))          llvm_flags |= LLVMFastMathNoInfs;
	if (has_flag(OdinFastMath_No_Signed_Zeros))  llvm_flags |= LLVMFastMathNoSignedZeros;
	if (has_flag(OdinFastMath_Allow_Reciprocal)) llvm_flags |= LLVMFastMathAllowReciprocal;
	if (has_flag(OdinFastMath_Allow_Contract))   llvm_flags |= LLVMFastMathAllowContract;
	if (has_flag(OdinFastMath_Approx_Func))      llvm_flags |= LLVMFastMathApproxFunc;

	if (llvm_flags == 0) {
		return;
	}

	for (LLVMBasicBlockRef block = LLVMGetFirstBasicBlock(p->value);
	     block != nullptr;
	     block = LLVMGetNextBasicBlock(block)) {
		for (LLVMValueRef instr = LLVMGetFirstInstruction(block);
		     instr != nullptr;
		     instr = LLVMGetNextInstruction(instr)) {
			switch (LLVMGetInstructionOpcode(instr)) {
			case LLVMFNeg:
			case LLVMFAdd:
			case LLVMFSub:
			case LLVMFMul:
			case LLVMFDiv:
			case LLVMFRem:
			case LLVMFPToUI:
			case LLVMFPToSI:
			case LLVMUIToFP:
			case LLVMSIToFP:
			case LLVMFPTrunc:
			case LLVMFPExt:
			case LLVMFCmp:
				// `LLVMSetFastMathFlags` is only valid on instructions that can carry
				// fast-math flags. Not every opcode listed above qualifies on every
				// LLVM version (the int<->float conversions in particular), so ask
				// LLVM before touching the instruction's flag bits.
				if (LLVMCanValueUseFastMathFlags(instr)) {
					LLVMSetFastMathFlags(instr, llvm_flags);
				}
				break;
			default:
				break;
			}
		}
	}
}
|
||||
|
||||
gb_internal void lb_run_remove_dead_instruction_pass(lbProcedure *p) {
|
||||
unsigned debug_declare_id = LLVMLookupIntrinsicID("llvm.dbg.declare", 16);
|
||||
GB_ASSERT(debug_declare_id != 0);
|
||||
@@ -475,6 +524,9 @@ gb_internal void lb_run_function_pass_manager(LLVMPassManagerRef fpm, lbProcedur
|
||||
if (p == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
lb_run_fast_float_math_pass(p);
|
||||
|
||||
// NOTE(bill): LLVMAddDCEPass doesn't seem to be exported in the official DLL's for LLVM
|
||||
// which means we cannot rely upon it
|
||||
// This is also useful for reading the .ll for debug purposes because a lot of instructions
|
||||
|
||||
@@ -2048,6 +2048,15 @@ gb_internal LLVMValueRef llvm_mask_zero(lbModule *m, unsigned count) {
|
||||
return LLVMConstNull(LLVMVectorType(lb_type(m, t_u32), count));
|
||||
}
|
||||
|
||||
// Builds a constant vector of `count` `t_u32` lanes in which every lane holds `value`.
//
// Used as a shuffle mask, this selects element `value` of the source vector for
// every lane of the result.
gb_internal LLVMValueRef llvm_mask_same(lbModule *m, unsigned value, unsigned count) {
	// Every lane is the same constant, so build it once rather than per iteration.
	LLVMValueRef lane = lb_const_int(m, t_u32, value).value;

	auto lanes = slice_make<LLVMValueRef>(temporary_allocator(), count);
	for (unsigned i = 0; i < count; i++) {
		lanes[i] = lane;
	}
	return LLVMConstVector(lanes.data, count);
}
|
||||
|
||||
|
||||
#define LLVM_VECTOR_DUMMY_VALUE(type) LLVMGetUndef((type))
|
||||
// #define LLVM_VECTOR_DUMMY_VALUE(type) LLVMConstNull((type))
|
||||
|
||||
@@ -2221,6 +2230,30 @@ gb_internal LLVMValueRef llvm_vector_mul(lbProcedure *p, LLVMValueRef a, LLVMVal
|
||||
return LLVMBuildFMul(p->builder, a, b, "");
|
||||
}
|
||||
|
||||
// Multiplies `a[i]` with `b[i]` for every index and sums the products, returning
// the single resulting value.
//
// The products are summed pairwise (neighbours are added, halving the number of
// partial sums each round) rather than accumulated left to right.
//
// `a` and `b` must have the same, non-zero, number of elements.
gb_internal LLVMValueRef llvm_vector_mul_pairwise_reduce_add(lbProcedure *p, Slice<LLVMValueRef> const &a, Slice<LLVMValueRef> const &b) {
	GB_ASSERT(a.count == b.count);
	// With no operands there is nothing to return; fail here with a clear message
	// instead of indexing an empty slice at the end.
	GB_ASSERT(a.count > 0);

	unsigned count = cast(unsigned)a.count;

	auto temps = slice_make<LLVMValueRef>(temporary_allocator(), count);
	for (unsigned i = 0; i < count; i++) {
		temps[i] = llvm_vector_mul(p, a[i], b[i]);
	}

	// In-place reduction: each round writes slot `j` from slots `2j` and `2j+1`.
	// This is safe because the write index never exceeds the read indices.
	unsigned k = count;
	while (k > 1) {
		unsigned half = k/2;
		for (unsigned j = 0; j < half; j++) {
			temps[j] = llvm_vector_add(p, temps[2*j + 0], temps[2*j + 1]);
		}

		if ((k&1) != 0) {
			// Odd element count: carry the unpaired last element into the next round.
			temps[half] = temps[k-1];
		}
		k = (k+1)/2;
	}

	return temps[0];
}
|
||||
|
||||
|
||||
gb_internal LLVMValueRef llvm_vector_dot(lbProcedure *p, LLVMValueRef a, LLVMValueRef b) {
|
||||
return llvm_vector_reduce_add(p, llvm_vector_mul(p, a, b));
|
||||
@@ -2260,6 +2293,7 @@ gb_internal LLVMValueRef llvm_vector_mul_add(lbProcedure *p, LLVMValueRef a, LLV
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
gb_internal LLVMValueRef llvm_get_inline_asm(LLVMTypeRef func_type, String const &str, String const &clobbers, bool has_side_effects=true, bool is_align_stack=false, LLVMInlineAsmDialect dialect=LLVMInlineAsmDialectATT) {
|
||||
return LLVMGetInlineAsm(func_type,
|
||||
cast(char *)str.text, cast(size_t)str.len,
|
||||
|
||||
@@ -805,6 +805,34 @@ gb_global Type *t_atomic_memory_order = nullptr;
|
||||
|
||||
|
||||
|
||||
// Compiler-side list of the fast-math flags.
//
// The numeric values are the same as those of the `Fast_Math_Flag` enum documented
// in `base:runtime`. Each value is the ordinal of the flag (its position when used
// in a bit set), NOT a bit mask.
enum OdinFastMathFlag : u8 {
	OdinFastMath_Allow_Reassoc    = 0,
	OdinFastMath_No_NaNs          = 1,
	OdinFastMath_No_Infs          = 2,
	OdinFastMath_No_Signed_Zeros  = 3,
	OdinFastMath_Allow_Reciprocal = 4,
	OdinFastMath_Allow_Contract   = 5,
	OdinFastMath_Approx_Func      = 6,

	// Number of flags; used to size per-flag tables.
	OdinFastMath_COUNT,
};
|
||||
|
||||
// Name of each fast-math flag, indexed by its `OdinFastMathFlag` value.
// The order must stay in sync with the enum.
char const *OdinFastMathFlag_strings[OdinFastMath_COUNT] = {
	"Allow_Reassoc",    // OdinFastMath_Allow_Reassoc
	"No_NaNs",          // OdinFastMath_No_NaNs
	"No_Infs",          // OdinFastMath_No_Infs
	"No_Signed_Zeros",  // OdinFastMath_No_Signed_Zeros
	"Allow_Reciprocal", // OdinFastMath_Allow_Reciprocal
	"Allow_Contract",   // OdinFastMath_Allow_Contract
	"Approx_Func",      // OdinFastMath_Approx_Func
};
|
||||
|
||||
// Global type handles for the fast-math flag enum and its bit_set; both start as
// nullptr (presumably assigned during type initialisation elsewhere -- confirm).
gb_global Type *t_fast_math_flag = nullptr; // named enum
|
||||
gb_global Type *t_fast_math_flags = nullptr; // named bit_set
|
||||
|
||||
|
||||
|
||||
|
||||
gb_global RecursiveMutex g_type_mutex;
|
||||
|
||||
struct TypePath;
|
||||
|
||||
Reference in New Issue
Block a user