mirror of
https://github.com/ghostty-org/ghostty.git
synced 2026-05-26 23:08:25 +00:00
pkg/highway: replace resolveTargetQuery with direct CPU detection
The previous runtime_detect.zig called std.zig.system.resolveTargetQuery which pulled in the entire Zig target/CPU model table infrastructure for every architecture (~4,000 symbols, ~175 KB of data tables, ~130 KB of code). This bloated the binary by ~500 KB and shifted code layout enough to cause a measurable icache/branch-predictor regression in unrelated hot paths like the terminal parser (~20% more cycles for identical instruction counts). Replace with minimal, direct CPU feature detection per architecture: CPUID + XGETBV inline assembly on x86, sysctlbyname on Darwin AArch64, and getauxval/prctl via std.os.linux (direct syscalls, no libc) on Linux for AArch64, PPC, S390x, RISC-V, and LoongArch. Split into per-architecture files under src/detect/ for maintainability.
This commit is contained in:
@@ -15,7 +15,7 @@ pub fn build(b: *std.Build) !void {
|
||||
const lib = b.addLibrary(.{
|
||||
.name = "highway",
|
||||
.root_module = b.createModule(.{
|
||||
.root_source_file = b.path("src/runtime_detect.zig"),
|
||||
.root_source_file = b.path("src/detect.zig"),
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
}),
|
||||
|
||||
49
pkg/highway/src/detect.zig
Normal file
49
pkg/highway/src/detect.zig
Normal file
@@ -0,0 +1,49 @@
|
||||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("targets.zig").Targets;
|
||||
|
||||
const x86 = @import("detect/x86.zig");
|
||||
const aarch64_darwin = @import("detect/aarch64_darwin.zig");
|
||||
const aarch64_linux = @import("detect/aarch64_linux.zig");
|
||||
const ppc = @import("detect/ppc.zig");
|
||||
const s390x = @import("detect/s390x.zig");
|
||||
const riscv = @import("detect/riscv.zig");
|
||||
const loongarch = @import("detect/loongarch.zig");
|
||||
|
||||
/// Reports which Highway SIMD targets the running CPU supports.
///
/// The result is the `Targets` value built by the per-architecture detector,
/// bit-cast to `i64`. Architectures without a detector report 0.
///
/// Previous versions called std.zig.system.resolveTargetQuery, which drags
/// in the full Zig target/CPU model tables for every architecture, bloating
/// the binary by several hundred KB and causing code-layout regressions in
/// unrelated hot paths (icache / branch-predictor pressure).
///
/// This version probes the hardware directly with the cheapest mechanism
/// available on each platform:
///   - x86 / x86_64: CPUID and XGETBV inline assembly
///   - AArch64 on Darwin: sysctlbyname
///   - AArch64 on Linux: getauxval plus prctl(PR_SVE_GET_VL)
///   - PowerPC, s390x, RISC-V, LoongArch: getauxval (Linux only; other
///     operating systems report an empty set)
/// so it adds no data tables and no std.Target dependency.
pub export fn ghostty_hwy_detect_targets() callconv(.c) i64 {
    return switch (builtin.cpu.arch) {
        .x86_64, .x86 => x86.detect(),
        .aarch64, .aarch64_be => detectAarch64(),
        .powerpc, .powerpc64, .powerpc64le => ppc.detect(),
        .s390x => s390x.detect(),
        .riscv32, .riscv64 => riscv.detect(),
        .loongarch32, .loongarch64 => loongarch.detect(),
        else => 0,
    };
}
|
||||
|
||||
/// AArch64 entry point: seeds the target set with the NEON baseline and
/// then hands it to the OS-specific prober, if one exists.
fn detectAarch64() i64 {
    var targets: HwyTargets = .{};

    // NEON is present on every AArch64 implementation, so the AES-less
    // NEON target is always enabled.
    targets.neon_without_aes = true;

    if (comptime builtin.os.tag.isDarwin()) return aarch64_darwin.detect(&targets);
    if (comptime builtin.os.tag == .linux) return aarch64_linux.detect(&targets);

    // No prober for this OS: report only the NEON baseline.
    return @bitCast(targets);
}
|
||||
33
pkg/highway/src/detect/aarch64_darwin.zig
Normal file
33
pkg/highway/src/detect/aarch64_darwin.zig
Normal file
@@ -0,0 +1,33 @@
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
|
||||
/// Darwin / AArch64 prober. Enables the NEON-related targets on `t` and
/// returns the resulting set bit-cast to `i64`.
///
/// No SVE target is ever set here.
pub fn detect(t: *HwyTargets) i64 {
    // AES is taken as given on Apple Silicon, so full NEON is unconditional.
    t.neon = true;

    // BF16 is the only feature that is actually probed. FP16 and DotProd
    // are not queried separately; the code assumes any chip reporting
    // FEAT_BF16 also has them.
    if (darwinSysctlBool("hw.optional.arm.FEAT_BF16")) t.neon_bf16 = true;

    return @bitCast(t.*);
}
|
||||
|
||||
/// Reads the `c_int`-sized sysctl called `name` and reports whether its
/// value is non-zero. A failed lookup (non-zero return from sysctlbyname)
/// is reported as `false`.
fn darwinSysctlBool(comptime name: [:0]const u8) bool {
    var flag: c_int = 0;
    var flag_size: usize = @sizeOf(c_int);

    if (sysctlbyname(name.ptr, &flag, &flag_size, null, 0) != 0) return false;
    return flag != 0;
}

// Declared directly against libc: on macOS libsystem is always available,
// so depending on this symbol is safe.
extern "c" fn sysctlbyname(
    name: [*:0]const u8,
    oldp: ?*anyopaque,
    oldlenp: ?*usize,
    newp: ?*const anyopaque,
    newlen: usize,
) c_int;
|
||||
65
pkg/highway/src/detect/aarch64_linux.zig
Normal file
65
pkg/highway/src/detect/aarch64_linux.zig
Normal file
@@ -0,0 +1,65 @@
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// Linux / AArch64 prober. Reads AT_HWCAP and AT_HWCAP2 from the auxiliary
/// vector, enables the matching NEON and SVE targets on `t`, and returns
/// the resulting set bit-cast to `i64`.
pub fn detect(t: *HwyTargets) i64 {
    // Auxiliary-vector keys.
    const AT_HWCAP: usize = 16;
    const AT_HWCAP2: usize = 26;

    // Feature bits, from Linux UAPI asm/hwcap.h.
    const HWCAP_AES: usize = 1 << 3;
    const HWCAP_FPHP: usize = 1 << 9; // FEAT_FP16
    const HWCAP_ASIMDDP: usize = 1 << 20; // DotProd
    const HWCAP_SVE: usize = 1 << 22;

    const HWCAP2_SVE2: usize = 1 << 1;
    const HWCAP2_SVEAES: usize = 1 << 2;
    const HWCAP2_BF16: usize = 1 << 14;

    const caps = linux.getauxval(AT_HWCAP);
    const caps2 = linux.getauxval(AT_HWCAP2);

    // NEON with AES, and on top of that the FP16 + DotProd + BF16 tier.
    if ((caps & HWCAP_AES) != 0) {
        t.neon = true;

        const fp16_dot = HWCAP_FPHP | HWCAP_ASIMDDP;
        const has_fp16_dot = (caps & fp16_dot) == fp16_dot;
        const has_bf16 = (caps2 & HWCAP2_BF16) != 0;
        if (has_fp16_dot and has_bf16) t.neon_bf16 = true;
    }

    // Without SVE there is nothing further to classify.
    if ((caps & HWCAP_SVE) == 0) return @bitCast(t.*);

    // The SVE targets depend on the vector length in bytes.
    const vl = sveVectorBytes();
    const is_wide = vl >= 32;

    if (is_wide) {
        t.sve = true;
        if (vl == 32) t.sve_256 = true;
    }

    const sve2_aes = HWCAP2_SVE2 | HWCAP2_SVEAES;
    if ((caps2 & sve2_aes) == sve2_aes) {
        if (is_wide) {
            t.sve2 = true;
        } else if (vl == 16) {
            t.sve2_128 = true;
        }
    }

    return @bitCast(t.*);
}
|
||||
|
||||
/// Returns the SVE vector length in bytes as reported by
/// prctl(PR_SVE_GET_VL), or 16 when the call fails.
fn sveVectorBytes() usize {
    const PR_SVE_GET_VL: i32 = 51;
    const raw = linux.prctl(PR_SVE_GET_VL, 0, 0, 0, 0);

    // A result that is negative when viewed as signed means the call
    // failed: fall back to 128-bit (NEON width), the conservative choice.
    if (@as(isize, @bitCast(raw)) < 0) return 16;

    // The vector length lives in the low 16 bits of the result.
    return raw & 0xFFFF;
}
|
||||
10
pkg/highway/src/detect/linux.zig
Normal file
10
pkg/highway/src/detect/linux.zig
Normal file
@@ -0,0 +1,10 @@
|
||||
/// Looks up `key` in the ELF auxiliary vector by forwarding to
/// `std.os.linux.getauxval`. The intent is to avoid a libc dependency.
/// NOTE(review): confirm the std implementation sees a populated auxiliary
/// vector in every way this library is linked and started.
pub inline fn getauxval(key: usize) usize {
    const os_linux = @import("std").os.linux;
    return os_linux.getauxval(key);
}
|
||||
|
||||
/// Forwards to `std.os.linux.prctl`, the syscall wrapper for prctl(2), and
/// returns its result unchanged.
pub inline fn prctl(option: i32, a2: usize, a3: usize, a4: usize, a5: usize) usize {
    const os_linux = @import("std").os.linux;
    return os_linux.prctl(option, a2, a3, a4, a5);
}
|
||||
26
pkg/highway/src/detect/loongarch.zig
Normal file
26
pkg/highway/src/detect/loongarch.zig
Normal file
@@ -0,0 +1,26 @@
|
||||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// LoongArch prober. On Linux, reads AT_HWCAP and enables the LSX / LASX
/// targets; on any other OS the set is returned untouched. The result is
/// the target set bit-cast to `i64`.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag == .linux) {
        const AT_HWCAP: usize = 16;

        // From Linux arch/loongarch/include/uapi/asm/hwcap.h
        const HWCAP_LSX: usize = 1 << 4;
        const HWCAP_LASX: usize = 1 << 5;

        const caps = linux.getauxval(AT_HWCAP);
        const has_lsx = (caps & HWCAP_LSX) != 0;
        const has_lasx = (caps & HWCAP_LASX) != 0;

        // LASX is only reported on top of LSX.
        if (has_lsx) {
            targets.lsx = true;
            if (has_lasx) targets.lasx = true;
        }
    }

    return @bitCast(targets);
}
|
||||
43
pkg/highway/src/detect/ppc.zig
Normal file
43
pkg/highway/src/detect/ppc.zig
Normal file
@@ -0,0 +1,43 @@
|
||||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// PowerPC prober. On Linux, reads AT_HWCAP / AT_HWCAP2 and enables the
/// ppc8 / ppc9 / ppc10 targets, each tier requiring the one below it; on
/// any other OS the set is returned untouched. The result is the target
/// set bit-cast to `i64`.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag == .linux) {
        const AT_HWCAP: usize = 16;
        const AT_HWCAP2: usize = 26;

        // From Linux arch/powerpc/include/uapi/asm/cputable.h
        const PPC_FEATURE_HAS_ALTIVEC: usize = 0x10000000;
        const PPC_FEATURE_HAS_VSX: usize = 0x00000080;
        const PPC_FEATURE2_ARCH_2_07: usize = 0x80000000; // POWER8
        const PPC_FEATURE2_VEC_CRYPTO: usize = 0x02000000;
        const PPC_FEATURE2_ARCH_3_00: usize = 0x00800000; // POWER9
        const PPC_FEATURE2_ARCH_3_1: usize = 0x00040000; // POWER10
        const PPC_FEATURE2_MMA: usize = 0x00020000;

        const caps = linux.getauxval(AT_HWCAP);
        const caps2 = linux.getauxval(AT_HWCAP2);

        // Each tier is expressed as "all of these bits must be set".
        const vec_mask = PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX;
        const p8_mask = PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO;
        const p10_mask = PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_MMA;

        const is_p8 = (caps & vec_mask) == vec_mask and (caps2 & p8_mask) == p8_mask;
        const is_p9 = (caps2 & PPC_FEATURE2_ARCH_3_00) != 0;
        const is_p10 = (caps2 & p10_mask) == p10_mask;

        if (is_p8) {
            targets.ppc8 = true;
            if (is_p9) {
                targets.ppc9 = true;
                if (is_p10) targets.ppc10 = true;
            }
        }
    }

    return @bitCast(targets);
}
|
||||
22
pkg/highway/src/detect/riscv.zig
Normal file
22
pkg/highway/src/detect/riscv.zig
Normal file
@@ -0,0 +1,22 @@
|
||||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// RISC-V prober. On Linux, reads AT_HWCAP and enables the rvv target when
/// the 'V' extension bit is set; on any other OS the set is returned
/// untouched. The result is the target set bit-cast to `i64`.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag == .linux) {
        const AT_HWCAP: usize = 16;

        // Single-letter ISA extensions are encoded one bit per letter,
        // at bit position letter - 'A'; 'V' is the vector extension.
        const HWCAP_V: usize = 1 << ('V' - 'A');

        const caps = linux.getauxval(AT_HWCAP);
        if ((caps & HWCAP_V) != 0) targets.rvv = true;
    }

    return @bitCast(targets);
}
|
||||
29
pkg/highway/src/detect/s390x.zig
Normal file
29
pkg/highway/src/detect/s390x.zig
Normal file
@@ -0,0 +1,29 @@
|
||||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
const linux = @import("linux.zig");
|
||||
|
||||
/// s390x prober. On Linux, reads AT_HWCAP and enables the z14 / z15
/// targets; on any other OS the set is returned untouched. The result is
/// the target set bit-cast to `i64`.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag == .linux) {
        const AT_HWCAP: usize = 16;

        // From Linux arch/s390/include/asm/elf.h
        const HWCAP_VX: usize = 1 << 11;
        const HWCAP_VXE: usize = 1 << 13; // z14
        const HWCAP_VXE2: usize = 1 << 15; // z15

        const caps = linux.getauxval(AT_HWCAP);
        const has_vx = (caps & HWCAP_VX) != 0;
        const has_vxe = (caps & HWCAP_VXE) != 0;
        const has_vxe2 = (caps & HWCAP_VXE2) != 0;

        // z14 needs the base vector facility plus VXE; z15 adds VXE2.
        if (has_vx and has_vxe) {
            targets.z14 = true;
            if (has_vxe2) targets.z15 = true;
        }
    }

    return @bitCast(targets);
}
|
||||
166
pkg/highway/src/detect/x86.zig
Normal file
166
pkg/highway/src/detect/x86.zig
Normal file
@@ -0,0 +1,166 @@
|
||||
const builtin = @import("builtin");
|
||||
const HwyTargets = @import("../targets.zig").Targets;
|
||||
|
||||
/// The four registers written by one CPUID invocation.
const CpuidResult = struct {
    eax: u32,
    ebx: u32,
    ecx: u32,
    edx: u32,
};

/// Executes CPUID with `leaf` in EAX and `subleaf` in ECX and returns the
/// resulting EAX / EBX / ECX / EDX values.
fn cpuid(leaf: u32, subleaf: u32) CpuidResult {
    var a: u32 = undefined;
    var b: u32 = undefined;
    var c: u32 = undefined;
    var d: u32 = undefined;
    asm volatile ("cpuid"
        : [_] "={eax}" (a),
          [_] "={ebx}" (b),
          [_] "={ecx}" (c),
          [_] "={edx}" (d),
        : [_] "{eax}" (leaf),
          [_] "{ecx}" (subleaf),
    );
    return CpuidResult{
        .eax = a,
        .ebx = b,
        .ecx = c,
        .edx = d,
    };
}
|
||||
|
||||
/// Reports whether bit number `pos` (0 = least significant) of `val` is set.
inline fn bit(val: u32, comptime pos: u5) bool {
    const mask = @as(u32, 1) << pos;
    return (val & mask) != 0;
}
|
||||
|
||||
/// Probes the running x86 / x86_64 CPU with CPUID and XGETBV and returns
/// the supported Highway targets as a `Targets` value bit-cast to `i64`.
///
/// AVX2 and every AVX-512 tier are reported only when XCR0 shows that the
/// OS saves the corresponding register state. Darwin is treated as always
/// saving AVX-512 state because it does so lazily on first use. This
/// gating also covers the AVX10.1-512 path.
///
/// The avx10_2 targets are never set here; AVX10.2 detection is left for
/// future work.
pub fn detect() i64 {
    var t: HwyTargets = .{};

    // x86_64 always has SSE2.
    if (comptime builtin.cpu.arch == .x86_64) {
        t.sse2 = true;
    }

    const max_leaf = cpuid(0, 0).eax;
    if (max_leaf < 1) return @bitCast(t);

    const leaf1 = cpuid(1, 0);

    // -- SSE2 on 32-bit x86 -------------------------------------------------
    if (comptime builtin.cpu.arch == .x86) {
        if (bit(leaf1.edx, 25) and // SSE
            bit(leaf1.edx, 26)) // SSE2
        {
            t.sse2 = true;
        }
    }

    // -- SSSE3 ---------------------------------------------------------------
    if (bit(leaf1.ecx, 0) and // SSE3
        bit(leaf1.ecx, 9)) // SSSE3
    {
        t.ssse3 = true;
    }

    // -- SSE4 ----------------------------------------------------------------
    if (bit(leaf1.ecx, 19) and // SSE4.1
        bit(leaf1.ecx, 20) and // SSE4.2
        bit(leaf1.ecx, 1) and // PCLMUL
        bit(leaf1.ecx, 25)) // AES
    {
        t.sse4 = true;
    }

    // XGETBV may only be executed when the OS has enabled XSAVE (OSXSAVE,
    // leaf 1 ECX bit 27). XCR0 then tells us which register state the OS
    // actually saves across context switches.
    const has_osxsave = bit(leaf1.ecx, 27);
    const has_avx_bit = bit(leaf1.ecx, 28);
    const xcr0: u32 = if (has_osxsave and has_avx_bit) asm volatile ("xgetbv"
        : [_] "={eax}" (-> u32),
        : [_] "{ecx}" (@as(u32, 0)),
        : .{ .edx = true }) else 0;
    const has_avx_save = (xcr0 & 0x6) == 0x6; // SSE + AVX state

    // Darwin lazily saves AVX-512 context on first use.
    const has_avx512_save = if (comptime builtin.os.tag.isDarwin())
        true
    else
        (xcr0 & 0xE0) == 0xE0; // opmask + zmm_hi256 + hi16_zmm

    // Everything below needs OS-saved YMM state and CPUID leaf 7.
    if (!has_avx_save or max_leaf < 7) return @bitCast(t);

    const leaf7 = cpuid(7, 0);

    // -- AVX2 ----------------------------------------------------------------
    if (bit(leaf7.ebx, 5) and // AVX2
        bit(leaf1.ecx, 12) and // FMA
        bit(leaf1.ecx, 29)) // F16C
    {
        // Also need LZCNT (extended leaf), BMI, BMI2.
        const leaf_ext = cpuid(0x80000001, 0);
        if (bit(leaf_ext.ecx, 5) and // LZCNT
            bit(leaf7.ebx, 3) and // BMI
            bit(leaf7.ebx, 8)) // BMI2
        {
            t.avx2 = true;
        }
    }

    // No AVX-512 tier (including the AVX10.1-512 path) may be reported
    // unless the OS saves opmask and ZMM state.
    if (!has_avx512_save) return @bitCast(t);

    // -- AVX-512 -------------------------------------------------------------
    if (bit(leaf7.ebx, 16) and // AVX512F
        bit(leaf7.ebx, 31) and // AVX512VL
        bit(leaf7.ebx, 17) and // AVX512DQ
        bit(leaf7.ebx, 30) and // AVX512BW
        bit(leaf7.ebx, 28)) // AVX512CD
    {
        t.avx3 = true;
    }

    if (bit(leaf7.ecx, 11) and // AVX512VNNI
        bit(leaf7.ecx, 10) and // VPCLMULQDQ
        bit(leaf7.ecx, 1) and // AVX512VBMI
        bit(leaf7.ecx, 6) and // AVX512VBMI2
        bit(leaf7.ecx, 9) and // VAES
        bit(leaf7.ecx, 14) and // AVX512VPOPCNTDQ
        bit(leaf7.ecx, 12) and // AVX512BITALG
        bit(leaf7.ecx, 8)) // GFNI
    {
        t.avx3_dl = true;
    }

    // Leaf 7 sub-leaf 1 exists only when leaf 7 sub-leaf 0 reports a
    // maximum sub-leaf of at least 1 in EAX. It carries both AVX512BF16
    // and the AVX10 enumeration bit, so it is read once for both.
    if (leaf7.eax >= 1) {
        const leaf7_1 = cpuid(7, 1);
        const has_bf16 = bit(leaf7_1.eax, 5); // AVX512BF16

        if (t.avx3_dl and has_bf16) {
            if (isAMD()) {
                t.avx3_zen4 = true;
            }
            if (bit(leaf7.edx, 23)) { // AVX512FP16
                t.avx3_spr = true;
            }
        }

        // -- AVX10 -----------------------------------------------------------
        if (bit(leaf7_1.edx, 19) and max_leaf >= 0x24) { // AVX10.1-256
            const leaf24 = cpuid(0x24, 0);
            if (bit(leaf24.ebx, 18)) { // AVX10.1-512
                t.avx3_spr = true;
                t.avx3_dl = true;
                t.avx3 = true;
            }
        }
    }

    return @bitCast(t);
}
|
||||
|
||||
/// Reports whether the CPUID leaf 0 vendor string is "AuthenticAMD".
fn isAMD() bool {
    // The twelve vendor bytes are returned in EBX, EDX, ECX order.
    const auth: u32 = 0x68747541; // "Auth"
    const enti: u32 = 0x69746e65; // "enti"
    const camd: u32 = 0x444d4163; // "cAMD"

    const vendor = cpuid(0, 0);
    return vendor.ebx == auth and vendor.edx == enti and vendor.ecx == camd;
}
|
||||
@@ -1,257 +0,0 @@
|
||||
const builtin = @import("builtin");
|
||||
const std = @import("std");
|
||||
const Target = std.Target;
|
||||
const HwyTargets = @import("targets.zig").Targets;
|
||||
|
||||
/// Detects Highway targets through Zig's standard-library CPU feature
/// detection.
///
/// The logic is mostly identical to Highway's own implementation, but it
/// goes through Zig's built-in detection rather than Highway so that access
/// to Apple headers can be strictly controlled (and avoided completely).
///
/// Returns 0 when the native target cannot be resolved or the architecture
/// has no detector.
pub export fn ghostty_hwy_detect_targets() callconv(.c) i64 {
    const resolved = std.zig.system.resolveTargetQuery(.{}) catch return 0;
    const cpu = resolved.cpu;

    switch (builtin.cpu.arch) {
        .x86_64, .x86 => return detectX86(cpu),
        .aarch64, .aarch64_be => return detectAarch64(cpu),
        .powerpc, .powerpc64, .powerpc64le => return detectPpc(cpu),
        .s390x => return detectS390x(cpu),
        .riscv32, .riscv64 => return detectRiscv(cpu),
        .loongarch32, .loongarch64 => return detectLoongArch(cpu),
        else => return 0,
    }
}
|
||||
|
||||
/// Maps the x86 feature set in `cpu` onto Highway targets and returns the
/// target set bit-cast to `i64`.
///
/// No XCR0 / XSAVE check is performed here; the result is only as OS-aware
/// as the features already recorded in `cpu`.
fn detectX86(cpu: Target.Cpu) i64 {
    var t: HwyTargets = .{};

    // SSE2: part of the x86_64 baseline, probed on 32-bit x86.
    if (comptime builtin.cpu.arch == .x86_64) t.sse2 = true;
    if (comptime builtin.cpu.arch == .x86) {
        const has_sse2 = cpu.has(.x86, .sse) and cpu.has(.x86, .sse2);
        if (has_sse2) t.sse2 = true;
    }

    const has_ssse3 = cpu.has(.x86, .sse3) and cpu.has(.x86, .ssse3);
    if (has_ssse3) t.ssse3 = true;

    const has_sse4 = cpu.has(.x86, .sse4_1) and
        cpu.has(.x86, .sse4_2) and
        cpu.has(.x86, .pclmul) and
        cpu.has(.x86, .aes);
    if (has_sse4) t.sse4 = true;

    const has_avx2 = cpu.has(.x86, .avx) and
        cpu.has(.x86, .avx2) and
        cpu.has(.x86, .lzcnt) and
        cpu.has(.x86, .bmi) and
        cpu.has(.x86, .bmi2) and
        cpu.has(.x86, .fma) and
        cpu.has(.x86, .f16c);
    if (has_avx2) t.avx2 = true;

    const has_avx3 = cpu.has(.x86, .avx512f) and
        cpu.has(.x86, .avx512vl) and
        cpu.has(.x86, .avx512dq) and
        cpu.has(.x86, .avx512bw) and
        cpu.has(.x86, .avx512cd);
    if (has_avx3) t.avx3 = true;

    const has_avx3_dl = cpu.has(.x86, .avx512vnni) and
        cpu.has(.x86, .vpclmulqdq) and
        cpu.has(.x86, .avx512vbmi) and
        cpu.has(.x86, .avx512vbmi2) and
        cpu.has(.x86, .vaes) and
        cpu.has(.x86, .avx512vpopcntdq) and
        cpu.has(.x86, .avx512bitalg) and
        cpu.has(.x86, .gfni);
    if (has_avx3_dl) t.avx3_dl = true;

    const has_bf16 = cpu.has(.x86, .avx512bf16);

    // Zen 4 tier: the DL tier plus BF16, on an AMD part.
    if (t.avx3_dl and has_bf16 and isAMD()) t.avx3_zen4 = true;

    // Sapphire Rapids tier: FP16 plus BF16.
    if (cpu.has(.x86, .avx512fp16) and has_bf16) t.avx3_spr = true;

    // AVX10: the 256-bit level gates everything; the 512-bit level of
    // AVX10.1 implies all AVX-512 tiers.
    if (cpu.has(.x86, .avx10_1_256)) {
        if (cpu.has(.x86, .avx10_1_512)) {
            t.avx3_spr = true;
            t.avx3_dl = true;
            t.avx3 = true;
        }

        if (cpu.has(.x86, .avx10_2_256)) {
            t.avx10_2 = true;
            if (cpu.has(.x86, .avx10_2_512)) t.avx10_2_512 = true;
        }
    }

    return @bitCast(t);
}
|
||||
|
||||
/// Maps the AArch64 feature set in `cpu` onto Highway targets and returns
/// the target set bit-cast to `i64`. The AES-less NEON target is always
/// enabled.
fn detectAarch64(cpu: Target.Cpu) i64 {
    var t: HwyTargets = .{};
    t.neon_without_aes = true;

    // Full NEON needs AES; the BF16 tier additionally needs FP16, DotProd
    // and BF16.
    if (cpu.has(.aarch64, .aes)) {
        t.neon = true;

        const has_bf16_tier = cpu.has(.aarch64, .fullfp16) and
            cpu.has(.aarch64, .dotprod) and
            cpu.has(.aarch64, .bf16);
        if (has_bf16_tier) t.neon_bf16 = true;
    }

    // Without SVE there is nothing further to classify.
    if (!cpu.has(.aarch64, .sve)) return @bitCast(t);

    // The SVE targets depend on the vector length in bytes.
    const vl = sveVectorBytes();
    const is_wide = vl >= 32;

    if (is_wide) {
        t.sve = true;
        if (vl == 32) t.sve_256 = true;
    }

    if (cpu.has(.aarch64, .sve2) and cpu.has(.aarch64, .sve2_aes)) {
        if (is_wide) {
            t.sve2 = true;
        } else if (vl == 16) {
            t.sve2_128 = true;
        }
    }

    return @bitCast(t);
}
|
||||
|
||||
/// Returns the SVE vector length in bytes. On Linux this is the low 16
/// bits of prctl(PR_SVE_GET_VL); on other systems, or when the call fails,
/// it is 16.
fn sveVectorBytes() usize {
    // 128-bit (NEON width) is the conservative assumption.
    const fallback: usize = 16;
    if (comptime builtin.os.tag != .linux) return fallback;

    const PR_SVE_GET_VL = 51;
    const raw = std.os.linux.prctl(PR_SVE_GET_VL, 0, 0, 0, 0);

    // A result that is negative when viewed as signed means the call failed.
    if (@as(isize, @bitCast(raw)) < 0) return fallback;

    // PR_SVE_GET_VL returns the SVE vector length in the lower 16 bits.
    return raw & 0xFFFF;
}
|
||||
|
||||
/// Maps the PowerPC feature set in `cpu` onto the ppc8 / ppc9 / ppc10
/// targets, each tier requiring the one below it, and returns the target
/// set bit-cast to `i64`.
fn detectPpc(cpu: Target.Cpu) i64 {
    var t: HwyTargets = .{};

    const is_p8 = cpu.has(.powerpc, .altivec) and
        cpu.has(.powerpc, .vsx) and
        cpu.has(.powerpc, .power8_vector) and
        cpu.has(.powerpc, .crypto);
    const is_p9 = cpu.has(.powerpc, .power9_vector);
    const is_p10 = cpu.has(.powerpc, .power10_vector) and
        cpu.has(.powerpc, .mma);

    if (is_p8) {
        t.ppc8 = true;
        if (is_p9) {
            t.ppc9 = true;
            if (is_p10) t.ppc10 = true;
        }
    }

    return @bitCast(t);
}
|
||||
|
||||
/// Maps the s390x feature set in `cpu` onto the z14 / z15 targets and
/// returns the target set bit-cast to `i64`.
fn detectS390x(cpu: Target.Cpu) i64 {
    var t: HwyTargets = .{};

    // z14 needs the base vector facility plus vector enhancements 1;
    // z15 adds vector enhancements 2 on top.
    const is_z14 = cpu.has(.s390x, .vector) and
        cpu.has(.s390x, .vector_enhancements_1);

    if (is_z14) {
        t.z14 = true;
        if (cpu.has(.s390x, .vector_enhancements_2)) t.z15 = true;
    }

    return @bitCast(t);
}
|
||||
|
||||
/// Enables the rvv target when `cpu` has the RISC-V 'V' feature and returns
/// the target set bit-cast to `i64`.
fn detectRiscv(cpu: Target.Cpu) i64 {
    var t: HwyTargets = .{};
    const has_vector = cpu.has(.riscv, .v);
    if (has_vector) t.rvv = true;
    return @bitCast(t);
}
|
||||
|
||||
/// Maps the LoongArch feature set in `cpu` onto the lsx / lasx targets and
/// returns the target set bit-cast to `i64`. LASX is only reported on top
/// of LSX.
fn detectLoongArch(cpu: Target.Cpu) i64 {
    var t: HwyTargets = .{};

    const has_lsx = cpu.has(.loongarch, .lsx);
    const has_lasx = cpu.has(.loongarch, .lasx);

    if (has_lsx) {
        t.lsx = true;
        if (has_lasx) t.lasx = true;
    }

    return @bitCast(t);
}
|
||||
|
||||
/// Reports whether the CPUID vendor string is "AuthenticAMD", matching
/// Highway's IsAMD(). Zig does not expose the vendor string, so CPUID is
/// issued through inline assembly.
fn isAMD() bool {
    var a: u32 = undefined;
    var b: u32 = undefined;
    var c: u32 = undefined;
    var d: u32 = undefined;
    asm volatile ("cpuid"
        : [_] "={eax}" (a),
          [_] "={ebx}" (b),
          [_] "={ecx}" (c),
          [_] "={edx}" (d),
        : [_] "{eax}" (0),
    );

    // The twelve vendor bytes are returned in EBX, EDX, ECX order.
    const auth: u32 = 0x68747541; // "Auth"
    const enti: u32 = 0x69746e65; // "enti"
    const camd: u32 = 0x444d4163; // "cAMD"
    return b == auth and d == enti and c == camd;
}
|
||||
Reference in New Issue
Block a user