pkg/highway: replace resolveTargetQuery with direct CPU detection

The previous runtime_detect.zig called std.zig.system.resolveTargetQuery
which pulled in the entire Zig target/CPU model table infrastructure for
every architecture (~4,000 symbols, ~175 KB of data tables, ~130 KB of
code). This bloated the binary by ~500 KB and shifted code layout enough
to cause a measurable icache/branch-predictor regression in unrelated
hot paths like the terminal parser (~20% more cycles for identical
instruction counts).

Replace with minimal, direct CPU feature detection per architecture:
CPUID + XGETBV inline assembly on x86, sysctlbyname on Darwin AArch64,
and getauxval/prctl via std.os.linux (direct syscalls, no libc) on
Linux for AArch64, PPC, S390x, RISC-V, and LoongArch.

Split into per-architecture files under src/detect/ for
maintainability.
This commit is contained in:
Mitchell Hashimoto
2026-04-23 21:15:46 -07:00
parent 3c0b976d07
commit 00dfd67bee
11 changed files with 444 additions and 258 deletions

View File

@@ -15,7 +15,7 @@ pub fn build(b: *std.Build) !void {
const lib = b.addLibrary(.{
.name = "highway",
.root_module = b.createModule(.{
.root_source_file = b.path("src/runtime_detect.zig"),
.root_source_file = b.path("src/detect.zig"),
.target = target,
.optimize = optimize,
}),

View File

@@ -0,0 +1,49 @@
const builtin = @import("builtin");
const HwyTargets = @import("targets.zig").Targets;
const x86 = @import("detect/x86.zig");
const aarch64_darwin = @import("detect/aarch64_darwin.zig");
const aarch64_linux = @import("detect/aarch64_linux.zig");
const ppc = @import("detect/ppc.zig");
const s390x = @import("detect/s390x.zig");
const riscv = @import("detect/riscv.zig");
const loongarch = @import("detect/loongarch.zig");
/// Report which Highway targets the running CPU supports.
///
/// The architecture is fixed at compile time via `builtin.cpu.arch`, and
/// the call is forwarded to the matching detector. Each detector returns
/// its target set packed into an `i64`. Architectures without a detector
/// report 0.
///
/// Earlier versions went through std.zig.system.resolveTargetQuery, which
/// brings the full Zig target/CPU model tables for every architecture
/// into the binary and shifted code layout enough to hurt unrelated hot
/// paths. The detectors dispatched to here use CPUID/XGETBV inline
/// assembly on x86, sysctlbyname on Darwin AArch64 and the ELF auxiliary
/// vector (plus prctl) on Linux instead.
pub export fn ghostty_hwy_detect_targets() callconv(.c) i64 {
    switch (builtin.cpu.arch) {
        .x86, .x86_64 => return x86.detect(),
        .aarch64, .aarch64_be => return detectAarch64(),
        .powerpc, .powerpc64, .powerpc64le => return ppc.detect(),
        .s390x => return s390x.detect(),
        .riscv32, .riscv64 => return riscv.detect(),
        .loongarch32, .loongarch64 => return loongarch.detect(),
        else => return 0,
    }
}
/// Detect AArch64 targets by delegating to the OS-specific prober.
///
/// The AES-less NEON target is always enabled first; the Darwin or Linux
/// prober then extends the same set. On any other OS only that baseline
/// is returned.
fn detectAarch64() i64 {
    // NEON is present on every AArch64 implementation.
    var targets: HwyTargets = .{ .neon_without_aes = true };

    if (comptime builtin.os.tag.isDarwin()) return aarch64_darwin.detect(&targets);
    if (comptime builtin.os.tag == .linux) return aarch64_linux.detect(&targets);

    // No prober for this OS: baseline NEON only.
    return @bitCast(targets);
}

View File

@@ -0,0 +1,33 @@
const HwyTargets = @import("../targets.zig").Targets;
/// Fill in the AArch64 targets available on Darwin.
///
/// `t` is updated in place and the return value is `t.*` packed into an
/// `i64`.
pub fn detect(t: *HwyTargets) i64 {
    // AES is available on all Apple Silicon, so full NEON needs no probe.
    t.neon = true;

    // FP16 and DotProd are taken as given on Apple chips (A11 onward);
    // BF16 only arrived with M2 / A15, so that one is queried explicitly.
    if (darwinSysctlBool("hw.optional.arm.FEAT_BF16")) t.neon_bf16 = true;

    // Apple Silicon has no SVE, so no SVE targets are set.
    return @bitCast(t.*);
}
/// Read the integer sysctl `name` and report whether it is non-zero.
///
/// Returns false when the sysctlbyname call itself fails (non-zero
/// return code).
fn darwinSysctlBool(comptime name: [:0]const u8) bool {
    var flag: c_int = 0;
    var flag_size: usize = @sizeOf(c_int);
    if (sysctlbyname(name.ptr, &flag, &flag_size, null, 0) != 0) return false;
    return flag != 0;
}
// We can rely on libc for macOS because libsystem is always available.
//
// Declared by hand so that no system header has to be imported. The
// first three parameters are the sysctl name, the output buffer and a
// pointer to that buffer's size; the last two are passed as null / 0 by
// the caller in this file, which only reads values.
extern "c" fn sysctlbyname(
name: [*:0]const u8,
oldp: ?*anyopaque,
oldlenp: ?*usize,
newp: ?*const anyopaque,
newlen: usize,
) c_int;

View File

@@ -0,0 +1,65 @@
const HwyTargets = @import("../targets.zig").Targets;
const linux = @import("linux.zig");
/// Fill in the AArch64 targets available on Linux.
///
/// Capabilities come from the AT_HWCAP / AT_HWCAP2 auxiliary-vector
/// words. `t` is updated in place and the return value is `t.*` packed
/// into an `i64`.
pub fn detect(t: *HwyTargets) i64 {
    // Auxiliary-vector keys for the two capability words.
    const at_hwcap: usize = 16;
    const at_hwcap2: usize = 26;

    // Bit masks from Linux UAPI asm/hwcap.h.
    const cap_aes: usize = 1 << 3;
    const cap_fphp: usize = 1 << 9; // FEAT_FP16
    const cap_asimddp: usize = 1 << 20; // DotProd
    const cap_sve: usize = 1 << 22;
    const cap2_sve2: usize = 1 << 1;
    const cap2_sveaes: usize = 1 << 2;
    const cap2_bf16: usize = 1 << 14;

    const caps = linux.getauxval(at_hwcap);
    const caps2 = linux.getauxval(at_hwcap2);

    // NEON with AES, and on top of that the FP16 + DotProd + BF16 tier.
    if (caps & cap_aes != 0) {
        t.neon = true;
        const has_bf16_tier = caps & cap_fphp != 0 and
            caps & cap_asimddp != 0 and
            caps2 & cap2_bf16 != 0;
        if (has_bf16_tier) t.neon_bf16 = true;
    }

    // Without SVE there is nothing further to probe.
    if (caps & cap_sve == 0) return @bitCast(t.*);

    const vec_bytes = sveVectorBytes();
    const is_wide = vec_bytes >= 32;
    if (is_wide) {
        t.sve = true;
        if (vec_bytes == 32) t.sve_256 = true;
    }

    // SVE2 targets additionally require the SVE AES extension.
    const has_sve2 = caps2 & cap2_sve2 != 0 and caps2 & cap2_sveaes != 0;
    if (has_sve2) {
        if (is_wide) {
            t.sve2 = true;
        } else if (vec_bytes == 16) {
            t.sve2_128 = true;
        }
    }

    return @bitCast(t.*);
}
/// Return the current SVE vector length in bytes.
///
/// Falls back to 16 (128-bit, the NEON width) when the prctl call
/// reports an error.
fn sveVectorBytes() usize {
    // PR_SVE_GET_VL reports the vector length in the low 16 bits.
    const pr_sve_get_vl: i32 = 51;
    const raw = linux.prctl(pr_sve_get_vl, 0, 0, 0, 0);

    // A failed call comes back as a negative value when viewed as signed.
    const as_signed: isize = @bitCast(raw);
    return if (as_signed < 0) 16 else raw & 0xFFFF;
}

View File

@@ -0,0 +1,10 @@
/// Look up `key` in the ELF auxiliary vector (set by the kernel at
/// process start) and return its value, or 0 when the key is absent.
///
/// When the build links libc, the lookup goes through libc's own
/// `getauxval`. `std.os.linux.getauxval` reads a vector pointer that is
/// recorded by Zig's libc-free startup code; in a libc-linked program
/// that startup code does not run, so the std lookup would report 0 for
/// every key and all callers would silently see "no features".
/// Without libc the std implementation is used and no libc call is made.
pub inline fn getauxval(key: usize) usize {
    if (comptime @import("builtin").link_libc) {
        return @intCast(@import("std").c.getauxval(@intCast(key)));
    }
    return @import("std").os.linux.getauxval(key);
}
/// Thin forwarder to `std.os.linux.prctl`, i.e. a direct prctl(2)
/// syscall. The raw syscall result is returned unchanged.
pub inline fn prctl(option: i32, a2: usize, a3: usize, a4: usize, a5: usize) usize {
    const sys = @import("std").os.linux;
    return sys.prctl(option, a2, a3, a4, a5);
}

View File

@@ -0,0 +1,26 @@
const builtin = @import("builtin");
const HwyTargets = @import("../targets.zig").Targets;
const linux = @import("linux.zig");
/// Detect LoongArch SIMD targets (LSX, LASX) from AT_HWCAP.
///
/// Only Linux is probed; elsewhere the default target set is returned.
/// The result is the target set packed into an `i64`.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag == .linux) {
        const at_hwcap: usize = 16;
        // Masks from Linux arch/loongarch/include/uapi/asm/hwcap.h
        const lsx_mask: usize = 1 << 4;
        const lasx_mask: usize = 1 << 5;

        const caps = linux.getauxval(at_hwcap);
        const has_lsx = caps & lsx_mask != 0;
        // LASX is only reported on top of LSX.
        const has_lasx = has_lsx and caps & lasx_mask != 0;

        if (has_lsx) targets.lsx = true;
        if (has_lasx) targets.lasx = true;
    }

    return @bitCast(targets);
}

View File

@@ -0,0 +1,43 @@
const builtin = @import("builtin");
const HwyTargets = @import("../targets.zig").Targets;
const linux = @import("linux.zig");
/// Detect POWER8 / POWER9 / POWER10 targets from AT_HWCAP and AT_HWCAP2.
///
/// Only Linux is probed; elsewhere the default target set is returned.
/// Each tier requires the previous one. The result is the target set
/// packed into an `i64`.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag == .linux) {
        const at_hwcap: usize = 16;
        const at_hwcap2: usize = 26;

        // Masks from Linux arch/powerpc/include/uapi/asm/cputable.h
        const altivec: usize = 0x10000000;
        const vsx: usize = 0x00000080;
        const arch_2_07: usize = 0x80000000; // POWER8
        const vec_crypto: usize = 0x02000000;
        const arch_3_00: usize = 0x00800000; // POWER9
        const arch_3_1: usize = 0x00040000; // POWER10
        const mma: usize = 0x00020000;

        const caps = linux.getauxval(at_hwcap);
        const caps2 = linux.getauxval(at_hwcap2);

        const is_power8 = caps & altivec != 0 and
            caps & vsx != 0 and
            caps2 & arch_2_07 != 0 and
            caps2 & vec_crypto != 0;
        const is_power9 = is_power8 and caps2 & arch_3_00 != 0;
        const is_power10 = is_power9 and
            caps2 & arch_3_1 != 0 and
            caps2 & mma != 0;

        if (is_power8) targets.ppc8 = true;
        if (is_power9) targets.ppc9 = true;
        if (is_power10) targets.ppc10 = true;
    }

    return @bitCast(targets);
}

View File

@@ -0,0 +1,22 @@
const builtin = @import("builtin");
const HwyTargets = @import("../targets.zig").Targets;
const linux = @import("linux.zig");
/// Detect the RISC-V vector target (RVV) from AT_HWCAP.
///
/// Only Linux is probed; elsewhere the default target set is returned.
/// The result is the target set packed into an `i64`.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag == .linux) {
        const at_hwcap: usize = 16;
        // Single-letter ISA extensions occupy bit (letter - 'A').
        const v_index = 'V' - 'A';
        const v_mask: usize = 1 << v_index;

        const caps = linux.getauxval(at_hwcap);
        if (caps & v_mask != 0) targets.rvv = true;
    }

    return @bitCast(targets);
}

View File

@@ -0,0 +1,29 @@
const builtin = @import("builtin");
const HwyTargets = @import("../targets.zig").Targets;
const linux = @import("linux.zig");
/// Detect the s390x z14 / z15 targets from AT_HWCAP.
///
/// Only Linux is probed; elsewhere the default target set is returned.
/// z14 needs the base vector facility plus vector enhancements 1; z15
/// additionally needs vector enhancements 2. The result is the target
/// set packed into an `i64`.
pub fn detect() i64 {
    var targets: HwyTargets = .{};

    if (comptime builtin.os.tag == .linux) {
        const at_hwcap: usize = 16;
        // Masks from Linux arch/s390/include/asm/elf.h
        const vx_mask: usize = 1 << 11;
        const vxe_mask: usize = 1 << 13; // z14
        const vxe2_mask: usize = 1 << 15; // z15

        const caps = linux.getauxval(at_hwcap);
        const is_z14 = caps & vx_mask != 0 and caps & vxe_mask != 0;
        const is_z15 = is_z14 and caps & vxe2_mask != 0;

        if (is_z14) targets.z14 = true;
        if (is_z15) targets.z15 = true;
    }

    return @bitCast(targets);
}

View File

@@ -0,0 +1,166 @@
const builtin = @import("builtin");
const HwyTargets = @import("../targets.zig").Targets;
/// The four registers written by one CPUID invocation.
const CpuidResult = struct { eax: u32, ebx: u32, ecx: u32, edx: u32 };
/// Execute the CPUID instruction and return its register outputs.
///
/// `leaf` is loaded into EAX and `subleaf` into ECX before the
/// instruction runs; EAX, EBX, ECX and EDX are read back afterwards.
fn cpuid(leaf: u32, subleaf: u32) CpuidResult {
// Left undefined on purpose: each one is bound as an asm output below.
var eax: u32 = undefined;
var ebx: u32 = undefined;
var ecx: u32 = undefined;
var edx: u32 = undefined;
asm volatile ("cpuid"
: [_] "={eax}" (eax),
[_] "={ebx}" (ebx),
[_] "={ecx}" (ecx),
[_] "={edx}" (edx),
: [_] "{eax}" (leaf),
[_] "{ecx}" (subleaf),
);
return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
}
/// Report whether bit `pos` (0 = least significant) of `val` is set.
inline fn bit(val: u32, comptime pos: u5) bool {
    const mask = @as(u32, 1) << pos;
    return val & mask != 0;
}
/// Detect the Highway x86 targets supported by the running CPU and OS.
///
/// Probes CPUID, and XCR0 via XGETBV when the CPU reports OSXSAVE and
/// AVX, and returns the resulting target set packed into an `i64`.
///
/// AVX2 is only reported when XCR0 shows the OS saves SSE + AVX state.
/// Every AVX-512-based target, including those implied by AVX10.1-512,
/// is only reported when the OS also saves opmask and ZMM state (assumed
/// on Darwin, which enables that state lazily).
pub fn detect() i64 {
    var t: HwyTargets = .{};

    // x86_64 always has SSE2.
    if (comptime builtin.cpu.arch == .x86_64) {
        t.sse2 = true;
    }

    const leaf0 = cpuid(0, 0);
    const max_leaf = leaf0.eax;
    if (max_leaf < 1) return @bitCast(t);

    const leaf1 = cpuid(1, 0);

    // -- SSE2 on 32-bit x86 -------------------------------------------------
    if (comptime builtin.cpu.arch == .x86) {
        if (bit(leaf1.edx, 25) and bit(leaf1.edx, 26)) {
            t.sse2 = true;
        }
    }

    // -- SSSE3 ---------------------------------------------------------------
    if (bit(leaf1.ecx, 0) and // SSE3
        bit(leaf1.ecx, 9)) // SSSE3
    {
        t.ssse3 = true;
    }

    // -- SSE4 ----------------------------------------------------------------
    if (bit(leaf1.ecx, 19) and // SSE4.1
        bit(leaf1.ecx, 20) and // SSE4.2
        bit(leaf1.ecx, 1) and // PCLMUL
        bit(leaf1.ecx, 25)) // AES
    {
        t.sse4 = true;
    }

    // Check OSXSAVE / AVX before reading XCR0: XGETBV is only valid when
    // the OS has enabled XSAVE.
    const has_osxsave = bit(leaf1.ecx, 27);
    const has_avx_bit = bit(leaf1.ecx, 28);
    const xcr0: u32 = if (has_osxsave and has_avx_bit) asm volatile ("xgetbv"
        : [_] "={eax}" (-> u32),
        : [_] "{ecx}" (@as(u32, 0)),
        : .{ .edx = true }) else 0;
    const has_avx_save = (xcr0 & 0x6) == 0x6; // SSE + AVX state

    // Darwin lazily saves AVX-512 context on first use.
    const has_avx512_save = if (comptime builtin.os.tag.isDarwin())
        true
    else
        (xcr0 & 0xE0) == 0xE0; // opmask + zmm_hi256 + hi16_zmm

    // Every remaining target needs CPUID leaf 7 and OS-managed YMM state.
    if (!has_avx_save or max_leaf < 7) return @bitCast(t);

    const leaf7 = cpuid(7, 0);

    // -- AVX2 ----------------------------------------------------------------
    if (bit(leaf7.ebx, 5) and // AVX2
        bit(leaf1.ecx, 12) and // FMA
        bit(leaf1.ecx, 29) and // F16C
        bit(leaf7.ebx, 3) and // BMI
        bit(leaf7.ebx, 8)) // BMI2
    {
        // LZCNT is reported in extended leaf 0x80000001. Confirm that the
        // leaf exists first: querying an unsupported leaf returns
        // unrelated data.
        const max_ext_leaf = cpuid(0x80000000, 0).eax;
        if (max_ext_leaf >= 0x80000001 and
            bit(cpuid(0x80000001, 0).ecx, 5)) // LZCNT
        {
            t.avx2 = true;
        }
    }

    // Everything below enables 512-bit targets, which must not be
    // reported unless the OS preserves opmask / ZMM registers.
    if (!has_avx512_save) return @bitCast(t);

    // -- AVX-512 -------------------------------------------------------------
    if (bit(leaf7.ebx, 16) and // AVX512F
        bit(leaf7.ebx, 31) and // AVX512VL
        bit(leaf7.ebx, 17) and // AVX512DQ
        bit(leaf7.ebx, 30) and // AVX512BW
        bit(leaf7.ebx, 28)) // AVX512CD
    {
        t.avx3 = true;
    }

    if (bit(leaf7.ecx, 11) and // AVX512VNNI
        bit(leaf7.ecx, 10) and // VPCLMULQDQ
        bit(leaf7.ecx, 1) and // AVX512VBMI
        bit(leaf7.ecx, 6) and // AVX512VBMI2
        bit(leaf7.ecx, 9) and // VAES
        bit(leaf7.ecx, 14) and // AVX512VPOPCNTDQ
        bit(leaf7.ecx, 12) and // AVX512BITALG
        bit(leaf7.ecx, 8)) // GFNI
    {
        t.avx3_dl = true;
    }

    // Leaf 7 sub-leaf 1 carries AVX512BF16 and the AVX10 flag. It only
    // exists when leaf 7 reports a maximum sub-leaf of at least 1; without
    // it neither avx3_zen4, avx3_spr nor AVX10 can be established.
    if (leaf7.eax < 1) return @bitCast(t);
    const leaf7_1 = cpuid(7, 1);

    if (t.avx3_dl and bit(leaf7_1.eax, 5)) { // AVX512BF16
        if (isAMD()) {
            t.avx3_zen4 = true;
        }
        if (bit(leaf7.edx, 23)) { // AVX512FP16
            t.avx3_spr = true;
        }
    }

    // -- AVX10 ---------------------------------------------------------------
    if (bit(leaf7_1.edx, 19) and max_leaf >= 0x24) { // AVX10 enumerated
        const leaf24 = cpuid(0x24, 0);
        if (bit(leaf24.ebx, 18)) { // AVX10.1-512
            t.avx3_spr = true;
            t.avx3_dl = true;
            t.avx3 = true;
        }
        // AVX10.2 is intentionally not detected here; left for future work.
    }

    return @bitCast(t);
}
/// Report whether the CPUID vendor string is "AuthenticAMD".
fn isAMD() bool {
    const vendor = cpuid(0, 0);
    // Leaf 0 returns the vendor string split across EBX, EDX, ECX:
    // "Auth" "enti" "cAMD".
    const is_auth = vendor.ebx == 0x68747541;
    const is_enti = vendor.edx == 0x69746e65;
    const is_camd = vendor.ecx == 0x444d4163;
    return is_auth and is_enti and is_camd;
}

View File

@@ -1,257 +0,0 @@
const builtin = @import("builtin");
const std = @import("std");
const Target = std.Target;
const HwyTargets = @import("targets.zig").Targets;
/// Report the Highway targets of the host CPU, based on the CPU feature
/// set that Zig's standard library resolves for the native machine.
///
/// The selection logic follows Highway's own implementation; Zig's
/// detection is used in its place so that access to Apple headers can be
/// controlled strictly (and avoided entirely).
///
/// Returns 0 if the native target cannot be resolved or if the
/// architecture has no detector.
pub export fn ghostty_hwy_detect_targets() callconv(.c) i64 {
    const resolved = std.zig.system.resolveTargetQuery(.{}) catch return 0;
    const cpu = resolved.cpu;

    switch (builtin.cpu.arch) {
        .x86, .x86_64 => return detectX86(cpu),
        .aarch64, .aarch64_be => return detectAarch64(cpu),
        .powerpc, .powerpc64, .powerpc64le => return detectPpc(cpu),
        .s390x => return detectS390x(cpu),
        .riscv32, .riscv64 => return detectRiscv(cpu),
        .loongarch32, .loongarch64 => return detectLoongArch(cpu),
        else => return 0,
    }
}
/// Map the resolved x86 CPU feature set onto Highway targets.
///
/// Each target is enabled only when every feature in its group is
/// present. No explicit XCR0 (OS register-state) check is performed
/// here; the result is the target set packed into an `i64`.
fn detectX86(cpu: Target.Cpu) i64 {
    var targets: HwyTargets = .{};

    // SSE2 is architectural on x86_64 and feature-gated on 32-bit x86.
    if (comptime builtin.cpu.arch == .x86_64) targets.sse2 = true;
    if (comptime builtin.cpu.arch == .x86) {
        const has_sse2 = cpu.has(.x86, .sse) and cpu.has(.x86, .sse2);
        if (has_sse2) targets.sse2 = true;
    }

    const has_ssse3 = cpu.has(.x86, .sse3) and cpu.has(.x86, .ssse3);
    if (has_ssse3) targets.ssse3 = true;

    const has_sse4 = cpu.has(.x86, .sse4_1) and
        cpu.has(.x86, .sse4_2) and
        cpu.has(.x86, .pclmul) and
        cpu.has(.x86, .aes);
    if (has_sse4) targets.sse4 = true;

    const has_avx2 = cpu.has(.x86, .avx) and
        cpu.has(.x86, .avx2) and
        cpu.has(.x86, .lzcnt) and
        cpu.has(.x86, .bmi) and
        cpu.has(.x86, .bmi2) and
        cpu.has(.x86, .fma) and
        cpu.has(.x86, .f16c);
    if (has_avx2) targets.avx2 = true;

    const has_avx3 = cpu.has(.x86, .avx512f) and
        cpu.has(.x86, .avx512vl) and
        cpu.has(.x86, .avx512dq) and
        cpu.has(.x86, .avx512bw) and
        cpu.has(.x86, .avx512cd);
    if (has_avx3) targets.avx3 = true;

    const has_avx3_dl = cpu.has(.x86, .avx512vnni) and
        cpu.has(.x86, .vpclmulqdq) and
        cpu.has(.x86, .avx512vbmi) and
        cpu.has(.x86, .avx512vbmi2) and
        cpu.has(.x86, .vaes) and
        cpu.has(.x86, .avx512vpopcntdq) and
        cpu.has(.x86, .avx512bitalg) and
        cpu.has(.x86, .gfni);
    if (has_avx3_dl) targets.avx3_dl = true;

    // Zen 4: the DL group plus BF16, on an AMD part.
    const has_bf16 = cpu.has(.x86, .avx512bf16);
    if (targets.avx3_dl and has_bf16) {
        if (isAMD()) targets.avx3_zen4 = true;
    }

    // Sapphire Rapids: FP16 together with BF16.
    if (cpu.has(.x86, .avx512fp16) and has_bf16) targets.avx3_spr = true;

    // AVX10: the 512-bit flavour of 10.1 implies the AVX-512 targets;
    // 10.2 has targets of its own.
    if (cpu.has(.x86, .avx10_1_256)) {
        if (cpu.has(.x86, .avx10_1_512)) {
            targets.avx3_spr = true;
            targets.avx3_dl = true;
            targets.avx3 = true;
        }
        if (cpu.has(.x86, .avx10_2_256)) {
            targets.avx10_2 = true;
            if (cpu.has(.x86, .avx10_2_512)) targets.avx10_2_512 = true;
        }
    }

    return @bitCast(targets);
}
/// Map the resolved AArch64 CPU feature set onto Highway targets.
///
/// The AES-less NEON target is always enabled. SVE targets additionally
/// depend on the vector length reported by `sveVectorBytes`. The result
/// is the target set packed into an `i64`.
fn detectAarch64(cpu: Target.Cpu) i64 {
    var targets: HwyTargets = .{};
    targets.neon_without_aes = true;

    // NEON with AES, and on top of that the FP16 + DotProd + BF16 tier.
    if (cpu.has(.aarch64, .aes)) {
        targets.neon = true;
        const has_bf16_tier = cpu.has(.aarch64, .fullfp16) and
            cpu.has(.aarch64, .dotprod) and
            cpu.has(.aarch64, .bf16);
        if (has_bf16_tier) targets.neon_bf16 = true;
    }

    // Without SVE there is nothing further to decide.
    if (!cpu.has(.aarch64, .sve)) return @bitCast(targets);

    const vec_bytes = sveVectorBytes();
    const is_wide = vec_bytes >= 32;
    if (is_wide) {
        targets.sve = true;
        if (vec_bytes == 32) targets.sve_256 = true;
    }

    if (cpu.has(.aarch64, .sve2) and cpu.has(.aarch64, .sve2_aes)) {
        if (is_wide) {
            targets.sve2 = true;
        } else if (vec_bytes == 16) {
            targets.sve2_128 = true;
        }
    }

    return @bitCast(targets);
}
/// Return the SVE vector length in bytes.
///
/// Yields 16 (128-bit, the NEON width) on non-Linux systems and when the
/// prctl call fails.
fn sveVectorBytes() usize {
    const fallback: usize = 16;
    if (comptime builtin.os.tag != .linux) return fallback;

    // PR_SVE_GET_VL reports the vector length in the low 16 bits.
    const PR_SVE_GET_VL = 51;
    const raw = std.os.linux.prctl(PR_SVE_GET_VL, 0, 0, 0, 0);

    // A failed call comes back as a negative value when viewed as signed.
    const as_signed: isize = @bitCast(raw);
    return if (as_signed >= 0) raw & 0xFFFF else fallback;
}
/// Map the resolved PowerPC CPU feature set onto the POWER8 / POWER9 /
/// POWER10 Highway targets. Each tier requires the previous one. The
/// result is the target set packed into an `i64`.
fn detectPpc(cpu: Target.Cpu) i64 {
    var targets: HwyTargets = .{};

    const is_power8 = cpu.has(.powerpc, .altivec) and
        cpu.has(.powerpc, .vsx) and
        cpu.has(.powerpc, .power8_vector) and
        cpu.has(.powerpc, .crypto);
    const is_power9 = is_power8 and cpu.has(.powerpc, .power9_vector);
    const is_power10 = is_power9 and
        cpu.has(.powerpc, .power10_vector) and
        cpu.has(.powerpc, .mma);

    if (is_power8) targets.ppc8 = true;
    if (is_power9) targets.ppc9 = true;
    if (is_power10) targets.ppc10 = true;

    return @bitCast(targets);
}
/// Map the resolved s390x CPU feature set onto the z14 / z15 Highway
/// targets. z14 needs the vector facility plus vector enhancements 1;
/// z15 additionally needs vector enhancements 2. The result is the
/// target set packed into an `i64`.
fn detectS390x(cpu: Target.Cpu) i64 {
    var targets: HwyTargets = .{};

    const is_z14 = cpu.has(.s390x, .vector) and
        cpu.has(.s390x, .vector_enhancements_1);
    const is_z15 = is_z14 and cpu.has(.s390x, .vector_enhancements_2);

    if (is_z14) targets.z14 = true;
    if (is_z15) targets.z15 = true;

    return @bitCast(targets);
}
/// Enable the RVV Highway target when the resolved RISC-V CPU has the
/// `v` (vector) feature. The result is the target set packed into an
/// `i64`.
fn detectRiscv(cpu: Target.Cpu) i64 {
    var targets: HwyTargets = .{};
    const has_vector = cpu.has(.riscv, .v);
    if (has_vector) targets.rvv = true;
    return @bitCast(targets);
}
/// Map the resolved LoongArch CPU feature set onto the LSX / LASX
/// Highway targets. LASX is only reported on top of LSX. The result is
/// the target set packed into an `i64`.
fn detectLoongArch(cpu: Target.Cpu) i64 {
    var targets: HwyTargets = .{};

    const has_lsx = cpu.has(.loongarch, .lsx);
    const has_lasx = has_lsx and cpu.has(.loongarch, .lasx);

    if (has_lsx) targets.lsx = true;
    if (has_lasx) targets.lasx = true;

    return @bitCast(targets);
}
/// Check CPUID vendor string for "AuthenticAMD", matching Highway's IsAMD().
/// Zig doesn't expose the vendor string, so we must use inline assembly.
///
/// Returns true only when all three vendor-string registers match.
fn isAMD() bool {
// Left undefined on purpose: each one is bound as an asm output below.
var eax: u32 = undefined;
var ebx: u32 = undefined;
var ecx: u32 = undefined;
var edx: u32 = undefined;
// CPUID leaf 0: EAX is loaded with 0 and all four registers are read back.
asm volatile ("cpuid"
: [_] "={eax}" (eax),
[_] "={ebx}" (ebx),
[_] "={ecx}" (ecx),
[_] "={edx}" (edx),
: [_] "{eax}" (0),
);
// "Auth" "enti" "cAMD"
// The string is spread over EBX, EDX, ECX in that order, so ECX holds
// the last four characters.
return ebx == 0x68747541 and
ecx == 0x444d4163 and
edx == 0x69746e65;
}