diff --git a/pkg/highway/build.zig b/pkg/highway/build.zig index 7f9304d91..64c0e8869 100644 --- a/pkg/highway/build.zig +++ b/pkg/highway/build.zig @@ -15,7 +15,7 @@ pub fn build(b: *std.Build) !void { const lib = b.addLibrary(.{ .name = "highway", .root_module = b.createModule(.{ - .root_source_file = b.path("src/runtime_detect.zig"), + .root_source_file = b.path("src/detect.zig"), .target = target, .optimize = optimize, }), diff --git a/pkg/highway/src/detect.zig b/pkg/highway/src/detect.zig new file mode 100644 index 000000000..471314d94 --- /dev/null +++ b/pkg/highway/src/detect.zig @@ -0,0 +1,49 @@ +const builtin = @import("builtin"); +const HwyTargets = @import("targets.zig").Targets; + +const x86 = @import("detect/x86.zig"); +const aarch64_darwin = @import("detect/aarch64_darwin.zig"); +const aarch64_linux = @import("detect/aarch64_linux.zig"); +const ppc = @import("detect/ppc.zig"); +const s390x = @import("detect/s390x.zig"); +const riscv = @import("detect/riscv.zig"); +const loongarch = @import("detect/loongarch.zig"); + +/// Detect Highway targets at runtime using minimal, direct CPU feature +/// probing. +/// +/// Previous versions called std.zig.system.resolveTargetQuery, which +/// drags in the full Zig target/CPU model tables for every architecture, +/// bloating the binary by ~300 KB and causing code-layout regressions in +/// unrelated hot paths (icache / branch-predictor pressure). +/// +/// This version uses only inline assembly (CPUID and XGETBV on x86) +/// and lightweight OS queries (sysctlbyname on Darwin; getauxval and +/// prctl on Linux), so it adds no data tables and no std.Target dependency.
+pub export fn ghostty_hwy_detect_targets() callconv(.c) i64 {
+    // The architecture is fixed at build time, so exactly one prong
+    // applies to any given binary.
+    return switch (builtin.cpu.arch) {
+        .x86, .x86_64 => x86.detect(),
+        .aarch64, .aarch64_be => detectAarch64(),
+        .powerpc, .powerpc64, .powerpc64le => ppc.detect(),
+        .riscv32, .riscv64 => riscv.detect(),
+        .s390x => s390x.detect(),
+        .loongarch32, .loongarch64 => loongarch.detect(),
+        // No probe exists for this architecture: report no targets.
+        else => 0,
+    };
+}
+
+/// AArch64: seed the NEON baseline, then defer to the OS-specific probe.
+fn detectAarch64() i64 {
+    // Every AArch64 implementation has NEON.
+    var targets: HwyTargets = .{};
+    targets.neon_without_aes = true;
+
+    if (comptime builtin.os.tag.isDarwin()) return aarch64_darwin.detect(&targets);
+    if (comptime builtin.os.tag == .linux) return aarch64_linux.detect(&targets);
+
+    // No probe for other operating systems: baseline NEON only.
+    return @bitCast(targets);
+}
diff --git a/pkg/highway/src/detect/aarch64_darwin.zig b/pkg/highway/src/detect/aarch64_darwin.zig
new file mode 100644
index 000000000..f69edb441
--- /dev/null
+++ b/pkg/highway/src/detect/aarch64_darwin.zig
@@ -0,0 +1,31 @@
+const HwyTargets = @import("../targets.zig").Targets;
+
+/// Darwin/AArch64 probe: refines `t` in place and returns its bit pattern.
+pub fn detect(t: *HwyTargets) i64 {
+    // AES is available on all Apple Silicon, so full NEON is unconditional.
+    t.neon = true;
+
+    // Only BF16 is probed; FP16 and DotProd are taken as given on any
+    // Apple chip that reports FEAT_BF16.
+    if (darwinSysctlBool("hw.optional.arm.FEAT_BF16")) t.neon_bf16 = true;
+
+    // No SVE target is ever set here: Apple Silicon does not support SVE.
+    return @bitCast(t.*);
+}
+
+/// Reads the integer sysctl `name`; true only if the call succeeds and the value is non-zero.
+fn darwinSysctlBool(comptime name: [:0]const u8) bool {
+    var flag: c_int = 0;
+    var flag_size: usize = @sizeOf(c_int);
+    if (sysctlbyname(name.ptr, &flag, &flag_size, null, 0) != 0) return false;
+    return flag != 0;
+}
+
+// We can rely on libc for macOS because libsystem is always available.
+extern "c" fn sysctlbyname(
+    name: [*:0]const u8,
+    oldp: ?*anyopaque,
+    oldlenp: ?*usize,
+    newp: ?*const anyopaque,
+    newlen: usize,
+) c_int;
diff --git a/pkg/highway/src/detect/aarch64_linux.zig b/pkg/highway/src/detect/aarch64_linux.zig
new file mode 100644
index 000000000..a4e74e674
--- /dev/null
+++ b/pkg/highway/src/detect/aarch64_linux.zig
@@ -0,0 +1,69 @@
+const HwyTargets = @import("../targets.zig").Targets;
+const linux = @import("linux.zig");
+
+/// Linux/AArch64 probe: refines `t` in place from the AT_HWCAP and
+/// AT_HWCAP2 auxiliary-vector words and returns its bit pattern.
+pub fn detect(t: *HwyTargets) i64 {
+    // Linux exposes AArch64 features via getauxval(AT_HWCAP / AT_HWCAP2).
+    const AT_HWCAP: usize = 16;
+    const AT_HWCAP2: usize = 26;
+
+    const hwcap = linux.getauxval(AT_HWCAP);
+    const hwcap2 = linux.getauxval(AT_HWCAP2);
+
+    // Bit positions from Linux UAPI asm/hwcap.h
+    const HWCAP_AES: usize = 1 << 3;
+    const HWCAP_FPHP: usize = 1 << 9; // FEAT_FP16
+    const HWCAP_ASIMDDP: usize = 1 << 20; // DotProd
+    const HWCAP_SVE: usize = 1 << 22;
+
+    const HWCAP2_BF16: usize = 1 << 14;
+    const HWCAP2_SVE2: usize = 1 << 1;
+    const HWCAP2_SVEAES: usize = 1 << 2;
+
+    if (hwcap & HWCAP_AES != 0) {
+        t.neon = true;
+
+        if (hwcap & HWCAP_FPHP != 0 and
+            hwcap & HWCAP_ASIMDDP != 0 and
+            hwcap2 & HWCAP2_BF16 != 0)
+        {
+            t.neon_bf16 = true;
+        }
+    }
+
+    if (hwcap & HWCAP_SVE != 0) {
+        const vec_bytes = sveVectorBytes();
+
+        if (vec_bytes >= 32) {
+            t.sve = true;
+            if (vec_bytes == 32) {
+                t.sve_256 = true;
+            }
+        }
+
+        if (hwcap2 & HWCAP2_SVE2 != 0 and hwcap2 & HWCAP2_SVEAES != 0) {
+            if (vec_bytes >= 32) {
+                t.sve2 = true;
+            } else if (vec_bytes == 16) {
+                t.sve2_128 = true;
+            }
+        }
+    }
+
+    return @bitCast(t.*);
+}
+
+/// Returns the current SVE vector length in bytes, or 16 if the
+/// PR_SVE_GET_VL prctl fails.
+fn sveVectorBytes() usize {
+    // PR_SVE_GET_VL returns the SVE vector length in the lower 16 bits.
+    const PR_SVE_GET_VL: i32 = 51;
+    const ret = linux.prctl(PR_SVE_GET_VL, 0, 0, 0, 0);
+    const signed: isize = @bitCast(ret);
+    if (signed >= 0) {
+        return ret & 0xFFFF;
+    }
+    // prctl failed: assume 128-bit (NEON-width, conservative).
+    return 16;
+}
diff --git a/pkg/highway/src/detect/linux.zig b/pkg/highway/src/detect/linux.zig
new file mode 100644
index 000000000..951cf3e6b
--- /dev/null
+++ b/pkg/highway/src/detect/linux.zig
@@ -0,0 +1,12 @@
+/// Looks up `key` in the ELF auxiliary vector (set by the kernel at
+/// process start) by forwarding to `std.os.linux.getauxval`.
+/// NOTE(review): assumed not to call into libc; that depends on the
+/// std implementation in use — confirm.
+pub inline fn getauxval(key: usize) usize {
+    return @import("std").os.linux.getauxval(key);
+}
+
+/// Direct syscall wrapper for prctl(2).
+pub inline fn prctl(option: i32, a2: usize, a3: usize, a4: usize, a5: usize) usize {
+    return @import("std").os.linux.prctl(option, a2, a3, a4, a5);
+}
diff --git a/pkg/highway/src/detect/loongarch.zig b/pkg/highway/src/detect/loongarch.zig
new file mode 100644
index 000000000..686d11e62
--- /dev/null
+++ b/pkg/highway/src/detect/loongarch.zig
@@ -0,0 +1,28 @@
+const builtin = @import("builtin");
+const HwyTargets = @import("../targets.zig").Targets;
+const linux = @import("linux.zig");
+
+/// Linux/LoongArch probe: maps the LSX and LASX bits of AT_HWCAP to
+/// Highway targets. Reports no targets on other operating systems.
+pub fn detect() i64 {
+    var t: HwyTargets = .{};
+
+    if (comptime builtin.os.tag != .linux) return @bitCast(t);
+
+    const AT_HWCAP: usize = 16;
+    const hwcap = linux.getauxval(AT_HWCAP);
+
+    // From Linux arch/loongarch/include/uapi/asm/hwcap.h
+    const HWCAP_LSX: usize = 1 << 4;
+    const HWCAP_LASX: usize = 1 << 5;
+
+    if (hwcap & HWCAP_LSX != 0) {
+        t.lsx = true;
+
+        if (hwcap & HWCAP_LASX != 0) {
+            t.lasx = true;
+        }
+    }
+
+    return @bitCast(t);
+}
diff --git a/pkg/highway/src/detect/ppc.zig b/pkg/highway/src/detect/ppc.zig
new file mode 100644
index 000000000..587965ce8
--- /dev/null
+++ b/pkg/highway/src/detect/ppc.zig
@@ -0,0 +1,45 @@
+const builtin = @import("builtin");
+const HwyTargets = @import("../targets.zig").Targets;
+const linux = @import("linux.zig");
+
+/// Linux/PowerPC probe: maps AT_HWCAP / AT_HWCAP2 feature bits to the ppc8,
+/// ppc9 and ppc10 targets. Reports no targets on other operating systems.
+pub fn detect() i64 {
+    var t: HwyTargets = .{};
+
+    if (comptime builtin.os.tag != .linux) return @bitCast(t);
+
+    const AT_HWCAP: usize = 16;
+    const AT_HWCAP2: usize = 26;
+    const hwcap = linux.getauxval(AT_HWCAP);
+    const hwcap2 = linux.getauxval(AT_HWCAP2);
+
+    // From Linux arch/powerpc/include/uapi/asm/cputable.h
+    const PPC_FEATURE_HAS_ALTIVEC: usize = 0x10000000;
+    const PPC_FEATURE_HAS_VSX: usize = 0x00000080;
+    const PPC_FEATURE2_ARCH_2_07: usize = 0x80000000; // POWER8
+    const PPC_FEATURE2_VEC_CRYPTO: usize = 0x02000000;
+    const PPC_FEATURE2_ARCH_3_00: usize = 0x00800000; // POWER9
+    const PPC_FEATURE2_ARCH_3_1: usize = 0x00040000; // POWER10
+    const PPC_FEATURE2_MMA: usize = 0x00020000;
+
+    if (hwcap & PPC_FEATURE_HAS_ALTIVEC != 0 and
+        hwcap & PPC_FEATURE_HAS_VSX != 0 and
+        hwcap2 & PPC_FEATURE2_ARCH_2_07 != 0 and
+        hwcap2 & PPC_FEATURE2_VEC_CRYPTO != 0)
+    {
+        t.ppc8 = true;
+
+        if (hwcap2 & PPC_FEATURE2_ARCH_3_00 != 0) {
+            t.ppc9 = true;
+
+            if (hwcap2 & PPC_FEATURE2_ARCH_3_1 != 0 and
+                hwcap2 & PPC_FEATURE2_MMA != 0)
+            {
+                t.ppc10 = true;
+            }
+        }
+    }
+
+    return @bitCast(t);
+}
diff --git a/pkg/highway/src/detect/riscv.zig b/pkg/highway/src/detect/riscv.zig
new file mode 100644
index 000000000..619d12faa
--- /dev/null
+++ b/pkg/highway/src/detect/riscv.zig
@@ -0,0 +1,24 @@
+const builtin = @import("builtin");
+const HwyTargets = @import("../targets.zig").Targets;
+const linux = @import("linux.zig");
+
+/// Linux/RISC-V probe: reports the rvv target when AT_HWCAP has the 'V'
+/// bit set. Reports no targets on other operating systems.
+pub fn detect() i64 {
+    var t: HwyTargets = .{};
+
+    if (comptime builtin.os.tag != .linux) return @bitCast(t);
+
+    const AT_HWCAP: usize = 16;
+    const hwcap = linux.getauxval(AT_HWCAP);
+
+    // ISA extension bit for 'V' (vector).
+    // Letter-based bits: bit position = letter - 'A'.
+    const HWCAP_V: usize = 1 << ('V' - 'A');
+
+    if (hwcap & HWCAP_V != 0) {
+        t.rvv = true;
+    }
+
+    return @bitCast(t);
+}
diff --git a/pkg/highway/src/detect/s390x.zig b/pkg/highway/src/detect/s390x.zig
new file mode 100644
index 000000000..90d2ae3d5
--- /dev/null
+++ b/pkg/highway/src/detect/s390x.zig
@@ -0,0 +1,31 @@
+const builtin = @import("builtin");
+const HwyTargets = @import("../targets.zig").Targets;
+const linux = @import("linux.zig");
+
+/// Linux/s390x probe: maps the vector-facility bits of AT_HWCAP to the
+/// z14 and z15 targets. Reports no targets on other operating systems.
+pub fn detect() i64 {
+    var t: HwyTargets = .{};
+
+    if (comptime builtin.os.tag != .linux) return @bitCast(t);
+
+    const AT_HWCAP: usize = 16;
+    const hwcap = linux.getauxval(AT_HWCAP);
+
+    // From Linux arch/s390/include/asm/elf.h
+    const HWCAP_VX: usize = 1 << 11;
+    const HWCAP_VXE: usize = 1 << 13; // z14
+    const HWCAP_VXE2: usize = 1 << 15; // z15
+
+    if (hwcap & HWCAP_VX != 0) {
+        if (hwcap & HWCAP_VXE != 0) {
+            t.z14 = true;
+
+            if (hwcap & HWCAP_VXE2 != 0) {
+                t.z15 = true;
+            }
+        }
+    }
+
+    return @bitCast(t);
+}
diff --git a/pkg/highway/src/detect/x86.zig b/pkg/highway/src/detect/x86.zig
new file mode 100644
index 000000000..bdfd3f5d2
--- /dev/null
+++ b/pkg/highway/src/detect/x86.zig
@@ -0,0 +1,164 @@
+const builtin = @import("builtin");
+const HwyTargets = @import("../targets.zig").Targets;
+
+/// Register values returned by a single CPUID invocation.
+const CpuidResult = struct { eax: u32, ebx: u32, ecx: u32, edx: u32 };
+
+/// Executes CPUID with EAX = `leaf` and ECX = `subleaf`.
+fn cpuid(leaf: u32, subleaf: u32) CpuidResult {
+    var eax: u32 = undefined;
+    var ebx: u32 = undefined;
+    var ecx: u32 = undefined;
+    var edx: u32 = undefined;
+    asm volatile ("cpuid"
+        : [_] "={eax}" (eax),
+          [_] "={ebx}" (ebx),
+          [_] "={ecx}" (ecx),
+          [_] "={edx}" (edx),
+        : [_] "{eax}" (leaf),
+          [_] "{ecx}" (subleaf),
+    );
+    return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx };
+}
+
+/// Returns whether bit `pos` of `val` is set.
+inline fn bit(val: u32, comptime pos: u5) bool {
+    return (val >> pos) & 1 != 0;
+}
+
+/// Probes CPUID, and XCR0 via XGETBV, and returns the Highway target
+/// bits supported by both the CPU and the OS.
+pub fn detect() i64 {
+    var t: HwyTargets = .{};
+
+    // x86_64 always has SSE2.
+    if (comptime builtin.cpu.arch == .x86_64) {
+        t.sse2 = true;
+    }
+
+    const leaf0 = cpuid(0, 0);
+    const max_leaf = leaf0.eax;
+    if (max_leaf < 1) return @bitCast(t);
+
+    const leaf1 = cpuid(1, 0);
+
+    // -- SSE2 on 32-bit x86 -------------------------------------------------
+    if (comptime builtin.cpu.arch == .x86) {
+        if (bit(leaf1.edx, 25) and bit(leaf1.edx, 26)) {
+            t.sse2 = true;
+        }
+    }
+
+    // -- SSSE3 ---------------------------------------------------------------
+    if (bit(leaf1.ecx, 0) and // SSE3
+        bit(leaf1.ecx, 9)) // SSSE3
+    {
+        t.ssse3 = true;
+    }
+
+    // -- SSE4 ----------------------------------------------------------------
+    if (bit(leaf1.ecx, 19) and // SSE4.1
+        bit(leaf1.ecx, 20) and // SSE4.2
+        bit(leaf1.ecx, 1) and // PCLMUL
+        bit(leaf1.ecx, 25)) // AES
+    {
+        t.sse4 = true;
+    }
+
+    // XGETBV is only executable once the OS has set OSXSAVE (leaf 1
+    // ECX bit 27); XCR0 then says which register state the OS manages.
+    const has_osxsave = bit(leaf1.ecx, 27);
+    const has_avx_bit = bit(leaf1.ecx, 28);
+    const xcr0: u32 = if (has_osxsave and has_avx_bit) asm volatile ("xgetbv"
+        : [_] "={eax}" (-> u32),
+        : [_] "{ecx}" (@as(u32, 0)),
+        : .{ .edx = true }) else 0;
+    const has_avx_save = (xcr0 & 0x6) == 0x6; // SSE + AVX state
+
+    // Darwin lazily saves AVX-512 context on first use.
+    const has_avx512_save = if (comptime builtin.os.tag.isDarwin())
+        true
+    else
+        (xcr0 & 0xE0) == 0xE0; // opmask + zmm_hi256 + hi16_zmm
+
+    // Every remaining target needs OS-managed AVX state and CPUID leaf 7.
+    if (!has_avx_save or max_leaf < 7) return @bitCast(t);
+    const leaf7 = cpuid(7, 0);
+
+    // -- AVX2 ----------------------------------------------------------------
+    if (bit(leaf7.ebx, 5) and // AVX2
+        bit(leaf1.ecx, 12) and // FMA
+        bit(leaf1.ecx, 29)) // F16C
+    {
+        // Also need LZCNT (extended leaf), BMI, BMI2.
+        const leaf_ext = cpuid(0x80000001, 0);
+        if (bit(leaf_ext.ecx, 5) and // LZCNT
+            bit(leaf7.ebx, 3) and // BMI
+            bit(leaf7.ebx, 8)) // BMI2
+        {
+            t.avx2 = true;
+        }
+    }
+
+    // The AVX-512 and AVX10 targets use opmask/ZMM registers, so none of
+    // them may be reported unless the OS manages that state as well.
+    if (!has_avx512_save) return @bitCast(t);
+
+    // Leaf 7 sub-leaf 1 exists only if sub-leaf 0 reports it in EAX;
+    // otherwise treat all of its feature bits as clear.
+    const leaf7_1: CpuidResult = if (leaf7.eax >= 1)
+        cpuid(7, 1)
+    else
+        .{ .eax = 0, .ebx = 0, .ecx = 0, .edx = 0 };
+
+    // -- AVX-512 -------------------------------------------------------------
+    if (bit(leaf7.ebx, 16) and // AVX512F
+        bit(leaf7.ebx, 31) and // AVX512VL
+        bit(leaf7.ebx, 17) and // AVX512DQ
+        bit(leaf7.ebx, 30) and // AVX512BW
+        bit(leaf7.ebx, 28)) // AVX512CD
+    {
+        t.avx3 = true;
+    }
+
+    if (bit(leaf7.ecx, 11) and // AVX512VNNI
+        bit(leaf7.ecx, 10) and // VPCLMULQDQ (AVX save ok)
+        bit(leaf7.ecx, 1) and // AVX512VBMI
+        bit(leaf7.ecx, 6) and // AVX512VBMI2
+        bit(leaf7.ecx, 9) and // VAES (AVX save ok)
+        bit(leaf7.ecx, 14) and // AVX512VPOPCNTDQ
+        bit(leaf7.ecx, 12) and // AVX512BITALG
+        bit(leaf7.ecx, 8)) // GFNI
+    {
+        t.avx3_dl = true;
+    }
+
+    if (t.avx3_dl and bit(leaf7_1.eax, 5)) { // AVX512BF16
+        if (isAMD()) t.avx3_zen4 = true;
+        if (bit(leaf7.edx, 23)) t.avx3_spr = true; // AVX512FP16
+    }
+
+    // -- AVX10 ---------------------------------------------------------------
+    if (bit(leaf7_1.edx, 19) and max_leaf >= 0x24) { // AVX10.1-256
+        const leaf24 = cpuid(0x24, 0);
+        if (bit(leaf24.ebx, 18)) { // AVX10.1-512
+            t.avx3_spr = true;
+            t.avx3_dl = true;
+            t.avx3 = true;
+        }
+
+        // AVX10.2 detection would require a leaf we can't
+        // reliably check yet; leave for future.
+    }
+
+    return @bitCast(t);
+}
+
+/// Returns whether the CPUID vendor string is "AuthenticAMD".
+fn isAMD() bool {
+    const leaf0 = cpuid(0, 0);
+    // "Auth" "enti" "cAMD"
+    return leaf0.ebx == 0x68747541 and
+        leaf0.ecx == 0x444d4163 and
+        leaf0.edx == 0x69746e65;
+}
diff --git a/pkg/highway/src/runtime_detect.zig b/pkg/highway/src/runtime_detect.zig deleted file mode 100644 index 25554a44d..000000000 --- a/pkg/highway/src/runtime_detect.zig +++ /dev/null @@ -1,257 +0,0 @@ -const builtin = @import("builtin"); -const std = @import("std"); -const Target = std.Target; -const HwyTargets = @import("targets.zig").Targets; - -/// Detect Highway targets using Zig's standard library CPU feature detection. -/// -/// The logic is mostly identical to the Highway implementation, but we -/// use Zig's built-in CPU feature detection instead of Highway so that we -/// can strictly control access to Apple headers (and avoid them completely). -pub export fn ghostty_hwy_detect_targets() callconv(.c) i64 { - const native = std.zig.system.resolveTargetQuery(.{}) catch return 0; - const cpu = native.cpu; - - return switch (builtin.cpu.arch) { - .x86_64, .x86 => detectX86(cpu), - .aarch64, .aarch64_be => detectAarch64(cpu), - .powerpc, .powerpc64, .powerpc64le => detectPpc(cpu), - .s390x => detectS390x(cpu), - .riscv32, .riscv64 => detectRiscv(cpu), - .loongarch32, .loongarch64 => detectLoongArch(cpu), - else => 0, - }; -} - -fn detectX86(cpu: Target.Cpu) i64 { - var t: HwyTargets = .{}; - - if (comptime builtin.cpu.arch == .x86_64) { - t.sse2 = true; - } - - if (comptime builtin.cpu.arch == .x86) { - if (cpu.has(.x86, .sse) and - cpu.has(.x86, .sse2)) - { - t.sse2 = true; - } - } - - if (cpu.has(.x86, .sse3) and - cpu.has(.x86, .ssse3)) - { - t.ssse3 = true; - } - - if (cpu.has(.x86, .sse4_1) and - cpu.has(.x86, .sse4_2) and - cpu.has(.x86, .pclmul) and - cpu.has(.x86, .aes)) - { - t.sse4 = true; - } - - if (cpu.has(.x86, .avx) and - cpu.has(.x86, .avx2) and - cpu.has(.x86, .lzcnt) and - cpu.has(.x86, .bmi) and - cpu.has(.x86, .bmi2) and -
cpu.has(.x86, .fma) and - cpu.has(.x86, .f16c)) - { - t.avx2 = true; - } - - if (cpu.has(.x86, .avx512f) and - cpu.has(.x86, .avx512vl) and - cpu.has(.x86, .avx512dq) and - cpu.has(.x86, .avx512bw) and - cpu.has(.x86, .avx512cd)) - { - t.avx3 = true; - } - - if (cpu.has(.x86, .avx512vnni) and - cpu.has(.x86, .vpclmulqdq) and - cpu.has(.x86, .avx512vbmi) and - cpu.has(.x86, .avx512vbmi2) and - cpu.has(.x86, .vaes) and - cpu.has(.x86, .avx512vpopcntdq) and - cpu.has(.x86, .avx512bitalg) and - cpu.has(.x86, .gfni)) - { - t.avx3_dl = true; - } - - if (t.avx3_dl and cpu.has(.x86, .avx512bf16)) { - if (isAMD()) { - t.avx3_zen4 = true; - } - } - - if (cpu.has(.x86, .avx512fp16) and - cpu.has(.x86, .avx512bf16)) - { - t.avx3_spr = true; - } - - if (cpu.has(.x86, .avx10_1_256)) { - if (cpu.has(.x86, .avx10_1_512)) { - t.avx3_spr = true; - t.avx3_dl = true; - t.avx3 = true; - } - - if (cpu.has(.x86, .avx10_2_256)) { - t.avx10_2 = true; - if (cpu.has(.x86, .avx10_2_512)) { - t.avx10_2_512 = true; - } - } - } - - // On Darwin the kernel lazily saves AVX512 context on first use, so no - // explicit XCR0 check is required. On Linux, Zig's feature detection - // reads the kernel-provided auxiliary vector (getauxval) which already - // reflects OS-level XSAVE support. 
- - return @bitCast(t); -} - -fn detectAarch64(cpu: Target.Cpu) i64 { - var t: HwyTargets = .{}; - - t.neon_without_aes = true; - - if (cpu.has(.aarch64, .aes)) { - t.neon = true; - - if (cpu.has(.aarch64, .fullfp16) and - cpu.has(.aarch64, .dotprod) and - cpu.has(.aarch64, .bf16)) - { - t.neon_bf16 = true; - } - } - - if (cpu.has(.aarch64, .sve)) { - const vec_bytes = sveVectorBytes(); - - if (vec_bytes >= 32) { - t.sve = true; - if (vec_bytes == 32) { - t.sve_256 = true; - } - } - - if (cpu.has(.aarch64, .sve2) and cpu.has(.aarch64, .sve2_aes)) { - if (vec_bytes >= 32) { - t.sve2 = true; - } else if (vec_bytes == 16) { - t.sve2_128 = true; - } - } - } - - return @bitCast(t); -} - -fn sveVectorBytes() usize { - if (comptime builtin.os.tag == .linux) { - // PR_SVE_GET_VL returns the SVE vector length in the lower 16 bits. - const PR_SVE_GET_VL = 51; - const ret = std.os.linux.prctl(PR_SVE_GET_VL, 0, 0, 0, 0); - const signed: isize = @bitCast(ret); - if (signed >= 0) { - return ret & 0xFFFF; - } - } - // Non-Linux or prctl failed: assume 128-bit (NEON-width, conservative). 
- return 16; -} - -fn detectPpc(cpu: Target.Cpu) i64 { - var t: HwyTargets = .{}; - - if (cpu.has(.powerpc, .altivec) and - cpu.has(.powerpc, .vsx) and - cpu.has(.powerpc, .power8_vector) and - cpu.has(.powerpc, .crypto)) - { - t.ppc8 = true; - - if (cpu.has(.powerpc, .power9_vector)) { - t.ppc9 = true; - - if (cpu.has(.powerpc, .power10_vector) and - cpu.has(.powerpc, .mma)) - { - t.ppc10 = true; - } - } - } - - return @bitCast(t); -} - -fn detectS390x(cpu: Target.Cpu) i64 { - var t: HwyTargets = .{}; - - if (cpu.has(.s390x, .vector)) { - if (cpu.has(.s390x, .vector_enhancements_1)) { - t.z14 = true; - - if (cpu.has(.s390x, .vector_enhancements_2)) { - t.z15 = true; - } - } - } - - return @bitCast(t); -} - -fn detectRiscv(cpu: Target.Cpu) i64 { - var t: HwyTargets = .{}; - - if (cpu.has(.riscv, .v)) { - t.rvv = true; - } - - return @bitCast(t); -} - -fn detectLoongArch(cpu: Target.Cpu) i64 { - var t: HwyTargets = .{}; - - if (cpu.has(.loongarch, .lsx)) { - t.lsx = true; - - if (cpu.has(.loongarch, .lasx)) { - t.lasx = true; - } - } - - return @bitCast(t); -} - -/// Check CPUID vendor string for "AuthenticAMD", matching Highway's IsAMD(). -/// Zig doesn't expose the vendor string, so we must use inline assembly. -fn isAMD() bool { - var eax: u32 = undefined; - var ebx: u32 = undefined; - var ecx: u32 = undefined; - var edx: u32 = undefined; - asm volatile ("cpuid" - : [_] "={eax}" (eax), - [_] "={ebx}" (ebx), - [_] "={ecx}" (ecx), - [_] "={edx}" (edx), - : [_] "{eax}" (0), - ); - - // "Auth" "enti" "cAMD" - return ebx == 0x68747541 and - ecx == 0x444d4163 and - edx == 0x69746e65; -}