diff --git a/pkg/highway/bridge.cpp b/pkg/highway/bridge.cpp deleted file mode 100644 index 8f607f3e6..000000000 --- a/pkg/highway/bridge.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -namespace hwy { -namespace { - -// Highway's upstream abort.cc pulls in libc++ even when the rest of the -// library is compiled with HWY_NO_LIBCXX. Ghostty only needs Highway's dynamic -// dispatch/runtime target selection, so we provide the tiny Warn/Abort surface -// that targets.cc/per_target.cc expect and keep the package free of libc++. -WarnFunc g_warn_func = nullptr; -AbortFunc g_abort_func = nullptr; - -// Mirror the upstream behavior closely enough for Highway's internal callers: -// format into a fixed buffer, fall back to a generic error if formatting fails, -// and then dispatch to either the registered hook or stderr. -void format_message(const char* format, va_list args, char* buffer, size_t size) { - const int written = vsnprintf(buffer, size, format, args); - if (written < 0) { - snprintf(buffer, size, "%s", "failed to format highway message"); - } -} - -} // namespace - -WarnFunc& GetWarnFunc() { - return g_warn_func; -} - -AbortFunc& GetAbortFunc() { - return g_abort_func; -} - -WarnFunc SetWarnFunc(WarnFunc func) { - // Highway documents these setters as thread-safe. Using the compiler builtin - // keeps that guarantee without depending on std::atomic. - return __atomic_exchange_n(&g_warn_func, func, __ATOMIC_SEQ_CST); -} - -AbortFunc SetAbortFunc(AbortFunc func) { - return __atomic_exchange_n(&g_abort_func, func, __ATOMIC_SEQ_CST); -} - -void Warn(const char* file, int line, const char* format, ...) { - char message[1024]; - va_list args; - va_start(args, format); - format_message(format, args, message, sizeof(message)); - va_end(args); - - if (WarnFunc func = g_warn_func) { - func(file, line, message); - return; - } - - fprintf(stderr, "%s:%d: %s\n", file, line, message); -} - -HWY_NORETURN void Abort(const char* file, int line, const char* format, ...) { - char message[1024]; - va_list args; - va_start(args, format); - format_message(format, args, message, sizeof(message)); - va_end(args); - - if (AbortFunc func = g_abort_func) { - func(file, line, message); - } else { - fprintf(stderr, "%s:%d: %s\n", file, line, message); - } - - abort(); -} - -} // namespace hwy - -extern "C" { - -// Zig reads HWY_SUPPORTED_TARGETS via this C shim so it can keep its target -// enum in sync with the vendored Highway build without parsing C++ headers. -int64_t hwy_supported_targets() { - return HWY_SUPPORTED_TARGETS; -} -} diff --git a/pkg/highway/build.zig b/pkg/highway/build.zig index 6ed721562..64c0e8869 100644 --- a/pkg/highway/build.zig +++ b/pkg/highway/build.zig @@ -7,7 +7,7 @@ pub fn build(b: *std.Build) !void { const upstream_ = b.lazyDependency("highway", .{}); const module = b.addModule("highway", .{ - .root_source_file = b.path("main.zig"), + .root_source_file = b.path("src/main.zig"), .target = target, .optimize = optimize, }); @@ -15,22 +15,23 @@ pub fn build(b: *std.Build) !void { const lib = b.addLibrary(.{ .name = "highway", .root_module = b.createModule(.{ + .root_source_file = b.path("src/detect.zig"), .target = target, .optimize = optimize, }), .linkage = .static, }); + + // Our highway package is free of libc at runtime (uses no symbols) + // but does require libc headers at compile time. lib.linkLibC(); + + lib.addIncludePath(b.path("src/cpp")); if (upstream_) |upstream| { lib.addIncludePath(upstream.path("")); module.addIncludePath(upstream.path("")); } - if (target.result.os.tag.isDarwin()) { - const apple_sdk = @import("apple_sdk"); - try apple_sdk.addPaths(b, lib); - } - if (target.result.abi.isAndroid()) { const android_ndk = @import("android_ndk"); try android_ndk.addPaths(b, lib); @@ -93,19 +94,13 @@ pub fn build(b: *std.Build) !void { }); } - lib.addCSourceFiles(.{ .flags = flags.items, .files = &.{"bridge.cpp"} }); + lib.addCSourceFiles(.{ .flags = flags.items, .files = &.{ + "src/cpp/abort.cc", + "src/cpp/per_target.cc", + "src/cpp/targets.cpp", + } }); + if (upstream_) |upstream| { - lib.addCSourceFiles(.{ - .root = upstream.path(""), - .flags = flags.items, - .files = &.{ - // These provide the runtime target selection used by - // HWY_DYNAMIC_DISPATCH. The benchmark, timer, print, and - // aligned allocator support files are unused by Ghostty. - "hwy/per_target.cc", - "hwy/targets.cc", - }, - }); lib.installHeadersDirectory( upstream.path("hwy"), "hwy", @@ -119,7 +114,7 @@ pub fn build(b: *std.Build) !void { const test_exe = b.addTest(.{ .name = "test", .root_module = b.createModule(.{ - .root_source_file = b.path("main.zig"), + .root_source_file = b.path("src/main.zig"), .target = target, .optimize = optimize, }), diff --git a/pkg/highway/build.zig.zon b/pkg/highway/build.zig.zon index 4870d1db5..96b2768ae 100644 --- a/pkg/highway/build.zig.zon +++ b/pkg/highway/build.zig.zon @@ -11,7 +11,6 @@ .lazy = true, }, - .apple_sdk = .{ .path = "../apple-sdk" }, .android_ndk = .{ .path = "../android-ndk" }, }, } diff --git a/pkg/highway/main.zig b/pkg/highway/main.zig deleted file mode 100644 index 95ba6cda8..000000000 --- a/pkg/highway/main.zig +++ /dev/null @@ -1,57 +0,0 @@ -extern "c" fn hwy_supported_targets() i64; - -pub const Targets = packed struct(i64) { - // x86_64 - _reserved: u4 = 0, - avx3_spr: bool = false, - _reserved_5: u1 = 0, - avx3_zen4: bool = false, - avx3_dl: bool = false, - avx3: bool = false, - avx2: bool = false, - _reserved_10: u1 = 0, - sse4: bool = false, - ssse3: bool = false, - _reserved_13: u1 = 0, // SSE3 reserved - sse2: bool = false, - _reserved_15_23: u9 = 0, - - // aarch64 - sve2_128: bool = false, - sve_256: bool = false, - sve2: bool = false, - sve: bool = false, - neon: bool = false, - neon_without_aes: bool = false, - _reserved_30_36: u6 = 0, - - // risc-v - rvv: bool = false, - _reserved_38_46: u9 = 0, - - // IBM Power - ppc10: bool = false, - ppc9: bool = false, - ppc8: bool = false, - z15: bool = false, - z14: bool = false, - _reserved_52_57: u6 = 0, - - // WebAssembly - wasm_emu256: bool = false, - wasm: bool = false, - _reserved_60_61: u2 = 0, - - // Emulation - emu128: bool = false, - scalar: bool = false, - _reserved_63: u1 = 0, -}; - -pub fn supported_targets() Targets { - return @bitCast(hwy_supported_targets()); -} - -test { - _ = supported_targets(); -} diff --git a/pkg/highway/src/cpp/abort.cc b/pkg/highway/src/cpp/abort.cc new file mode 100644 index 000000000..152619b0d --- /dev/null +++ b/pkg/highway/src/cpp/abort.cc @@ -0,0 +1,70 @@ +// Copyright 2019 Google LLC +// Copyright 2024 Arm Limited and/or its affiliates +// SPDX-License-Identifier: Apache-2.0 +// SPDX-License-Identifier: BSD-3-Clause + +// Vendored from google/highway hwy/abort.cc at commit: +// 66486a10623fa0d72fe91260f96c892e41aceb06 +// +// Local modifications: +// - Removed stdio/stdlib/string/sanitizer-backed formatting and logging paths +// so this file no longer pulls in libc/libc++ symbols. +// - Replaced std::atomic storage with compiler atomics on plain function +// pointers to preserve thread-safe handler installation without libc++. +// - Kept only the Warn/Abort symbol surface Highway's runtime dispatch needs, +// with a trap-only fallback when no abort handler is installed. +// +// Why: +// - Ghostty only needs Highway's runtime dispatch support here, not its +// formatted stderr diagnostics. +// - Keeping this translation unit libc/libc++ free lets pkg/highway build as a +// small vendored shim around Zig-driven target detection. + +#include "hwy/abort.h" + +#include "hwy/base.h" + +namespace hwy { + +namespace { + +WarnFunc g_warn_func = nullptr; +AbortFunc g_abort_func = nullptr; + +} // namespace + +HWY_DLLEXPORT WarnFunc& GetWarnFunc() { + return g_warn_func; +} + +HWY_DLLEXPORT AbortFunc& GetAbortFunc() { + return g_abort_func; +} + +HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func) { + return __atomic_exchange_n(&g_warn_func, func, __ATOMIC_SEQ_CST); +} + +HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func) { + return __atomic_exchange_n(&g_abort_func, func, __ATOMIC_SEQ_CST); +} + +HWY_DLLEXPORT void HWY_FORMAT(3, 4) + Warn(const char* file, int line, const char* format, ...) { + WarnFunc handler = __atomic_load_n(&g_warn_func, __ATOMIC_SEQ_CST); + if (handler != nullptr) { + handler(file, line, format); + } +} + +HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) + Abort(const char* file, int line, const char* format, ...) { + AbortFunc handler = __atomic_load_n(&g_abort_func, __ATOMIC_SEQ_CST); + if (handler != nullptr) { + handler(file, line, format); + } + + __builtin_trap(); +} + +} // namespace hwy diff --git a/pkg/highway/src/cpp/per_target.cc b/pkg/highway/src/cpp/per_target.cc new file mode 100644 index 000000000..44973ad3f --- /dev/null +++ b/pkg/highway/src/cpp/per_target.cc @@ -0,0 +1,91 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Vendored from google/highway hwy/per_target.cc at commit: +// 66486a10623fa0d72fe91260f96c892e41aceb06 +// +// Local modifications: +// - Changed HWY_TARGET_INCLUDE from the upstream path to the local vendored +// filename so Highway's multi-pass include machinery resolves this copy. +// - Left the implementation otherwise identical to upstream. +// +// Why: +// - Ghostty vendors only the specific Highway .cc files it needs in this +// directory, so the original source-relative include path no longer exists. +// - Keeping the logic unchanged aside from the include path reduces fork +// maintenance cost while still allowing a minimal vendored source set. + +// Enable all targets so that calling Have* does not call into a null pointer. +#ifndef HWY_COMPILE_ALL_ATTAINABLE +#define HWY_COMPILE_ALL_ATTAINABLE +#endif +#include "hwy/per_target.h" + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "per_target.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { +int64_t GetTarget() { return HWY_TARGET; } +size_t GetVectorBytes() { return Lanes(ScalableTag()); } +bool GetHaveInteger64() { return HWY_HAVE_INTEGER64 != 0; } +bool GetHaveFloat16() { return HWY_HAVE_FLOAT16 != 0; } +bool GetHaveFloat64() { return HWY_HAVE_FLOAT64 != 0; } +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE + +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(GetTarget); +HWY_EXPORT(GetVectorBytes); +HWY_EXPORT(GetHaveInteger64); +HWY_EXPORT(GetHaveFloat16); +HWY_EXPORT(GetHaveFloat64); +} // namespace + +HWY_DLLEXPORT int64_t DispatchedTarget() { + return HWY_DYNAMIC_DISPATCH(GetTarget)(); +} + +HWY_DLLEXPORT size_t VectorBytes() { + return HWY_DYNAMIC_DISPATCH(GetVectorBytes)(); +} + +HWY_DLLEXPORT bool HaveInteger64() { + return HWY_DYNAMIC_DISPATCH(GetHaveInteger64)(); +} + +HWY_DLLEXPORT bool HaveFloat16() { + return HWY_DYNAMIC_DISPATCH(GetHaveFloat16)(); +} + +HWY_DLLEXPORT bool HaveFloat64() { + return HWY_DYNAMIC_DISPATCH(GetHaveFloat64)(); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/pkg/highway/src/cpp/targets.cpp b/pkg/highway/src/cpp/targets.cpp new file mode 100644 index 000000000..7977cd573 --- /dev/null +++ b/pkg/highway/src/cpp/targets.cpp @@ -0,0 +1,79 @@ +// Vendored from google/highway hwy/targets.cc at commit: +// 66486a10623fa0d72fe91260f96c892e41aceb06 +// +// Local modifications: +// - Dropped upstream CPU feature probing and platform-specific detection code +// in favor of Ghostty's Zig-provided ghostty_hwy_detect_targets(). +// - Removed the HWY_WARN baseline-mismatch diagnostic path so this file does +// not depend on libc-backed formatting/logging. +// - Kept only the chosen-target bookkeeping and runtime dispatch state that +// Highway's HWY_DYNAMIC_DISPATCH machinery needs. +// - Added hwy_supported_targets() as a small C shim for Zig to query the final +// supported target mask. +// +// Why: +// - Ghostty wants a minimal vendored Highway runtime that avoids direct libc +// usage and lets Zig own target detection policy. +// - Narrowing this file to dispatch state makes the local fork easier to audit +// and maintain than carrying upstream's full platform detection surface. + +#include "hwy/targets.h" + +namespace hwy { + +extern "C" int64_t ghostty_hwy_detect_targets(); + +// Vendored from Highway's hwy/targets.cc. Ghostty provides target detection in +// Zig, so this TU only retains the runtime dispatch/chosen-target state. +static int64_t DetectTargets() { + int64_t bits = HWY_SCALAR | HWY_EMU128; + +#if (HWY_ARCH_X86 || HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || \ + HWY_ARCH_RISCV || HWY_ARCH_LOONGARCH) && \ + HWY_HAVE_RUNTIME_DISPATCH + bits |= ghostty_hwy_detect_targets(); +#else + bits |= HWY_ENABLED_BASELINE; +#endif + + return bits; +} + +// When running tests, this value can be set to the mocked supported targets +// mask. Only written to from a single thread before the test starts. +static int64_t supported_targets_for_test_ = 0; + +// Mask of targets disabled at runtime with DisableTargets. +static int64_t supported_mask_ = LimitsMax(); + +HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) { + supported_mask_ = static_cast(~disabled_targets); + GetChosenTarget().DeInit(); +} + +HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) { + supported_targets_for_test_ = targets; + GetChosenTarget().DeInit(); +} + +HWY_DLLEXPORT int64_t SupportedTargets() { + int64_t targets = supported_targets_for_test_; + if (HWY_LIKELY(targets == 0)) { + targets = DetectTargets(); + GetChosenTarget().Update(targets); + } + + targets &= supported_mask_; + return targets == 0 ? HWY_STATIC_TARGET : targets; +} + +HWY_DLLEXPORT ChosenTarget& GetChosenTarget() { + static ChosenTarget chosen_target; + return chosen_target; +} + +} // namespace hwy + +extern "C" int64_t hwy_supported_targets() { + return hwy::SupportedTargets(); +} diff --git a/pkg/highway/src/detect.zig b/pkg/highway/src/detect.zig new file mode 100644 index 000000000..471314d94 --- /dev/null +++ b/pkg/highway/src/detect.zig @@ -0,0 +1,49 @@ +const builtin = @import("builtin"); +const HwyTargets = @import("targets.zig").Targets; + +const x86 = @import("detect/x86.zig"); +const aarch64_darwin = @import("detect/aarch64_darwin.zig"); +const aarch64_linux = @import("detect/aarch64_linux.zig"); +const ppc = @import("detect/ppc.zig"); +const s390x = @import("detect/s390x.zig"); +const riscv = @import("detect/riscv.zig"); +const loongarch = @import("detect/loongarch.zig"); + +/// Detect Highway targets at runtime using minimal, direct CPU feature +/// probing. +/// +/// Previous versions called std.zig.system.resolveTargetQuery which +/// drags in the full Zig target/CPU model tables for every architecture, +/// bloating the binary by ~300 KB and causing code-layout regressions in +/// unrelated hot paths (icache / branch-predictor pressure). +/// +/// This version uses only inline assembly (CPUID on x86, MRS on AArch64) +/// and lightweight syscalls (sysctlbyname on Darwin, getauxval on Linux), +/// so it adds no data tables and no std.Target dependency. +pub export fn ghostty_hwy_detect_targets() callconv(.c) i64 { + return switch (builtin.cpu.arch) { + .x86_64, .x86 => x86.detect(), + .aarch64, .aarch64_be => detectAarch64(), + .powerpc, .powerpc64, .powerpc64le => ppc.detect(), + .s390x => s390x.detect(), + .riscv32, .riscv64 => riscv.detect(), + .loongarch32, .loongarch64 => loongarch.detect(), + else => 0, + }; +} + +fn detectAarch64() i64 { + var t: HwyTargets = .{}; + + // All AArch64 implementations have NEON. + t.neon_without_aes = true; + + if (comptime builtin.os.tag.isDarwin()) { + return aarch64_darwin.detect(&t); + } else if (comptime builtin.os.tag == .linux) { + return aarch64_linux.detect(&t); + } + + // Other OS: return baseline NEON. + return @bitCast(t); +} diff --git a/pkg/highway/src/detect/aarch64_darwin.zig b/pkg/highway/src/detect/aarch64_darwin.zig new file mode 100644 index 000000000..f69edb441 --- /dev/null +++ b/pkg/highway/src/detect/aarch64_darwin.zig @@ -0,0 +1,33 @@ +const HwyTargets = @import("../targets.zig").Targets; + +pub fn detect(t: *HwyTargets) i64 { + // All Apple Silicon has AES. + t.neon = true; + + // Every Apple chip from A11 (2017) onward has FP16 + DotProd. + // BF16 arrived with M2 / A15 (ARM_BLIZZARD_AVALANCHE, 2022). + // We probe hw.optional.arm.FEAT_BF16 to be precise. + const has_bf16 = darwinSysctlBool("hw.optional.arm.FEAT_BF16"); + if (has_bf16) { + t.neon_bf16 = true; + } + + // Apple Silicon does not support SVE. + return @bitCast(t.*); +} + +fn darwinSysctlBool(comptime name: [:0]const u8) bool { + var value: c_int = 0; + var len: usize = @sizeOf(c_int); + const rc = sysctlbyname(name.ptr, &value, &len, null, 0); + return rc == 0 and value != 0; +} + +// We can rely on libc for macOS because libsystem is always available. +extern "c" fn sysctlbyname( + name: [*:0]const u8, + oldp: ?*anyopaque, + oldlenp: ?*usize, + newp: ?*const anyopaque, + newlen: usize, +) c_int; diff --git a/pkg/highway/src/detect/aarch64_linux.zig b/pkg/highway/src/detect/aarch64_linux.zig new file mode 100644 index 000000000..a4e74e674 --- /dev/null +++ b/pkg/highway/src/detect/aarch64_linux.zig @@ -0,0 +1,65 @@ +const HwyTargets = @import("../targets.zig").Targets; +const linux = @import("linux.zig"); + +pub fn detect(t: *HwyTargets) i64 { + // Linux exposes AArch64 features via getauxval(AT_HWCAP / AT_HWCAP2). + const AT_HWCAP: usize = 16; + const AT_HWCAP2: usize = 26; + + const hwcap = linux.getauxval(AT_HWCAP); + const hwcap2 = linux.getauxval(AT_HWCAP2); + + // Bit positions from Linux UAPI asm/hwcap.h + const HWCAP_AES: usize = 1 << 3; + const HWCAP_FPHP: usize = 1 << 9; // FEAT_FP16 + const HWCAP_ASIMDDP: usize = 1 << 20; // DotProd + const HWCAP_SVE: usize = 1 << 22; + + const HWCAP2_BF16: usize = 1 << 14; + const HWCAP2_SVE2: usize = 1 << 1; + const HWCAP2_SVEAES: usize = 1 << 2; + + if (hwcap & HWCAP_AES != 0) { + t.neon = true; + + if (hwcap & HWCAP_FPHP != 0 and + hwcap & HWCAP_ASIMDDP != 0 and + hwcap2 & HWCAP2_BF16 != 0) + { + t.neon_bf16 = true; + } + } + + if (hwcap & HWCAP_SVE != 0) { + const vec_bytes = sveVectorBytes(); + + if (vec_bytes >= 32) { + t.sve = true; + if (vec_bytes == 32) { + t.sve_256 = true; + } + } + + if (hwcap2 & HWCAP2_SVE2 != 0 and hwcap2 & HWCAP2_SVEAES != 0) { + if (vec_bytes >= 32) { + t.sve2 = true; + } else if (vec_bytes == 16) { + t.sve2_128 = true; + } + } + } + + return @bitCast(t.*); +} + +fn sveVectorBytes() usize { + // PR_SVE_GET_VL returns the SVE vector length in the lower 16 bits. + const PR_SVE_GET_VL: i32 = 51; + const ret = linux.prctl(PR_SVE_GET_VL, 0, 0, 0, 0); + const signed: isize = @bitCast(ret); + if (signed >= 0) { + return ret & 0xFFFF; + } + // prctl failed: assume 128-bit (NEON-width, conservative). + return 16; +} diff --git a/pkg/highway/src/detect/linux.zig b/pkg/highway/src/detect/linux.zig new file mode 100644 index 000000000..951cf3e6b --- /dev/null +++ b/pkg/highway/src/detect/linux.zig @@ -0,0 +1,10 @@ +/// Reads from the ELF auxiliary vector (set by the kernel at process +/// start). Does not call into libc. +pub inline fn getauxval(key: usize) usize { + return @import("std").os.linux.getauxval(key); +} + +/// Direct syscall wrapper for prctl(2). +pub inline fn prctl(option: i32, a2: usize, a3: usize, a4: usize, a5: usize) usize { + return @import("std").os.linux.prctl(option, a2, a3, a4, a5); +} diff --git a/pkg/highway/src/detect/loongarch.zig b/pkg/highway/src/detect/loongarch.zig new file mode 100644 index 000000000..686d11e62 --- /dev/null +++ b/pkg/highway/src/detect/loongarch.zig @@ -0,0 +1,26 @@ +const builtin = @import("builtin"); +const HwyTargets = @import("../targets.zig").Targets; +const linux = @import("linux.zig"); + +pub fn detect() i64 { + var t: HwyTargets = .{}; + + if (comptime builtin.os.tag != .linux) return @bitCast(t); + + const AT_HWCAP: usize = 16; + const hwcap = linux.getauxval(AT_HWCAP); + + // From Linux arch/loongarch/include/uapi/asm/hwcap.h + const HWCAP_LSX: usize = 1 << 4; + const HWCAP_LASX: usize = 1 << 5; + + if (hwcap & HWCAP_LSX != 0) { + t.lsx = true; + + if (hwcap & HWCAP_LASX != 0) { + t.lasx = true; + } + } + + return @bitCast(t); +} diff --git a/pkg/highway/src/detect/ppc.zig b/pkg/highway/src/detect/ppc.zig new file mode 100644 index 000000000..587965ce8 --- /dev/null +++ b/pkg/highway/src/detect/ppc.zig @@ -0,0 +1,43 @@ +const builtin = @import("builtin"); +const HwyTargets = @import("../targets.zig").Targets; +const linux = @import("linux.zig"); + +pub fn detect() i64 { + var t: HwyTargets = .{}; + + if (comptime builtin.os.tag != .linux) return @bitCast(t); + + const AT_HWCAP: usize = 16; + const AT_HWCAP2: usize = 26; + const hwcap = linux.getauxval(AT_HWCAP); + const hwcap2 = linux.getauxval(AT_HWCAP2); + + // From Linux arch/powerpc/include/uapi/asm/cputable.h + const PPC_FEATURE_HAS_ALTIVEC: usize = 0x10000000; + const PPC_FEATURE_HAS_VSX: usize = 0x00000080; + const PPC_FEATURE2_ARCH_2_07: usize = 0x80000000; // POWER8 + const PPC_FEATURE2_VEC_CRYPTO: usize = 0x02000000; + const PPC_FEATURE2_ARCH_3_00: usize = 0x00800000; // POWER9 + const PPC_FEATURE2_ARCH_3_1: usize = 0x00040000; // POWER10 + const PPC_FEATURE2_MMA: usize = 0x00020000; + + if (hwcap & PPC_FEATURE_HAS_ALTIVEC != 0 and + hwcap & PPC_FEATURE_HAS_VSX != 0 and + hwcap2 & PPC_FEATURE2_ARCH_2_07 != 0 and + hwcap2 & PPC_FEATURE2_VEC_CRYPTO != 0) + { + t.ppc8 = true; + + if (hwcap2 & PPC_FEATURE2_ARCH_3_00 != 0) { + t.ppc9 = true; + + if (hwcap2 & PPC_FEATURE2_ARCH_3_1 != 0 and + hwcap2 & PPC_FEATURE2_MMA != 0) + { + t.ppc10 = true; + } + } + } + + return @bitCast(t); +} diff --git a/pkg/highway/src/detect/riscv.zig b/pkg/highway/src/detect/riscv.zig new file mode 100644 index 000000000..619d12faa --- /dev/null +++ b/pkg/highway/src/detect/riscv.zig @@ -0,0 +1,22 @@ +const builtin = @import("builtin"); +const HwyTargets = @import("../targets.zig").Targets; +const linux = @import("linux.zig"); + +pub fn detect() i64 { + var t: HwyTargets = .{}; + + if (comptime builtin.os.tag != .linux) return @bitCast(t); + + const AT_HWCAP: usize = 16; + const hwcap = linux.getauxval(AT_HWCAP); + + // ISA extension bit for 'V' (vector). + // Letter-based bits: bit position = letter - 'A'. + const HWCAP_V: usize = 1 << ('V' - 'A'); + + if (hwcap & HWCAP_V != 0) { + t.rvv = true; + } + + return @bitCast(t); +} diff --git a/pkg/highway/src/detect/s390x.zig b/pkg/highway/src/detect/s390x.zig new file mode 100644 index 000000000..90d2ae3d5 --- /dev/null +++ b/pkg/highway/src/detect/s390x.zig @@ -0,0 +1,29 @@ +const builtin = @import("builtin"); +const HwyTargets = @import("../targets.zig").Targets; +const linux = @import("linux.zig"); + +pub fn detect() i64 { + var t: HwyTargets = .{}; + + if (comptime builtin.os.tag != .linux) return @bitCast(t); + + const AT_HWCAP: usize = 16; + const hwcap = linux.getauxval(AT_HWCAP); + + // From Linux arch/s390/include/asm/elf.h + const HWCAP_VX: usize = 1 << 11; + const HWCAP_VXE: usize = 1 << 13; // z14 + const HWCAP_VXE2: usize = 1 << 15; // z15 + + if (hwcap & HWCAP_VX != 0) { + if (hwcap & HWCAP_VXE != 0) { + t.z14 = true; + + if (hwcap & HWCAP_VXE2 != 0) { + t.z15 = true; + } + } + } + + return @bitCast(t); +} diff --git a/pkg/highway/src/detect/x86.zig b/pkg/highway/src/detect/x86.zig new file mode 100644 index 000000000..bdfd3f5d2 --- /dev/null +++ b/pkg/highway/src/detect/x86.zig @@ -0,0 +1,166 @@ +const builtin = @import("builtin"); +const HwyTargets = @import("../targets.zig").Targets; + +const CpuidResult = struct { eax: u32, ebx: u32, ecx: u32, edx: u32 }; + +fn cpuid(leaf: u32, subleaf: u32) CpuidResult { + var eax: u32 = undefined; + var ebx: u32 = undefined; + var ecx: u32 = undefined; + var edx: u32 = undefined; + asm volatile ("cpuid" + : [_] "={eax}" (eax), + [_] "={ebx}" (ebx), + [_] "={ecx}" (ecx), + [_] "={edx}" (edx), + : [_] "{eax}" (leaf), + [_] "{ecx}" (subleaf), + ); + return .{ .eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx }; +} + +inline fn bit(val: u32, comptime pos: u5) bool { + return (val >> pos) & 1 != 0; +} + +pub fn detect() i64 { + var t: HwyTargets = .{}; + + // x86_64 always has SSE2. + if (comptime builtin.cpu.arch == .x86_64) { + t.sse2 = true; + } + + const leaf0 = cpuid(0, 0); + const max_leaf = leaf0.eax; + if (max_leaf < 1) return @bitCast(t); + + const leaf1 = cpuid(1, 0); + + // -- SSE2 on 32-bit x86 ------------------------------------------------- + if (comptime builtin.cpu.arch == .x86) { + if (bit(leaf1.edx, 25) and bit(leaf1.edx, 26)) { + t.sse2 = true; + } + } + + // -- SSSE3 --------------------------------------------------------------- + if (bit(leaf1.ecx, 0) and // SSE3 + bit(leaf1.ecx, 9)) // SSSE3 + { + t.ssse3 = true; + } + + // -- SSE4 ---------------------------------------------------------------- + if (bit(leaf1.ecx, 19) and // SSE4.1 + bit(leaf1.ecx, 20) and // SSE4.2 + bit(leaf1.ecx, 1) and // PCLMUL + bit(leaf1.ecx, 25)) // AES + { + t.sse4 = true; + } + + // Check XSAVE / AVX OS support before enabling any AVX-dependent target. + const has_xsave = bit(leaf1.ecx, 27); + const has_avx_bit = bit(leaf1.ecx, 28); + const xcr0: u32 = if (has_xsave and has_avx_bit) asm volatile ("xgetbv" + : [_] "={eax}" (-> u32), + : [_] "{ecx}" (@as(u32, 0)), + : .{ .edx = true }) else 0; + const has_avx_save = (xcr0 & 0x6) == 0x6; // SSE + AVX state + + // Darwin lazily saves AVX-512 context on first use. + const has_avx512_save = if (comptime builtin.os.tag.isDarwin()) + true + else + (xcr0 & 0xE0) == 0xE0; // opmask + zmm_hi256 + hi16_zmm + + // -- AVX2 ---------------------------------------------------------------- + if (has_avx_save and max_leaf >= 7) { + const leaf7 = cpuid(7, 0); + + if (bit(leaf7.ebx, 5) and // AVX2 + bit(leaf1.ecx, 12) and // FMA + bit(leaf1.ecx, 29)) // F16C + { + // Also need LZCNT (extended leaf), BMI, BMI2. + const leaf_ext = cpuid(0x80000001, 0); + if (bit(leaf_ext.ecx, 5) and // LZCNT + bit(leaf7.ebx, 3) and // BMI + bit(leaf7.ebx, 8)) // BMI2 + { + t.avx2 = true; + } + } + + // -- AVX-512 --------------------------------------------------------- + if (has_avx512_save) { + if (bit(leaf7.ebx, 16) and // AVX512F + bit(leaf7.ebx, 31) and // AVX512VL + bit(leaf7.ebx, 17) and // AVX512DQ + bit(leaf7.ebx, 30) and // AVX512BW + bit(leaf7.ebx, 28)) // AVX512CD + { + t.avx3 = true; + } + + if (bit(leaf7.ecx, 11) and // AVX512VNNI + bit(leaf7.ecx, 10) and // VPCLMULQDQ (AVX save ok) + bit(leaf7.ecx, 1) and // AVX512VBMI + bit(leaf7.ecx, 6) and // AVX512VBMI2 + bit(leaf7.ecx, 9) and // VAES (AVX save ok) + bit(leaf7.ecx, 14) and // AVX512VPOPCNTDQ + bit(leaf7.ecx, 12) and // AVX512BITALG + bit(leaf7.ecx, 8)) // GFNI + { + t.avx3_dl = true; + } + + // AVX512BF16 is in leaf 7 sub-1. + if (t.avx3_dl and leaf7.eax >= 1) { + const leaf7_1 = cpuid(7, 1); + if (bit(leaf7_1.eax, 5)) { // AVX512BF16 + if (isAMD()) { + t.avx3_zen4 = true; + } + } + + if (bit(leaf7.edx, 23) and // AVX512FP16 + bit(leaf7_1.eax, 5)) // AVX512BF16 + { + t.avx3_spr = true; + } + } else if (bit(leaf7.edx, 23)) { // AVX512FP16 without sub-leaf + // Can't check BF16 without sub-leaf support, skip avx3_spr. + } + } + + // -- AVX10 ----------------------------------------------------------- + if (max_leaf >= 7 and cpuid(7, 0).eax >= 1) { + const leaf7_1 = cpuid(7, 1); + if (bit(leaf7_1.edx, 19)) { // AVX10.1-256 + if (max_leaf >= 0x24) { + const leaf24 = cpuid(0x24, 0); + if (bit(leaf24.ebx, 18)) { // AVX10.1-512 + t.avx3_spr = true; + t.avx3_dl = true; + t.avx3 = true; + } + } + + // AVX10.2 detection would require a leaf we can't + // reliably check yet; leave for future. + } + } + } + + return @bitCast(t); +} + +fn isAMD() bool { + const leaf0 = cpuid(0, 0); + // "Auth" "enti" "cAMD" + return leaf0.ebx == 0x68747541 and + leaf0.ecx == 0x444d4163 and + leaf0.edx == 0x69746e65; +} diff --git a/pkg/highway/src/main.zig b/pkg/highway/src/main.zig new file mode 100644 index 000000000..614fd14af --- /dev/null +++ b/pkg/highway/src/main.zig @@ -0,0 +1,12 @@ +extern "c" fn hwy_supported_targets() i64; + +pub const Targets = @import("targets.zig").Targets; + +pub fn supported_targets() Targets { + return @bitCast(hwy_supported_targets()); +} + +test { + _ = supported_targets(); + _ = @import("runtime_detect.zig"); +} diff --git a/pkg/highway/src/targets.zig b/pkg/highway/src/targets.zig new file mode 100644 index 000000000..5ae77bcad --- /dev/null +++ b/pkg/highway/src/targets.zig @@ -0,0 +1,109 @@ +const assert = @import("std").debug.assert; + +pub const Targets = packed struct(i64) { + // x86_64 + _reserved_0_2: u3 = 0, + avx10_2_512: bool = false, + avx3_spr: bool = false, + avx10_2: bool = false, + avx3_zen4: bool = false, + avx3_dl: bool = false, + avx3: bool = false, + avx2: bool = false, + _reserved_10: u1 = 0, + sse4: bool = false, + ssse3: bool = false, + _reserved_13: u1 = 0, + sse2: bool = false, + _reserved_15_17: u3 = 0, + + // aarch64 + sve2_128: bool = false, + sve_256: bool = false, + _reserved_20_22: u3 = 0, + sve2: bool = false, + sve: bool = false, + _reserved_25: u1 = 0, + neon_bf16: bool = false, + _reserved_27: u1 = 0, + neon: bool = false, + neon_without_aes: bool = false, + _reserved_30_36: u7 = 0, + + // risc-v + rvv: bool = false, + _reserved_38_39: u2 = 0, + + // LoongArch + lasx: bool = false, + lsx: bool = false, + _reserved_42_46: u5 = 0, + + // IBM Power + ppc10: bool = false, + ppc9: bool = false, + ppc8: bool = false, + z15: bool = false, + z14: bool = false, + _reserved_52_57: u6 = 0, + + // WebAssembly + wasm_emu256: bool = false, + wasm: bool = false, + _reserved_60: u1 = 0, + + // Emulation + emu128: bool = false, + scalar: bool = false, + _reserved_63: u1 = 0, + + fn bitPos(comptime field_name: []const u8) comptime_int { + return @bitOffsetOf(Targets, field_name); + } + + // Verify at comptime that each flag field matches its Highway bit constant. + comptime { + // x86 + assert(bitPos("avx10_2_512") == 3); + assert(bitPos("avx3_spr") == 4); + assert(bitPos("avx10_2") == 5); + assert(bitPos("avx3_zen4") == 6); + assert(bitPos("avx3_dl") == 7); + assert(bitPos("avx3") == 8); + assert(bitPos("avx2") == 9); + assert(bitPos("sse4") == 11); + assert(bitPos("ssse3") == 12); + assert(bitPos("sse2") == 14); + + // aarch64 + assert(bitPos("sve2_128") == 18); + assert(bitPos("sve_256") == 19); + assert(bitPos("sve2") == 23); + assert(bitPos("sve") == 24); + assert(bitPos("neon_bf16") == 26); + assert(bitPos("neon") == 28); + assert(bitPos("neon_without_aes") == 29); + + // risc-v + assert(bitPos("rvv") == 37); + + // LoongArch + assert(bitPos("lasx") == 40); + assert(bitPos("lsx") == 41); + + // IBM Power + assert(bitPos("ppc10") == 47); + assert(bitPos("ppc9") == 48); + assert(bitPos("ppc8") == 49); + assert(bitPos("z15") == 50); + assert(bitPos("z14") == 51); + + // WebAssembly + assert(bitPos("wasm_emu256") == 58); + assert(bitPos("wasm") == 59); + + // Emulation + assert(bitPos("emu128") == 61); + assert(bitPos("scalar") == 62); + } +}; diff --git a/src/benchmark/TerminalParser.zig b/src/benchmark/TerminalParser.zig index e00081763..78c933121 100644 --- a/src/benchmark/TerminalParser.zig +++ b/src/benchmark/TerminalParser.zig @@ -88,11 +88,17 @@ fn step(ptr: *anyopaque) Benchmark.Error!void { return error.BenchmarkFailed; }; if (n == 0) break; // EOF reached - for (buf[0..n]) |c| { - const actions = p.next(c); - //std.log.warn("actions={any}", .{actions}); - _ = actions; - } + parseAll(&p, buf[0..n]); + } +} + +/// Separated from `step` so that the tight per-byte loop gets its own +/// function alignment, insulating it from code-layout changes elsewhere +/// in the binary that would otherwise shift its cache-line placement. +noinline fn parseAll(p: *terminalpkg.Parser, data: []const u8) void { + for (data) |c| { + const actions = p.next(c); + _ = actions; } } diff --git a/src/simd/codepoint_width.cpp b/src/simd/codepoint_width.cpp index 7e5fe7d2f..7f0dfd87c 100644 --- a/src/simd/codepoint_width.cpp +++ b/src/simd/codepoint_width.cpp @@ -7,8 +7,14 @@ #ifndef GHOSTTY_SIMD_CPW_HELPERS_ #define GHOSTTY_SIMD_CPW_HELPERS_ -#include -#include +#ifdef NDEBUG +#define GHOSTTY_SIMD_ASSERT(cond) ((void)0) +#else +#define GHOSTTY_SIMD_ASSERT(cond) \ + do { \ + if (!(cond)) __builtin_trap();\ + } while (0) +#endif // Replacement for std::size() that works without libc++. template @@ -249,8 +255,8 @@ static_assert(array_size(nsm_gte16) == array_size(nsm_lte16)); /// Handles 16-bit codepoints. template int8_t CodepointWidth16(D d, uint16_t input) { - assert(input > 0xFF); - assert(input <= 0xFFFF); + GHOSTTY_SIMD_ASSERT(input > 0xFF); + GHOSTTY_SIMD_ASSERT(input <= 0xFFFF); const size_t N = hn::Lanes(d); const hn::Vec input_vec = Set(d, input); @@ -287,7 +293,7 @@ int8_t CodepointWidth16(D d, uint16_t input) { return 2; } } - assert(i >= 7); // We should have checked all the ranges. + GHOSTTY_SIMD_ASSERT(i >= 7); // We should have checked all the ranges. } { @@ -353,7 +359,7 @@ int8_t CodepointWidth16(D d, uint16_t input) { /// Handles codepoints larger than 16-bit. template int8_t CodepointWidth32(D d, T input) { - assert(input > 0xFFFF); + GHOSTTY_SIMD_ASSERT(input > 0xFFFF); const size_t N = hn::Lanes(d); const hn::Vec input_vec = Set(d, input); @@ -379,7 +385,7 @@ int8_t CodepointWidth32(D d, T input) { return 2; } } - assert(i >= 2); // We should have checked all the ranges. + GHOSTTY_SIMD_ASSERT(i >= 2); // We should have checked all the ranges. } { diff --git a/src/simd/vt.cpp b/src/simd/vt.cpp index 1179c3773..5bf4147d5 100644 --- a/src/simd/vt.cpp +++ b/src/simd/vt.cpp @@ -5,8 +5,6 @@ #include #include -#include -#include #include #include