rexcode/x86: pre-matched encode hint + repair the typed builders

Targeted branchless revert + the pre-matched form fast path, and a fix
for a pre-existing bug the latter surfaced.

(a) Revert the two speculative-write spots from the prior branchless pass
    (legacy-prefix emission and the widened displacement store) back to
    predicted branches, and drop the ENCODE_TAIL_SLACK reservation that only
    the widened store needed. In real streams a legacy prefix is almost
    always absent and disp size is stable, so those branches are ~free and
    the unconditional stores only added work. Every class got faster
    (RET 19->17.5, MOV r,r 52->46.6, VADDPS 42.8->39.3 ns).

(b) Pre-matched form hint. Instruction.enc_hint (in the existing 11-byte
    padding, idx+1 biased; 0 = matcher path) lets a typed builder that maps
    to a single value-independent form bake the global form index, so
    encode() skips the O(forms) match scan -- and, in a varied stream, its
    unpredictable branches. Generated for non-immediate forms only (value-
    dependent imm8/imm32 selection stays on the matcher). On a 100k mixed
    typed-builder stream: 47.3 -> 30.2 ns/inst (-36%), byte-identical to the
    matcher path -- ~2x the original baseline for codegen.

Repair the typed inst_/emit_ builders. They were non-functional: the
generator cast the hw-only typed enum straight to Register
(Register(GPR64.RAX) -> class 0), so every typed-builder operand was
rejected by the matcher (encode returned empty). Untested because the
suite builds via the generic constructors. Now they build through the
class-correct op_gpr64/op_xmm/... path (op_* already used by 3+ operand
builders), emit_ reuses inst_, and a new 30-case consistency suite
asserts typed == generic (llvm-verified) and hint == matcher.

gen/builders/check/test/idempotent all green; 2276 cases.
This commit is contained in:
Brendan Punsky
2026-06-18 21:04:18 -04:00
committed by Flāvius
parent 8387731357
commit 078015bc34
5 changed files with 7283 additions and 7118 deletions

View File

@@ -33,13 +33,6 @@ import "core:rexcode/isa"
MAX_INST_SIZE :: 15 // Maximum x64 instruction length
// Extra bytes reserved past each instruction so the branchless emitters can
// write a few speculative bytes beyond the logical end (e.g. a widened 4-byte
// displacement store when only a disp8 is kept). The over-written tail is
// reclaimed by the next emit; this slack just guarantees the wide store stays
// in bounds even for the final instruction against a tight buffer.
ENCODE_TAIL_SLACK :: 8
// -----------------------------------------------------------------------------
// SECTION: 7.6 Core Encoding Function
@@ -107,7 +100,7 @@ encode :: proc(
}
// Check buffer space
if byte_count + MAX_INST_SIZE + ENCODE_TAIL_SLACK > u32(len(code)) {
if byte_count + MAX_INST_SIZE > u32(len(code)) {
append(errors, Error{u32(instruction_index), .BUFFER_OVERFLOW, {}})
ok = false
continue
@@ -147,27 +140,38 @@ encode :: proc(
}
}
// Find matching encoding from table (O(1) mnemonic lookup)
encodings := encoding_forms(inst.mnemonic)
if len(encodings) == 0 {
append(errors, Error{u32(instruction_index), .INVALID_MNEMONIC, {}})
ok = false
continue
}
// Find the first encoding that matches operands
matched_enc: ^Encoding = nil
for &e in encodings {
if encoding_matches_inline(&inst, &e, mode) {
matched_enc = &e
break
}
}
if matched_enc == nil {
append(errors, Error{u32(instruction_index), .NO_MATCHING_ENCODING, {}})
ok = false
continue
// Pre-matched form fast-path: a typed builder that maps to a single
// encoding form bakes `global_index + 1` into enc_hint, letting us skip
// the O(forms) match scan entirely -- and with it the scan's branches,
// which are the unpredictable ones in a varied instruction stream. Only
// in long mode (the builders' target); bounds-checked; anything else
// (hand-built, generic builders, i386, decode) falls back to matching.
if mode == ._64 && inst.enc_hint != ENC_HINT_NONE && int(inst.enc_hint) <= len(ENCODE_FORMS) {
matched_enc = &ENCODE_FORMS[inst.enc_hint - 1]
} else {
// Find matching encoding from table (O(1) mnemonic lookup)
encodings := encoding_forms(inst.mnemonic)
if len(encodings) == 0 {
append(errors, Error{u32(instruction_index), .INVALID_MNEMONIC, {}})
ok = false
continue
}
// Find the first encoding that matches operands
for &e in encodings {
if encoding_matches_inline(&inst, &e, mode) {
matched_enc = &e
break
}
}
if matched_enc == nil {
append(errors, Error{u32(instruction_index), .NO_MATCHING_ENCODING, {}})
ok = false
continue
}
}
// =====================================================================
@@ -208,34 +212,37 @@ encode :: proc(
}
has_modrm := mr_slot >= 0 || reg_slot >= 0
// --- Legacy Prefixes (branchless) ---
// --- Legacy Prefixes ---
//
// Each optional prefix byte is written *speculatively* at `pos`, then
// `pos` advances only if the prefix is actually present. When absent the
// speculative byte is overwritten by the next emit (the opcode always
// writes at `pos`), so the final stream is identical to the branching
// form -- with four data-dependent branches removed. The buffer carries
// MAX_INST_SIZE slack (checked above), so the spec writes stay in bounds.
// Kept as predicted branches: in real instruction streams a legacy
// prefix is almost always absent, so these are ~100% predicted-not-taken
// (free), and the branchless speculative-write form only added four
// unconditional stores per instruction for no win. See git history.
// Lock prefix (F0)
out[pos] = 0xF0
pos += u32(inst.flags.lock && enc.flags.lock_ok)
if inst.flags.lock && enc.flags.lock_ok {
out[pos] = 0xF0
pos += 1
}
// Rep/Repne prefix (NONE -> 0, REP -> F3, REPNE -> F2)
REP_BYTE := [Rep]u8{ .NONE = 0, .REP = 0xF3, .REPNE = 0xF2 }
rep_b := REP_BYTE[inst.flags.rep]
out[pos] = rep_b
pos += u32(rep_b != 0)
// Rep/Repne prefix
#partial switch inst.flags.rep {
case .REP: out[pos] = 0xF3; pos += 1
case .REPNE: out[pos] = 0xF2; pos += 1
}
// Segment override (table already maps 0 -> 0)
seg_prefix := [8]u8{0, 0x26, 0x2E, 0x36, 0x3E, 0x64, 0x65, 0}
seg_b := seg_prefix[inst.flags.segment]
out[pos] = seg_b
pos += u32(seg_b != 0)
// Segment override
if inst.flags.segment != 0 {
seg_prefix := [8]u8{0, 0x26, 0x2E, 0x36, 0x3E, 0x64, 0x65, 0}
out[pos] = seg_prefix[inst.flags.segment]
pos += 1
}
// Address size override (67h)
out[pos] = 0x67
pos += u32(inst.flags.addr32)
if inst.flags.addr32 {
out[pos] = 0x67
pos += 1
}
// --- VEX/EVEX or Legacy Encoding ---
@@ -580,15 +587,14 @@ encode :: proc(
pos += 1
}
// Displacement: four unconditional little-endian byte stores, then
// advance by the real size (0/1/4) -- no data-dependent loop. The
// untaken tail bytes are reclaimed by the next emit; ENCODE_TAIL_SLACK
// keeps the widened store in bounds.
out[pos+0] = u8(disp)
out[pos+1] = u8(disp >> 8)
out[pos+2] = u8(disp >> 16)
out[pos+3] = u8(disp >> 24)
pos += u32(displacement_size)
// Displacement: bounded little-endian emit. Kept as a counted loop
// (0/1/4 trips, highly predictable per code pattern) so no buffer
// tail-slack is needed and no bytes are written past the real size.
for _ in 0..<displacement_size {
out[pos] = u8(disp & 0xFF)
disp >>= 8
pos += 1
}
}
// Fixed ModR/M for special instructions. Triggered for:
@@ -912,7 +918,7 @@ imm_matches_inline :: #force_inline proc "contextless" (op: ^Operand, op_type: O
// Compute safe buffer sizes for encoding
encode_max_code_size :: #force_inline proc "contextless" (n: int) -> int {
return n * MAX_INST_SIZE + ENCODE_TAIL_SLACK
return n * MAX_INST_SIZE
}
encode_max_relocation_count :: #force_inline proc "contextless" (n: int) -> int {

View File

@@ -35,10 +35,25 @@ Instruction :: struct #packed {
operand_count: u8, // 1 byte
flags: Instruction_Flags, // 1 byte
length: u8, // 1 byte (filled by decoder, used for iteration)
_: [11]u8, // 11 bytes
enc_hint: u16, // 2 bytes (pre-matched form, +1 biased; 0 = none)
_: [9]u8, // 9 bytes
}
#assert(size_of(Instruction) == 64)
// Pre-matched encoding hint: a typed builder that maps to exactly one encoding
// form (no value-dependent immediate selection) stores `global_form_index + 1`
// in `Instruction.enc_hint`, letting encode() skip the form-match scan. 0 means
// "no hint" -- the zero value, so every hand-built / generic-builder / decoded
// instruction stays on the matching path unchanged.
ENC_HINT_NONE :: u16(0)
// Returns a copy of `inst` whose `enc_hint` field is set to `hint`; the
// argument itself is left untouched. Callers pass the biased form index
// (global index + 1), or ENC_HINT_NONE to clear the hint.
@(require_results)
with_hint :: #force_inline proc "contextless" (inst: Instruction, hint: u16) -> Instruction {
	hinted := inst
	hinted.enc_hint = hint
	return hinted
}
// -----------------------------------------------------------------------------
// SECTION: 7.9 Instruction Builder Helpers
// -----------------------------------------------------------------------------

File diff suppressed because it is too large Load Diff

View File

@@ -3139,6 +3139,101 @@ print_summary :: proc() {
fmt.printf("%s======================================================================%s\n", BOLD, RESET)
}
// =============================================================================
// TYPED BUILDER CONSISTENCY
// =============================================================================
//
// The generated typed builders (inst_<mnem>/emit_<mnem>) were previously
// untested. A class-dropping register cast made every one of them encode to
// nothing, and the pre-matched enc_hint fast path needs guarding. Each case
// asserts the typed builder is byte-identical to the llvm-verified generic
// builder AND that the baked enc_hint matches the matcher path (hint cleared).
@(private="file") tb_a: [64]u8
@(private="file") tb_b: [64]u8
// Encodes a single instruction into `buf` and returns the sub-slice of `buf`
// holding the bytes written. The encoder's second result is discarded here, so
// a failed encode shows up to callers as whatever length `encode` reported
// (tb_eq treats an empty result as a mismatch).
@(private="file")
tb_enc :: proc(inst: x86.Instruction, buf: []u8) -> []u8 {
	relocs: [dynamic]x86.Relocation
	errors: [dynamic]x86.Error
	// Deferred statements run in reverse order: relocs is released first,
	// then errors.
	defer delete(errors)
	defer delete(relocs)

	written, _ := x86.encode({inst}, nil, buf, &relocs, &errors)
	return buf[:written]
}
// Byte-wise equality of two encodings. An empty `a` never compares equal, not
// even to another empty slice, so "both encodes produced nothing" is reported
// as a mismatch rather than a pass.
@(private="file")
tb_eq :: proc(a, b: []u8) -> bool {
	if len(a) == 0 {
		return false
	}
	if len(a) != len(b) {
		return false
	}
	for i in 0..<len(a) {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}
// Runs one typed-builder consistency case and records the outcome in g_stats.
//
//   name    - label printed when the case fails
//   typed   - instruction produced by the generated typed builder
//   generic - the same instruction produced by the generic builder
//
// The case passes only when both of the following hold:
//   1. the typed encoding is byte-identical to the generic encoding, and
//   2. the typed encoding is byte-identical to the encoding of the same
//      instruction with enc_hint cleared (hint path == matcher path).
@(private="file")
tb_check :: proc(name: string, typed, generic: x86.Instruction) {
	t := tb_enc(typed, tb_a[:])
	g := tb_enc(generic, tb_b[:])
	typed_ok := tb_eq(t, g)

	// The hint path must equal the matcher path for the very same instruction.
	// Encode the cleared copy into its own scratch buffer: `g` is a view into
	// tb_b, so reusing tb_b here would overwrite the generic bytes and the FAIL
	// line below would print the matcher-path bytes under the `generic=` label.
	cleared := typed
	cleared.enc_hint = 0
	matcher_buf: [64]u8 // same capacity as tb_a / tb_b
	hint_ok := tb_eq(t, tb_enc(cleared, matcher_buf[:]))

	if typed_ok && hint_ok {
		g_stats.passed += 1
		g_stats.cases_validated += 1
	} else {
		g_stats.failed += 1
		fmt.printf(" %sFAIL%s %s: typed=% x generic=% x (typed_ok=%v hint_ok=%v)\n",
			RED, RESET, name, t, g, typed_ok, hint_ok)
	}
}
// Typed-builder consistency suite: every row pairs a generated typed builder
// with the generic builder for the same instruction, and each row is handed to
// tb_check in table order.
run_typed_builder_tests :: proc() {
	// Memory operands shared by the reg-mem / mem-reg rows.
	disp8_mem  := x86.mem_base_disp(x86.RBP, -16)
	disp32_mem := x86.mem_base_disp(x86.RCX, 100000)
	sib_mem    := x86.mem_base_index_disp(x86.R8, x86.RDX, 4, 32)
	rip_mem    := x86.mem_rip_disp(0)

	Typed_Case :: struct {
		name:    string,
		typed:   x86.Instruction,
		generic: x86.Instruction,
	}

	cases := [?]Typed_Case{
		// GPR reg-reg, every size (r16 exercises the 66h class-dependent prefix)
		{"mov r8,r8",   x86.inst_mov_r8_r8(.AL, .BL),       x86.inst_r_r(.MOV, x86.AL, x86.BL)},
		{"mov r16,r16", x86.inst_mov_r16_r16(.AX, .BX),     x86.inst_r_r(.MOV, x86.AX, x86.BX)},
		{"mov r32,r32", x86.inst_mov_r32_r32(.EAX, .EDX),   x86.inst_r_r(.MOV, x86.EAX, x86.EDX)},
		{"mov r64,r64", x86.inst_mov_r64_r64(.RAX, .RBX),   x86.inst_r_r(.MOV, x86.RAX, x86.RBX)},
		{"mov r64 ext", x86.inst_mov_r64_r64(.R8, .R15),    x86.inst_r_r(.MOV, x86.R8, x86.R15)},
		{"mov r32 ext", x86.inst_mov_r32_r32(.R9D, .R10D),  x86.inst_r_r(.MOV, x86.R9D, x86.R10D)},

		// GPR arithmetic/logical reg-reg
		{"add r64,r64", x86.inst_add_r64_r64(.RAX, .RCX),   x86.inst_r_r(.ADD, x86.RAX, x86.RCX)},
		{"sub r64,r64", x86.inst_sub_r64_r64(.RSI, .RDI),   x86.inst_r_r(.SUB, x86.RSI, x86.RDI)},
		{"and r64,r64", x86.inst_and_r64_r64(.RBX, .RAX),   x86.inst_r_r(.AND, x86.RBX, x86.RAX)},
		{"or r64,r64",  x86.inst_or_r64_r64(.RBX, .RAX),    x86.inst_r_r(.OR, x86.RBX, x86.RAX)},
		{"xor r64,r64", x86.inst_xor_r64_r64(.R8, .R8),     x86.inst_r_r(.XOR, x86.R8, x86.R8)},
		{"cmp r64,r64", x86.inst_cmp_r64_r64(.RAX, .RDX),   x86.inst_r_r(.CMP, x86.RAX, x86.RDX)},
		{"add r32,r32", x86.inst_add_r32_r32(.EAX, .ECX),   x86.inst_r_r(.ADD, x86.EAX, x86.ECX)},

		// GPR reg-mem / mem-reg across addressing modes
		{"mov r64,[d8]",  x86.inst_mov_r64_m64(.RDX, x86.Mem64{disp8_mem}),  x86.inst_r_m(.MOV, x86.RDX, disp8_mem, 8)},
		{"mov r64,[d32]", x86.inst_mov_r64_m64(.RAX, x86.Mem64{disp32_mem}), x86.inst_r_m(.MOV, x86.RAX, disp32_mem, 8)},
		{"mov r64,[b+i]", x86.inst_mov_r64_m64(.RAX, x86.Mem64{sib_mem}),    x86.inst_r_m(.MOV, x86.RAX, sib_mem, 8)},
		{"mov r64,[rip]", x86.inst_mov_r64_m64(.RAX, x86.Mem64{rip_mem}),    x86.inst_r_m(.MOV, x86.RAX, rip_mem, 8)},
		{"mov [b+i],r64", x86.inst_mov_m64_r64(x86.Mem64{sib_mem}, .R9),     x86.inst_m_r(.MOV, sib_mem, 8, x86.R9)},
		{"add r64,[d8]",  x86.inst_add_r64_m64(.RAX, x86.Mem64{disp8_mem}),  x86.inst_r_m(.ADD, x86.RAX, disp8_mem, 8)},

		// SSE (legacy) + VEX vector
		{"movaps x,x",   x86.inst_movaps_xmm_xmm(.XMM0, .XMM1),                x86.inst_r_r(.MOVAPS, x86.XMM0, x86.XMM1)},
		{"movaps x,m",   x86.inst_movaps_xmm_m128(.XMM3, x86.Mem128{sib_mem}), x86.inst_r_m(.MOVAPS, x86.XMM3, sib_mem, 16)},
		{"movaps m,x",   x86.inst_movaps_m128_xmm(x86.Mem128{sib_mem}, .XMM8), x86.inst_m_r(.MOVAPS, sib_mem, 16, x86.XMM8)},
		{"addps x,x",    x86.inst_addps_xmm_xmm(.XMM2, .XMM4),                 x86.inst_r_r(.ADDPS, x86.XMM2, x86.XMM4)},
		{"vaddps y,y,y", x86.inst_vaddps_ymm_ymm_ymm(.YMM0, .YMM1, .YMM2),     x86.inst_r_r_r(.VADDPS, x86.YMM0, x86.YMM1, x86.YMM2)},
		{"vaddps y ext", x86.inst_vaddps_ymm_ymm_ymm(.YMM8, .YMM12, .YMM15),   x86.inst_r_r_r(.VADDPS, x86.YMM8, x86.YMM12, x86.YMM15)},
		{"vmulps x,x,x", x86.inst_vmulps_xmm_xmm_xmm(.XMM0, .XMM1, .XMM2),     x86.inst_r_r_r(.VMULPS, x86.XMM0, x86.XMM1, x86.XMM2)},

		// opcode+reg
		{"push r64", x86.inst_push_r64(.R11), x86.inst_r(.PUSH, x86.R11)},
		{"pop r64",  x86.inst_pop_r64(.R12),  x86.inst_r(.POP, x86.R12)},

		// immediate forms (no hint -- value-dependent; must still be correct)
		{"mov r32,imm32", x86.inst_mov_r32_imm32(.EAX, 0x12345678),         x86.inst_r_i(.MOV, x86.EAX, 0x12345678, 4)},
		{"mov r64,imm64", x86.inst_mov_r64_imm64(.RAX, 0x1122334455667788), x86.inst_r_i(.MOV, x86.RAX, 0x1122334455667788, 8)},
	}

	for c in cases {
		tb_check(c.name, c.typed, c.generic)
	}
}
// =============================================================================
// MAIN
// =============================================================================
@@ -3187,6 +3282,9 @@ main :: proc() {
log_header("LABEL_MAP TESTS")
run_label_map_tests()
log_header("TYPED BUILDER CONSISTENCY")
run_typed_builder_tests()
log_header("PERFORMANCE BENCHMARKS")
run_benchmarks()

View File

@@ -42,6 +42,7 @@ Proc_Entry :: struct {
mnemonic: x86.Mnemonic,
sig: Operand_Signature,
proc_name: string,
enc_hint: u16, // biased global form index (idx+1) for the pre-match fast path; 0 = none
}
GEN_ATTRIB :: "// rexcode · Brendan Punsky (dotbmp@github), original author\n\n"
@@ -74,10 +75,21 @@ main :: proc() {
encodings := x86.ENCODE_FORMS[_run.start:][:_run.count]
if len(encodings) == 0 { continue }
for enc in encodings {
for enc, enc_idx in encodings {
// Skip encodings we can't generate builders for (implicit-only operands, etc.)
can_generate_builder(enc) or_continue
// A typed builder may bake a pre-matched form hint only when the
// matcher's pick is value-INDEPENDENT (no immediate/relative size
// selection); otherwise the matcher might choose a shorter form for
// some values, so we leave enc_hint=0 (matcher path). The first form
// in run order that produces a given proc_name wins the dedup below,
// which mirrors the matcher's first-match-in-run-order pick.
hint: u16 = 0
if form_is_hintable(enc) {
hint = u16(int(_run.start) + enc_idx + 1)
}
// For RM operands, generate both register and memory variants
variants := get_operand_variants(enc)
@@ -95,6 +107,7 @@ main :: proc() {
mnemonic = mnemonic,
sig = sig,
proc_name = proc_name,
enc_hint = hint,
}
if mnemonic not_in procs_by_mnemonic {
@@ -328,6 +341,21 @@ can_generate_builder :: proc(enc: x86.Encoding) -> bool {
return !has_any_operand || has_explicit
}
// Reports whether an encoding form may have its index baked into a typed
// builder as an enc_hint. A form qualifies only if none of its operands is an
// immediate or a relative offset: for those, the matcher chooses between
// imm8/imm32 (or rel8/rel32) forms by the runtime value, so a pre-baked form
// could differ from the matcher's pick (and from llvm-mc). Forms that fail this
// test keep enc_hint=0 and stay on the matcher path.
form_is_hintable :: proc(enc: x86.Encoding) -> bool {
	for kind in enc.ops {
		is_immediate := kind == .IMM8 || kind == .IMM16 || kind == .IMM32 || kind == .IMM64 || kind == .IMM8SX
		is_relative  := kind == .REL8 || kind == .REL32
		if is_immediate || is_relative {
			return false
		}
	}
	return true
}
// Get all variants for an encoding (expands RM operands into reg and mem variants)
get_operand_variants :: proc(enc: x86.Encoding) -> []Operand_Signature {
result: [dynamic]Operand_Signature
@@ -1366,7 +1394,18 @@ generate_proc :: proc(sb: ^strings.Builder, entry: Proc_Entry, max_name_padding:
strings.write_string(sb, " :: #force_inline proc \"contextless\" (")
strings.write_string(sb, params)
strings.write_string(sb, ") -> Instruction { return ")
generate_helper_call(sb, entry)
// Build via the typed op_* constructors (op_gpr64/op_xmm/...), which carry
// the register CLASS. The older inst_r_r(.., Register(dst), ..) shortcut cast
// the hw-only typed enum straight to Register and dropped the class, so every
// typed builder produced a class-0 operand the matcher rejected (encode -> empty).
if entry.enc_hint != 0 {
// Pre-matched form: bake the biased global index so encode() skips the scan.
strings.write_string(sb, "with_hint(")
generate_fallback_instruction(sb, entry)
fmt.sbprintf(sb, ", %d)", entry.enc_hint)
} else {
generate_fallback_instruction(sb, entry)
}
strings.write_string(sb, " }\n")
}
@@ -1402,7 +1441,14 @@ generate_emit_proc :: proc(sb: ^strings.Builder, entry: Proc_Entry, max_name_pad
}
strings.write_string(sb, " :: #force_inline proc(")
strings.write_string(sb, params)
strings.write_string(sb, ") { ")
generate_emit_helper_call(sb, entry)
strings.write_string(sb, " }\n")
// Reuse the (class-correct, hint-baked) inst_ builder rather than re-emitting
// the operands -- keeps emit_ in lockstep with inst_ and inherits the hint.
strings.write_string(sb, ") { append(instructions, ")
strings.write_string(sb, entry.proc_name)
strings.write_string(sb, "(")
for i in 0..<sig.count {
if i > 0 { strings.write_string(sb, ", ") }
strings.write_string(sb, names[i])
}
strings.write_string(sb, ")) }\n")
}