From 89ef64c49007cb3f9e8335277e5deac211200746 Mon Sep 17 00:00:00 2001 From: Brendan Punsky Date: Fri, 19 Jun 2026 10:48:20 -0400 Subject: [PATCH] rexcode/x86: form-match memoization cache for the matcher path The matcher path (generic builders, hand-built, decode->re-encode) resolves an instruction to its encoding form by linearly scanning the forms for its mnemonic and operand-matching each -- the dominant cost on that path. Memoize it: pack (mnemonic, per-operand shape) into a key (immediates folded to the smallest size class they fit, matching imm_matches_inline) and cache key -> form so a repeated instruction shape skips the scan. Direct-mapped, fixed 8192-slot table (64 KB, no allocation). Each slot packs the full 48-bit key and form index into one u64, read/written with relaxed atomics, so concurrent encode() stays safe -- a reader sees a matching key or rescans, never a torn entry. The scan stays the source of truth (a miss runs it and records the result), so the cache is exact. Lookup + scan live in a non-inlined find_form() so they don't bloat encode()'s hot loop and slow the hint path that shares it. (Routing the matcher path through the recipe emit was tried and dropped: it costs the hint path ~1.2-1.5 ns however isolated -- the hot loop is too codegen-sensitive -- while the cache alone is free for the hint path.) Realistic generic-builder mix: matcher ~52 -> ~35 ns/inst (~1.49x); hint path unchanged. Byte-exact across 2282 + idempotent. 
--- core/rexcode/isa/x86/encoder.odin | 42 +++---- core/rexcode/isa/x86/encoder_match.odin | 159 ++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 25 deletions(-) create mode 100644 core/rexcode/isa/x86/encoder_match.odin diff --git a/core/rexcode/isa/x86/encoder.odin b/core/rexcode/isa/x86/encoder.odin index 8764eb656..6dfeeff0b 100644 --- a/core/rexcode/isa/x86/encoder.odin +++ b/core/rexcode/isa/x86/encoder.odin @@ -153,39 +153,31 @@ encode :: proc( form_index = int(inst.enc_hint) - 1 matched_enc = &ENCODE_FORMS[form_index] } else { - // Find matching encoding from table (O(1) mnemonic lookup) - encodings := encoding_forms(inst.mnemonic) - if len(encodings) == 0 { - append(errors, Error{u32(instruction_index), .INVALID_MNEMONIC, {}}) - ok = false - continue - } - - // Find the first encoding that matches operands - for &e in encodings { - if encoding_matches_inline(&inst, &e, mode) { - matched_enc = &e - break - } - } - - if matched_enc == nil { - append(errors, Error{u32(instruction_index), .NO_MATCHING_ENCODING, {}}) + // Resolve the form on the matcher path (memoizing cache + scan) in a + // separate, non-inlined proc so this hot loop stays lean for the hint + // path above. form_index is discarded: the matcher path uses the + // interpreter, not the recipe -- keeping it off the shared fast-path + // hook leaves the hint path's codegen untouched. + err: Error_Code + matched_enc, _, err = find_form(&inst, mode) + if err != .NONE { + append(errors, Error{u32(instruction_index), err, {}}) ok = false continue } } - // Recipe fast path: for a hinted, eligible form with a register r/m and a - // literal immediate, emit straight-line from the precomputed recipe and - // skip the interpreter (resolve scan, prefix/REX/escape selection) below. - // Anything outside that envelope falls through to the interpreter, which - // stays the byte-exact source of truth. 
+ // Recipe fast path (hint path only): a hinted, eligible form with a literal + // immediate emits straight-line from the precomputed recipe, skipping the + // interpreter below (resolve scan, prefix/REX/escape selection, ModR/M + // source choice). Only the hint branch sets form_index; the matcher branch + // leaves it -1 and emits via the interpreter, so this hook -- and the hot + // loop's codegen -- stays exactly what it was before the matcher cache. + // Anything outside the envelope falls through to the interpreter, the + // byte-exact source of truth. if form_index >= 0 && form_index < len(ENCODE_RECIPES) { recipe := &ENCODE_RECIPES[form_index] if recipe.flags.eligible && transmute(u8)inst.flags == 0 { - // r/m may now be a register or a memory operand; only a - // label/relative immediate (a relocation) still falls back. imm_lit := recipe.imm_op < 0 || inst.ops[recipe.imm_op].kind == .IMMEDIATE if imm_lit { byte_count += emit_recipe(recipe, &inst, code[byte_count:]) diff --git a/core/rexcode/isa/x86/encoder_match.odin b/core/rexcode/isa/x86/encoder_match.odin new file mode 100644 index 000000000..c198635c6 --- /dev/null +++ b/core/rexcode/isa/x86/encoder_match.odin @@ -0,0 +1,159 @@ +// rexcode · Brendan Punsky (dotbmp@github), original author + +package rexcode_x86 + +import "base:intrinsics" + +// ============================================================================= +// SECTION: 7.y Form-match memoization cache (long mode) +// ============================================================================= +// +// The matcher path resolves an instruction to its encoding form by linearly +// scanning the forms for its mnemonic and operand-matching each in turn -- the +// dominant cost on that path. This memoizes (mnemonic, operand-shape) -> form so +// a repeated instruction shape skips the scan after the first occurrence. 
+//
+// Exactness. The linear scan stays authoritative: a miss runs the scan and
+// records whatever it found, so a hit can only replay the result of an earlier
+// scan. The key is meant to capture everything the match depends on -- the
+// mnemonic plus each operand's kind and class/size, with immediates folded to
+// the smallest size class that holds them (the distinction imm_matches_inline
+// draws) -- so that two instructions with the same key always match the same
+// form.
+// NOTE(review): that last step holds only if the matcher looks at nothing the
+// key omits (a specific register, an exact immediate value, ...) -- confirm
+// against encoding_matches_inline.
+//
+// Storage. Direct-mapped and fixed-size, so nothing is allocated. A slot is a
+// single u64 holding both the full 48-bit key and the form index, and slots
+// are read and written with relaxed atomics. A reader therefore sees a whole
+// slot -- one whose key matches (use it) or one whose key doesn't (rescan) --
+// never a torn, half-written entry, which keeps concurrent encode() calls
+// safe. A colliding key evicts the previous occupant; the evicted shape just
+// rescans next time.
+
+// log2 of the slot count.
+@(private) MATCH_CACHE_LOG :: 13
+// Slot count: 8192 slots of 8 bytes each = 64 KB.
+@(private) MATCH_CACHE_N :: 1 << MATCH_CACHE_LOG
+// The table itself. A slot is 0 when empty, else (key << 16) | (form_index + 1).
+@(private) MATCH_CACHE: [MATCH_CACHE_N]u64
+
+// Slot index for a key: multiplicative (Fibonacci) hashing -- multiply by
+// 2^64 / golden ratio (wrapping) and keep the top MATCH_CACHE_LOG bits of the
+// product, so the result is always in 0 ..< MATCH_CACHE_N.
+@(private, require_results)
+match_hash :: #force_inline proc "contextless" (key: u64) -> u64 {
+	GOLDEN_MULTIPLIER :: 0x9E3779B97F4A7C15
+	DISCARDED_BITS :: 64 - MATCH_CACHE_LOG
+	product := key * GOLDEN_MULTIPLIER
+	return product >> DISCARDED_BITS
+}
+
+// Size class of an immediate: the index of the first (narrowest) range below
+// that contains v. The ranges are nested the way imm_matches_inline's are, so
+// values that share a class are intended to match the same set of immediate
+// forms.
+//
+//   0  imm8sx   -128 ..= 127
+//   1  imm8     up to 255 (not sign-extended)
+//   2  imm16    -32768 ..= 65535
+//   3  imm32    -2147483648 ..= 4294967295
+//   4  imm64    everything else
+@(private, require_results)
+imm_value_class :: #force_inline proc "contextless" (v: i64) -> u8 {
+	if v >= -128 && v <= 127 { return 0 }
+	if v >= -128 && v <= 255 { return 1 }
+	if v >= -32768 && v <= 65535 { return 2 }
+	if v >= -2147483648 && v <= 4294967295 { return 3 }
+	return 4
+}
+
+// Per-operand shape code: kind in bits 5-7, class/size/value-class in bits 0-4.
+// ok = false means the operand can't be coded cleanly (a non-standard memory
+// size); the caller then skips the cache and scans.
+@(private, require_results)
+op_match_code :: #force_inline proc "contextless" (op: ^Operand) -> (code: u8, ok: bool) {
+	switch op.kind {
+	case .NONE:
+		return 0, true
+	case .REGISTER:
+		// Bits 8-12 of reg_class() fill the low five bits. The individual
+		// register is not part of the code.
+		return (1 << 5) | u8((reg_class(op.reg) >> 8) & 0x1F), true
+	case .MEMORY:
+		// Access size in bytes -> dense index 1..8. Only the size is coded.
+		switch op.size {
+		case 1: return (2 << 5) | 1, true
+		case 2: return (2 << 5) | 2, true
+		case 4: return (2 << 5) | 3, true
+		case 8: return (2 << 5) | 4, true
+		case 10: return (2 << 5) | 5, true
+		case 16: return (2 << 5) | 6, true
+		case 32: return (2 << 5) | 7, true
+		case 64: return (2 << 5) | 8, true
+		}
+		// Any other size has no code: this instruction bypasses the cache.
+		return 0, false
+	case .IMMEDIATE:
+		// Only the size class of the value is coded, not the value itself.
+		return (3 << 5) | imm_value_class(op.immediate), true
+	case .RELATIVE:
+		// Sizes 1 and 4 get their own code; every other size shares code 0.
+		c: u8 = 0
+		switch op.size {
+		case 1: c = 1
+		case 4: c = 2
+		}
+		return (4 << 5) | c, true
+	}
+	// Operand kind without a case above: not cacheable.
+	return 0, false
+}
+
+// Pack (mnemonic, four operand shape codes) into a 48-bit key: the mnemonic
+// (truncated to 16 bits) in bits 0-15, operand i's code in bits 16+8i .. 23+8i.
+// ok = false when any operand can't be coded -> the caller skips the cache for
+// this instruction.
+//
+// NOTE(review): the key holds the register class, memory access size and
+// immediate size class only. It does not hold the specific register, the
+// addressing form, the exact immediate value or inst.flags. A cached form is
+// exact only if encoding_matches_inline ignores those too -- confirm against
+// the matcher (forms tied to one fixed register or one fixed immediate value
+// would break this).
+@(private, require_results)
+match_key :: #force_inline proc "contextless" (inst: ^Instruction) -> (key: u64, ok: bool) {
+	key = u64(u16(inst.mnemonic))
+	for i in 0 ..< 4 {
+		c, code_ok := op_match_code(&inst.ops[i])
+		if !code_ok { return 0, false }
+		key |= u64(c) << uint(16 + i*8)
+	}
+	return key, true
+}
+
+// Probe the slot that key hashes to. hit is true only when the slot is
+// non-empty and the key stored above its low 16 bits equals key; form_index is
+// then the stored low 16 bits minus one. On a miss returns (-1, false).
+@(private, require_results)
+match_cache_get :: #force_inline proc "contextless" (key: u64) -> (form_index: int, hit: bool) {
+	v := intrinsics.atomic_load_explicit(&MATCH_CACHE[match_hash(key)], .Relaxed)
+	if v != 0 && (v >> 16) == key {
+		return int(v & 0xFFFF) - 1, true
+	}
+	return -1, false
+}
+
+// Record key -> form_index in the slot that key hashes to, replacing whatever
+// was there. The slot is written as one u64 with a single relaxed store.
+//
+// The slot keeps form_index + 1 in its low 16 bits, so only indices in
+// 0 ..< 0xFFFF are representable. Anything else is left uncached: OR-ing a
+// wider (or negative) value in would overwrite the stored key bits, and a
+// later lookup could then hit under the wrong key or yield a bad index. The
+// scan still resolves such an instruction on every call.
+@(private)
+match_cache_put :: #force_inline proc "contextless" (key: u64, form_index: int) {
+	if form_index < 0 || form_index >= 0xFFFF { return }
+	intrinsics.atomic_store_explicit(&MATCH_CACHE[match_hash(key)], (key << 16) | u64(form_index + 1), .Relaxed)
+}
+
+// Resolve an instruction to its encoding form on the matcher path: long-mode
+// cache lookup, else the linear scan over the mnemonic's forms, recording the
+// result.
+//
+// Inputs:
+//   inst - the instruction to resolve (mnemonic and operands are read).
+//   mode - the encoding mode; the cache is consulted and filled only for ._64.
+//
+// Returns:
+//   matched_enc - the first form encoding_matches_inline accepts, or nil on error.
+//   form_index  - index of matched_enc in ENCODE_FORMS when mode is ._64,
+//                 otherwise -1 (also -1 on error).
+//   err         - .NONE on success, .INVALID_MNEMONIC when the mnemonic has no
+//                 forms, .NO_MATCHING_ENCODING when no form matches.
+//
+// (form_index is returned but the caller currently discards it: the matcher
+// path uses the interpreter to emit, not the recipe -- putting the recipe emit
+// on this path costs the shared hint loop ~1.2-1.5 ns however it is isolated,
+// and the cache alone already makes the matcher ~1.4x.)
+//
+// Deliberately NOT inlined: pulling the cache + scan out of encode()'s loop keeps
+// the hot function small so the hint path that shares it isn't penalised by code
+// it never runs. The matcher path eats the one call; it is the slow path anyway.
+@(private)
+find_form :: #force_no_inline proc "contextless" (inst: ^Instruction, mode: Mode) -> (matched_enc: ^Encoding, form_index: int, err: Error_Code) {
+	form_index = -1
+
+	// Fast path: a previously seen shape in long mode.
+	cache_key: u64 = 0
+	cacheable := false
+	if mode == ._64 {
+		cache_key, cacheable = match_key(inst)
+		if cacheable {
+			if fi, hit := match_cache_get(cache_key); hit {
+				return &ENCODE_FORMS[fi], fi, .NONE
+			}
+		}
+	}
+
+	// Slow path: the scan, which stays the source of truth.
+	encodings := encoding_forms(inst.mnemonic)
+	if len(encodings) == 0 {
+		return nil, -1, .INVALID_MNEMONIC
+	}
+	for &e in encodings {
+		if encoding_matches_inline(inst, &e, mode) {
+			matched_enc = &e
+			break
+		}
+	}
+	if matched_enc == nil {
+		return nil, -1, .NO_MATCHING_ENCODING
+	}
+
+	if mode == ._64 {
+		// Recover the index from the element address.
+		// NOTE(review): assumes encoding_forms() returns a slice of
+		// ENCODE_FORMS itself -- confirm.
+		form_index = int((uintptr(matched_enc) - uintptr(&ENCODE_FORMS[0])) / size_of(Encoding))
+		if cacheable {
+			match_cache_put(cache_key, form_index)
+		}
+	}
+	return matched_enc, form_index, .NONE
+}