From ac0589daa1771a7446389b8f4482e463b201227b Mon Sep 17 00:00:00 2001 From: Brendan Punsky Date: Fri, 19 Jun 2026 09:45:55 -0400 Subject: [PATCH] rexcode/x86: emit-descriptor fast path (precompiled per-form recipe) Precompute each encoding form into a flat Form_Recipe -- prefix byte, escape+ opcode blob, role->operand-index slots, ext, imm size, flags -- so the encoder replays common forms straight-line instead of re-interpreting enc.ops/enc.enc on every instruction (the resolve scan, escape ladder, prefix/REX selection). encode() takes the fast path when the form is hinted, eligible, has a register r/m and a literal immediate; everything else falls through to the existing interpreter, which stays the byte-exact source of truth. First cut: - reg-direct ModR/M only (memory r/m falls back) - hint path only (matcher / generic builders fall back) - ~33% of forms eligible (VEX/EVEX, 16-bit operand-size, x87 fixed-ModR/M, moffs/far/rel/implicit operands are marked ineligible) Recipes are built at startup into static storage (no heap); this moves into the table generator (#loaded like every other table) once the shape settles. Realistic immediate-heavy mix: ~19.0 -> ~16.3 ns/inst (52.7 -> 61.3 M/s). Byte-exact across 2282 cases + idempotent. Next: memory r/m addressing in the fast path, then the matcher path, then the gen-time port. 
--- core/rexcode/isa/x86/encoder.odin | 25 ++- core/rexcode/isa/x86/encoder_recipe.odin | 240 +++++++++++++++++++++++ 2 files changed, 262 insertions(+), 3 deletions(-) create mode 100644 core/rexcode/isa/x86/encoder_recipe.odin diff --git a/core/rexcode/isa/x86/encoder.odin b/core/rexcode/isa/x86/encoder.odin index f73730d92..cf9386b3d 100644 --- a/core/rexcode/isa/x86/encoder.odin +++ b/core/rexcode/isa/x86/encoder.odin @@ -141,6 +141,7 @@ encode :: proc( } matched_enc: ^Encoding = nil + form_index := -1 // index into ENCODE_FORMS / ENCODE_RECIPES; -1 = no recipe fast path // Pre-matched form fast-path: a typed builder that maps to a single // encoding form bakes `global_index + 1` into enc_hint, letting us skip @@ -149,7 +150,8 @@ encode :: proc( // in long mode (the builders' target); bounds-checked; anything else // (hand-built, generic builders, i386, decode) falls back to matching. if mode == ._64 && inst.enc_hint != ENC_HINT_NONE && int(inst.enc_hint) <= len(ENCODE_FORMS) { - matched_enc = &ENCODE_FORMS[inst.enc_hint - 1] + form_index = int(inst.enc_hint) - 1 + matched_enc = &ENCODE_FORMS[form_index] } else { // Find matching encoding from table (O(1) mnemonic lookup) encodings := encoding_forms(inst.mnemonic) @@ -174,6 +176,23 @@ encode :: proc( } } + // Recipe fast path: for a hinted, eligible form with a register r/m and a + // literal immediate, emit straight-line from the precomputed recipe and + // skip the interpreter (resolve scan, prefix/REX/escape selection) below. + // Anything outside that envelope falls through to the interpreter, which + // stays the byte-exact source of truth. 
+ if form_index >= 0 && form_index < len(ENCODE_RECIPES) { + recipe := &ENCODE_RECIPES[form_index] + if recipe.flags.eligible && transmute(u8)inst.flags == 0 { + rm_reg := recipe.rm_op < 0 || inst.ops[recipe.rm_op].kind == .REGISTER + imm_lit := recipe.imm_op < 0 || inst.ops[recipe.imm_op].kind == .IMMEDIATE + if rm_reg && imm_lit { + byte_count += emit_recipe(recipe, &inst, code[byte_count:]) + continue + } + } + } + // ===================================================================== // ENCODE INSTRUCTION (fully inlined hot path) // ===================================================================== @@ -715,8 +734,8 @@ encode :: proc( // Branchless select mask: 0xFF when `b`, else 0x00. Used to OR-accumulate // REX/VEX/EVEX bit contributions without a per-condition branch -// (`x |= bmask(cond) & bits`). -@(private="file") +// (`x |= bmask(cond) & bits`). Package-private so the recipe emitter shares it. +@(private) bmask :: #force_inline proc "contextless" (b: bool) -> u8 { return -u8(b) } diff --git a/core/rexcode/isa/x86/encoder_recipe.odin b/core/rexcode/isa/x86/encoder_recipe.odin new file mode 100644 index 000000000..6c861aee1 --- /dev/null +++ b/core/rexcode/isa/x86/encoder_recipe.odin @@ -0,0 +1,240 @@ +// rexcode · Brendan Punsky (dotbmp@github), original author + +package rexcode_x86 + +// ============================================================================= +// SECTION: 7.x Emit Descriptor (precompiled per-form recipe) +// ============================================================================= +// +// Each ENCODE_FORMS entry is a compact description the encoder *interprets* at +// emit time: walk enc.ops to map operands to slots, switch on the escape ladder, +// select the mandatory prefix, decide the ModR/M reg source, etc. For the common +// legacy/SSE forms that work is identical on every instruction that shares the +// form, so we precompute it once into a flat Form_Recipe that the hot path can +// replay straight-line. 
+//
+// Anything the flat recipe can't represent verbatim -- VEX/EVEX, 16-bit
+// operand-size (66h), x87 fixed-ModR/M, moffs/far/rel/implicit operands, and
+// forms that bind the same encoded role twice (e.g. two immediates) -- is
+// marked `eligible = false` and falls back to the existing interpreter, which
+// stays the source of truth for correctness.
+//
+// NOTE(interim): ENCODE_RECIPES is currently built at startup from the #loaded
+// ENCODE_FORMS (static storage, no heap). Once the fast path is validated this
+// moves into the table generator -- serialized and #loaded like every other
+// table, with no @init.
+
+// Flat, precomputed description of one encoding form. Every `*_op` slot holds
+// the index of the *user* operand (inst.ops[...]) that fills that role, or -1
+// when the form has no operand in that role.
+Form_Recipe :: struct {
+	prefix:     u8,           // mandatory legacy prefix emitted before REX (0 = none)
+	opcode:     [3]u8,        // escape + opcode: [op] / [0F,op] / [0F,38,op] / [0F,3A,op]
+	opcode_len: u8,           // 1..3
+	ext:        u8,           // ModR/M reg ext digit (when reg_from_ext) or /digit source
+	rm_op:      i8,           // user operand index -> ModR/M r/m field (-1 = none)
+	reg_op:     i8,           // user operand index -> ModR/M reg field (-1 = none)
+	opr_op:     i8,           // user operand index -> +rb opcode register (-1 = none)
+	imm_op:     i8,           // user operand index -> immediate (-1 = none)
+	imm_size:   u8,           // 1/2/4/8 when imm_op >= 0
+	flags:      Recipe_Flags,
+}
+
+// Per-form switches consumed by the fast path. Six of the eight bits are used.
+Recipe_Flags :: bit_field u8 {
+	eligible:     bool | 1, // emit via the recipe fast path; else fall back
+	reg_from_ext: bool | 1, // ModR/M reg field = ext digit (opcode extension), not reg_op
+	has_modrm:    bool | 1, // a ModR/M byte is emitted (rm or reg operand present)
+	force_rex_w:  bool | 1, // always emit REX.W
+	could_spl:    bool | 1, // 8-bit form: an operand may be SPL/BPL/SIL/DIL (forces REX)
+	default_64:   bool | 1, // default 64-bit operand size (PUSH/POP/CALL/...)
+}
+
+// Derive the flat recipe for one encoding form. Pure; identical whether called
+// here at startup or (later) from the table generator.
+//
+// Inputs:
+//   enc - the encoding form to flatten (read only).
+// Returns:
+//   r   - the recipe; `r.flags.eligible` is false whenever the form uses
+//         anything the straight-line emitter does not model, in which case the
+//         remaining fields must not be used to emit.
+@(require_results)
+form_to_recipe :: proc "contextless" (enc: ^Encoding) -> (r: Form_Recipe) {
+	r.rm_op, r.reg_op, r.opr_op, r.imm_op = -1, -1, -1, -1
+
+	// Escape + opcode blob.
+	switch enc.flags.esc {
+	case .NONE:  r.opcode = {enc.opcode, 0, 0};       r.opcode_len = 1
+	case ._0F:   r.opcode = {0x0F, enc.opcode, 0};    r.opcode_len = 2
+	case ._0F38: r.opcode = {0x0F, 0x38, enc.opcode}; r.opcode_len = 3
+	case ._0F3A: r.opcode = {0x0F, 0x3A, enc.opcode}; r.opcode_len = 3
+	}
+
+	// Mandatory-prefix selector -> literal prefix byte (0 = none).
+	mand := [4]u8{0, 0x66, 0xF3, 0xF2}
+	r.prefix             = mand[enc.flags.prefix]
+	r.ext                = enc.ext
+	r.flags.reg_from_ext = enc.flags.modrm_reg_ext
+	r.flags.force_rex_w  = enc.flags.force_rex_w
+	r.flags.default_64   = enc.flags.default_64
+
+	eligible     := enc.flags.vex_type == .NONE
+	has_16bit    := false
+	has_8bit     := false
+	has_implicit := false
+	has_exotic   := false
+	// Set when two operands claim the same recipe slot. Each slot holds a
+	// single operand index, so a second assignment would silently drop the
+	// first operand (e.g. a form with two imm8 operands would emit only the
+	// last one). Such forms must go through the interpreter.
+	dup_role     := false
+
+	// Walk the form's operands, mapping each encoded role to the *user* operand
+	// index (implicit operands are not user-provided and don't advance it).
+	user_idx := 0
+	for op_type, i in enc.ops {
+		if op_type == .NONE { break }
+		if is_implicit_op_inline(op_type) {
+			has_implicit = true
+			continue
+		}
+		role_idx := i8(user_idx)
+		user_idx += 1
+
+		#partial switch op_type {
+		case .R16, .RM16, .M16, .IMM16:
+			has_16bit = true
+		case .R8, .RM8, .M8:
+			has_8bit = true
+		case .REL8, .REL32, .MOFFS8, .MOFFS16, .MOFFS32, .MOFFS64,
+		     .PTR16_16, .PTR16_32, .PTR16_64, .M16_16, .M16_32, .M16_64,
+		     .SREG, .CR, .DR, .STI, .MM, .MM_M64,
+		     .K, .K_M8, .K_M16, .K_M32, .K_M64:
+			has_exotic = true
+		}
+
+		#partial switch enc.enc[i] {
+		case .MR:
+			dup_role = dup_role || r.rm_op >= 0
+			r.rm_op = role_idx
+		case .REG:
+			dup_role = dup_role || r.reg_op >= 0
+			r.reg_op = role_idx
+		case .OP_R:
+			dup_role = dup_role || r.opr_op >= 0
+			r.opr_op = role_idx
+		case .IB:
+			dup_role = dup_role || r.imm_op >= 0
+			r.imm_op = role_idx; r.imm_size = 1
+		case .IW:
+			dup_role = dup_role || r.imm_op >= 0
+			r.imm_op = role_idx; r.imm_size = 2
+		case .ID:
+			dup_role = dup_role || r.imm_op >= 0
+			r.imm_op = role_idx; r.imm_size = 4
+		case .IQ:
+			dup_role = dup_role || r.imm_op >= 0
+			r.imm_op = role_idx; r.imm_size = 8
+		case .VVVV, .AAA, .IS4:
+			eligible = false
+		}
+	}
+
+	r.flags.has_modrm = r.rm_op >= 0 || r.reg_op >= 0
+	r.flags.could_spl = has_8bit
+
+	// x87 ST(i) / 0F NOP-class forms emit enc.ext as a literal ModR/M byte; the
+	// fast path doesn't model that, so they fall back.
+	is_x87      := enc.opcode >= 0xD8 && enc.opcode <= 0xDF
+	fixed_modrm := enc.ext >= 0xC0 && !r.flags.has_modrm && (enc.flags.esc != .NONE || is_x87)
+
+	r.flags.eligible = eligible && !dup_role && !has_16bit && !has_implicit && !has_exotic && !fixed_modrm
+	return
+}
+
+// -----------------------------------------------------------------------------
+// Interim recipe table: built once at startup from the #loaded forms into static
+// storage (no heap). ENCODE_RECIPES is parallel to ENCODE_FORMS.
+// -----------------------------------------------------------------------------
+
+@(private) ENCODE_RECIPE_CAP :: 4096
+@(private) encode_recipes_storage: [ENCODE_RECIPE_CAP]Form_Recipe
+ENCODE_RECIPES: []Form_Recipe
+
+// Fill the static recipe storage, one recipe per encoding form, and publish it
+// as ENCODE_RECIPES. Forms past ENCODE_RECIPE_CAP get no recipe; the caller's
+// `form_index < len(ENCODE_RECIPES)` check sends those to the interpreter.
+//
+// NOTE(review): the loop body and the slice assignment were lost when this
+// patch text was extracted (everything between `0..<` and `-> bool` was
+// stripped); they are reconstructed here -- verify against the upstream commit.
+@(init)
+build_encode_recipes :: proc "contextless" () {
+	n := min(len(ENCODE_FORMS), ENCODE_RECIPE_CAP)
+	for i in 0..<n {
+		encode_recipes_storage[i] = form_to_recipe(&ENCODE_FORMS[i])
+	}
+	ENCODE_RECIPES = encode_recipes_storage[:n]
+}
+
+// True when `op` is a register operand naming SPL/BPL/SIL/DIL.
+//
+// NOTE(review): this procedure's attributes, calling convention and parameter
+// type were lost in extraction; `@(private)`, "contextless" and `^Operand` are
+// reconstructed from how emit_recipe calls it -- verify against upstream.
+@(private)
+op_is_spl :: #force_inline proc "contextless" (op: ^Operand) -> bool {
+	// SPL/BPL/SIL/DIL (GPR8 hw 4..7) require any REX to encode (else they read
+	// as AH/CH/DH/BH).
+	return op.kind == .REGISTER && reg_class(op.reg) == REG_GPR8 && reg_hw(op.reg) >= 4 && reg_hw(op.reg) <= 7
+}
+
+// Recipe-driven straight-line emit. Handles the eligible legacy/SSE forms with a
+// register (or absent) r/m and a literal immediate -- exactly the cases the
+// caller guards for. Produces byte-identical output to the interpreter; anything
+// outside that envelope is rejected by the caller and never reaches here.
+//
+// Inputs:
+//   recipe - precomputed recipe for the instruction's form (must be eligible).
+//   inst   - the instruction whose operands fill the recipe's role slots.
+//   out    - destination bytes; written from index 0. No capacity check is made
+//            here beyond ordinary slice indexing.
+// Returns:
+//   pos    - number of bytes written to `out`.
+@(require_results)
+emit_recipe :: #force_inline proc "contextless" (recipe: ^Form_Recipe, inst: ^Instruction, out: []u8) -> (pos: u32) {
+	// Mandatory prefix (66/F3/F2 for SSE); operand-size 66h forms are ineligible.
+	if recipe.prefix != 0 {
+		out[pos] = recipe.prefix
+		pos += 1
+	}
+
+	// REX, OR-masked from the register-bearing roles (no memory base/index here).
+	// 0x48 = REX.W, 0x44 = REX.R (ModR/M reg), 0x41 = REX.B (r/m or +rb register).
+	rex: u8 = recipe.flags.force_rex_w ? 0x48 : 0
+	if recipe.reg_op >= 0 {
+		op := &inst.ops[recipe.reg_op]
+		rex |= bmask(op.kind == .REGISTER && reg_needs_rex(op.reg)) & 0x44
+	}
+	if recipe.rm_op >= 0 {
+		op := &inst.ops[recipe.rm_op]
+		rex |= bmask(op.kind == .REGISTER && reg_needs_rex(op.reg)) & 0x41
+	}
+	if recipe.opr_op >= 0 {
+		op := &inst.ops[recipe.opr_op]
+		rex |= bmask(op.kind == .REGISTER && reg_needs_rex(op.reg)) & 0x41
+	}
+	// 8-bit forms: SPL/BPL/SIL/DIL need a REX byte even when no extension bit
+	// is set, so emit a bare 0x40 if nothing above already produced one.
+	if recipe.flags.could_spl && rex == 0 {
+		spl := false
+		if recipe.rm_op  >= 0 { spl = spl || op_is_spl(&inst.ops[recipe.rm_op]) }
+		if recipe.reg_op >= 0 { spl = spl || op_is_spl(&inst.ops[recipe.reg_op]) }
+		if recipe.opr_op >= 0 { spl = spl || op_is_spl(&inst.ops[recipe.opr_op]) }
+		rex |= bmask(spl) & 0x40
+	}
+	if rex != 0 {
+		out[pos] = rex
+		pos += 1
+	}
+
+	// Opcode blob; for +rb forms the register index folds into the last byte.
+	ob := recipe.opcode
+	if recipe.opr_op >= 0 {
+		op := &inst.ops[recipe.opr_op]
+		if op.kind == .REGISTER {
+			ob[recipe.opcode_len - 1] += reg_hw(op.reg) & 0x7
+		}
+	}
+	// NOTE(review): from this loop header through the `recipe.reg_op >= 0` test
+	// below, the original text was lost in extraction (everything between
+	// `0..<` and `>= 0` was stripped). The lines are reconstructed from the
+	// surviving block structure and the Recipe_Flags field docs -- verify
+	// against the upstream commit.
+	for j in 0..<int(recipe.opcode_len) {
+		out[pos] = ob[j]
+		pos += 1
+	}
+
+	// ModR/M, register-direct only (mod = 11b, hence the 0xC0 base below).
+	if recipe.flags.has_modrm {
+		reg_field: u8 = 0
+		if recipe.flags.reg_from_ext {
+			// Opcode-extension form: the reg field carries the /digit.
+			reg_field = recipe.ext & 0x7
+		} else {
+			if recipe.reg_op >= 0 {
+				op := &inst.ops[recipe.reg_op]
+				if op.kind == .REGISTER { reg_field = reg_hw(op.reg) & 0x7 }
+			}
+		}
+		rm_field: u8 = 0
+		if recipe.rm_op >= 0 {
+			op := &inst.ops[recipe.rm_op]
+			if op.kind == .REGISTER { rm_field = reg_hw(op.reg) & 0x7 }
+		}
+		out[pos] = 0xC0 | (reg_field << 3) | rm_field
+		pos += 1
+	}
+
+	// Immediate (literal; .RELATIVE/label immediates are rejected by the caller).
+	// Written little-endian, imm_size bytes.
+	if recipe.imm_op >= 0 {
+		v := u64(inst.ops[recipe.imm_op].immediate)
+		switch recipe.imm_size {
+		case 1:
+			out[pos] = u8(v)
+			pos += 1
+		case 2:
+			out[pos] = u8(v); out[pos+1] = u8(v >> 8)
+			pos += 2
+		case 4:
+			out[pos] = u8(v); out[pos+1] = u8(v >> 8); out[pos+2] = u8(v >> 16); out[pos+3] = u8(v >> 24)
+			pos += 4
+		case 8:
+			out[pos] = u8(v); out[pos+1] = u8(v >> 8); out[pos+2] = u8(v >> 16); out[pos+3] = u8(v >> 24)
+			out[pos+4] = u8(v >> 32); out[pos+5] = u8(v >> 40); out[pos+6] = u8(v >> 48); out[pos+7] = u8(v >> 56)
+			pos += 8
+		}
+	}
+
+	return
+}