rexcode/x86: emit-descriptor fast path (precompiled per-form recipe)

Precompute each encoding form into a flat Form_Recipe -- prefix byte, escape+
opcode blob, role->operand-index slots, ext, imm size, flags -- so the encoder
replays common forms straight-line instead of re-interpreting enc.ops/enc.enc
on every instruction (the resolve scan, escape ladder, prefix/REX selection).

encode() takes the fast path when the form is hinted, eligible, has a register
r/m and a literal immediate; everything else falls through to the existing
interpreter, which stays the byte-exact source of truth. First cut:
  - reg-direct ModR/M only (memory r/m falls back)
  - hint path only (matcher / generic builders fall back)
  - ~33% of forms eligible (VEX/EVEX, 16-bit operand-size, x87 fixed-ModR/M,
    moffs/far/rel/implicit operands are marked ineligible)

Recipes are built at startup into static storage (no heap); this moves into the
table generator (#loaded like every other table) once the shape settles.

Realistic immediate-heavy mix: ~19.0 -> ~16.3 ns/inst (52.7 -> 61.3 M/s).
Byte-exact across 2282 cases + idempotent.

Next: memory r/m addressing in the fast path, then the matcher path, then the
gen-time port.
This commit is contained in:
Brendan Punsky
2026-06-19 09:45:55 -04:00
committed by Flāvius
parent de0d2ae178
commit ac0589daa1
2 changed files with 262 additions and 3 deletions

View File

@@ -141,6 +141,7 @@ encode :: proc(
}
matched_enc: ^Encoding = nil
form_index := -1 // index into ENCODE_FORMS / ENCODE_RECIPES; -1 = no recipe fast path
// Pre-matched form fast-path: a typed builder that maps to a single
// encoding form bakes `global_index + 1` into enc_hint, letting us skip
@@ -149,7 +150,8 @@ encode :: proc(
// in long mode (the builders' target); bounds-checked; anything else
// (hand-built, generic builders, i386, decode) falls back to matching.
if mode == ._64 && inst.enc_hint != ENC_HINT_NONE && int(inst.enc_hint) <= len(ENCODE_FORMS) {
matched_enc = &ENCODE_FORMS[inst.enc_hint - 1]
form_index = int(inst.enc_hint) - 1
matched_enc = &ENCODE_FORMS[form_index]
} else {
// Find matching encoding from table (O(1) mnemonic lookup)
encodings := encoding_forms(inst.mnemonic)
@@ -174,6 +176,23 @@ encode :: proc(
}
}
// Recipe fast path: for a hinted, eligible form with a register r/m and a
// literal immediate, emit straight-line from the precomputed recipe and
// skip the interpreter (resolve scan, prefix/REX/escape selection) below.
// Anything outside that envelope falls through to the interpreter, which
// stays the byte-exact source of truth.
if form_index >= 0 && form_index < len(ENCODE_RECIPES) {
recipe := &ENCODE_RECIPES[form_index]
if recipe.flags.eligible && transmute(u8)inst.flags == 0 {
rm_reg := recipe.rm_op < 0 || inst.ops[recipe.rm_op].kind == .REGISTER
imm_lit := recipe.imm_op < 0 || inst.ops[recipe.imm_op].kind == .IMMEDIATE
if rm_reg && imm_lit {
byte_count += emit_recipe(recipe, &inst, code[byte_count:])
continue
}
}
}
// =====================================================================
// ENCODE INSTRUCTION (fully inlined hot path)
// =====================================================================
@@ -715,8 +734,8 @@ encode :: proc(
// Branchless select mask: 0xFF when `b`, else 0x00. Used to OR-accumulate
// REX/VEX/EVEX bit contributions without a per-condition branch
// (`x |= bmask(cond) & bits`).
@(private="file")
// (`x |= bmask(cond) & bits`). Package-private so the recipe emitter shares it.
@(private)
bmask :: #force_inline proc "contextless" (b: bool) -> u8 {
return -u8(b)
}

View File

@@ -0,0 +1,240 @@
// rexcode · Brendan Punsky (dotbmp@github), original author
package rexcode_x86
// =============================================================================
// SECTION: 7.x Emit Descriptor (precompiled per-form recipe)
// =============================================================================
//
// Each ENCODE_FORMS entry is a compact description the encoder *interprets* at
// emit time: walk enc.ops to map operands to slots, switch on the escape ladder,
// select the mandatory prefix, decide the ModR/M reg source, etc. For the common
// legacy/SSE forms that work is identical on every instruction that shares the
// form, so we precompute it once into a flat Form_Recipe that the hot path can
// replay straight-line.
//
// Anything the flat recipe can't represent verbatim -- VEX/EVEX, 16-bit
// operand-size (66h), x87 fixed-ModR/M, moffs/far/rel/implicit operands -- is
// marked `eligible = false` and falls back to the existing interpreter, which
// stays the source of truth for correctness.
//
// NOTE(interim): ENCODE_RECIPES is currently built at startup from the #loaded
// ENCODE_FORMS (static storage, no heap). Once the fast path is validated this
// moves into the table generator -- serialized and #loaded like every other
// table, with no @init.
Form_Recipe :: struct {
prefix: u8, // mandatory legacy prefix emitted before REX (0 = none)
opcode: [3]u8, // escape + opcode: [op] / [0F,op] / [0F,38,op] / [0F,3A,op]
opcode_len: u8, // 1..3
ext: u8, // ModR/M reg ext digit (when reg_from_ext) or /digit source
rm_op: i8, // user operand index -> ModR/M r/m field (-1 = none)
reg_op: i8, // user operand index -> ModR/M reg field (-1 = none)
opr_op: i8, // user operand index -> +rb opcode register (-1 = none)
imm_op: i8, // user operand index -> immediate (-1 = none)
imm_size: u8, // 1/2/4/8 when imm_op >= 0
flags: Recipe_Flags,
}
Recipe_Flags :: bit_field u8 {
eligible: bool | 1, // emit via the recipe fast path; else fall back
reg_from_ext: bool | 1, // ModR/M reg field = ext digit (opcode extension), not reg_op
has_modrm: bool | 1, // a ModR/M byte is emitted (rm or reg operand present)
force_rex_w: bool | 1, // always emit REX.W
could_spl: bool | 1, // 8-bit form: an operand may be SPL/BPL/SIL/DIL (forces REX)
default_64: bool | 1, // default 64-bit operand size (PUSH/POP/CALL/...)
}
// Derive the flat recipe for one encoding form. Pure; identical whether called
// here at startup or (later) from the table generator.
@(require_results)
form_to_recipe :: proc "contextless" (enc: ^Encoding) -> (r: Form_Recipe) {
r.rm_op, r.reg_op, r.opr_op, r.imm_op = -1, -1, -1, -1
// Escape + opcode blob.
switch enc.flags.esc {
case .NONE: r.opcode = {enc.opcode, 0, 0}; r.opcode_len = 1
case ._0F: r.opcode = {0x0F, enc.opcode, 0}; r.opcode_len = 2
case ._0F38: r.opcode = {0x0F, 0x38, enc.opcode}; r.opcode_len = 3
case ._0F3A: r.opcode = {0x0F, 0x3A, enc.opcode}; r.opcode_len = 3
}
mand := [4]u8{0, 0x66, 0xF3, 0xF2}
r.prefix = mand[enc.flags.prefix]
r.ext = enc.ext
r.flags.reg_from_ext = enc.flags.modrm_reg_ext
r.flags.force_rex_w = enc.flags.force_rex_w
r.flags.default_64 = enc.flags.default_64
eligible := enc.flags.vex_type == .NONE
has_16bit := false
has_8bit := false
has_implicit := false
has_exotic := false
// Walk the form's operands, mapping each encoded role to the *user* operand
// index (implicit operands are not user-provided and don't advance it).
user_idx := 0
for op_type, i in enc.ops {
if op_type == .NONE { break }
if is_implicit_op_inline(op_type) {
has_implicit = true
continue
}
role_idx := i8(user_idx)
user_idx += 1
#partial switch op_type {
case .R16, .RM16, .M16, .IMM16:
has_16bit = true
case .R8, .RM8, .M8:
has_8bit = true
case .REL8, .REL32, .MOFFS8, .MOFFS16, .MOFFS32, .MOFFS64,
.PTR16_16, .PTR16_32, .PTR16_64, .M16_16, .M16_32, .M16_64,
.SREG, .CR, .DR, .STI, .MM, .MM_M64,
.K, .K_M8, .K_M16, .K_M32, .K_M64:
has_exotic = true
}
#partial switch enc.enc[i] {
case .MR: r.rm_op = role_idx
case .REG: r.reg_op = role_idx
case .OP_R: r.opr_op = role_idx
case .IB: r.imm_op = role_idx; r.imm_size = 1
case .IW: r.imm_op = role_idx; r.imm_size = 2
case .ID: r.imm_op = role_idx; r.imm_size = 4
case .IQ: r.imm_op = role_idx; r.imm_size = 8
case .VVVV, .AAA, .IS4:
eligible = false
}
}
r.flags.has_modrm = r.rm_op >= 0 || r.reg_op >= 0
r.flags.could_spl = has_8bit
// x87 ST(i) / 0F NOP-class forms emit enc.ext as a literal ModR/M byte; the
// fast path doesn't model that, so they fall back.
is_x87 := enc.opcode >= 0xD8 && enc.opcode <= 0xDF
fixed_modrm := enc.ext >= 0xC0 && !r.flags.has_modrm && (enc.flags.esc != .NONE || is_x87)
r.flags.eligible = eligible && !has_16bit && !has_implicit && !has_exotic && !fixed_modrm
return
}
// -----------------------------------------------------------------------------
// Interim recipe table: built once at startup from the #loaded forms into static
// storage (no heap). ENCODE_RECIPES is parallel to ENCODE_FORMS.
// -----------------------------------------------------------------------------
@(private) ENCODE_RECIPE_CAP :: 4096
@(private) encode_recipes_storage: [ENCODE_RECIPE_CAP]Form_Recipe
ENCODE_RECIPES: []Form_Recipe
@(init)
build_encode_recipes :: proc "contextless" () {
n := min(len(ENCODE_FORMS), ENCODE_RECIPE_CAP)
for i in 0..<n {
encode_recipes_storage[i] = form_to_recipe(&ENCODE_FORMS[i])
}
ENCODE_RECIPES = encode_recipes_storage[:n]
}
@(private)
op_is_spl :: #force_inline proc "contextless" (op: ^Operand) -> bool {
// SPL/BPL/SIL/DIL (GPR8 hw 4..7) require any REX to encode (else they read
// as AH/CH/DH/BH).
return op.kind == .REGISTER && reg_class(op.reg) == REG_GPR8 && reg_hw(op.reg) >= 4 && reg_hw(op.reg) <= 7
}
// Recipe-driven straight-line emit. Handles the eligible legacy/SSE forms with a
// register (or absent) r/m and a literal immediate -- exactly the cases the
// caller guards for. Produces byte-identical output to the interpreter; anything
// outside that envelope is rejected by the caller and never reaches here.
@(require_results)
emit_recipe :: #force_inline proc "contextless" (recipe: ^Form_Recipe, inst: ^Instruction, out: []u8) -> (pos: u32) {
// Mandatory prefix (66/F3/F2 for SSE); operand-size 66h forms are ineligible.
if recipe.prefix != 0 {
out[pos] = recipe.prefix
pos += 1
}
// REX, OR-masked from the register-bearing roles (no memory base/index here).
rex: u8 = recipe.flags.force_rex_w ? 0x48 : 0
if recipe.reg_op >= 0 {
op := &inst.ops[recipe.reg_op]
rex |= bmask(op.kind == .REGISTER && reg_needs_rex(op.reg)) & 0x44
}
if recipe.rm_op >= 0 {
op := &inst.ops[recipe.rm_op]
rex |= bmask(op.kind == .REGISTER && reg_needs_rex(op.reg)) & 0x41
}
if recipe.opr_op >= 0 {
op := &inst.ops[recipe.opr_op]
rex |= bmask(op.kind == .REGISTER && reg_needs_rex(op.reg)) & 0x41
}
if recipe.flags.could_spl && rex == 0 {
spl := false
if recipe.rm_op >= 0 { spl = spl || op_is_spl(&inst.ops[recipe.rm_op]) }
if recipe.reg_op >= 0 { spl = spl || op_is_spl(&inst.ops[recipe.reg_op]) }
if recipe.opr_op >= 0 { spl = spl || op_is_spl(&inst.ops[recipe.opr_op]) }
rex |= bmask(spl) & 0x40
}
if rex != 0 {
out[pos] = rex
pos += 1
}
// Opcode blob; for +rb forms the register index folds into the last byte.
ob := recipe.opcode
if recipe.opr_op >= 0 {
op := &inst.ops[recipe.opr_op]
if op.kind == .REGISTER {
ob[recipe.opcode_len - 1] += reg_hw(op.reg) & 0x7
}
}
for j in 0..<recipe.opcode_len {
out[pos] = ob[j]
pos += 1
}
// ModR/M, register-direct (mod = 11).
if recipe.flags.has_modrm {
reg_field: u8 = recipe.ext & 0x7
if !recipe.flags.reg_from_ext {
reg_field = 0
if recipe.reg_op >= 0 {
op := &inst.ops[recipe.reg_op]
if op.kind == .REGISTER { reg_field = reg_hw(op.reg) & 0x7 }
}
}
rm_field: u8 = 0
if recipe.rm_op >= 0 {
op := &inst.ops[recipe.rm_op]
if op.kind == .REGISTER { rm_field = reg_hw(op.reg) & 0x7 }
}
out[pos] = 0xC0 | (reg_field << 3) | rm_field
pos += 1
}
// Immediate (literal; .RELATIVE/label immediates are rejected by the caller).
if recipe.imm_op >= 0 {
v := u64(inst.ops[recipe.imm_op].immediate)
switch recipe.imm_size {
case 1:
out[pos] = u8(v)
pos += 1
case 2:
out[pos] = u8(v); out[pos+1] = u8(v >> 8)
pos += 2
case 4:
out[pos] = u8(v); out[pos+1] = u8(v >> 8); out[pos+2] = u8(v >> 16); out[pos+3] = u8(v >> 24)
pos += 4
case 8:
out[pos] = u8(v); out[pos+1] = u8(v >> 8); out[pos+2] = u8(v >> 16); out[pos+3] = u8(v >> 24)
out[pos+4] = u8(v >> 32); out[pos+5] = u8(v >> 40); out[pos+6] = u8(v >> 48); out[pos+7] = u8(v >> 56)
pos += 8
}
}
return
}