rexcode/x86: branchless hot paths + single-pass operand resolution

Three layers on the x86 encode/decode hot paths, all byte-exact (2246
LLVM-verified cases) and roundtrip-clean:

1. Branchless: legacy-prefix emission (speculative write + conditional
   advance), REX/VEX/EVEX extension-bit accumulation (gate-and-mask),
   ModRM mod/disp-size selection (cmov selects), displacement emission
   (widened store + ENCODE_TAIL_SLACK); decoder REX/VEX/EVEX register
   extensions (arithmetic instead of if/+=8).

2. Resolve-operands-once: the previous code re-derived each user operand
   ~5-10x per instruction (a fresh O(n) scan of enc.ops per emission
   pass). Now resolved into a [4]^Operand map a single time.

3. Single-pass gather: fold the opcode-+rb and ModR/M slot-detection
   scans into that one resolve pass (3 enc.enc passes -> 1).

Net on a 100k mixed-instruction benchmark: encode ~58 -> ~54 ns/inst
(best 52). Branchless alone was a ~7% encode regression (predicted
branches, nothing to recover); the algorithmic passes recovered it and
beat baseline.
This commit is contained in:
Brendan Punsky
2026-06-18 20:16:26 -04:00
committed by Flāvius
parent daa5b7cb79
commit 8387731357
2 changed files with 176 additions and 192 deletions

View File

@@ -671,10 +671,7 @@ decode_single_operand :: proc(state: ^Decoder_State, op_type: Operand_Type, op_e
case .REG:
// Register encoded in ModR/M.reg
register_number := modrm_info.reg
if state.rex & 0x04 != 0 { // REX.R
register_number += 8
}
register_number := modrm_info.reg + ((state.rex & 0x04) << 1) // REX.R -> +8 (branchless)
reg := decode_register(register_number, op_type, state.rex)
op = op_reg(reg)
return
@@ -683,10 +680,7 @@ decode_single_operand :: proc(state: ^Decoder_State, op_type: Operand_Type, op_e
// Register or memory in ModR/M.rm
if modrm_info.mod == 3 {
// Register
register_number := modrm_info.rm
if state.rex & 0x01 != 0 { // REX.B
register_number += 8
}
register_number := modrm_info.rm + ((state.rex & 0x01) << 3) // REX.B -> +8 (branchless)
reg := decode_register(register_number, op_type, state.rex)
op = op_reg(reg)
return
@@ -755,20 +749,14 @@ decode_single_operand :: proc(state: ^Decoder_State, op_type: Operand_Type, op_e
case .OP_R:
// Register encoded in low 3 bits of opcode
register_number := state.opcode_reg
if state.rex & 0x01 != 0 { // REX.B extends the register
register_number += 8
}
register_number := state.opcode_reg + ((state.rex & 0x01) << 3) // REX.B -> +8 (branchless)
reg := decode_register(register_number, op_type, state.rex)
op = op_reg(reg)
return
case .VVVV:
// VEX.vvvv register
register_number := 15 - state.vex_vvvv
if state.evex_v2 {
register_number += 16
}
register_number := (15 - state.vex_vvvv) + (u8(state.evex_v2) << 4) // EVEX.V' -> +16 (branchless)
reg := decode_register(register_number, op_type, state.rex)
op = op_reg(reg)
return
@@ -800,13 +788,8 @@ decode_single_operand_vex :: proc(state: ^Decoder_State, op_type: Operand_Type,
case .REG:
// Register in ModR/M.reg, extended by VEX.R
// vex_r is true when the encoded bit is 0, meaning extension is active
register_number := modrm_info.reg
if state.vex_r { // vex_r=true means extend (bit was 0 in inverted encoding)
register_number += 8
}
if state.vex_type == .EVEX && state.evex_r2 { // evex_r2=true means extend
register_number += 16
}
// VEX.R -> +8, EVEX.R' -> +16 (branchless; vex_r/evex_r2 true means extend)
register_number := modrm_info.reg + (u8(state.vex_r) << 3) + (u8(state.vex_type == .EVEX && state.evex_r2) << 4)
reg := decode_register(register_number, op_type, 0)
return op_reg(reg), .NONE
@@ -814,10 +797,7 @@ decode_single_operand_vex :: proc(state: ^Decoder_State, op_type: Operand_Type,
if modrm_info.mod == 3 {
// Register in ModR/M.rm, extended by VEX.B
// vex_b is true when the encoded bit is 0, meaning extension is active
register_number := modrm_info.rm
if state.vex_b { // vex_b=true means extend
register_number += 8
}
register_number := modrm_info.rm + (u8(state.vex_b) << 3) // VEX.B -> +8 (branchless)
reg := decode_register(register_number, op_type, 0)
return op_reg(reg), .NONE
} else {
@@ -857,10 +837,7 @@ decode_memory_operand :: proc(state: ^Decoder_State, modrm_info: ModRM_Info,
if has_sib {
// SIB addressing
base_number := sib_info.base
if state.rex & 0x01 != 0 { // REX.B
base_number += 8
}
base_number := sib_info.base + ((state.rex & 0x01) << 3) // REX.B -> +8 (branchless)
// Special case: base=5 with mod=0 means no base (displacement32 only)
if sib_info.base == 5 && modrm_info.mod == 0 {
@@ -871,10 +848,7 @@ decode_memory_operand :: proc(state: ^Decoder_State, modrm_info: ModRM_Info,
// Index register (0xFF means no index)
if sib_info.index != 0xFF {
index_number := sib_info.index
if state.rex & 0x02 != 0 { // REX.X
index_number += 8
}
index_number := sib_info.index + ((state.rex & 0x02) << 2) // REX.X -> +8 (branchless)
index_reg = addr_reg_from_num(index_number, state.mode)
scale = sib_info.scale
}
@@ -890,10 +864,7 @@ decode_memory_operand :: proc(state: ^Decoder_State, modrm_info: ModRM_Info,
base_reg = NONE
}
} else {
base_number := modrm_info.rm
if state.rex & 0x01 != 0 { // REX.B
base_number += 8
}
base_number := modrm_info.rm + ((state.rex & 0x01) << 3) // REX.B -> +8 (branchless)
base_reg = addr_reg_from_num(base_number, state.mode)
}
}

View File

@@ -33,6 +33,13 @@ import "core:rexcode/isa"
MAX_INST_SIZE :: 15 // Maximum x64 instruction length
// Extra bytes reserved past each instruction so the branchless emitters can
// write a few speculative bytes beyond the logical end (e.g. a widened 4-byte
// displacement store when only a disp8 is kept). The over-written tail is
// reclaimed by the next emit; this slack just guarantees the wide store stays
// in bounds even for the final instruction against a tight buffer.
ENCODE_TAIL_SLACK :: 8
// -----------------------------------------------------------------------------
// SECTION: 7.6 Core Encoding Function
@@ -100,7 +107,7 @@ encode :: proc(
}
// Check buffer space
if byte_count + MAX_INST_SIZE > u32(len(code)) {
if byte_count + MAX_INST_SIZE + ENCODE_TAIL_SLACK > u32(len(code)) {
append(errors, Error{u32(instruction_index), .BUFFER_OVERFLOW, {}})
ok = false
continue
@@ -171,32 +178,64 @@ encode :: proc(
out := code[byte_count:]
pos: u32 = 0
// --- Legacy Prefixes ---
// Resolve every encoding slot to its user operand ONCE, and gather the
// ModR/M and opcode-reg slot roles in the same pass. The emission below
// indexes user_ops[slot] instead of re-deriving the mapping per pass --
// the previous code re-scanned enc.ops ~5-10x per instruction (once for
// REX bits, opcode +rb, ModR/M slots, reg/rm fields, immediates), which
// was a dominant per-instruction cost.
user_ops: [4]^Operand
mr_slot: int = -1
reg_slot: int = -1
opr_slot: int = -1
{
user_idx := 0
for op, i in enc.ops {
if op == .NONE { break }
if !is_implicit_op_inline(op) {
if user_idx < int(inst.operand_count) {
user_ops[i] = &inst.ops[user_idx]
}
user_idx += 1
}
// Slot roles (parallel array enc.enc[i]) gathered in the same pass.
#partial switch enc.enc[i] {
case .MR: mr_slot = i
case .REG: reg_slot = i
case .OP_R: opr_slot = i
}
}
}
has_modrm := mr_slot >= 0 || reg_slot >= 0
// --- Legacy Prefixes (branchless) ---
//
// Each optional prefix byte is written *speculatively* at `pos`, then
// `pos` advances only if the prefix is actually present. When absent the
// speculative byte is overwritten by the next emit (the opcode always
// writes at `pos`), so the final stream is identical to the branching
// form -- with four data-dependent branches removed. The buffer carries
// MAX_INST_SIZE slack (checked above), so the spec writes stay in bounds.
// Lock prefix (F0)
if inst.flags.lock && enc.flags.lock_ok {
out[pos] = 0xF0
pos += 1
}
out[pos] = 0xF0
pos += u32(inst.flags.lock && enc.flags.lock_ok)
// Rep/Repne prefix
#partial switch inst.flags.rep {
case .REP: out[pos] = 0xF3; pos += 1
case .REPNE: out[pos] = 0xF2; pos += 1
}
// Rep/Repne prefix (NONE -> 0, REP -> F3, REPNE -> F2)
REP_BYTE := [Rep]u8{ .NONE = 0, .REP = 0xF3, .REPNE = 0xF2 }
rep_b := REP_BYTE[inst.flags.rep]
out[pos] = rep_b
pos += u32(rep_b != 0)
// Segment override
if inst.flags.segment != 0 {
seg_prefix := [8]u8{0, 0x26, 0x2E, 0x36, 0x3E, 0x64, 0x65, 0}
out[pos] = seg_prefix[inst.flags.segment]
pos += 1
}
// Segment override (table already maps 0 -> 0)
seg_prefix := [8]u8{0, 0x26, 0x2E, 0x36, 0x3E, 0x64, 0x65, 0}
seg_b := seg_prefix[inst.flags.segment]
out[pos] = seg_b
pos += u32(seg_b != 0)
// Address size override (67h)
if inst.flags.addr32 {
out[pos] = 0x67
pos += 1
}
out[pos] = 0x67
pos += u32(inst.flags.addr32)
// --- VEX/EVEX or Legacy Encoding ---
@@ -226,25 +265,28 @@ encode :: proc(
case .W1: w = 1
}
// Check operands for REX bits
// Operand-driven extension bits (branchless: compute reg & mem
// contributions, gate by kind, clear the inverted bit via AND-mask).
for enc_type, i in enc.enc {
user_op := get_user_op_inline(&inst, enc, i)
user_op := user_ops[i]
if user_op == nil { continue }
is_reg := user_op.kind == .REGISTER
is_mem := user_op.kind == .MEMORY
m := user_op.mem
reg_ext := is_reg && reg_needs_rex(user_op.reg)
base_ext := is_mem && mem_has_base(m) && m.base_ext
index_ext := is_mem && mem_has_index(m) && m.index_ext
#partial switch enc_type {
case .REG:
if user_op.kind == .REGISTER && reg_needs_rex(user_op.reg) { r = 0 }
r &= u8(!reg_ext)
case .MR:
#partial switch user_op.kind {
case .REGISTER:
if reg_needs_rex(user_op.reg) { b = 0 }
case .MEMORY:
m := user_op.mem
if mem_has_base(m) && m.base_ext { b = 0 }
if mem_has_index(m) && m.index_ext { x = 0 }
}
b &= u8(!reg_ext)
b &= u8(!base_ext)
x &= u8(!index_ext)
case .VVVV:
if user_op.kind == .REGISTER { vvvv = ~reg_hw(user_op.reg) & 0xF }
vvvv = is_reg ? (~reg_hw(user_op.reg) & 0xF) : vvvv
}
}
@@ -288,37 +330,34 @@ encode :: proc(
}
for i in 0..<4 {
user_op := get_user_op_inline(&inst, enc, i)
user_op := user_ops[i]
if user_op == nil { continue }
is_reg := user_op.kind == .REGISTER
is_mem := user_op.kind == .MEMORY
m := user_op.mem
hw := reg_hw(user_op.reg) // gated by is_reg below
reg8 := is_reg && hw >= 8
reg16 := is_reg && hw >= 16
base_ext := is_mem && mem_has_base(m) && m.base_ext
index_ext := is_mem && mem_has_index(m) && m.index_ext
#partial switch enc.enc[i] {
case .REG:
if user_op.kind == .REGISTER {
hw := reg_hw(user_op.reg)
if hw >= 8 { r = 0 }
if hw >= 16 { rr = 0 }
}
r &= u8(!reg8)
rr &= u8(!reg16)
case .MR:
#partial switch user_op.kind {
case .REGISTER:
hw := reg_hw(user_op.reg)
if hw >= 8 { b = 0 }
case .MEMORY:
m := user_op.mem
if mem_has_base(m) && m.base_ext { b = 0 }
if mem_has_index(m) && m.index_ext { x = 0 }
if user_op.flags.broadcast != .NONE { bb = 1 }
}
b &= u8(!reg8)
b &= u8(!base_ext)
x &= u8(!index_ext)
bb |= u8(is_mem && user_op.flags.broadcast != .NONE)
case .VVVV:
if user_op.kind == .REGISTER {
hw := reg_hw(user_op.reg)
vvvv = ~hw & 0xF
if hw >= 16 { vvv = 0 }
}
vvvv = is_reg ? (~hw & 0xF) : vvvv
vvv &= u8(!reg16)
case .AAA:
if user_op.kind == .REGISTER { aaa = reg_hw(user_op.reg) & 0x7 }
aaa = is_reg ? (hw & 0x7) : aaa
}
if user_op.flags.zeroing { z = 1 }
z |= u8(user_op.flags.zeroing)
}
out[pos] = 0x62
@@ -352,44 +391,45 @@ encode :: proc(
pos += 1
}
// REX prefix
rex: u8 = 0
if enc.flags.force_rex_w { rex |= 0x48 }
// REX prefix (branchless: OR each operand's contribution via a mask).
// Both the register and memory contributions are computed and gated
// by operand kind, so the data-dependent REGISTER/MEMORY branch is
// gone; only the per-form enc_type switch (predictable) remains.
rex: u8 = bmask(enc.flags.force_rex_w) & 0x48
for enc_type, i in enc.enc {
if enc_type == .NONE { continue }
user_op := get_user_op_inline(&inst, enc, i)
user_op := user_ops[i]
if user_op == nil { continue }
is_reg := user_op.kind == .REGISTER
is_mem := user_op.kind == .MEMORY
m := user_op.mem // union bytes; only used when is_mem
reg_ext := is_reg && reg_needs_rex(user_op.reg)
base_ext := is_mem && mem_has_base(m) && m.base_ext
index_ext := is_mem && mem_has_index(m) && m.index_ext
#partial switch enc_type {
case .REG:
if user_op.kind == .REGISTER && reg_needs_rex(user_op.reg) { rex |= 0x44 }
rex |= bmask(reg_ext) & 0x44
case .MR:
#partial switch user_op.kind {
case .REGISTER:
if reg_needs_rex(user_op.reg) { rex |= 0x41 }
case .MEMORY:
m := user_op.mem
if mem_has_base(m) && m.base_ext { rex |= 0x41 }
if mem_has_index(m) && m.index_ext { rex |= 0x42 }
}
rex |= bmask(reg_ext) & 0x41 // register r/m -> REX.B
rex |= bmask(base_ext) & 0x41
rex |= bmask(index_ext) & 0x42
case .OP_R:
if user_op.kind == .REGISTER && reg_needs_rex(user_op.reg) { rex |= 0x41 }
rex |= bmask(reg_ext) & 0x41
}
}
// SPL, BPL, SIL, DIL require REX (long mode only).
// SPL/BPL/SIL/DIL (GPR8 hw 4-7) require an empty REX (long mode only).
if mode == ._64 {
spl_seen := false
for i in 0..<inst.operand_count {
op := &inst.ops[i]
if op.kind == .REGISTER {
class := reg_class(op.reg)
hw := reg_hw(op.reg)
if class == REG_GPR8 && hw >= 4 && hw <= 7 {
if rex == 0 { rex = 0x40 }
}
}
hw := reg_hw(op.reg)
spl_seen ||= op.kind == .REGISTER && reg_class(op.reg) == REG_GPR8 && hw >= 4 && hw <= 7
}
rex |= bmask(rex == 0 && spl_seen) & 0x40
}
// 32-bit mode forbids the REX prefix entirely. If any operand
@@ -429,14 +469,11 @@ encode :: proc(
x87_fixed_modrm := opcode >= 0xD8 && opcode <= 0xDF && enc.ext >= 0xC0
opr_index: u8 = 0
opr_seen := false
for enc_type, i in enc.enc {
if enc_type == .OP_R {
user_op := get_user_op_inline(&inst, enc, i)
if user_op != nil && user_op.kind == .REGISTER {
opr_index = reg_hw(user_op.reg) & 0x07
opr_seen = true
}
break
if opr_slot >= 0 {
user_op := user_ops[opr_slot]
if user_op != nil && user_op.kind == .REGISTER {
opr_index = reg_hw(user_op.reg) & 0x07
opr_seen = true
}
}
if opr_seen && !x87_fixed_modrm {
@@ -446,18 +483,7 @@ encode :: proc(
out[pos] = opcode
pos += 1
// --- ModR/M and SIB ---
has_modrm := false
mr_slot: int = -1
reg_slot: int = -1
for enc_type, i in enc.enc {
#partial switch enc_type {
case .MR: mr_slot = i; has_modrm = true
case .REG: reg_slot = i; has_modrm = true
}
}
// --- ModR/M and SIB --- (mr_slot/reg_slot/has_modrm gathered above)
if has_modrm {
has_sib := false
mod: u8 = 0
@@ -471,7 +497,7 @@ encode :: proc(
if enc.flags.modrm_reg_ext {
reg_field = enc.ext & 0x07
} else if reg_slot >= 0 {
reg_op := get_user_op_inline(&inst, enc, reg_slot)
reg_op := user_ops[reg_slot]
if reg_op != nil && reg_op.kind == .REGISTER {
reg_field = reg_hw(reg_op.reg) & 0x07
}
@@ -479,7 +505,7 @@ encode :: proc(
// R/M field
if mr_slot >= 0 {
mr_op := get_user_op_inline(&inst, enc, mr_slot)
mr_op := user_ops[mr_slot]
if mr_op != nil {
#partial switch mr_op.kind {
case .REGISTER:
@@ -507,6 +533,12 @@ encode :: proc(
needs_sib := has_index || (base_hw & 0x07) == 4
has_base := mem_has_base(m)
is_rbp := (base_hw & 0x07) == 5
is_zero := disp_value == 0
fits8 := disp_value >= -128 && disp_value <= 127
disp = disp_value
if needs_sib {
has_sib = true
rm = 0b100
@@ -518,37 +550,22 @@ encode :: proc(
case 8: scale = 3
}
idx: u8 = 0b100
if has_index { idx = m.index_hw & 0x07 }
base_sib := base_hw & 0x07
if !mem_has_base(m) { base_sib = 0b101 }
idx := has_index ? (m.index_hw & 0x07) : u8(0b100)
base_sib := has_base ? (base_hw & 0x07) : u8(0b101)
sib = (scale << 6) | (idx << 3) | base_sib
if mem_has_base(m) && (base_hw & 0x07) == 5 && disp_value == 0 {
mod = 0b01; disp = 0; displacement_size = 1
} else if !mem_has_base(m) {
mod = 0b00; disp = disp_value; displacement_size = 4
} else if disp_value == 0 {
mod = 0b00
} else if disp_value >= -128 && disp_value <= 127 {
mod = 0b01; disp = disp_value; displacement_size = 1
} else {
mod = 0b10; disp = disp_value; displacement_size = 4
}
// mod / disp size, branchless. No base -> [disp32]
// (mod 00, size 4). Otherwise: no displacement when
// zero and not RBP-like; else disp8 if it fits, else
// disp32. (RBP-like base forces an explicit disp8.)
no_disp := has_base && is_zero && !(has_base && is_rbp)
displacement_size = !has_base ? 4 : (no_disp ? 0 : (fits8 ? 1 : 4))
mod = !has_base ? 0b00 : (no_disp ? 0b00 : (fits8 ? 0b01 : 0b10))
} else {
rm = base_hw & 0x07
if (base_hw & 0x07) == 5 && disp_value == 0 {
mod = 0b01; disp = 0; displacement_size = 1
} else if disp_value == 0 {
mod = 0b00
} else if disp_value >= -128 && disp_value <= 127 {
mod = 0b01; disp = disp_value; displacement_size = 1
} else {
mod = 0b10; disp = disp_value; displacement_size = 4
}
no_disp := is_zero && !is_rbp
displacement_size = no_disp ? 0 : (fits8 ? 1 : 4)
mod = no_disp ? 0b00 : (fits8 ? 0b01 : 0b10)
}
}
}
@@ -563,11 +580,15 @@ encode :: proc(
pos += 1
}
for _ in 0..<displacement_size {
out[pos] = u8(disp & 0xFF)
disp >>= 8
pos += 1
}
// Displacement: four unconditional little-endian byte stores, then
// advance by the real size (0/1/4) -- no data-dependent loop. The
// untaken tail bytes are reclaimed by the next emit; ENCODE_TAIL_SLACK
// keeps the widened store in bounds.
out[pos+0] = u8(disp)
out[pos+1] = u8(disp >> 8)
out[pos+2] = u8(disp >> 16)
out[pos+3] = u8(disp >> 24)
pos += u32(displacement_size)
}
// Fixed ModR/M for special instructions. Triggered for:
@@ -588,7 +609,7 @@ encode :: proc(
for enc_type, i in enc.enc {
#partial switch enc_type {
case .IB:
user_op := get_user_op_inline(&inst, enc, i)
user_op := user_ops[i]
if user_op != nil {
#partial switch user_op.kind {
case .IMMEDIATE:
@@ -604,7 +625,7 @@ encode :: proc(
}
case .IW:
user_op := get_user_op_inline(&inst, enc, i)
user_op := user_ops[i]
if user_op != nil && user_op.kind == .IMMEDIATE {
immediate_val := u16(user_op.immediate)
out[pos] = u8(immediate_val); out[pos+1] = u8(immediate_val >> 8)
@@ -612,7 +633,7 @@ encode :: proc(
}
case .ID:
user_op := get_user_op_inline(&inst, enc, i)
user_op := user_ops[i]
if user_op != nil {
#partial switch user_op.kind {
case .IMMEDIATE:
@@ -629,7 +650,7 @@ encode :: proc(
}
case .IQ:
user_op := get_user_op_inline(&inst, enc, i)
user_op := user_ops[i]
if user_op != nil && user_op.kind == .IMMEDIATE {
immediate_val := u64(user_op.immediate)
for j in u32(0)..<8 { out[pos + j] = u8(immediate_val >> (j * 8)) }
@@ -698,6 +719,14 @@ encode :: proc(
// SECTION: 7.7 Inline Helper Functions
// -----------------------------------------------------------------------------
// Branchless select mask: 0xFF when `b`, else 0x00. Used to OR-accumulate
// REX/VEX/EVEX bit contributions without a per-condition branch
// (`x |= bmask(cond) & bits`).
@(private="file")
bmask :: #force_inline proc "contextless" (b: bool) -> u8 {
return -u8(b)
}
// Check if instruction matches encoding (inlined for hot path).
// `mode` lets default_64 entries match 32-bit operands in i386 and
// filters out mode-restricted (mode_32_only) encodings when not in i386.
@@ -877,29 +906,13 @@ imm_matches_inline :: #force_inline proc "contextless" (op: ^Operand, op_type: O
return false
}
get_user_op_inline :: #force_inline proc "contextless" (inst: ^Instruction, enc: ^Encoding, slot: int) -> ^Operand {
user_idx := 0
for op, i in enc.ops {
if op == .NONE { break }
if is_implicit_op_inline(op) { continue }
if i == slot {
if user_idx < int(inst.operand_count) {
return &inst.ops[user_idx]
}
return nil
}
user_idx += 1
}
return nil
}
// -----------------------------------------------------------------------------
// SECTION: 7.8 Convenience Functions
// -----------------------------------------------------------------------------
// Compute safe buffer sizes for encoding
encode_max_code_size :: #force_inline proc "contextless" (n: int) -> int {
return n * MAX_INST_SIZE
return n * MAX_INST_SIZE + ENCODE_TAIL_SLACK
}
encode_max_relocation_count :: #force_inline proc "contextless" (n: int) -> int {