mirror of
https://github.com/odin-lang/Odin.git
synced 2026-06-19 08:32:33 +00:00
rexcode/x86: branchless hot paths + single-pass operand resolution
Three layers on the x86 encode/decode hot paths, all byte-exact (2246 LLVM-verified cases) and roundtrip-clean: 1. Branchless: legacy-prefix emission (speculative write + conditional advance), REX/VEX/EVEX extension-bit accumulation (gate-and-mask), ModRM mod/disp-size selection (cmov selects), displacement emission (widened store + ENCODE_TAIL_SLACK); decoder REX/VEX/EVEX register extensions (arithmetic instead of if/+=8). 2. Resolve-operands-once: the previous code re-derived each user operand ~5-10x per instruction (a fresh O(n) scan of enc.ops per emission pass). Now resolved into a [4]^Operand map a single time. 3. Single-pass gather: fold the opcode-+rb and ModR/M slot-detection scans into that one resolve pass (3 enc.enc passes -> 1). Net on a 100k mixed-instruction benchmark: encode ~58 -> ~54 ns/inst (best 52). Branchless alone was a ~7% encode regression (predicted branches, nothing to recover); the algorithmic passes recovered it and beat baseline.
This commit is contained in:
@@ -671,10 +671,7 @@ decode_single_operand :: proc(state: ^Decoder_State, op_type: Operand_Type, op_e
|
||||
|
||||
case .REG:
|
||||
// Register encoded in ModR/M.reg
|
||||
register_number := modrm_info.reg
|
||||
if state.rex & 0x04 != 0 { // REX.R
|
||||
register_number += 8
|
||||
}
|
||||
register_number := modrm_info.reg + ((state.rex & 0x04) << 1) // REX.R -> +8 (branchless)
|
||||
reg := decode_register(register_number, op_type, state.rex)
|
||||
op = op_reg(reg)
|
||||
return
|
||||
@@ -683,10 +680,7 @@ decode_single_operand :: proc(state: ^Decoder_State, op_type: Operand_Type, op_e
|
||||
// Register or memory in ModR/M.rm
|
||||
if modrm_info.mod == 3 {
|
||||
// Register
|
||||
register_number := modrm_info.rm
|
||||
if state.rex & 0x01 != 0 { // REX.B
|
||||
register_number += 8
|
||||
}
|
||||
register_number := modrm_info.rm + ((state.rex & 0x01) << 3) // REX.B -> +8 (branchless)
|
||||
reg := decode_register(register_number, op_type, state.rex)
|
||||
op = op_reg(reg)
|
||||
return
|
||||
@@ -755,20 +749,14 @@ decode_single_operand :: proc(state: ^Decoder_State, op_type: Operand_Type, op_e
|
||||
|
||||
case .OP_R:
|
||||
// Register encoded in low 3 bits of opcode
|
||||
register_number := state.opcode_reg
|
||||
if state.rex & 0x01 != 0 { // REX.B extends the register
|
||||
register_number += 8
|
||||
}
|
||||
register_number := state.opcode_reg + ((state.rex & 0x01) << 3) // REX.B -> +8 (branchless)
|
||||
reg := decode_register(register_number, op_type, state.rex)
|
||||
op = op_reg(reg)
|
||||
return
|
||||
|
||||
case .VVVV:
|
||||
// VEX.vvvv register
|
||||
register_number := 15 - state.vex_vvvv
|
||||
if state.evex_v2 {
|
||||
register_number += 16
|
||||
}
|
||||
register_number := (15 - state.vex_vvvv) + (u8(state.evex_v2) << 4) // EVEX.V' -> +16 (branchless)
|
||||
reg := decode_register(register_number, op_type, state.rex)
|
||||
op = op_reg(reg)
|
||||
return
|
||||
@@ -800,13 +788,8 @@ decode_single_operand_vex :: proc(state: ^Decoder_State, op_type: Operand_Type,
|
||||
case .REG:
|
||||
// Register in ModR/M.reg, extended by VEX.R
|
||||
// vex_r is true when the encoded bit is 0, meaning extension is active
|
||||
register_number := modrm_info.reg
|
||||
if state.vex_r { // vex_r=true means extend (bit was 0 in inverted encoding)
|
||||
register_number += 8
|
||||
}
|
||||
if state.vex_type == .EVEX && state.evex_r2 { // evex_r2=true means extend
|
||||
register_number += 16
|
||||
}
|
||||
// VEX.R -> +8, EVEX.R' -> +16 (branchless; vex_r/evex_r2 true means extend)
|
||||
register_number := modrm_info.reg + (u8(state.vex_r) << 3) + (u8(state.vex_type == .EVEX && state.evex_r2) << 4)
|
||||
reg := decode_register(register_number, op_type, 0)
|
||||
return op_reg(reg), .NONE
|
||||
|
||||
@@ -814,10 +797,7 @@ decode_single_operand_vex :: proc(state: ^Decoder_State, op_type: Operand_Type,
|
||||
if modrm_info.mod == 3 {
|
||||
// Register in ModR/M.rm, extended by VEX.B
|
||||
// vex_b is true when the encoded bit is 0, meaning extension is active
|
||||
register_number := modrm_info.rm
|
||||
if state.vex_b { // vex_b=true means extend
|
||||
register_number += 8
|
||||
}
|
||||
register_number := modrm_info.rm + (u8(state.vex_b) << 3) // VEX.B -> +8 (branchless)
|
||||
reg := decode_register(register_number, op_type, 0)
|
||||
return op_reg(reg), .NONE
|
||||
} else {
|
||||
@@ -857,10 +837,7 @@ decode_memory_operand :: proc(state: ^Decoder_State, modrm_info: ModRM_Info,
|
||||
|
||||
if has_sib {
|
||||
// SIB addressing
|
||||
base_number := sib_info.base
|
||||
if state.rex & 0x01 != 0 { // REX.B
|
||||
base_number += 8
|
||||
}
|
||||
base_number := sib_info.base + ((state.rex & 0x01) << 3) // REX.B -> +8 (branchless)
|
||||
|
||||
// Special case: base=5 with mod=0 means no base (displacement32 only)
|
||||
if sib_info.base == 5 && modrm_info.mod == 0 {
|
||||
@@ -871,10 +848,7 @@ decode_memory_operand :: proc(state: ^Decoder_State, modrm_info: ModRM_Info,
|
||||
|
||||
// Index register (0xFF means no index)
|
||||
if sib_info.index != 0xFF {
|
||||
index_number := sib_info.index
|
||||
if state.rex & 0x02 != 0 { // REX.X
|
||||
index_number += 8
|
||||
}
|
||||
index_number := sib_info.index + ((state.rex & 0x02) << 2) // REX.X -> +8 (branchless)
|
||||
index_reg = addr_reg_from_num(index_number, state.mode)
|
||||
scale = sib_info.scale
|
||||
}
|
||||
@@ -890,10 +864,7 @@ decode_memory_operand :: proc(state: ^Decoder_State, modrm_info: ModRM_Info,
|
||||
base_reg = NONE
|
||||
}
|
||||
} else {
|
||||
base_number := modrm_info.rm
|
||||
if state.rex & 0x01 != 0 { // REX.B
|
||||
base_number += 8
|
||||
}
|
||||
base_number := modrm_info.rm + ((state.rex & 0x01) << 3) // REX.B -> +8 (branchless)
|
||||
base_reg = addr_reg_from_num(base_number, state.mode)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,13 @@ import "core:rexcode/isa"
|
||||
|
||||
MAX_INST_SIZE :: 15 // Maximum x64 instruction length
|
||||
|
||||
// Extra bytes reserved past each instruction so the branchless emitters can
|
||||
// write a few speculative bytes beyond the logical end (e.g. a widened 4-byte
|
||||
// displacement store when only a disp8 is kept). The over-written tail is
|
||||
// reclaimed by the next emit; this slack just guarantees the wide store stays
|
||||
// in bounds even for the final instruction against a tight buffer.
|
||||
ENCODE_TAIL_SLACK :: 8
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// SECTION: 7.6 Core Encoding Function
|
||||
@@ -100,7 +107,7 @@ encode :: proc(
|
||||
}
|
||||
|
||||
// Check buffer space
|
||||
if byte_count + MAX_INST_SIZE > u32(len(code)) {
|
||||
if byte_count + MAX_INST_SIZE + ENCODE_TAIL_SLACK > u32(len(code)) {
|
||||
append(errors, Error{u32(instruction_index), .BUFFER_OVERFLOW, {}})
|
||||
ok = false
|
||||
continue
|
||||
@@ -171,32 +178,64 @@ encode :: proc(
|
||||
out := code[byte_count:]
|
||||
pos: u32 = 0
|
||||
|
||||
// --- Legacy Prefixes ---
|
||||
// Resolve every encoding slot to its user operand ONCE, and gather the
|
||||
// ModR/M and opcode-reg slot roles in the same pass. The emission below
|
||||
// indexes user_ops[slot] instead of re-deriving the mapping per pass --
|
||||
// the previous code re-scanned enc.ops ~5-10x per instruction (once for
|
||||
// REX bits, opcode +rb, ModR/M slots, reg/rm fields, immediates), which
|
||||
// was a dominant per-instruction cost.
|
||||
user_ops: [4]^Operand
|
||||
mr_slot: int = -1
|
||||
reg_slot: int = -1
|
||||
opr_slot: int = -1
|
||||
{
|
||||
user_idx := 0
|
||||
for op, i in enc.ops {
|
||||
if op == .NONE { break }
|
||||
if !is_implicit_op_inline(op) {
|
||||
if user_idx < int(inst.operand_count) {
|
||||
user_ops[i] = &inst.ops[user_idx]
|
||||
}
|
||||
user_idx += 1
|
||||
}
|
||||
// Slot roles (parallel array enc.enc[i]) gathered in the same pass.
|
||||
#partial switch enc.enc[i] {
|
||||
case .MR: mr_slot = i
|
||||
case .REG: reg_slot = i
|
||||
case .OP_R: opr_slot = i
|
||||
}
|
||||
}
|
||||
}
|
||||
has_modrm := mr_slot >= 0 || reg_slot >= 0
|
||||
|
||||
// --- Legacy Prefixes (branchless) ---
|
||||
//
|
||||
// Each optional prefix byte is written *speculatively* at `pos`, then
|
||||
// `pos` advances only if the prefix is actually present. When absent the
|
||||
// speculative byte is overwritten by the next emit (the opcode always
|
||||
// writes at `pos`), so the final stream is identical to the branching
|
||||
// form -- with four data-dependent branches removed. The buffer carries
|
||||
// MAX_INST_SIZE + ENCODE_TAIL_SLACK bytes of headroom (checked above), so the speculative writes stay in bounds.
|
||||
|
||||
// Lock prefix (F0)
|
||||
if inst.flags.lock && enc.flags.lock_ok {
|
||||
out[pos] = 0xF0
|
||||
pos += 1
|
||||
}
|
||||
out[pos] = 0xF0
|
||||
pos += u32(inst.flags.lock && enc.flags.lock_ok)
|
||||
|
||||
// Rep/Repne prefix
|
||||
#partial switch inst.flags.rep {
|
||||
case .REP: out[pos] = 0xF3; pos += 1
|
||||
case .REPNE: out[pos] = 0xF2; pos += 1
|
||||
}
|
||||
// Rep/Repne prefix (NONE -> 0, REP -> F3, REPNE -> F2)
|
||||
REP_BYTE := [Rep]u8{ .NONE = 0, .REP = 0xF3, .REPNE = 0xF2 }
|
||||
rep_b := REP_BYTE[inst.flags.rep]
|
||||
out[pos] = rep_b
|
||||
pos += u32(rep_b != 0)
|
||||
|
||||
// Segment override
|
||||
if inst.flags.segment != 0 {
|
||||
seg_prefix := [8]u8{0, 0x26, 0x2E, 0x36, 0x3E, 0x64, 0x65, 0}
|
||||
out[pos] = seg_prefix[inst.flags.segment]
|
||||
pos += 1
|
||||
}
|
||||
// Segment override (table already maps 0 -> 0)
|
||||
seg_prefix := [8]u8{0, 0x26, 0x2E, 0x36, 0x3E, 0x64, 0x65, 0}
|
||||
seg_b := seg_prefix[inst.flags.segment]
|
||||
out[pos] = seg_b
|
||||
pos += u32(seg_b != 0)
|
||||
|
||||
// Address size override (67h)
|
||||
if inst.flags.addr32 {
|
||||
out[pos] = 0x67
|
||||
pos += 1
|
||||
}
|
||||
out[pos] = 0x67
|
||||
pos += u32(inst.flags.addr32)
|
||||
|
||||
// --- VEX/EVEX or Legacy Encoding ---
|
||||
|
||||
@@ -226,25 +265,28 @@ encode :: proc(
|
||||
case .W1: w = 1
|
||||
}
|
||||
|
||||
// Check operands for REX bits
|
||||
// Operand-driven extension bits (branchless: compute reg & mem
|
||||
// contributions, gate by kind, clear the inverted bit via AND-mask).
|
||||
for enc_type, i in enc.enc {
|
||||
user_op := get_user_op_inline(&inst, enc, i)
|
||||
user_op := user_ops[i]
|
||||
if user_op == nil { continue }
|
||||
|
||||
is_reg := user_op.kind == .REGISTER
|
||||
is_mem := user_op.kind == .MEMORY
|
||||
m := user_op.mem
|
||||
reg_ext := is_reg && reg_needs_rex(user_op.reg)
|
||||
base_ext := is_mem && mem_has_base(m) && m.base_ext
|
||||
index_ext := is_mem && mem_has_index(m) && m.index_ext
|
||||
|
||||
#partial switch enc_type {
|
||||
case .REG:
|
||||
if user_op.kind == .REGISTER && reg_needs_rex(user_op.reg) { r = 0 }
|
||||
r &= u8(!reg_ext)
|
||||
case .MR:
|
||||
#partial switch user_op.kind {
|
||||
case .REGISTER:
|
||||
if reg_needs_rex(user_op.reg) { b = 0 }
|
||||
case .MEMORY:
|
||||
m := user_op.mem
|
||||
if mem_has_base(m) && m.base_ext { b = 0 }
|
||||
if mem_has_index(m) && m.index_ext { x = 0 }
|
||||
}
|
||||
b &= u8(!reg_ext)
|
||||
b &= u8(!base_ext)
|
||||
x &= u8(!index_ext)
|
||||
case .VVVV:
|
||||
if user_op.kind == .REGISTER { vvvv = ~reg_hw(user_op.reg) & 0xF }
|
||||
vvvv = is_reg ? (~reg_hw(user_op.reg) & 0xF) : vvvv
|
||||
}
|
||||
}
|
||||
|
||||
@@ -288,37 +330,34 @@ encode :: proc(
|
||||
}
|
||||
|
||||
for i in 0..<4 {
|
||||
user_op := get_user_op_inline(&inst, enc, i)
|
||||
user_op := user_ops[i]
|
||||
if user_op == nil { continue }
|
||||
|
||||
is_reg := user_op.kind == .REGISTER
|
||||
is_mem := user_op.kind == .MEMORY
|
||||
m := user_op.mem
|
||||
hw := reg_hw(user_op.reg) // gated by is_reg below
|
||||
reg8 := is_reg && hw >= 8
|
||||
reg16 := is_reg && hw >= 16
|
||||
base_ext := is_mem && mem_has_base(m) && m.base_ext
|
||||
index_ext := is_mem && mem_has_index(m) && m.index_ext
|
||||
|
||||
#partial switch enc.enc[i] {
|
||||
case .REG:
|
||||
if user_op.kind == .REGISTER {
|
||||
hw := reg_hw(user_op.reg)
|
||||
if hw >= 8 { r = 0 }
|
||||
if hw >= 16 { rr = 0 }
|
||||
}
|
||||
r &= u8(!reg8)
|
||||
rr &= u8(!reg16)
|
||||
case .MR:
|
||||
#partial switch user_op.kind {
|
||||
case .REGISTER:
|
||||
hw := reg_hw(user_op.reg)
|
||||
if hw >= 8 { b = 0 }
|
||||
case .MEMORY:
|
||||
m := user_op.mem
|
||||
if mem_has_base(m) && m.base_ext { b = 0 }
|
||||
if mem_has_index(m) && m.index_ext { x = 0 }
|
||||
if user_op.flags.broadcast != .NONE { bb = 1 }
|
||||
}
|
||||
b &= u8(!reg8)
|
||||
b &= u8(!base_ext)
|
||||
x &= u8(!index_ext)
|
||||
bb |= u8(is_mem && user_op.flags.broadcast != .NONE)
|
||||
case .VVVV:
|
||||
if user_op.kind == .REGISTER {
|
||||
hw := reg_hw(user_op.reg)
|
||||
vvvv = ~hw & 0xF
|
||||
if hw >= 16 { vvv = 0 }
|
||||
}
|
||||
vvvv = is_reg ? (~hw & 0xF) : vvvv
|
||||
vvv &= u8(!reg16)
|
||||
case .AAA:
|
||||
if user_op.kind == .REGISTER { aaa = reg_hw(user_op.reg) & 0x7 }
|
||||
aaa = is_reg ? (hw & 0x7) : aaa
|
||||
}
|
||||
if user_op.flags.zeroing { z = 1 }
|
||||
z |= u8(user_op.flags.zeroing)
|
||||
}
|
||||
|
||||
out[pos] = 0x62
|
||||
@@ -352,44 +391,45 @@ encode :: proc(
|
||||
pos += 1
|
||||
}
|
||||
|
||||
// REX prefix
|
||||
rex: u8 = 0
|
||||
if enc.flags.force_rex_w { rex |= 0x48 }
|
||||
// REX prefix (branchless: OR each operand's contribution via a mask).
|
||||
// Both the register and memory contributions are computed and gated
|
||||
// by operand kind, so the data-dependent REGISTER/MEMORY branch is
|
||||
// gone; only the per-form enc_type switch (predictable) remains.
|
||||
rex: u8 = bmask(enc.flags.force_rex_w) & 0x48
|
||||
|
||||
for enc_type, i in enc.enc {
|
||||
if enc_type == .NONE { continue }
|
||||
user_op := get_user_op_inline(&inst, enc, i)
|
||||
user_op := user_ops[i]
|
||||
if user_op == nil { continue }
|
||||
|
||||
is_reg := user_op.kind == .REGISTER
|
||||
is_mem := user_op.kind == .MEMORY
|
||||
m := user_op.mem // union bytes; only used when is_mem
|
||||
reg_ext := is_reg && reg_needs_rex(user_op.reg)
|
||||
base_ext := is_mem && mem_has_base(m) && m.base_ext
|
||||
index_ext := is_mem && mem_has_index(m) && m.index_ext
|
||||
|
||||
#partial switch enc_type {
|
||||
case .REG:
|
||||
if user_op.kind == .REGISTER && reg_needs_rex(user_op.reg) { rex |= 0x44 }
|
||||
rex |= bmask(reg_ext) & 0x44
|
||||
case .MR:
|
||||
#partial switch user_op.kind {
|
||||
case .REGISTER:
|
||||
if reg_needs_rex(user_op.reg) { rex |= 0x41 }
|
||||
case .MEMORY:
|
||||
m := user_op.mem
|
||||
if mem_has_base(m) && m.base_ext { rex |= 0x41 }
|
||||
if mem_has_index(m) && m.index_ext { rex |= 0x42 }
|
||||
}
|
||||
rex |= bmask(reg_ext) & 0x41 // register r/m -> REX.B
|
||||
rex |= bmask(base_ext) & 0x41
|
||||
rex |= bmask(index_ext) & 0x42
|
||||
case .OP_R:
|
||||
if user_op.kind == .REGISTER && reg_needs_rex(user_op.reg) { rex |= 0x41 }
|
||||
rex |= bmask(reg_ext) & 0x41
|
||||
}
|
||||
}
|
||||
|
||||
// SPL, BPL, SIL, DIL require REX (long mode only).
|
||||
// SPL/BPL/SIL/DIL (GPR8 hw 4-7) require a REX prefix to be present; when no other REX bit is set, a bare 0x40 is emitted (long mode only).
|
||||
if mode == ._64 {
|
||||
spl_seen := false
|
||||
for i in 0..<inst.operand_count {
|
||||
op := &inst.ops[i]
|
||||
if op.kind == .REGISTER {
|
||||
class := reg_class(op.reg)
|
||||
hw := reg_hw(op.reg)
|
||||
if class == REG_GPR8 && hw >= 4 && hw <= 7 {
|
||||
if rex == 0 { rex = 0x40 }
|
||||
}
|
||||
}
|
||||
hw := reg_hw(op.reg)
|
||||
spl_seen ||= op.kind == .REGISTER && reg_class(op.reg) == REG_GPR8 && hw >= 4 && hw <= 7
|
||||
}
|
||||
rex |= bmask(rex == 0 && spl_seen) & 0x40
|
||||
}
|
||||
|
||||
// 32-bit mode forbids the REX prefix entirely. If any operand
|
||||
@@ -429,14 +469,11 @@ encode :: proc(
|
||||
x87_fixed_modrm := opcode >= 0xD8 && opcode <= 0xDF && enc.ext >= 0xC0
|
||||
opr_index: u8 = 0
|
||||
opr_seen := false
|
||||
for enc_type, i in enc.enc {
|
||||
if enc_type == .OP_R {
|
||||
user_op := get_user_op_inline(&inst, enc, i)
|
||||
if user_op != nil && user_op.kind == .REGISTER {
|
||||
opr_index = reg_hw(user_op.reg) & 0x07
|
||||
opr_seen = true
|
||||
}
|
||||
break
|
||||
if opr_slot >= 0 {
|
||||
user_op := user_ops[opr_slot]
|
||||
if user_op != nil && user_op.kind == .REGISTER {
|
||||
opr_index = reg_hw(user_op.reg) & 0x07
|
||||
opr_seen = true
|
||||
}
|
||||
}
|
||||
if opr_seen && !x87_fixed_modrm {
|
||||
@@ -446,18 +483,7 @@ encode :: proc(
|
||||
out[pos] = opcode
|
||||
pos += 1
|
||||
|
||||
// --- ModR/M and SIB ---
|
||||
has_modrm := false
|
||||
mr_slot: int = -1
|
||||
reg_slot: int = -1
|
||||
|
||||
for enc_type, i in enc.enc {
|
||||
#partial switch enc_type {
|
||||
case .MR: mr_slot = i; has_modrm = true
|
||||
case .REG: reg_slot = i; has_modrm = true
|
||||
}
|
||||
}
|
||||
|
||||
// --- ModR/M and SIB --- (mr_slot/reg_slot/has_modrm gathered above)
|
||||
if has_modrm {
|
||||
has_sib := false
|
||||
mod: u8 = 0
|
||||
@@ -471,7 +497,7 @@ encode :: proc(
|
||||
if enc.flags.modrm_reg_ext {
|
||||
reg_field = enc.ext & 0x07
|
||||
} else if reg_slot >= 0 {
|
||||
reg_op := get_user_op_inline(&inst, enc, reg_slot)
|
||||
reg_op := user_ops[reg_slot]
|
||||
if reg_op != nil && reg_op.kind == .REGISTER {
|
||||
reg_field = reg_hw(reg_op.reg) & 0x07
|
||||
}
|
||||
@@ -479,7 +505,7 @@ encode :: proc(
|
||||
|
||||
// R/M field
|
||||
if mr_slot >= 0 {
|
||||
mr_op := get_user_op_inline(&inst, enc, mr_slot)
|
||||
mr_op := user_ops[mr_slot]
|
||||
if mr_op != nil {
|
||||
#partial switch mr_op.kind {
|
||||
case .REGISTER:
|
||||
@@ -507,6 +533,12 @@ encode :: proc(
|
||||
|
||||
needs_sib := has_index || (base_hw & 0x07) == 4
|
||||
|
||||
has_base := mem_has_base(m)
|
||||
is_rbp := (base_hw & 0x07) == 5
|
||||
is_zero := disp_value == 0
|
||||
fits8 := disp_value >= -128 && disp_value <= 127
|
||||
disp = disp_value
|
||||
|
||||
if needs_sib {
|
||||
has_sib = true
|
||||
rm = 0b100
|
||||
@@ -518,37 +550,22 @@ encode :: proc(
|
||||
case 8: scale = 3
|
||||
}
|
||||
|
||||
idx: u8 = 0b100
|
||||
if has_index { idx = m.index_hw & 0x07 }
|
||||
|
||||
base_sib := base_hw & 0x07
|
||||
if !mem_has_base(m) { base_sib = 0b101 }
|
||||
|
||||
idx := has_index ? (m.index_hw & 0x07) : u8(0b100)
|
||||
base_sib := has_base ? (base_hw & 0x07) : u8(0b101)
|
||||
sib = (scale << 6) | (idx << 3) | base_sib
|
||||
|
||||
if mem_has_base(m) && (base_hw & 0x07) == 5 && disp_value == 0 {
|
||||
mod = 0b01; disp = 0; displacement_size = 1
|
||||
} else if !mem_has_base(m) {
|
||||
mod = 0b00; disp = disp_value; displacement_size = 4
|
||||
} else if disp_value == 0 {
|
||||
mod = 0b00
|
||||
} else if disp_value >= -128 && disp_value <= 127 {
|
||||
mod = 0b01; disp = disp_value; displacement_size = 1
|
||||
} else {
|
||||
mod = 0b10; disp = disp_value; displacement_size = 4
|
||||
}
|
||||
// mod / disp size, branchless. No base -> [disp32]
|
||||
// (mod 00, size 4). Otherwise: no displacement when
|
||||
// zero and not RBP-like; else disp8 if it fits, else
|
||||
// disp32. (RBP-like base forces an explicit disp8.)
|
||||
no_disp := has_base && is_zero && !(has_base && is_rbp)
|
||||
displacement_size = !has_base ? 4 : (no_disp ? 0 : (fits8 ? 1 : 4))
|
||||
mod = !has_base ? 0b00 : (no_disp ? 0b00 : (fits8 ? 0b01 : 0b10))
|
||||
} else {
|
||||
rm = base_hw & 0x07
|
||||
|
||||
if (base_hw & 0x07) == 5 && disp_value == 0 {
|
||||
mod = 0b01; disp = 0; displacement_size = 1
|
||||
} else if disp_value == 0 {
|
||||
mod = 0b00
|
||||
} else if disp_value >= -128 && disp_value <= 127 {
|
||||
mod = 0b01; disp = disp_value; displacement_size = 1
|
||||
} else {
|
||||
mod = 0b10; disp = disp_value; displacement_size = 4
|
||||
}
|
||||
no_disp := is_zero && !is_rbp
|
||||
displacement_size = no_disp ? 0 : (fits8 ? 1 : 4)
|
||||
mod = no_disp ? 0b00 : (fits8 ? 0b01 : 0b10)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -563,11 +580,15 @@ encode :: proc(
|
||||
pos += 1
|
||||
}
|
||||
|
||||
for _ in 0..<displacement_size {
|
||||
out[pos] = u8(disp & 0xFF)
|
||||
disp >>= 8
|
||||
pos += 1
|
||||
}
|
||||
// Displacement: four unconditional little-endian byte stores, then
|
||||
// advance by the real size (0/1/4) -- no data-dependent loop. The
|
||||
// untaken tail bytes are reclaimed by the next emit; ENCODE_TAIL_SLACK
|
||||
// keeps the widened store in bounds.
|
||||
out[pos+0] = u8(disp)
|
||||
out[pos+1] = u8(disp >> 8)
|
||||
out[pos+2] = u8(disp >> 16)
|
||||
out[pos+3] = u8(disp >> 24)
|
||||
pos += u32(displacement_size)
|
||||
}
|
||||
|
||||
// Fixed ModR/M for special instructions. Triggered for:
|
||||
@@ -588,7 +609,7 @@ encode :: proc(
|
||||
for enc_type, i in enc.enc {
|
||||
#partial switch enc_type {
|
||||
case .IB:
|
||||
user_op := get_user_op_inline(&inst, enc, i)
|
||||
user_op := user_ops[i]
|
||||
if user_op != nil {
|
||||
#partial switch user_op.kind {
|
||||
case .IMMEDIATE:
|
||||
@@ -604,7 +625,7 @@ encode :: proc(
|
||||
}
|
||||
|
||||
case .IW:
|
||||
user_op := get_user_op_inline(&inst, enc, i)
|
||||
user_op := user_ops[i]
|
||||
if user_op != nil && user_op.kind == .IMMEDIATE {
|
||||
immediate_val := u16(user_op.immediate)
|
||||
out[pos] = u8(immediate_val); out[pos+1] = u8(immediate_val >> 8)
|
||||
@@ -612,7 +633,7 @@ encode :: proc(
|
||||
}
|
||||
|
||||
case .ID:
|
||||
user_op := get_user_op_inline(&inst, enc, i)
|
||||
user_op := user_ops[i]
|
||||
if user_op != nil {
|
||||
#partial switch user_op.kind {
|
||||
case .IMMEDIATE:
|
||||
@@ -629,7 +650,7 @@ encode :: proc(
|
||||
}
|
||||
|
||||
case .IQ:
|
||||
user_op := get_user_op_inline(&inst, enc, i)
|
||||
user_op := user_ops[i]
|
||||
if user_op != nil && user_op.kind == .IMMEDIATE {
|
||||
immediate_val := u64(user_op.immediate)
|
||||
for j in u32(0)..<8 { out[pos + j] = u8(immediate_val >> (j * 8)) }
|
||||
@@ -698,6 +719,14 @@ encode :: proc(
|
||||
// SECTION: 7.7 Inline Helper Functions
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Branchless select mask: 0xFF when `b`, else 0x00. Used to OR-accumulate
|
||||
// REX/VEX/EVEX bit contributions without a per-condition branch
|
||||
// (`x |= bmask(cond) & bits`).
|
||||
@(private="file")
|
||||
bmask :: #force_inline proc "contextless" (b: bool) -> u8 {
|
||||
// u8(b) is 0 or 1; negating it in u8 yields 0x00 or 0xFF, i.e. an all-zeros / all-ones AND-mask.
return -u8(b)
|
||||
}
|
||||
|
||||
// Check if instruction matches encoding (inlined for hot path).
|
||||
// `mode` lets default_64 entries match 32-bit operands in i386 and
|
||||
// filters out mode-restricted (mode_32_only) encodings when not in i386.
|
||||
@@ -877,29 +906,13 @@ imm_matches_inline :: #force_inline proc "contextless" (op: ^Operand, op_type: O
|
||||
return false
|
||||
}
|
||||
|
||||
get_user_op_inline :: #force_inline proc "contextless" (inst: ^Instruction, enc: ^Encoding, slot: int) -> ^Operand {
|
||||
user_idx := 0
|
||||
for op, i in enc.ops {
|
||||
if op == .NONE { break }
|
||||
if is_implicit_op_inline(op) { continue }
|
||||
if i == slot {
|
||||
if user_idx < int(inst.operand_count) {
|
||||
return &inst.ops[user_idx]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
user_idx += 1
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// SECTION: 7.8 Convenience Functions
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Compute safe buffer sizes for encoding
|
||||
encode_max_code_size :: #force_inline proc "contextless" (n: int) -> int {
|
||||
return n * MAX_INST_SIZE
|
||||
return n * MAX_INST_SIZE + ENCODE_TAIL_SLACK
|
||||
}
|
||||
|
||||
encode_max_relocation_count :: #force_inline proc "contextless" (n: int) -> int {
|
||||
|
||||
Reference in New Issue
Block a user