mirror of
https://github.com/odin-lang/Odin.git
synced 2026-06-19 08:32:33 +00:00
Three layers on the x86 encode/decode hot paths, all byte-exact (2246 LLVM-verified cases) and roundtrip-clean: 1. Branchless: legacy-prefix emission (speculative write + conditional advance), REX/VEX/EVEX extension-bit accumulation (gate-and-mask), ModRM mod/disp-size selection (cmov selects), displacement emission (widened store + ENCODE_TAIL_SLACK); decoder REX/VEX/EVEX register extensions (arithmetic instead of if/+=8). 2. Resolve-operands-once: the previous code re-derived each user operand ~5-10x per instruction (a fresh O(n) scan of enc.ops per emission pass). Now resolved into a [4]^Operand map a single time. 3. Single-pass gather: fold the opcode-+rb and ModR/M slot-detection scans into that one resolve pass (3 enc.enc passes -> 1). Net on a 100k mixed-instruction benchmark: encode ~58 -> ~54 ns/inst (best 52). Branchless alone was a ~7% encode regression (predicted branches, nothing to recover); the algorithmic passes recovered it and beat baseline.
994 lines
34 KiB
Odin
994 lines
34 KiB
Odin
// rexcode · Brendan Punsky (dotbmp@github), original author
|
|
|
|
package rexcode_x86
|
|
|
|
// =============================================================================
|
|
// SECTION: 7. HIGH-PERFORMANCE ENCODER
|
|
// =============================================================================
|
|
//
|
|
// Ultra-fast table-driven x64 instruction encoder with:
|
|
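// - Caller-owned outputs: the code buffer and the relocs/errors arrays are supplied by the caller (encode() only allocates two temporary scratch arrays, freed on return)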
|
|
// - O(1) mnemonic lookup via enum-indexed table
|
|
// - O(1) label lookup via array indexing
|
|
// - Fully inlined hot path - no function call overhead
|
|
// - Trivially parallelizable: encode() is pure, no shared state
|
|
//
|
|
// API: Single entry point `encode()` that takes:
// - instructions: []Instruction to encode
// - label_defs: []Label_Definition mapping label_id -> instruction_index
// - code: []u8 output buffer for machine code
// - relocs: ^[dynamic]Relocation that unresolved relocations are appended to
// - errors: ^[dynamic]Error that encoding errors are appended to
// (plus optional `resolve`, `base_address` and `mode` parameters)
//
// Returns (byte_count, ok): the number of code bytes written and the success status.
// Unresolved labels are returned as relocations (no extern/internal distinction).
|
|
|
|
import "base:intrinsics"
|
|
import "core:fmt"
|
|
import "core:rexcode/isa"
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// SECTION: 7.1 Constants
|
|
// -----------------------------------------------------------------------------
|
|
|
|
// Upper bound, in bytes, assumed for one encoded instruction. encode() requires
// this much room (plus ENCODE_TAIL_SLACK) at the current offset before emitting
// each instruction, and encode_max_code_size() multiplies it by the instruction count.
MAX_INST_SIZE :: 15 // Maximum x64 instruction length
|
|
|
|
// Extra bytes reserved past each instruction so the branchless emitters can
|
|
// write a few speculative bytes beyond the logical end (e.g. a widened 4-byte
|
|
// displacement store when only a disp8 is kept). The over-written tail is
|
|
// reclaimed by the next emit; this slack just guarantees the wide store stays
|
|
// in bounds even for the final instruction against a tight buffer.
|
|
// Added to MAX_INST_SIZE in encode()'s per-instruction buffer check, and added
// once to the total by encode_max_code_size().
ENCODE_TAIL_SLACK :: 8
|
|
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// SECTION: 7.6 Core Encoding Function
|
|
// -----------------------------------------------------------------------------
|
|
|
|
// encode: The single entry point for x64 instruction encoding.
//
// Parameters:
//   instructions - Instructions to encode, in program order.
//   label_defs   - On entry maps label_id -> instruction index. MODIFIED IN PLACE
//                  (by isa.rewrite_label_defs_to_offsets) after PASS 1 so that
//                  PASS 2 can read byte offsets from it.
//   code         - Output buffer for machine code. Before each instruction is
//                  emitted, MAX_INST_SIZE + ENCODE_TAIL_SLACK bytes must be free
//                  at the current offset, otherwise that instruction is rejected
//                  with .BUFFER_OVERFLOW (see encode_max_code_size).
//   relocs       - Dynamic array; relocations that are not patched here are appended.
//   errors       - Dynamic array; encoding errors are appended.
//   resolve      - If true, relocations whose label is defined are patched into
//                  `code`. If false, every relocation is appended to `relocs`.
//   base_address - Base address handed to the absolute (ABS32/ABS64) patchers.
//   mode         - Target mode: ._64 (default) or ._32. ._16 panics.
//
// Returns:
//   byte_count - Number of bytes of machine code written to `code`.
//   ok         - false as soon as any error has been appended to `errors`.
//
// An instruction that fails validation or matching is skipped: it contributes
// no bytes, and encoding continues with the next instruction.
// References to labels that are out of range of label_defs or LABEL_UNDEFINED
// are appended to relocs.
//
encode :: proc(
	instructions: []Instruction,
	label_defs: []Label_Definition, // Input: inst index. Modified to byte offset.
	code: []u8,
	relocs: ^[dynamic]Relocation, // Unresolved relocations appended here
	errors: ^[dynamic]Error, // Errors appended here
	resolve: bool = true,
	base_address: u64 = 0,
	mode: Mode = ._64, // i386 vs x86-64 mode
) -> (byte_count: u32, ok: bool) {
	if mode == ._16 {
		// Real-mode encoding is not implemented; the ModRM addressing
		// model differs from protected/long mode and needs a separate
		// emission path. See Mode enum comment in encoding_types.odin.
		fmt.panicf("x64.encode: Mode._16 (real mode) is not yet supported")
	}

	ok = true

	// Temp storage for pending relocations (before resolution)
	pending_relocations: [dynamic]Relocation
	defer delete(pending_relocations)

	// Temp storage for instruction byte offsets
	inst_offsets: [dynamic]u32
	defer delete(inst_offsets)
	resize(&inst_offsets, len(instructions))

	// Prefix lookup tables. Loop-invariant, so they are built once here rather
	// than once per instruction.
	//   REP_BYTE:    NONE -> 0, REP -> F3, REPNE -> F2
	//   seg_prefix:  indexed by inst.flags.segment; 0 means "no override"
	//   mand_prefix: indexed by enc.flags.prefix (0 none, 1 = 66, 2 = F3, 3 = F2)
	REP_BYTE := [Rep]u8{ .NONE = 0, .REP = 0xF3, .REPNE = 0xF2 }
	seg_prefix := [8]u8{0, 0x26, 0x2E, 0x36, 0x3E, 0x64, 0x65, 0}
	mand_prefix := [4]u8{0, 0x66, 0xF3, 0xF2}

	// =========================================================================
	// PASS 1: Encode all instructions, collect relocations
	// =========================================================================

	for &inst, instruction_index in instructions {
		// Record this instruction's byte offset
		inst_offsets[instruction_index] = byte_count

		// Validate operand_count bounds
		if inst.operand_count > 4 {
			append(errors, Error{u32(instruction_index), .INVALID_OPERAND_COUNT, {}})
			ok = false
			continue
		}

		// Check buffer space
		if byte_count + MAX_INST_SIZE + ENCODE_TAIL_SLACK > u32(len(code)) {
			append(errors, Error{u32(instruction_index), .BUFFER_OVERFLOW, {}})
			ok = false
			continue
		}

		// i386 operand validation. The following don't exist in 32-bit
		// protected mode and must be rejected up front so we don't
		// silently emit bytes that mean something else (e.g. SPL would
		// encode as AH in 32-bit). Catches legacy AND VEX/EVEX paths.
		if mode == ._32 {
			invalid := false
			for i in 0..<inst.operand_count {
				op := &inst.ops[i]
				#partial switch op.kind {
				case .REGISTER:
					// R8-R15, XMM8-31, YMM8-31, ZMM8-31 require REX/VEX/EVEX extension.
					if reg_needs_rex(op.reg) { invalid = true; break }
					// SPL/BPL/SIL/DIL (REG_GPR8 hw 4-7) don't exist in i386;
					// those encodings decode as AH/CH/DH/BH there. Users
					// wanting high-byte regs should use REG_GPR8H (AH..BH).
					if reg_class(op.reg) == REG_GPR8 {
						hw := reg_hw(op.reg)
						if hw >= 4 && hw <= 7 { invalid = true; break }
					}
				case .MEMORY:
					m := op.mem
					if (mem_has_base(m) && m.base_ext) ||
					   (mem_has_index(m) && m.index_ext) {
						invalid = true; break
					}
				}
			}
			if invalid {
				append(errors, Error{u32(instruction_index), .OPERAND_MISMATCH, {}})
				ok = false
				continue
			}
		}

		// Find matching encoding from table (O(1) mnemonic lookup)
		encodings := encoding_forms(inst.mnemonic)
		if len(encodings) == 0 {
			append(errors, Error{u32(instruction_index), .INVALID_MNEMONIC, {}})
			ok = false
			continue
		}

		// Find the first encoding that matches operands
		matched_enc: ^Encoding = nil
		for &e in encodings {
			if encoding_matches_inline(&inst, &e, mode) {
				matched_enc = &e
				break
			}
		}

		if matched_enc == nil {
			append(errors, Error{u32(instruction_index), .NO_MATCHING_ENCODING, {}})
			ok = false
			continue
		}

		// =====================================================================
		// ENCODE INSTRUCTION (fully inlined hot path)
		// =====================================================================

		enc := matched_enc
		out := code[byte_count:]
		pos: u32 = 0

		// Resolve every encoding slot to its user operand ONCE, and gather the
		// ModR/M and opcode-reg slot roles in the same pass. The emission below
		// indexes user_ops[slot] instead of re-deriving the mapping per pass.
		// Implicit slots (and slots beyond the user's operand count) stay nil.
		user_ops: [4]^Operand
		mr_slot: int = -1
		reg_slot: int = -1
		opr_slot: int = -1
		{
			user_idx := 0
			for op, i in enc.ops {
				if op == .NONE { break }
				if !is_implicit_op_inline(op) {
					if user_idx < int(inst.operand_count) {
						user_ops[i] = &inst.ops[user_idx]
					}
					user_idx += 1
				}
				// Slot roles (parallel array enc.enc[i]) gathered in the same pass.
				#partial switch enc.enc[i] {
				case .MR: mr_slot = i
				case .REG: reg_slot = i
				case .OP_R: opr_slot = i
				}
			}
		}
		has_modrm := mr_slot >= 0 || reg_slot >= 0

		// --- Legacy Prefixes (branchless) ---
		//
		// Each optional prefix byte is written *speculatively* at `pos`, then
		// `pos` advances only if the prefix is actually present. When absent the
		// speculative byte is overwritten by the next emit (the opcode always
		// writes at `pos`), so the final stream is identical to the branching
		// form -- with four data-dependent branches removed. The buffer carries
		// MAX_INST_SIZE slack (checked above), so the spec writes stay in bounds.

		// Lock prefix (F0)
		out[pos] = 0xF0
		pos += u32(inst.flags.lock && enc.flags.lock_ok)

		// Rep/Repne prefix (NONE -> 0, REP -> F3, REPNE -> F2)
		rep_b := REP_BYTE[inst.flags.rep]
		out[pos] = rep_b
		pos += u32(rep_b != 0)

		// Segment override (table already maps 0 -> 0)
		seg_b := seg_prefix[inst.flags.segment]
		out[pos] = seg_b
		pos += u32(seg_b != 0)

		// Address size override (67h)
		out[pos] = 0x67
		pos += u32(inst.flags.addr32)

		// --- VEX/EVEX or Legacy Encoding ---

		#partial switch enc.flags.vex_type {
		case .VEX:
			// VEX prefix encoding. r/x/b are stored inverted: 1 = no extension.
			r: u8 = 1; x: u8 = 1; b: u8 = 1
			vvvv: u8 = 0xF; l: u8 = 0; pp: u8 = 0; mmmmm: u8 = 1; w: u8 = 0

			#partial switch enc.flags.esc {
			case ._0F: mmmmm = 1
			case ._0F38: mmmmm = 2
			case ._0F3A: mmmmm = 3
			}

			switch enc.flags.prefix {
			case 1: pp = 1 // 66
			case 2: pp = 2 // F3
			case 3: pp = 3 // F2
			}

			#partial switch enc.flags.vex_l {
			case .L1: l = 1
			}

			#partial switch enc.flags.vex_w {
			case .W1: w = 1
			}

			// Operand-driven extension bits (branchless: compute reg & mem
			// contributions, gate by kind, clear the inverted bit via AND-mask).
			for enc_type, i in enc.enc {
				user_op := user_ops[i]
				if user_op == nil { continue }

				is_reg := user_op.kind == .REGISTER
				is_mem := user_op.kind == .MEMORY
				m := user_op.mem
				reg_ext := is_reg && reg_needs_rex(user_op.reg)
				base_ext := is_mem && mem_has_base(m) && m.base_ext
				index_ext := is_mem && mem_has_index(m) && m.index_ext

				#partial switch enc_type {
				case .REG:
					r &= u8(!reg_ext)
				case .MR:
					b &= u8(!reg_ext)
					b &= u8(!base_ext)
					x &= u8(!index_ext)
				case .VVVV:
					vvvv = is_reg ? (~reg_hw(user_op.reg) & 0xF) : vvvv
				}
			}

			// 2-byte form is only available when X, B, W and the map select
			// need no non-default value; otherwise use the 3-byte form.
			if x == 1 && b == 1 && w == 0 && mmmmm == 1 {
				out[pos] = 0xC5
				out[pos+1] = (r << 7) | (vvvv << 3) | (l << 2) | pp
				pos += 2
			} else {
				out[pos] = 0xC4
				out[pos+1] = (r << 7) | (x << 6) | (b << 5) | mmmmm
				out[pos+2] = (w << 7) | (vvvv << 3) | (l << 2) | pp
				pos += 3
			}

		case .EVEX:
			// EVEX prefix encoding (4 bytes). r/x/b/rr/vvv are stored inverted.
			r: u8 = 1; x: u8 = 1; b: u8 = 1; rr: u8 = 1
			mm: u8 = 1; w: u8 = 0; vvvv: u8 = 0xF; pp: u8 = 0
			z: u8 = 0; ll: u8 = 0; bb: u8 = 0; vvv: u8 = 1; aaa: u8 = 0

			#partial switch enc.flags.esc {
			case ._0F: mm = 1
			case ._0F38: mm = 2
			case ._0F3A: mm = 3
			}

			switch enc.flags.prefix {
			case 1: pp = 1
			case 2: pp = 2
			case 3: pp = 3
			}

			#partial switch enc.flags.vex_l {
			case .L1: ll = 1
			case .L2: ll = 2
			}

			#partial switch enc.flags.vex_w {
			case .W1: w = 1
			}

			for i in 0..<4 {
				user_op := user_ops[i]
				if user_op == nil { continue }

				is_reg := user_op.kind == .REGISTER
				is_mem := user_op.kind == .MEMORY
				m := user_op.mem
				hw := reg_hw(user_op.reg) // gated by is_reg below

				// EVEX splits a 5-bit register number across two inverted bits:
				//   bit 3 -> R (ModRM.reg) / B (ModRM.rm)
				//   bit 4 -> R' (ModRM.reg) / X (register ModRM.rm) / V' (vvvv)
				// Bit 3 has to be tested directly: registers 16-23 have bit 4
				// set but bit 3 clear, so a plain `hw >= 8` test would encode
				// them as registers 24-31.
				reg_bit3 := is_reg && (hw & 0x08) != 0
				reg_bit4 := is_reg && hw >= 16
				base_ext := is_mem && mem_has_base(m) && m.base_ext
				index_ext := is_mem && mem_has_index(m) && m.index_ext

				#partial switch enc.enc[i] {
				case .REG:
					r &= u8(!reg_bit3)
					rr &= u8(!reg_bit4)
				case .MR:
					b &= u8(!reg_bit3)
					x &= u8(!reg_bit4) // register r/m: X carries bit 4
					b &= u8(!base_ext)
					x &= u8(!index_ext)
					bb |= u8(is_mem && user_op.flags.broadcast != .NONE)
				case .VVVV:
					vvvv = is_reg ? (~hw & 0xF) : vvvv
					vvv &= u8(!reg_bit4)
				case .AAA:
					aaa = is_reg ? (hw & 0x7) : aaa
				}
				z |= u8(user_op.flags.zeroing)
			}

			out[pos] = 0x62
			out[pos+1] = (r << 7) | (x << 6) | (b << 5) | (rr << 4) | mm
			out[pos+2] = (w << 7) | (vvvv << 3) | 0x04 | pp
			out[pos+3] = (z << 7) | (ll << 5) | (bb << 4) | (vvv << 3) | aaa
			pos += 4

		case: // Legacy encoding
			// Operand size override (66h): needed when the first 16-bit GPR
			// operand is found, unless the form's mandatory prefix already is 66.
			needs_66 := false
			for i in 0..<inst.operand_count {
				if inst.ops[i].kind == .REGISTER && reg_class(inst.ops[i].reg) == REG_GPR16 {
					needs_66 = enc.flags.prefix != 1 // PREFIX_66
					break
				}
			}
			if needs_66 {
				out[pos] = 0x66
				pos += 1
			}

			// Mandatory prefix. needs_66 can only be true when the mandatory
			// prefix is not 66, so the prefix is emitted exactly when one is
			// defined and a 66 byte is never written twice.
			if enc.flags.prefix != 0 {
				out[pos] = mand_prefix[enc.flags.prefix]
				pos += 1
			}

			// REX prefix (branchless: OR each operand's contribution via a mask).
			// Both the register and memory contributions are computed and gated
			// by operand kind, so the data-dependent REGISTER/MEMORY branch is
			// gone; only the per-form enc_type switch (predictable) remains.
			rex: u8 = bmask(enc.flags.force_rex_w) & 0x48

			for enc_type, i in enc.enc {
				if enc_type == .NONE { continue }
				user_op := user_ops[i]
				if user_op == nil { continue }

				is_reg := user_op.kind == .REGISTER
				is_mem := user_op.kind == .MEMORY
				m := user_op.mem // union bytes; only used when is_mem
				reg_ext := is_reg && reg_needs_rex(user_op.reg)
				base_ext := is_mem && mem_has_base(m) && m.base_ext
				index_ext := is_mem && mem_has_index(m) && m.index_ext

				#partial switch enc_type {
				case .REG:
					rex |= bmask(reg_ext) & 0x44
				case .MR:
					rex |= bmask(reg_ext) & 0x41 // register r/m -> REX.B
					rex |= bmask(base_ext) & 0x41
					rex |= bmask(index_ext) & 0x42
				case .OP_R:
					rex |= bmask(reg_ext) & 0x41
				}
			}

			// SPL/BPL/SIL/DIL (GPR8 hw 4-7) require an empty REX (long mode only).
			if mode == ._64 {
				spl_seen := false
				for i in 0..<inst.operand_count {
					op := &inst.ops[i]
					hw := reg_hw(op.reg)
					spl_seen ||= op.kind == .REGISTER && reg_class(op.reg) == REG_GPR8 && hw >= 4 && hw <= 7
				}
				rex |= bmask(rex == 0 && spl_seen) & 0x40
			}

			// 32-bit mode forbids the REX prefix entirely. If any operand
			// demanded REX bits (R8-R15, SPL/BPL/SIL/DIL, force_rex_w),
			// the instruction is not legal i386.
			if mode == ._32 && rex != 0 {
				append(errors, Error{u32(instruction_index), .OPERAND_MISMATCH, {}})
				ok = false
				continue
			}

			if rex != 0 {
				out[pos] = rex
				pos += 1
			}

			// Escape bytes
			#partial switch enc.flags.esc {
			case ._0F:
				out[pos] = 0x0F
				pos += 1
			case ._0F38:
				out[pos] = 0x0F; out[pos+1] = 0x38
				pos += 2
			case ._0F3A:
				out[pos] = 0x0F; out[pos+1] = 0x3A
				pos += 2
			}
		}

		// --- Opcode ---
		opcode := enc.opcode

		// Handle +rb/+rw/+rd/+ro (register in opcode). For x87 fixed-ModR/M
		// forms (opcodes 0xD8..0xDF with ext >= 0xC0), the .OP_R index goes
		// into the rm field of the fixed ModR/M byte instead of the opcode.
		x87_fixed_modrm := opcode >= 0xD8 && opcode <= 0xDF && enc.ext >= 0xC0
		opr_index: u8 = 0
		opr_seen := false
		if opr_slot >= 0 {
			user_op := user_ops[opr_slot]
			if user_op != nil && user_op.kind == .REGISTER {
				opr_index = reg_hw(user_op.reg) & 0x07
				opr_seen = true
			}
		}
		if opr_seen && !x87_fixed_modrm {
			opcode += opr_index
		}

		out[pos] = opcode
		pos += 1

		// --- ModR/M and SIB --- (mr_slot/reg_slot/has_modrm gathered above)
		if has_modrm {
			has_sib := false
			mod: u8 = 0
			reg_field: u8 = 0
			rm: u8 = 0
			sib: u8 = 0
			disp: i32 = 0
			displacement_size: u8 = 0

			// Reg field: either the opcode extension (/digit) or the REG operand.
			if enc.flags.modrm_reg_ext {
				reg_field = enc.ext & 0x07
			} else if reg_slot >= 0 {
				reg_op := user_ops[reg_slot]
				if reg_op != nil && reg_op.kind == .REGISTER {
					reg_field = reg_hw(reg_op.reg) & 0x07
				}
			}

			// R/M field
			if mr_slot >= 0 {
				mr_op := user_ops[mr_slot]
				if mr_op != nil {
					#partial switch mr_op.kind {
					case .REGISTER:
						mod = 0b11
						rm = reg_hw(mr_op.reg) & 0x07
					case .MEMORY:
						m := mr_op.mem

						if mem_is_rip_relative(m) {
							// mod=00, rm=101 with a 32-bit displacement
							mod = 0b00
							rm = 0b101
							disp = m.disp
							displacement_size = 4
						} else if !mem_has_base(m) && !mem_has_index(m) {
							// Absolute [disp32]: SIB with no index (100) and no base (101)
							mod = 0b00
							rm = 0b100
							has_sib = true
							sib = 0b00_100_101
							disp = m.disp
							displacement_size = 4
						} else {
							base_hw := m.base_hw
							has_index := mem_has_index(m)
							disp_value := m.disp

							// rm=100 is the SIB escape, so a base whose low
							// three bits are 100 also has to go through SIB.
							needs_sib := has_index || (base_hw & 0x07) == 4

							has_base := mem_has_base(m)
							is_rbp := (base_hw & 0x07) == 5
							is_zero := disp_value == 0
							fits8 := disp_value >= -128 && disp_value <= 127
							disp = disp_value

							if needs_sib {
								has_sib = true
								rm = 0b100

								scale: u8 = 0
								switch mem_scale(m) {
								case 2: scale = 1
								case 4: scale = 2
								case 8: scale = 3
								}

								idx := has_index ? (m.index_hw & 0x07) : u8(0b100)
								base_sib := has_base ? (base_hw & 0x07) : u8(0b101)
								sib = (scale << 6) | (idx << 3) | base_sib

								// mod / disp size, branchless. No base -> [disp32]
								// (mod 00, size 4). Otherwise: no displacement when
								// zero and not RBP-like; else disp8 if it fits, else
								// disp32. (RBP-like base forces an explicit disp8.)
								no_disp := has_base && is_zero && !(has_base && is_rbp)
								displacement_size = !has_base ? 4 : (no_disp ? 0 : (fits8 ? 1 : 4))
								mod = !has_base ? 0b00 : (no_disp ? 0b00 : (fits8 ? 0b01 : 0b10))
							} else {
								rm = base_hw & 0x07
								no_disp := is_zero && !is_rbp
								displacement_size = no_disp ? 0 : (fits8 ? 1 : 4)
								mod = no_disp ? 0b00 : (fits8 ? 0b01 : 0b10)
							}
						}
					}
				}
			}

			out[pos] = (mod << 6) | (reg_field << 3) | rm
			pos += 1

			if has_sib {
				out[pos] = sib
				pos += 1
			}

			// Displacement: four unconditional little-endian byte stores, then
			// advance by the real size (0/1/4) -- no data-dependent loop. The
			// untaken tail bytes are reclaimed by the next emit; ENCODE_TAIL_SLACK
			// keeps the widened store in bounds.
			out[pos+0] = u8(disp)
			out[pos+1] = u8(disp >> 8)
			out[pos+2] = u8(disp >> 16)
			out[pos+3] = u8(disp >> 24)
			pos += u32(displacement_size)
		}

		// Fixed ModR/M for special instructions. Triggered for:
		// - 0F-escape forms (NOP-class, MONITOR/MWAIT, etc.)
		// - x87 ST(i) and special control instructions (opcodes 0xD8..0xDF)
		is_x87_opcode := enc.opcode >= 0xD8 && enc.opcode <= 0xDF
		if enc.ext >= 0xC0 && !has_modrm && (enc.flags.esc != .NONE || is_x87_opcode) {
			modrm_byte := enc.ext
			// For x87 ST(i) forms, OR the OP_R register index into the rm field
			if x87_fixed_modrm && opr_seen {
				modrm_byte = (modrm_byte & 0xF8) | opr_index
			}
			out[pos] = modrm_byte
			pos += 1
		}

		// --- Immediates ---
		// RELATIVE operands emit zero placeholder bytes and queue a relocation
		// at the placeholder's absolute offset (byte_count + pos).
		for enc_type, i in enc.enc {
			#partial switch enc_type {
			case .IB:
				user_op := user_ops[i]
				if user_op != nil {
					#partial switch user_op.kind {
					case .IMMEDIATE:
						out[pos] = u8(user_op.immediate)
						pos += 1
					case .RELATIVE:
						// Relative reference - record relocation
						label_id := u32(user_op.relative)
						append(&pending_relocations, Relocation{byte_count + pos, label_id, 0, .REL8, 1, u16(instruction_index)})
						out[pos] = 0
						pos += 1
					}
				}

			case .IW:
				user_op := user_ops[i]
				if user_op != nil && user_op.kind == .IMMEDIATE {
					immediate_val := u16(user_op.immediate)
					out[pos] = u8(immediate_val); out[pos+1] = u8(immediate_val >> 8)
					pos += 2
				}

			case .ID:
				user_op := user_ops[i]
				if user_op != nil {
					#partial switch user_op.kind {
					case .IMMEDIATE:
						immediate_val := u32(user_op.immediate)
						out[pos] = u8(immediate_val); out[pos+1] = u8(immediate_val >> 8)
						out[pos+2] = u8(immediate_val >> 16); out[pos+3] = u8(immediate_val >> 24)
						pos += 4
					case .RELATIVE:
						label_id := u32(user_op.relative)
						append(&pending_relocations, Relocation{byte_count + pos, label_id, 0, .REL32, 4, u16(instruction_index)})
						out[pos] = 0; out[pos+1] = 0; out[pos+2] = 0; out[pos+3] = 0
						pos += 4
					}
				}

			case .IQ:
				user_op := user_ops[i]
				if user_op != nil && user_op.kind == .IMMEDIATE {
					immediate_val := u64(user_op.immediate)
					for j in u32(0)..<8 { out[pos + j] = u8(immediate_val >> (j * 8)) }
					pos += 8
				}
			}
		}

		byte_count += pos
	}

	// =========================================================================
	// PASS 1.5: Convert label_defs from instruction indices to byte offsets
	// =========================================================================

	isa.rewrite_label_defs_to_offsets(label_defs, inst_offsets[:])

	// =========================================================================
	// PASS 2: Resolve relocations (x64 dispatches its types to isa patch primitives)
	// =========================================================================

	for &relocation in pending_relocations {
		label_id := relocation.label_id

		// O(1) label lookup - label_defs now contains byte offsets
		if label_id >= u32(len(label_defs)) || label_defs[label_id] == LABEL_UNDEFINED {
			// Unresolved - append to user's relocs array
			append(relocs, relocation)
			continue
		}

		target_offset := u32(label_defs[label_id])
		patch_offset := relocation.offset

		if !resolve {
			// User wants all relocations, even resolvable ones
			append(relocs, relocation)
			continue
		}

		#partial switch relocation.type {
		case .REL8:
			// x64 REL8: instruction ends 1 byte after the value field.
			next_pc := patch_offset + 1
			if !patch_pcrel_i8(code, patch_offset, target_offset, next_pc, relocation.addend) {
				append(errors, Error{u32(relocation.inst_idx), .LABEL_OUT_OF_RANGE, {}})
				ok = false
			}

		case .REL32:
			next_pc := patch_offset + 4
			patch_pcrel_i32(code, patch_offset, target_offset, next_pc, relocation.addend)

		case .ABS32:
			patch_abs32(code, patch_offset, target_offset, base_address, relocation.addend)

		case .ABS64:
			patch_abs64(code, patch_offset, target_offset, base_address, relocation.addend)
		}
	}

	return
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// SECTION: 7.7 Inline Helper Functions
|
|
// -----------------------------------------------------------------------------
|
|
|
|
// Branchless select mask: all ones (0xFF) when `b` is true, 0x00 otherwise.
// Lets callers OR-accumulate REX/VEX/EVEX bit contributions without a
// per-condition branch (`x |= bmask(cond) & bits`).
@(private="file")
bmask :: #force_inline proc "contextless" (b: bool) -> u8 {
	// The bool converts to 0 or 1; subtracting it from zero wraps 1 to 0xFF.
	bit := u8(b)
	return 0 - bit
}
|
|
|
|
// Decide whether `inst`'s operands fit the encoding form `enc`.
//
// `mode` is forwarded to mode_rewrite_op_type (together with the form's
// default_64 flag) to obtain the effective operand type of every slot, and
// forms flagged mode_32_only are rejected outright unless mode is ._32.
//
// Accepted operand counts:
//   - exactly explicit_count operands, matched in order against the
//     non-implicit slots of the form, or
//   - explicit_count + 1 operands when the form has implicit slots and the last
//     user operand spells out one of them (e.g. CL for shifts); the leading
//     operands are then matched against the non-implicit slots.
encoding_matches_inline :: proc "contextless" (inst: ^Instruction, enc: ^Encoding, mode: Mode) -> bool {
	// Mode gate: i386-only forms never match outside Mode._32.
	if enc.flags.mode_32_only && mode != ._32 {
		return false
	}

	n_explicit := enc.flags.explicit_count

	// Fast path: without implicit slots, user operand i pairs with slot i.
	if !enc.flags.has_implicit {
		if inst.operand_count != n_explicit {
			return false
		}
		for slot in 0 ..< n_explicit {
			wanted := mode_rewrite_op_type(enc.ops[slot], mode, enc.flags.default_64)
			if !operand_matches_inline(&inst.ops[slot], wanted) {
				return false
			}
		}
		return true
	}

	// `to_match` is the number of leading user operands that pair with
	// non-implicit slots. `must_consume_all` additionally demands that the
	// form has exactly that many non-implicit slots.
	to_match := inst.operand_count
	must_consume_all := false

	if inst.operand_count == n_explicit + 1 {
		// One extra operand: it has to name an implicit operand of this form.
		trailing := &inst.ops[inst.operand_count - 1]
		spelled_out := false
		for slot_type in enc.ops {
			if slot_type == .NONE { break }
			if is_implicit_op_inline(slot_type) && implicit_operand_matches(trailing, slot_type) {
				spelled_out = true
				break
			}
		}
		if !spelled_out {
			return false
		}
		to_match = inst.operand_count - 1
		must_consume_all = true
	} else if inst.operand_count != n_explicit {
		// Standard case: the operand count must equal the non-implicit count.
		return false
	}

	// Walk the form's slots, pairing each non-implicit slot with the next
	// user operand.
	next_user: u8 = 0
	for slot_type in enc.ops {
		if slot_type == .NONE { break }
		if is_implicit_op_inline(slot_type) { continue }

		if next_user >= to_match {
			return false
		}
		wanted := mode_rewrite_op_type(slot_type, mode, enc.flags.default_64)
		if !operand_matches_inline(&inst.ops[next_user], wanted) {
			return false
		}
		next_user += 1
	}

	return !must_consume_all || next_user == to_match
}
|
|
|
|
// Report whether a user-supplied operand names the register that the implicit
// slot type `op_type` stands for. Only CL, DX, ST0 and XMM0 can be spelled out
// this way; every other slot type, and every non-register operand, yields false.
implicit_operand_matches :: #force_inline proc "contextless" (op: ^Operand, op_type: Operand_Type) -> bool {
	is_register := op.kind == .REGISTER
	#partial switch op_type {
	case .CL_IMPL:
		return is_register && op.reg == CL
	case .DX_IMPL:
		return is_register && op.reg == DX
	case .ST0_IMPL:
		return is_register && op.reg == ST0
	case .XMM0_IMPL:
		return is_register && op.reg == XMM0
	// AL/AX/EAX/RAX_IMPL are deliberately absent: those belong to short-form encodings.
	// ONE_IMPL is absent too: the constant 1 cannot be supplied as a register.
	}
	return false
}
|
|
|
|
// True when `op` is one of the implicit operand slot types, i.e. a slot that
// is matched and resolved without consuming a user operand.
is_implicit_op_inline :: #force_inline proc "contextless" (op: Operand_Type) -> bool {
	implicit := false
	#partial switch op {
	case .AL_IMPL, .AX_IMPL, .EAX_IMPL, .RAX_IMPL,
	     .CL_IMPL, .DX_IMPL, .ONE_IMPL, .ST0_IMPL, .XMM0_IMPL:
		implicit = true
	}
	return implicit
}
|
|
|
|
// Dispatch on the user operand's kind and ask the kind-specific matcher whether
// it fits the slot type `op_type`.
operand_matches_inline :: #force_inline proc "contextless" (op: ^Operand, op_type: Operand_Type) -> bool {
	matched := false
	switch op.kind {
	case .NONE:
		matched = op_type == .NONE
	case .REGISTER:
		matched = reg_matches_inline(op, op_type)
	case .MEMORY:
		matched = mem_matches_inline(op, op_type)
	case .IMMEDIATE:
		matched = imm_matches_inline(op, op_type)
	case .RELATIVE:
		// The operand size is the user's preference: 1 pins REL8, 4 pins
		// REL32, anything else accepts either width.
		is_rel8 := op_type == .REL8
		is_rel32 := op_type == .REL32
		switch op.size {
		case 1: matched = is_rel8
		case 4: matched = is_rel32
		case:   matched = is_rel8 || is_rel32
		}
	}
	return matched
}
|
|
|
|
// Check a register operand against a slot type by comparing the register's
// class with the class that slot type accepts. Slot types not listed here
// never accept a register.
reg_matches_inline :: #force_inline proc "contextless" (op: ^Operand, op_type: Operand_Type) -> bool {
	rc := reg_class(op.reg)
	fits := false
	#partial switch op_type {
	case .R8, .RM8:
		// 8-bit slots take both the low-byte and the high-byte register class.
		fits = rc == REG_GPR8 || rc == REG_GPR8H
	case .R16, .RM16:
		fits = rc == REG_GPR16
	case .R32, .RM32:
		fits = rc == REG_GPR32
	case .R64, .RM64:
		fits = rc == REG_GPR64
	case .XMM, .XMM_M32, .XMM_M64, .XMM_M128:
		fits = rc == REG_XMM
	case .YMM, .YMM_M256:
		fits = rc == REG_YMM
	case .ZMM, .ZMM_M512:
		fits = rc == REG_ZMM
	case .MM, .MM_M64:
		fits = rc == REG_MM
	case .K, .K_M8, .K_M16, .K_M32, .K_M64:
		fits = rc == REG_K
	case .SREG:
		fits = rc == REG_SEG
	case .CR:
		fits = rc == REG_CR
	case .DR:
		fits = rc == REG_DR
	case .STI:
		fits = rc == REG_ST
	}
	return fits
}
|
|
|
|
// Check a memory operand against a slot type. Slot types are grouped by the
// operand size (in bytes) they require; a few accept any size. Slot types not
// listed here never accept a memory operand.
mem_matches_inline :: #force_inline proc "contextless" (op: ^Operand, op_type: Operand_Type) -> bool {
	#partial switch op_type {
	// Size-agnostic slots
	case .M, .M16_16, .M16_32, .M16_64:
		return true

	// Sized slots
	case .RM8, .M8, .K_M8:
		return op.size == 1
	case .RM16, .M16, .K_M16:
		return op.size == 2
	case .RM32, .M32, .XMM_M32, .K_M32:
		return op.size == 4
	case .RM64, .M64, .XMM_M64, .MM_M64, .K_M64:
		return op.size == 8
	case .M80:
		return op.size == 10
	case .M128, .XMM_M128:
		return op.size == 16
	case .M256, .YMM_M256:
		return op.size == 32
	case .M512, .ZMM_M512:
		return op.size == 64
	}
	return false
}
|
|
|
|
// Check an immediate operand against an immediate slot type by VALUE range.
//
// Apart from IMM8SX, each width accepts the union of its signed and unsigned
// ranges, because the same bytes may be read either way depending on the
// instruction (e.g. MOV r32, imm32 encodes 0xFFFFFFFF and -1 identically).
// IMM8SX is sign-extended by the instruction, so only the signed 8-bit range
// is accepted. Non-immediate slot types yield false.
imm_matches_inline :: #force_inline proc "contextless" (op: ^Operand, op_type: Operand_Type) -> bool {
	v := op.immediate
	#partial switch op_type {
	case .IMM8:
		// signed [-128, 127] or unsigned [0, 255]
		return v >= -128 && v <= 255
	case .IMM8SX:
		// signed 8-bit only
		return v >= -128 && v <= 127
	case .IMM16:
		// signed [-32768, 32767] or unsigned [0, 65535]
		return v >= -32768 && v <= 65535
	case .IMM32:
		// signed [-2147483648, 2147483647] or unsigned [0, 4294967295]
		return v >= -2147483648 && v <= 4294967295
	case .IMM64:
		// every representable value fits
		return true
	}
	return false
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// SECTION: 7.8 Convenience Functions
|
|
// -----------------------------------------------------------------------------
|
|
|
|
// Compute safe buffer sizes for encoding
|
|
// Returns the code-buffer size, in bytes, to reserve for `n` instructions:
// MAX_INST_SIZE bytes per instruction plus a fixed ENCODE_TAIL_SLACK.
encode_max_code_size :: #force_inline proc "contextless" (n: int) -> int {
	per_instruction_total := n * MAX_INST_SIZE
	return per_instruction_total + ENCODE_TAIL_SLACK
}
|
|
|
|
// Returns the number of relocation slots to reserve for `n` instructions.
// An instruction yields at most one relocation, so the bound is `n` itself.
encode_max_relocation_count :: #force_inline proc "contextless" (n: int) -> int {
	max_relocations := n
	return max_relocations
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// SECTION: 7.10 Dynamic Array Instruction Emitters
|
|
// -----------------------------------------------------------------------------
|
|
|
|
// Instruction emitters
|
|
// Appends the instruction produced by `inst_none(mnemonic)` to `instructions`.
emit_none :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic) {
	inst := inst_none(mnemonic)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by `inst_r(mnemonic, r)` to `instructions`.
emit_r :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, r: Register) {
	inst := inst_r(mnemonic, r)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by `inst_r_r(mnemonic, destination, source)`
// to `instructions`.
emit_rr :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination, source: Register) {
	inst := inst_r_r(mnemonic, destination, source)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_r_i(mnemonic, destination, immediate, immediate_size)` to `instructions`.
emit_ri :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination: Register, immediate: i64, immediate_size: u8) {
	inst := inst_r_i(mnemonic, destination, immediate, immediate_size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_r_m(mnemonic, destination, source, size)` to `instructions`.
emit_rm :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination: Register, source: Memory, size: u8) {
	inst := inst_r_m(mnemonic, destination, source, size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_m_r(mnemonic, destination, size, source)` to `instructions`.
emit_mr :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination: Memory, size: u8, source: Register) {
	inst := inst_m_r(mnemonic, destination, size, source)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by `inst_m(mnemonic, m, size)` to
// `instructions`.
emit_m :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, m: Memory, size: u8) {
	inst := inst_m(mnemonic, m, size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_m_i(mnemonic, destination, size, immediate, immediate_size)` to
// `instructions`.
emit_mi :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination: Memory, size: u8, immediate: i64, immediate_size: u8) {
	inst := inst_m_i(mnemonic, destination, size, immediate, immediate_size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by `inst_rel(mnemonic, label_id, size)` to
// `instructions`. `size` defaults to 4 when the caller omits it.
emit_rel :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, label_id: u32, size: u8 = 4) {
	inst := inst_rel(mnemonic, label_id, size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_r_r_r(mnemonic, destination, source1, source2)` to `instructions`.
emit_rrr :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination, source1, source2: Register) {
	inst := inst_r_r_r(mnemonic, destination, source1, source2)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_r_r_m(mnemonic, destination, source1, source2, size)` to `instructions`.
emit_rrm :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination, source1: Register, source2: Memory, size: u8) {
	inst := inst_r_r_m(mnemonic, destination, source1, source2, size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_r_r_i(mnemonic, destination, source, immediate, immediate_size)` to
// `instructions`.
emit_rri :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination, source: Register, immediate: i64, immediate_size: u8) {
	inst := inst_r_r_i(mnemonic, destination, source, immediate, immediate_size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_r_r_r_r(mnemonic, destination, source1, source2, source3)` to
// `instructions`.
emit_rrrr :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination, source1, source2, source3: Register) {
	inst := inst_r_r_r_r(mnemonic, destination, source1, source2, source3)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_i(mnemonic, immediate, immediate_size)` to `instructions`.
emit_i :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, immediate: i64, immediate_size: u8) {
	inst := inst_i(mnemonic, immediate, immediate_size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_r_m_i(mnemonic, destination, source, mem_size, immediate, immediate_size)`
// to `instructions`.
emit_rmi :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination: Register, source: Memory, mem_size: u8, immediate: i64, immediate_size: u8) {
	inst := inst_r_m_i(mnemonic, destination, source, mem_size, immediate, immediate_size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_m_r_i(mnemonic, destination, mem_size, source, immediate, immediate_size)`
// to `instructions`.
emit_mri :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, destination: Memory, mem_size: u8, source: Register, immediate: i64, immediate_size: u8) {
	inst := inst_m_r_i(mnemonic, destination, mem_size, source, immediate, immediate_size)
	append(instructions, inst)
}
|
|
|
|
// Appends the instruction produced by
// `inst_rel_offset(mnemonic, offset, offset_size)` to `instructions`.
emit_rel_offset :: #force_inline proc(instructions: ^[dynamic]Instruction, mnemonic: Mnemonic, offset: i64, offset_size: u8) {
	inst := inst_rel_offset(mnemonic, offset, offset_size)
	append(instructions, inst)
}
|