Files
Odin/core/rexcode/isa/arm64/encoder.odin
Brendan Punsky fae15847a3 rexcode: buffer-sizing helpers across all ISAs + naming-contract doc
Roll the encode/decode buffer-sizing helpers (added for x86 in 49787b7de) out
to every other ISA, and document them in the cross-arch naming contract.

Per arch (arm32, arm64, mips, riscv, ppc, ppc_vle, rsp, mos6502, mos65816):
  - encode_max_code_size / encode_max_relocation_count now key off the
    []Instruction slice (were int counts); bodies unchanged (* MAX_INST_SIZE).
  - encode_reserve(code, relocs, instructions): grows the caller's code []u8 by
    length and reserves relocs by capacity; allocates no new buffers.
  - decode_max_instruction_count / decode_estimate_instruction_count: exact
    ceiling and typical estimate, keyed off the min/avg instruction size per
    arch (fixed-4: arm64/mips/ppc/rsp; min-2: arm32/riscv/ppc_vle; min-1: mos).
  - decode_reserve(instructions, inst_info, label_defs, data, exact=false).

docs/cross_arch_design.md: helpers added to the naming contract.

No behavior change to the existing size helpers (signature only). All 10 ISAs
check + test green (x86 2282, arm32 600, arm64 461, mips 281, riscv 154, ppc 31,
ppc_vle 281, rsp 70, mos6502 148, mos65816 53).
2026-06-19 04:11:30 -04:00

914 lines
33 KiB
Odin

// rexcode · Brendan Punsky (dotbmp@github), original author
package rexcode_arm64
// =============================================================================
// AArch64 ENCODER
// =============================================================================
//
// Fixed-width 4-byte ISA. Two-pass design mirroring mips/encoder.odin /
// riscv/encoder.odin. The interesting bits vs other arches:
//
// * Compound operands: SHIFTED_REG (Rm + shift type + amount) and
// EXTENDED_REG (Rm + extend + amount) are packed by the RM encoder
// by inspecting the operand kind -- a plain REGISTER decays to
// LSL #0 / UXTX, amount=0.
//
// * Three split-immediate scatter patterns:
// BRANCH_PG21 -- 21-bit imm split as immlo[30:29] + immhi[23:5]
// (ADR / ADRP)
// TBZ_BIT -- 6-bit bit position split as b5[31] + b40[23:19]
// (TBZ / TBNZ)
// SYS_FIELD -- 15-bit (op0:op1:CRn:CRm:op2) at bits 19:5
//
// * Loads/stores with the unsigned-offset (LDR/STR) form scale the
// user displacement by data size (1/2/4/8) derived from bits[31:30]
// of the encoding. LDP/STP pair forms scale a signed 7-bit field.
//
// * Endianness: AArch64 standard mode stores instructions LE; BE-32
// (instructions stored big-endian) is legacy and rare. Parameter
// defaults to LITTLE.
MAX_INST_SIZE :: 4 // Bytes per encoded instruction; the size helpers below multiply instruction counts by this.
// Upper bound, in bytes, on the machine code produced for `instructions`:
// one MAX_INST_SIZE-byte slot per instruction in the slice.
encode_max_code_size :: #force_inline proc "contextless" (instructions: []Instruction) -> int {
	instruction_count := len(instructions)
	return MAX_INST_SIZE * instruction_count
}
// Relocation budget for encoding `instructions`: one relocation slot per
// instruction in the slice.
encode_max_relocation_count :: #force_inline proc "contextless" (instructions: []Instruction) -> int {
	instruction_count := len(instructions)
	return instruction_count
}
// Pre-size the caller's encode outputs ahead of an encode call.
//
//   code   -- grown by LENGTH to at least encode_max_code_size(instructions)
//             so that code[:] can be handed to encode as the emit target.
//             Never shrunk.
//   relocs -- CAPACITY reserved for the current length plus
//             encode_max_relocation_count(instructions); length is untouched.
//
// Either pointer may be nil, in which case that array is skipped.
encode_reserve :: proc(code: ^[dynamic]u8, relocs: ^[dynamic]Relocation, instructions: []Instruction) {
	if code != nil {
		needed := encode_max_code_size(instructions)
		if needed > len(code) {
			resize(code, needed)
		}
	}
	if relocs != nil {
		extra := encode_max_relocation_count(instructions)
		reserve(relocs, len(relocs) + extra)
	}
}
// Encode `instructions` into `code`, 4 bytes per instruction, starting at
// byte 0 of `code`.
//
// Pass 1 encodes each instruction in order and stops at the first one that
// fails (the error has already been appended to `errors`). Pass 1.5 rewrites
// every defined entry of `label_defs` IN PLACE, multiplying it by 4. Pass 2
// (only when `resolve` is true) tries to resolve each relocation appended
// during this call; relocations the resolver consumes are removed from
// `relocs`, the rest are compacted and kept.
//
// Returns the number of bytes written and ok == true when this call appended
// no errors. If `code` is shorter than 4 * len(instructions), a
// BUFFER_OVERFLOW error is appended and (0, false) is returned.
encode :: proc(
	instructions: []Instruction,
	label_defs: []Label_Definition,
	code: []u8,
	relocs: ^[dynamic]Relocation,
	errors: ^[dynamic]Error,
	endianness: Endianness = .LITTLE,
	resolve: bool = true,
	base_address: u64 = 0,
) -> (byte_count: u32, ok: bool) {
	n_inst := u32(len(instructions))
	if n_inst * 4 > u32(len(code)) {
		append(errors, Error{inst_idx = 0, code = .BUFFER_OVERFLOW})
		return
	}

	// Remember where this call's errors and relocations begin so earlier
	// contents of the caller's arrays are left alone.
	first_error := u32(len(errors))
	first_reloc := u32(len(relocs))

	// ---- PASS 1: encode every instruction --------------------------------
	for i := u32(0); i < n_inst; i += 1 {
		word, encoded := encode_one_inline(&instructions[i], byte_count, u16(i), relocs, errors)
		if !encoded {
			return byte_count, false
		}
		write_u32(code, byte_count, word, endianness)
		byte_count += 4
	}

	// ---- PASS 1.5: fixed-width => scale defined labels by 4 ---------------
	for i in 0..<len(label_defs) {
		if label_defs[i] == LABEL_UNDEFINED {
			continue
		}
		label_defs[i] = Label_Definition(u32(label_defs[i]) * 4)
	}

	// ---- PASS 2: resolve relocations, compacting the ones left pending ----
	if resolve {
		total := u32(len(relocs))
		kept := first_reloc
		for src in first_reloc..<total {
			pending := relocs[src]
			if !resolve_relocation_inline(code, label_defs, &pending, endianness, base_address, errors) {
				if kept != src {
					relocs[kept] = pending
				}
				kept += 1
			}
		}
		if kept != total {
			resize(relocs, int(kept))
		}
	}

	return byte_count, u32(len(errors)) == first_error
}
// =============================================================================
// Internal: encode one instruction
// =============================================================================
// Encode a single instruction to its 32-bit word.
//
// Picks the first encoding form of the mnemonic whose operand types match the
// instruction, starts from that form's fixed bits, and ORs in the packed bits
// of each of the four operand slots whose encoding is not .NONE.
//
// On failure an Error tagged with `inst_idx` is appended to `errors` and
// (0, false) is returned:
//   INVALID_MNEMONIC     -- mnemonic is .INVALID or has no encoding forms
//   NO_MATCHING_ENCODING -- no form accepts the operands
@(private="file")
encode_one_inline :: #force_inline proc(
	inst: ^Instruction,
	pc: u32,
	inst_idx: u16,
	relocs: ^[dynamic]Relocation,
	errors: ^[dynamic]Error,
) -> (word: u32, ok: bool) {
	if inst.mnemonic == .INVALID {
		append(errors, Error{inst_idx = u32(inst_idx), code = .INVALID_MNEMONIC})
		return
	}

	candidates := encoding_forms(inst.mnemonic)
	if len(candidates) == 0 {
		append(errors, Error{inst_idx = u32(inst_idx), code = .INVALID_MNEMONIC})
		return
	}

	// First matching form wins.
	chosen: ^Encoding = nil
	for i in 0..<len(candidates) {
		if encoding_matches_inline(inst, &candidates[i]) {
			chosen = &candidates[i]
			break
		}
	}
	if chosen == nil {
		append(errors, Error{inst_idx = u32(inst_idx), code = .NO_MATCHING_ENCODING})
		return
	}

	word = chosen.bits
	for slot in 0..<4 {
		slot_enc := chosen.enc[slot]
		if slot_enc == .NONE {
			continue
		}
		word |= pack_operand_inline(&inst.ops[slot], slot_enc, chosen, pc, inst_idx, relocs)
	}
	return word, true
}
// True when each of the instruction's four operand slots is accepted by the
// corresponding operand type of `form`. Slots are checked in order and the
// first mismatch ends the check.
@(private="file")
encoding_matches_inline :: #force_inline proc "contextless" (
	inst: ^Instruction, form: ^Encoding,
) -> bool {
	for slot in 0..<4 {
		if !operand_matches_inline(&inst.ops[slot], form.ops[slot], form) {
			return false
		}
	}
	return true
}
// Decide whether the user-supplied operand `op` is acceptable for a form slot
// of operand type `ot`.
//
// Most cases check the operand kind and, for registers, the register class;
// vector/SVE cases additionally compare op.size against a per-type tag.
// `form` is consulted only by .BITMASK_IMM (for the 32/64-bit width flag).
// Returns false for any operand type not handled by the switch.
@(private="file")
operand_matches_inline :: #force_inline proc "contextless" (
op: ^Operand, ot: Operand_Type, form: ^Encoding,
) -> bool {
switch ot {
case .NONE:
return op.kind == .NONE
case .W_REG:
return op.kind == .REGISTER && reg_class(op.reg) == REG_W
case .X_REG:
return op.kind == .REGISTER && reg_class(op.reg) == REG_X
// The *SP variants accept the general class or the stack-pointer class.
case .WSP_REG:
return op.kind == .REGISTER && (reg_class(op.reg) == REG_W || reg_class(op.reg) == REG_WSP)
case .XSP_REG:
return op.kind == .REGISTER && (reg_class(op.reg) == REG_X || reg_class(op.reg) == REG_XSP)
// Shifted-register slots accept either a plain register or a SHIFTED_REG
// compound whose inner register has the right class.
case .W_SHIFTED:
if op.kind == .REGISTER { return reg_class(op.reg) == REG_W }
if op.kind == .SHIFTED_REG { return reg_class(op.shifted.reg) == REG_W }
return false
case .X_SHIFTED:
if op.kind == .REGISTER { return reg_class(op.reg) == REG_X }
if op.kind == .SHIFTED_REG { return reg_class(op.shifted.reg) == REG_X }
return false
case .W_EXTENDED:
// The extend type selects W vs X for the inner reg; accept either
// and let the encoder pack option = extend faithfully.
if op.kind == .REGISTER { return reg_class(op.reg) == REG_W }
if op.kind == .EXTENDED_REG {
c := reg_class(op.extended.reg)
return c == REG_W || c == REG_X
}
return false
case .X_EXTENDED:
// Same as .W_EXTENDED, except a plain register must be an X register.
if op.kind == .REGISTER { return reg_class(op.reg) == REG_X }
if op.kind == .EXTENDED_REG {
c := reg_class(op.extended.reg)
return c == REG_W || c == REG_X
}
return false
case .B_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_B
case .H_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_H
case .S_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_S
case .D_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_D
case .Q_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_Q
case .V_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_V
// NEON vector arrangement variants. The user encodes the arrangement
// via op.size using one distinct tag per arrangement:
// 8B=8, 16B=16, 4H=24, 8H=32, 2S=40, 4S=48, 1D=56, 2D=64.
// (These are tags, not byte counts -- e.g. 4H is 24.) When op.size==0 the
// matcher accepts any V register (legacy / "first form wins"); prefer the
// explicit op_v_*() builders.
// NOTE(review): an earlier note here said op_reg() defaults to size=4 and
// thereby matches .V_4H; no arrangement case below accepts size 4 --
// confirm what op_reg() actually produces.
case .V_8B:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 8)
case .V_16B:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 16)
case .V_4H:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 24)
case .V_8H, .V_8H_FP16:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 32)
case .V_2S:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 40)
case .V_4S:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 48)
case .V_1D:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 56)
case .V_2D:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 64)
case .V_4H_FP16:
// Same acceptance test as .V_4H (tag 24).
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 24)
// Element-indexed V views: element size carried in op.size (B=1,H=2,S=4,
// D=8) so DUP/INS forms disambiguate. .S also accepts size 0 so a plain
// op_reg (as the hand-written SM3TT forms pass) still matches the .S slot.
case .V_ELEM_B:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 1
case .V_ELEM_H:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 2
case .V_ELEM_S:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 4 || op.size == 0)
case .V_ELEM_D:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 8
// SVE Z registers. Element size carried in op.size: B=1, H=2, S=4, D=8.
// op.size==0 (legacy / default-constructed) accepts any width.
case .Z_REG_B:
return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (op.size == 0 || op.size == 1)
case .Z_REG_H:
return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (op.size == 0 || op.size == 2)
case .Z_REG_S:
return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (op.size == 0 || op.size == 4)
case .Z_REG_D:
return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (op.size == 0 || op.size == 8)
// All predicate-register flavours share one test: any P-class register.
case .P_REG, .P_REG_MERGE, .P_REG_ZERO, .P_REG_GOV:
return op.kind == .REGISTER && reg_class(op.reg) == REG_P
// SME tile state (immediate-encoded tile number; user supplies the
// tile index as an immediate, e.g. 0 for ZA0.S, 3 for ZA3.S).
case .ZA_TILE_B, .ZA_TILE_H, .ZA_TILE_S, .ZA_TILE_D, .ZA_TILE_Q:
return op.kind == .IMMEDIATE
// Misc immediate sub-types added in batch 3
case .FCMLA_ROT, .FCADD_ROT, .SVE_PRFOP, .LDRAA_IMM10:
return op.kind == .IMMEDIATE
case .LSL_SHIFT_W, .LSL_SHIFT_X, .ROR_SHIFT:
return op.kind == .IMMEDIATE
case .Z_PAIR:
// SME2 vector pair: first reg must be even (Z0, Z2, ..., Z30).
return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (reg_hw(op.reg) & 0x1) == 0
case .Z_QUAD:
// SME2 vector quad: first reg must be multiple of 4.
return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (reg_hw(op.reg) & 0x3) == 0
case .SME_PATTERN, .SVE_PATTERN:
return op.kind == .IMMEDIATE
// SME tile slice (packed immediate descriptor; see encoding_types.odin)
case .SME_SLICE_B, .SME_SLICE_H, .SME_SLICE_W, .SME_SLICE_D, .SME_SLICE_Q:
return op.kind == .IMMEDIATE
// Plain immediate slots: only the operand kind is checked here; the
// value range is not validated by the matcher.
case .IMM_12, .IMM_16, .IMM_8, .IMM_6, .IMM_5, .IMM_4, .IMM_3, .IMM_2,
.NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE, .VEC_SHIFT, .VEC_INDEX:
return op.kind == .IMMEDIATE
case .BITMASK_IMM:
// The user passes the raw logical mask value; we validate that it
// fits the AArch64 bitmask-immediate encoding at the form's width.
return op.kind == .IMMEDIATE && is_valid_bitmask_imm(u64(op.immediate), form.flags.is_64)
case .REL_26, .REL_19, .REL_14, .REL_PG21:
return op.kind == .RELATIVE
case .MEM:
return op.kind == .MEMORY
case .COND:
return op.kind == .COND
}
return false
}
// =============================================================================
// Operand packer
// =============================================================================
// Element size in bits for a NEON arrangement or SVE Z-register operand type.
// Any operand type not listed below yields 8.
@(private="file")
vec_esize :: #force_inline proc "contextless" (ot: Operand_Type) -> u32 {
	element_bits: u32 = 8
	#partial switch ot {
	case .V_8B, .V_16B, .Z_REG_B:
		element_bits = 8
	case .V_4H, .V_8H, .V_4H_FP16, .V_8H_FP16, .Z_REG_H:
		element_bits = 16
	case .V_2S, .V_4S, .Z_REG_S:
		element_bits = 32
	case .V_1D, .V_2D, .Z_REG_D:
		element_bits = 64
	}
	return element_bits
}
// Lane-index marker bit (log2 of the element size in bytes) for a DUP/INS
// form, taken from the first V_ELEM_* operand type the form carries:
// B=0, H=1, S=2, D=3. Forms with no V_ELEM_* operand yield 0.
@(private="file")
vidx_markerbit :: #force_inline proc "contextless" (form: ^Encoding) -> u32 {
	for operand_type in form.ops {
		if operand_type == .V_ELEM_B { return 0 }
		if operand_type == .V_ELEM_H { return 1 }
		if operand_type == .V_ELEM_S { return 2 }
		if operand_type == .V_ELEM_D { return 3 }
	}
	return 0
}
// Pack one operand into its bit positions within the 32-bit instruction word.
//
//   op       -- the user operand for this slot
//   enc      -- how/where the slot is encoded
//   form     -- the matched encoding form (consulted for fixed bits, operand
//               types and the 32/64-bit flag where a field depends on them)
//   pc       -- byte offset of the instruction; recorded in relocations
//   inst_idx -- instruction index; recorded in relocations
//   relocs   -- receives a Relocation for every PC-relative operand
//
// Returns the bits to OR into the instruction word. PC-relative encodings
// contribute 0 here and append a Relocation instead. Fields are masked to
// their width; out-of-range values are truncated, not reported.
pack_operand_inline :: #force_inline proc(
	op: ^Operand,
	enc: Operand_Encoding,
	form: ^Encoding,
	pc: u32,
	inst_idx: u16,
	relocs: ^[dynamic]Relocation,
) -> u32 {
	switch enc {
	case .NONE, .IMPL:
		return 0
	// ---- Register slots ----------------------------------------------------
	case .RD, .RT:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .RN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .RT2, .RA:
		return (u32(reg_hw(op.reg)) & 0x1F) << 10
	case .RM:
		// RM has three flavours per the operand kind:
		//   REGISTER     -- plain Rm at bits 20-16
		//   SHIFTED_REG  -- Rm + shift type (22:23) + amount (15:10)
		//   EXTENDED_REG -- Rm + extend (13:15) + amount (10:12)
		switch op.kind {
		case .REGISTER:
			return (u32(reg_hw(op.reg)) & 0x1F) << 16
		case .SHIFTED_REG:
			return (u32(reg_hw(op.shifted.reg)) & 0x1F) << 16 |
			       (u32(op.shifted.type) & 0x3) << 22 |
			       (u32(op.shifted.amount) & 0x3F) << 10
		case .EXTENDED_REG:
			return (u32(reg_hw(op.extended.reg)) & 0x1F) << 16 |
			       (u32(op.extended.extend) & 0x7) << 13 |
			       (u32(op.extended.amount) & 0x7) << 10
		case .NONE, .IMMEDIATE, .MEMORY, .RELATIVE, .COND:
			return 0
		}
	// ---- Immediates --------------------------------------------------------
	case .IMM12: return (u32(op.immediate) & 0xFFF) << 10
	case .IMM16: return (u32(op.immediate) & 0xFFFF) << 5
	case .IMM6: return (u32(op.immediate) & 0x3F) << 10
	case .IMM9: return (u32(op.immediate) & 0x1FF) << 12
	case .IMM_HW: return (u32(op.immediate) & 0x3) << 21
	case .IMM_SH12: return (u32(op.immediate) & 0x1) << 22
	case .SHIFT_TYPE: return (u32(op.immediate) & 0x3) << 22
	case .EXT_OPT: return (u32(op.immediate) & 0x7) << 13
	case .EXT_IMM3: return (u32(op.immediate) & 0x7) << 10
	case .COND_HI:
		// Condition payload may arrive as IMMEDIATE (raw) or COND kind.
		c := u32(op.cond) if op.kind == .COND else u32(op.immediate)
		return (c & 0xF) << 12
	case .COND_LO:
		c := u32(op.cond) if op.kind == .COND else u32(op.immediate)
		return (c & 0xF) << 0
	case .NZCV_FIELD:
		return (u32(op.immediate) & 0xF) << 0
	case .SYS_FIELD:
		return (u32(op.immediate) & 0x7FFF) << 5
	case .HINT_FIELD:
		return (u32(op.immediate) & 0x7F) << 5
	case .BARRIER_FIELD:
		return (u32(op.immediate) & 0xF) << 8
	// ---- Memory operand variants ------------------------------------------
	case .OFFSET_BASE_U12:
		// Scaled unsigned 12-bit: imm12 = disp / data_size
		// data_size derived from bits[31:30] of the form: 00=1, 01=2, 10=4, 11=8
		size := u32(1) << ((form.bits >> 30) & 0x3)
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := (u32(op.mem.disp) / size) & 0xFFF
		return base_bits | (imm_bits << 10)
	case .OFFSET_BASE_S9:
		// Signed 9-bit unscaled at bits 20-12.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12)
	case .OFFSET_BASE_PRE:
		// Pre-index: bits[11:10] = 11, signed 9-bit at 20-12.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12) | (0x3 << 10)
	case .OFFSET_BASE_POST:
		// Post-index: bits[11:10] = 01.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12) | (0x1 << 10)
	case .OFFSET_BASE_A:
		// Atomic addressing: [Xn] only -- no displacement, no shift.
		// Used by load/store exclusives, acquire/release, LSE atomics.
		return (u32(reg_hw(op.mem.base)) & 0x1F) << 5
	case .OFFSET_REG:
		// [Xn, Xm{, LSL #s}]: option=011, S = shift!=0.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		option := u32(0x3) << 13
		s_bit := op.mem.shift != 0 ? u32(1) << 12 : 0
		return base_bits | idx_bits | option | s_bit | (0x2 << 10)
	case .OFFSET_EXT:
		// [Xn, Wm, SXTW|UXTW|SXTX #s]: option = ext, S = shift!=0.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		option := (u32(op.mem.extend) & 0x7) << 13
		s_bit := op.mem.shift != 0 ? u32(1) << 12 : 0
		return base_bits | idx_bits | option | s_bit | (0x2 << 10)
	// ---- PC-relative branches ---------------------------------------------
	// The displacement field is left zero here; the relocation resolver
	// patches it once label addresses are known.
	case .BRANCH_26:
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .B26, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_19:
		// Could be B.cond, CBZ/CBNZ, or LDR literal -- the relocation
		// type for all three is the same B_COND19 (19-bit signed PC-rel
		// scaled by 4) since the encoding field is identical.
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .B_COND19, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_14:
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .TBZ14, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_PG21:
		// ADR / ADRP -- choose reloc type by the form's bits[31] (op flag).
		ty: Relocation_Type = .ADR_PCREL21
		if (form.bits >> 31) & 1 != 0 { ty = .ADRP_PCREL21 }
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = ty, size = 4, inst_idx = inst_idx,
		})
		return 0
	// ---- TBZ / TBNZ bit position split (b5 at bit 31, b40 at 23-19) -----
	case .TBZ_BIT:
		bit := u32(op.immediate) & 0x3F
		return ((bit >> 5) & 1) << 31 | (bit & 0x1F) << 19
	// ---- NEON / SIMD register slots (alias of RD/RN/RM/RA bit positions) --
	case .VD:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .VN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .VM:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	case .VA:
		return (u32(reg_hw(op.reg)) & 0x1F) << 10
	// NEON shift-by-immediate: the element-size marker is already in `bits`;
	// the operand drives only the low immh:immb bits at 22:16.
	case .NEON_SHL_IMM:
		return (u32(op.immediate) & 0x3F) << 16
	case .NEON_SHR_IMM:
		esize := vec_esize(form.ops[0])
		return ((esize - u32(op.immediate)) & 0x3F) << 16
	// NEON copy/permute index fields (element-size marker fixed in `bits`).
	case .VN_VM_DUP:
		hw := u32(reg_hw(op.reg)) & 0x1F
		return (hw << 5) | (hw << 16)
	case .NEON_IDX5:
		mb := vidx_markerbit(form)
		return (u32(op.immediate) << (mb + 1)) << 16
	case .NEON_IDX4:
		mb := vidx_markerbit(form)
		return (u32(op.immediate) << mb) << 11
	case .NEON_EXT_IDX:
		return (u32(op.immediate) & 0xF) << 11
	// CCMP/CCMN immediate (imm5 at 20:16) and MSR-immediate PSTATE selector.
	case .IMM5_HI:
		return (u32(op.immediate) & 0x1F) << 16
	case .MSR_PSTATE:
		v := u32(op.immediate)
		return ((v >> 3) & 0x7) << 16 | (v & 0x7) << 5
	case .FMOV_SCALAR_IMM:
		return (u32(op.immediate) & 0xFF) << 13
	// SVE alias duplicated predicate / Z fields + EXT byte index.
	case .PG4_PM_DUP:
		p := u32(reg_hw(op.reg)) & 0xF
		return p << 10 | p << 16
	case .PN_PM_DUP:
		p := u32(reg_hw(op.reg)) & 0xF
		return p << 5 | p << 16
	case .PN_PG_PM_DUP:
		p := u32(reg_hw(op.reg)) & 0xF
		return p << 5 | p << 10 | p << 16
	case .ZD_ZM_DUP:
		z := u32(reg_hw(op.reg)) & 0x1F
		return z << 0 | z << 16
	case .SVE_EXT_IMM:
		v := u32(op.immediate)
		return ((v >> 3) & 0x1F) << 16 | (v & 0x7) << 10
	case .ZA_TILE_LOW:
		return (u32(op.immediate) & 0x7) << 0
	// NEON single-structure lane index (Q at 30, S at 12, size at 11:10).
	case .NEON_LANE_B:
		i := u32(op.immediate)
		return ((i >> 3) & 0x1) << 30 | ((i >> 2) & 0x1) << 12 | (i & 0x3) << 10
	case .NEON_LANE_H:
		i := u32(op.immediate)
		return ((i >> 2) & 0x1) << 30 | ((i >> 1) & 0x1) << 12 | (i & 0x1) << 11
	case .NEON_LANE_S:
		i := u32(op.immediate)
		return ((i >> 1) & 0x1) << 30 | (i & 0x1) << 12
	case .NEON_LANE_D:
		return (u32(op.immediate) & 0x1) << 30
	// SVE2 XAR rotate amount: V = 2*esize - amount, split tszh:tszl:imm3.
	case .SVE_XAR_SHIFT:
		esize := vec_esize(form.ops[0])
		v := (2 * esize - u32(op.immediate)) & 0x7F
		return ((v >> 5) & 0x3) << 22 | ((v >> 3) & 0x3) << 19 | (v & 0x7) << 16
	// NEON MOVI/FMOV immediate split: abc at bits 18-16, defgh at bits 9-5.
	case .NEON_IMM8_FMOV:
		v := u32(op.immediate) & 0xFF
		return ((v >> 5) & 0x7) << 16 | (v & 0x1F) << 5
	case .NEON_INDEX_H:
		// H lane index: H at bit 20, L at bit 21, M at bit 11 (3 bits total
		// when ESize=H). v1 keeps the simpler layout: just bits 20-19.
		return (u32(op.immediate) & 0x3) << 19
	case .NEON_INDEX_S:
		// S lane index: bits 11 (H) + 21 (L). v1: bit 11 + bit 21.
		v := u32(op.immediate) & 0x3
		return (v & 0x1) << 21 | ((v >> 1) & 0x1) << 11
	case .NEON_INDEX_D:
		return (u32(op.immediate) & 0x1) << 11
	// LSE atomics share field positions with the standard load/store
	// encoding (Rs at 16-20, Rt at 0-4, Rn at 5-9).
	case .ATOMIC_RS:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	case .ATOMIC_RT:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .ATOMIC_RN:
		// Memory operand carries the address register in mem.base.
		if op.kind == .MEMORY {
			return (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		}
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	// Bitmask logical immediate. The user passes the raw 32/64-bit mask
	// value in op.immediate; the matcher has already validated that the
	// value is encodable at the form's width, so encode_bitmask_imm
	// cannot fail here.
	case .BITMASK_FIELD:
		n, immr, imms, _ := encode_bitmask_imm(u64(op.immediate), form.flags.is_64)
		return (u32(n) << 22) | (u32(immr) << 16) | (u32(imms) << 10)
	// SVE predicates (low 4 bits at 0/5/16; merge/zero via bit 14 etc.)
	case .PD:
		return (u32(reg_hw(op.reg)) & 0xF) << 0
	case .PN:
		return (u32(reg_hw(op.reg)) & 0xF) << 5
	case .PM:
		return (u32(reg_hw(op.reg)) & 0xF) << 16
	case .PG:
		// Governing predicate (3-bit slot, P0..P7 only).
		return (u32(reg_hw(op.reg)) & 0x7) << 10
	case .PG4:
		// 4-bit Pg slot (P0..P15) used by predicate-logical and a few
		// SVE2 ops.
		return (u32(reg_hw(op.reg)) & 0xF) << 10
	case .PM3:
		// 3-bit Pm at bits 15:13 (SME outer products FMOPA/SMOPA/etc.).
		return (u32(reg_hw(op.reg)) & 0x7) << 13
	// SVE immediates
	case .SVE_IMM8:
		// Signed 8-bit at bits 12-5 (DUP/CPY/ADD imm).
		return (u32(op.immediate) & 0xFF) << 5
	case .SVE_IMM5:
		// 5-bit at bits 20-16 (INDEX imm, etc.).
		return (u32(op.immediate) & 0x1F) << 16
	case .SVE_SHIFT_TSZ_IMM:
		// tsz:imm3 at bits 22:16 -- caller passes the already-composed
		// 7-bit field (tsz<6:3>:imm3<2:0>) in the IMMEDIATE.
		return (u32(op.immediate) & 0x7F) << 16
	case .SVE_PATTERN:
		return (u32(op.immediate) & 0x1F) << 5
	// SVE memory operands
	case .SVE_OFFSET_BASE_SS:
		// [Xn, Xm, LSL #s] scalar+scalar. Base at 9:5, index at 20:16;
		// shift is implicit in the encoding's static bits (per ESize).
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		return base_bits | idx_bits
	case .SVE_OFFSET_BASE_SI:
		// [Xn{, #imm, MUL VL}] scalar+imm. Base at 9:5, signed 4-bit imm
		// at bits 19:16 (caller passes signed disp as op.mem.disp).
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := (u32(op.mem.disp) & 0xF) << 16
		return base_bits | imm_bits
	// SME ZA tile number fields (position depends on element size).
	case .ZA_TILE_NUM_B:
		// ZA0.B only -- nothing to encode (single tile of byte form).
		return 0
	case .ZA_TILE_NUM_H:
		// ZA0.H..ZA1.H -- 1-bit tile number at bit 22.
		return (u32(op.immediate) & 0x1) << 22
	case .ZA_TILE_NUM_S:
		// ZA0.S..ZA3.S -- 2-bit tile number at bits 23:22.
		return (u32(op.immediate) & 0x3) << 22
	case .ZA_TILE_NUM_D:
		// ZA0.D..ZA7.D -- 3-bit tile number at bits 23:21.
		return (u32(op.immediate) & 0x7) << 21
	case .SME_PATTERN_FIELD:
		// 4-bit SME pattern/list at bits 8:5 (ZERO instruction list mask).
		return (u32(op.immediate) & 0xF) << 5
	// ---- SVE gather/scatter + vector-base memory --------------------------
	case .SVE_OFFSET_BASE_VEC:
		// [Xn, Zm.S/D, extend] -- base GPR at 9:5, Zm at 20:16.
		base := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		return base | idx
	case .SVE_OFFSET_VEC_BASE:
		// [Zn.S/D, #imm5] -- vector base at 9:5, signed-5 imm at bits 20:16.
		base := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm := (u32(op.mem.disp) & 0x1F) << 16
		return base | imm
	// ---- SVE indexed lane field (FMLA Zda.T, Zn.T, Zm.T[i]) --------------
	case .SVE_FMLA_IDX_H:
		// The user passes the lane index (0..7) directly as a 3-bit
		// immediate. It is split as i3<2> at bit 22 and i3<1:0> at bits
		// 20:19; Zm sits at bits 18:16 (low 8 regs only for indexed
		// .H/.S) and comes via .VM.
		lane := u32(op.immediate) & 0x7
		return ((lane >> 2) & 0x1) << 22 | (lane & 0x3) << 19
	case .SVE_FMLA_IDX_S:
		lane := u32(op.immediate) & 0x3
		return lane << 19
	case .SVE_FMLA_IDX_D:
		lane := u32(op.immediate) & 0x1
		return lane << 20
	// ---- SME tile slice descriptor packing -------------------------------
	//
	// The slice descriptor (packed immediate) is unpacked into the
	// instruction's bit positions per element size. The user-passed
	// packed value carries:
	//   imm[3:0] | V[4] | Ws[6:5] | tile[10:7]
	//
	// Instruction layout (per LLVM golden tests):
	//   bit 15     = V flag (0=H, 1=V)
	//   bits 14:13 = Ws index (Ws is W12 + this)
	//   bits 3:0   = tile_num and imm packed (per element size):
	//     .B : imm[3:0]                 (single tile, ZA0.B)
	//     .H : tile[0]<<3 | imm[2:0]    (2 tiles, 8 slices each)
	//     .W : tile[1:0]<<2 | imm[1:0]  (4 tiles, 4 slices each)
	//     .D : tile[2:0]<<1 | imm[0]    (8 tiles, 2 slices each)
	//     .Q : tile[3:0]                (16 tiles, no imm)
	case .SME_SLICE_B:
		v := u32(op.immediate)
		imm := v & 0xF
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		return (vflag << 15) | (ws << 13) | imm
	case .SME_SLICE_H:
		v := u32(op.immediate)
		imm := v & 0x7
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		tile := (v >> 7) & 0x1
		return (vflag << 15) | (ws << 13) | imm | (tile << 3)
	case .SME_SLICE_W:
		v := u32(op.immediate)
		imm := v & 0x3
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		tile := (v >> 7) & 0x3
		return (vflag << 15) | (ws << 13) | imm | (tile << 2)
	case .SME_SLICE_D:
		v := u32(op.immediate)
		imm := v & 0x1
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		tile := (v >> 7) & 0x7
		return (vflag << 15) | (ws << 13) | imm | (tile << 1)
	case .SME_SLICE_Q:
		v := u32(op.immediate)
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		tile := (v >> 7) & 0xF
		return (vflag << 15) | (ws << 13) | tile
	// ---- Batch 3 misc immediate encodings ----
	case .ENC_FCMLA_ROT:
		// 2-bit rotation at bits 13:12 (0/1/2/3 = 0°/90°/180°/270°).
		return (u32(op.immediate) & 0x3) << 12
	case .ENC_FCADD_ROT:
		// 1-bit rotation at bit 12 (0 = 90°, 1 = 270°).
		return (u32(op.immediate) & 0x1) << 12
	case .ENC_SVE_PRFOP:
		// 4-bit SVE prefetch op at bits 3:0.
		return u32(op.immediate) & 0xF
	case .ENC_LDRAA_IMM10:
		// Signed 10-bit immediate, scaled by 8 (the user passes a byte
		// offset that must be a multiple of 8; we encode imm >> 3).
		// The field is NOT contiguous: S (the sign, imm10<9>) lives at
		// bit 22 and imm9 at bits 20:12. Bit 21 is a fixed opcode bit,
		// so the sign must not be ORed there -- doing so would leave S
		// clear and encode every negative offset as a positive one.
		v := u32(i32(op.immediate) >> 3) & 0x3FF
		return ((v >> 9) & 0x1) << 22 | (v & 0x1FF) << 12
	// ---- Batch 5 composite-packed encodings ----
	case .ENC_LSL_IMM_W:
		// 32-bit LSL alias: immr = (-imm) & 31, imms = 31 - imm.
		imm := u32(op.immediate) & 0x1F
		immr := ((~imm + 1) & 0x1F)
		imms := (31 - imm) & 0x1F
		return (immr << 16) | (imms << 10)
	case .ENC_LSL_IMM_X:
		// 64-bit LSL alias: immr = (-imm) & 63, imms = 63 - imm.
		imm := u32(op.immediate) & 0x3F
		immr := ((~imm + 1) & 0x3F)
		imms := (63 - imm) & 0x3F
		return (immr << 16) | (imms << 10)
	case .ENC_DUAL_RN_RM:
		// Pack the register at both Rn (9:5) AND Rm (20:16) slots
		// (for ROR Rd, Rn, #imm = EXTR Rd, Rn, Rn, #imm).
		hw := u32(reg_hw(op.reg)) & 0x1F
		return (hw << 5) | (hw << 16)
	case .ENC_ROR_SHIFT:
		// imms (shift amount) at bits 15:10.
		return (u32(op.immediate) & 0x3F) << 10
	case .ENC_Z_PAIR_VD, .ENC_Z_QUAD_VD:
		// Pack first Z reg into Vd slot (bits 4:0).
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .ENC_Z_PAIR_VN, .ENC_Z_QUAD_VN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .ENC_Z_PAIR_VM, .ENC_Z_QUAD_VM:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	}
	return 0
}
// =============================================================================
// Pass 2 -- relocation resolver
// =============================================================================
// Try to resolve one pending relocation by patching the instruction word at
// relocation.offset in `code`.
//
//   label_defs   -- label byte offsets (already scaled by the caller)
//   base_address -- load address of code[0]; only matters for ADRP, whose
//                   result depends on absolute 4 KiB page boundaries
//
// Returns true when the relocation has been consumed: either the word was
// patched, or an error was appended to `errors` (misaligned or out-of-range
// target; both are reported as LABEL_OUT_OF_RANGE). Returns false when the
// relocation must stay pending: the label id is out of bounds, the label is
// undefined, or the relocation type is not resolved here.
@(private="file")
resolve_relocation_inline :: #force_inline proc(
	code: []u8,
	label_defs: []Label_Definition,
	relocation: ^Relocation,
	endianness: Endianness,
	base_address: u64,
	errors: ^[dynamic]Error,
) -> bool {
	if int(relocation.label_id) >= len(label_defs) { return false }
	ld := label_defs[relocation.label_id]
	if ld == LABEL_UNDEFINED { return false }
	target := u32(ld)
	word := read_u32(code, relocation.offset, endianness)
	switch relocation.type {
	case .B26:
		// 26-bit signed word offset at bits 25:0.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<25) || words > (1<<25)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= u32(words) & 0x03FFFFFF
	case .B_COND19, .LDR_LITERAL19:
		// 19-bit signed word offset at bits 23:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<18) || words > (1<<18)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= (u32(words) & 0x7FFFF) << 5
	case .TBZ14:
		// 14-bit signed word offset at bits 18:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<13) || words > (1<<13)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= (u32(words) & 0x3FFF) << 5
	case .ADR_PCREL21:
		// ADR: signed 21-bit byte offset (no scaling), split as
		// immlo at bits 30:29 and immhi at bits 23:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel < -(1<<20) || rel > (1<<20)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		v := u32(rel) & 0x1FFFFF
		word |= (v & 0x3) << 29 | ((v >> 2) & 0x7FFFF) << 5
	case .ADRP_PCREL21:
		// ADRP: Page(target + base) - Page(pc + base), in 4 KiB pages.
		//
		// Both pages must be taken from the ABSOLUTE address. Masking
		// target and base_address separately and then adding them is not
		// equivalent when base_address is not page-aligned: the low 12
		// bits of the two can carry into the page number. (In Odin `&`
		// binds tighter than `+`, so the sum must be parenthesised
		// before masking.)
		target_page := (u64(target) + base_address) & ~u64(0xFFF)
		pc_page := (u64(relocation.offset) + base_address) & ~u64(0xFFF)
		// NOTE(review): the addend is applied after page masking, so only
		// page-multiple addends are accepted; confirm this is the intended
		// contract rather than Page(target + addend).
		diff := i64(target_page) - i64(pc_page) + i64(relocation.addend)
		if diff & 0xFFF != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		pages := diff >> 12
		if pages < -(1<<20) || pages > (1<<20)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		v := u32(pages) & 0x1FFFFF
		word |= (v & 0x3) << 29 | ((v >> 2) & 0x7FFFF) << 5
	case .NONE, .PCREL_LO12_I, .PCREL_LO12_S, .ABS64, .ABS32, .ABS16:
		// Linker-bound or assembler-layer; not auto-resolved here.
		return false
	}
	write_u32(code, relocation.offset, word, endianness)
	return true
}
// =============================================================================
// Endian-aware word I/O
// =============================================================================
// Store `word` into code[offset .. offset+3] in the requested byte order:
// least-significant byte first for .LITTLE, most-significant byte first for
// any other endianness value.
@(private="package")
write_u32 :: #force_inline proc "contextless" (
	code: []u8, offset: u32, word: u32, endianness: Endianness,
) {
	// Split into bytes in little-endian order, then reverse if needed.
	b0 := u8(word)
	b1 := u8(word >> 8)
	b2 := u8(word >> 16)
	b3 := u8(word >> 24)
	if endianness != .LITTLE {
		b0, b1, b2, b3 = b3, b2, b1, b0
	}
	code[offset+0] = b0
	code[offset+1] = b1
	code[offset+2] = b2
	code[offset+3] = b3
}
// Load a 32-bit word from code[offset .. offset+3], treating the first byte
// as least significant for .LITTLE and as most significant for any other
// endianness value.
@(private="package")
read_u32 :: #force_inline proc "contextless" (
	code: []u8, offset: u32, endianness: Endianness,
) -> u32 {
	b0 := u32(code[offset+0])
	b1 := u32(code[offset+1])
	b2 := u32(code[offset+2])
	b3 := u32(code[offset+3])
	if endianness == .LITTLE {
		return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)
	}
	return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
}