Files
Odin/core/rexcode/arm64/encoder.odin
Brendan Punsky 06eb3de6a2 rexcode/arm64: NEON copy/permute (MOV/MVN/DUP/INS/EXT) encode forms
MOV_V (ORR alias: source feeds both Vn and Vm via a new VN_VM_DUP
encoding), MVN_V (NOT alias, plain 2-register), DUP_V (element form
Vd.T,Vn.Ts[i] and general form Vd.T,Wn/Xn), INS (element-to-element and
from-GPR), EXT_V (imm4 byte index). Adds a VEC_INDEX operand type plus
NEON_IDX5/NEON_IDX4/NEON_EXT_IDX encodings: the element-size marker rides
in the entry bits, the lane index drives the bits above it, and the
decoder recovers the element size from imm5's marker.

Element size now rides in op.size (B=1/H=2/S=4/D=8) via op_v_elem_b/h/s/d
so the matcher can disambiguate DUP/INS element forms; the builder
generator maps V_ELEM_* to those constructors. specgen derives the mask
by varying registers and each index field to its max -- the GPR-source
forms vary Vd and Rn independently (Rn 31 = wzr/xzr) so the low bit of
each field toggles. All 19 representative forms byte-exact vs llvm-mc and
decode-clean; 461 tests green. (TBL/TBX register-list forms deferred.)
2026-06-17 23:23:44 -04:00

844 lines
31 KiB
Odin

// rexcode · Brendan Punsky (dotbmp@github), original author
package rexcode_arm64
// =============================================================================
// AArch64 ENCODER
// =============================================================================
//
// Fixed-width 4-byte ISA. Two-pass design mirroring mips/encoder.odin /
// riscv/encoder.odin. The interesting bits vs other arches:
//
// * Compound operands: SHIFTED_REG (Rm + shift type + amount) and
// EXTENDED_REG (Rm + extend + amount) are packed by the RM encoder
// by inspecting the operand kind -- a plain REGISTER decays to
// LSL #0 / UXTX, amount=0.
//
// * Three split-immediate scatter patterns:
// BRANCH_PG21 -- 21-bit imm split as immlo[30:29] + immhi[23:5]
// (ADR / ADRP)
// TBZ_BIT -- 6-bit bit position split as b5[31] + b40[23:19]
// (TBZ / TBNZ)
// SYS_FIELD -- 15-bit (op0:op1:CRn:CRm:op2) at bits 19:5
//
// * Loads/stores with the unsigned-offset (LDR/STR) form scale the
// user displacement by data size (1/2/4/8) derived from bits[31:30]
// of the encoding. LDP/STP pair forms scale a signed 7-bit field.
//
// * Endianness: AArch64 standard mode stores instructions LE; BE-32
// (instructions stored big-endian) is legacy and rare. Parameter
// defaults to LITTLE.
// Size in bytes of one encoded instruction (the ISA is fixed-width, 4 bytes
// per instruction -- see the header comment above).
MAX_INST_SIZE :: 4
// Upper bound, in bytes, on the code emitted for `n` instructions.
// Every instruction occupies exactly four bytes, so the bound is exact.
encode_max_code_size :: #force_inline proc "contextless" (n: int) -> int {
	bytes_per_instruction :: 4
	return n * bytes_per_instruction
}
// Upper bound on the number of relocations produced for `n` instructions:
// one per instruction.
encode_max_relocation_count :: #force_inline proc "contextless" (n: int) -> int {
	relocations_per_instruction :: 1
	return n * relocations_per_instruction
}
// Encodes `instructions` into `code` as consecutive 4-byte words and, when
// `resolve` is set, patches label-relative fields in place.
//
// Parameters:
//   instructions -- instructions to encode, in order.
//   label_defs   -- label table; every defined entry is multiplied by 4 in
//                   place (instruction index -> byte offset) after pass 1.
//   code         -- output buffer; must hold at least 4 bytes per instruction.
//   relocs       -- receives relocations emitted during pass 1; entries that
//                   pass 2 handles are removed, the rest are compacted and kept.
//   errors       -- receives any errors raised while encoding / resolving.
//   endianness   -- byte order used when storing each instruction word.
//   resolve      -- when false, pass 2 is skipped entirely.
//   base_address -- forwarded to the relocation resolver.
//
// Returns a Result whose `byte_count` is the number of bytes written and whose
// `success` is true only if no error was appended during this call.
encode :: proc(
	instructions: []Instruction,
	label_defs:   []Label_Definition,
	code:         []u8,
	relocs:       ^[dynamic]Relocation,
	errors:       ^[dynamic]Error,
	endianness:   Endianness = .LITTLE,
	resolve:      bool = true,
	base_address: u64 = 0,
) -> Result {
	inst_count := u32(len(instructions))
	if u32(len(code)) < inst_count * 4 {
		append(errors, Error{inst_idx = 0, code = .BUFFER_OVERFLOW})
		return Result{byte_count = 0, success = false}
	}

	// Remember where this call's errors / relocations begin so pre-existing
	// entries in the caller's arrays are left untouched.
	first_error := u32(len(errors))
	first_reloc := u32(len(relocs))

	// ---- PASS 1: encode every instruction ---------------------------------
	cursor: u32 = 0
	for idx in 0..<inst_count {
		encoded, encoded_ok := encode_one_inline(&instructions[idx], cursor, u16(idx), relocs, errors)
		if !encoded_ok {
			return Result{byte_count = cursor, success = false}
		}
		write_u32(code, cursor, encoded, endianness)
		cursor += 4
	}

	// ---- PASS 1.5: instruction index -> byte offset (fixed width => *4) ---
	for k in 0..<len(label_defs) {
		if label_defs[k] == LABEL_UNDEFINED { continue }
		label_defs[k] = Label_Definition(u32(label_defs[k]) * 4)
	}

	// ---- PASS 2: resolve relocations --------------------------------------
	if resolve {
		total := u32(len(relocs))
		kept  := first_reloc
		for scan in first_reloc..<total {
			pending := relocs[scan]
			handled := resolve_relocation_inline(code, label_defs, &pending, endianness, base_address, errors)
			if handled { continue }
			// Not handled: slide it down over the slots freed by handled ones.
			if kept != scan { relocs[kept] = pending }
			kept += 1
		}
		if kept != total { resize(relocs, int(kept)) }
	}

	return Result{byte_count = cursor, success = u32(len(errors)) == first_error}
}
// =============================================================================
// Internal: encode one instruction
// =============================================================================
@(private="file")
// Encodes a single instruction into its 32-bit word.
//
// Picks the first encoding form of the mnemonic whose operand types match,
// starts from that form's fixed bits, and ORs in the packed bits of each of
// the four operand slots whose encoding is not .NONE.
//
// On failure an Error (INVALID_MNEMONIC or NO_MATCHING_ENCODING) is appended
// to `errors` and (0, false) is returned.
encode_one_inline :: #force_inline proc(
	inst:     ^Instruction,
	pc:       u32,
	inst_idx: u16,
	relocs:   ^[dynamic]Relocation,
	errors:   ^[dynamic]Error,
) -> (word: u32, ok: bool) {
	if inst.mnemonic == .INVALID {
		append(errors, Error{inst_idx = u32(inst_idx), code = .INVALID_MNEMONIC})
		return 0, false
	}

	candidates := encoding_forms(inst.mnemonic)
	if len(candidates) == 0 {
		append(errors, Error{inst_idx = u32(inst_idx), code = .INVALID_MNEMONIC})
		return 0, false
	}

	// First matching form wins.
	chosen: ^Encoding = nil
	for k in 0..<len(candidates) {
		if encoding_matches_inline(inst, &candidates[k]) {
			chosen = &candidates[k]
			break
		}
	}
	if chosen == nil {
		append(errors, Error{inst_idx = u32(inst_idx), code = .NO_MATCHING_ENCODING})
		return 0, false
	}

	word = chosen.bits
	for slot in 0..<4 {
		slot_enc := chosen.enc[slot]
		if slot_enc == .NONE { continue }
		word |= pack_operand_inline(&inst.ops[slot], slot_enc, chosen, pc, inst_idx, relocs)
	}
	return word, true
}
@(private="file")
// True when all four operand slots of `inst` satisfy the operand types the
// encoding `form` declares. Slots are checked in order and the first
// mismatch ends the check.
encoding_matches_inline :: #force_inline proc "contextless" (
	inst: ^Instruction, form: ^Encoding,
) -> bool {
	for slot in 0..<4 {
		if !operand_matches_inline(&inst.ops[slot], form.ops[slot], form) {
			return false
		}
	}
	return true
}
@(private="file")
// Decides whether the user operand `op` is acceptable for a form slot of
// operand type `ot`. `form` is consulted only for .BITMASK_IMM, where the
// form's 64-bit flag selects the width the mask is validated against.
operand_matches_inline :: #force_inline proc "contextless" (
	op: ^Operand, ot: Operand_Type, form: ^Encoding,
) -> bool {
	is_reg := op.kind == .REGISTER
	is_imm := op.kind == .IMMEDIATE
	sz     := op.size

	switch ot {
	case .NONE:
		return op.kind == .NONE

	// ---- General-purpose registers ----------------------------------------
	case .W_REG:
		return is_reg && reg_class(op.reg) == REG_W
	case .X_REG:
		return is_reg && reg_class(op.reg) == REG_X
	case .WSP_REG:
		if !is_reg { return false }
		rc := reg_class(op.reg)
		return rc == REG_W || rc == REG_WSP
	case .XSP_REG:
		if !is_reg { return false }
		rc := reg_class(op.reg)
		return rc == REG_X || rc == REG_XSP

	// ---- Shifted / extended register operands -----------------------------
	// A plain REGISTER is accepted too (it decays to the zero-amount form).
	case .W_SHIFTED:
		#partial switch op.kind {
		case .REGISTER:    return reg_class(op.reg) == REG_W
		case .SHIFTED_REG: return reg_class(op.shifted.reg) == REG_W
		}
		return false
	case .X_SHIFTED:
		#partial switch op.kind {
		case .REGISTER:    return reg_class(op.reg) == REG_X
		case .SHIFTED_REG: return reg_class(op.shifted.reg) == REG_X
		}
		return false
	case .W_EXTENDED:
		// The extend type selects W vs X for the inner register, so an
		// EXTENDED_REG may carry either; the packer encodes the option as given.
		#partial switch op.kind {
		case .REGISTER:
			return reg_class(op.reg) == REG_W
		case .EXTENDED_REG:
			inner := reg_class(op.extended.reg)
			return inner == REG_W || inner == REG_X
		}
		return false
	case .X_EXTENDED:
		#partial switch op.kind {
		case .REGISTER:
			return reg_class(op.reg) == REG_X
		case .EXTENDED_REG:
			inner := reg_class(op.extended.reg)
			return inner == REG_W || inner == REG_X
		}
		return false

	// ---- Scalar FP/SIMD and untyped vector registers ----------------------
	case .B_REG: return is_reg && reg_class(op.reg) == REG_B
	case .H_REG: return is_reg && reg_class(op.reg) == REG_H
	case .S_REG: return is_reg && reg_class(op.reg) == REG_S
	case .D_REG: return is_reg && reg_class(op.reg) == REG_D
	case .Q_REG: return is_reg && reg_class(op.reg) == REG_Q
	case .V_REG: return is_reg && reg_class(op.reg) == REG_V

	// ---- NEON vector arrangements -----------------------------------------
	// op.size carries the arrangement tag: 8B=8, 16B=16, 4H=24, 8H=32,
	// 2S=40, 4S=48, 1D=56, 2D=64. A size of 0 matches any arrangement.
	case .V_8B, .V_16B, .V_4H, .V_4H_FP16, .V_8H, .V_8H_FP16,
	     .V_2S, .V_4S, .V_1D, .V_2D:
		if !is_reg || reg_class(op.reg) != REG_V { return false }
		if sz == 0 { return true }
		#partial switch ot {
		case .V_8B:             return sz == 8
		case .V_16B:            return sz == 16
		case .V_4H, .V_4H_FP16: return sz == 24
		case .V_8H, .V_8H_FP16: return sz == 32
		case .V_2S:             return sz == 40
		case .V_4S:             return sz == 48
		case .V_1D:             return sz == 56
		case .V_2D:             return sz == 64
		}
		return false

	// ---- Element-indexed V views ------------------------------------------
	// op.size carries the element size in bytes (B=1, H=2, S=4, D=8).
	// Only the .S view additionally accepts size 0.
	case .V_ELEM_B, .V_ELEM_H, .V_ELEM_S, .V_ELEM_D:
		if !is_reg || reg_class(op.reg) != REG_V { return false }
		#partial switch ot {
		case .V_ELEM_B: return sz == 1
		case .V_ELEM_H: return sz == 2
		case .V_ELEM_S: return sz == 4 || sz == 0
		case .V_ELEM_D: return sz == 8
		}
		return false

	// ---- SVE Z registers --------------------------------------------------
	// op.size carries the element size in bytes; 0 accepts any width.
	case .Z_REG_B, .Z_REG_H, .Z_REG_S, .Z_REG_D:
		if !is_reg || reg_class(op.reg) != REG_Z { return false }
		if sz == 0 { return true }
		#partial switch ot {
		case .Z_REG_B: return sz == 1
		case .Z_REG_H: return sz == 2
		case .Z_REG_S: return sz == 4
		case .Z_REG_D: return sz == 8
		}
		return false

	// ---- SVE predicate registers ------------------------------------------
	case .P_REG, .P_REG_MERGE, .P_REG_ZERO, .P_REG_GOV:
		return is_reg && reg_class(op.reg) == REG_P

	// ---- SME2 register groups ---------------------------------------------
	case .Z_PAIR:
		// First register of a pair must be even-numbered.
		return is_reg && reg_class(op.reg) == REG_Z && (reg_hw(op.reg) & 0x1) == 0
	case .Z_QUAD:
		// First register of a quad must be a multiple of four.
		return is_reg && reg_class(op.reg) == REG_Z && (reg_hw(op.reg) & 0x3) == 0

	// ---- Everything carried as a plain immediate --------------------------
	// (ZA tile numbers, rotations, prefetch ops, shift amounts, patterns,
	// packed SME slice descriptors, generic immediates, system fields, ...)
	case .ZA_TILE_B, .ZA_TILE_H, .ZA_TILE_S, .ZA_TILE_D, .ZA_TILE_Q,
	     .FCMLA_ROT, .FCADD_ROT, .SVE_PRFOP, .LDRAA_IMM10,
	     .LSL_SHIFT_W, .LSL_SHIFT_X, .ROR_SHIFT,
	     .SME_PATTERN, .SVE_PATTERN,
	     .SME_SLICE_B, .SME_SLICE_H, .SME_SLICE_W, .SME_SLICE_D, .SME_SLICE_Q,
	     .IMM_12, .IMM_16, .IMM_8, .IMM_6, .IMM_5, .IMM_4, .IMM_3, .IMM_2,
	     .NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE, .VEC_SHIFT, .VEC_INDEX:
		return is_imm

	case .BITMASK_IMM:
		// The raw logical mask must be encodable as a bitmask immediate at
		// the form's register width.
		return is_imm && is_valid_bitmask_imm(u64(op.immediate), form.flags.is_64)

	// ---- Label-relative, memory and condition operands --------------------
	case .REL_26, .REL_19, .REL_14, .REL_PG21:
		return op.kind == .RELATIVE
	case .MEM:
		return op.kind == .MEMORY
	case .COND:
		return op.kind == .COND
	}
	return false
}
// =============================================================================
// Operand packer
// =============================================================================
@(private="file")
// Maps a NEON vector arrangement operand type to its element width in bits.
// Any operand type that is not a listed arrangement yields 8.
vec_esize :: #force_inline proc "contextless" (ot: Operand_Type) -> u32 {
	width: u32 = 8
	#partial switch ot {
	case .V_4H, .V_8H, .V_4H_FP16, .V_8H_FP16:
		width = 16
	case .V_2S, .V_4S:
		width = 32
	case .V_1D, .V_2D:
		width = 64
	}
	return width
}
@(private="file")
// Position of the element-size marker bit for a lane-indexed form, i.e.
// log2 of the element size in bytes: B=0, H=1, S=2, D=3. The value comes
// from the first V_ELEM_* operand type found in the form's operand list;
// 0 is returned when the form carries none.
vidx_markerbit :: #force_inline proc "contextless" (form: ^Encoding) -> u32 {
	for slot in 0..<len(form.ops) {
		#partial switch form.ops[slot] {
		case .V_ELEM_B: return 0
		case .V_ELEM_H: return 1
		case .V_ELEM_S: return 2
		case .V_ELEM_D: return 3
		}
	}
	return 0
}
// Packs one operand into the bit positions selected by `enc` and returns the
// bits to OR into the instruction word.
//
// Parameters:
//   op       -- the user operand for this slot.
//   enc      -- which field(s) of the instruction word the operand drives.
//   form     -- the matched encoding; consulted for fixed bits / operand
//               types / flags that influence how the operand is packed.
//   pc       -- byte offset of the instruction (recorded in relocations).
//   inst_idx -- instruction index (recorded in relocations).
//   relocs   -- label-relative encodings append a Relocation here and
//               contribute 0 bits; the field is patched later.
//
// Every immediate is masked to its field width, so an out-of-range value is
// truncated rather than allowed to corrupt neighbouring fields.
pack_operand_inline :: #force_inline proc(
	op: ^Operand,
	enc: Operand_Encoding,
	form: ^Encoding,
	pc: u32,
	inst_idx: u16,
	relocs: ^[dynamic]Relocation,
) -> u32 {
	switch enc {
	case .NONE, .IMPL:
		return 0

	// ---- Register slots ----------------------------------------------------
	case .RD, .RT:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .RN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .RT2, .RA:
		return (u32(reg_hw(op.reg)) & 0x1F) << 10
	case .RM:
		// RM has three flavours per the operand kind:
		//   REGISTER     -- plain Rm at bits 20:16
		//   SHIFTED_REG  -- Rm + shift type (23:22) + amount (15:10)
		//   EXTENDED_REG -- Rm + extend (15:13) + amount (12:10)
		switch op.kind {
		case .REGISTER:
			return (u32(reg_hw(op.reg)) & 0x1F) << 16
		case .SHIFTED_REG:
			return (u32(reg_hw(op.shifted.reg)) & 0x1F) << 16 |
			       (u32(op.shifted.type) & 0x3) << 22 |
			       (u32(op.shifted.amount) & 0x3F) << 10
		case .EXTENDED_REG:
			return (u32(reg_hw(op.extended.reg)) & 0x1F) << 16 |
			       (u32(op.extended.extend) & 0x7) << 13 |
			       (u32(op.extended.amount) & 0x7) << 10
		case .NONE, .IMMEDIATE, .MEMORY, .RELATIVE, .COND:
			return 0
		}

	// ---- Immediates --------------------------------------------------------
	case .IMM12:      return (u32(op.immediate) & 0xFFF) << 10
	case .IMM16:      return (u32(op.immediate) & 0xFFFF) << 5
	case .IMM6:       return (u32(op.immediate) & 0x3F) << 10
	case .IMM9:       return (u32(op.immediate) & 0x1FF) << 12
	case .IMM_HW:     return (u32(op.immediate) & 0x3) << 21
	case .IMM_SH12:   return (u32(op.immediate) & 0x1) << 22
	case .SHIFT_TYPE: return (u32(op.immediate) & 0x3) << 22
	case .EXT_OPT:    return (u32(op.immediate) & 0x7) << 13
	case .EXT_IMM3:   return (u32(op.immediate) & 0x7) << 10
	case .COND_HI:
		// Condition payload may arrive as IMMEDIATE (raw) or COND kind.
		c := u32(op.cond) if op.kind == .COND else u32(op.immediate)
		return (c & 0xF) << 12
	case .COND_LO:
		c := u32(op.cond) if op.kind == .COND else u32(op.immediate)
		return (c & 0xF) << 0
	case .NZCV_FIELD:
		return (u32(op.immediate) & 0xF) << 0
	case .SYS_FIELD:
		return (u32(op.immediate) & 0x7FFF) << 5
	case .HINT_FIELD:
		return (u32(op.immediate) & 0x7F) << 5
	case .BARRIER_FIELD:
		return (u32(op.immediate) & 0xF) << 8

	// ---- Memory operand variants ------------------------------------------
	case .OFFSET_BASE_U12:
		// Scaled unsigned 12-bit: imm12 = disp / data_size.
		// data_size derived from bits[31:30] of the form: 00=1, 01=2, 10=4, 11=8
		size := u32(1) << ((form.bits >> 30) & 0x3)
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := (u32(op.mem.disp) / size) & 0xFFF
		return base_bits | (imm_bits << 10)
	case .OFFSET_BASE_S9:
		// Signed 9-bit unscaled at bits 20:12.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12)
	case .OFFSET_BASE_PRE:
		// Pre-index: bits[11:10] = 11, signed 9-bit at 20:12.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12) | (0x3 << 10)
	case .OFFSET_BASE_POST:
		// Post-index: bits[11:10] = 01.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12) | (0x1 << 10)
	case .OFFSET_BASE_A:
		// Atomic addressing: [Xn] only -- no displacement, no shift.
		// Used by load/store exclusives, acquire/release, LSE atomics.
		return (u32(reg_hw(op.mem.base)) & 0x1F) << 5
	case .OFFSET_REG:
		// [Xn, Xm{, LSL #s}]: option=011, S = shift!=0, bits[11:10] = 10.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		option := u32(0x3) << 13
		s_bit := op.mem.shift != 0 ? u32(1) << 12 : 0
		return base_bits | idx_bits | option | s_bit | (0x2 << 10)
	case .OFFSET_EXT:
		// [Xn, Wm, SXTW|UXTW|SXTX #s]: option = ext, S = shift!=0.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		option := (u32(op.mem.extend) & 0x7) << 13
		s_bit := op.mem.shift != 0 ? u32(1) << 12 : 0
		return base_bits | idx_bits | option | s_bit | (0x2 << 10)

	// ---- PC-relative branches ---------------------------------------------
	// These contribute no bits now; a relocation is queued and the field is
	// filled in when the label is resolved.
	case .BRANCH_26:
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .B26, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_19:
		// Could be B.cond, CBZ/CBNZ, or LDR literal -- the relocation
		// type for all three is the same B_COND19 (19-bit signed PC-rel
		// scaled by 4) since the encoding field is identical.
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .B_COND19, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_14:
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .TBZ14, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_PG21:
		// ADR / ADRP -- choose reloc type by the form's bit 31 (op flag).
		ty: Relocation_Type = .ADR_PCREL21
		if (form.bits >> 31) & 1 != 0 { ty = .ADRP_PCREL21 }
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = ty, size = 4, inst_idx = inst_idx,
		})
		return 0

	// ---- TBZ / TBNZ bit position split (b5 at bit 31, b40 at 23:19) -------
	case .TBZ_BIT:
		bit := u32(op.immediate) & 0x3F
		return ((bit >> 5) & 1) << 31 | (bit & 0x1F) << 19

	// ---- NEON / SIMD register slots (alias of RD/RN/RM/RA bit positions) --
	case .VD:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .VN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .VM:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	case .VA:
		return (u32(reg_hw(op.reg)) & 0x1F) << 10

	// NEON shift-by-immediate: the element-size marker is already in `bits`;
	// the operand drives only the six bits at 21:16 below it.
	case .NEON_SHL_IMM:
		return (u32(op.immediate) & 0x3F) << 16
	case .NEON_SHR_IMM:
		// Right shifts are stored as (element width - shift amount).
		esize := vec_esize(form.ops[0])
		return ((esize - u32(op.immediate)) & 0x3F) << 16

	// NEON copy/permute index fields (element-size marker fixed in `bits`).
	case .VN_VM_DUP:
		// One source register feeds both the Vn (9:5) and Vm (20:16) slots.
		hw := u32(reg_hw(op.reg)) & 0x1F
		return (hw << 5) | (hw << 16)
	case .NEON_IDX5:
		// imm5 at bits 20:16: the lane index sits above the marker bit.
		// Masked to the 5-bit field so an out-of-range index cannot spill
		// into bits 21 and up.
		mb := vidx_markerbit(form)
		return ((u32(op.immediate) << (mb + 1)) & 0x1F) << 16
	case .NEON_IDX4:
		// imm4 at bits 14:11: the lane index is shifted up by the marker
		// position. Masked to the 4-bit field so it cannot spill into bit 15.
		mb := vidx_markerbit(form)
		return ((u32(op.immediate) << mb) & 0xF) << 11
	case .NEON_EXT_IDX:
		return (u32(op.immediate) & 0xF) << 11

	// NEON MOVI/FMOV immediate split: abc at bits 18:16, defgh at bits 9:5.
	case .NEON_IMM8_FMOV:
		v := u32(op.immediate) & 0xFF
		return ((v >> 5) & 0x7) << 16 | (v & 0x1F) << 5
	case .NEON_INDEX_H:
		// v1 layout: only the low two bits of the lane index are encoded,
		// placed at bits 20:19.
		return (u32(op.immediate) & 0x3) << 19
	case .NEON_INDEX_S:
		// S lane index: low bit at bit 21, high bit at bit 11.
		v := u32(op.immediate) & 0x3
		return (v & 0x1) << 21 | ((v >> 1) & 0x1) << 11
	case .NEON_INDEX_D:
		return (u32(op.immediate) & 0x1) << 11

	// LSE atomics share field positions with the standard load/store
	// encoding (Rs at 20:16, Rt at 4:0, Rn at 9:5).
	case .ATOMIC_RS:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	case .ATOMIC_RT:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .ATOMIC_RN:
		// Memory operand carries the address register in mem.base.
		if op.kind == .MEMORY {
			return (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		}
		return (u32(reg_hw(op.reg)) & 0x1F) << 5

	// Bitmask logical immediate. The user passes the raw 32/64-bit mask
	// value in op.immediate; the operand matcher validates encodability for
	// .BITMASK_IMM operands, so the ok result is ignored here.
	case .BITMASK_FIELD:
		n, immr, imms, _ := encode_bitmask_imm(u64(op.immediate), form.flags.is_64)
		return (u32(n) << 22) | (u32(immr) << 16) | (u32(imms) << 10)

	// SVE predicates (low 4 bits at 0/5/16).
	case .PD:
		return (u32(reg_hw(op.reg)) & 0xF) << 0
	case .PN:
		return (u32(reg_hw(op.reg)) & 0xF) << 5
	case .PM:
		return (u32(reg_hw(op.reg)) & 0xF) << 16
	case .PG:
		// Governing predicate (3-bit slot, P0..P7 only).
		return (u32(reg_hw(op.reg)) & 0x7) << 10
	case .PG4:
		// 4-bit Pg slot (P0..P15) used by predicate-logical and a few
		// SVE2 ops.
		return (u32(reg_hw(op.reg)) & 0xF) << 10
	case .PM3:
		// 3-bit Pm at bits 15:13 (SME outer products FMOPA/SMOPA/etc.).
		return (u32(reg_hw(op.reg)) & 0x7) << 13

	// SVE immediates
	case .SVE_IMM8:
		// Signed 8-bit at bits 12:5 (DUP/CPY/ADD imm).
		return (u32(op.immediate) & 0xFF) << 5
	case .SVE_IMM5:
		// 5-bit at bits 20:16 (INDEX imm, etc.).
		return (u32(op.immediate) & 0x1F) << 16
	case .SVE_SHIFT_TSZ_IMM:
		// tsz:imm3 at bits 22:16 -- caller passes the already-composed
		// 7-bit field in the IMMEDIATE.
		return (u32(op.immediate) & 0x7F) << 16
	case .SVE_PATTERN:
		return (u32(op.immediate) & 0x1F) << 5

	// SVE memory operands
	case .SVE_OFFSET_BASE_SS:
		// [Xn, Xm, LSL #s] scalar+scalar. Base at 9:5, index at 20:16;
		// shift is implicit in the encoding's static bits (per ESize).
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		return base_bits | idx_bits
	case .SVE_OFFSET_BASE_SI:
		// [Xn{, #imm, MUL VL}] scalar+imm. Base at 9:5, signed 4-bit imm
		// at bits 19:16 (caller passes signed disp as op.mem.disp).
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits := (u32(op.mem.disp) & 0xF) << 16
		return base_bits | imm_bits

	// SME ZA tile number fields (position depends on element size).
	case .ZA_TILE_NUM_B:
		// ZA0.B only -- nothing to encode (single tile of byte form).
		return 0
	case .ZA_TILE_NUM_H:
		// ZA0.H..ZA1.H -- 1-bit tile number at bit 22.
		return (u32(op.immediate) & 0x1) << 22
	case .ZA_TILE_NUM_S:
		// ZA0.S..ZA3.S -- 2-bit tile number at bits 23:22.
		return (u32(op.immediate) & 0x3) << 22
	case .ZA_TILE_NUM_D:
		// ZA0.D..ZA7.D -- 3-bit tile number at bits 23:21.
		return (u32(op.immediate) & 0x7) << 21
	case .SME_PATTERN_FIELD:
		// 4-bit SME pattern/list at bits 8:5 (ZERO instruction list mask).
		return (u32(op.immediate) & 0xF) << 5

	// ---- SVE gather/scatter + vector-base memory --------------------------
	case .SVE_OFFSET_BASE_VEC:
		// [Xn, Zm.S/D, extend] -- base GPR at 9:5, Zm at 20:16.
		base := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		return base | idx
	case .SVE_OFFSET_VEC_BASE:
		// [Zn.S/D, #imm5] -- vector base at 9:5, 5-bit imm at bits 20:16.
		base := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm := (u32(op.mem.disp) & 0x1F) << 16
		return base | imm

	// ---- SVE indexed lane field (FMLA Zda.T, Zn.T, Zm.T[i]) ---------------
	case .SVE_FMLA_IDX_H:
		// The user passes the lane index (0..7) directly as a 3-bit
		// immediate; it is split as bit 22 (high bit) and bits 20:19
		// (low two bits). The Zm register is packed by a separate slot.
		lane := u32(op.immediate) & 0x7
		return ((lane >> 2) & 0x1) << 22 | (lane & 0x3) << 19
	case .SVE_FMLA_IDX_S:
		lane := u32(op.immediate) & 0x3
		return lane << 19
	case .SVE_FMLA_IDX_D:
		lane := u32(op.immediate) & 0x1
		return lane << 20

	// ---- SME tile slice descriptor packing --------------------------------
	//
	// The slice descriptor (packed immediate) is unpacked into the
	// instruction's bit positions per element size. The user-passed
	// packed value carries:
	//   imm[3:0] | V[4] | Ws[6:5] | tile[10:7]
	//
	// Instruction layout:
	//   bit 15     = V flag (0=H, 1=V)
	//   bits 14:13 = Ws index (Ws is W12 + this)
	//   bits 3:0   = tile_num and imm packed (per element size):
	//     .B : imm[3:0]                   (single tile, ZA0.B)
	//     .H : tile[0]<<3   | imm[2:0]    (2 tiles, 8 slices each)
	//     .W : tile[1:0]<<2 | imm[1:0]    (4 tiles, 4 slices each)
	//     .D : tile[2:0]<<1 | imm[0]      (8 tiles, 2 slices each)
	//     .Q : tile[3:0]                  (16 tiles, no imm)
	case .SME_SLICE_B:
		v := u32(op.immediate)
		imm := v & 0xF
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		return (vflag << 15) | (ws << 13) | imm
	case .SME_SLICE_H:
		v := u32(op.immediate)
		imm := v & 0x7
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		tile := (v >> 7) & 0x1
		return (vflag << 15) | (ws << 13) | imm | (tile << 3)
	case .SME_SLICE_W:
		v := u32(op.immediate)
		imm := v & 0x3
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		tile := (v >> 7) & 0x3
		return (vflag << 15) | (ws << 13) | imm | (tile << 2)
	case .SME_SLICE_D:
		v := u32(op.immediate)
		imm := v & 0x1
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		tile := (v >> 7) & 0x7
		return (vflag << 15) | (ws << 13) | imm | (tile << 1)
	case .SME_SLICE_Q:
		v := u32(op.immediate)
		vflag := (v >> 4) & 0x1
		ws := (v >> 5) & 0x3
		tile := (v >> 7) & 0xF
		return (vflag << 15) | (ws << 13) | tile

	// ---- Batch 3 misc immediate encodings ---------------------------------
	case .ENC_FCMLA_ROT:
		// 2-bit rotation at bits 13:12 (0/1/2/3 = 0°/90°/180°/270°).
		return (u32(op.immediate) & 0x3) << 12
	case .ENC_FCADD_ROT:
		// 1-bit rotation at bit 12 (0 = 90°, 1 = 270°).
		return (u32(op.immediate) & 0x1) << 12
	case .ENC_SVE_PRFOP:
		// 4-bit SVE prefetch op at bits 3:0.
		return u32(op.immediate) & 0xF
	case .ENC_LDRAA_IMM10:
		// The user passes a byte offset that must be a multiple of 8; the
		// encoded value is the signed 10-bit quantity offset >> 3. The
		// field is split: the sign bit S goes to bit 22 and the low nine
		// bits (imm9) go to bits 20:12. Bit 21 between them is a fixed
		// opcode bit, so the value must not be packed contiguously.
		v := u32(i32(op.immediate) >> 3) & 0x3FF
		return ((v >> 9) & 0x1) << 22 | (v & 0x1FF) << 12

	// ---- Batch 5 composite-packed encodings -------------------------------
	case .ENC_LSL_IMM_W:
		// 32-bit LSL alias: immr = (-imm) & 31, imms = 31 - imm.
		imm := u32(op.immediate) & 0x1F
		immr := ((~imm + 1) & 0x1F)
		imms := (31 - imm) & 0x1F
		return (immr << 16) | (imms << 10)
	case .ENC_LSL_IMM_X:
		// 64-bit LSL alias: immr = (-imm) & 63, imms = 63 - imm.
		imm := u32(op.immediate) & 0x3F
		immr := ((~imm + 1) & 0x3F)
		imms := (63 - imm) & 0x3F
		return (immr << 16) | (imms << 10)
	case .ENC_DUAL_RN_RM:
		// Pack the register at both Rn (9:5) AND Rm (20:16) slots
		// (for ROR Rd, Rn, #imm = EXTR Rd, Rn, Rn, #imm).
		hw := u32(reg_hw(op.reg)) & 0x1F
		return (hw << 5) | (hw << 16)
	case .ENC_ROR_SHIFT:
		// imms (shift amount) at bits 15:10.
		return (u32(op.immediate) & 0x3F) << 10
	case .ENC_Z_PAIR_VD, .ENC_Z_QUAD_VD:
		// Pack first Z reg into Vd slot (bits 4:0).
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .ENC_Z_PAIR_VN, .ENC_Z_QUAD_VN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .ENC_Z_PAIR_VM, .ENC_Z_QUAD_VM:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	}
	return 0
}
// =============================================================================
// Pass 2 -- relocation resolver
// =============================================================================
@(private="file")
// Attempts to patch one relocation into the already-encoded `code`.
//
// Parameters:
//   code         -- encoded instruction bytes; the word at relocation.offset
//                   is read, patched and written back.
//   label_defs   -- label table holding byte offsets (LABEL_UNDEFINED if unset).
//   relocation   -- the relocation to apply.
//   endianness   -- byte order of the instruction words in `code`.
//   base_address -- load address of `code`; only ADRP uses it, because page
//                   boundaries depend on absolute addresses.
//   errors       -- receives LABEL_OUT_OF_RANGE when the displacement is
//                   misaligned or does not fit the field.
//
// Returns true when the relocation needs no further processing: either the
// word was patched, or an error was recorded for it. Returns false when it
// was left untouched (label id out of range, label undefined, or a relocation
// type that is not resolved here).
resolve_relocation_inline :: #force_inline proc(
	code: []u8,
	label_defs: []Label_Definition,
	relocation: ^Relocation,
	endianness: Endianness,
	base_address: u64,
	errors: ^[dynamic]Error,
) -> bool {
	if int(relocation.label_id) >= len(label_defs) { return false }
	ld := label_defs[relocation.label_id]
	if ld == LABEL_UNDEFINED { return false }

	target := u32(ld)
	word := read_u32(code, relocation.offset, endianness)

	switch relocation.type {
	case .B26:
		// 26-bit signed word offset at bits 25:0.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<25) || words > (1<<25)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= u32(words) & 0x03FFFFFF

	case .B_COND19, .LDR_LITERAL19:
		// 19-bit signed word offset at bits 23:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<18) || words > (1<<18)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= (u32(words) & 0x7FFFF) << 5

	case .TBZ14:
		// 14-bit signed word offset at bits 18:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<13) || words > (1<<13)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= (u32(words) & 0x3FFF) << 5

	case .ADR_PCREL21:
		// ADR: signed 21-bit byte offset (no scaling), split as
		// immlo at bits 30:29 and immhi at bits 23:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel < -(1<<20) || rel > (1<<20)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		v := u32(rel) & 0x1FFFFF
		word |= (v & 0x3) << 29 | ((v >> 2) & 0x7FFFF) << 5

	case .ADRP_PCREL21:
		// ADRP: difference between the 4KB page of the target and the 4KB
		// page of the instruction, both taken from ABSOLUTE addresses.
		// The base address must be added BEFORE masking: masking target and
		// base separately drops the carry out of their low 12 bits and
		// selects the wrong page whenever base_address is not page-aligned.
		target_page := (u64(target) + base_address) & ~u64(0xFFF)
		pc_page := (u64(relocation.offset) + base_address) & ~u64(0xFFF)
		diff := i64(target_page) - i64(pc_page) + i64(relocation.addend)
		// Both pages are 4KB-aligned, so this only trips when the addend
		// is not a whole number of pages.
		if diff & 0xFFF != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		pages := diff >> 12
		if pages < -(1<<20) || pages > (1<<20)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		v := u32(pages) & 0x1FFFFF
		word |= (v & 0x3) << 29 | ((v >> 2) & 0x7FFFF) << 5

	case .NONE, .PCREL_LO12_I, .PCREL_LO12_S, .ABS64, .ABS32, .ABS16:
		// Linker-bound or assembler-layer; not auto-resolved here.
		return false
	}

	write_u32(code, relocation.offset, word, endianness)
	return true
}
// =============================================================================
// Endian-aware word I/O
// =============================================================================
@(private="package")
// Stores `word` into code[offset .. offset+3] using the requested byte order:
// least-significant byte first for .LITTLE, most-significant byte first
// for any other endianness value.
write_u32 :: #force_inline proc "contextless" (
	code: []u8, offset: u32, word: u32, endianness: Endianness,
) {
	// Bytes in little-endian order; reversed below when big-endian is wanted.
	b0, b1, b2, b3 := u8(word), u8(word >> 8), u8(word >> 16), u8(word >> 24)
	if endianness != .LITTLE {
		b0, b1, b2, b3 = b3, b2, b1, b0
	}
	code[offset+0] = b0
	code[offset+1] = b1
	code[offset+2] = b2
	code[offset+3] = b3
}
@(private="package")
// Loads the 32-bit word stored at code[offset .. offset+3], interpreting the
// bytes as little-endian for .LITTLE and big-endian for any other
// endianness value.
read_u32 :: #force_inline proc "contextless" (
	code: []u8, offset: u32, endianness: Endianness,
) -> u32 {
	first  := u32(code[offset+0])
	second := u32(code[offset+1])
	third  := u32(code[offset+2])
	fourth := u32(code[offset+3])
	if endianness == .LITTLE {
		return first | (second << 8) | (third << 16) | (fourth << 24)
	}
	return (first << 24) | (second << 16) | (third << 8) | fourth
}