mirror of
https://github.com/odin-lang/Odin.git
synced 2026-06-19 08:32:33 +00:00
MOV_V (ORR alias: source feeds both Vn and Vm via a new VN_VM_DUP encoding), MVN_V (NOT alias, plain 2-register), DUP_V (element form Vd.T,Vn.Ts[i] and general form Vd.T,Wn/Xn), INS (element-to-element and from-GPR), EXT_V (imm4 byte index). Adds a VEC_INDEX operand type plus NEON_IDX5/NEON_IDX4/NEON_EXT_IDX encodings: the element-size marker rides in the entry bits, the lane index drives the bits above it, and the decoder recovers the element size from imm5's marker. Element size now rides in op.size (B=1/H=2/S=4/D=8) via op_v_elem_b/h/s/d so the matcher can disambiguate DUP/INS element forms; the builder generator maps V_ELEM_* to those constructors. specgen derives the mask by varying registers and each index field to its max -- the GPR-source forms vary Vd and Rn independently (Rn 31 = wzr/xzr) so the low bit of each field toggles. All 19 representative forms byte-exact vs llvm-mc and decode-clean; 461 tests green. (TBL/TBX register-list forms deferred.)
844 lines
31 KiB
Odin
844 lines
31 KiB
Odin
// rexcode · Brendan Punsky (dotbmp@github), original author
|
|
|
|
package rexcode_arm64
|
|
|
|
// =============================================================================
|
|
// AArch64 ENCODER
|
|
// =============================================================================
|
|
//
|
|
// Fixed-width 4-byte ISA. Two-pass design mirroring mips/encoder.odin /
|
|
// riscv/encoder.odin. The interesting bits vs other arches:
|
|
//
|
|
// * Compound operands: SHIFTED_REG (Rm + shift type + amount) and
|
|
// EXTENDED_REG (Rm + extend + amount) are packed by the RM encoder
|
|
// by inspecting the operand kind -- a plain REGISTER decays to
|
|
// LSL #0 / UXTX, amount=0.
|
|
//
|
|
// * Three split-immediate scatter patterns:
|
|
// BRANCH_PG21 -- 21-bit imm split as immlo[30:29] + immhi[23:5]
|
|
// (ADR / ADRP)
|
|
// TBZ_BIT -- 6-bit bit position split as b5[31] + b40[23:19]
|
|
// (TBZ / TBNZ)
|
|
// SYS_FIELD -- 15-bit (op0:op1:CRn:CRm:op2) at bits 19:5
|
|
//
|
|
// * Loads/stores with the unsigned-offset (LDR/STR) form scale the
|
|
// user displacement by data size (1/2/4/8) derived from bits[31:30]
|
|
// of the encoding. LDP/STP pair forms scale a signed 7-bit field.
|
|
//
|
|
// * Endianness: AArch64 standard mode stores instructions LE; BE-32
|
|
// (instructions stored big-endian) is legacy and rare. Parameter
|
|
// defaults to LITTLE.
|
|
|
|
// Every AArch64 instruction occupies exactly this many bytes.
MAX_INST_SIZE :: 4

// Upper bound, in bytes, on the code emitted for `n` instructions
// (fixed-width ISA: n * 4).
encode_max_code_size :: #force_inline proc "contextless" (n: int) -> int {
	return n * MAX_INST_SIZE
}

// Upper bound on the number of relocations produced for `n` instructions:
// one per instruction.
encode_max_relocation_count :: #force_inline proc "contextless" (n: int) -> int {
	return n
}
|
|
|
|
// Encode `instructions` into `code`, 4 bytes per instruction.
//
//   * Fails up front with .BUFFER_OVERFLOW when `code` is shorter than
//     4 * len(instructions).
//   * Pass 1 encodes every instruction and writes its word; PC-relative
//     operands append entries to `relocs`. The first instruction that fails
//     to encode aborts the call (label_defs are NOT scaled in that case).
//   * Pass 1.5 multiplies every defined entry of `label_defs` by 4, in place.
//   * Pass 2 (only when `resolve` is true) patches the relocations appended
//     during this call. Entries the resolver handled are dropped from
//     `relocs`; the rest are compacted and kept for the caller.
//
// `success` is true iff no error was appended to `errors` during the call.
encode :: proc(
	instructions: []Instruction,
	label_defs:   []Label_Definition,
	code:         []u8,
	relocs:       ^[dynamic]Relocation,
	errors:       ^[dynamic]Error,
	endianness:   Endianness = .LITTLE,
	resolve:      bool = true,
	base_address: u64 = 0,
) -> Result {
	inst_count := u32(len(instructions))
	if u32(len(code)) < inst_count * 4 {
		append(errors, Error{inst_idx = 0, code = .BUFFER_OVERFLOW})
		return Result{byte_count = 0, success = false}
	}

	// Remember where this call's errors / relocations begin so earlier
	// entries in the caller's arrays are left alone.
	first_error := u32(len(errors))
	first_reloc := u32(len(relocs))
	cursor: u32 = 0

	// ---- PASS 1: encode each instruction ----------------------------------
	for idx in 0..<inst_count {
		word, ok := encode_one_inline(&instructions[idx], cursor, u16(idx), relocs, errors)
		if !ok {
			return Result{byte_count = cursor, success = false}
		}
		write_u32(code, cursor, word, endianness)
		cursor += 4
	}

	// ---- PASS 1.5: fixed-width => *4 --------------------------------------
	for k in 0..<len(label_defs) {
		if label_defs[k] != LABEL_UNDEFINED {
			label_defs[k] = Label_Definition(u32(label_defs[k]) * 4)
		}
	}

	if !resolve {
		return Result{byte_count = cursor, success = u32(len(errors)) == first_error}
	}

	// ---- PASS 2: resolve relocations, compacting the survivors ------------
	total := u32(len(relocs))
	kept  := first_reloc
	for scan in first_reloc..<total {
		pending := relocs[scan]
		handled := resolve_relocation_inline(code, label_defs, &pending, endianness, base_address, errors)
		if !handled {
			if kept != scan {
				relocs[kept] = pending
			}
			kept += 1
		}
	}
	if kept != total {
		resize(relocs, int(kept))
	}

	return Result{byte_count = cursor, success = u32(len(errors)) == first_error}
}
|
|
|
|
// =============================================================================
|
|
// Internal: encode one instruction
|
|
// =============================================================================
|
|
|
|
@(private="file")
// Encode a single instruction into its 32-bit word.
//
// Picks the first encoding form of the mnemonic whose operand types match,
// starts from that form's fixed bits and ORs in the packed bits of each of
// the four operand slots whose encoding is not .NONE. Packing may append
// relocations for PC-relative operands.
//
// Returns (0, false) and appends an error when the mnemonic is .INVALID or
// has no forms (.INVALID_MNEMONIC), or when no form matches
// (.NO_MATCHING_ENCODING).
encode_one_inline :: #force_inline proc(
	inst:     ^Instruction,
	pc:       u32,
	inst_idx: u16,
	relocs:   ^[dynamic]Relocation,
	errors:   ^[dynamic]Error,
) -> (word: u32, ok: bool) {
	if inst.mnemonic == .INVALID {
		append(errors, Error{inst_idx = u32(inst_idx), code = .INVALID_MNEMONIC})
		return 0, false
	}

	candidates := encoding_forms(inst.mnemonic)
	if len(candidates) == 0 {
		append(errors, Error{inst_idx = u32(inst_idx), code = .INVALID_MNEMONIC})
		return 0, false
	}

	// First matching form wins.
	chosen: ^Encoding
	for k in 0..<len(candidates) {
		cand := &candidates[k]
		if encoding_matches_inline(inst, cand) {
			chosen = cand
			break
		}
	}
	if chosen == nil {
		append(errors, Error{inst_idx = u32(inst_idx), code = .NO_MATCHING_ENCODING})
		return 0, false
	}

	word = chosen.bits
	for slot in 0..<4 {
		slot_enc := chosen.enc[slot]
		if slot_enc == .NONE {
			continue
		}
		word |= pack_operand_inline(&inst.ops[slot], slot_enc, chosen, pc, inst_idx, relocs)
	}
	return word, true
}
|
|
|
|
@(private="file")
// True when each of the instruction's four operand slots satisfies the
// operand type the form declares for that slot. Stops at the first mismatch.
encoding_matches_inline :: #force_inline proc "contextless" (
	inst: ^Instruction, form: ^Encoding,
) -> bool {
	for slot in 0..<4 {
		if !operand_matches_inline(&inst.ops[slot], form.ops[slot], form) {
			return false
		}
	}
	return true
}
|
|
|
|
@(private="file")
// Decide whether operand `op` is acceptable for a form slot of type `ot`.
//
// Register slots check the operand kind and the register class; vector and
// SVE slots additionally compare op.size against the arrangement / element
// size the slot stands for. Immediate-like slots only check the operand
// kind, except .BITMASK_IMM which also validates encodability at the form's
// width (form.flags.is_64).
operand_matches_inline :: #force_inline proc "contextless" (
	op: ^Operand, ot: Operand_Type, form: ^Encoding,
) -> bool {
	is_reg := op.kind == .REGISTER

	switch ot {
	case .NONE:
		return op.kind == .NONE

	// ---- General-purpose registers -----------------------------------------
	case .W_REG:
		return is_reg && reg_class(op.reg) == REG_W
	case .X_REG:
		return is_reg && reg_class(op.reg) == REG_X
	case .WSP_REG:
		if !is_reg { return false }
		return reg_class(op.reg) == REG_W || reg_class(op.reg) == REG_WSP
	case .XSP_REG:
		if !is_reg { return false }
		return reg_class(op.reg) == REG_X || reg_class(op.reg) == REG_XSP

	// ---- Shifted-register slots: plain or shifted register of the class ----
	case .W_SHIFTED:
		#partial switch op.kind {
		case .REGISTER:    return reg_class(op.reg) == REG_W
		case .SHIFTED_REG: return reg_class(op.shifted.reg) == REG_W
		}
		return false
	case .X_SHIFTED:
		#partial switch op.kind {
		case .REGISTER:    return reg_class(op.reg) == REG_X
		case .SHIFTED_REG: return reg_class(op.shifted.reg) == REG_X
		}
		return false

	// ---- Extended-register slots -------------------------------------------
	// A plain register must be of the slot's class; an extended register may
	// wrap either a W or an X register (the extend type picks the width and
	// is packed as-is by the encoder).
	case .W_EXTENDED:
		#partial switch op.kind {
		case .REGISTER:
			return reg_class(op.reg) == REG_W
		case .EXTENDED_REG:
			inner := reg_class(op.extended.reg)
			return inner == REG_W || inner == REG_X
		}
		return false
	case .X_EXTENDED:
		#partial switch op.kind {
		case .REGISTER:
			return reg_class(op.reg) == REG_X
		case .EXTENDED_REG:
			inner := reg_class(op.extended.reg)
			return inner == REG_W || inner == REG_X
		}
		return false

	// ---- Scalar FP/SIMD and whole-vector registers -------------------------
	case .B_REG: return is_reg && reg_class(op.reg) == REG_B
	case .H_REG: return is_reg && reg_class(op.reg) == REG_H
	case .S_REG: return is_reg && reg_class(op.reg) == REG_S
	case .D_REG: return is_reg && reg_class(op.reg) == REG_D
	case .Q_REG: return is_reg && reg_class(op.reg) == REG_Q
	case .V_REG: return is_reg && reg_class(op.reg) == REG_V

	// ---- NEON vector arrangements ------------------------------------------
	// op.size carries the arrangement code: 8B=8, 16B=16, 4H=24, 8H=32,
	// 2S=40, 4S=48, 1D=56, 2D=64. op.size == 0 matches any arrangement.
	case .V_8B, .V_16B, .V_4H, .V_4H_FP16, .V_8H, .V_8H_FP16,
	     .V_2S, .V_4S, .V_1D, .V_2D:
		if !is_reg || reg_class(op.reg) != REG_V { return false }
		if op.size == 0 { return true }
		#partial switch ot {
		case .V_8B:               return op.size == 8
		case .V_16B:              return op.size == 16
		case .V_4H, .V_4H_FP16:   return op.size == 24
		case .V_8H, .V_8H_FP16:   return op.size == 32
		case .V_2S:               return op.size == 40
		case .V_4S:               return op.size == 48
		case .V_1D:               return op.size == 56
		case .V_2D:               return op.size == 64
		}

	// ---- Element-indexed V views -------------------------------------------
	// op.size is the element size in bytes (B=1, H=2, S=4, D=8) so the
	// DUP/INS element forms can be told apart. Only the .S slot also accepts
	// size 0.
	case .V_ELEM_B:
		if !is_reg || reg_class(op.reg) != REG_V { return false }
		return op.size == 1
	case .V_ELEM_H:
		if !is_reg || reg_class(op.reg) != REG_V { return false }
		return op.size == 2
	case .V_ELEM_S:
		if !is_reg || reg_class(op.reg) != REG_V { return false }
		return op.size == 4 || op.size == 0
	case .V_ELEM_D:
		if !is_reg || reg_class(op.reg) != REG_V { return false }
		return op.size == 8

	// ---- SVE Z registers ---------------------------------------------------
	// op.size is the element size in bytes; 0 accepts any width.
	case .Z_REG_B, .Z_REG_H, .Z_REG_S, .Z_REG_D:
		if !is_reg || reg_class(op.reg) != REG_Z { return false }
		if op.size == 0 { return true }
		#partial switch ot {
		case .Z_REG_B: return op.size == 1
		case .Z_REG_H: return op.size == 2
		case .Z_REG_S: return op.size == 4
		case .Z_REG_D: return op.size == 8
		}

	// ---- SVE predicate registers -------------------------------------------
	case .P_REG, .P_REG_MERGE, .P_REG_ZERO, .P_REG_GOV:
		return is_reg && reg_class(op.reg) == REG_P

	// ---- SME2 register groups: first register must be suitably aligned -----
	case .Z_PAIR:
		// Pair: even register number (Z0, Z2, ..., Z30).
		if !is_reg || reg_class(op.reg) != REG_Z { return false }
		return (reg_hw(op.reg) & 0x1) == 0
	case .Z_QUAD:
		// Quad: register number is a multiple of 4.
		if !is_reg || reg_class(op.reg) != REG_Z { return false }
		return (reg_hw(op.reg) & 0x3) == 0

	// ---- Slots satisfied by any IMMEDIATE operand --------------------------
	// SME tiles (tile number passed as an immediate), rotation / prefetch /
	// pattern selectors, shift amounts, packed SME slice descriptors, and the
	// plain immediate fields.
	case .ZA_TILE_B, .ZA_TILE_H, .ZA_TILE_S, .ZA_TILE_D, .ZA_TILE_Q,
	     .FCMLA_ROT, .FCADD_ROT, .SVE_PRFOP, .LDRAA_IMM10,
	     .LSL_SHIFT_W, .LSL_SHIFT_X, .ROR_SHIFT,
	     .SME_PATTERN, .SVE_PATTERN,
	     .SME_SLICE_B, .SME_SLICE_H, .SME_SLICE_W, .SME_SLICE_D, .SME_SLICE_Q,
	     .IMM_12, .IMM_16, .IMM_8, .IMM_6, .IMM_5, .IMM_4, .IMM_3, .IMM_2,
	     .NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE, .VEC_SHIFT, .VEC_INDEX:
		return op.kind == .IMMEDIATE

	case .BITMASK_IMM:
		// Raw logical mask value; must be representable as an AArch64
		// bitmask immediate at the form's width.
		if op.kind != .IMMEDIATE { return false }
		return is_valid_bitmask_imm(u64(op.immediate), form.flags.is_64)

	// ---- Remaining operand kinds -------------------------------------------
	case .REL_26, .REL_19, .REL_14, .REL_PG21:
		return op.kind == .RELATIVE
	case .MEM:
		return op.kind == .MEMORY
	case .COND:
		return op.kind == .COND
	}
	return false
}
|
|
|
|
// =============================================================================
|
|
// Operand packer
|
|
// =============================================================================
|
|
|
|
@(private="file")
// Element width in bits implied by a NEON arrangement operand type.
// Any operand type that is not a listed arrangement yields 8.
vec_esize :: #force_inline proc "contextless" (ot: Operand_Type) -> u32 {
	width: u32 = 8
	#partial switch ot {
	case .V_4H, .V_8H, .V_4H_FP16, .V_8H_FP16:
		width = 16
	case .V_2S, .V_4S:
		width = 32
	case .V_1D, .V_2D:
		width = 64
	case .V_8B, .V_16B:
		width = 8
	}
	return width
}
|
|
|
|
@(private="file")
// Position of the element-size marker bit for a DUP/INS style form, i.e.
// log2 of the element size in bytes: B=0, H=1, S=2, D=3. Taken from the
// first V_ELEM_* operand type the form declares; 0 when it declares none.
vidx_markerbit :: #force_inline proc "contextless" (form: ^Encoding) -> u32 {
	for slot in 0..<len(form.ops) {
		kind := form.ops[slot]
		if kind == .V_ELEM_B { return 0 }
		if kind == .V_ELEM_H { return 1 }
		if kind == .V_ELEM_S { return 2 }
		if kind == .V_ELEM_D { return 3 }
	}
	return 0
}
|
|
|
|
// Pack one operand into the instruction-word bits selected by `enc`.
//
// Returns only the bits this operand contributes; the caller ORs them onto
// the form's fixed bits. Values are masked to their field width rather than
// range-checked, so an out-of-range value is truncated, never reported.
//
// PC-relative encodings (.BRANCH_*) contribute no bits here: they append a
// Relocation at `pc` for the label in op.relative and return 0, leaving the
// field to be patched when relocations are resolved.
//
//   op       -- operand to pack
//   enc      -- field layout to use for it
//   form     -- matched encoding form (its `bits`, `ops` and `flags` drive
//               size-dependent layouts)
//   pc       -- byte offset of the instruction, recorded in relocations
//   inst_idx -- instruction index, recorded in relocations
//   relocs   -- relocation list appended to by PC-relative encodings
pack_operand_inline :: #force_inline proc(
	op:       ^Operand,
	enc:      Operand_Encoding,
	form:     ^Encoding,
	pc:       u32,
	inst_idx: u16,
	relocs:   ^[dynamic]Relocation,
) -> u32 {
	switch enc {
	case .NONE, .IMPL:
		return 0

	// ---- Register slots ----------------------------------------------------
	case .RD, .RT:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .RN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .RT2, .RA:
		return (u32(reg_hw(op.reg)) & 0x1F) << 10
	case .RM:
		// RM has three flavours per the operand kind:
		//   REGISTER     -- plain Rm at bits 20:16
		//   SHIFTED_REG  -- Rm + shift type (23:22) + amount (15:10)
		//   EXTENDED_REG -- Rm + extend (15:13) + amount (12:10)
		switch op.kind {
		case .REGISTER:
			return (u32(reg_hw(op.reg)) & 0x1F) << 16
		case .SHIFTED_REG:
			return (u32(reg_hw(op.shifted.reg)) & 0x1F) << 16 |
			       (u32(op.shifted.type) & 0x3) << 22 |
			       (u32(op.shifted.amount) & 0x3F) << 10
		case .EXTENDED_REG:
			return (u32(reg_hw(op.extended.reg)) & 0x1F) << 16 |
			       (u32(op.extended.extend) & 0x7) << 13 |
			       (u32(op.extended.amount) & 0x7) << 10
		case .NONE, .IMMEDIATE, .MEMORY, .RELATIVE, .COND:
			return 0
		}

	// ---- Immediates --------------------------------------------------------
	case .IMM12:      return (u32(op.immediate) & 0xFFF) << 10
	case .IMM16:      return (u32(op.immediate) & 0xFFFF) << 5
	case .IMM6:       return (u32(op.immediate) & 0x3F) << 10
	case .IMM9:       return (u32(op.immediate) & 0x1FF) << 12
	case .IMM_HW:     return (u32(op.immediate) & 0x3) << 21
	case .IMM_SH12:   return (u32(op.immediate) & 0x1) << 22
	case .SHIFT_TYPE: return (u32(op.immediate) & 0x3) << 22
	case .EXT_OPT:    return (u32(op.immediate) & 0x7) << 13
	case .EXT_IMM3:   return (u32(op.immediate) & 0x7) << 10
	case .COND_HI:
		// Condition payload may arrive as IMMEDIATE (raw) or COND kind.
		c := u32(op.cond) if op.kind == .COND else u32(op.immediate)
		return (c & 0xF) << 12
	case .COND_LO:
		c := u32(op.cond) if op.kind == .COND else u32(op.immediate)
		return (c & 0xF) << 0
	case .NZCV_FIELD:
		return (u32(op.immediate) & 0xF) << 0
	case .SYS_FIELD:
		return (u32(op.immediate) & 0x7FFF) << 5
	case .HINT_FIELD:
		return (u32(op.immediate) & 0x7F) << 5
	case .BARRIER_FIELD:
		return (u32(op.immediate) & 0xF) << 8

	// ---- Memory operand variants ------------------------------------------
	case .OFFSET_BASE_U12:
		// Scaled unsigned 12-bit: imm12 = disp / data_size.
		// data_size derived from bits[31:30] of the form: 00=1, 01=2, 10=4, 11=8
		size      := u32(1) << ((form.bits >> 30) & 0x3)
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits  := (u32(op.mem.disp) / size) & 0xFFF
		return base_bits | (imm_bits << 10)
	case .OFFSET_BASE_S9:
		// Signed 9-bit unscaled at bits 20:12.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits  := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12)
	case .OFFSET_BASE_PRE:
		// Pre-index: bits[11:10] = 11, signed 9-bit at 20:12.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits  := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12) | (0x3 << 10)
	case .OFFSET_BASE_POST:
		// Post-index: bits[11:10] = 01.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits  := u32(op.mem.disp) & 0x1FF
		return base_bits | (imm_bits << 12) | (0x1 << 10)
	case .OFFSET_BASE_A:
		// Atomic addressing: [Xn] only -- no displacement, no shift.
		// Used by load/store exclusives, acquire/release, LSE atomics.
		return (u32(reg_hw(op.mem.base)) & 0x1F) << 5
	case .OFFSET_REG:
		// [Xn, Xm{, LSL #s}]: option=011, S = shift!=0.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits  := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		option    := u32(0x3) << 13
		s_bit     := op.mem.shift != 0 ? u32(1) << 12 : 0
		return base_bits | idx_bits | option | s_bit | (0x2 << 10)
	case .OFFSET_EXT:
		// [Xn, Wm, SXTW|UXTW|SXTX #s]: option = ext, S = shift!=0.
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits  := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		option    := (u32(op.mem.extend) & 0x7) << 13
		s_bit     := op.mem.shift != 0 ? u32(1) << 12 : 0
		return base_bits | idx_bits | option | s_bit | (0x2 << 10)

	// ---- PC-relative branches ---------------------------------------------
	case .BRANCH_26:
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .B26, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_19:
		// Could be B.cond, CBZ/CBNZ, or LDR literal -- the relocation
		// type for all three is the same B_COND19 (19-bit signed PC-rel
		// scaled by 4) since the encoding field is identical.
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .B_COND19, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_14:
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = .TBZ14, size = 4, inst_idx = inst_idx,
		})
		return 0
	case .BRANCH_PG21:
		// ADR / ADRP -- choose reloc type by the form's bit 31 (op flag).
		ty: Relocation_Type = .ADR_PCREL21
		if (form.bits >> 31) & 1 != 0 { ty = .ADRP_PCREL21 }
		append(relocs, Relocation{
			offset = pc, label_id = u32(op.relative),
			type = ty, size = 4, inst_idx = inst_idx,
		})
		return 0

	// ---- TBZ / TBNZ bit position split (b5 at bit 31, b40 at 23:19) ------
	case .TBZ_BIT:
		bit := u32(op.immediate) & 0x3F
		return ((bit >> 5) & 1) << 31 | (bit & 0x1F) << 19

	// ---- NEON / SIMD register slots (alias of RD/RN/RM/RA bit positions) --
	case .VD:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .VN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .VM:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	case .VA:
		return (u32(reg_hw(op.reg)) & 0x1F) << 10

	// NEON shift-by-immediate: the element-size marker is already in `bits`;
	// the operand drives only the low immh:immb bits at 22:16.
	case .NEON_SHL_IMM:
		return (u32(op.immediate) & 0x3F) << 16
	case .NEON_SHR_IMM:
		esize := vec_esize(form.ops[0])
		return ((esize - u32(op.immediate)) & 0x3F) << 16

	// NEON copy/permute index fields (element-size marker fixed in `bits`).
	case .VN_VM_DUP:
		// Same register in both Vn (9:5) and Vm (20:16).
		hw := u32(reg_hw(op.reg)) & 0x1F
		return (hw << 5) | (hw << 16)
	case .NEON_IDX5:
		// Lane index sits above the marker bit inside imm5 (bits 20:16).
		// Masked to the 5-bit field so an out-of-range index cannot spill
		// into the opcode bits above it.
		mb := vidx_markerbit(form)
		return ((u32(op.immediate) << (mb + 1)) & 0x1F) << 16
	case .NEON_IDX4:
		// Lane index inside imm4 (bits 14:11), shifted by the element
		// size; masked to the 4-bit field for the same reason.
		mb := vidx_markerbit(form)
		return ((u32(op.immediate) << mb) & 0xF) << 11
	case .NEON_EXT_IDX:
		return (u32(op.immediate) & 0xF) << 11

	// NEON MOVI/FMOV immediate split: abc at bits 18:16, defgh at bits 9:5.
	case .NEON_IMM8_FMOV:
		v := u32(op.immediate) & 0xFF
		return ((v >> 5) & 0x7) << 16 | (v & 0x1F) << 5

	case .NEON_INDEX_H:
		// H lane index: H at bit 20, L at bit 21, M at bit 11 (3 bits total
		// when ESize=H). v1 keeps the simpler layout: just bits 20:19.
		return (u32(op.immediate) & 0x3) << 19
	case .NEON_INDEX_S:
		// S lane index: low index bit -> bit 21 (L), high bit -> bit 11 (H).
		v := u32(op.immediate) & 0x3
		return (v & 0x1) << 21 | ((v >> 1) & 0x1) << 11
	case .NEON_INDEX_D:
		return (u32(op.immediate) & 0x1) << 11

	// LSE atomics share field positions with the standard load/store
	// encoding (Rs at 20:16, Rt at 4:0, Rn at 9:5).
	case .ATOMIC_RS:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	case .ATOMIC_RT:
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .ATOMIC_RN:
		// Memory operand carries the address register in mem.base.
		if op.kind == .MEMORY {
			return (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		}
		return (u32(reg_hw(op.reg)) & 0x1F) << 5

	// Bitmask logical immediate. The user passes the raw 32/64-bit mask
	// value in op.immediate; the matcher has already validated that the
	// value is encodable at the form's width, so the validity result of
	// encode_bitmask_imm is ignored here.
	case .BITMASK_FIELD:
		n, immr, imms, _ := encode_bitmask_imm(u64(op.immediate), form.flags.is_64)
		return (u32(n) << 22) | (u32(immr) << 16) | (u32(imms) << 10)

	// SVE predicates (low 4 bits at 0/5/16; narrower governing slots below).
	case .PD:
		return (u32(reg_hw(op.reg)) & 0xF) << 0
	case .PN:
		return (u32(reg_hw(op.reg)) & 0xF) << 5
	case .PM:
		return (u32(reg_hw(op.reg)) & 0xF) << 16
	case .PG:
		// Governing predicate (3-bit slot, P0..P7 only).
		return (u32(reg_hw(op.reg)) & 0x7) << 10
	case .PG4:
		// 4-bit Pg slot (P0..P15) used by predicate-logical and a few
		// SVE2 ops.
		return (u32(reg_hw(op.reg)) & 0xF) << 10
	case .PM3:
		// 3-bit Pm at bits 15:13 (SME outer products FMOPA/SMOPA/etc.).
		return (u32(reg_hw(op.reg)) & 0x7) << 13

	// SVE immediates
	case .SVE_IMM8:
		// Signed 8-bit at bits 12:5 (DUP/CPY/ADD imm).
		return (u32(op.immediate) & 0xFF) << 5
	case .SVE_IMM5:
		// 5-bit at bits 20:16 (INDEX imm, etc.).
		return (u32(op.immediate) & 0x1F) << 16
	case .SVE_SHIFT_TSZ_IMM:
		// tsz:imm3 at bits 22:16 -- caller passes the already-composed
		// 7-bit field (tsz<6:3>:imm3<2:0>) in the IMMEDIATE.
		return (u32(op.immediate) & 0x7F) << 16
	case .SVE_PATTERN:
		return (u32(op.immediate) & 0x1F) << 5

	// SVE memory operands
	case .SVE_OFFSET_BASE_SS:
		// [Xn, Xm, LSL #s] scalar+scalar. Base at 9:5, index at 20:16;
		// shift is implicit in the encoding's static bits (per ESize).
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx_bits  := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		return base_bits | idx_bits
	case .SVE_OFFSET_BASE_SI:
		// [Xn{, #imm, MUL VL}] scalar+imm. Base at 9:5, signed 4-bit imm
		// at bits 19:16 (caller passes signed disp as op.mem.disp).
		base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm_bits  := (u32(op.mem.disp) & 0xF) << 16
		return base_bits | imm_bits

	// SME ZA tile number fields (position depends on element size).
	case .ZA_TILE_NUM_B:
		// ZA0.B only -- nothing to encode (single tile of byte form).
		return 0
	case .ZA_TILE_NUM_H:
		// ZA0.H..ZA1.H -- 1-bit tile number at bit 22.
		return (u32(op.immediate) & 0x1) << 22
	case .ZA_TILE_NUM_S:
		// ZA0.S..ZA3.S -- 2-bit tile number at bits 23:22.
		return (u32(op.immediate) & 0x3) << 22
	case .ZA_TILE_NUM_D:
		// ZA0.D..ZA7.D -- 3-bit tile number at bits 23:21.
		return (u32(op.immediate) & 0x7) << 21
	case .SME_PATTERN_FIELD:
		// 4-bit SME pattern/list at bits 8:5 (ZERO instruction list mask).
		return (u32(op.immediate) & 0xF) << 5

	// ---- SVE gather/scatter + vector-base memory --------------------------
	case .SVE_OFFSET_BASE_VEC:
		// [Xn, Zm.S/D, extend] -- base GPR at 9:5, Zm at 20:16.
		base := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		idx  := (u32(reg_hw(op.mem.index)) & 0x1F) << 16
		return base | idx
	case .SVE_OFFSET_VEC_BASE:
		// [Zn.S/D, #imm5] -- vector base at 9:5, signed-5 imm at bits 20:16.
		base := (u32(reg_hw(op.mem.base)) & 0x1F) << 5
		imm  := (u32(op.mem.disp) & 0x1F) << 16
		return base | imm

	// ---- SVE indexed lane field (FMLA Zda.T, Zn.T, Zm.T[i]) --------------
	case .SVE_FMLA_IDX_H:
		// The user passes the lane index (0..7) directly. It is split as
		// bit 22 (high bit) and bits 20:19 (low two bits); the Zm register
		// is packed separately via .VM.
		lane := u32(op.immediate) & 0x7
		return ((lane >> 2) & 0x1) << 22 | (lane & 0x3) << 19
	case .SVE_FMLA_IDX_S:
		lane := u32(op.immediate) & 0x3
		return lane << 19
	case .SVE_FMLA_IDX_D:
		lane := u32(op.immediate) & 0x1
		return lane << 20

	// ---- SME tile slice descriptor packing -------------------------------
	//
	// The slice descriptor (packed immediate) is unpacked into the
	// instruction's bit positions per element size. The user-passed
	// packed value carries:
	//   imm[3:0] | V[4] | Ws[6:5] | tile[10:7]
	//
	// Instruction layout (per LLVM golden tests):
	//   bit 15     = V flag (0=H, 1=V)
	//   bits 14:13 = Ws index (Ws is W12 + this)
	//   bits 3:0   = tile_num and imm packed (per element size):
	//     .B : imm[3:0]                  (single tile, ZA0.B)
	//     .H : tile[0]<<3 | imm[2:0]     (2 tiles, 8 slices each)
	//     .W : tile[1:0]<<2 | imm[1:0]   (4 tiles, 4 slices each)
	//     .D : tile[2:0]<<1 | imm[0]     (8 tiles, 2 slices each)
	//     .Q : tile[3:0]                 (16 tiles, no imm)
	case .SME_SLICE_B:
		v     := u32(op.immediate)
		imm   := v & 0xF
		vflag := (v >> 4) & 0x1
		ws    := (v >> 5) & 0x3
		return (vflag << 15) | (ws << 13) | imm
	case .SME_SLICE_H:
		v     := u32(op.immediate)
		imm   := v & 0x7
		vflag := (v >> 4) & 0x1
		ws    := (v >> 5) & 0x3
		tile  := (v >> 7) & 0x1
		return (vflag << 15) | (ws << 13) | imm | (tile << 3)
	case .SME_SLICE_W:
		v     := u32(op.immediate)
		imm   := v & 0x3
		vflag := (v >> 4) & 0x1
		ws    := (v >> 5) & 0x3
		tile  := (v >> 7) & 0x3
		return (vflag << 15) | (ws << 13) | imm | (tile << 2)
	case .SME_SLICE_D:
		v     := u32(op.immediate)
		imm   := v & 0x1
		vflag := (v >> 4) & 0x1
		ws    := (v >> 5) & 0x3
		tile  := (v >> 7) & 0x7
		return (vflag << 15) | (ws << 13) | imm | (tile << 1)
	case .SME_SLICE_Q:
		v     := u32(op.immediate)
		vflag := (v >> 4) & 0x1
		ws    := (v >> 5) & 0x3
		tile  := (v >> 7) & 0xF
		return (vflag << 15) | (ws << 13) | tile

	// ---- Batch 3 misc immediate encodings ----
	case .ENC_FCMLA_ROT:
		// 2-bit rotation at bits 13:12 (0/1/2/3 = 0°/90°/180°/270°).
		return (u32(op.immediate) & 0x3) << 12
	case .ENC_FCADD_ROT:
		// 1-bit rotation at bit 12 (0 = 90°, 1 = 270°).
		return (u32(op.immediate) & 0x1) << 12
	case .ENC_SVE_PRFOP:
		// 4-bit SVE prefetch op at bits 3:0.
		return u32(op.immediate) & 0xF
	case .ENC_LDRAA_IMM10:
		// The user passes a byte offset that must be a multiple of 8; the
		// encoded value is the signed 10-bit quantity offset >> 3. In the
		// LDRAA/LDRAB encoding that value is split: the sign bit S lives at
		// bit 22 and the low nine bits (imm9) at bits 20:12. Bit 21 is a
		// fixed opcode bit and must not receive the sign.
		v := u32(i32(op.immediate) >> 3) & 0x3FF
		return ((v >> 9) & 0x1) << 22 | (v & 0x1FF) << 12

	// ---- Batch 5 composite-packed encodings ----
	case .ENC_LSL_IMM_W:
		// 32-bit LSL alias: immr = (-imm) & 31, imms = 31 - imm.
		imm  := u32(op.immediate) & 0x1F
		immr := ((~imm + 1) & 0x1F)
		imms := (31 - imm) & 0x1F
		return (immr << 16) | (imms << 10)
	case .ENC_LSL_IMM_X:
		// 64-bit LSL alias: immr = (-imm) & 63, imms = 63 - imm.
		imm  := u32(op.immediate) & 0x3F
		immr := ((~imm + 1) & 0x3F)
		imms := (63 - imm) & 0x3F
		return (immr << 16) | (imms << 10)
	case .ENC_DUAL_RN_RM:
		// Pack the register at both Rn (9:5) AND Rm (20:16) slots
		// (for ROR Rd, Rn, #imm = EXTR Rd, Rn, Rn, #imm).
		hw := u32(reg_hw(op.reg)) & 0x1F
		return (hw << 5) | (hw << 16)
	case .ENC_ROR_SHIFT:
		// imms (shift amount) at bits 15:10.
		return (u32(op.immediate) & 0x3F) << 10

	case .ENC_Z_PAIR_VD, .ENC_Z_QUAD_VD:
		// Pack first Z reg into Vd slot (bits 4:0).
		return (u32(reg_hw(op.reg)) & 0x1F) << 0
	case .ENC_Z_PAIR_VN, .ENC_Z_QUAD_VN:
		return (u32(reg_hw(op.reg)) & 0x1F) << 5
	case .ENC_Z_PAIR_VM, .ENC_Z_QUAD_VM:
		return (u32(reg_hw(op.reg)) & 0x1F) << 16
	}
	return 0
}
|
|
|
|
// =============================================================================
|
|
// Pass 2 -- relocation resolver
|
|
// =============================================================================
|
|
|
|
@(private="file")
// Try to patch one relocation into `code`.
//
// Returns false when the relocation must stay pending: its label id is out
// of range of `label_defs`, the label is still LABEL_UNDEFINED, or the
// relocation type is one this encoder does not resolve itself.
//
// Returns true when the relocation has been dealt with -- either the
// instruction word at relocation.offset was patched, or the target turned
// out to be misaligned / out of range and a .LABEL_OUT_OF_RANGE error was
// appended to `errors` (the word is left untouched in that case).
//
// `base_address` only matters for ADRP, whose result depends on which 4 KiB
// page the instruction and the target land in once loaded.
resolve_relocation_inline :: #force_inline proc(
	code:         []u8,
	label_defs:   []Label_Definition,
	relocation:   ^Relocation,
	endianness:   Endianness,
	base_address: u64,
	errors:       ^[dynamic]Error,
) -> bool {
	if int(relocation.label_id) >= len(label_defs) { return false }
	ld := label_defs[relocation.label_id]
	if ld == LABEL_UNDEFINED { return false }
	target := u32(ld)

	word := read_u32(code, relocation.offset, endianness)

	switch relocation.type {
	case .B26:
		// Signed 26-bit word offset at bits 25:0.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<25) || words > (1<<25)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= u32(words) & 0x03FFFFFF

	case .B_COND19, .LDR_LITERAL19:
		// Signed 19-bit word offset at bits 23:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<18) || words > (1<<18)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= (u32(words) & 0x7FFFF) << 5

	case .TBZ14:
		// Signed 14-bit word offset at bits 18:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel & 3 != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		words := rel >> 2
		if words < -(1<<13) || words > (1<<13)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		word |= (u32(words) & 0x3FFF) << 5

	case .ADR_PCREL21:
		// ADR: signed 21-bit byte offset (no scaling), split as
		// immlo at bits 30:29 and immhi at bits 23:5.
		rel := i32(target) - i32(relocation.offset) + relocation.addend
		if rel < -(1<<20) || rel > (1<<20)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		v := u32(rel) & 0x1FFFFF
		word |= (v & 0x3) << 29 | ((v >> 2) & 0x7FFFF) << 5

	case .ADRP_PCREL21:
		// ADRP: difference between the 4 KiB page of the target and the
		// 4 KiB page of the instruction, both taken at their final
		// addresses. The page mask must be applied AFTER adding
		// base_address: page(target) + page(base) differs from
		// page(target + base) whenever base_address is not page-aligned.
		page_mask := ~u64(0xFFF)
		tg_page := (u64(target) + base_address) & page_mask
		pc_page := (u64(relocation.offset) + base_address) & page_mask
		// The addend is applied to the page delta and must itself be a
		// whole number of pages; anything else is rejected below.
		diff := i64(tg_page) - i64(pc_page) + i64(relocation.addend)
		if diff & 0xFFF != 0 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		pages := diff >> 12
		if pages < -(1<<20) || pages > (1<<20)-1 {
			append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE})
			return true
		}
		v := u32(pages) & 0x1FFFFF
		word |= (v & 0x3) << 29 | ((v >> 2) & 0x7FFFF) << 5

	case .NONE, .PCREL_LO12_I, .PCREL_LO12_S, .ABS64, .ABS32, .ABS16:
		// Linker-bound or assembler-layer; not auto-resolved here.
		return false
	}

	write_u32(code, relocation.offset, word, endianness)
	return true
}
|
|
|
|
// =============================================================================
|
|
// Endian-aware word I/O
|
|
// =============================================================================
|
|
|
|
@(private="package")
// Store `word` into code[offset .. offset+3]: least-significant byte first
// for .LITTLE, most-significant byte first for any other endianness.
write_u32 :: #force_inline proc "contextless" (
	code: []u8, offset: u32, word: u32, endianness: Endianness,
) {
	b0 := u8(word)
	b1 := u8(word >> 8)
	b2 := u8(word >> 16)
	b3 := u8(word >> 24)
	if endianness != .LITTLE {
		// Reverse the byte order for big-endian output.
		b0, b3 = b3, b0
		b1, b2 = b2, b1
	}
	code[offset+0] = b0
	code[offset+1] = b1
	code[offset+2] = b2
	code[offset+3] = b3
}
|
|
|
|
@(private="package")
// Load a 32-bit word from code[offset .. offset+3]: the first byte is the
// least-significant one for .LITTLE, the most-significant one otherwise.
read_u32 :: #force_inline proc "contextless" (
	code: []u8, offset: u32, endianness: Endianness,
) -> u32 {
	b0 := u32(code[offset+0])
	b1 := u32(code[offset+1])
	b2 := u32(code[offset+2])
	b3 := u32(code[offset+3])
	if endianness == .LITTLE {
		return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)
	}
	return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
}
|