package rexcode_arm64 // ============================================================================= // AArch64 ENCODER // ============================================================================= // // Fixed-width 4-byte ISA. Two-pass design mirroring mips/encoder.odin / // riscv/encoder.odin. The interesting bits vs other arches: // // * Compound operands: SHIFTED_REG (Rm + shift type + amount) and // EXTENDED_REG (Rm + extend + amount) are packed by the RM encoder // by inspecting the operand kind -- a plain REGISTER decays to // LSL #0 / UXTX, amount=0. // // * Three split-immediate scatter patterns: // BRANCH_PG21 -- 21-bit imm split as immlo[30:29] + immhi[23:5] // (ADR / ADRP) // TBZ_BIT -- 6-bit bit position split as b5[31] + b40[23:19] // (TBZ / TBNZ) // SYS_FIELD -- 15-bit (op0:op1:CRn:CRm:op2) at bits 19:5 // // * Loads/stores with the unsigned-offset (LDR/STR) form scale the // user displacement by data size (1/2/4/8) derived from bits[31:30] // of the encoding. LDP/STP pair forms scale a signed 7-bit field. // // * Endianness: AArch64 standard mode stores instructions LE; BE-32 // (instructions stored big-endian) is legacy and rare. Parameter // defaults to LITTLE. MAX_INST_SIZE :: 4 encode_max_code_size :: #force_inline proc "contextless" (n: int) -> int { return n * 4 } encode_max_relocation_count :: #force_inline proc "contextless" (n: int) -> int { return n } encode :: proc( instructions: []Instruction, label_defs: []Label_Definition, code: []u8, relocs: ^[dynamic]Relocation, errors: ^[dynamic]Error, endianness: Endianness = .LITTLE, resolve: bool = true, base_address: u64 = 0, ) -> Result { n_inst := u32(len(instructions)) if u32(len(code)) < n_inst * 4 { append(errors, Error{inst_idx = 0, code = .BUFFER_OVERFLOW}) return Result{byte_count = 0, success = false} } errors_start := u32(len(errors)) pending_start := u32(len(relocs)) pc: u32 = 0 // ---- PASS 1 ----------------------------------------------------------- for i in 0.. *4 ------------------------------------- for &ld in label_defs { if ld != LABEL_UNDEFINED { ld = Label_Definition(u32(ld) * 4) } } if !resolve { return Result{byte_count = pc, success = u32(len(errors)) == errors_start} } // ---- PASS 2: resolve relocations ------------------------------------- n_relocs := u32(len(relocs)) write_idx := pending_start for read_idx in pending_start.. (word: u32, ok: bool) { if inst.mnemonic == .INVALID { append(errors, Error{inst_idx = u32(inst_idx), code = .INVALID_MNEMONIC}) return 0, false } forms := ENCODING_TABLE[inst.mnemonic] if len(forms) == 0 { append(errors, Error{inst_idx = u32(inst_idx), code = .INVALID_MNEMONIC}) return 0, false } form: ^Encoding for &f in forms { if encoding_matches_inline(inst, &f) { form = &f; break } } if form == nil { append(errors, Error{inst_idx = u32(inst_idx), code = .NO_MATCHING_ENCODING}) return 0, false } word = form.bits if form.enc[0] != .NONE { word |= pack_operand_inline(&inst.ops[0], form.enc[0], form, pc, inst_idx, relocs) } if form.enc[1] != .NONE { word |= pack_operand_inline(&inst.ops[1], form.enc[1], form, pc, inst_idx, relocs) } if form.enc[2] != .NONE { word |= pack_operand_inline(&inst.ops[2], form.enc[2], form, pc, inst_idx, relocs) } if form.enc[3] != .NONE { word |= pack_operand_inline(&inst.ops[3], form.enc[3], form, pc, inst_idx, relocs) } return word, true } @(private="file") encoding_matches_inline :: #force_inline proc "contextless" ( inst: ^Instruction, form: ^Encoding, ) -> bool { return operand_matches_inline(&inst.ops[0], form.ops[0], form) && operand_matches_inline(&inst.ops[1], form.ops[1], form) && operand_matches_inline(&inst.ops[2], form.ops[2], form) && operand_matches_inline(&inst.ops[3], form.ops[3], form) } @(private="file") operand_matches_inline :: #force_inline proc "contextless" ( op: ^Operand, ot: Operand_Type, form: ^Encoding, ) -> bool { switch ot { case .NONE: return op.kind == .NONE case .W_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_W case .X_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_X case .WSP_REG: return op.kind == .REGISTER && (reg_class(op.reg) == REG_W || reg_class(op.reg) == REG_WSP) case .XSP_REG: return op.kind == .REGISTER && (reg_class(op.reg) == REG_X || reg_class(op.reg) == REG_XSP) case .W_SHIFTED: if op.kind == .REGISTER { return reg_class(op.reg) == REG_W } if op.kind == .SHIFTED_REG { return reg_class(op.shifted.reg) == REG_W } return false case .X_SHIFTED: if op.kind == .REGISTER { return reg_class(op.reg) == REG_X } if op.kind == .SHIFTED_REG { return reg_class(op.shifted.reg) == REG_X } return false case .W_EXTENDED: // The extend type selects W vs X for the inner reg; accept either // and let the encoder pack option = extend faithfully. if op.kind == .REGISTER { return reg_class(op.reg) == REG_W } if op.kind == .EXTENDED_REG { c := reg_class(op.extended.reg) return c == REG_W || c == REG_X } return false case .X_EXTENDED: if op.kind == .REGISTER { return reg_class(op.reg) == REG_X } if op.kind == .EXTENDED_REG { c := reg_class(op.extended.reg) return c == REG_W || c == REG_X } return false case .B_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_B case .H_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_H case .S_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_S case .D_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_D case .Q_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_Q case .V_REG: return op.kind == .REGISTER && reg_class(op.reg) == REG_V // NEON vector arrangement variants. The user encodes the arrangement // via op.size: 8B=8, 16B=16, 4H=24, 8H=32, 2S=40, 4S=48, 1D=56, 2D=64. // (lanes * elem_bytes; unique per arrangement). When op.size==0 the // matcher accepts any V register (legacy / "first form wins") -- // callers using op_reg() get size=4 by default which matches the // .V_4H form arithmetically; prefer the explicit op_v_*() builders. case .V_8B: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 8) case .V_16B: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 16) case .V_4H: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 24) case .V_8H, .V_8H_FP16: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 32) case .V_2S: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 40) case .V_4S: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 48) case .V_1D: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 56) case .V_2D: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 64) case .V_4H_FP16: return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 24) case .V_ELEM_B, .V_ELEM_H, .V_ELEM_S, .V_ELEM_D: return op.kind == .REGISTER && reg_class(op.reg) == REG_V // SVE Z registers. Element size carried in op.size: B=1, H=2, S=4, D=8. // op.size==0 (legacy / default-constructed) accepts any width. case .Z_REG_B: return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (op.size == 0 || op.size == 1) case .Z_REG_H: return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (op.size == 0 || op.size == 2) case .Z_REG_S: return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (op.size == 0 || op.size == 4) case .Z_REG_D: return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (op.size == 0 || op.size == 8) case .P_REG, .P_REG_MERGE, .P_REG_ZERO, .P_REG_GOV: return op.kind == .REGISTER && reg_class(op.reg) == REG_P // SME tile state (immediate-encoded tile number; user supplies the // tile index as an immediate, e.g. 0 for ZA0.S, 3 for ZA3.S). case .ZA_TILE_B, .ZA_TILE_H, .ZA_TILE_S, .ZA_TILE_D, .ZA_TILE_Q: return op.kind == .IMMEDIATE // Misc immediate sub-types added in batch 3 case .FCMLA_ROT, .FCADD_ROT, .SVE_PRFOP, .LDRAA_IMM10: return op.kind == .IMMEDIATE case .LSL_SHIFT_W, .LSL_SHIFT_X, .ROR_SHIFT: return op.kind == .IMMEDIATE case .Z_PAIR: // SME2 vector pair: first reg must be even (Z0, Z2, ..., Z30). return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (reg_hw(op.reg) & 0x1) == 0 case .Z_QUAD: // SME2 vector quad: first reg must be multiple of 4. return op.kind == .REGISTER && reg_class(op.reg) == REG_Z && (reg_hw(op.reg) & 0x3) == 0 case .SME_PATTERN, .SVE_PATTERN: return op.kind == .IMMEDIATE // SME tile slice (packed immediate descriptor; see encoding_types.odin) case .SME_SLICE_B, .SME_SLICE_H, .SME_SLICE_W, .SME_SLICE_D, .SME_SLICE_Q: return op.kind == .IMMEDIATE case .IMM_12, .IMM_16, .IMM_8, .IMM_6, .IMM_5, .IMM_4, .IMM_3, .IMM_2, .NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE: return op.kind == .IMMEDIATE case .BITMASK_IMM: // The user passes the raw logical mask value; we validate that it // fits the AArch64 bitmask-immediate encoding at the form's width. return op.kind == .IMMEDIATE && is_valid_bitmask_imm(u64(op.immediate), form.flags.is_64) case .REL_26, .REL_19, .REL_14, .REL_PG21: return op.kind == .RELATIVE case .MEM: return op.kind == .MEMORY case .COND: return op.kind == .COND } return false } // ============================================================================= // Operand packer // ============================================================================= @(private="file") pack_operand_inline :: #force_inline proc( op: ^Operand, enc: Operand_Encoding, form: ^Encoding, pc: u32, inst_idx: u16, relocs: ^[dynamic]Relocation, ) -> u32 { switch enc { case .NONE, .IMPL: return 0 // ---- Register slots ---------------------------------------------------- case .RD, .RT: return (u32(reg_hw(op.reg)) & 0x1F) << 0 case .RN: return (u32(reg_hw(op.reg)) & 0x1F) << 5 case .RT2, .RA: return (u32(reg_hw(op.reg)) & 0x1F) << 10 case .RM: // RM has three flavours per the operand kind: // REGISTER -- plain Rm at bits 20-16 // SHIFTED_REG -- Rm + shift type (22:23) + amount (15:10) // EXTENDED_REG -- Rm + extend (13:15) + amount (10:12) switch op.kind { case .REGISTER: return (u32(reg_hw(op.reg)) & 0x1F) << 16 case .SHIFTED_REG: return (u32(reg_hw(op.shifted.reg)) & 0x1F) << 16 | (u32(op.shifted.type) & 0x3) << 22 | (u32(op.shifted.amount) & 0x3F) << 10 case .EXTENDED_REG: return (u32(reg_hw(op.extended.reg)) & 0x1F) << 16 | (u32(op.extended.extend) & 0x7) << 13 | (u32(op.extended.amount) & 0x7) << 10 case .NONE, .IMMEDIATE, .MEMORY, .RELATIVE, .COND: return 0 } // ---- Immediates -------------------------------------------------------- case .IMM12: return (u32(op.immediate) & 0xFFF) << 10 case .IMM16: return (u32(op.immediate) & 0xFFFF) << 5 case .IMM6: return (u32(op.immediate) & 0x3F) << 10 case .IMM9: return (u32(op.immediate) & 0x1FF) << 12 case .IMM_HW: return (u32(op.immediate) & 0x3) << 21 case .IMM_SH12: return (u32(op.immediate) & 0x1) << 22 case .SHIFT_TYPE: return (u32(op.immediate) & 0x3) << 22 case .EXT_OPT: return (u32(op.immediate) & 0x7) << 13 case .EXT_IMM3: return (u32(op.immediate) & 0x7) << 10 case .COND_HI: // Condition payload may arrive as IMMEDIATE (raw) or COND kind. c := u32(op.cond) if op.kind == .COND else u32(op.immediate) return (c & 0xF) << 12 case .COND_LO: c := u32(op.cond) if op.kind == .COND else u32(op.immediate) return (c & 0xF) << 0 case .NZCV_FIELD: return (u32(op.immediate) & 0xF) << 0 case .SYS_FIELD: return (u32(op.immediate) & 0x7FFF) << 5 case .HINT_FIELD: return (u32(op.immediate) & 0x7F) << 5 case .BARRIER_FIELD: return (u32(op.immediate) & 0xF) << 8 // ---- Memory operand variants ------------------------------------------ case .OFFSET_BASE_U12: // Scaled unsigned 12-bit: imm12 = disp / data_size // data_size derived from bits[31:30] of the form: 00=1, 01=2, 10=4, 11=8 size := u32(1) << ((form.bits >> 30) & 0x3) base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 imm_bits := (u32(op.mem.disp) / size) & 0xFFF return base_bits | (imm_bits << 10) case .OFFSET_BASE_S9: // Signed 9-bit unscaled at bits 20-12. base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 imm_bits := u32(op.mem.disp) & 0x1FF return base_bits | (imm_bits << 12) case .OFFSET_BASE_PRE: // Pre-index: bits[11:10] = 11, signed 9-bit at 20-12. base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 imm_bits := u32(op.mem.disp) & 0x1FF return base_bits | (imm_bits << 12) | (0x3 << 10) case .OFFSET_BASE_POST: // Post-index: bits[11:10] = 01. base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 imm_bits := u32(op.mem.disp) & 0x1FF return base_bits | (imm_bits << 12) | (0x1 << 10) case .OFFSET_BASE_A: // Atomic addressing: [Xn] only -- no displacement, no shift. // Used by load/store exclusives, acquire/release, LSE atomics. return (u32(reg_hw(op.mem.base)) & 0x1F) << 5 case .OFFSET_REG: // [Xn, Xm{, LSL #s}]: option=011, S = shift!=0. base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16 option := u32(0x3) << 13 s_bit := op.mem.shift != 0 ? u32(1) << 12 : 0 return base_bits | idx_bits | option | s_bit | (0x2 << 10) case .OFFSET_EXT: // [Xn, Wm, SXTW|UXTW|SXTX #s]: option = ext, S = shift!=0. base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16 option := (u32(op.mem.extend) & 0x7) << 13 s_bit := op.mem.shift != 0 ? u32(1) << 12 : 0 return base_bits | idx_bits | option | s_bit | (0x2 << 10) // ---- PC-relative branches --------------------------------------------- case .BRANCH_26: append(relocs, Relocation{ offset = pc, label_id = u32(op.relative), type = .B26, size = 4, inst_idx = inst_idx, }) return 0 case .BRANCH_19: // Could be B.cond, CBZ/CBNZ, or LDR literal -- the relocation // type for all three is the same B_COND19 (19-bit signed PC-rel // scaled by 4) since the encoding field is identical. append(relocs, Relocation{ offset = pc, label_id = u32(op.relative), type = .B_COND19, size = 4, inst_idx = inst_idx, }) return 0 case .BRANCH_14: append(relocs, Relocation{ offset = pc, label_id = u32(op.relative), type = .TBZ14, size = 4, inst_idx = inst_idx, }) return 0 case .BRANCH_PG21: // ADR / ADRP -- choose reloc type by the form's bits[31] (op flag). ty: Relocation_Type = .ADR_PCREL21 if (form.bits >> 31) & 1 != 0 { ty = .ADRP_PCREL21 } append(relocs, Relocation{ offset = pc, label_id = u32(op.relative), type = ty, size = 4, inst_idx = inst_idx, }) return 0 // ---- TBZ / TBNZ bit position split (b5 at bit 31, b40 at 23-19) ----- case .TBZ_BIT: bit := u32(op.immediate) & 0x3F return ((bit >> 5) & 1) << 31 | (bit & 0x1F) << 19 // ---- NEON / SIMD register slots (alias of RD/RN/RM/RA bit positions) -- case .VD: return (u32(reg_hw(op.reg)) & 0x1F) << 0 case .VN: return (u32(reg_hw(op.reg)) & 0x1F) << 5 case .VM: return (u32(reg_hw(op.reg)) & 0x1F) << 16 case .VA: return (u32(reg_hw(op.reg)) & 0x1F) << 10 // NEON MOVI/FMOV immediate split: abc at bits 18-16, defgh at bits 9-5. case .NEON_IMM8_FMOV: v := u32(op.immediate) & 0xFF return ((v >> 5) & 0x7) << 16 | (v & 0x1F) << 5 case .NEON_INDEX_H: // H lane index: H at bit 20, L at bit 21, M at bit 11 (3 bits total // when ESize=H). v1 keeps the simpler layout: just bits 20-19. return (u32(op.immediate) & 0x3) << 19 case .NEON_INDEX_S: // S lane index: bits 11 (H) + 21 (L). v1: bit 11 + bit 21. v := u32(op.immediate) & 0x3 return (v & 0x1) << 21 | ((v >> 1) & 0x1) << 11 case .NEON_INDEX_D: return (u32(op.immediate) & 0x1) << 11 // LSE atomics share field positions with the standard load/store // encoding (Rs at 16-20, Rt at 0-4, Rn at 5-9). case .ATOMIC_RS: return (u32(reg_hw(op.reg)) & 0x1F) << 16 case .ATOMIC_RT: return (u32(reg_hw(op.reg)) & 0x1F) << 0 case .ATOMIC_RN: // Memory operand carries the address register in mem.base. if op.kind == .MEMORY { return (u32(reg_hw(op.mem.base)) & 0x1F) << 5 } return (u32(reg_hw(op.reg)) & 0x1F) << 5 // Bitmask logical immediate. The user passes the raw 32/64-bit mask // value in op.immediate; the matcher has already validated that the // value is encodable at the form's width, so encode_bitmask_imm // cannot fail here. case .BITMASK_FIELD: n, immr, imms, _ := encode_bitmask_imm(u64(op.immediate), form.flags.is_64) return (u32(n) << 22) | (u32(immr) << 16) | (u32(imms) << 10) // SVE predicates (low 4 bits at 0/5/16; merge/zero via bit 14 etc.) case .PD: return (u32(reg_hw(op.reg)) & 0xF) << 0 case .PN: return (u32(reg_hw(op.reg)) & 0xF) << 5 case .PM: return (u32(reg_hw(op.reg)) & 0xF) << 16 case .PG: // Governing predicate (3-bit slot, P0..P7 only). return (u32(reg_hw(op.reg)) & 0x7) << 10 case .PG4: // 4-bit Pg slot (P0..P15) used by predicate-logical and a few // SVE2 ops. return (u32(reg_hw(op.reg)) & 0xF) << 10 case .PM3: // 3-bit Pm at bits 15:13 (SME outer products FMOPA/SMOPA/etc.). return (u32(reg_hw(op.reg)) & 0x7) << 13 // SVE immediates case .SVE_IMM8: // Signed 8-bit at bits 12-5 (DUP/CPY/ADD imm). return (u32(op.immediate) & 0xFF) << 5 case .SVE_IMM5: // 5-bit at bits 20-16 (INDEX imm, etc.). return (u32(op.immediate) & 0x1F) << 16 case .SVE_SHIFT_TSZ_IMM: // tsz:imm3 at bits 22:16 -- caller passes the already-composed // 7-bit field (tsz<6:3>:imm3<2:0>) in the IMMEDIATE. return (u32(op.immediate) & 0x7F) << 16 case .SVE_PATTERN: return (u32(op.immediate) & 0x1F) << 5 // SVE memory operands case .SVE_OFFSET_BASE_SS: // [Xn, Xm, LSL #s] scalar+scalar. Base at 9:5, index at 20:16; // shift is implicit in the encoding's static bits (per ESize). base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 idx_bits := (u32(reg_hw(op.mem.index)) & 0x1F) << 16 return base_bits | idx_bits case .SVE_OFFSET_BASE_SI: // [Xn{, #imm, MUL VL}] scalar+imm. Base at 9:5, signed 4-bit imm // at bits 19:16 (caller passes signed disp as op.mem.disp). base_bits := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 imm_bits := (u32(op.mem.disp) & 0xF) << 16 return base_bits | imm_bits // SME ZA tile number fields (position depends on element size). case .ZA_TILE_NUM_B: // ZA0.B only -- nothing to encode (single tile of byte form). return 0 case .ZA_TILE_NUM_H: // ZA0.H..ZA1.H -- 1-bit tile number at bit 22. return (u32(op.immediate) & 0x1) << 22 case .ZA_TILE_NUM_S: // ZA0.S..ZA3.S -- 2-bit tile number at bits 23:22. return (u32(op.immediate) & 0x3) << 22 case .ZA_TILE_NUM_D: // ZA0.D..ZA7.D -- 3-bit tile number at bits 23:21. return (u32(op.immediate) & 0x7) << 21 case .SME_PATTERN_FIELD: // 4-bit SME pattern/list at bits 8:5 (ZERO instruction list mask). return (u32(op.immediate) & 0xF) << 5 // ---- SVE gather/scatter + vector-base memory -------------------------- case .SVE_OFFSET_BASE_VEC: // [Xn, Zm.S/D, extend] -- base GPR at 9:5, Zm at 20:16. base := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 idx := (u32(reg_hw(op.mem.index)) & 0x1F) << 16 return base | idx case .SVE_OFFSET_VEC_BASE: // [Zn.S/D, #imm5] -- vector base at 9:5, signed-5 imm at bits 20:16. base := (u32(reg_hw(op.mem.base)) & 0x1F) << 5 imm := (u32(op.mem.disp) & 0x1F) << 16 return base | imm // ---- SVE indexed lane field (FMLA Zda.T, Zn.T, Zm.T[i]) -------------- case .SVE_FMLA_IDX_H: // i3 = (op.immediate >> 4) & 0x7? No -- user passes lane index // (0..7) directly. Encoder packs i3 split as bit 22, bits 20:19, // and Zm at bits 18:16 (low 8 regs only for indexed .H/.S). // The instruction format we use accepts the lane index as a // 3-bit immediate; the Zm register comes via .VM. lane := u32(op.immediate) & 0x7 return ((lane >> 2) & 0x1) << 22 | (lane & 0x3) << 19 case .SVE_FMLA_IDX_S: lane := u32(op.immediate) & 0x3 return lane << 19 case .SVE_FMLA_IDX_D: lane := u32(op.immediate) & 0x1 return lane << 20 // ---- SME tile slice descriptor packing ------------------------------- // // The slice descriptor (packed immediate) is unpacked into the // instruction's bit positions per element size. The user-passed // packed value carries: // imm[3:0] | V[4] | Ws[6:5] | tile[10:7] // // Instruction layout (per LLVM golden tests): // bit 15 = V flag (0=H, 1=V) // bits 14:13 = Ws index (Ws is W12 + this) // bits 3:0 = tile_num and imm packed (per element size): // .B : imm[3:0] (single tile, ZA0.B) // .H : tile[0]<<3 | imm[2:0] (2 tiles, 8 slices each) // .W : tile[1:0]<<2 | imm[1:0] (4 tiles, 4 slices each) // .D : tile[2:0]<<1 | imm[0] (8 tiles, 2 slices each) // .Q : tile[3:0] (16 tiles, no imm) case .SME_SLICE_B: v := u32(op.immediate) imm := v & 0xF vflag := (v >> 4) & 0x1 ws := (v >> 5) & 0x3 return (vflag << 15) | (ws << 13) | imm case .SME_SLICE_H: v := u32(op.immediate) imm := v & 0x7 vflag := (v >> 4) & 0x1 ws := (v >> 5) & 0x3 tile := (v >> 7) & 0x1 return (vflag << 15) | (ws << 13) | imm | (tile << 3) case .SME_SLICE_W: v := u32(op.immediate) imm := v & 0x3 vflag := (v >> 4) & 0x1 ws := (v >> 5) & 0x3 tile := (v >> 7) & 0x3 return (vflag << 15) | (ws << 13) | imm | (tile << 2) case .SME_SLICE_D: v := u32(op.immediate) imm := v & 0x1 vflag := (v >> 4) & 0x1 ws := (v >> 5) & 0x3 tile := (v >> 7) & 0x7 return (vflag << 15) | (ws << 13) | imm | (tile << 1) case .SME_SLICE_Q: v := u32(op.immediate) vflag := (v >> 4) & 0x1 ws := (v >> 5) & 0x3 tile := (v >> 7) & 0xF return (vflag << 15) | (ws << 13) | tile // ---- Batch 3 misc immediate encodings ---- case .ENC_FCMLA_ROT: // 2-bit rotation at bits 13:12 (0/1/2/3 = 0°/90°/180°/270°). return (u32(op.immediate) & 0x3) << 12 case .ENC_FCADD_ROT: // 1-bit rotation at bit 12 (0 = 90°, 1 = 270°). return (u32(op.immediate) & 0x1) << 12 case .ENC_SVE_PRFOP: // 4-bit SVE prefetch op at bits 3:0. return u32(op.immediate) & 0xF case .ENC_LDRAA_IMM10: // Signed 10-bit immediate at bits 21:12 (the user passes a byte // offset that must be a multiple of 8; we encode imm >> 3). v := u32(i32(op.immediate) >> 3) & 0x3FF return v << 12 // ---- Batch 5 composite-packed encodings ---- case .ENC_LSL_IMM_W: // 32-bit LSL alias: immr = (-imm) & 31, imms = 31 - imm. imm := u32(op.immediate) & 0x1F immr := ((~imm + 1) & 0x1F) imms := (31 - imm) & 0x1F return (immr << 16) | (imms << 10) case .ENC_LSL_IMM_X: // 64-bit LSL alias: immr = (-imm) & 63, imms = 63 - imm. imm := u32(op.immediate) & 0x3F immr := ((~imm + 1) & 0x3F) imms := (63 - imm) & 0x3F return (immr << 16) | (imms << 10) case .ENC_DUAL_RN_RM: // Pack the register at both Rn (9:5) AND Rm (20:16) slots // (for ROR Rd, Rn, #imm = EXTR Rd, Rn, Rn, #imm). hw := u32(reg_hw(op.reg)) & 0x1F return (hw << 5) | (hw << 16) case .ENC_ROR_SHIFT: // imms (shift amount) at bits 15:10. return (u32(op.immediate) & 0x3F) << 10 case .ENC_Z_PAIR_VD, .ENC_Z_QUAD_VD: // Pack first Z reg into Vd slot (bits 4:0). return (u32(reg_hw(op.reg)) & 0x1F) << 0 case .ENC_Z_PAIR_VN, .ENC_Z_QUAD_VN: return (u32(reg_hw(op.reg)) & 0x1F) << 5 case .ENC_Z_PAIR_VM, .ENC_Z_QUAD_VM: return (u32(reg_hw(op.reg)) & 0x1F) << 16 } return 0 } // ============================================================================= // Pass 2 -- relocation resolver // ============================================================================= @(private="file") resolve_relocation_inline :: #force_inline proc( code: []u8, label_defs: []Label_Definition, relocation: ^Relocation, endianness: Endianness, base_address: u64, errors: ^[dynamic]Error, ) -> bool { if int(relocation.label_id) >= len(label_defs) { return false } ld := label_defs[relocation.label_id] if ld == LABEL_UNDEFINED { return false } target := u32(ld) word := read_u32(code, relocation.offset, endianness) switch relocation.type { case .B26: rel := i32(target) - i32(relocation.offset) + relocation.addend if rel & 3 != 0 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } words := rel >> 2 if words < -(1<<25) || words > (1<<25)-1 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } word |= u32(words) & 0x03FFFFFF case .B_COND19, .LDR_LITERAL19: rel := i32(target) - i32(relocation.offset) + relocation.addend if rel & 3 != 0 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } words := rel >> 2 if words < -(1<<18) || words > (1<<18)-1 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } word |= (u32(words) & 0x7FFFF) << 5 case .TBZ14: rel := i32(target) - i32(relocation.offset) + relocation.addend if rel & 3 != 0 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } words := rel >> 2 if words < -(1<<13) || words > (1<<13)-1 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } word |= (u32(words) & 0x3FFF) << 5 case .ADR_PCREL21: // ADR: signed 21-bit byte offset (no scaling). rel := i32(target) - i32(relocation.offset) + relocation.addend if rel < -(1<<20) || rel > (1<<20)-1 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } v := u32(rel) & 0x1FFFFF word |= (v & 0x3) << 29 | ((v >> 2) & 0x7FFFF) << 5 case .ADRP_PCREL21: // ADRP: difference of page (4KB-aligned) targets. target_page := u64(target) & ~u64(0xFFF) + base_address & ~u64(0xFFF) // Effective: ((target + base) >> 12) - ((pc + base) >> 12) // Simpler: ((target + base) - (pc + base)) >> 12 when both are // 4KB-aligned; but base + offset alignment is the caller's concern. pc_page := (u64(relocation.offset) + base_address) & ~u64(0xFFF) tg_page := target_page diff := i64(tg_page) - i64(pc_page) + i64(relocation.addend) if diff & 0xFFF != 0 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } pages := diff >> 12 if pages < -(1<<20) || pages > (1<<20)-1 { append(errors, Error{inst_idx = u32(relocation.inst_idx), code = .LABEL_OUT_OF_RANGE}) return true } v := u32(pages) & 0x1FFFFF word |= (v & 0x3) << 29 | ((v >> 2) & 0x7FFFF) << 5 case .NONE, .PCREL_LO12_I, .PCREL_LO12_S, .ABS64, .ABS32, .ABS16: // Linker-bound or assembler-layer; not auto-resolved here. return false } write_u32(code, relocation.offset, word, endianness) return true } // ============================================================================= // Endian-aware word I/O // ============================================================================= @(private="package") write_u32 :: #force_inline proc "contextless" ( code: []u8, offset: u32, word: u32, endianness: Endianness, ) { if endianness == .LITTLE { code[offset+0] = u8(word) code[offset+1] = u8(word >> 8) code[offset+2] = u8(word >> 16) code[offset+3] = u8(word >> 24) } else { code[offset+0] = u8(word >> 24) code[offset+1] = u8(word >> 16) code[offset+2] = u8(word >> 8) code[offset+3] = u8(word) } } @(private="package") read_u32 :: #force_inline proc "contextless" ( code: []u8, offset: u32, endianness: Endianness, ) -> u32 { if endianness == .LITTLE { return u32(code[offset+0]) | (u32(code[offset+1]) << 8) | (u32(code[offset+2]) << 16) | (u32(code[offset+3]) << 24) } return (u32(code[offset+0]) << 24) | (u32(code[offset+1]) << 16) | (u32(code[offset+2]) << 8) | u32(code[offset+3]) }