rexcode/arm64: single-structure lane load/store (LD1-4_LANE / ST1-4_LANE)

All eight LD#_LANE / ST#_LANE mnemonics across .B/.H/.S/.D (32 forms).
New NEON_LANE_B/H/S/D encodings split the lane index across Q (bit 30),
S (bit 12) and size (bits 11:10) according to element size; the list length
and load/store bit are fixed in each entry's bits. All 11 representative forms
(every element size, structure count, and lane extremes) are byte-exact vs
llvm-mc and decode cleanly; 461 tests green.
This commit is contained in:
Brendan Punsky
2026-06-18 00:21:43 -04:00
committed by Flāvius
parent 2c8768b39a
commit 33e5202f05
11 changed files with 975 additions and 786 deletions

View File

@@ -252,6 +252,17 @@ extract_operand_inline :: #force_inline proc "contextless" (
return Operand{immediate = i64(v), kind = .IMMEDIATE, size = 1}
case .ZA_TILE_LOW:
return Operand{immediate = i64(word & 0x7), kind = .IMMEDIATE, size = 1}
// NEON single-structure lane index: rebuilt from Q (bit 30), S (bit 12) and
// the size field (bits 11:10). Fewer of those bits take part as the element
// gets wider. Every arm returns the index as an IMMEDIATE operand of size 1.
case .NEON_LANE_B:
// 4-bit index: Q -> bit 3, S -> bit 2, size (bits 11:10) -> bits 1:0.
i := ((word >> 30) & 0x1) << 3 | ((word >> 12) & 0x1) << 2 | ((word >> 10) & 0x3)
return Operand{immediate = i64(i), kind = .IMMEDIATE, size = 1}
case .NEON_LANE_H:
// 3-bit index: Q -> bit 2, S -> bit 1, bit 11 -> bit 0 (bit 10 is not read).
i := ((word >> 30) & 0x1) << 2 | ((word >> 12) & 0x1) << 1 | ((word >> 11) & 0x1)
return Operand{immediate = i64(i), kind = .IMMEDIATE, size = 1}
case .NEON_LANE_S:
// 2-bit index: Q -> bit 1, S -> bit 0 (bits 11:10 are not read).
i := ((word >> 30) & 0x1) << 1 | ((word >> 12) & 0x1)
return Operand{immediate = i64(i), kind = .IMMEDIATE, size = 1}
case .NEON_LANE_D:
// 1-bit index: Q only.
return Operand{immediate = i64((word >> 30) & 0x1), kind = .IMMEDIATE, size = 1}
// ---- Memory operand variants ------------------------------------------
case .OFFSET_BASE_U12:

View File

@@ -506,6 +506,19 @@ pack_operand_inline :: #force_inline proc(
case .ZA_TILE_LOW:
return (u32(op.immediate) & 0x7) << 0
// NEON single-structure lane index (Q at 30, S at 12, size at 11:10).
// Each arm scatters the bits of op.immediate into those fields; index bits
// above the lane width for the element size are masked off, not rejected.
case .NEON_LANE_B:
// 4-bit index: bit 3 -> Q, bit 2 -> S, bits 1:0 -> size (bits 11:10).
i := u32(op.immediate)
return ((i >> 3) & 0x1) << 30 | ((i >> 2) & 0x1) << 12 | (i & 0x3) << 10
case .NEON_LANE_H:
// 3-bit index: bit 2 -> Q, bit 1 -> S, bit 0 -> bit 11 (bit 10 is not written).
i := u32(op.immediate)
return ((i >> 2) & 0x1) << 30 | ((i >> 1) & 0x1) << 12 | (i & 0x1) << 11
case .NEON_LANE_S:
// 2-bit index: bit 1 -> Q, bit 0 -> S (bits 11:10 are not written).
i := u32(op.immediate)
return ((i >> 1) & 0x1) << 30 | (i & 0x1) << 12
case .NEON_LANE_D:
// 1-bit index: bit 0 -> Q.
return (u32(op.immediate) & 0x1) << 30
// NEON MOVI/FMOV immediate split: abc at bits 18-16, defgh at bits 9-5.
case .NEON_IMM8_FMOV:
v := u32(op.immediate) & 0xFF

View File

@@ -265,6 +265,15 @@ Operand_Encoding :: enum u8 {
SVE_EXT_IMM, // SVE EXT byte index: imm8h at 20:16, imm8l at 12:10
ZA_TILE_LOW, // SME ZA accumulator tile number at bits 2:0 (ADDHA/ADDVA)
// ---- NEON single-structure lane index (LD1..4_LANE / ST1..4_LANE) ----
// The lane index is split across Q (bit 30), S (bit 12) and size (bits
// 11:10) in an element-size-dependent way; the structure-size/count opcode
// bits stay fixed in the entry `bits`.
NEON_LANE_B, // .B[i]: Q<<3 | S<<2 | size (i in 0..15)
NEON_LANE_H, // .H[i]: Q<<2 | S<<1 | bit11 (i in 0..7)
NEON_LANE_S, // .S[i]: Q<<1 | S (i in 0..3)
NEON_LANE_D, // .D[i]: Q (i in 0..1)
// ---- LSE atomics ------------------------------------------------------
ATOMIC_RS, // Rs (source / compare) at bits 16-20
ATOMIC_RT, // Rt (target) at bits 0-4

View File

@@ -1309,6 +1309,22 @@ inst_ld3r_r_m :: #force_inline proc "contextless" (dst: Regist
// Builds an LD3R instruction with inst_ld3r_r_m and appends it to `instructions`.
emit_ld3r_r_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, mem: Memory) {
	built := inst_ld3r_r_m(dst, mem)
	append(instructions, built)
}
// Builds an LD4R instruction with two operands: a vector register operand made
// by op_v_16b from `dst`'s hardware register number, and the memory operand
// `mem`. The remaining two operand slots are left empty.
inst_ld4r_r_m :: #force_inline proc "contextless" (dst: Register, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .LD4R,
		operand_count = 2,
		length        = 4,
		ops           = {op_v_16b(hw), op_mem(mem), {}, {}},
	}
}
// Builds an LD4R instruction with inst_ld4r_r_m and appends it to `instructions`.
emit_ld4r_r_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, mem: Memory) {
	built := inst_ld4r_r_m(dst, mem)
	append(instructions, built)
}
// Builds an LD1_LANE instruction with three operands: a vector-element register
// operand derived from `dst`, the lane index `imm`, and the memory operand
// `mem`. The fourth operand slot is left empty.
// NOTE(review): the register operand is always made with op_v_elem_b, so this
// builder presumably only yields the .B element form — confirm.
inst_ld1_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .LD1_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {op_v_elem_b(hw), op_imm(imm, 4), op_mem(mem), {}},
	}
}
// Builds an LD1_LANE instruction with inst_ld1_lane_r_i_m and appends it to
// `instructions`.
emit_ld1_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_ld1_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
// Builds an LD2_LANE instruction with three operands: a vector-element register
// operand derived from `dst`, the lane index `imm`, and the memory operand
// `mem`. The fourth operand slot is left empty.
// NOTE(review): the register operand is always made with op_v_elem_b, so this
// builder presumably only yields the .B element form — confirm.
inst_ld2_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .LD2_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {op_v_elem_b(hw), op_imm(imm, 4), op_mem(mem), {}},
	}
}
// Builds an LD2_LANE instruction with inst_ld2_lane_r_i_m and appends it to
// `instructions`.
emit_ld2_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_ld2_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
// Builds an LD3_LANE instruction with three operands: a vector-element register
// operand derived from `dst`, the lane index `imm`, and the memory operand
// `mem`. The fourth operand slot is left empty.
// NOTE(review): the register operand is always made with op_v_elem_b, so this
// builder presumably only yields the .B element form — confirm.
inst_ld3_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .LD3_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {op_v_elem_b(hw), op_imm(imm, 4), op_mem(mem), {}},
	}
}
// Builds an LD3_LANE instruction with inst_ld3_lane_r_i_m and appends it to
// `instructions`.
emit_ld3_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_ld3_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
// Builds an LD4_LANE instruction with three operands: a vector-element register
// operand derived from `dst`, the lane index `imm`, and the memory operand
// `mem`. The fourth operand slot is left empty.
// NOTE(review): the register operand is always made with op_v_elem_b, so this
// builder presumably only yields the .B element form — confirm.
inst_ld4_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .LD4_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {op_v_elem_b(hw), op_imm(imm, 4), op_mem(mem), {}},
	}
}
// Builds an LD4_LANE instruction with inst_ld4_lane_r_i_m and appends it to
// `instructions`.
emit_ld4_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_ld4_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
// Builds an ST1_LANE instruction with three operands: a vector-element register
// operand derived from `dst`, the lane index `imm`, and the memory operand
// `mem`. The fourth operand slot is left empty.
// NOTE(review): the register operand is always made with op_v_elem_b, so this
// builder presumably only yields the .B element form — confirm.
inst_st1_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .ST1_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {op_v_elem_b(hw), op_imm(imm, 4), op_mem(mem), {}},
	}
}
// Builds an ST1_LANE instruction with inst_st1_lane_r_i_m and appends it to
// `instructions`.
emit_st1_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_st1_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
// Builds an ST2_LANE instruction with three operands: a vector-element register
// operand derived from `dst`, the lane index `imm`, and the memory operand
// `mem`. The fourth operand slot is left empty.
// NOTE(review): the register operand is always made with op_v_elem_b, so this
// builder presumably only yields the .B element form — confirm.
inst_st2_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .ST2_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {op_v_elem_b(hw), op_imm(imm, 4), op_mem(mem), {}},
	}
}
// Builds an ST2_LANE instruction with inst_st2_lane_r_i_m and appends it to
// `instructions`.
emit_st2_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_st2_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
// Builds an ST3_LANE instruction with three operands: a vector-element register
// operand derived from `dst`, the lane index `imm`, and the memory operand
// `mem`. The fourth operand slot is left empty.
// NOTE(review): the register operand is always made with op_v_elem_b, so this
// builder presumably only yields the .B element form — confirm.
inst_st3_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .ST3_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {op_v_elem_b(hw), op_imm(imm, 4), op_mem(mem), {}},
	}
}
// Builds an ST3_LANE instruction with inst_st3_lane_r_i_m and appends it to
// `instructions`.
emit_st3_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_st3_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
// Builds an ST4_LANE instruction with three operands: a vector-element register
// operand derived from `dst`, the lane index `imm`, and the memory operand
// `mem`. The fourth operand slot is left empty.
// NOTE(review): the register operand is always made with op_v_elem_b, so this
// builder presumably only yields the .B element form — confirm.
inst_st4_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	hw := u8(reg_hw(dst))
	return Instruction{
		mnemonic      = .ST4_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {op_v_elem_b(hw), op_imm(imm, 4), op_mem(mem), {}},
	}
}
// Builds an ST4_LANE instruction with inst_st4_lane_r_i_m and appends it to
// `instructions`.
emit_st4_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_st4_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
// Builds an LDR_V instruction by delegating to the shared inst_ldst helper
// with `dst` and `mem`.
inst_ldr_v_r_m :: #force_inline proc "contextless" (dst: Register, mem: Memory) -> Instruction {
	built := inst_ldst(.LDR_V, dst, mem)
	return built
}
// Builds an LDR_V instruction with inst_ldr_v_r_m and appends it to `instructions`.
emit_ldr_v_r_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, mem: Memory) {
	built := inst_ldr_v_r_m(dst, mem)
	append(instructions, built)
}
// Builds an STR_V instruction by delegating to the shared inst_ldst helper
// with `dst` and `mem`.
inst_str_v_r_m :: #force_inline proc "contextless" (dst: Register, mem: Memory) -> Instruction {
	built := inst_ldst(.STR_V, dst, mem)
	return built
}
@@ -3504,6 +3520,22 @@ inst_ld3r :: inst_ld3r_r_m
// Short aliases: each unsuffixed inst_*/emit_* name is bound to the builder of
// the same name carrying an operand-shape suffix (`_r_m` or `_r_i_m`).
emit_ld3r :: emit_ld3r_r_m
inst_ld4r :: inst_ld4r_r_m
emit_ld4r :: emit_ld4r_r_m
// Single-structure lane loads (LD1..LD4, one lane).
inst_ld1_lane :: inst_ld1_lane_r_i_m
emit_ld1_lane :: emit_ld1_lane_r_i_m
inst_ld2_lane :: inst_ld2_lane_r_i_m
emit_ld2_lane :: emit_ld2_lane_r_i_m
inst_ld3_lane :: inst_ld3_lane_r_i_m
emit_ld3_lane :: emit_ld3_lane_r_i_m
inst_ld4_lane :: inst_ld4_lane_r_i_m
emit_ld4_lane :: emit_ld4_lane_r_i_m
// Single-structure lane stores (ST1..ST4, one lane).
inst_st1_lane :: inst_st1_lane_r_i_m
emit_st1_lane :: emit_st1_lane_r_i_m
inst_st2_lane :: inst_st2_lane_r_i_m
emit_st2_lane :: emit_st2_lane_r_i_m
inst_st3_lane :: inst_st3_lane_r_i_m
emit_st3_lane :: emit_st3_lane_r_i_m
inst_st4_lane :: inst_st4_lane_r_i_m
emit_st4_lane :: emit_st4_lane_r_i_m
// LDR/STR through V registers.
inst_ldr_v :: inst_ldr_v_r_m
emit_ldr_v :: emit_ldr_v_r_m
inst_str_v :: inst_str_v_r_m

View File

@@ -1406,6 +1406,58 @@ ENCODING_TABLE := #partial [Mnemonic][]Encoding{
.LD3R = { {.LD3R, {.V_16B, .MEM, .NONE, .NONE}, {.VD, .OFFSET_BASE_A, .NONE, .NONE}, 0x4D40E000, 0xFFFFFC00, .NEON, {}} },
.LD4R = { {.LD4R, {.V_16B, .MEM, .NONE, .NONE}, {.VD, .OFFSET_BASE_A, .NONE, .NONE}, 0x4D60E000, 0xFFFFFC00, .NEON, {}} },
// Single-structure (one lane) load/store: LD#_LANE / ST#_LANE. The lane
// index is split across Q (30), S (12) and size (11:10) per element size;
// the list length + load/store bit are fixed in the bits.
.LD1_LANE = {
{.LD1_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D400000, 0xBFFFE000, .NEON, {}},
{.LD1_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D404000, 0xBFFFE400, .NEON, {}},
{.LD1_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D408000, 0xBFFFEC00, .NEON, {}},
{.LD1_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D408400, 0xBFFFFC00, .NEON, {}},
},
.LD2_LANE = {
{.LD2_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D600000, 0xBFFFE000, .NEON, {}},
{.LD2_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D604000, 0xBFFFE400, .NEON, {}},
{.LD2_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D608000, 0xBFFFEC00, .NEON, {}},
{.LD2_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D608400, 0xBFFFFC00, .NEON, {}},
},
.LD3_LANE = {
{.LD3_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D402000, 0xBFFFE000, .NEON, {}},
{.LD3_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D406000, 0xBFFFE400, .NEON, {}},
{.LD3_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D40A000, 0xBFFFEC00, .NEON, {}},
{.LD3_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D40A400, 0xBFFFFC00, .NEON, {}},
},
.LD4_LANE = {
{.LD4_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D602000, 0xBFFFE000, .NEON, {}},
{.LD4_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D606000, 0xBFFFE400, .NEON, {}},
{.LD4_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D60A000, 0xBFFFEC00, .NEON, {}},
{.LD4_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D60A400, 0xBFFFFC00, .NEON, {}},
},
.ST1_LANE = {
{.ST1_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D000000, 0xBFFFE000, .NEON, {}},
{.ST1_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D004000, 0xBFFFE400, .NEON, {}},
{.ST1_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D008000, 0xBFFFEC00, .NEON, {}},
{.ST1_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D008400, 0xBFFFFC00, .NEON, {}},
},
.ST2_LANE = {
{.ST2_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D200000, 0xBFFFE000, .NEON, {}},
{.ST2_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D204000, 0xBFFFE400, .NEON, {}},
{.ST2_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D208000, 0xBFFFEC00, .NEON, {}},
{.ST2_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D208400, 0xBFFFFC00, .NEON, {}},
},
.ST3_LANE = {
{.ST3_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D002000, 0xBFFFE000, .NEON, {}},
{.ST3_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D006000, 0xBFFFE400, .NEON, {}},
{.ST3_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D00A000, 0xBFFFEC00, .NEON, {}},
{.ST3_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D00A400, 0xBFFFFC00, .NEON, {}},
},
.ST4_LANE = {
{.ST4_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D202000, 0xBFFFE000, .NEON, {}},
{.ST4_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D206000, 0xBFFFE400, .NEON, {}},
{.ST4_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D20A000, 0xBFFFEC00, .NEON, {}},
{.ST4_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D20A400, 0xBFFFFC00, .NEON, {}},
},
// FP/SIMD scalar load/store via V regs (offset-form)
.LDR_V = {
{.LDR_V, {.B_REG, .MEM, .NONE, .NONE}, {.RT, .OFFSET_BASE_U12, .NONE, .NONE}, 0x3D400000, 0xFFC00000, .FP, {}},

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff