mirror of
https://github.com/odin-lang/Odin.git
synced 2026-06-19 16:42:33 +00:00
rexcode/arm64: single-structure lane load/store (LD1-4_LANE / ST1-4_LANE)
All eight LD#_LANE / ST#_LANE mnemonics across .B/.H/.S/.D (32 forms). New NEON_LANE_B/H/S/D encodings split the lane index across Q (bit 30), S (bit 12) and size (bits 11:10) per element size; the list length and load/store bit are fixed in the entry bits. All 11 representative forms (every element size, structure count, and lane extremes) byte-exact vs llvm-mc and decode-clean; 461 tests green.
This commit is contained in:
@@ -252,6 +252,17 @@ extract_operand_inline :: #force_inline proc "contextless" (
|
||||
return Operand{immediate = i64(v), kind = .IMMEDIATE, size = 1}
|
||||
case .ZA_TILE_LOW:
|
||||
return Operand{immediate = i64(word & 0x7), kind = .IMMEDIATE, size = 1}
|
||||
case .NEON_LANE_B:
|
||||
i := ((word >> 30) & 0x1) << 3 | ((word >> 12) & 0x1) << 2 | ((word >> 10) & 0x3)
|
||||
return Operand{immediate = i64(i), kind = .IMMEDIATE, size = 1}
|
||||
case .NEON_LANE_H:
|
||||
i := ((word >> 30) & 0x1) << 2 | ((word >> 12) & 0x1) << 1 | ((word >> 11) & 0x1)
|
||||
return Operand{immediate = i64(i), kind = .IMMEDIATE, size = 1}
|
||||
case .NEON_LANE_S:
|
||||
i := ((word >> 30) & 0x1) << 1 | ((word >> 12) & 0x1)
|
||||
return Operand{immediate = i64(i), kind = .IMMEDIATE, size = 1}
|
||||
case .NEON_LANE_D:
|
||||
return Operand{immediate = i64((word >> 30) & 0x1), kind = .IMMEDIATE, size = 1}
|
||||
|
||||
// ---- Memory operand variants ------------------------------------------
|
||||
case .OFFSET_BASE_U12:
|
||||
|
||||
@@ -506,6 +506,19 @@ pack_operand_inline :: #force_inline proc(
|
||||
case .ZA_TILE_LOW:
|
||||
return (u32(op.immediate) & 0x7) << 0
|
||||
|
||||
// NEON single-structure lane index (Q at 30, S at 12, size at 11:10).
|
||||
case .NEON_LANE_B:
|
||||
i := u32(op.immediate)
|
||||
return ((i >> 3) & 0x1) << 30 | ((i >> 2) & 0x1) << 12 | (i & 0x3) << 10
|
||||
case .NEON_LANE_H:
|
||||
i := u32(op.immediate)
|
||||
return ((i >> 2) & 0x1) << 30 | ((i >> 1) & 0x1) << 12 | (i & 0x1) << 11
|
||||
case .NEON_LANE_S:
|
||||
i := u32(op.immediate)
|
||||
return ((i >> 1) & 0x1) << 30 | (i & 0x1) << 12
|
||||
case .NEON_LANE_D:
|
||||
return (u32(op.immediate) & 0x1) << 30
|
||||
|
||||
// NEON MOVI/FMOV immediate split: abc at bits 18-16, defgh at bits 9-5.
|
||||
case .NEON_IMM8_FMOV:
|
||||
v := u32(op.immediate) & 0xFF
|
||||
|
||||
@@ -265,6 +265,15 @@ Operand_Encoding :: enum u8 {
|
||||
SVE_EXT_IMM, // SVE EXT byte index: imm8h at 20:16, imm8l at 12:10
|
||||
ZA_TILE_LOW, // SME ZA accumulator tile number at bits 2:0 (ADDHA/ADDVA)
|
||||
|
||||
// ---- NEON single-structure lane index (LD1..4_LANE / ST1..4_LANE) ----
|
||||
// The lane index is split across Q (bit 30), S (bit 12) and size (bits
|
||||
// 11:10) in an element-size-dependent way; the structure-size/count opcode
|
||||
// bits stay fixed in the entry `bits`.
|
||||
NEON_LANE_B, // .B[i]: Q<<3 | S<<2 | size (i in 0..15)
|
||||
NEON_LANE_H, // .H[i]: Q<<2 | S<<1 | bit11 (i in 0..7)
|
||||
NEON_LANE_S, // .S[i]: Q<<1 | S (i in 0..3)
|
||||
NEON_LANE_D, // .D[i]: Q (i in 0..1)
|
||||
|
||||
// ---- LSE atomics ------------------------------------------------------
|
||||
ATOMIC_RS, // Rs (source / compare) at bits 16-20
|
||||
ATOMIC_RT, // Rt (target) at bits 0-4
|
||||
|
||||
@@ -1309,6 +1309,22 @@ inst_ld3r_r_m :: #force_inline proc "contextless" (dst: Regist
|
||||
emit_ld3r_r_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, mem: Memory) { append(instructions, inst_ld3r_r_m(dst, mem)) }
|
||||
inst_ld4r_r_m :: #force_inline proc "contextless" (dst: Register, mem: Memory) -> Instruction { return Instruction{mnemonic = .LD4R, operand_count = 2, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_mem(mem), {}, {}}} }
|
||||
emit_ld4r_r_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, mem: Memory) { append(instructions, inst_ld4r_r_m(dst, mem)) }
|
||||
// Builds an LD1_LANE instruction record (3 operands, length 4).
//   dst: register whose hardware number (reg_hw) becomes the vector-element operand
//   imm: lane index, wrapped with op_imm(imm, 4)
//   mem: memory operand, wrapped with op_mem
// NOTE(review): the register operand is always built with op_v_elem_b; the
// .H/.S/.D encodings of this mnemonic presumably need an operand built another
// way -- confirm against callers.
inst_ld1_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	vt   := op_v_elem_b(u8(reg_hw(dst)))
	lane := op_imm(imm, 4)
	addr := op_mem(mem)
	return Instruction{
		mnemonic      = .LD1_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {vt, lane, addr, {}},
	}
}
|
||||
// Builds an LD1_LANE instruction via inst_ld1_lane_r_i_m and appends it to
// the dynamic array pointed to by `instructions`.
emit_ld1_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_ld1_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
|
||||
// Builds an LD2_LANE instruction record (3 operands, length 4).
//   dst: register whose hardware number (reg_hw) becomes the vector-element operand
//   imm: lane index, wrapped with op_imm(imm, 4)
//   mem: memory operand, wrapped with op_mem
// NOTE(review): the register operand is always built with op_v_elem_b; the
// .H/.S/.D encodings of this mnemonic presumably need an operand built another
// way -- confirm against callers.
inst_ld2_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	vt   := op_v_elem_b(u8(reg_hw(dst)))
	lane := op_imm(imm, 4)
	addr := op_mem(mem)
	return Instruction{
		mnemonic      = .LD2_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {vt, lane, addr, {}},
	}
}
|
||||
// Builds an LD2_LANE instruction via inst_ld2_lane_r_i_m and appends it to
// the dynamic array pointed to by `instructions`.
emit_ld2_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_ld2_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
|
||||
// Builds an LD3_LANE instruction record (3 operands, length 4).
//   dst: register whose hardware number (reg_hw) becomes the vector-element operand
//   imm: lane index, wrapped with op_imm(imm, 4)
//   mem: memory operand, wrapped with op_mem
// NOTE(review): the register operand is always built with op_v_elem_b; the
// .H/.S/.D encodings of this mnemonic presumably need an operand built another
// way -- confirm against callers.
inst_ld3_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	vt   := op_v_elem_b(u8(reg_hw(dst)))
	lane := op_imm(imm, 4)
	addr := op_mem(mem)
	return Instruction{
		mnemonic      = .LD3_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {vt, lane, addr, {}},
	}
}
|
||||
// Builds an LD3_LANE instruction via inst_ld3_lane_r_i_m and appends it to
// the dynamic array pointed to by `instructions`.
emit_ld3_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_ld3_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
|
||||
// Builds an LD4_LANE instruction record (3 operands, length 4).
//   dst: register whose hardware number (reg_hw) becomes the vector-element operand
//   imm: lane index, wrapped with op_imm(imm, 4)
//   mem: memory operand, wrapped with op_mem
// NOTE(review): the register operand is always built with op_v_elem_b; the
// .H/.S/.D encodings of this mnemonic presumably need an operand built another
// way -- confirm against callers.
inst_ld4_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	vt   := op_v_elem_b(u8(reg_hw(dst)))
	lane := op_imm(imm, 4)
	addr := op_mem(mem)
	return Instruction{
		mnemonic      = .LD4_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {vt, lane, addr, {}},
	}
}
|
||||
// Builds an LD4_LANE instruction via inst_ld4_lane_r_i_m and appends it to
// the dynamic array pointed to by `instructions`.
emit_ld4_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_ld4_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
|
||||
// Builds an ST1_LANE instruction record (3 operands, length 4).
//   dst: register whose hardware number (reg_hw) becomes the vector-element
//        operand (the parameter keeps the name `dst` used by the load builders)
//   imm: lane index, wrapped with op_imm(imm, 4)
//   mem: memory operand, wrapped with op_mem
// NOTE(review): the register operand is always built with op_v_elem_b; the
// .H/.S/.D encodings of this mnemonic presumably need an operand built another
// way -- confirm against callers.
inst_st1_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	vt   := op_v_elem_b(u8(reg_hw(dst)))
	lane := op_imm(imm, 4)
	addr := op_mem(mem)
	return Instruction{
		mnemonic      = .ST1_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {vt, lane, addr, {}},
	}
}
|
||||
// Builds an ST1_LANE instruction via inst_st1_lane_r_i_m and appends it to
// the dynamic array pointed to by `instructions`.
emit_st1_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_st1_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
|
||||
// Builds an ST2_LANE instruction record (3 operands, length 4).
//   dst: register whose hardware number (reg_hw) becomes the vector-element
//        operand (the parameter keeps the name `dst` used by the load builders)
//   imm: lane index, wrapped with op_imm(imm, 4)
//   mem: memory operand, wrapped with op_mem
// NOTE(review): the register operand is always built with op_v_elem_b; the
// .H/.S/.D encodings of this mnemonic presumably need an operand built another
// way -- confirm against callers.
inst_st2_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	vt   := op_v_elem_b(u8(reg_hw(dst)))
	lane := op_imm(imm, 4)
	addr := op_mem(mem)
	return Instruction{
		mnemonic      = .ST2_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {vt, lane, addr, {}},
	}
}
|
||||
// Builds an ST2_LANE instruction via inst_st2_lane_r_i_m and appends it to
// the dynamic array pointed to by `instructions`.
emit_st2_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_st2_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
|
||||
// Builds an ST3_LANE instruction record (3 operands, length 4).
//   dst: register whose hardware number (reg_hw) becomes the vector-element
//        operand (the parameter keeps the name `dst` used by the load builders)
//   imm: lane index, wrapped with op_imm(imm, 4)
//   mem: memory operand, wrapped with op_mem
// NOTE(review): the register operand is always built with op_v_elem_b; the
// .H/.S/.D encodings of this mnemonic presumably need an operand built another
// way -- confirm against callers.
inst_st3_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	vt   := op_v_elem_b(u8(reg_hw(dst)))
	lane := op_imm(imm, 4)
	addr := op_mem(mem)
	return Instruction{
		mnemonic      = .ST3_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {vt, lane, addr, {}},
	}
}
|
||||
// Builds an ST3_LANE instruction via inst_st3_lane_r_i_m and appends it to
// the dynamic array pointed to by `instructions`.
emit_st3_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_st3_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
|
||||
// Builds an ST4_LANE instruction record (3 operands, length 4).
//   dst: register whose hardware number (reg_hw) becomes the vector-element
//        operand (the parameter keeps the name `dst` used by the load builders)
//   imm: lane index, wrapped with op_imm(imm, 4)
//   mem: memory operand, wrapped with op_mem
// NOTE(review): the register operand is always built with op_v_elem_b; the
// .H/.S/.D encodings of this mnemonic presumably need an operand built another
// way -- confirm against callers.
inst_st4_lane_r_i_m :: #force_inline proc "contextless" (dst: Register, imm: i64, mem: Memory) -> Instruction {
	vt   := op_v_elem_b(u8(reg_hw(dst)))
	lane := op_imm(imm, 4)
	addr := op_mem(mem)
	return Instruction{
		mnemonic      = .ST4_LANE,
		operand_count = 3,
		length        = 4,
		ops           = {vt, lane, addr, {}},
	}
}
|
||||
// Builds an ST4_LANE instruction via inst_st4_lane_r_i_m and appends it to
// the dynamic array pointed to by `instructions`.
emit_st4_lane_r_i_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, mem: Memory) {
	built := inst_st4_lane_r_i_m(dst, imm, mem)
	append(instructions, built)
}
|
||||
inst_ldr_v_r_m :: #force_inline proc "contextless" (dst: Register, mem: Memory) -> Instruction { return inst_ldst(.LDR_V, dst, mem) }
|
||||
emit_ldr_v_r_m :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, mem: Memory) { append(instructions, inst_ldr_v_r_m(dst, mem)) }
|
||||
inst_str_v_r_m :: #force_inline proc "contextless" (dst: Register, mem: Memory) -> Instruction { return inst_ldst(.STR_V, dst, mem) }
|
||||
@@ -3504,6 +3520,22 @@ inst_ld3r :: inst_ld3r_r_m
|
||||
emit_ld3r :: emit_ld3r_r_m
|
||||
inst_ld4r :: inst_ld4r_r_m
|
||||
emit_ld4r :: emit_ld4r_r_m
|
||||
// Short name for the (register, lane immediate, memory) LD1_LANE builder.
inst_ld1_lane :: inst_ld1_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) LD1_LANE emitter.
emit_ld1_lane :: emit_ld1_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) LD2_LANE builder.
inst_ld2_lane :: inst_ld2_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) LD2_LANE emitter.
emit_ld2_lane :: emit_ld2_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) LD3_LANE builder.
inst_ld3_lane :: inst_ld3_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) LD3_LANE emitter.
emit_ld3_lane :: emit_ld3_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) LD4_LANE builder.
inst_ld4_lane :: inst_ld4_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) LD4_LANE emitter.
emit_ld4_lane :: emit_ld4_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) ST1_LANE builder.
inst_st1_lane :: inst_st1_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) ST1_LANE emitter.
emit_st1_lane :: emit_st1_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) ST2_LANE builder.
inst_st2_lane :: inst_st2_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) ST2_LANE emitter.
emit_st2_lane :: emit_st2_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) ST3_LANE builder.
inst_st3_lane :: inst_st3_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) ST3_LANE emitter.
emit_st3_lane :: emit_st3_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) ST4_LANE builder.
inst_st4_lane :: inst_st4_lane_r_i_m
|
||||
// Short name for the (register, lane immediate, memory) ST4_LANE emitter.
emit_st4_lane :: emit_st4_lane_r_i_m
|
||||
inst_ldr_v :: inst_ldr_v_r_m
|
||||
emit_ldr_v :: emit_ldr_v_r_m
|
||||
inst_str_v :: inst_str_v_r_m
|
||||
|
||||
@@ -1406,6 +1406,58 @@ ENCODING_TABLE := #partial [Mnemonic][]Encoding{
|
||||
.LD3R = { {.LD3R, {.V_16B, .MEM, .NONE, .NONE}, {.VD, .OFFSET_BASE_A, .NONE, .NONE}, 0x4D40E000, 0xFFFFFC00, .NEON, {}} },
|
||||
.LD4R = { {.LD4R, {.V_16B, .MEM, .NONE, .NONE}, {.VD, .OFFSET_BASE_A, .NONE, .NONE}, 0x4D60E000, 0xFFFFFC00, .NEON, {}} },
|
||||
|
||||
// Single-structure (one lane) load/store: LD#_LANE / ST#_LANE. The lane
|
||||
// index is split across Q (30), S (12) and size (11:10) per element size;
|
||||
// the list length + load/store bit are fixed in the bits.
|
||||
.LD1_LANE = {
|
||||
{.LD1_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D400000, 0xBFFFE000, .NEON, {}},
|
||||
{.LD1_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D404000, 0xBFFFE400, .NEON, {}},
|
||||
{.LD1_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D408000, 0xBFFFEC00, .NEON, {}},
|
||||
{.LD1_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D408400, 0xBFFFFC00, .NEON, {}},
|
||||
},
|
||||
.LD2_LANE = {
|
||||
{.LD2_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D600000, 0xBFFFE000, .NEON, {}},
|
||||
{.LD2_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D604000, 0xBFFFE400, .NEON, {}},
|
||||
{.LD2_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D608000, 0xBFFFEC00, .NEON, {}},
|
||||
{.LD2_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D608400, 0xBFFFFC00, .NEON, {}},
|
||||
},
|
||||
.LD3_LANE = {
|
||||
{.LD3_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D402000, 0xBFFFE000, .NEON, {}},
|
||||
{.LD3_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D406000, 0xBFFFE400, .NEON, {}},
|
||||
{.LD3_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D40A000, 0xBFFFEC00, .NEON, {}},
|
||||
{.LD3_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D40A400, 0xBFFFFC00, .NEON, {}},
|
||||
},
|
||||
.LD4_LANE = {
|
||||
{.LD4_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D602000, 0xBFFFE000, .NEON, {}},
|
||||
{.LD4_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D606000, 0xBFFFE400, .NEON, {}},
|
||||
{.LD4_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D60A000, 0xBFFFEC00, .NEON, {}},
|
||||
{.LD4_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D60A400, 0xBFFFFC00, .NEON, {}},
|
||||
},
|
||||
.ST1_LANE = {
|
||||
{.ST1_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D000000, 0xBFFFE000, .NEON, {}},
|
||||
{.ST1_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D004000, 0xBFFFE400, .NEON, {}},
|
||||
{.ST1_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D008000, 0xBFFFEC00, .NEON, {}},
|
||||
{.ST1_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D008400, 0xBFFFFC00, .NEON, {}},
|
||||
},
|
||||
.ST2_LANE = {
|
||||
{.ST2_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D200000, 0xBFFFE000, .NEON, {}},
|
||||
{.ST2_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D204000, 0xBFFFE400, .NEON, {}},
|
||||
{.ST2_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D208000, 0xBFFFEC00, .NEON, {}},
|
||||
{.ST2_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D208400, 0xBFFFFC00, .NEON, {}},
|
||||
},
|
||||
.ST3_LANE = {
|
||||
{.ST3_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D002000, 0xBFFFE000, .NEON, {}},
|
||||
{.ST3_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D006000, 0xBFFFE400, .NEON, {}},
|
||||
{.ST3_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D00A000, 0xBFFFEC00, .NEON, {}},
|
||||
{.ST3_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D00A400, 0xBFFFFC00, .NEON, {}},
|
||||
},
|
||||
.ST4_LANE = {
|
||||
{.ST4_LANE, {.V_ELEM_B, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_B, .OFFSET_BASE_A, .NONE}, 0x0D202000, 0xBFFFE000, .NEON, {}},
|
||||
{.ST4_LANE, {.V_ELEM_H, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_H, .OFFSET_BASE_A, .NONE}, 0x0D206000, 0xBFFFE400, .NEON, {}},
|
||||
{.ST4_LANE, {.V_ELEM_S, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_S, .OFFSET_BASE_A, .NONE}, 0x0D20A000, 0xBFFFEC00, .NEON, {}},
|
||||
{.ST4_LANE, {.V_ELEM_D, .VEC_INDEX, .MEM, .NONE}, {.VD, .NEON_LANE_D, .OFFSET_BASE_A, .NONE}, 0x0D20A400, 0xBFFFFC00, .NEON, {}},
|
||||
},
|
||||
|
||||
// FP/SIMD scalar load/store via V regs (offset-form)
|
||||
.LDR_V = {
|
||||
{.LDR_V, {.B_REG, .MEM, .NONE, .NONE}, {.RT, .OFFSET_BASE_U12, .NONE, .NONE}, 0x3D400000, 0xFFC00000, .FP, {}},
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user