rexcode/arm64: NEON copy/permute (MOV/MVN/DUP/INS/EXT) encode forms

MOV_V (ORR alias: source feeds both Vn and Vm via a new VN_VM_DUP
encoding), MVN_V (NOT alias, plain 2-register), DUP_V (element form
Vd.T,Vn.Ts[i] and general form Vd.T,Wn/Xn), INS (element-to-element and
from-GPR), EXT_V (imm4 byte index). Adds a VEC_INDEX operand type plus
NEON_IDX5/NEON_IDX4/NEON_EXT_IDX encodings: the element-size marker rides
in the entry bits, the lane index drives the bits above it, and the
decoder recovers the element size from imm5's marker.

Element size now rides in op.size (B=1/H=2/S=4/D=8) via op_v_elem_b/h/s/d
so the matcher can disambiguate DUP/INS element forms; the builder
generator maps V_ELEM_* to those constructors. specgen derives the mask
by varying registers and each index field to its max -- the GPR-source
forms vary Vd and Rn independently (Rn 31 = wzr/xzr) so the low bit of
each field toggles. All 19 representative forms byte-exact vs llvm-mc and
decode-clean; 461 tests green. (TBL/TBX register-list forms deferred.)
This commit is contained in:
Brendan Punsky
2026-06-17 23:23:44 -04:00
committed by Flāvius
parent 5761c23ba4
commit 06eb3de6a2
14 changed files with 1214 additions and 877 deletions

View File

@@ -211,6 +211,30 @@ extract_operand_inline :: #force_inline proc "contextless" (
if en == .NEON_SHR_IMM { amt = 2 * esize - val }
return Operand{immediate = amt, kind = .IMMEDIATE, size = 1}
// ---- NEON copy/permute index fields ------------------------------------
case .VN_VM_DUP:
return Operand{reg = Register(REG_V | u16((word >> 5) & 0x1F)), kind = .REGISTER, size = 4}
case .NEON_IDX5:
// imm5 = index << (markerbit+1) | (1 << markerbit); marker = lowest set bit.
imm5 := (word >> 16) & 0x1F
mb: u32 = 0
if imm5 & 0x1 != 0 { mb = 0 }
else if imm5 & 0x2 != 0 { mb = 1 }
else if imm5 & 0x4 != 0 { mb = 2 }
else { mb = 3 }
return Operand{immediate = i64(imm5 >> (mb + 1)), kind = .IMMEDIATE, size = 1}
case .NEON_IDX4:
// imm4 = index << markerbit; recover markerbit from imm5 in the word.
imm5 := (word >> 16) & 0x1F
mb: u32 = 0
if imm5 & 0x1 != 0 { mb = 0 }
else if imm5 & 0x2 != 0 { mb = 1 }
else if imm5 & 0x4 != 0 { mb = 2 }
else { mb = 3 }
return Operand{immediate = i64(((word >> 11) & 0xF) >> mb), kind = .IMMEDIATE, size = 1}
case .NEON_EXT_IDX:
return Operand{immediate = i64((word >> 11) & 0xF), kind = .IMMEDIATE, size = 1}
// ---- Memory operand variants ------------------------------------------
case .OFFSET_BASE_U12:
size := u32(1) << ((word >> 30) & 0x3)

View File

@@ -209,8 +209,17 @@ operand_matches_inline :: #force_inline proc "contextless" (
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 64)
case .V_4H_FP16:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 24)
case .V_ELEM_B, .V_ELEM_H, .V_ELEM_S, .V_ELEM_D:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V
// Element-indexed V views: element size carried in op.size (B=1,H=2,S=4,
// D=8) so DUP/INS forms disambiguate. .S also accepts size 0 so a plain
// op_reg (as the hand-written SM3TT forms pass) still matches the .S slot.
case .V_ELEM_B:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 1
case .V_ELEM_H:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 2
case .V_ELEM_S:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 4 || op.size == 0)
case .V_ELEM_D:
return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 8
// SVE Z registers. Element size carried in op.size: B=1, H=2, S=4, D=8.
// op.size==0 (legacy / default-constructed) accepts any width.
@@ -247,7 +256,7 @@ operand_matches_inline :: #force_inline proc "contextless" (
return op.kind == .IMMEDIATE
case .IMM_12, .IMM_16, .IMM_8, .IMM_6, .IMM_5, .IMM_4, .IMM_3, .IMM_2,
.NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE, .VEC_SHIFT:
.NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE, .VEC_SHIFT, .VEC_INDEX:
return op.kind == .IMMEDIATE
case .BITMASK_IMM:
// The user passes the raw logical mask value; we validate that it
@@ -279,6 +288,21 @@ vec_esize :: #force_inline proc "contextless" (ot: Operand_Type) -> u32 {
return 8
}
// Position of the element-size marker bit for a DUP/INS form, which equals
// log2 of the element width in bytes: B=0, H=1, S=2, D=3. The first V_ELEM_*
// operand type found while scanning form.ops decides the result; a form that
// carries no V_ELEM_* operand type yields 0.
@(private="file")
vidx_markerbit :: #force_inline proc "contextless" (form: ^Encoding) -> u32 {
	for slot in form.ops {
		if slot == .V_ELEM_B { return 0 }
		if slot == .V_ELEM_H { return 1 }
		if slot == .V_ELEM_S { return 2 }
		if slot == .V_ELEM_D { return 3 }
	}
	return 0
}
pack_operand_inline :: #force_inline proc(
op: ^Operand,
enc: Operand_Encoding,
@@ -441,6 +465,19 @@ pack_operand_inline :: #force_inline proc(
esize := vec_esize(form.ops[0])
return ((esize - u32(op.immediate)) & 0x3F) << 16
// NEON copy/permute index fields (element-size marker fixed in `bits`).
case .VN_VM_DUP:
hw := u32(reg_hw(op.reg)) & 0x1F
return (hw << 5) | (hw << 16)
case .NEON_IDX5:
mb := vidx_markerbit(form)
return (u32(op.immediate) << (mb + 1)) << 16
case .NEON_IDX4:
mb := vidx_markerbit(form)
return (u32(op.immediate) << mb) << 11
case .NEON_EXT_IDX:
return (u32(op.immediate) & 0xF) << 11
// NEON MOVI/FMOV immediate split: abc at bits 18-16, defgh at bits 9-5.
case .NEON_IMM8_FMOV:
v := u32(op.immediate) & 0xFF

View File

@@ -169,6 +169,10 @@ Operand_Type :: enum u8 {
// ---- NEON shift-by-immediate amount (encoded into immh:immb together
// with the element size: left = esize+shift, right = 2*esize-shift) ----
VEC_SHIFT,
// ---- NEON element lane index (DUP/INS/EXT). The element-size marker
// lives in the entry `bits`; the operand drives only the index bits. ----
VEC_INDEX,
}
// Where each operand's bits land in the 32-bit word.
@@ -233,6 +237,15 @@ Operand_Encoding :: enum u8 {
NEON_SHL_IMM,
NEON_SHR_IMM,
// ---- NEON copy/permute index fields ----
// The element-size marker bit lives in the entry `bits`; the lane index
// operand drives the bits above it (DUP/INS imm5, INS imm4) or the plain
// imm4 (EXT). The decoder recovers the element size from imm5's marker.
VN_VM_DUP, // one V reg packed into BOTH Vn (9:5) and Vm (20:16) (MOV = ORR alias)
NEON_IDX5, // element lane index in imm5 (20:16); index << (markerbit+1)
NEON_IDX4, // INS source lane index in imm4 (14:11); index << markerbit
NEON_EXT_IDX, // EXT byte index in imm4 (14:11)
// ---- LSE atomics ------------------------------------------------------
ATOMIC_RS, // Rs (source / compare) at bits 16-20
ATOMIC_RT, // Rt (target) at bits 0-4

View File

@@ -647,13 +647,13 @@ inst_sm3partw2_r_r_r :: #force_inline proc "contextless" (dst: Regist
emit_sm3partw2_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3partw2_r_r_r(dst, src, src2)) }
inst_sm3ss1_r_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register, src3: Register) -> Instruction { return Instruction{mnemonic = .SM3SS1, operand_count = 4, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_4s(u8(reg_hw(src2))), op_v_4s(u8(reg_hw(src3)))}} }
emit_sm3ss1_r_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register, src3: Register) { append(instructions, inst_sm3ss1_r_r_r_r(dst, src, src2, src3)) }
inst_sm3tt1a_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT1A, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_reg(src2), {}}} }
inst_sm3tt1a_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT1A, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_elem_s(u8(reg_hw(src2))), {}}} }
emit_sm3tt1a_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3tt1a_r_r_r(dst, src, src2)) }
inst_sm3tt1b_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT1B, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_reg(src2), {}}} }
inst_sm3tt1b_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT1B, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_elem_s(u8(reg_hw(src2))), {}}} }
emit_sm3tt1b_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3tt1b_r_r_r(dst, src, src2)) }
inst_sm3tt2a_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT2A, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_reg(src2), {}}} }
inst_sm3tt2a_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT2A, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_elem_s(u8(reg_hw(src2))), {}}} }
emit_sm3tt2a_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3tt2a_r_r_r(dst, src, src2)) }
inst_sm3tt2b_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT2B, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_reg(src2), {}}} }
inst_sm3tt2b_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT2B, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_elem_s(u8(reg_hw(src2))), {}}} }
emit_sm3tt2b_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3tt2b_r_r_r(dst, src, src2)) }
inst_sm4e_r_r :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .SM4E, operand_count = 2, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), {}, {}}} }
emit_sm4e_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_sm4e_r_r(dst, src)) }
@@ -1067,6 +1067,8 @@ inst_bic_v_r_r_r :: #force_inline proc "contextless" (dst: Regist
emit_bic_v_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_bic_v_r_r_r(dst, src, src2)) }
inst_orn_v_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .ORN_V, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_16b(u8(reg_hw(src))), op_v_16b(u8(reg_hw(src2))), {}}} }
emit_orn_v_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_orn_v_r_r_r(dst, src, src2)) }
inst_mvn_v_r_r :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .MVN_V, operand_count = 2, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), {}, {}}} }
emit_mvn_v_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_mvn_v_r_r(dst, src)) }
inst_bit_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .BIT, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_16b(u8(reg_hw(src))), op_v_16b(u8(reg_hw(src2))), {}}} }
emit_bit_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_bit_r_r_r(dst, src, src2)) }
inst_bif_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .BIF, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_16b(u8(reg_hw(src))), op_v_16b(u8(reg_hw(src2))), {}}} }
@@ -1155,6 +1157,18 @@ inst_sqrshrun_r_r_i :: #force_inline proc "contextless" (dst: Regist
emit_sqrshrun_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sqrshrun_r_r_i(dst, src, imm)) }
inst_sqrshrun2_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SQRSHRUN2, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_8h(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_sqrshrun2_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sqrshrun2_r_r_i(dst, src, imm)) }
// DUP_V builders. Both wrap `dst` with op_v_8b. The three-operand variant wraps
// `src` with op_v_elem_b and appends `imm` as an immediate operand; the
// two-operand variant passes `src` through op_reg unchanged.
// NOTE(review): only the 8B / B-element constructors are used here, so these
// helpers presumably reach only the .8B table forms -- confirm against the generator.
inst_dup_v_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .DUP_V, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_elem_b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
inst_dup_v_r_r :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .DUP_V, operand_count = 2, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_reg(src), {}, {}}} }
// emit_* variants append the instruction built by the matching inst_* proc to `instructions`.
emit_dup_v_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_dup_v_r_r_i(dst, src, imm)) }
emit_dup_v_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_dup_v_r_r(dst, src)) }
// INS builders. `dst` is wrapped with op_v_elem_b and followed by the immediate
// `imm`. The four-operand variant then adds `src` (op_v_elem_b) and the
// immediate `imm2`; the three-operand variant adds `src` via op_reg.
inst_ins_r_i_r_i :: #force_inline proc "contextless" (dst: Register, imm: i64, src: Register, imm2: i64) -> Instruction { return Instruction{mnemonic = .INS, operand_count = 4, length = 4, ops = {op_v_elem_b(u8(reg_hw(dst))), op_imm(imm, 4), op_v_elem_b(u8(reg_hw(src))), op_imm(imm2, 4)}} }
inst_ins_r_i_r :: #force_inline proc "contextless" (dst: Register, imm: i64, src: Register) -> Instruction { return Instruction{mnemonic = .INS, operand_count = 3, length = 4, ops = {op_v_elem_b(u8(reg_hw(dst))), op_imm(imm, 4), op_reg(src), {}}} }
emit_ins_r_i_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, src: Register, imm2: i64) { append(instructions, inst_ins_r_i_r_i(dst, imm, src, imm2)) }
emit_ins_r_i_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, src: Register) { append(instructions, inst_ins_r_i_r(dst, imm, src)) }
// MOV_V builder: two operands, both wrapped with op_v_8b.
inst_mov_v_r_r :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .MOV_V, operand_count = 2, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), {}, {}}} }
emit_mov_v_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_mov_v_r_r(dst, src)) }
// EXT_V builder: three registers wrapped with op_v_8b, then the immediate `imm`.
inst_ext_v_r_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .EXT_V, operand_count = 4, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), op_imm(imm, 4)}} }
emit_ext_v_r_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register, imm: i64) { append(instructions, inst_ext_v_r_r_r_i(dst, src, src2, imm)) }
inst_zip1_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .ZIP1, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), {}}} }
emit_zip1_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_zip1_r_r_r(dst, src, src2)) }
inst_zip2_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .ZIP2, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), {}}} }
@@ -3040,6 +3054,8 @@ inst_bic_v :: inst_bic_v_r_r_r
emit_bic_v :: emit_bic_v_r_r_r
inst_orn_v :: inst_orn_v_r_r_r
emit_orn_v :: emit_orn_v_r_r_r
inst_mvn_v :: inst_mvn_v_r_r
emit_mvn_v :: emit_mvn_v_r_r
inst_bit :: inst_bit_r_r_r
emit_bit :: emit_bit_r_r_r
inst_bif :: inst_bif_r_r_r
@@ -3128,6 +3144,14 @@ inst_sqrshrun :: inst_sqrshrun_r_r_i
emit_sqrshrun :: emit_sqrshrun_r_r_i
inst_sqrshrun2 :: inst_sqrshrun2_r_r_i
emit_sqrshrun2 :: emit_sqrshrun2_r_r_i
inst_dup_v :: proc{ inst_dup_v_r_r_i, inst_dup_v_r_r }
emit_dup_v :: proc{ emit_dup_v_r_r_i, emit_dup_v_r_r }
inst_ins :: proc{ inst_ins_r_i_r_i, inst_ins_r_i_r }
emit_ins :: proc{ emit_ins_r_i_r_i, emit_ins_r_i_r }
inst_mov_v :: inst_mov_v_r_r
emit_mov_v :: emit_mov_v_r_r
inst_ext_v :: inst_ext_v_r_r_r_i
emit_ext_v :: emit_ext_v_r_r_r_i
inst_zip1 :: inst_zip1_r_r_r
emit_zip1 :: emit_zip1_r_r_r
inst_zip2 :: inst_zip2_r_r_r

View File

@@ -210,6 +210,26 @@ op_v_2d :: #force_inline proc "contextless" (n: u8) -> Operand {
return Operand{reg = Register(REG_V | u16(n & 0x1F)), kind = .REGISTER, size = 64}
}
// Element-indexed V views (V0.B[i]/.H[i]/.S[i]/.D[i]). The element size rides
// in op.size (1/2/4/8) so the matcher can disambiguate DUP/INS forms; the lane
// index is a separate immediate operand.
// Byte-element view of V register `n` (only the low 5 bits of `n` are used).
// The operand is tagged with size = 1 to mark the B element width.
@(require_results)
op_v_elem_b :: #force_inline proc "contextless" (n: u8) -> Operand {
	hw := u16(n & 0x1F)
	return Operand{kind = .REGISTER, reg = Register(REG_V | hw), size = 1}
}
// Halfword-element view of V register `n` (only the low 5 bits of `n` are used).
// The operand is tagged with size = 2 to mark the H element width.
@(require_results)
op_v_elem_h :: #force_inline proc "contextless" (n: u8) -> Operand {
	hw := u16(n & 0x1F)
	return Operand{kind = .REGISTER, reg = Register(REG_V | hw), size = 2}
}
// Word-element view of V register `n` (only the low 5 bits of `n` are used).
// The operand is tagged with size = 4 to mark the S element width.
@(require_results)
op_v_elem_s :: #force_inline proc "contextless" (n: u8) -> Operand {
	hw := u16(n & 0x1F)
	return Operand{kind = .REGISTER, reg = Register(REG_V | hw), size = 4}
}
// Doubleword-element view of V register `n` (only the low 5 bits of `n` are used).
// The operand is tagged with size = 8 to mark the D element width.
@(require_results)
op_v_elem_d :: #force_inline proc "contextless" (n: u8) -> Operand {
	hw := u16(n & 0x1F)
	return Operand{kind = .REGISTER, reg = Register(REG_V | hw), size = 8}
}
// -----------------------------------------------------------------------------
// Memory constructors (one per addressing mode)
// -----------------------------------------------------------------------------

View File

@@ -4340,5 +4340,45 @@ ENCODING_TABLE := #partial [Mnemonic][]Encoding{
{.SQRSHRUN2, {.V_8H, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F108C00, 0xFFF0FC00, .NEON, {}},
{.SQRSHRUN2, {.V_4S, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F208C00, 0xFFE0FC00, .NEON, {}},
},
// Advanced SIMD copy / permute (MOV/MVN/DUP/INS/EXT).
.MOV_V = {
{.MOV_V, {.V_8B, .V_8B, .NONE, .NONE}, {.VD, .VN_VM_DUP, .NONE, .NONE}, 0x0EA01C00, 0xFFE0FC00, .NEON, {}},
{.MOV_V, {.V_16B, .V_16B, .NONE, .NONE}, {.VD, .VN_VM_DUP, .NONE, .NONE}, 0x4EA01C00, 0xFFE0FC00, .NEON, {}},
},
.MVN_V = {
{.MVN_V, {.V_8B, .V_8B, .NONE, .NONE}, {.VD, .VN, .NONE, .NONE}, 0x2E205800, 0xFFFFFC00, .NEON, {}},
{.MVN_V, {.V_16B, .V_16B, .NONE, .NONE}, {.VD, .VN, .NONE, .NONE}, 0x6E205800, 0xFFFFFC00, .NEON, {}},
},
.DUP_V = {
{.DUP_V, {.V_8B, .V_ELEM_B, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x0E010400, 0xFFE1FC00, .NEON, {}},
{.DUP_V, {.V_16B, .V_ELEM_B, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x4E010400, 0xFFE1FC00, .NEON, {}},
{.DUP_V, {.V_4H, .V_ELEM_H, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x0E020400, 0xFFE3FC00, .NEON, {}},
{.DUP_V, {.V_8H, .V_ELEM_H, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x4E020400, 0xFFE3FC00, .NEON, {}},
{.DUP_V, {.V_2S, .V_ELEM_S, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x0E040400, 0xFFE7FC00, .NEON, {}},
{.DUP_V, {.V_4S, .V_ELEM_S, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x4E040400, 0xFFE7FC00, .NEON, {}},
{.DUP_V, {.V_2D, .V_ELEM_D, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x4E080400, 0xFFEFFC00, .NEON, {}},
{.DUP_V, {.V_8B, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x0E010C00, 0xFFFFFC00, .NEON, {}},
{.DUP_V, {.V_16B, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x4E010C00, 0xFFFFFC00, .NEON, {}},
{.DUP_V, {.V_4H, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x0E020C00, 0xFFFFFC00, .NEON, {}},
{.DUP_V, {.V_8H, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x4E020C00, 0xFFFFFC00, .NEON, {}},
{.DUP_V, {.V_2S, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x0E040C00, 0xFFFFFC00, .NEON, {}},
{.DUP_V, {.V_4S, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x4E040C00, 0xFFFFFC00, .NEON, {}},
{.DUP_V, {.V_2D, .X_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x4E080C00, 0xFFFFFC00, .NEON, {}},
},
.INS = {
{.INS, {.V_ELEM_B, .VEC_INDEX, .V_ELEM_B, .VEC_INDEX}, {.VD, .NEON_IDX5, .VN, .NEON_IDX4}, 0x6E010400, 0xFFE18400, .NEON, {}},
{.INS, {.V_ELEM_H, .VEC_INDEX, .V_ELEM_H, .VEC_INDEX}, {.VD, .NEON_IDX5, .VN, .NEON_IDX4}, 0x6E020400, 0xFFE38C00, .NEON, {}},
{.INS, {.V_ELEM_S, .VEC_INDEX, .V_ELEM_S, .VEC_INDEX}, {.VD, .NEON_IDX5, .VN, .NEON_IDX4}, 0x6E040400, 0xFFE79C00, .NEON, {}},
{.INS, {.V_ELEM_D, .VEC_INDEX, .V_ELEM_D, .VEC_INDEX}, {.VD, .NEON_IDX5, .VN, .NEON_IDX4}, 0x6E080400, 0xFFEFBC00, .NEON, {}},
{.INS, {.V_ELEM_B, .VEC_INDEX, .W_REG, .NONE}, {.VD, .NEON_IDX5, .RN, .NONE}, 0x4E011C00, 0xFFE1FC00, .NEON, {}},
{.INS, {.V_ELEM_H, .VEC_INDEX, .W_REG, .NONE}, {.VD, .NEON_IDX5, .RN, .NONE}, 0x4E021C00, 0xFFE3FC00, .NEON, {}},
{.INS, {.V_ELEM_S, .VEC_INDEX, .W_REG, .NONE}, {.VD, .NEON_IDX5, .RN, .NONE}, 0x4E041C00, 0xFFE7FC00, .NEON, {}},
{.INS, {.V_ELEM_D, .VEC_INDEX, .X_REG, .NONE}, {.VD, .NEON_IDX5, .RN, .NONE}, 0x4E081C00, 0xFFEFFC00, .NEON, {}},
},
.EXT_V = {
{.EXT_V, {.V_8B, .V_8B, .V_8B, .VEC_INDEX}, {.VD, .VN, .VM, .NEON_EXT_IDX}, 0x2E000000, 0xFFE0C400, .NEON, {}},
{.EXT_V, {.V_16B, .V_16B, .V_16B, .VEC_INDEX}, {.VD, .VN, .VM, .NEON_EXT_IDX}, 0x6E000000, 0xFFE08400, .NEON, {}},
},
// SPECGEN:END
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -347,6 +347,120 @@ do
sections[#sections+1] = "\t// Advanced SIMD shift by immediate.\n" .. table.concat(blk, "\n")
end
-- ---- NEON copy / permute (MOV/MVN/DUP/INS/EXT) -----------------------------
-- These carry a lane index in imm5/imm4 whose element-size marker is fixed in
-- `bits`; the operand drives the bits above it. mask is derived by varying the
-- registers AND each index field to its maximum (mask = ~union of the deltas).
--- Compute the fixed-bit mask of an encoding.
-- Every bit that differs between `base` and any word in `variants` is treated
-- as operand-driven; the result has a 1 in every bit that never changed.
-- @param base      encoding word with all operands at their baseline
-- @param variants  sequence of encoding words with fields pushed to other values
-- @return 32-bit mask (as produced by the `bit` library) of the invariant bits
local function mask_of(base, variants)
	local varying = 0
	local i = 1
	-- Walk the sequence part only, stopping at the first nil like ipairs does.
	while variants[i] ~= nil do
		varying = bit.bor(varying, bit.bxor(variants[i], base))
		i = i + 1
	end
	return bit.band(bit.bnot(varying), 0xFFFFFFFF)
end
-- Per-element-size metadata for the DUP/INS generators, keyed by size letter:
--   tok    : lane suffix spliced into the asm text (as in "v0.b[3]")
--   dst    : destination arrangements the DUP element form is generated for
--   idxmax : largest lane index assembled when deriving the index-field mask
--   vt     : name of the V_ELEM_* operand type written into the emitted row
local ELEM = {
B = {tok="b", dst={"8B","16B"}, idxmax=15, vt="V_ELEM_B"},
H = {tok="h", dst={"4H","8H"}, idxmax=7, vt="V_ELEM_H"},
S = {tok="s", dst={"2S","4S"}, idxmax=3, vt="V_ELEM_S"},
D = {tok="d", dst={"2D"}, idxmax=1, vt="V_ELEM_D"},
}
-- Builds the table rows for MOV_V, MVN_V, DUP_V, INS and EXT_V and appends them
-- to `sections` as one block. For every form the baseline word b0 is taken from
-- word(<asm with all operands at 0>); the mask comes from mask_of(b0, variants),
-- where each variant pushes registers or one index field to its maximum.
-- NOTE(review): `word` is defined outside this view; it presumably assembles the
-- text and returns the 32-bit encoding, or a falsy value on failure -- confirm.
-- A form is skipped without any message when one of its word(...) calls is falsy.
do
local blk = {}
-- Wrap a mnemonic's rows into a `.MNEM = { ... },` entry; empty row lists emit
-- nothing and do not count towards n_mnem.
local function emit_rows(mnem, rows)
if #rows > 0 then blk[#blk+1] = string.format("\t.%s = {\n%s\n\t},", mnem, table.concat(rows, "\n")); n_mnem = n_mnem + 1 end
end
-- Format one table row (operand types, encodings, bits, mask) and count it in n_forms.
local function mkrow(mnem, ops, enc, b0, variants)
n_forms = n_forms + 1
return string.format("\t\t{.%s, %s, %s, 0x%s, 0x%s, .NEON, {}},",
mnem, ops, enc, bit.tohex(b0):upper(), bit.tohex(mask_of(b0, variants)):upper())
end
-- MOV Vd.T, Vn.T (= ORR Vd,Vn,Vn): the single source feeds both Vn and Vm.
-- mk(x) uses register x for both operands, so mk(31) sets every register field at once.
local mov = {}
for _, a in ipairs({"8B","16B"}) do
local function mk(x) return string.format("mov v%d.%s, v%d.%s", x, ARR[a].asm, x, ARR[a].asm) end
local b0, b31 = word(mk(0)), word(mk(31))
if b0 and b31 then mov[#mov+1] = mkrow("MOV_V",
string.format("{.%s, .%s, .NONE, .NONE}", ARR[a].vt, ARR[a].vt),
"{.VD, .VN_VM_DUP, .NONE, .NONE}", b0, {b31}) end
end
emit_rows("MOV_V", mov)
-- MVN Vd.T, Vn.T (= NOT alias): plain two-register.
local mvn = {}
for _, a in ipairs({"8B","16B"}) do
local function mk(x) return string.format("mvn v%d.%s, v%d.%s", x, ARR[a].asm, x, ARR[a].asm) end
local b0, b31 = word(mk(0)), word(mk(31))
if b0 and b31 then mvn[#mvn+1] = mkrow("MVN_V",
string.format("{.%s, .%s, .NONE, .NONE}", ARR[a].vt, ARR[a].vt),
"{.VD, .VN, .NONE, .NONE}", b0, {b31}) end
end
emit_rows("MVN_V", mvn)
-- DUP: element form (Vd.T, Vn.Ts[i]) + general form (Vd.T, Wn/Xn).
-- Element form: one row per (element size, destination arrangement) pair;
-- variants are registers=31 (br) and lane index=idxmax (bi).
local dup = {}
for _, ek in ipairs({"B","H","S","D"}) do
local e = ELEM[ek]
for _, a in ipairs(e.dst) do
local function mk(r,i) return string.format("dup v%d.%s, v%d.%s[%d]", r, ARR[a].asm, r, e.tok, i) end
local b0, br, bi = word(mk(0,0)), word(mk(31,0)), word(mk(0,e.idxmax))
if b0 and br and bi then dup[#dup+1] = mkrow("DUP_V",
string.format("{.%s, .%s, .VEC_INDEX, .NONE}", ARR[a].vt, e.vt),
"{.VD, .VN, .NEON_IDX5, .NONE}", b0, {br, bi}) end
end
end
-- General form: each entry is {destination arrangement, GPR prefix, GPR operand type}.
for _, g in ipairs({{"8B","w","W_REG"},{"16B","w","W_REG"},{"4H","w","W_REG"},{"8H","w","W_REG"},{"2S","w","W_REG"},{"4S","w","W_REG"},{"2D","x","X_REG"}}) do
-- Vary Vd and Rn independently to 31 (Rn 31 is the zero register, asm
-- "wzr"/"xzr") so the low bit of each register field toggles.
local zr = (g[2] == "x") and "xzr" or "wzr"
local function mk(vr, gs) return string.format("dup v%d.%s, %s", vr, ARR[g[1]].asm, gs) end
local b0, bV, bR = word(mk(0, g[2].."0")), word(mk(31, g[2].."0")), word(mk(0, zr))
if b0 and bV and bR then dup[#dup+1] = mkrow("DUP_V",
string.format("{.%s, .%s, .NONE, .NONE}", ARR[g[1]].vt, g[3]),
"{.VD, .RN, .NONE, .NONE}", b0, {bV, bR}) end
end
emit_rows("DUP_V", dup)
-- INS: element form (Vd.Ts[i], Vn.Ts[j]) + general form (Vd.Ts[i], Wn/Xn).
-- Element form variants: registers=31 (br), destination index=idxmax (bi),
-- source index=idxmax (bj).
local ins = {}
for _, ek in ipairs({"B","H","S","D"}) do
local e = ELEM[ek]
local function mk(r,i,j) return string.format("ins v%d.%s[%d], v%d.%s[%d]", r, e.tok, i, r, e.tok, j) end
local b0, br, bi, bj = word(mk(0,0,0)), word(mk(31,0,0)), word(mk(0,e.idxmax,0)), word(mk(0,0,e.idxmax))
if b0 and br and bi and bj then ins[#ins+1] = mkrow("INS",
string.format("{.%s, .VEC_INDEX, .%s, .VEC_INDEX}", e.vt, e.vt),
"{.VD, .NEON_IDX5, .VN, .NEON_IDX4}", b0, {br, bi, bj}) end
end
-- General form: D elements take an X register source, all other sizes a W register.
for _, ek in ipairs({"B","H","S","D"}) do
local e = ELEM[ek]
local gpr = (ek == "D") and "x" or "w"
local gvt = (ek == "D") and "X_REG" or "W_REG"
local zr = (ek == "D") and "xzr" or "wzr"
local function mk(vr, gs, i) return string.format("ins v%d.%s[%d], %s", vr, e.tok, i, gs) end
local b0 = word(mk(0, gpr.."0", 0))
local bV = word(mk(31, gpr.."0", 0)) -- Vd field
local bR = word(mk(0, zr, 0)) -- Rn field (zero register)
local bi = word(mk(0, gpr.."0", e.idxmax))
if b0 and bV and bR and bi then ins[#ins+1] = mkrow("INS",
string.format("{.%s, .VEC_INDEX, .%s, .NONE}", e.vt, gvt),
"{.VD, .NEON_IDX5, .RN, .NONE}", b0, {bV, bR, bi}) end
end
emit_rows("INS", ins)
-- EXT Vd.T, Vn.T, Vm.T, #index (imm4 byte index; .8b idx 0..7, .16b 0..15).
-- Each entry is {arrangement, max byte index}; mk(r, i) uses register r for all three operands.
local ext = {}
for _, ai in ipairs({{"8B",7},{"16B",15}}) do
local a, imax = ai[1], ai[2]
local function mk(r,i) return string.format("ext v%d.%s, v%d.%s, v%d.%s, #%d", r, ARR[a].asm, r, ARR[a].asm, r, ARR[a].asm, i) end
local b0, br, bi = word(mk(0,0)), word(mk(31,0)), word(mk(0,imax))
if b0 and br and bi then ext[#ext+1] = mkrow("EXT_V",
string.format("{.%s, .%s, .%s, .VEC_INDEX}", ARR[a].vt, ARR[a].vt, ARR[a].vt),
"{.VD, .VN, .VM, .NEON_EXT_IDX}", b0, {br, bi}) end
end
emit_rows("EXT_V", ext)
-- Publish all mnemonic entries collected above as a single commented section.
sections[#sections+1] = "\t// Advanced SIMD copy / permute (MOV/MVN/DUP/INS/EXT).\n" .. table.concat(blk, "\n")
end
-- ---- splice into the SoT ---------------------------------------------------
local region = "\t// SPECGEN:BEGIN\n" .. table.concat(sections, "\n\n") .. "\n\t// SPECGEN:END"
local fh = assert(io.open(TABLE, "r")); local src = fh:read("*a"); fh:close()

View File

@@ -315,6 +315,10 @@ write_operand_expr :: proc(sb: ^strings.Builder, t: a.Operand_Type, names: [3]st
case .V_4S: fmt.sbprintf(sb, "op_v_4s(u8(reg_hw(%s)))", names[0])
case .V_1D: fmt.sbprintf(sb, "op_v_1d(u8(reg_hw(%s)))", names[0])
case .V_2D: fmt.sbprintf(sb, "op_v_2d(u8(reg_hw(%s)))", names[0])
case .V_ELEM_B: fmt.sbprintf(sb, "op_v_elem_b(u8(reg_hw(%s)))", names[0])
case .V_ELEM_H: fmt.sbprintf(sb, "op_v_elem_h(u8(reg_hw(%s)))", names[0])
case .V_ELEM_S: fmt.sbprintf(sb, "op_v_elem_s(u8(reg_hw(%s)))", names[0])
case .V_ELEM_D: fmt.sbprintf(sb, "op_v_elem_d(u8(reg_hw(%s)))", names[0])
case: fmt.sbprintf(sb, "op_reg(%s)", names[0])
}
case .ZREG: