rexcode/arm64: NEON copy/permute (MOV/MVN/DUP/INS/EXT) encode forms

MOV_V (ORR alias: source feeds both Vn and Vm via a new VN_VM_DUP encoding), MVN_V (NOT alias, plain 2-register), DUP_V (element form Vd.T,Vn.Ts[i] and general form Vd.T,Wn/Xn), INS (element-to-element and from-GPR), EXT_V (imm4 byte index). Adds a VEC_INDEX operand type plus NEON_IDX5/NEON_IDX4/NEON_EXT_IDX encodings: the element-size marker rides in the entry bits, the lane index drives the bits above it, and the decoder recovers the element size from imm5's marker. Element size now rides in op.size (B=1/H=2/S=4/D=8) via op_v_elem_b/h/s/d so the matcher can disambiguate DUP/INS element forms; the builder generator maps V_ELEM_* to those constructors. specgen derives the mask by varying registers and each index field to its max -- the GPR-source forms vary Vd and Rn independently (Rn 31 = wzr/xzr) so the low bit of each field toggles. All 19 representative forms byte-exact vs llvm-mc and decode-clean; 461 tests green. (TBL/TBX register-list forms deferred.)
2026-06-19 08:32:33 +00:00 · 2026-06-17 23:23:44 -04:00
parent 5761c23ba4
commit 06eb3de6a2
14 changed files with 1214 additions and 877 deletions
--- a/core/rexcode/arm64/decoder.odin
+++ b/core/rexcode/arm64/decoder.odin
@@ -211,6 +211,30 @@ extract_operand_inline :: #force_inline proc "contextless" (
 		if en == .NEON_SHR_IMM { amt = 2 * esize - val }
 		return Operand{immediate = amt, kind = .IMMEDIATE, size = 1}

+	// ---- NEON copy/permute index fields ------------------------------------
+	case .VN_VM_DUP:
+		return Operand{reg = Register(REG_V | u16((word >> 5) & 0x1F)), kind = .REGISTER, size = 4}
+	case .NEON_IDX5:
+		// imm5 = index << (markerbit+1) | (1 << markerbit); marker = lowest set bit.
+		imm5 := (word >> 16) & 0x1F
+		mb: u32 = 0
+		if      imm5 & 0x1 != 0 { mb = 0 }
+		else if imm5 & 0x2 != 0 { mb = 1 }
+		else if imm5 & 0x4 != 0 { mb = 2 }
+		else                    { mb = 3 }
+		return Operand{immediate = i64(imm5 >> (mb + 1)), kind = .IMMEDIATE, size = 1}
+	case .NEON_IDX4:
+		// imm4 = index << markerbit; recover markerbit from imm5 in the word.
+		imm5 := (word >> 16) & 0x1F
+		mb: u32 = 0
+		if      imm5 & 0x1 != 0 { mb = 0 }
+		else if imm5 & 0x2 != 0 { mb = 1 }
+		else if imm5 & 0x4 != 0 { mb = 2 }
+		else                    { mb = 3 }
+		return Operand{immediate = i64(((word >> 11) & 0xF) >> mb), kind = .IMMEDIATE, size = 1}
+	case .NEON_EXT_IDX:
+		return Operand{immediate = i64((word >> 11) & 0xF), kind = .IMMEDIATE, size = 1}
+
 	// ---- Memory operand variants ------------------------------------------
 	case .OFFSET_BASE_U12:
 		size := u32(1) << ((word >> 30) & 0x3)
--- a/core/rexcode/arm64/encoder.odin
+++ b/core/rexcode/arm64/encoder.odin
@@ -209,8 +209,17 @@ operand_matches_inline :: #force_inline proc "contextless" (
 		return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 64)
 	case .V_4H_FP16:
 		return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 0 || op.size == 24)
-	case .V_ELEM_B, .V_ELEM_H, .V_ELEM_S, .V_ELEM_D:
-		return op.kind == .REGISTER && reg_class(op.reg) == REG_V
+	// Element-indexed V views: element size carried in op.size (B=1,H=2,S=4,
+	// D=8) so DUP/INS forms disambiguate. .S also accepts size 0 so a plain
+	// op_reg (as the hand-written SM3TT forms pass) still matches the .S slot.
+	case .V_ELEM_B:
+		return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 1
+	case .V_ELEM_H:
+		return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 2
+	case .V_ELEM_S:
+		return op.kind == .REGISTER && reg_class(op.reg) == REG_V && (op.size == 4 || op.size == 0)
+	case .V_ELEM_D:
+		return op.kind == .REGISTER && reg_class(op.reg) == REG_V && op.size == 8

 	// SVE Z registers. Element size carried in op.size: B=1, H=2, S=4, D=8.
 	// op.size==0 (legacy / default-constructed) accepts any width.
@@ -247,7 +256,7 @@ operand_matches_inline :: #force_inline proc "contextless" (
 		return op.kind == .IMMEDIATE

 	case .IMM_12, .IMM_16, .IMM_8, .IMM_6, .IMM_5, .IMM_4, .IMM_3, .IMM_2,
-		 .NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE, .VEC_SHIFT:
+		 .NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE, .VEC_SHIFT, .VEC_INDEX:
 		return op.kind == .IMMEDIATE
 	case .BITMASK_IMM:
 		// The user passes the raw logical mask value; we validate that it
@@ -279,6 +288,21 @@ vec_esize :: #force_inline proc "contextless" (ot: Operand_Type) -> u32 {
 	return 8
 }

+// Position of the element-size marker bit for a DUP/INS lane index, taken from
+// the first V_ELEM_* operand type the form lists: B=0, H=1, S=2, D=3.
+@(private="file")
+vidx_markerbit :: #force_inline proc "contextless" (form: ^Encoding) -> u32 {
+	for slot in form.ops {
+		#partial switch slot {
+		case .V_ELEM_D: return 3
+		case .V_ELEM_S: return 2
+		case .V_ELEM_H: return 1
+		case .V_ELEM_B: return 0
+		}
+	}
+	return 0 // form lists no V_ELEM_* operand: default to 0 (same value as B)
+}
+
 pack_operand_inline :: #force_inline proc(
 	op:       ^Operand,
 	enc:      Operand_Encoding,
@@ -441,6 +465,19 @@ pack_operand_inline :: #force_inline proc(
 		esize := vec_esize(form.ops[0])
 		return ((esize - u32(op.immediate)) & 0x3F) << 16

+	// NEON copy/permute index fields (element-size marker fixed in `bits`); each index is masked to its field width.
+	case .VN_VM_DUP:
+		hw := u32(reg_hw(op.reg)) & 0x1F
+		return (hw << 5) | (hw << 16) // same register number in Vn (9:5) and Vm (20:16)
+	case .NEON_IDX5:
+		mb := vidx_markerbit(form)
+		return ((u32(op.immediate) << (mb + 1)) & 0x1F) << 16 // keep within imm5 (20:16): an oversized index must not reach bit 21+
+	case .NEON_IDX4:
+		mb := vidx_markerbit(form)
+		return ((u32(op.immediate) << mb) & 0xF) << 11 // keep within imm4 (14:11): an oversized index must not reach bit 15 or imm5
+	case .NEON_EXT_IDX:
+		return (u32(op.immediate) & 0xF) << 11 // plain byte index in imm4 (14:11), no marker bit
+
 	// NEON MOVI/FMOV immediate split: abc at bits 18-16, defgh at bits 9-5.
 	case .NEON_IMM8_FMOV:
 		v := u32(op.immediate) & 0xFF
--- a/core/rexcode/arm64/encoding_types.odin
+++ b/core/rexcode/arm64/encoding_types.odin
@@ -169,6 +169,10 @@ Operand_Type :: enum u8 {
 	// ---- NEON shift-by-immediate amount (encoded into immh:immb together
 	//      with the element size: left = esize+shift, right = 2*esize-shift) ----
 	VEC_SHIFT,
+
+	// ---- NEON element lane index (DUP/INS/EXT). The element-size marker
+	//      lives in the entry `bits`; the operand drives only the index bits. ----
+	VEC_INDEX,
 }

 // Where each operand's bits land in the 32-bit word.
@@ -233,6 +237,15 @@ Operand_Encoding :: enum u8 {
 	NEON_SHL_IMM,
 	NEON_SHR_IMM,

+	// ---- NEON copy/permute index fields ----
+	// The element-size marker bit lives in the entry `bits`; the lane index
+	// operand drives the bits above it (DUP/INS imm5, INS imm4) or the plain
+	// imm4 (EXT). The decoder recovers the element size from imm5's marker.
+	VN_VM_DUP,        // one V reg packed into BOTH Vn (9:5) and Vm (20:16) (MOV = ORR alias)
+	NEON_IDX5,        // element lane index in imm5 (20:16); index << (markerbit+1)
+	NEON_IDX4,        // INS source lane index in imm4 (14:11); index << markerbit
+	NEON_EXT_IDX,     // EXT byte index in imm4 (14:11)
+
 	// ---- LSE atomics ------------------------------------------------------
 	ATOMIC_RS,            // Rs (source / compare) at bits 16-20
 	ATOMIC_RT,            // Rt (target) at bits 0-4
--- a/core/rexcode/arm64/mnemonic_builders.odin
+++ b/core/rexcode/arm64/mnemonic_builders.odin
@@ -647,13 +647,13 @@ inst_sm3partw2_r_r_r            :: #force_inline proc "contextless" (dst: Regist
 emit_sm3partw2_r_r_r            :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3partw2_r_r_r(dst, src, src2)) }
 inst_sm3ss1_r_r_r_r             :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register, src3: Register) -> Instruction { return Instruction{mnemonic = .SM3SS1, operand_count = 4, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_4s(u8(reg_hw(src2))), op_v_4s(u8(reg_hw(src3)))}} }
 emit_sm3ss1_r_r_r_r             :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register, src3: Register) { append(instructions, inst_sm3ss1_r_r_r_r(dst, src, src2, src3)) }
-inst_sm3tt1a_r_r_r              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT1A, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_reg(src2), {}}} }
+inst_sm3tt1a_r_r_r              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT1A, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_elem_s(u8(reg_hw(src2))), {}}} }
 emit_sm3tt1a_r_r_r              :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3tt1a_r_r_r(dst, src, src2)) }
-inst_sm3tt1b_r_r_r              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT1B, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_reg(src2), {}}} }
+inst_sm3tt1b_r_r_r              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT1B, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_elem_s(u8(reg_hw(src2))), {}}} }
 emit_sm3tt1b_r_r_r              :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3tt1b_r_r_r(dst, src, src2)) }
-inst_sm3tt2a_r_r_r              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT2A, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_reg(src2), {}}} }
+inst_sm3tt2a_r_r_r              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT2A, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_elem_s(u8(reg_hw(src2))), {}}} }
 emit_sm3tt2a_r_r_r              :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3tt2a_r_r_r(dst, src, src2)) }
-inst_sm3tt2b_r_r_r              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT2B, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_reg(src2), {}}} }
+inst_sm3tt2b_r_r_r              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SM3TT2B, operand_count = 3, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), op_v_elem_s(u8(reg_hw(src2))), {}}} }
 emit_sm3tt2b_r_r_r              :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sm3tt2b_r_r_r(dst, src, src2)) }
 inst_sm4e_r_r                   :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .SM4E, operand_count = 2, length = 4, ops = {op_v_4s(u8(reg_hw(dst))), op_v_4s(u8(reg_hw(src))), {}, {}}} }
 emit_sm4e_r_r                   :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_sm4e_r_r(dst, src)) }
@@ -1067,6 +1067,8 @@ inst_bic_v_r_r_r                :: #force_inline proc "contextless" (dst: Regist
 emit_bic_v_r_r_r                :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_bic_v_r_r_r(dst, src, src2)) }
 inst_orn_v_r_r_r                :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .ORN_V, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_16b(u8(reg_hw(src))), op_v_16b(u8(reg_hw(src2))), {}}} }
 emit_orn_v_r_r_r                :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_orn_v_r_r_r(dst, src, src2)) }
+inst_mvn_v_r_r                  :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .MVN_V, operand_count = 2, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), {}, {}}} }
+emit_mvn_v_r_r                  :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_mvn_v_r_r(dst, src)) }
 inst_bit_r_r_r                  :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .BIT, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_16b(u8(reg_hw(src))), op_v_16b(u8(reg_hw(src2))), {}}} }
 emit_bit_r_r_r                  :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_bit_r_r_r(dst, src, src2)) }
 inst_bif_r_r_r                  :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .BIF, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_16b(u8(reg_hw(src))), op_v_16b(u8(reg_hw(src2))), {}}} }
@@ -1155,6 +1157,18 @@ inst_sqrshrun_r_r_i             :: #force_inline proc "contextless" (dst: Regist
 emit_sqrshrun_r_r_i             :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sqrshrun_r_r_i(dst, src, imm)) }
 inst_sqrshrun2_r_r_i            :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SQRSHRUN2, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_8h(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
 emit_sqrshrun2_r_r_i            :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sqrshrun2_r_r_i(dst, src, imm)) }
+inst_dup_v_r_r_i                :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .DUP_V, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_elem_b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
+inst_dup_v_r_r                  :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .DUP_V, operand_count = 2, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_reg(src), {}, {}}} }
+emit_dup_v_r_r_i                :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_dup_v_r_r_i(dst, src, imm)) }
+emit_dup_v_r_r                  :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_dup_v_r_r(dst, src)) }
+inst_ins_r_i_r_i                :: #force_inline proc "contextless" (dst: Register, imm: i64, src: Register, imm2: i64) -> Instruction { return Instruction{mnemonic = .INS, operand_count = 4, length = 4, ops = {op_v_elem_b(u8(reg_hw(dst))), op_imm(imm, 4), op_v_elem_b(u8(reg_hw(src))), op_imm(imm2, 4)}} }
+inst_ins_r_i_r                  :: #force_inline proc "contextless" (dst: Register, imm: i64, src: Register) -> Instruction { return Instruction{mnemonic = .INS, operand_count = 3, length = 4, ops = {op_v_elem_b(u8(reg_hw(dst))), op_imm(imm, 4), op_reg(src), {}}} }
+emit_ins_r_i_r_i                :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, src: Register, imm2: i64) { append(instructions, inst_ins_r_i_r_i(dst, imm, src, imm2)) }
+emit_ins_r_i_r                  :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, imm: i64, src: Register) { append(instructions, inst_ins_r_i_r(dst, imm, src)) }
+inst_mov_v_r_r                  :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .MOV_V, operand_count = 2, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), {}, {}}} }
+emit_mov_v_r_r                  :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_mov_v_r_r(dst, src)) }
+inst_ext_v_r_r_r_i              :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .EXT_V, operand_count = 4, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), op_imm(imm, 4)}} }
+emit_ext_v_r_r_r_i              :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register, imm: i64) { append(instructions, inst_ext_v_r_r_r_i(dst, src, src2, imm)) }
 inst_zip1_r_r_r                 :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .ZIP1, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), {}}} }
 emit_zip1_r_r_r                 :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_zip1_r_r_r(dst, src, src2)) }
 inst_zip2_r_r_r                 :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .ZIP2, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), {}}} }
@@ -3040,6 +3054,8 @@ inst_bic_v                           :: inst_bic_v_r_r_r
 emit_bic_v                           :: emit_bic_v_r_r_r
 inst_orn_v                           :: inst_orn_v_r_r_r
 emit_orn_v                           :: emit_orn_v_r_r_r
+inst_mvn_v                           :: inst_mvn_v_r_r
+emit_mvn_v                           :: emit_mvn_v_r_r
 inst_bit                             :: inst_bit_r_r_r
 emit_bit                             :: emit_bit_r_r_r
 inst_bif                             :: inst_bif_r_r_r
@@ -3128,6 +3144,14 @@ inst_sqrshrun                        :: inst_sqrshrun_r_r_i
 emit_sqrshrun                        :: emit_sqrshrun_r_r_i
 inst_sqrshrun2                       :: inst_sqrshrun2_r_r_i
 emit_sqrshrun2                       :: emit_sqrshrun2_r_r_i
+inst_dup_v                           :: proc{ inst_dup_v_r_r_i, inst_dup_v_r_r }
+emit_dup_v                           :: proc{ emit_dup_v_r_r_i, emit_dup_v_r_r }
+inst_ins                             :: proc{ inst_ins_r_i_r_i, inst_ins_r_i_r }
+emit_ins                             :: proc{ emit_ins_r_i_r_i, emit_ins_r_i_r }
+inst_mov_v                           :: inst_mov_v_r_r
+emit_mov_v                           :: emit_mov_v_r_r
+inst_ext_v                           :: inst_ext_v_r_r_r_i
+emit_ext_v                           :: emit_ext_v_r_r_r_i
 inst_zip1                            :: inst_zip1_r_r_r
 emit_zip1                            :: emit_zip1_r_r_r
 inst_zip2                            :: inst_zip2_r_r_r
--- a/core/rexcode/arm64/operands.odin
+++ b/core/rexcode/arm64/operands.odin
@@ -210,6 +210,26 @@ op_v_2d  :: #force_inline proc "contextless" (n: u8) -> Operand {
 	return Operand{reg = Register(REG_V | u16(n & 0x1F)), kind = .REGISTER, size = 64}
 }

+// Element-indexed V views (Vn.B[i]/.H[i]/.S[i]/.D[i]); `n` is masked to 0..31.
+// The element size in bytes rides in op.size (1/2/4/8) so the matcher can tell
+// the DUP/INS forms apart; the lane index is passed as a separate immediate.
+@(require_results)
+op_v_elem_b :: #force_inline proc "contextless" (n: u8) -> Operand {
+	return Operand{reg = Register(REG_V | u16(n & 0x1F)), kind = .REGISTER, size = 1} // .B lanes: 1 byte
+}
+@(require_results)
+op_v_elem_h :: #force_inline proc "contextless" (n: u8) -> Operand {
+	return Operand{reg = Register(REG_V | u16(n & 0x1F)), kind = .REGISTER, size = 2} // .H lanes: 2 bytes
+}
+@(require_results)
+op_v_elem_s :: #force_inline proc "contextless" (n: u8) -> Operand {
+	return Operand{reg = Register(REG_V | u16(n & 0x1F)), kind = .REGISTER, size = 4} // .S lanes: 4 bytes
+}
+@(require_results)
+op_v_elem_d :: #force_inline proc "contextless" (n: u8) -> Operand {
+	return Operand{reg = Register(REG_V | u16(n & 0x1F)), kind = .REGISTER, size = 8} // .D lanes: 8 bytes
+}
+
 // -----------------------------------------------------------------------------
 // Memory constructors (one per addressing mode)
 // -----------------------------------------------------------------------------
--- a/core/rexcode/arm64/tablegen/encoding_table.odin
+++ b/core/rexcode/arm64/tablegen/encoding_table.odin
@@ -4340,5 +4340,45 @@ ENCODING_TABLE := #partial [Mnemonic][]Encoding{
 		{.SQRSHRUN2, {.V_8H, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F108C00, 0xFFF0FC00, .NEON, {}},
 		{.SQRSHRUN2, {.V_4S, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F208C00, 0xFFE0FC00, .NEON, {}},
 	},
+
+	// Advanced SIMD copy / permute (MOV/MVN/DUP/INS/EXT).
+	.MOV_V = {
+		{.MOV_V, {.V_8B, .V_8B, .NONE, .NONE}, {.VD, .VN_VM_DUP, .NONE, .NONE}, 0x0EA01C00, 0xFFE0FC00, .NEON, {}},
+		{.MOV_V, {.V_16B, .V_16B, .NONE, .NONE}, {.VD, .VN_VM_DUP, .NONE, .NONE}, 0x4EA01C00, 0xFFE0FC00, .NEON, {}},
+	},
+	.MVN_V = {
+		{.MVN_V, {.V_8B, .V_8B, .NONE, .NONE}, {.VD, .VN, .NONE, .NONE}, 0x2E205800, 0xFFFFFC00, .NEON, {}},
+		{.MVN_V, {.V_16B, .V_16B, .NONE, .NONE}, {.VD, .VN, .NONE, .NONE}, 0x6E205800, 0xFFFFFC00, .NEON, {}},
+	},
+	.DUP_V = {
+		{.DUP_V, {.V_8B, .V_ELEM_B, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x0E010400, 0xFFE1FC00, .NEON, {}},
+		{.DUP_V, {.V_16B, .V_ELEM_B, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x4E010400, 0xFFE1FC00, .NEON, {}},
+		{.DUP_V, {.V_4H, .V_ELEM_H, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x0E020400, 0xFFE3FC00, .NEON, {}},
+		{.DUP_V, {.V_8H, .V_ELEM_H, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x4E020400, 0xFFE3FC00, .NEON, {}},
+		{.DUP_V, {.V_2S, .V_ELEM_S, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x0E040400, 0xFFE7FC00, .NEON, {}},
+		{.DUP_V, {.V_4S, .V_ELEM_S, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x4E040400, 0xFFE7FC00, .NEON, {}},
+		{.DUP_V, {.V_2D, .V_ELEM_D, .VEC_INDEX, .NONE}, {.VD, .VN, .NEON_IDX5, .NONE}, 0x4E080400, 0xFFEFFC00, .NEON, {}},
+		{.DUP_V, {.V_8B, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x0E010C00, 0xFFFFFC00, .NEON, {}},
+		{.DUP_V, {.V_16B, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x4E010C00, 0xFFFFFC00, .NEON, {}},
+		{.DUP_V, {.V_4H, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x0E020C00, 0xFFFFFC00, .NEON, {}},
+		{.DUP_V, {.V_8H, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x4E020C00, 0xFFFFFC00, .NEON, {}},
+		{.DUP_V, {.V_2S, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x0E040C00, 0xFFFFFC00, .NEON, {}},
+		{.DUP_V, {.V_4S, .W_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x4E040C00, 0xFFFFFC00, .NEON, {}},
+		{.DUP_V, {.V_2D, .X_REG, .NONE, .NONE}, {.VD, .RN, .NONE, .NONE}, 0x4E080C00, 0xFFFFFC00, .NEON, {}},
+	},
+	.INS = {
+		{.INS, {.V_ELEM_B, .VEC_INDEX, .V_ELEM_B, .VEC_INDEX}, {.VD, .NEON_IDX5, .VN, .NEON_IDX4}, 0x6E010400, 0xFFE18400, .NEON, {}},
+		{.INS, {.V_ELEM_H, .VEC_INDEX, .V_ELEM_H, .VEC_INDEX}, {.VD, .NEON_IDX5, .VN, .NEON_IDX4}, 0x6E020400, 0xFFE38C00, .NEON, {}},
+		{.INS, {.V_ELEM_S, .VEC_INDEX, .V_ELEM_S, .VEC_INDEX}, {.VD, .NEON_IDX5, .VN, .NEON_IDX4}, 0x6E040400, 0xFFE79C00, .NEON, {}},
+		{.INS, {.V_ELEM_D, .VEC_INDEX, .V_ELEM_D, .VEC_INDEX}, {.VD, .NEON_IDX5, .VN, .NEON_IDX4}, 0x6E080400, 0xFFEFBC00, .NEON, {}},
+		{.INS, {.V_ELEM_B, .VEC_INDEX, .W_REG, .NONE}, {.VD, .NEON_IDX5, .RN, .NONE}, 0x4E011C00, 0xFFE1FC00, .NEON, {}},
+		{.INS, {.V_ELEM_H, .VEC_INDEX, .W_REG, .NONE}, {.VD, .NEON_IDX5, .RN, .NONE}, 0x4E021C00, 0xFFE3FC00, .NEON, {}},
+		{.INS, {.V_ELEM_S, .VEC_INDEX, .W_REG, .NONE}, {.VD, .NEON_IDX5, .RN, .NONE}, 0x4E041C00, 0xFFE7FC00, .NEON, {}},
+		{.INS, {.V_ELEM_D, .VEC_INDEX, .X_REG, .NONE}, {.VD, .NEON_IDX5, .RN, .NONE}, 0x4E081C00, 0xFFEFFC00, .NEON, {}},
+	},
+	.EXT_V = {
+		{.EXT_V, {.V_8B, .V_8B, .V_8B, .VEC_INDEX}, {.VD, .VN, .VM, .NEON_EXT_IDX}, 0x2E000000, 0xFFE0C400, .NEON, {}},
+		{.EXT_V, {.V_16B, .V_16B, .V_16B, .VEC_INDEX}, {.VD, .VN, .VM, .NEON_EXT_IDX}, 0x6E000000, 0xFFE08400, .NEON, {}},
+	},
 	// SPECGEN:END
 }
--- a/core/rexcode/arm64/tablegen/generated/decode_tables.odin
+++ b/core/rexcode/arm64/tablegen/generated/decode_tables.odin
--- a/core/rexcode/arm64/tablegen/generated/encode_tables.odin
+++ b/core/rexcode/arm64/tablegen/generated/encode_tables.odin
--- a/core/rexcode/arm64/tablegen/specgen.lua
+++ b/core/rexcode/arm64/tablegen/specgen.lua
@@ -347,6 +347,120 @@ do
 	sections[#sections+1] = "\t// Advanced SIMD shift by immediate.\n" .. table.concat(blk, "\n")
 end

+-- ---- NEON copy / permute (MOV/MVN/DUP/INS/EXT) -----------------------------
+-- These carry a lane index in imm5/imm4 whose element-size marker is fixed in
+-- `bits`; the operand drives the bits above it. mask is derived by varying the
+-- registers AND each index field to its maximum (mask = ~union of the deltas).
+local function mask_of(base, variants)
+	local varying = 0 -- union of every bit that differs from `base` in at least one variant
+	for _, variant in ipairs(variants) do varying = bit.bor(varying, bit.bxor(base, variant)) end
+	return bit.band(bit.bnot(varying), 0xFFFFFFFF) -- fixed (mask) bits are the complement of the varying ones
+end
+-- element metadata: asm token, valid dst arrangements, max lane index, V_ELEM type.
+local ELEM = {
+	B = {tok="b", dst={"8B","16B"}, idxmax=15, vt="V_ELEM_B"},
+	H = {tok="h", dst={"4H","8H"},  idxmax=7,  vt="V_ELEM_H"},
+	S = {tok="s", dst={"2S","4S"},  idxmax=3,  vt="V_ELEM_S"},
+	D = {tok="d", dst={"2D"},       idxmax=1,  vt="V_ELEM_D"},
+}
+do
+	local blk = {} -- rendered ".MNEM = { ... }," entries for this section, in emission order
+	local function emit_rows(mnem, rows) -- append one mnemonic entry; a mnemonic with no rows is omitted
+		if #rows > 0 then blk[#blk+1] = string.format("\t.%s = {\n%s\n\t},", mnem, table.concat(rows, "\n")); n_mnem = n_mnem + 1 end
+	end
+	local function mkrow(mnem, ops, enc, b0, variants) -- format one table row: b0 is the bits word, mask = mask_of(b0, variants)
+		n_forms = n_forms + 1
+		return string.format("\t\t{.%s, %s, %s, 0x%s, 0x%s, .NEON, {}},",
+			mnem, ops, enc, bit.tohex(b0):upper(), bit.tohex(mask_of(b0, variants)):upper())
+	end
+
+	-- MOV Vd.T, Vn.T  (= ORR Vd,Vn,Vn): the single source feeds both Vn and Vm.
+	local mov = {}
+	for _, a in ipairs({"8B","16B"}) do
+		local function mk(x) return string.format("mov v%d.%s, v%d.%s", x, ARR[a].asm, x, ARR[a].asm) end
+		local b0, b31 = word(mk(0)), word(mk(31)) -- every register 0 vs. every register 31; the row is dropped if either word() is falsy
+		if b0 and b31 then mov[#mov+1] = mkrow("MOV_V",
+			string.format("{.%s, .%s, .NONE, .NONE}", ARR[a].vt, ARR[a].vt),
+			"{.VD, .VN_VM_DUP, .NONE, .NONE}", b0, {b31}) end
+	end
+	emit_rows("MOV_V", mov)
+
+	-- MVN Vd.T, Vn.T (= NOT alias): plain two-register.
+	local mvn = {}
+	for _, a in ipairs({"8B","16B"}) do
+		local function mk(x) return string.format("mvn v%d.%s, v%d.%s", x, ARR[a].asm, x, ARR[a].asm) end
+		local b0, b31 = word(mk(0)), word(mk(31)) -- every register 0 vs. every register 31
+		if b0 and b31 then mvn[#mvn+1] = mkrow("MVN_V",
+			string.format("{.%s, .%s, .NONE, .NONE}", ARR[a].vt, ARR[a].vt),
+			"{.VD, .VN, .NONE, .NONE}", b0, {b31}) end
+	end
+	emit_rows("MVN_V", mvn)
+
+	-- DUP: element form (Vd.T, Vn.Ts[i]) + general form (Vd.T, Wn/Xn).
+	local dup = {}
+	for _, ek in ipairs({"B","H","S","D"}) do
+		local e = ELEM[ek]
+		for _, a in ipairs(e.dst) do
+			local function mk(r,i) return string.format("dup v%d.%s, v%d.%s[%d]", r, ARR[a].asm, r, e.tok, i) end
+			local b0, br, bi = word(mk(0,0)), word(mk(31,0)), word(mk(0,e.idxmax)) -- baseline, both registers at 31, lane index at its maximum
+			if b0 and br and bi then dup[#dup+1] = mkrow("DUP_V",
+				string.format("{.%s, .%s, .VEC_INDEX, .NONE}", ARR[a].vt, e.vt),
+				"{.VD, .VN, .NEON_IDX5, .NONE}", b0, {br, bi}) end
+		end
+	end
+	for _, g in ipairs({{"8B","w","W_REG"},{"16B","w","W_REG"},{"4H","w","W_REG"},{"8H","w","W_REG"},{"2S","w","W_REG"},{"4S","w","W_REG"},{"2D","x","X_REG"}}) do
+		-- Vary Vd and Rn independently to 31 (Rn 31 is the zero register, asm
+		-- "wzr"/"xzr") so the low bit of each register field toggles.
+		local zr = (g[2] == "x") and "xzr" or "wzr"
+		local function mk(vr, gs) return string.format("dup v%d.%s, %s", vr, ARR[g[1]].asm, gs) end
+		local b0, bV, bR = word(mk(0, g[2].."0")), word(mk(31, g[2].."0")), word(mk(0, zr)) -- baseline, Vd at 31, Rn at 31
+		if b0 and bV and bR then dup[#dup+1] = mkrow("DUP_V",
+			string.format("{.%s, .%s, .NONE, .NONE}", ARR[g[1]].vt, g[3]),
+			"{.VD, .RN, .NONE, .NONE}", b0, {bV, bR}) end
+	end
+	emit_rows("DUP_V", dup)
+
+	-- INS: element form (Vd.Ts[i], Vn.Ts[j]) + general form (Vd.Ts[i], Wn/Xn).
+	local ins = {}
+	for _, ek in ipairs({"B","H","S","D"}) do
+		local e = ELEM[ek]
+		local function mk(r,i,j) return string.format("ins v%d.%s[%d], v%d.%s[%d]", r, e.tok, i, r, e.tok, j) end
+		local b0, br, bi, bj = word(mk(0,0,0)), word(mk(31,0,0)), word(mk(0,e.idxmax,0)), word(mk(0,0,e.idxmax)) -- baseline, registers at 31, dst index max, src index max
+		if b0 and br and bi and bj then ins[#ins+1] = mkrow("INS",
+			string.format("{.%s, .VEC_INDEX, .%s, .VEC_INDEX}", e.vt, e.vt),
+			"{.VD, .NEON_IDX5, .VN, .NEON_IDX4}", b0, {br, bi, bj}) end
+	end
+	for _, ek in ipairs({"B","H","S","D"}) do
+		local e = ELEM[ek]
+		local gpr = (ek == "D") and "x" or "w"
+		local gvt = (ek == "D") and "X_REG" or "W_REG"
+		local zr  = (ek == "D") and "xzr" or "wzr"
+		local function mk(vr, gs, i) return string.format("ins v%d.%s[%d], %s", vr, e.tok, i, gs) end
+		local b0 = word(mk(0, gpr.."0", 0))    -- baseline: v0, lane 0, w0/x0
+		local bV = word(mk(31, gpr.."0", 0))   -- Vd field
+		local bR = word(mk(0, zr, 0))          -- Rn field (zero register)
+		local bi = word(mk(0, gpr.."0", e.idxmax)) -- lane index field at its maximum
+		if b0 and bV and bR and bi then ins[#ins+1] = mkrow("INS",
+			string.format("{.%s, .VEC_INDEX, .%s, .NONE}", e.vt, gvt),
+			"{.VD, .NEON_IDX5, .RN, .NONE}", b0, {bV, bR, bi}) end
+	end
+	emit_rows("INS", ins)
+
+	-- EXT Vd.T, Vn.T, Vm.T, #index  (imm4 byte index; .8b idx 0..7, .16b 0..15).
+	local ext = {}
+	for _, ai in ipairs({{"8B",7},{"16B",15}}) do
+		local a, imax = ai[1], ai[2]
+		local function mk(r,i) return string.format("ext v%d.%s, v%d.%s, v%d.%s, #%d", r, ARR[a].asm, r, ARR[a].asm, r, ARR[a].asm, i) end
+		local b0, br, bi = word(mk(0,0)), word(mk(31,0)), word(mk(0,imax)) -- baseline, all three registers at 31, byte index at its maximum
+		if b0 and br and bi then ext[#ext+1] = mkrow("EXT_V",
+			string.format("{.%s, .%s, .%s, .VEC_INDEX}", ARR[a].vt, ARR[a].vt, ARR[a].vt),
+			"{.VD, .VN, .VM, .NEON_EXT_IDX}", b0, {br, bi}) end
+	end
+	emit_rows("EXT_V", ext)
+
+	sections[#sections+1] = "\t// Advanced SIMD copy / permute (MOV/MVN/DUP/INS/EXT).\n" .. table.concat(blk, "\n")
+end
+
 -- ---- splice into the SoT ---------------------------------------------------
 local region = "\t// SPECGEN:BEGIN\n" .. table.concat(sections, "\n\n") .. "\n\t// SPECGEN:END"
 local fh = assert(io.open(TABLE, "r")); local src = fh:read("*a"); fh:close()
--- a/core/rexcode/arm64/tables/arm64.encode_forms.bin
+++ b/core/rexcode/arm64/tables/arm64.encode_forms.bin
--- a/core/rexcode/arm64/tables/arm64.encode_runs.bin
+++ b/core/rexcode/arm64/tables/arm64.encode_runs.bin
--- a/core/rexcode/arm64/tables/arm64.entries.bin
+++ b/core/rexcode/arm64/tables/arm64.entries.bin
--- a/core/rexcode/arm64/tables/arm64.idx_op0.bin
+++ b/core/rexcode/arm64/tables/arm64.idx_op0.bin
--- a/core/rexcode/arm64/tools/gen_mnemonic_builders.odin
+++ b/core/rexcode/arm64/tools/gen_mnemonic_builders.odin
@@ -315,6 +315,10 @@ write_operand_expr :: proc(sb: ^strings.Builder, t: a.Operand_Type, names: [3]st
 		case .V_4S:  fmt.sbprintf(sb, "op_v_4s(u8(reg_hw(%s)))",  names[0])
 		case .V_1D:  fmt.sbprintf(sb, "op_v_1d(u8(reg_hw(%s)))",  names[0])
 		case .V_2D:  fmt.sbprintf(sb, "op_v_2d(u8(reg_hw(%s)))",  names[0])
+		case .V_ELEM_B: fmt.sbprintf(sb, "op_v_elem_b(u8(reg_hw(%s)))", names[0])
+		case .V_ELEM_H: fmt.sbprintf(sb, "op_v_elem_h(u8(reg_hw(%s)))", names[0])
+		case .V_ELEM_S: fmt.sbprintf(sb, "op_v_elem_s(u8(reg_hw(%s)))", names[0])
+		case .V_ELEM_D: fmt.sbprintf(sb, "op_v_elem_d(u8(reg_hw(%s)))", names[0])
 		case:        fmt.sbprintf(sb, "op_reg(%s)", names[0])
 		}
 	case .ZREG: