rexcode/arm32: MVE VMLSV/VMLSVA (correct 3-bit Q regs); drop placeholders

Implement VMLSV/VMLSVA (MVE multiply-subtract reduce) properly: new
VN_Q_MVE (Qn at 19:17) and VM_Q_MVE (Qm at 3:1) encodings -- the actual
3-bit MVE Q fields -- with Rd at 15:12 (RDLO_A32). The earlier collision
was from reusing the 4-bit VN_Q (19:16) and RD_T32 (11:8), which place
the fields wrong; byte-exact vs llvm-mc now with distinct Qn/Qm/Rd.

Drop three placeholder/redundant enum entries: VRINT and VPRINT (not real
instructions -- llvm rejects bare 'vrint'; VPRINT is a printf-like debug
pseudo-op), and VRSHL_MVE (the author's own comment marks it a
placeholder; 'vrshl q,q,q' already decodes via VRSHL's MVE form). 600
tests green, verify matches llvm-mc.
This commit is contained in:
Brendan Punsky
2026-06-18 01:58:19 -04:00
committed by Flāvius
parent 239dea4f55
commit a63fb51fdd
16 changed files with 768 additions and 744 deletions

View File

@@ -356,6 +356,10 @@ unpack_operand :: proc(word: u32, enc: Operand_Encoding, ot: Operand_Type) -> Op
return op_imm(((word >> 12) & 1) == 1 ? 270 : 90)
case .MVE_ROT_CMLA:
return op_imm(i64((word >> 23) & 0x3) * 90)
case .VN_Q_MVE:
return op_reg(Register(REG_QPR | u16((word >> 17) & 0x7)))
case .VM_Q_MVE:
return op_reg(Register(REG_QPR | u16((word >> 1) & 0x7)))
case .VD_Q:
n := (((word >> 22) & 1) << 4 | ((word >> 12) & 0xF)) >> 1
return op_reg(Register(REG_QPR | u16(n)))

View File

@@ -440,6 +440,10 @@ pack_operand_inline :: #force_inline proc(
return (u32(op.immediate) == 270 ? 1 : 0) << 12
case .MVE_ROT_CMLA:
return ((u32(op.immediate) / 90) & 0x3) << 23
case .VN_Q_MVE:
return (u32(reg_hw(op.reg)) & 0x7) << 17
case .VM_Q_MVE:
return (u32(reg_hw(op.reg)) & 0x7) << 1
case .VFP_IMM8:
// Run the VFP 8-bit float encoder; the user supplies the wire-format
// 32-bit float bit pattern (for F32). The encoder finds the abcdefgh.

View File

@@ -316,6 +316,9 @@ Operand_Encoding :: enum u8 {
// MVE_ROT_HCADD: #90/#270 -> bit 12; MVE_ROT_CMLA: #0/90/180/270 -> bits 24:23
MVE_ROT_HCADD,
MVE_ROT_CMLA,
// MVE 3-bit Q registers (Q0..Q7): Qn at bits 19:17, Qm at bits 3:1.
VN_Q_MVE,
VM_Q_MVE,
VFP_IMM8, // VFP immediate (VMOV.F32/F64 #imm)
NEON_IMM8_ABCDEFGH, // bits 18-16 (abc) + bits 3-0 (defgh)
NEON_CMODE, // bits 11-8 (cmode for VMOV/VMVN immediate)

View File

@@ -1298,6 +1298,10 @@ inst_vmlav_r_q_q :: #force_inline proc "contextless" (dst: Regis
emit_vmlav_r_q_q :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_vmlav_r_q_q(dst, src, src2)) }
inst_vmlava_r_q_q :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .VMLAVA, operand_count = 3, mode = .T32, cond = 14, length = 4, ops = {op_reg(dst), op_reg(src), op_reg(src2), {}}} }
emit_vmlava_r_q_q :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_vmlava_r_q_q(dst, src, src2)) }
inst_vmlsv_r_q_q :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .VMLSV, operand_count = 3, mode = .T32, cond = 14, length = 4, ops = {op_reg(dst), op_reg(src), op_reg(src2), {}}} }
emit_vmlsv_r_q_q :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_vmlsv_r_q_q(dst, src, src2)) }
inst_vmlsva_r_q_q :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .VMLSVA, operand_count = 3, mode = .T32, cond = 14, length = 4, ops = {op_reg(dst), op_reg(src), op_reg(src2), {}}} }
emit_vmlsva_r_q_q :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_vmlsva_r_q_q(dst, src, src2)) }
inst_vcmul_q_q_q :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .VCMUL, operand_count = 3, mode = .T32, cond = 14, length = 4, ops = {op_reg(dst), op_reg(src), op_reg(src2), {}}} }
emit_vcmul_q_q_q :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_vcmul_q_q_q(dst, src, src2)) }
inst_vhcadd_q_q_q :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .VHCADD, operand_count = 3, mode = .T32, cond = 14, length = 4, ops = {op_reg(dst), op_reg(src), op_reg(src2), {}}} }
@@ -2551,6 +2555,10 @@ inst_vmlav :: inst_vmlav_r_q_q
emit_vmlav :: emit_vmlav_r_q_q
inst_vmlava :: inst_vmlava_r_q_q
emit_vmlava :: emit_vmlava_r_q_q
inst_vmlsv :: inst_vmlsv_r_q_q
emit_vmlsv :: emit_vmlsv_r_q_q
inst_vmlsva :: inst_vmlsva_r_q_q
emit_vmlsva :: emit_vmlsva_r_q_q
inst_vcmul :: inst_vcmul_q_q_q
emit_vcmul :: emit_vcmul_q_q_q
inst_vhcadd :: inst_vhcadd_q_q_q

View File

@@ -306,7 +306,7 @@ Mnemonic :: enum u16 {
SHA256H, SHA256H2, SHA256SU0, SHA256SU1,
// -- VFP rounding (ARMv8 FEAT_FP) ----------------------------------------
VRINT, VJCVT, // VJCVT: F64-to-S32 with FPSCR.RM rounding
VJCVT, // VJCVT: F64-to-S32 with FPSCR.RM rounding
// -- Dot Product (FEAT_DotProd) ------------------------------------------
VSDOT, VUDOT,
@@ -465,7 +465,6 @@ Mnemonic :: enum u16 {
// Bit reverse + shifts unique to MVE
VBRSR, // bit reverse with shift right
VSHLC, // shift left with carry
VRSHL_MVE, // (placeholder if needed; usually VRSHL)
VDDUP, // decrement and duplicate
VIDUP, // increment and duplicate
VDWDUP, // decrement-wrap and duplicate
@@ -501,7 +500,6 @@ Mnemonic :: enum u16 {
VQRDMLSDH, VQRDMLSDHX,
// Misc
VPRINT, // printf-like debug op (rare)
VHCADD_SAT, // (rarely used)
VCMLA_MVE, // (MVE form; VCMLA already exists)

View File

@@ -3695,6 +3695,10 @@ ENCODING_TABLE := #partial [Mnemonic][]Encoding{
// mask (the MVE convention); the complex ops encode the rotation immediate.
.VHCADD_SAT = { {.VHCADD_SAT, {.QPR, .QPR, .QPR, .IMM}, {.VD_Q, .VN_Q, .VM_Q, .MVE_ROT_HCADD}, 0xEE000F00, 0xFFE10FF1, .MVE_INT, .T32, {thumb32=true, cond_in_28=false}} },
.VCMLA_MVE = { {.VCMLA_MVE, {.QPR, .QPR, .QPR, .IMM}, {.VD_Q, .VN_Q, .VM_Q, .MVE_ROT_CMLA}, 0xFC200840, 0xFE611FF1, .MVE_FP, .T32, {thumb32=true, cond_in_28=false}} },
// MVE multiply-subtract reduce (Rd at 15:12, Qn at 19:17, Qm at 3:1 -- the
// proper 3-bit MVE Q fields, .s16 form). VMLSVA accumulates (bit 5).
.VMLSV = { {.VMLSV, {.GPR, .QPR, .QPR, .NONE}, {.RDLO_A32, .VN_Q_MVE, .VM_Q_MVE, .NONE}, 0xEEF00E01, 0xFFF10FF1, .MVE_INT, .T32, {thumb32=true, cond_in_28=false}} },
.VMLSVA = { {.VMLSVA, {.GPR, .QPR, .QPR, .NONE}, {.RDLO_A32, .VN_Q_MVE, .VM_Q_MVE, .NONE}, 0xEEF00E21, 0xFFF10FF1, .MVE_INT, .T32, {thumb32=true, cond_in_28=false}} },
// SPECGEN:BEGIN
.VADDL = {

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,7 @@ package rexcode_arm32_generated
import lib "../.."
@(rodata)
ENCODE_FORMS := [1673]lib.Encoding{
ENCODE_FORMS := [1675]lib.Encoding{
// .AND
{ .AND, {.GPR,.GPR,.IMM_MOD,.NONE}, {.RD,.RN_A32,.A32_IMM_MOD,.NONE}, 0x02000000, 0x0FE00000, .BASE, .A32, {} },
{ .AND, {.GPR,.GPR,.GPR_SHIFTED,.NONE}, {.RD,.RN_A32,.RM_A32,.NONE}, 0x00000000, 0x0FE00010, .BASE, .A32, {} },
@@ -2172,6 +2172,10 @@ ENCODE_FORMS := [1673]lib.Encoding{
{ .VMLAV, {.GPR,.QPR,.QPR,.NONE}, {.RD_T32,.VN_Q,.VM_Q,.NONE}, 0xEEB00F00, 0xEFB10F51, .MVE_INT, .T32, {thumb32=true} },
// .VMLAVA
{ .VMLAVA, {.GPR,.QPR,.QPR,.NONE}, {.RD_T32,.VN_Q,.VM_Q,.NONE}, 0xEEB00F20, 0xEFB10F51, .MVE_INT, .T32, {thumb32=true} },
// .VMLSV
{ .VMLSV, {.GPR,.QPR,.QPR,.NONE}, {.RDLO_A32,.VN_Q_MVE,.VM_Q_MVE,.NONE}, 0xEEF00E01, 0xFFF10FF1, .MVE_INT, .T32, {thumb32=true} },
// .VMLSVA
{ .VMLSVA, {.GPR,.QPR,.QPR,.NONE}, {.RDLO_A32,.VN_Q_MVE,.VM_Q_MVE,.NONE}, 0xEEF00E21, 0xFFF10FF1, .MVE_INT, .T32, {thumb32=true} },
// .VCMUL
{ .VCMUL, {.QPR,.QPR,.QPR,.NONE}, {.VD_Q,.VN_Q,.VM_Q,.NONE}, 0xEE300E00, 0xEFB10F51, .MVE_FP, .T32, {thumb32=true} },
// .VHCADD
@@ -2722,7 +2726,6 @@ ENCODE_RUNS := [lib.Mnemonic]lib.Encode_Run{
.SHA256H2 = { 1365, 1},
.SHA256SU0 = { 1366, 1},
.SHA256SU1 = { 1367, 1},
.VRINT = { 1368, 0},
.VJCVT = { 1368, 1},
.VSDOT = { 1369, 2},
.VUDOT = { 1371, 2},
@@ -2873,76 +2876,74 @@ ENCODE_RUNS := [lib.Mnemonic]lib.Encode_Run{
.VRMLSLDAVHAX = { 1603, 1},
.VMLAV = { 1604, 1},
.VMLAVA = { 1605, 1},
.VMLSV = { 1606, 0},
.VMLSVA = { 1606, 0},
.VCMUL = { 1606, 1},
.VHCADD = { 1607, 1},
.VBRSR = { 1608, 1},
.VSHLC = { 1609, 1},
.VRSHL_MVE = { 1610, 0},
.VDDUP = { 1610, 1},
.VIDUP = { 1611, 1},
.VDWDUP = { 1612, 1},
.VIWDUP = { 1613, 1},
.VMOVNB = { 1614, 1},
.VMOVNT = { 1615, 1},
.VQMOVNB = { 1616, 1},
.VQMOVNT = { 1617, 1},
.VQMOVUNB = { 1618, 1},
.VQMOVUNT = { 1619, 1},
.VSHLLB = { 1620, 1},
.VSHLLT = { 1621, 1},
.VMULLB = { 1622, 1},
.VMULLT = { 1623, 1},
.VMLALB = { 1624, 1},
.VMLALT = { 1625, 1},
.VMLSLB = { 1626, 1},
.VMLSLT = { 1627, 1},
.VSHRNB = { 1628, 1},
.VSHRNT = { 1629, 1},
.VRSHRNB = { 1630, 1},
.VRSHRNT = { 1631, 1},
.VQSHRNB = { 1632, 1},
.VQSHRNT = { 1633, 1},
.VQRSHRNB = { 1634, 1},
.VQRSHRNT = { 1635, 1},
.VQSHRUNB = { 1636, 1},
.VQSHRUNT = { 1637, 1},
.VQRSHRUNB = { 1638, 1},
.VQRSHRUNT = { 1639, 1},
.VMOV_Q_R = { 1640, 1},
.VMOV_R_Q = { 1641, 1},
.VMOV_2GPR_Q = { 1642, 1},
.VQDMLADH = { 1643, 1},
.VQDMLADHX = { 1644, 1},
.VQDMLSDH = { 1645, 1},
.VQDMLSDHX = { 1646, 1},
.VQRDMLADH = { 1647, 1},
.VQRDMLADHX = { 1648, 1},
.VQRDMLSDH = { 1649, 1},
.VQRDMLSDHX = { 1650, 1},
.VPRINT = { 1651, 0},
.VHCADD_SAT = { 1651, 1},
.VCMLA_MVE = { 1652, 1},
.VLDRB = { 1653, 1},
.VLDRH = { 1654, 1},
.VLDRW = { 1655, 1},
.VLDRD = { 1656, 1},
.VSTRB = { 1657, 1},
.VSTRH = { 1658, 1},
.VSTRW = { 1659, 1},
.VSTRD = { 1660, 1},
.VLD20 = { 1661, 1},
.VLD21 = { 1662, 1},
.VLD40 = { 1663, 1},
.VLD41 = { 1664, 1},
.VLD42 = { 1665, 1},
.VLD43 = { 1666, 1},
.VST20 = { 1667, 1},
.VST21 = { 1668, 1},
.VST40 = { 1669, 1},
.VST41 = { 1670, 1},
.VST42 = { 1671, 1},
.VST43 = { 1672, 1},
._COUNT = { 1673, 0},
.VMLSV = { 1606, 1},
.VMLSVA = { 1607, 1},
.VCMUL = { 1608, 1},
.VHCADD = { 1609, 1},
.VBRSR = { 1610, 1},
.VSHLC = { 1611, 1},
.VDDUP = { 1612, 1},
.VIDUP = { 1613, 1},
.VDWDUP = { 1614, 1},
.VIWDUP = { 1615, 1},
.VMOVNB = { 1616, 1},
.VMOVNT = { 1617, 1},
.VQMOVNB = { 1618, 1},
.VQMOVNT = { 1619, 1},
.VQMOVUNB = { 1620, 1},
.VQMOVUNT = { 1621, 1},
.VSHLLB = { 1622, 1},
.VSHLLT = { 1623, 1},
.VMULLB = { 1624, 1},
.VMULLT = { 1625, 1},
.VMLALB = { 1626, 1},
.VMLALT = { 1627, 1},
.VMLSLB = { 1628, 1},
.VMLSLT = { 1629, 1},
.VSHRNB = { 1630, 1},
.VSHRNT = { 1631, 1},
.VRSHRNB = { 1632, 1},
.VRSHRNT = { 1633, 1},
.VQSHRNB = { 1634, 1},
.VQSHRNT = { 1635, 1},
.VQRSHRNB = { 1636, 1},
.VQRSHRNT = { 1637, 1},
.VQSHRUNB = { 1638, 1},
.VQSHRUNT = { 1639, 1},
.VQRSHRUNB = { 1640, 1},
.VQRSHRUNT = { 1641, 1},
.VMOV_Q_R = { 1642, 1},
.VMOV_R_Q = { 1643, 1},
.VMOV_2GPR_Q = { 1644, 1},
.VQDMLADH = { 1645, 1},
.VQDMLADHX = { 1646, 1},
.VQDMLSDH = { 1647, 1},
.VQDMLSDHX = { 1648, 1},
.VQRDMLADH = { 1649, 1},
.VQRDMLADHX = { 1650, 1},
.VQRDMLSDH = { 1651, 1},
.VQRDMLSDHX = { 1652, 1},
.VHCADD_SAT = { 1653, 1},
.VCMLA_MVE = { 1654, 1},
.VLDRB = { 1655, 1},
.VLDRH = { 1656, 1},
.VLDRW = { 1657, 1},
.VLDRD = { 1658, 1},
.VSTRB = { 1659, 1},
.VSTRH = { 1660, 1},
.VSTRW = { 1661, 1},
.VSTRD = { 1662, 1},
.VLD20 = { 1663, 1},
.VLD21 = { 1664, 1},
.VLD40 = { 1665, 1},
.VLD41 = { 1666, 1},
.VLD42 = { 1667, 1},
.VLD43 = { 1668, 1},
.VST20 = { 1669, 1},
.VST21 = { 1670, 1},
.VST40 = { 1671, 1},
.VST41 = { 1672, 1},
.VST42 = { 1673, 1},
.VST43 = { 1674, 1},
._COUNT = { 1675, 0},
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 11 KiB

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.3 KiB

After

Width:  |  Height:  |  Size: 3.3 KiB