rexcode/arm64: NEON shift-by-immediate encode forms + encoder extension

First encoder-extension family. Adds Operand_Type VEC_SHIFT and Operand_Encoding NEON_SHL_IMM/NEON_SHR_IMM: the element-size marker bit sits in the entry's bits, the encoder packs the amount into the low immh:immb bits (left = shift; right = esize - shift, esize from the vector operand via vec_esize/form.ops[0]), and the decoder recovers esize from immh to compute the amount.

Adds 13 mnemonics (91 forms) via specgen: left SHL/SLI/SQSHLU/SQSHL, right SSHR/USHR/SRSHR/URSHR/SSRA/USRA/SRSRA/URSRA/SRI. specgen derives bits/mask empirically by varying registers AND the shift (canon = operand bits zero; other extreme sets all shift bits), so per-arrangement immh discrimination + the growing shift-field width fall out automatically.

Verified end-to-end: encode matches llvm-mc byte-for-byte AND decode recovers mnemonic + amount (sshr/shl/sli/ushr/srsra across B/H/S/D); arm64 check + 461 tests pass.

First of the encoder-extension phase ([[rexcode-encode-coverage]]); CCMP_IMM imm5@20:16 pattern generalizes here.
This commit is contained in:
Brendan Punsky
2026-06-16 02:19:03 -04:00
committed by Flāvius
parent ff3a1acdc7
commit e52953c7ff
12 changed files with 1278 additions and 826 deletions

View File

@@ -199,6 +199,18 @@ extract_operand_inline :: #force_inline proc "contextless" (
case .BARRIER_FIELD:
return Operand{immediate = i64((word >> 8) & 0xF), kind = .IMMEDIATE, size = 1}
// ---- NEON shift-by-immediate: recover the amount from immh:immb ---------
case .NEON_SHL_IMM, .NEON_SHR_IMM:
	// immh:immb is the 7-bit field at bits 22:16. The position of the highest
	// set bit of immh (bits 22:19) selects the element size.
	field := i64((word >> 16) & 0x7F)
	marker := (word >> 19) & 0xF
	lane_bits: i64
	switch {
	case marker >= 8: lane_bits = 64
	case marker >= 4: lane_bits = 32
	case marker >= 2: lane_bits = 16
	case:             lane_bits = 8
	}
	// Left shifts store esize + shift; right shifts store 2*esize - shift.
	shift := 2 * lane_bits - field if en == .NEON_SHR_IMM else field - lane_bits
	return Operand{immediate = shift, kind = .IMMEDIATE, size = 1}
// ---- Memory operand variants ------------------------------------------
case .OFFSET_BASE_U12:
size := u32(1) << ((word >> 30) & 0x3)

View File

@@ -247,7 +247,7 @@ operand_matches_inline :: #force_inline proc "contextless" (
return op.kind == .IMMEDIATE
case .IMM_12, .IMM_16, .IMM_8, .IMM_6, .IMM_5, .IMM_4, .IMM_3, .IMM_2,
.NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE:
.NZCV_IMM, .SYS_REG, .HW_SHIFT, .LSE_SIZE, .VEC_SHIFT:
return op.kind == .IMMEDIATE
case .BITMASK_IMM:
// The user passes the raw logical mask value; we validate that it
@@ -268,6 +268,17 @@ operand_matches_inline :: #force_inline proc "contextless" (
// =============================================================================
// Element size in bits for a NEON vector arrangement operand type.
//
// Inputs:  ot - operand type of a vector register operand.
// Returns: 8, 16, 32 or 64 for the arrangements listed below. Any operand
//          type that is not listed falls back to 8, so callers are expected
//          to pass a vector arrangement.
@(private="file")
vec_esize :: #force_inline proc "contextless" (ot: Operand_Type) -> u32 {
	#partial switch ot {
	case .V_8B, .V_16B:                        return 8
	case .V_4H, .V_8H, .V_4H_FP16, .V_8H_FP16: return 16
	case .V_2S, .V_4S:                         return 32
	case .V_1D, .V_2D:                         return 64
	}
	return 8
}

// An attribute binds to the next declaration only. This one keeps the
// declaration that follows file-private, as it was before vec_esize was
// placed above it.
@(private="file")
pack_operand_inline :: #force_inline proc(
op: ^Operand,
enc: Operand_Encoding,
@@ -422,6 +433,14 @@ pack_operand_inline :: #force_inline proc(
case .VA:
return (u32(reg_hw(op.reg)) & 0x1F) << 10
// NEON shift-by-immediate: the element-size marker is already in `bits`;
// the operand drives only the immh:immb bits *below* that marker, i.e. the
// low log2(esize) bits of the field that starts at bit 16. The amount is
// therefore masked with esize-1 (not a fixed 0x3F): for every in-range
// amount the result is unchanged, while an out-of-range amount can no longer
// spill into the marker bits and silently turn the instruction into a
// different arrangement. The amount itself is not range-checked here.
case .NEON_SHL_IMM:
	// Left shifts store the amount directly (valid range 0 ..= esize-1).
	amt_mask := vec_esize(form.ops[0]) - 1
	return (u32(op.immediate) & amt_mask) << 16
case .NEON_SHR_IMM:
	// Right shifts store esize - shift (valid range 1 ..= esize, so the
	// stored value is 0 ..= esize-1).
	esize := vec_esize(form.ops[0])
	return ((esize - u32(op.immediate)) & (esize - 1)) << 16
// NEON MOVI/FMOV immediate split: abc at bits 18-16, defgh at bits 9-5.
case .NEON_IMM8_FMOV:
v := u32(op.immediate) & 0xFF

View File

@@ -165,6 +165,10 @@ Operand_Type :: enum u8 {
// ---- Condition code ----
COND,
// ---- NEON shift-by-immediate amount (encoded into immh:immb together
// with the element size: left = esize+shift, right = 2*esize-shift) ----
VEC_SHIFT,
}
// Where each operand's bits land in the 32-bit word.
@@ -223,6 +227,12 @@ Operand_Encoding :: enum u8 {
NEON_INDEX_S, // S lane index
NEON_INDEX_D, // D lane index
// ---- NEON shift-by-immediate (immh:immb at bits 22:16) ----
// The element-size marker bit is fixed in the entry's `bits`; the operand
// drives only the low bits. Left: low = shift. Right: low = esize - shift.
NEON_SHL_IMM,
NEON_SHR_IMM,
// ---- LSE atomics ------------------------------------------------------
ATOMIC_RS, // Rs (source / compare) at bits 16-20
ATOMIC_RT, // Rt (target) at bits 0-4

View File

@@ -1065,14 +1065,40 @@ inst_bif_r_r_r :: #force_inline proc "contextless" (dst: Regist
emit_bif_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_bif_r_r_r(dst, src, src2)) }
inst_bsl_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .BSL, operand_count = 3, length = 4, ops = {op_v_16b(u8(reg_hw(dst))), op_v_16b(u8(reg_hw(src))), op_v_16b(u8(reg_hw(src2))), {}}} }
emit_bsl_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_bsl_r_r_r(dst, src, src2)) }
// ---- NEON shift builders ----------------------------------------------------
// Each `inst_*` proc returns a 4-byte, 3-operand Instruction for its mnemonic;
// the matching `emit_*` proc appends that Instruction to `instructions`.
// Register operands are wrapped with op_v_8b (presumably the .8B arrangement -
// confirm against op_v_8b). The `_r_r_i` variants take the shift amount as
// `imm` and wrap it with op_imm(imm, 4); no range check is done here. The
// `_r_r_r` variants take the shift from a third vector register instead.
inst_shl_v_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SHL_V, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_shl_v_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_shl_v_r_r_i(dst, src, imm)) }
inst_sqshl_v_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SQSHL_V, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_sqshl_v_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sqshl_v_r_r_i(dst, src, imm)) }
inst_sqshlu_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SQSHLU, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_sqshlu_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sqshlu_r_r_i(dst, src, imm)) }
inst_srshl_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SRSHL, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), {}}} }
emit_srshl_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_srshl_r_r_r(dst, src, src2)) }
inst_urshl_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .URSHL, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), {}}} }
emit_urshl_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_urshl_r_r_r(dst, src, src2)) }
inst_sshr_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SSHR, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_sshr_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sshr_r_r_i(dst, src, imm)) }
inst_ushr_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .USHR, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_ushr_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_ushr_r_r_i(dst, src, imm)) }
inst_ssra_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SSRA, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_ssra_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_ssra_r_r_i(dst, src, imm)) }
inst_usra_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .USRA, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_usra_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_usra_r_r_i(dst, src, imm)) }
inst_srshr_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SRSHR, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_srshr_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_srshr_r_r_i(dst, src, imm)) }
inst_urshr_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .URSHR, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_urshr_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_urshr_r_r_i(dst, src, imm)) }
inst_srsra_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SRSRA, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_srsra_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_srsra_r_r_i(dst, src, imm)) }
inst_ursra_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .URSRA, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_ursra_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_ursra_r_r_i(dst, src, imm)) }
inst_sshl_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .SSHL, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), {}}} }
emit_sshl_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_sshl_r_r_r(dst, src, src2)) }
inst_ushl_r_r_r :: #force_inline proc "contextless" (dst: Register, src: Register, src2: Register) -> Instruction { return Instruction{mnemonic = .USHL, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_v_8b(u8(reg_hw(src2))), {}}} }
emit_ushl_r_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, src2: Register) { append(instructions, inst_ushl_r_r_r(dst, src, src2)) }
inst_sli_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SLI, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_sli_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sli_r_r_i(dst, src, imm)) }
inst_sri_r_r_i :: #force_inline proc "contextless" (dst: Register, src: Register, imm: i64) -> Instruction { return Instruction{mnemonic = .SRI, operand_count = 3, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), op_imm(imm, 4), {}}} }
emit_sri_r_r_i :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register, imm: i64) { append(instructions, inst_sri_r_r_i(dst, src, imm)) }
inst_not_v_r_r :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .NOT_V, operand_count = 2, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), {}, {}}} }
emit_not_v_r_r :: #force_inline proc(instructions: ^[dynamic]Instruction, dst: Register, src: Register) { append(instructions, inst_not_v_r_r(dst, src)) }
inst_rbit_v_r_r :: #force_inline proc "contextless" (dst: Register, src: Register) -> Instruction { return Instruction{mnemonic = .RBIT_V, operand_count = 2, length = 4, ops = {op_v_8b(u8(reg_hw(dst))), op_v_8b(u8(reg_hw(src))), {}, {}}} }
@@ -2944,14 +2970,40 @@ inst_bif :: inst_bif_r_r_r
emit_bif :: emit_bif_r_r_r
inst_bsl :: inst_bsl_r_r_r
emit_bsl :: emit_bsl_r_r_r
// Short aliases for the NEON shift builders: each un-suffixed name is bound
// to the operand-shape-suffixed builder of the same mnemonic (`_r_r_i` for
// shift-by-immediate, `_r_r_r` for shift-by-register).
inst_shl_v :: inst_shl_v_r_r_i
emit_shl_v :: emit_shl_v_r_r_i
inst_sqshl_v :: inst_sqshl_v_r_r_i
emit_sqshl_v :: emit_sqshl_v_r_r_i
inst_sqshlu :: inst_sqshlu_r_r_i
emit_sqshlu :: emit_sqshlu_r_r_i
inst_srshl :: inst_srshl_r_r_r
emit_srshl :: emit_srshl_r_r_r
inst_urshl :: inst_urshl_r_r_r
emit_urshl :: emit_urshl_r_r_r
inst_sshr :: inst_sshr_r_r_i
emit_sshr :: emit_sshr_r_r_i
inst_ushr :: inst_ushr_r_r_i
emit_ushr :: emit_ushr_r_r_i
inst_ssra :: inst_ssra_r_r_i
emit_ssra :: emit_ssra_r_r_i
inst_usra :: inst_usra_r_r_i
emit_usra :: emit_usra_r_r_i
inst_srshr :: inst_srshr_r_r_i
emit_srshr :: emit_srshr_r_r_i
inst_urshr :: inst_urshr_r_r_i
emit_urshr :: emit_urshr_r_r_i
inst_srsra :: inst_srsra_r_r_i
emit_srsra :: emit_srsra_r_r_i
inst_ursra :: inst_ursra_r_r_i
emit_ursra :: emit_ursra_r_r_i
inst_sshl :: inst_sshl_r_r_r
emit_sshl :: emit_sshl_r_r_r
inst_ushl :: inst_ushl_r_r_r
emit_ushl :: emit_ushl_r_r_r
inst_sli :: inst_sli_r_r_i
emit_sli :: emit_sli_r_r_i
inst_sri :: inst_sri_r_r_i
emit_sri :: emit_sri_r_r_i
inst_not_v :: inst_not_v_r_r
emit_not_v :: emit_not_v_r_r
inst_rbit_v :: inst_rbit_v_r_r

View File

@@ -4009,5 +4009,124 @@ ENCODING_TABLE := #partial [Mnemonic][]Encoding{
.FCVTXN2 = {
{.FCVTXN2, {.V_4S, .V_2D, .NONE, .NONE}, {.VD, .VN, .NONE, .NONE}, 0x6E616800, 0xFFFFFC00, .NEON, {}},
},
// Advanced SIMD shift by immediate.
.SHL_V = {
{.SHL_V, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x0F085400, 0xFFF8FC00, .NEON, {}},
{.SHL_V, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x4F085400, 0xFFF8FC00, .NEON, {}},
{.SHL_V, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x0F105400, 0xFFF0FC00, .NEON, {}},
{.SHL_V, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x4F105400, 0xFFF0FC00, .NEON, {}},
{.SHL_V, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x0F205400, 0xFFE0FC00, .NEON, {}},
{.SHL_V, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x4F205400, 0xFFE0FC00, .NEON, {}},
{.SHL_V, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x4F405400, 0xFFC0FC00, .NEON, {}},
},
.SLI = {
{.SLI, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x2F085400, 0xFFF8FC00, .NEON, {}},
{.SLI, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x6F085400, 0xFFF8FC00, .NEON, {}},
{.SLI, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x2F105400, 0xFFF0FC00, .NEON, {}},
{.SLI, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x6F105400, 0xFFF0FC00, .NEON, {}},
{.SLI, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x2F205400, 0xFFE0FC00, .NEON, {}},
{.SLI, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x6F205400, 0xFFE0FC00, .NEON, {}},
{.SLI, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x6F405400, 0xFFC0FC00, .NEON, {}},
},
.SQSHLU = {
{.SQSHLU, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x2F086400, 0xFFF8FC00, .NEON, {}},
{.SQSHLU, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x6F086400, 0xFFF8FC00, .NEON, {}},
{.SQSHLU, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x2F106400, 0xFFF0FC00, .NEON, {}},
{.SQSHLU, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x6F106400, 0xFFF0FC00, .NEON, {}},
{.SQSHLU, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x2F206400, 0xFFE0FC00, .NEON, {}},
{.SQSHLU, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x6F206400, 0xFFE0FC00, .NEON, {}},
{.SQSHLU, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x6F406400, 0xFFC0FC00, .NEON, {}},
},
.SQSHL_V = {
{.SQSHL_V, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x0F087400, 0xFFF8FC00, .NEON, {}},
{.SQSHL_V, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x4F087400, 0xFFF8FC00, .NEON, {}},
{.SQSHL_V, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x0F107400, 0xFFF0FC00, .NEON, {}},
{.SQSHL_V, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x4F107400, 0xFFF0FC00, .NEON, {}},
{.SQSHL_V, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x0F207400, 0xFFE0FC00, .NEON, {}},
{.SQSHL_V, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x4F207400, 0xFFE0FC00, .NEON, {}},
{.SQSHL_V, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHL_IMM, .NONE}, 0x4F407400, 0xFFC0FC00, .NEON, {}},
},
.SSHR = {
{.SSHR, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F080400, 0xFFF8FC00, .NEON, {}},
{.SSHR, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F080400, 0xFFF8FC00, .NEON, {}},
{.SSHR, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F100400, 0xFFF0FC00, .NEON, {}},
{.SSHR, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F100400, 0xFFF0FC00, .NEON, {}},
{.SSHR, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F200400, 0xFFE0FC00, .NEON, {}},
{.SSHR, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F200400, 0xFFE0FC00, .NEON, {}},
{.SSHR, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F400400, 0xFFC0FC00, .NEON, {}},
},
.USHR = {
{.USHR, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F080400, 0xFFF8FC00, .NEON, {}},
{.USHR, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F080400, 0xFFF8FC00, .NEON, {}},
{.USHR, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F100400, 0xFFF0FC00, .NEON, {}},
{.USHR, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F100400, 0xFFF0FC00, .NEON, {}},
{.USHR, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F200400, 0xFFE0FC00, .NEON, {}},
{.USHR, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F200400, 0xFFE0FC00, .NEON, {}},
{.USHR, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F400400, 0xFFC0FC00, .NEON, {}},
},
.SRSHR = {
{.SRSHR, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F082400, 0xFFF8FC00, .NEON, {}},
{.SRSHR, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F082400, 0xFFF8FC00, .NEON, {}},
{.SRSHR, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F102400, 0xFFF0FC00, .NEON, {}},
{.SRSHR, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F102400, 0xFFF0FC00, .NEON, {}},
{.SRSHR, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F202400, 0xFFE0FC00, .NEON, {}},
{.SRSHR, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F202400, 0xFFE0FC00, .NEON, {}},
{.SRSHR, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F402400, 0xFFC0FC00, .NEON, {}},
},
.URSHR = {
{.URSHR, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F082400, 0xFFF8FC00, .NEON, {}},
{.URSHR, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F082400, 0xFFF8FC00, .NEON, {}},
{.URSHR, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F102400, 0xFFF0FC00, .NEON, {}},
{.URSHR, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F102400, 0xFFF0FC00, .NEON, {}},
{.URSHR, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F202400, 0xFFE0FC00, .NEON, {}},
{.URSHR, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F202400, 0xFFE0FC00, .NEON, {}},
{.URSHR, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F402400, 0xFFC0FC00, .NEON, {}},
},
.SSRA = {
{.SSRA, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F081400, 0xFFF8FC00, .NEON, {}},
{.SSRA, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F081400, 0xFFF8FC00, .NEON, {}},
{.SSRA, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F101400, 0xFFF0FC00, .NEON, {}},
{.SSRA, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F101400, 0xFFF0FC00, .NEON, {}},
{.SSRA, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F201400, 0xFFE0FC00, .NEON, {}},
{.SSRA, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F201400, 0xFFE0FC00, .NEON, {}},
{.SSRA, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F401400, 0xFFC0FC00, .NEON, {}},
},
.USRA = {
{.USRA, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F081400, 0xFFF8FC00, .NEON, {}},
{.USRA, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F081400, 0xFFF8FC00, .NEON, {}},
{.USRA, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F101400, 0xFFF0FC00, .NEON, {}},
{.USRA, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F101400, 0xFFF0FC00, .NEON, {}},
{.USRA, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F201400, 0xFFE0FC00, .NEON, {}},
{.USRA, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F201400, 0xFFE0FC00, .NEON, {}},
{.USRA, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F401400, 0xFFC0FC00, .NEON, {}},
},
.SRSRA = {
{.SRSRA, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F083400, 0xFFF8FC00, .NEON, {}},
{.SRSRA, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F083400, 0xFFF8FC00, .NEON, {}},
{.SRSRA, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F103400, 0xFFF0FC00, .NEON, {}},
{.SRSRA, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F103400, 0xFFF0FC00, .NEON, {}},
{.SRSRA, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x0F203400, 0xFFE0FC00, .NEON, {}},
{.SRSRA, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F203400, 0xFFE0FC00, .NEON, {}},
{.SRSRA, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x4F403400, 0xFFC0FC00, .NEON, {}},
},
.URSRA = {
{.URSRA, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F083400, 0xFFF8FC00, .NEON, {}},
{.URSRA, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F083400, 0xFFF8FC00, .NEON, {}},
{.URSRA, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F103400, 0xFFF0FC00, .NEON, {}},
{.URSRA, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F103400, 0xFFF0FC00, .NEON, {}},
{.URSRA, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F203400, 0xFFE0FC00, .NEON, {}},
{.URSRA, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F203400, 0xFFE0FC00, .NEON, {}},
{.URSRA, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F403400, 0xFFC0FC00, .NEON, {}},
},
.SRI = {
{.SRI, {.V_8B, .V_8B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F084400, 0xFFF8FC00, .NEON, {}},
{.SRI, {.V_16B, .V_16B, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F084400, 0xFFF8FC00, .NEON, {}},
{.SRI, {.V_4H, .V_4H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F104400, 0xFFF0FC00, .NEON, {}},
{.SRI, {.V_8H, .V_8H, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F104400, 0xFFF0FC00, .NEON, {}},
{.SRI, {.V_2S, .V_2S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x2F204400, 0xFFE0FC00, .NEON, {}},
{.SRI, {.V_4S, .V_4S, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F204400, 0xFFE0FC00, .NEON, {}},
{.SRI, {.V_2D, .V_2D, .VEC_SHIFT, .NONE}, {.VD, .VN, .NEON_SHR_IMM, .NONE}, 0x6F404400, 0xFFC0FC00, .NEON, {}},
},
// SPECGEN:END
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -231,6 +231,51 @@ for _, fam in ipairs(DIFF) do
sections[#sections+1] = "\t// Advanced SIMD "..fam.title..".\n" .. table.concat(blk, "\n")
end
-- ---- NEON shift-by-immediate ----------------------------------------------
-- Syntax: <mnem> Vd.T, Vn.T, #shift. The immh:immb field carries both the
-- element size and the amount. The mask is derived empirically by varying the
-- shift as well as the registers: the canonical probe has all operand bits
-- zero, the other probe has every shift bit set. The encoder/decoder compute
-- the amount themselves (NEON_SHL_IMM / NEON_SHR_IMM).

-- Element size in bits, keyed by arrangement name.
local ESIZE = {
	["8B"] = 8,  ["16B"] = 8,
	["4H"] = 16, ["8H"] = 16,
	["2S"] = 32, ["4S"] = 32,
	["1D"] = 64, ["2D"] = 64,
}

-- Arrangements probed for every shift mnemonic, in output order.
local SHIFT_ARR = { "8B", "16B", "4H", "8H", "2S", "4S", "2D" }
--- Build the table block for one NEON shift-by-immediate mnemonic.
-- For every arrangement in SHIFT_ARR three assembly lines are passed to
-- `word` (defined elsewhere; presumably returns the 32-bit encoding, or a
-- falsy value when the line cannot be assembled - confirm): the canonical
-- probe, one with register 31 in both operands, and one with the opposite
-- shift extreme. Every bit that differs from the canonical word is treated as
-- operand-driven and cleared from the mask.
-- Side effects: bumps `n_forms` per emitted row and `n_mnem` once per
-- mnemonic; appends "<mnem> .<arr>" to `skips` for each failed arrangement.
-- @param mnem mnemonic name as written into the table (e.g. "SHL_V")
-- @param llvm assembler spelling used in the probe lines (e.g. "shl")
-- @param dir  "L" for a left shift; any other value is handled as right
-- @return the formatted "\t.MNEM = {...}," block, or nil if no row was built
local function emit_shift(mnem, llvm, dir)
	local is_left = (dir == "L")
	local enc_tok = ".NEON_SHR_IMM"
	if is_left then enc_tok = ".NEON_SHL_IMM" end

	local rows = {}
	for i = 1, #SHIFT_ARR do
		local a = SHIFT_ARR[i]
		local es = ESIZE[a]
		-- Probe amounts: `canon` leaves the shift operand bits all zero,
		-- `other` sets all of them.
		local canon, other
		if is_left then
			canon, other = 0, es - 1
		else
			canon, other = es, 1
		end
		local sa = ARR[a].asm
		local function mk(r, sh)
			return string.format("%s v%d.%s, v%d.%s, #%d", llvm, r, sa, r, sa, sh)
		end
		local bits = word(mk(0, canon))
		local regV = word(mk(31, canon))
		local shV = word(mk(0, other))
		if not (bits and regV and shV) then
			skips[#skips+1] = mnem.." ."..a
		else
			-- Bits that moved with the registers or with the shift amount.
			local varying = bit.bor(bit.bxor(bits, regV), bit.bxor(bits, shV))
			local mask = bit.band(bit.bnot(varying), 0xFFFFFFFF)
			rows[#rows+1] = string.format(
				"\t\t{.%s, {.%s, .%s, .VEC_SHIFT, .NONE}, {.VD, .VN, %s, .NONE}, 0x%s, 0x%s, .NEON, {}},",
				mnem, ARR[a].vt, ARR[a].vt, enc_tok, bit.tohex(bits):upper(), bit.tohex(mask):upper())
			n_forms = n_forms + 1
		end
	end

	if #rows == 0 then return nil end
	n_mnem = n_mnem + 1
	return string.format("\t.%s = {\n%s\n\t},", mnem, table.concat(rows, "\n"))
end
-- Shift-by-immediate mnemonics to generate, one entry per mnemonic:
-- { table mnemonic, assembler spelling, direction ("L" = left, "R" = right) }.
local SHIFTS = {
	-- left shifts
	{"SHL_V", "shl", "L"},
	{"SLI", "sli", "L"},
	{"SQSHLU", "sqshlu", "L"},
	{"SQSHL_V", "sqshl", "L"},
	-- right shifts
	{"SSHR", "sshr", "R"},
	{"USHR", "ushr", "R"},
	{"SRSHR", "srshr", "R"},
	{"URSHR", "urshr", "R"},
	{"SSRA", "ssra", "R"},
	{"USRA", "usra", "R"},
	{"SRSRA", "srsra", "R"},
	{"URSRA", "ursra", "R"},
	{"SRI", "sri", "R"},
}

-- Generate one block per mnemonic and append them as a single section.
-- Mnemonics for which emit_shift returns nil are left out.
do
	local entries = {}
	for i = 1, #SHIFTS do
		local spec = SHIFTS[i]
		local entry = emit_shift(spec[1], spec[2], spec[3])
		if entry then
			entries[#entries+1] = entry
		end
	end
	local body = table.concat(entries, "\n")
	sections[#sections+1] = "\t// Advanced SIMD shift by immediate.\n" .. body
end
-- ---- splice into the SoT ---------------------------------------------------
local region = "\t// SPECGEN:BEGIN\n" .. table.concat(sections, "\n\n") .. "\n\t// SPECGEN:END"
local fh = assert(io.open(TABLE, "r")); local src = fh:read("*a"); fh:close()