Better Neon transpose

vld can de-interleave structures of up to 4 float elements into multiple registers in one go. This allows us to load each column of the matrix and put each element into a corresponding vector, essentially doing the entire transpose in 1 instruction. Fixed it.
2026-02-26 09:04:56 +00:00 · 2023-12-11 23:53:04 +00:00
parent 5f20d693c9
commit 133a595b6f
1 changed files with 7 additions and 12 deletions
--- a/HandmadeMath.h
+++ b/HandmadeMath.h
@@ -1517,21 +1517,16 @@ static inline HMM_Mat4 HMM_TransposeM4(HMM_Mat4 Matrix)
 {
    ASSERT_COVERED(HMM_TransposeM4);

-    HMM_Mat4 Result = Matrix;
+    HMM_Mat4 Result;
 #ifdef HANDMADE_MATH__USE_SSE
+    Result = Matrix;
    _MM_TRANSPOSE4_PS(Result.Columns[0].SSE, Result.Columns[1].SSE, Result.Columns[2].SSE, Result.Columns[3].SSE);
 #elif HANDMADE_MATH__USE_NEON
-    // Based on Fabian's article on SIMD Transposes, although he says that the interleave32 is
-    // a VUZP instruction, its actually a VZIP instruction
-    // https://fgiesen.wordpress.com/2013/07/09/simd-transposes-1/
-    float32x4x2_t B0B2 = vzipq_f32(Result.Columns[0].NEON, Result.Columns[2].NEON);
-    float32x4x2_t B1B3 = vzipq_f32(Result.Columns[1].NEON, Result.Columns[3].NEON);
-    float32x4x2_t XY = vzipq_f32(B0B2.val[0], B1B3.val[0]);
-    float32x4x2_t ZW = vzipq_f32(B0B2.val[1], B1B3.val[1]);
-    Result.Columns[0].NEON = XY.val[0];
-    Result.Columns[1].NEON = XY.val[1];
-    Result.Columns[2].NEON = ZW.val[0];
-    Result.Columns[3].NEON = ZW.val[1];
+    float32x4x4_t Transposed = vld4q_f32((float*)Matrix.Columns);
+    Result.Columns[0].NEON = Transposed.val[0];
+    Result.Columns[1].NEON = Transposed.val[1];
+    Result.Columns[2].NEON = Transposed.val[2];
+    Result.Columns[3].NEON = Transposed.val[3];
 #else
    Result.Elements[0][1] = Matrix.Elements[1][0];
    Result.Elements[0][2] = Matrix.Elements[2][0];