diff --git a/HandmadeMath.h b/HandmadeMath.h index c384272..36593a1 100644 --- a/HandmadeMath.h +++ b/HandmadeMath.h @@ -105,10 +105,15 @@ # define ASSERT_COVERED(a) #endif +#ifdef HANDMADE_MATH_NO_SSE +# warning "HANDMADE_MATH_NO_SSE is depricated, use HANDMADE_MATH_NO_SIMD instead" +# define HANDMADE_MATH_NO_SIMD +#endif + /* let's figure out if SSE is really available (unless disabled anyway) (it isn't on non-x86/x86_64 platforms or even x86 without explicit SSE support) => only use "#ifdef HANDMADE_MATH__USE_SSE" to check for SSE support below this block! */ -#ifndef HANDMADE_MATH_NO_SSE +#ifndef HANDMADE_MATH_NO_SIMD # ifdef _MSC_VER /* MSVC supports SSE in amd64 mode or _M_IX86_FP >= 1 (2 means SSE2) */ # if defined(_M_AMD64) || ( defined(_M_IX86_FP) && _M_IX86_FP >= 1 ) # define HANDMADE_MATH__USE_SSE 1 @@ -118,7 +123,10 @@ # define HANDMADE_MATH__USE_SSE 1 # endif /* __SSE__ */ # endif /* not _MSC_VER */ -#endif /* #ifndef HANDMADE_MATH_NO_SSE */ +# ifdef __ARM_NEON +# define HANDMADE_MATH__USE_NEON 1 +# endif /* NEON Supported */ +#endif /* #ifndef HANDMADE_MATH_NO_SIMD */ #if (!defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) # define HANDMADE_MATH__USE_C11_GENERICS 1 @@ -128,6 +136,10 @@ # include #endif +#ifdef HANDMADE_MATH__USE_NEON +# include +#endif + #ifdef _MSC_VER #pragma warning(disable:4201) #endif @@ -354,6 +366,10 @@ typedef union HMM_Vec4 __m128 SSE; #endif +#ifdef HANDMADE_MATH__USE_NEON + float32x4_t NEON; +#endif + #ifdef __cplusplus inline float &operator[](int Index) { return Elements[Index]; } inline const float &operator[](int Index) const { return Elements[Index]; } @@ -414,6 +430,9 @@ typedef union HMM_Quat #ifdef HANDMADE_MATH__USE_SSE __m128 SSE; #endif +#ifdef HANDMADE_MATH__USE_NEON + float32x4_t NEON; +#endif } HMM_Quat; typedef signed int HMM_Bool; @@ -503,6 +522,10 @@ static inline float HMM_SqrtF(float Float) __m128 In = _mm_set_ss(Float); __m128 Out = _mm_sqrt_ss(In); Result = _mm_cvtss_f32(Out); +#elif HANDMADE_MATH__USE_NEON + float32x4_t In = vdupq_n_f32(Float); + float32x4_t Out = vsqrtq_f32(In); + Result = vgetq_lane_f32(Out, 0); #else Result = HMM_SQRTF(Float); #endif @@ -593,6 +616,9 @@ static inline HMM_Vec4 HMM_V4(float X, float Y, float Z, float W) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_setr_ps(X, Y, Z, W); +#elif HANDMADE_MATH__USE_NEON + float32x4_t v = {X, Y, Z, W}; + Result.NEON = v; #else Result.X = X; Result.Y = Y; @@ -612,6 +638,9 @@ static inline HMM_Vec4 HMM_V4V(HMM_Vec3 Vector, float W) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_setr_ps(Vector.X, Vector.Y, Vector.Z, W); +#elif HANDMADE_MATH__USE_NEON + float32x4_t v = {Vector.X, Vector.Y, Vector.Z, W}; + Result.NEON = v; #else Result.XYZ = Vector; Result.W = W; @@ -659,6 +688,8 @@ static inline HMM_Vec4 HMM_AddV4(HMM_Vec4 Left, HMM_Vec4 Right) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_add_ps(Left.SSE, Right.SSE); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vaddq_f32(Left.NEON, Right.NEON); #else Result.X = Left.X + Right.X; Result.Y = Left.Y + Right.Y; @@ -703,6 +734,8 @@ static inline HMM_Vec4 HMM_SubV4(HMM_Vec4 Left, HMM_Vec4 Right) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_sub_ps(Left.SSE, Right.SSE); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vsubq_f32(Left.NEON, Right.NEON); #else Result.X = Left.X - Right.X; Result.Y = Left.Y - Right.Y; @@ -772,6 +805,8 @@ static inline HMM_Vec4 HMM_MulV4(HMM_Vec4 Left, HMM_Vec4 Right) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_mul_ps(Left.SSE, Right.SSE); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vmulq_f32(Left.NEON, Right.NEON); #else Result.X = Left.X * Right.X; Result.Y = Left.Y * Right.Y; @@ -792,6 +827,8 @@ static inline HMM_Vec4 HMM_MulV4F(HMM_Vec4 Left, float Right) #ifdef HANDMADE_MATH__USE_SSE __m128 Scalar = _mm_set1_ps(Right); Result.SSE = _mm_mul_ps(Left.SSE, Scalar); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vmulq_n_f32(Left.NEON, Right); #else Result.X = Left.X * Right; Result.Y = Left.Y * Right; @@ -861,6 +898,8 @@ static inline HMM_Vec4 HMM_DivV4(HMM_Vec4 Left, HMM_Vec4 Right) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_div_ps(Left.SSE, Right.SSE); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vdivq_f32(Left.NEON, Right.NEON); #else Result.X = Left.X / Right.X; Result.Y = Left.Y / Right.Y; @@ -881,6 +920,9 @@ static inline HMM_Vec4 HMM_DivV4F(HMM_Vec4 Left, float Right) #ifdef HANDMADE_MATH__USE_SSE __m128 Scalar = _mm_set1_ps(Right); Result.SSE = _mm_div_ps(Left.SSE, Scalar); +#elif HANDMADE_MATH__USE_NEON + float32x4_t Scalar = vdupq_n_f32(Right); + Result.NEON = vdivq_f32(Left.NEON, Scalar); #else Result.X = Left.X / Right; Result.Y = Left.Y / Right; @@ -943,6 +985,11 @@ static inline float HMM_DotV4(HMM_Vec4 Left, HMM_Vec4 Right) SSEResultTwo = _mm_shuffle_ps(SSEResultOne, SSEResultOne, _MM_SHUFFLE(0, 1, 2, 3)); SSEResultOne = _mm_add_ps(SSEResultOne, SSEResultTwo); _mm_store_ss(&Result, SSEResultOne); +#elif HANDMADE_MATH__USE_NEON + float32x4_t NEONMultiplyResult = vmulq_f32(Left.NEON, Right.NEON); + float32x4_t NEONHalfAdd = vpaddq_f32(NEONMultiplyResult, NEONMultiplyResult); + float32x4_t NEONFullAdd = vpaddq_f32(NEONHalfAdd, NEONHalfAdd); + Result = vgetq_lane_f32(NEONFullAdd, 0); #else Result = ((Left.X * Right.X) + (Left.Z * Right.Z)) + ((Left.Y * Right.Y) + (Left.W * Right.W)); #endif @@ -1071,6 +1118,11 @@ static inline HMM_Vec4 HMM_LinearCombineV4M4(HMM_Vec4 Left, HMM_Mat4 Right) Result.SSE = _mm_add_ps(Result.SSE, _mm_mul_ps(_mm_shuffle_ps(Left.SSE, Left.SSE, 0x55), Right.Columns[1].SSE)); Result.SSE = _mm_add_ps(Result.SSE, _mm_mul_ps(_mm_shuffle_ps(Left.SSE, Left.SSE, 0xaa), Right.Columns[2].SSE)); Result.SSE = _mm_add_ps(Result.SSE, _mm_mul_ps(_mm_shuffle_ps(Left.SSE, Left.SSE, 0xff), Right.Columns[3].SSE)); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vmulq_laneq_f32(Right.Columns[0].NEON, Left.NEON, 0); + Result.NEON = vfmaq_laneq_f32(Result.NEON, Right.Columns[1].NEON, Left.NEON, 1); + Result.NEON = vfmaq_laneq_f32(Result.NEON, Right.Columns[2].NEON, Left.NEON, 2); + Result.NEON = vfmaq_laneq_f32(Result.NEON, Right.Columns[3].NEON, Left.NEON, 3); #else Result.X = Left.Elements[0] * Right.Columns[0].X; Result.Y = Left.Elements[0] * Right.Columns[0].Y; @@ -1468,6 +1520,18 @@ static inline HMM_Mat4 HMM_TransposeM4(HMM_Mat4 Matrix) HMM_Mat4 Result = Matrix; #ifdef HANDMADE_MATH__USE_SSE _MM_TRANSPOSE4_PS(Result.Columns[0].SSE, Result.Columns[1].SSE, Result.Columns[2].SSE, Result.Columns[3].SSE); +#elif HANDMADE_MATH__USE_NEON + // Based on Fabian's article on SIMD Transposes, although he says that the interleave32 is + // a VUZP instruction, its actually a VZIP instruction + // https://fgiesen.wordpress.com/2013/07/09/simd-transposes-1/ + float32x4x2_t B0B2 = vzipq_f32(Result.Columns[0].NEON, Result.Columns[2].NEON); + float32x4x2_t B1B3 = vzipq_f32(Result.Columns[1].NEON, Result.Columns[3].NEON); + float32x4x2_t XY = vzipq_f32(B0B2.val[0], B1B3.val[0]); + float32x4x2_t ZW = vzipq_f32(B0B2.val[1], B1B3.val[1]); + Result.Columns[0].NEON = XY.val[0]; + Result.Columns[1].NEON = XY.val[1]; + Result.Columns[2].NEON = ZW.val[0]; + Result.Columns[3].NEON = ZW.val[1]; #else Result.Elements[0][1] = Matrix.Elements[1][0]; Result.Elements[0][2] = Matrix.Elements[2][0]; @@ -1493,29 +1557,10 @@ static inline HMM_Mat4 HMM_AddM4(HMM_Mat4 Left, HMM_Mat4 Right) HMM_Mat4 Result; -#ifdef HANDMADE_MATH__USE_SSE - Result.Columns[0].SSE = _mm_add_ps(Left.Columns[0].SSE, Right.Columns[0].SSE); - Result.Columns[1].SSE = _mm_add_ps(Left.Columns[1].SSE, Right.Columns[1].SSE); - Result.Columns[2].SSE = _mm_add_ps(Left.Columns[2].SSE, Right.Columns[2].SSE); - Result.Columns[3].SSE = _mm_add_ps(Left.Columns[3].SSE, Right.Columns[3].SSE); -#else - Result.Elements[0][0] = Left.Elements[0][0] + Right.Elements[0][0]; - Result.Elements[0][1] = Left.Elements[0][1] + Right.Elements[0][1]; - Result.Elements[0][2] = Left.Elements[0][2] + Right.Elements[0][2]; - Result.Elements[0][3] = Left.Elements[0][3] + Right.Elements[0][3]; - Result.Elements[1][0] = Left.Elements[1][0] + Right.Elements[1][0]; - Result.Elements[1][1] = Left.Elements[1][1] + Right.Elements[1][1]; - Result.Elements[1][2] = Left.Elements[1][2] + Right.Elements[1][2]; - Result.Elements[1][3] = Left.Elements[1][3] + Right.Elements[1][3]; - Result.Elements[2][0] = Left.Elements[2][0] + Right.Elements[2][0]; - Result.Elements[2][1] = Left.Elements[2][1] + Right.Elements[2][1]; - Result.Elements[2][2] = Left.Elements[2][2] + Right.Elements[2][2]; - Result.Elements[2][3] = Left.Elements[2][3] + Right.Elements[2][3]; - Result.Elements[3][0] = Left.Elements[3][0] + Right.Elements[3][0]; - Result.Elements[3][1] = Left.Elements[3][1] + Right.Elements[3][1]; - Result.Elements[3][2] = Left.Elements[3][2] + Right.Elements[3][2]; - Result.Elements[3][3] = Left.Elements[3][3] + Right.Elements[3][3]; -#endif + Result.Columns[0] = HMM_AddV4(Left.Columns[0], Right.Columns[0]); + Result.Columns[1] = HMM_AddV4(Left.Columns[1], Right.Columns[1]); + Result.Columns[2] = HMM_AddV4(Left.Columns[2], Right.Columns[2]); + Result.Columns[3] = HMM_AddV4(Left.Columns[3], Right.Columns[3]); return Result; } @@ -1527,29 +1572,10 @@ static inline HMM_Mat4 HMM_SubM4(HMM_Mat4 Left, HMM_Mat4 Right) HMM_Mat4 Result; -#ifdef HANDMADE_MATH__USE_SSE - Result.Columns[0].SSE = _mm_sub_ps(Left.Columns[0].SSE, Right.Columns[0].SSE); - Result.Columns[1].SSE = _mm_sub_ps(Left.Columns[1].SSE, Right.Columns[1].SSE); - Result.Columns[2].SSE = _mm_sub_ps(Left.Columns[2].SSE, Right.Columns[2].SSE); - Result.Columns[3].SSE = _mm_sub_ps(Left.Columns[3].SSE, Right.Columns[3].SSE); -#else - Result.Elements[0][0] = Left.Elements[0][0] - Right.Elements[0][0]; - Result.Elements[0][1] = Left.Elements[0][1] - Right.Elements[0][1]; - Result.Elements[0][2] = Left.Elements[0][2] - Right.Elements[0][2]; - Result.Elements[0][3] = Left.Elements[0][3] - Right.Elements[0][3]; - Result.Elements[1][0] = Left.Elements[1][0] - Right.Elements[1][0]; - Result.Elements[1][1] = Left.Elements[1][1] - Right.Elements[1][1]; - Result.Elements[1][2] = Left.Elements[1][2] - Right.Elements[1][2]; - Result.Elements[1][3] = Left.Elements[1][3] - Right.Elements[1][3]; - Result.Elements[2][0] = Left.Elements[2][0] - Right.Elements[2][0]; - Result.Elements[2][1] = Left.Elements[2][1] - Right.Elements[2][1]; - Result.Elements[2][2] = Left.Elements[2][2] - Right.Elements[2][2]; - Result.Elements[2][3] = Left.Elements[2][3] - Right.Elements[2][3]; - Result.Elements[3][0] = Left.Elements[3][0] - Right.Elements[3][0]; - Result.Elements[3][1] = Left.Elements[3][1] - Right.Elements[3][1]; - Result.Elements[3][2] = Left.Elements[3][2] - Right.Elements[3][2]; - Result.Elements[3][3] = Left.Elements[3][3] - Right.Elements[3][3]; -#endif + Result.Columns[0] = HMM_SubV4(Left.Columns[0], Right.Columns[0]); + Result.Columns[1] = HMM_SubV4(Left.Columns[1], Right.Columns[1]); + Result.Columns[2] = HMM_SubV4(Left.Columns[2], Right.Columns[2]); + Result.Columns[3] = HMM_SubV4(Left.Columns[3], Right.Columns[3]); return Result; } @@ -1574,7 +1600,8 @@ static inline HMM_Mat4 HMM_MulM4F(HMM_Mat4 Matrix, float Scalar) ASSERT_COVERED(HMM_MulM4F); HMM_Mat4 Result; - + + #ifdef HANDMADE_MATH__USE_SSE __m128 SSEScalar = _mm_set1_ps(Scalar); Result.Columns[0].SSE = _mm_mul_ps(Matrix.Columns[0].SSE, SSEScalar); @@ -1582,22 +1609,13 @@ static inline HMM_Mat4 HMM_MulM4F(HMM_Mat4 Matrix, float Scalar) Result.Columns[2].SSE = _mm_mul_ps(Matrix.Columns[2].SSE, SSEScalar); Result.Columns[3].SSE = _mm_mul_ps(Matrix.Columns[3].SSE, SSEScalar); #else - Result.Elements[0][0] = Matrix.Elements[0][0] * Scalar; - Result.Elements[0][1] = Matrix.Elements[0][1] * Scalar; - Result.Elements[0][2] = Matrix.Elements[0][2] * Scalar; - Result.Elements[0][3] = Matrix.Elements[0][3] * Scalar; - Result.Elements[1][0] = Matrix.Elements[1][0] * Scalar; - Result.Elements[1][1] = Matrix.Elements[1][1] * Scalar; - Result.Elements[1][2] = Matrix.Elements[1][2] * Scalar; - Result.Elements[1][3] = Matrix.Elements[1][3] * Scalar; - Result.Elements[2][0] = Matrix.Elements[2][0] * Scalar; - Result.Elements[2][1] = Matrix.Elements[2][1] * Scalar; - Result.Elements[2][2] = Matrix.Elements[2][2] * Scalar; - Result.Elements[2][3] = Matrix.Elements[2][3] * Scalar; - Result.Elements[3][0] = Matrix.Elements[3][0] * Scalar; - Result.Elements[3][1] = Matrix.Elements[3][1] * Scalar; - Result.Elements[3][2] = Matrix.Elements[3][2] * Scalar; - Result.Elements[3][3] = Matrix.Elements[3][3] * Scalar; + // NOTE(jack) In neon we dont need to do an equivalent to _mm_set1_ps + // so we can fall back on the MulV4F calls without the extra setting + // being done + Result.Columns[0] = HMM_MulV4F(Matrix.Columns[0], Scalar); + Result.Columns[1] = HMM_MulV4F(Matrix.Columns[1], Scalar); + Result.Columns[2] = HMM_MulV4F(Matrix.Columns[2], Scalar); + Result.Columns[3] = HMM_MulV4F(Matrix.Columns[3], Scalar); #endif return Result; @@ -1623,23 +1641,17 @@ static inline HMM_Mat4 HMM_DivM4F(HMM_Mat4 Matrix, float Scalar) Result.Columns[1].SSE = _mm_div_ps(Matrix.Columns[1].SSE, SSEScalar); Result.Columns[2].SSE = _mm_div_ps(Matrix.Columns[2].SSE, SSEScalar); Result.Columns[3].SSE = _mm_div_ps(Matrix.Columns[3].SSE, SSEScalar); +#elif HANDMADE_MATH__USE_NEON + float32x4_t NEONScalar = vdupq_n_f32(Scalar); + Result.Columns[0].NEON = vdivq_f32(Matrix.Columns[0].NEON, NEONScalar); + Result.Columns[1].NEON = vdivq_f32(Matrix.Columns[1].NEON, NEONScalar); + Result.Columns[2].NEON = vdivq_f32(Matrix.Columns[2].NEON, NEONScalar); + Result.Columns[3].NEON = vdivq_f32(Matrix.Columns[3].NEON, NEONScalar); #else - Result.Elements[0][0] = Matrix.Elements[0][0] / Scalar; - Result.Elements[0][1] = Matrix.Elements[0][1] / Scalar; - Result.Elements[0][2] = Matrix.Elements[0][2] / Scalar; - Result.Elements[0][3] = Matrix.Elements[0][3] / Scalar; - Result.Elements[1][0] = Matrix.Elements[1][0] / Scalar; - Result.Elements[1][1] = Matrix.Elements[1][1] / Scalar; - Result.Elements[1][2] = Matrix.Elements[1][2] / Scalar; - Result.Elements[1][3] = Matrix.Elements[1][3] / Scalar; - Result.Elements[2][0] = Matrix.Elements[2][0] / Scalar; - Result.Elements[2][1] = Matrix.Elements[2][1] / Scalar; - Result.Elements[2][2] = Matrix.Elements[2][2] / Scalar; - Result.Elements[2][3] = Matrix.Elements[2][3] / Scalar; - Result.Elements[3][0] = Matrix.Elements[3][0] / Scalar; - Result.Elements[3][1] = Matrix.Elements[3][1] / Scalar; - Result.Elements[3][2] = Matrix.Elements[3][2] / Scalar; - Result.Elements[3][3] = Matrix.Elements[3][3] / Scalar; + Result.Columns[0] = HMM_DivV4F(Matrix.Columns[0], Scalar); + Result.Columns[1] = HMM_DivV4F(Matrix.Columns[1], Scalar); + Result.Columns[2] = HMM_DivV4F(Matrix.Columns[2], Scalar); + Result.Columns[3] = HMM_DivV4F(Matrix.Columns[3], Scalar); #endif return Result; @@ -2065,6 +2077,9 @@ static inline HMM_Quat HMM_Q(float X, float Y, float Z, float W) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_setr_ps(X, Y, Z, W); +#elif HANDMADE_MATH__USE_NEON + float32x4_t v = { X, Y, Z, W }; + Result.NEON = v; #else Result.X = X; Result.Y = Y; @@ -2084,6 +2099,8 @@ static inline HMM_Quat HMM_QV4(HMM_Vec4 Vector) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = Vector.SSE; +#elif HANDMADE_MATH__USE_NEON + Result.NEON = Vector.NEON; #else Result.X = Vector.X; Result.Y = Vector.Y; @@ -2103,6 +2120,8 @@ static inline HMM_Quat HMM_AddQ(HMM_Quat Left, HMM_Quat Right) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_add_ps(Left.SSE, Right.SSE); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vaddq_f32(Left.NEON, Right.NEON); #else Result.X = Left.X + Right.X; @@ -2123,6 +2142,8 @@ static inline HMM_Quat HMM_SubQ(HMM_Quat Left, HMM_Quat Right) #ifdef HANDMADE_MATH__USE_SSE Result.SSE = _mm_sub_ps(Left.SSE, Right.SSE); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vsubq_f32(Left.NEON, Right.NEON); #else Result.X = Left.X - Right.X; Result.Y = Left.Y - Right.Y; @@ -2157,6 +2178,10 @@ static inline HMM_Quat HMM_MulQ(HMM_Quat Left, HMM_Quat Right) SSEResultTwo = _mm_shuffle_ps(Right.SSE, Right.SSE, _MM_SHUFFLE(3, 2, 1, 0)); Result.SSE = _mm_add_ps(SSEResultThree, _mm_mul_ps(SSEResultTwo, SSEResultOne)); #else +# if HANDMADE_MATH__USE_NEON + // TOOD (jack): go look up how quaternion multiplication works and implment it in neon, we don't have xor. +# warning "Vectorized Quaternion Multiply not yet implemented in NEON" +# endif Result.X = Right.Elements[3] * +Left.Elements[0]; Result.Y = Right.Elements[2] * -Left.Elements[0]; Result.Z = Right.Elements[1] * +Left.Elements[0]; @@ -2191,6 +2216,8 @@ static inline HMM_Quat HMM_MulQF(HMM_Quat Left, float Multiplicative) #ifdef HANDMADE_MATH__USE_SSE __m128 Scalar = _mm_set1_ps(Multiplicative); Result.SSE = _mm_mul_ps(Left.SSE, Scalar); +#elif HANDMADE_MATH__USE_NEON + Result.NEON = vmulq_n_f32(Left.NEON, Multiplicative); #else Result.X = Left.X * Multiplicative; Result.Y = Left.Y * Multiplicative; @@ -2211,6 +2238,9 @@ static inline HMM_Quat HMM_DivQF(HMM_Quat Left, float Divnd) #ifdef HANDMADE_MATH__USE_SSE __m128 Scalar = _mm_set1_ps(Divnd); Result.SSE = _mm_div_ps(Left.SSE, Scalar); +#elif HANDMADE_MATH__USE_NEON + float32x4_t Scalar = vdupq_n_f32(Divnd); + Result.NEON = vdivq_f32(Left.NEON, Scalar); #else Result.X = Left.X / Divnd; Result.Y = Left.Y / Divnd; @@ -2235,6 +2265,11 @@ static inline float HMM_DotQ(HMM_Quat Left, HMM_Quat Right) SSEResultTwo = _mm_shuffle_ps(SSEResultOne, SSEResultOne, _MM_SHUFFLE(0, 1, 2, 3)); SSEResultOne = _mm_add_ps(SSEResultOne, SSEResultTwo); _mm_store_ss(&Result, SSEResultOne); +#elif HANDMADE_MATH__USE_NEON + float32x4_t NEONMultiplyResult = vmulq_f32(Left.NEON, Right.NEON); + float32x4_t NEONHalfAdd = vpaddq_f32(NEONMultiplyResult, NEONMultiplyResult); + float32x4_t NEONFullAdd = vpaddq_f32(NEONHalfAdd, NEONHalfAdd); + Result = vgetq_lane_f32(NEONFullAdd, 0); #else Result = ((Left.X * Right.X) + (Left.Z * Right.Z)) + ((Left.Y * Right.Y) + (Left.W * Right.W)); #endif @@ -2278,6 +2313,10 @@ static inline HMM_Quat _HMM_MixQ(HMM_Quat Left, float MixLeft, HMM_Quat Right, f __m128 SSEResultOne = _mm_mul_ps(Left.SSE, ScalarLeft); __m128 SSEResultTwo = _mm_mul_ps(Right.SSE, ScalarRight); Result.SSE = _mm_add_ps(SSEResultOne, SSEResultTwo); +#elif HANDMADE_MATH__USE_NEON + float32x4_t ScaledLeft = vmulq_n_f32(Left.NEON, MixLeft); + float32x4_t ScaledRight = vmulq_n_f32(Right.NEON, MixRight); + Result.NEON = vaddq_f32(ScaledLeft, ScaledRight); #else Result.X = Left.X*MixLeft + Right.X*MixRight; Result.Y = Left.Y*MixLeft + Right.Y*MixRight; @@ -3695,6 +3734,9 @@ static inline HMM_Vec4 operator-(HMM_Vec4 In) HMM_Vec4 Result; #if HANDMADE_MATH__USE_SSE Result.SSE = _mm_xor_ps(In.SSE, _mm_set1_ps(-0.0f)); +#elif HANDMADE_MATH__USE_NEON + float32x4_t Zero = vdupq_n_f32(0.0f); + Result.NEON = vsubq_f32(Zero, In.NEON); #else Result.X = -In.X; Result.Y = -In.Y;