From f2dba2626ec2c02d5d37ccdbc7b1f03293321d61 Mon Sep 17 00:00:00 2001 From: Gabriel Wang Date: Tue, 26 May 2026 22:57:44 +0800 Subject: [PATCH] SVE2: Improves SVE2 8888 swizzling performance and important fixes (#15662) * SVE2 was actually disabled in fdfbbce, this issue is fixed - The macro __ARM_FEATURE_SVE is only defined when the compilation target is set as -march=armv8-m+sve2 * Improves 8888 alpha-blending performance - Now, in In-Order AArch64 processors, e.g. A520, SVE2 is better than NEON with the 128bit vector width - For Out-of-order processors, NEON is still better than SVE2 (We could improve this in the future), the performance is improved from 3.0 to 3.6. * The 8888 -> RGB565 performance is also improved (from 7.4 to 9.3) --- CMakeLists.txt | 5 +++ include/SDL3/SDL_intrin.h | 23 +++++++++----- src/video/arm/SDL_sve2_extension.h | 50 +++++++++++++++++++++--------- src/video/arm/SDL_sve2_swizzle.h | 6 ++++ 4 files changed, 62 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e777e1537e..26ac91a465 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -934,6 +934,10 @@ if(SDL_ASSEMBLY) return 0; }]==] COMPILER_SUPPORTS_ARMSVE2) if(COMPILER_SUPPORTS_ARMSVE2) + # IMPORTANT: As not all AArch64 processors support SVE2, we only + # attach the following compilation option to SVE + # dedicated source files. + set(SVE2_MARCH_FLAG "-march=armv8-a+sve2") set(HAVE_ARMSVE2 TRUE) endif() cmake_pop_check_state() @@ -947,6 +951,7 @@ if(SDL_ASSEMBLY) "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c" "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c" PROPERTIES + COMPILE_FLAGS "${SVE2_MARCH_FLAG}" SKIP_PRECOMPILE_HEADERS ON ) endif() diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h index 289a356b7c..449eb44555 100644 --- a/include/SDL3/SDL_intrin.h +++ b/include/SDL3/SDL_intrin.h @@ -88,8 +88,12 @@ /** * Defined if (and only if) the compiler supports ARM SVE2 intrinsics. * - * If this macro is defined, SDL will have already included `` as - * appropriate. + * If this macro is defined, `` (providing SVE intrinsics) will + * only be included if the target architecture supports SVE + * (`__ARM_FEATURE_SVE` feature macro). + * Some toolchains do not support `SDL_TARGETING("arch=armv8-a+sve2")`, so + * for best portability you need to write all SVE code in a separate + * translation unit and add appropriate compile flags. * * \since This macro is available since SDL 3.6.0. */ @@ -247,9 +251,11 @@ _m_prefetch(void *__P) # define SDL_NEON_INTRINSICS 1 # include #endif -#if defined(__ARM_FEATURE_SVE2) && !defined(SDL_DISABLE_SVE2) +#if !defined(SDL_DISABLE_SVE2) # define SDL_SVE2_INTRINSICS 1 -# include +# if defined(__ARM_FEATURE_SVE) +# include +# endif #endif #else @@ -284,16 +290,19 @@ _m_prefetch(void *__P) /* Visual Studio doesn't define __ARM_ARCH, but _M_ARM (if set, always 7), and _M_ARM64 (if set, always 1). */ # if defined (_M_ARM64) && 0 /* Please only remove this 0 when MSVC releasing support for SVE2 officially. */ # define SDL_SVE2_INTRINSICS 1 -# include # define __ARM_FEATURE_SVE2 1 /* Set __ARM_FEATURE_SVE2 so that it can be used elsewhere, at compile time */ +# define __ARM_FEATURE_SVE 1 /* Set __ARM_FEATURE_SVE so that it can be used elsewhere, at compile time */ # define __ARM_ARCH 8 +# include # endif # elif defined(SDL_PLATFORM_APPLE) /* Apple has no AArch64 device supporting SVE2 */ # elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) && \ - defined(__has_include) && __has_include() && defined(__ARM_FEATURE_SVE) + defined(__has_include) && __has_include() # define SDL_SVE2_INTRINSICS 1 -# include +# if defined(__ARM_FEATURE_SVE) +# include +# endif # endif #endif #endif /* compiler version */ diff --git a/src/video/arm/SDL_sve2_extension.h b/src/video/arm/SDL_sve2_extension.h index b9db084bba..c18dace028 100644 --- a/src/video/arm/SDL_sve2_extension.h +++ b/src/video/arm/SDL_sve2_extension.h @@ -19,10 +19,33 @@ 3. This notice may not be removed or altered from any source distribution. */ +/* + * IMPORTANT: Please do NOT include this header file directly or indirectly + * outside the src/video/arm folder. + * + */ + #if !defined(SDL_SVE2_EXTENSION_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2) #define SDL_SVE2_EXTENSION_H #include "SDL_sve2_util.h" + +/* + * NOTE: Some Android builds didn't attach '-march=armv8-a+sve2' to + * SDL_sve2_*.c and hence the macro __ARM_FEATURE_SVE is not + * defined by the compiler. This might not be a problem as the + * SDL_TARGETING("arch=armv8-a+sve2") enables the feature for + * individual functions, until some version of compilers + * provides arm_sve.h raising errors then __ARM_FEATURE_SVE + * is not defined. Although it should be avoided, as a + * workaround, we have to define the __ARM_FEATURE_SVE here as + * an ugly hack. + */ +#ifdef SDL_PLATFORM_ANDROID +#ifndef __ARM_FEATURE_SVE +#define __ARM_FEATURE_SVE 1 +#endif +#endif #include #include @@ -907,7 +930,8 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource, svuint16_t vMask) { // vTarget = vSource * vMask + vTarget * (255 - vMask); - svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp0 = svdup_u16(1); + vTemp0 = svmla_u16_m(svptrue_b16(), vTemp0, vSource, vMask); vTemp0 = svmla_u16_m(svptrue_b16(), vTemp0, vTarget, @@ -915,17 +939,13 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource, svdup_u16(255), vMask)); - vTemp0 = svadd_n_u16_m(svptrue_b16(), vTemp0, 1); - - svuint16_t vTemp1 = svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); /* x += x >> 8 */ - vTemp0 = svadd_u16_m(svptrue_b16(), - vTemp0, - vTemp1); - - return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8; + return svreinterpret_u16_u8( + svaddhnb_u16(vTemp0, + svlsr_n_u16_m(svptrue_b16(), + vTemp0, + 8))); } - /*! \note the Element range of vMask is [0, 0xFF] */ SDL_TARGETING("arch=armv8-a+sve2") @@ -968,15 +988,15 @@ static inline svuint16_t sdl_sve_chn_blend_with_opacity(svuint16_t vSource, */ SDL_TARGETING("arch=armv8-a+sve2") static inline svuint16_t sdl_sve_chn_blend_with_opacity_fast(svuint16_t vSource, - svuint16_t vTarget, - uint16_t hwOpacity) + svuint16_t vTarget, + uint16_t hwOpacity) { // vTarget = vSource * vMask + vTarget * (255 - vMask); svuint16_t vTemp0 = svmul_n_u16_m(svptrue_b16(), vSource, hwOpacity); vTemp0 = svmla_n_u16_m(svptrue_b16(), - vTemp0, - vTarget, - 256 - hwOpacity); + vTemp0, + vTarget, + 256 - hwOpacity); return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8; } diff --git a/src/video/arm/SDL_sve2_swizzle.h b/src/video/arm/SDL_sve2_swizzle.h index a2d6f978d2..032e9d6796 100644 --- a/src/video/arm/SDL_sve2_swizzle.h +++ b/src/video/arm/SDL_sve2_swizzle.h @@ -19,6 +19,12 @@ 3. This notice may not be removed or altered from any source distribution. */ +/* + * IMPORTANT: Please do NOT include this header file directly or indirectly + * outside the src/video/arm folder. + * + */ + #if !defined(SD_SVE2_SWIZZLE_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2) #define SD_SVE2_SWIZZLE_H