mirror of
https://github.com/libsdl-org/SDL.git
synced 2026-05-28 07:45:22 +00:00
SVE2: Improves SVE2 8888 swizzling performance and important fixes (#15662)
* SVE2 was actually disabled in fdfbbce, this issue is fixed
- The macro __ARM_FEATURE_SVE is only defined when the compilation target is set as -march=armv8-m+sve2
* Improves 8888 alpha-blending performance
- Now, in In-Order AArch64 processors, e.g. A520, SVE2 is better than NEON with the 128bit vector width
- For Out-of-order processors, NEON is still better than SVE2 (We could improve this in the future), the performance is improved from 3.0 to 3.6.
* The 8888 -> RGB565 performance is also improved (from 7.4 to 9.3)
This commit is contained in:
@@ -934,6 +934,10 @@ if(SDL_ASSEMBLY)
|
||||
return 0;
|
||||
}]==] COMPILER_SUPPORTS_ARMSVE2)
|
||||
if(COMPILER_SUPPORTS_ARMSVE2)
|
||||
# IMPORTANT: As not all AArch64 processors support SVE2, we only
|
||||
# attach the following compilation option to SVE
|
||||
# dedicated source files.
|
||||
set(SVE2_MARCH_FLAG "-march=armv8-a+sve2")
|
||||
set(HAVE_ARMSVE2 TRUE)
|
||||
endif()
|
||||
cmake_pop_check_state()
|
||||
@@ -947,6 +951,7 @@ if(SDL_ASSEMBLY)
|
||||
"${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c"
|
||||
"${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c"
|
||||
PROPERTIES
|
||||
COMPILE_FLAGS "${SVE2_MARCH_FLAG}"
|
||||
SKIP_PRECOMPILE_HEADERS ON
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -88,8 +88,12 @@
|
||||
/**
|
||||
* Defined if (and only if) the compiler supports ARM SVE2 intrinsics.
|
||||
*
|
||||
* If this macro is defined, SDL will have already included `<arm_sve.h>` as
|
||||
* appropriate.
|
||||
* If this macro is defined, `<arm_sve.h>` (providing SVE intrinsics) will
|
||||
* only be included if the target architecture supports SVE
|
||||
* (`__ARM_FEATURE_SVE` feature macro).
|
||||
* Some toolchains do not support `SDL_TARGETING("arch=armv8-a+sve2")`, so
|
||||
* for best portability you need to write all SVE code in a separate
|
||||
* translation unit and add appropriate compile flags.
|
||||
*
|
||||
* \since This macro is available since SDL 3.6.0.
|
||||
*/
|
||||
@@ -247,9 +251,11 @@ _m_prefetch(void *__P)
|
||||
# define SDL_NEON_INTRINSICS 1
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_SVE2) && !defined(SDL_DISABLE_SVE2)
|
||||
#if !defined(SDL_DISABLE_SVE2)
|
||||
# define SDL_SVE2_INTRINSICS 1
|
||||
# include <arm_sve.h>
|
||||
# if defined(__ARM_FEATURE_SVE)
|
||||
# include <arm_sve.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#else
|
||||
@@ -284,16 +290,19 @@ _m_prefetch(void *__P)
|
||||
/* Visual Studio doesn't define __ARM_ARCH, but _M_ARM (if set, always 7), and _M_ARM64 (if set, always 1). */
|
||||
# if defined (_M_ARM64) && 0 /* Please only remove this 0 when MSVC releasing support for SVE2 officially. */
|
||||
# define SDL_SVE2_INTRINSICS 1
|
||||
# include <arm_sve.h>
|
||||
# define __ARM_FEATURE_SVE2 1 /* Set __ARM_FEATURE_SVE2 so that it can be used elsewhere, at compile time */
|
||||
# define __ARM_FEATURE_SVE 1 /* Set __ARM_FEATURE_SVE so that it can be used elsewhere, at compile time */
|
||||
# define __ARM_ARCH 8
|
||||
# include <arm_sve.h>
|
||||
# endif
|
||||
# elif defined(SDL_PLATFORM_APPLE)
|
||||
/* Apple has no AArch64 device supporting SVE2 */
|
||||
# elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) && \
|
||||
defined(__has_include) && __has_include(<arm_sve.h>) && defined(__ARM_FEATURE_SVE)
|
||||
defined(__has_include) && __has_include(<arm_sve.h>)
|
||||
# define SDL_SVE2_INTRINSICS 1
|
||||
# include <arm_sve.h>
|
||||
# if defined(__ARM_FEATURE_SVE)
|
||||
# include <arm_sve.h>
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
#endif /* compiler version */
|
||||
|
||||
@@ -19,10 +19,33 @@
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/*
|
||||
* IMPORTANT: Please do NOT include this header file directly or indirectly
|
||||
* outside the src/video/arm folder.
|
||||
*
|
||||
*/
|
||||
|
||||
#if !defined(SDL_SVE2_EXTENSION_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2)
|
||||
#define SDL_SVE2_EXTENSION_H
|
||||
|
||||
#include "SDL_sve2_util.h"
|
||||
|
||||
/*
|
||||
* NOTE: Some Android builds didn't attach '-march=armv8-a+sve2' to
|
||||
* SDL_sve2_*.c and hence the macro __ARM_FEATURE_SVE is not
|
||||
* defined by the compiler. This might not be a problem as the
|
||||
* SDL_TARGETING("arch=armv8-a+sve2") enables the feature for
|
||||
* individual functions, until some version of compilers
|
||||
* provides arm_sve.h raising errors then __ARM_FEATURE_SVE
|
||||
* is not defined. Although it should be avoided, as a
|
||||
* workaround, we have to define the __ARM_FEATURE_SVE here as
|
||||
* an ugly hack.
|
||||
*/
|
||||
#ifdef SDL_PLATFORM_ANDROID
|
||||
#ifndef __ARM_FEATURE_SVE
|
||||
#define __ARM_FEATURE_SVE 1
|
||||
#endif
|
||||
#endif
|
||||
#include <arm_sve.h>
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -907,7 +930,8 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource,
|
||||
svuint16_t vMask)
|
||||
{
|
||||
// vTarget = vSource * vMask + vTarget * (255 - vMask);
|
||||
svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask);
|
||||
svuint16_t vTemp0 = svdup_u16(1);
|
||||
vTemp0 = svmla_u16_m(svptrue_b16(), vTemp0, vSource, vMask);
|
||||
vTemp0 = svmla_u16_m(svptrue_b16(),
|
||||
vTemp0,
|
||||
vTarget,
|
||||
@@ -915,17 +939,13 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource,
|
||||
svdup_u16(255),
|
||||
vMask));
|
||||
|
||||
vTemp0 = svadd_n_u16_m(svptrue_b16(), vTemp0, 1);
|
||||
|
||||
svuint16_t vTemp1 = svlsr_n_u16_m(svptrue_b16(), vTemp0, 8);
|
||||
/* x += x >> 8 */
|
||||
vTemp0 = svadd_u16_m(svptrue_b16(),
|
||||
vTemp0,
|
||||
vTemp1);
|
||||
|
||||
return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8;
|
||||
return svreinterpret_u16_u8(
|
||||
svaddhnb_u16(vTemp0,
|
||||
svlsr_n_u16_m(svptrue_b16(),
|
||||
vTemp0,
|
||||
8)));
|
||||
}
|
||||
|
||||
/*! \note the Element range of vMask is [0, 0xFF]
|
||||
*/
|
||||
SDL_TARGETING("arch=armv8-a+sve2")
|
||||
@@ -968,15 +988,15 @@ static inline svuint16_t sdl_sve_chn_blend_with_opacity(svuint16_t vSource,
|
||||
*/
|
||||
SDL_TARGETING("arch=armv8-a+sve2")
|
||||
static inline svuint16_t sdl_sve_chn_blend_with_opacity_fast(svuint16_t vSource,
|
||||
svuint16_t vTarget,
|
||||
uint16_t hwOpacity)
|
||||
svuint16_t vTarget,
|
||||
uint16_t hwOpacity)
|
||||
{
|
||||
// vTarget = vSource * vMask + vTarget * (255 - vMask);
|
||||
svuint16_t vTemp0 = svmul_n_u16_m(svptrue_b16(), vSource, hwOpacity);
|
||||
vTemp0 = svmla_n_u16_m(svptrue_b16(),
|
||||
vTemp0,
|
||||
vTarget,
|
||||
256 - hwOpacity);
|
||||
vTemp0,
|
||||
vTarget,
|
||||
256 - hwOpacity);
|
||||
|
||||
return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8;
|
||||
}
|
||||
|
||||
@@ -19,6 +19,12 @@
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/*
|
||||
* IMPORTANT: Please do NOT include this header file directly or indirectly
|
||||
* outside the src/video/arm folder.
|
||||
*
|
||||
*/
|
||||
|
||||
#if !defined(SD_SVE2_SWIZZLE_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2)
|
||||
#define SD_SVE2_SWIZZLE_H
|
||||
|
||||
|
||||
Reference in New Issue
Block a user