diff --git a/src/video/arm/SDL_sve2_blit_A.c b/src/video/arm/SDL_sve2_blit_A.c
index be029bcc70..606df3b060 100644
--- a/src/video/arm/SDL_sve2_blit_A.c
+++ b/src/video/arm/SDL_sve2_blit_A.c
@@ -51,12 +51,12 @@
 }
 
 #undef sdl_sve_rgb32_blend_to_rgb565_op
-#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx)               \
-    do {                                                                 \
-        svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \
-        sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16,     \
-                                                     sve_target_u16,     \
-                                                     vMask);             \
+#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx)                \
+    do { /* _fast blend: normalises with a plain >> 8 */                  \
+        svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx));  \
+        sve_target_u16 = sdl_sve_chn_blend_with_mask_fast(sve_source_u16, \
+                                                          sve_target_u16, \
+                                                          vMask);         \
     } while (0)
 
 #include "SDL_sve2_swizzle.h"
diff --git a/src/video/arm/SDL_sve2_extension.h b/src/video/arm/SDL_sve2_extension.h
index 2f5a74a12b..3e2327a79c 100644
--- a/src/video/arm/SDL_sve2_extension.h
+++ b/src/video/arm/SDL_sve2_extension.h
@@ -902,7 +902,9 @@ static inline void svst4ub_u16(svbool_t vPredu8,
 /*! \note the Element range of vMask is [0, 0xFF]
  */
 SDL_TARGETING("arch=armv8-a+sve2")
-static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource, svuint16_t vTarget, svuint16_t vMask)
+static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource,
+                                                     svuint16_t vTarget,
+                                                     svuint16_t vMask)
 {
     // vTarget = vSource * vMask + vTarget * (255 - vMask);
     svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask);
@@ -926,6 +926,25 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource, svuint1
     return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8;
 }
 
+/*! \brief Per-lane blend returning (vSource * vMask + vTarget * (255 - vMask)) >> 8.
+ *  \note the Element range of vMask is [0, 0xFF] */
+SDL_TARGETING("arch=armv8-a+sve2")
+static inline svuint16_t sdl_sve_chn_blend_with_mask_fast(svuint16_t vSource,
+                                                          svuint16_t vTarget,
+                                                          svuint16_t vMask)
+{
+    // vTemp0 = vSource * vMask + vTarget * (255 - vMask); fits 16 bits if both inputs are <= 0xFF (TODO confirm)
+    svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask);
+    vTemp0 = svmla_u16_m(svptrue_b16(),
+                         vTemp0,
+                         vTarget,
+                         svsub_u16_m(svptrue_b16(),
+                                     svdup_u16(255),
+                                     vMask));
+
+    return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTemp0 >> 8 (scales by 1/256, not 1/255)
+}
+
 /*! \note the hwOpacity range [0, 0x100]
  */
 SDL_TARGETING("arch=armv8-a+sve2")