diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 60e48e71d9..1057493e63 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -1570,6 +1570,11 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
                         if (SDL_HasMMX()) {
                             return Blit565to565SurfaceAlphaMMX;
                         } else
+#endif
+#ifdef SDL_SVE2_INTRINSICS
+                        if (SDL_HasSVE2()) {
+                            return Blit565to565SurfaceAlphaSVE2;
+                        } else
 #endif
                         {
                             return Blit565to565SurfaceAlpha;
diff --git a/src/video/arm/SDL_sve2_blit_A.c b/src/video/arm/SDL_sve2_blit_A.c
index 606df3b060..ef4dd5fff5 100644
--- a/src/video/arm/SDL_sve2_blit_A.c
+++ b/src/video/arm/SDL_sve2_blit_A.c
@@ -86,4 +86,127 @@ size_t SDL_GetSVEVectorSize(void)
     return svlen(svundef_u8()) * 8;
 }
 
-#endif /* SDL_SVE2_INTRINSICS */
\ No newline at end of file
+/*-----------------------------------------------------------------------------*
+ * RGB565 Blend with Surface Alpha                                             *
+ *-----------------------------------------------------------------------------*/
+/*! \brief blend one row of RGB565 pixels onto the target with a constant opacity
+ *  \param phwSource first source pixel of the row (only read)
+ *  \param phwTarget first target pixel of the row (blended in place)
+ *  \param uStride   number of pixels in the row
+ *  \param hwOpacity weight of the source pixels, range [0, 0x100]
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+ARM_NONNULL(1, 2)
+static inline void sdl_sve_rgb565_stride_blend_with_opacity(const uint16_t *SDL_RESTRICT phwSource,
+                                                            uint16_t *SDL_RESTRICT phwTarget,
+                                                            size_t uStride,
+                                                            uint16_t hwOpacity)
+{
+    sdl_sve_stride_loop_rgb16(uStride, vTailPred)
+    {
+        // unpack both pixel vectors into separate channel vectors
+        svuint16x3_t vSource16x3 =
+            sdl_sve_rgb565_unpack(svld1_u16(vTailPred, phwSource));
+
+        svuint16x3_t vTarget16x3 =
+            sdl_sve_rgb565_unpack(svld1_u16(vTailPred, phwTarget));
+
+        sdl_sve_pixel_ccc_foreach_chn(
+            vSource16x3,
+            vTarget16x3,
+            {
+                sve_target_u16 = sdl_sve_chn_blend_with_opacity_fast(
+                    sve_source_u16,
+                    sve_target_u16,
+                    hwOpacity);
+            });
+
+        svst1_u16(vTailPred, phwTarget, sdl_sve_rgb565_pack(vTarget16x3));
+
+        phwSource += sve_iteration_advance;
+        phwTarget += sve_iteration_advance;
+    }
+}
+
+/*! \brief blend an RGB565 region onto the target with a constant opacity
+ *  \param pchSource     first source row, must be 2-byte aligned
+ *  \param uSourceStride distance in bytes between two source rows
+ *  \param pchTarget     first target row, must be 2-byte aligned
+ *  \param uTargetStride distance in bytes between two target rows
+ *  \param nWidth        pixels per row, nothing is drawn when <= 0
+ *  \param nHeight       number of rows, nothing is drawn when <= 0
+ *  \param hwOpacity     surface alpha, range [0, 255]
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+ARM_NONNULL(1, 3)
+static inline void sdl_sve_rgb565_blend_with_opacity(const uint8_t *SDL_RESTRICT pchSource,
+                                                     size_t uSourceStride,
+                                                     uint8_t *SDL_RESTRICT pchTarget,
+                                                     size_t uTargetStride,
+                                                     int nWidth,
+                                                     int nHeight,
+                                                     uint16_t hwOpacity)
+{
+    assert(0 == ((uintptr_t)pchSource & 0x01));
+    assert(0 == ((uintptr_t)pchTarget & 0x01));
+
+    // a negative width would wrap to a huge size_t pixel count and a negative
+    // row count must never enter the loop, so reject both up front
+    if (nWidth <= 0 || nHeight <= 0) {
+        return;
+    }
+
+    // map 255 to 256 so that a fully opaque alpha returns exactly the source
+    // after the ">> 8" in sdl_sve_chn_blend_with_opacity_fast()
+    hwOpacity += hwOpacity == 255;
+
+    for (int nRow = 0; nRow < nHeight; nRow++) {
+        sdl_sve_rgb565_stride_blend_with_opacity((const uint16_t *)pchSource,
+                                                 (uint16_t *)pchTarget,
+                                                 (size_t)nWidth,
+                                                 hwOpacity);
+
+        pchSource += uSourceStride;
+        pchTarget += uTargetStride;
+    }
+}
+
+/*! \brief fast RGB565->RGB565 blending with surface alpha
+ *  \param info blit description, both formats must use 2 bytes per pixel
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+void Blit565to565SurfaceAlphaSVE2(SDL_BlitInfo *info)
+{
+    uint16_t alpha = info->a;
+
+    int width = info->dst_w;
+    int height = info->dst_h;
+    uint8_t *src = info->src;
+    int srcskip = info->src_skip;
+    uint8_t *dst = info->dst;
+    int dstskip = info->dst_skip;
+
+    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
+    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
+
+    // Set up some basic variables
+    int srcbpp = srcfmt->bytes_per_pixel;
+    int dstbpp = dstfmt->bytes_per_pixel;
+
+    assert(srcbpp == 2);
+    assert(dstbpp == 2);
+
+    // row pitch in bytes = per-row skip + pixel data of one row
+    int srcstride = srcskip + srcbpp * width;
+    int dststride = dstskip + dstbpp * width;
+
+    sdl_sve_rgb565_blend_with_opacity(src,
+                                      srcstride,
+                                      dst,
+                                      dststride,
+                                      width,
+                                      height,
+                                      alpha);
+}
+
+#endif /* SDL_SVE2_INTRINSICS */
diff --git a/src/video/arm/SDL_sve2_blit_A.h b/src/video/arm/SDL_sve2_blit_A.h
index 2a7e2b8149..2a86295566 100644
--- a/src/video/arm/SDL_sve2_blit_A.h
+++ b/src/video/arm/SDL_sve2_blit_A.h
@@ -30,6 +30,8 @@ void Blit8888to8888PixelAlphaSwizzleSVE2(SDL_BlitInfo *info);
 
 void Blit8888to565PixelAlphaSwizzleSVE2(SDL_BlitInfo *info);
 
+void Blit565to565SurfaceAlphaSVE2(SDL_BlitInfo *info);
+
 size_t SDL_GetSVEVectorSize(void);
 
 #endif /* SDL_SVE2_INTRINSICS */
diff --git a/src/video/arm/SDL_sve2_extension.h b/src/video/arm/SDL_sve2_extension.h
index 3e2327a79c..b9db084bba 100644
--- a/src/video/arm/SDL_sve2_extension.h
+++ b/src/video/arm/SDL_sve2_extension.h
@@ -964,6 +964,26 @@ static inline svuint16_t sdl_sve_chn_blend_with_opacity(svuint16_t vSource,
     return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8;
 }
 
+/*! \brief blend a source channel onto a target channel with a constant opacity
+ *  \note the hwOpacity range [0, 0x100]
+ *  \note every element of vSource and vTarget must be <= 0xFF, otherwise the
+ *        16-bit sum below wraps around
+ *  \return (vSource * hwOpacity + vTarget * (0x100 - hwOpacity)) >> 8
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+static inline svuint16_t sdl_sve_chn_blend_with_opacity_fast(svuint16_t vSource,
+                                                             svuint16_t vTarget,
+                                                             uint16_t hwOpacity)
+{
+    // vTemp0 = vSource * hwOpacity + vTarget * (256 - hwOpacity);
+    svuint16_t vTemp0 = svmul_n_u16_m(svptrue_b16(), vSource, hwOpacity);
+    vTemp0 = svmla_n_u16_m(svptrue_b16(),
+                           vTemp0,
+                           vTarget,
+                           256 - hwOpacity);
+
+    return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTemp0 >> 8;
+}
+
 /*! \note the Element range of vMask is [0, 0xFF]
  *  \note the hwOpacity range [0, 0x100]
  */