diff --git a/CMakeLists.txt b/CMakeLists.txt index 626ddb4051..6f2572814a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -898,7 +898,15 @@ if(SDL_ASSEMBLY) cmake_pop_check_state() if(COMPILER_SUPPORTS_LSX AND HAVE_LSXINTRIN_H) - set_property(SOURCE "${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c" APPEND PROPERTY COMPILE_OPTIONS "-mlsx") + set_property(SOURCE + "${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c" + "${SDL3_SOURCE_DIR}/src/video/SDL_blit_A.c" + APPEND PROPERTY COMPILE_OPTIONS "-mlsx") + + set_property(SOURCE + "${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c" + "${SDL3_SOURCE_DIR}/src/video/SDL_blit_A.c" + PROPERTY SKIP_PRECOMPILE_HEADERS 1) set(HAVE_LSX TRUE) endif() endif() diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h index bac6d7ad4f..c7338ee8f5 100644 --- a/include/SDL3/SDL_intrin.h +++ b/include/SDL3/SDL_intrin.h @@ -280,12 +280,14 @@ _m_prefetch(void *__P) * \sa SDL_TARGETING */ #define SDL_HAS_TARGET_ATTRIBS - +#elif defined(__loongarch64) && defined(__GNUC__) && (__GNUC__ >= 15) +/* LoongArch requires GCC 15+ for target attribute support */ +# define SDL_HAS_TARGET_ATTRIBS #elif defined(__clang__) && defined(__has_attribute) # if __has_attribute(target) # define SDL_HAS_TARGET_ATTRIBS # endif -#elif defined(__GNUC__) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9 */ +#elif defined(__GNUC__) && !defined(__loongarch64) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9; LoongArch is excluded here because it has its own GCC 15+ branch */ # define SDL_HAS_TARGET_ATTRIBS #elif defined(__ICC) && __ICC >= 1600 # define SDL_HAS_TARGET_ATTRIBS diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index 1cefd45603..5e21b51e37 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -242,6 +242,103 @@ static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *inf #endif +#ifdef SDL_LSX_INTRINSICS +/* LSX pixel-alpha blit between 32-bit formats: blends src over dst four pixels per iteration, shuffling the src R/G/B bytes into the dst byte positions; pixels left over at the end of a row go through ALPHA_BLEND_SWIZZLE_8888. */ +static void SDL_TARGETING("lsx") Blit8888to8888PixelAlphaSwizzleLSX(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = 
info->dst_h; + Uint8 *src = info->src; + int srcskip = info->src_skip; + Uint8 *dst = info->dst; + int dstskip = info->dst_skip; + const SDL_PixelFormatDetails *srcfmt = info->src_fmt; + const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; + bool fill_alpha = !dstfmt->Amask; /* dst format has no alpha mask: dstAmask is OR'd into every written pixel */ + Uint32 dstAmask, dstAshift; + const Uint8 offsets[] = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + + SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift); /* outputs land in dstAmask and dstAshift; only dstAmask is used below */ + + const __m128i const_0xff00 = __lsx_vreplgr2vr_h(0xff00); /* 0xff in the high byte of each 16-bit lane */ + const __m128i const_128 = __lsx_vreplgr2vr_b((Uint8)128); + const __m128i const_32641 = __lsx_vreplgr2vr_h(32641); /* 1 + 128 * 255 */ + const __m128i const_257 = __lsx_vreplgr2vr_h(257); /* vmuh by 257 keeps the high 16 bits of x * 257 */ + + // The byte offsets for the start of each pixel + const __m128i mask_offsets = __lsx_vld(offsets, 0); + + const __m128i convert_mask = __lsx_vadd_w( + __lsx_vreplgr2vr_w( + ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | + ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | + ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), + mask_offsets); /* shuffle indices placing each src R, G, B byte at its dst byte position */ + + const __m128i alpha_splat_mask = __lsx_vadd_b(__lsx_vreplgr2vr_b(srcfmt->Ashift >> 3), mask_offsets); /* shuffle indices repeating each pixel's alpha byte four times */ + const __m128i alpha_fill_mask = __lsx_vreplgr2vr_w((int)dstAmask); /* dstAmask in every 32-bit lane */ + + while (height--) { + int i = 0; + + for (; i + 4 <= width; i += 4) { /* four pixels (16 bytes) per pass */ + __m128i src128 = __lsx_vld(src, 0); + __m128i dst128 = __lsx_vld(dst, 0); + + __m128i srcA = __lsx_vshuf_b(src128, src128, alpha_splat_mask); /* alpha of each pixel splatted over its four bytes */ + src128 = __lsx_vshuf_b(src128, src128, convert_mask); /* src bytes reordered for the dst format */ + + src128 = __lsx_vor_v(src128, alpha_fill_mask); /* OR dstAmask into the converted src pixels */ + + __m128i srca_lo = __lsx_vilvl_b(srcA, srcA); /* each alpha byte doubled into a 16-bit lane */ + __m128i srca_hi = __lsx_vilvh_b(srcA, srcA); + + srca_lo = __lsx_vxor_v(srca_lo, const_0xff00); /* high byte of each lane becomes 255 - alpha */ + srca_hi = __lsx_vxor_v(srca_hi, const_0xff00); + + src128 = __lsx_vsub_b(src128, const_128); /* subtract 128 so the bytes suit the signed operand of the bu_b multiplies */ + dst128 = __lsx_vsub_b(dst128, const_128); + + __m128i tmp = __lsx_vilvl_b(dst128, src128); /* src bytes in even positions, dst bytes in odd positions */ + __m128i dst_lo = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_lo, tmp), __lsx_vmulwod_h_bu_b(srca_lo, tmp)); /* alpha * (src - 128) + (255 - alpha) * (dst - 128) per channel */ + tmp = __lsx_vilvh_b(dst128, src128); + 
__m128i dst_hi = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_hi, tmp), __lsx_vmulwod_h_bu_b(srca_hi, tmp)); + + dst_lo = __lsx_vadd_h(dst_lo, const_32641); /* add back 128 * 255 from the bias, plus 1 */ + dst_hi = __lsx_vadd_h(dst_hi, const_32641); + + dst_lo = __lsx_vmuh_hu(dst_lo, const_257); /* (x * 257) >> 16 */ + dst_hi = __lsx_vmuh_hu(dst_hi, const_257); + + dst128 = __lsx_vssrarni_bu_h(dst_hi, dst_lo, 0); /* narrow the 16-bit lanes of dst_lo and dst_hi back to 16 bytes */ + if (fill_alpha) { + dst128 = __lsx_vor_v(dst128, alpha_fill_mask); + } + __lsx_vst(dst128, dst, 0); /* write four blended pixels */ + + src += 16; + dst += 16; + } + + for (; i < width; ++i) { /* pixels left over after the four-wide loop */ + Uint32 src32 = *(Uint32 *)src; + Uint32 dst32 = *(Uint32 *)dst; + ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); + if (fill_alpha) { + dst32 |= dstAmask; + } + *(Uint32 *)dst = dst32; + src += 4; + dst += 4; + } + + src += srcskip; /* apply the per-row skip before the next row */ + dst += dstskip; + } +} + +#endif + // fast RGB888->(A)RGB888 blending with surface alpha=128 special case static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) { @@ -1402,6 +1499,11 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) return Blit8888to8888PixelAlphaSwizzleSSE41; } #endif +#ifdef SDL_LSX_INTRINSICS + if (SDL_HasLSX()) { /* return the LSX blitter when SDL_HasLSX() is true */ + return Blit8888to8888PixelAlphaSwizzleLSX; + } +#endif #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) // To prevent "unused function" compiler warnings/errors (void)Blit8888to8888PixelAlpha;