diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c index e8f5b71e76..8b9a6137e9 100644 --- a/src/video/SDL_blit_N.c +++ b/src/video/SDL_blit_N.c @@ -38,7 +38,7 @@ // Functions to blit from N-bit surfaces to other surfaces #define BLIT_FEATURE_NONE 0x00 -#define BLIT_FEATURE_HAS_MMX 0x01 +#define BLIT_FEATURE_HAS_SSE41 0x01 #define BLIT_FEATURE_HAS_ALTIVEC 0x02 #define BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH 0x04 @@ -875,8 +875,8 @@ static Uint32 GetBlitFeatures(void) static Uint32 features = ~0u; if (features == ~0u) { features = (0 - // Feature 1 is has-MMX - | ((SDL_HasMMX()) ? BLIT_FEATURE_HAS_MMX : 0) + // Feature 1 is has-SSE41 + | ((SDL_HasSSE41()) ? BLIT_FEATURE_HAS_SSE41 : 0) // Feature 2 is has-AltiVec | ((SDL_HasAltiVec()) ? BLIT_FEATURE_HAS_ALTIVEC : 0) // Feature 4 is dont-use-prefetch @@ -890,8 +890,8 @@ static Uint32 GetBlitFeatures(void) #pragma altivec_model off #endif #else -// Feature 1 is has-MMX -#define GetBlitFeatures() ((SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0)) +// Feature 1 is has-SSE41 +#define GetBlitFeatures() ((SDL_HasSSE41() ? BLIT_FEATURE_HAS_SSE41 : 0)) #endif // This is now endian dependent @@ -1163,6 +1163,109 @@ static void Blit_XRGB8888_RGB565(SDL_BlitInfo *info) #endif // USE_DUFFS_LOOP } +#ifdef SDL_SSE4_1_INTRINSICS + +static void SDL_TARGETING("sse4.1") Blit_RGB565_32_SSE41(SDL_BlitInfo *info, int Rshift, int Gshift, int Bshift, int Amask) +{ + int c; + int width, height; + const Uint16 *src; + Uint32 *dst; + int srcskip, dstskip; + Uint8 r, g, b; + + // Set up some basic variables + width = info->dst_w; + height = info->dst_h; + src = (const Uint16 *)info->src; + srcskip = info->src_skip / 2; + dst = (Uint32 *)info->dst; + dstskip = info->dst_skip / 4; + + while (height--) { + // Copy in 4 pixel chunks + for (c = width / 4; c; --c) { + // Load 4 16-bit RGB565 pixels into an SSE register + __m128i pixels_rgb565 = _mm_loadu_si128((__m128i*)src); + + // Extract Red components (5 bits) + __m128i red_5bit = _mm_and_si128(pixels_rgb565, _mm_set1_epi16(0xF800)); // Mask for Red + red_5bit = _mm_srli_epi16(red_5bit, 11); // Shift to get 5-bit value + __m128i red_8bit = _mm_cvtepu16_epi32(red_5bit); // Convert to 32-bit and zero-extend + red_8bit = _mm_slli_epi32(red_8bit, 3); // Scale to 8 bits (multiply by 8) + red_8bit = _mm_or_si128(red_8bit, _mm_srli_epi32(red_8bit, 5)); // Replicate top 3 bits for better scaling + + // Extract Green components (6 bits) + __m128i green_6bit = _mm_and_si128(pixels_rgb565, _mm_set1_epi16(0x07E0)); // Mask for Green + green_6bit = _mm_srli_epi16(green_6bit, 5); // Shift to get 6-bit value + __m128i green_8bit = _mm_cvtepu16_epi32(green_6bit); // Convert to 32-bit and zero-extend + green_8bit = _mm_slli_epi32(green_8bit, 2); // Scale to 8 bits (multiply by 4) + green_8bit = _mm_or_si128(green_8bit, _mm_srli_epi32(green_8bit, 6)); // Replicate top 2 bits + + // Extract Blue components (5 bits) + __m128i blue_5bit = _mm_and_si128(pixels_rgb565, _mm_set1_epi16(0x001F)); // Mask for Blue + __m128i blue_8bit = _mm_cvtepu16_epi32(blue_5bit); // Convert to 32-bit and zero-extend + blue_8bit = _mm_slli_epi32(blue_8bit, 3); // Scale to 8 bits (multiply by 8) + blue_8bit = _mm_or_si128(blue_8bit, _mm_srli_epi32(blue_8bit, 5)); // Replicate top 3 bits + + // Set Alpha to opaque (0xFF) + __m128i alpha_8bit = _mm_set1_epi32(Amask); + + // Combine into 32-bit values + __m128i argb_pixels_low = _mm_or_si128(alpha_8bit, _mm_slli_epi32(red_8bit, Rshift)); + argb_pixels_low = _mm_or_si128(argb_pixels_low, _mm_slli_epi32(green_8bit, Gshift)); + argb_pixels_low = _mm_or_si128(argb_pixels_low, _mm_slli_epi32(blue_8bit, Bshift)); + + // Store the results + _mm_storeu_si128((__m128i*)dst, argb_pixels_low); + src += 4; + dst += 4; + } + // Get any leftovers + switch (width & 3) { + case 3: + RGB_FROM_RGB565(*src, r, g, b); + *dst++ = (r << Rshift) | (g << Gshift) | (b << Bshift) | Amask; + ++src; + SDL_FALLTHROUGH; + case 2: + RGB_FROM_RGB565(*src, r, g, b); + *dst++ = (r << Rshift) | (g << Gshift) | (b << Bshift) | Amask; + ++src; + SDL_FALLTHROUGH; + case 1: + RGB_FROM_RGB565(*src, r, g, b); + *dst++ = (r << Rshift) | (g << Gshift) | (b << Bshift) | Amask; + ++src; + break; + } + src += srcskip; + dst += dstskip; + } +} + +static void Blit_RGB565_ARGB8888_SSE41(SDL_BlitInfo * info) +{ + Blit_RGB565_32_SSE41(info, 16, 8, 0, 0xFF000000); +} + +static void Blit_RGB565_ABGR8888_SSE41(SDL_BlitInfo * info) +{ + Blit_RGB565_32_SSE41(info, 0, 8, 16, 0xFF000000); +} + +static void Blit_RGB565_RGBA8888_SSE41(SDL_BlitInfo * info) +{ + Blit_RGB565_32_SSE41(info, 24, 16, 8, 0x000000FF); +} + +static void Blit_RGB565_BGRA8888_SSE41(SDL_BlitInfo * info) +{ + Blit_RGB565_32_SSE41(info, 8, 16, 24, 0x000000FF); +} + +#endif // SDL_SSE4_1_INTRINSICS + #ifdef SDL_HAVE_BLIT_N_RGB565 // Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces @@ -2700,6 +2803,16 @@ static const struct blit_table normal_blit_2[] = { { 0x00007C00, 0x000003E0, 0x0000001F, 4, 0x00000000, 0x00000000, 0x00000000, BLIT_FEATURE_HAS_ALTIVEC, Blit_RGB555_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA }, #endif +#ifdef SDL_SSE4_1_INTRINSICS + { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00FF0000, 0x0000FF00, 0x000000FF, + BLIT_FEATURE_HAS_SSE41, Blit_RGB565_ARGB8888_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA }, + { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0x000000FF, 0x0000FF00, 0x00FF0000, + BLIT_FEATURE_HAS_SSE41, Blit_RGB565_ABGR8888_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA }, + { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0xFF000000, 0x00FF0000, 0x0000FF00, + BLIT_FEATURE_HAS_SSE41, Blit_RGB565_RGBA8888_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA }, + { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0x0000FF00, 0x00FF0000, 0xFF000000, + BLIT_FEATURE_HAS_SSE41, Blit_RGB565_BGRA8888_SSE41, NO_ALPHA | COPY_ALPHA | SET_ALPHA }, +#endif #ifdef SDL_HAVE_BLIT_N_RGB565 { 0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00FF0000, 0x0000FF00, 0x000000FF, 0, Blit_RGB565_ARGB8888, NO_ALPHA | COPY_ALPHA | SET_ALPHA },