From a027ad97889feb1aa61ad142d990f53b10437356 Mon Sep 17 00:00:00 2001 From: Isaac Aronson Date: Fri, 8 Sep 2023 07:39:24 -0500 Subject: [PATCH] Remove buffer in SSE4.1, use unpacklo and packus intrinsics --- src/video/SDL_blit_A_avx2.c | 9 ++++-- src/video/SDL_blit_A_sse4_1.c | 54 +++++++++++++++++------------------ 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c index 78bdf9ecc7..4c157bc643 100644 --- a/src/video/SDL_blit_A_avx2.c +++ b/src/video/SDL_blit_A_avx2.c @@ -84,9 +84,10 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) Uint32 *src_ptr = ((Uint32*)(src + (offset * 4))); Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4))); __m128i c_src = _mm_loadu_si64(src_ptr); - c_src = _mm_shuffle_epi8(c_src, colorShiftMask); - __m128i c_dst = _mm_loadu_si64(dst_ptr); + c_src = _mm_unpacklo_epi8(_mm_shuffle_epi8(c_src, colorShiftMask), _mm_setzero_si128()); + __m128i c_dst = _mm_unpacklo_epi8(_mm_loadu_si64(dst_ptr), _mm_setzero_si128()); __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask); + c_mix = _mm_packus_epi16(c_mix, _mm_setzero_si128()); _mm_storeu_si64(dst_ptr, c_mix); remaining_pixels -= 2; offset += 2; @@ -103,7 +104,11 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) __m128i c_src = _mm_loadu_si32(&pixel); __m128i c_dst = _mm_loadu_si32(dst_ptr); #endif + c_src = _mm_unpacklo_epi8(c_src, _mm_setzero_si128()); + c_dst = _mm_unpacklo_epi8(c_dst, _mm_setzero_si128()); __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, sse4_1AlphaMask); + mixed_pixel = _mm_srli_epi16(mixed_pixel, 8); + mixed_pixel = _mm_unpacklo_epi8(mixed_pixel, _mm_setzero_si128()); /* Old GCC has bad or no _mm_storeu_si32 */ #if defined(__GNUC__) && (__GNUC__ < 11) *dst_ptr = _mm_extract_epi32(mixed_pixel, 0); diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c index 5348879277..82100ae9e5 100644 --- a/src/video/SDL_blit_A_sse4_1.c +++ b/src/video/SDL_blit_A_sse4_1.c @@ -13,38 +13,33 @@ * A helper function to create an alpha mask for use with MixRGBA_SSE4_1 based on pixel format */ __m128i SDL_TARGETING("sse4.1") GetSDL_PixelFormatAlphaMask_SSE4_1(const SDL_PixelFormat* dstfmt) { - Uint8 index = dstfmt->Ashift / 8; + Uint8 index = dstfmt->Ashift / 4; /* Handle case where bad input sent */ if (dstfmt->Ashift == dstfmt->Bshift && dstfmt->Ashift == 0) { - index = 3; + index = 6; } return _mm_set_epi8( - -1, index + 4, -1, index + 4, -1, index + 4, -1, index + 4, + -1, index + 8, -1, index + 8, -1, index + 8, -1, index + 8, -1, index, -1, index, -1, index, -1, index); } /** * Using the SSE4.1 instruction set, blit four pixels with alpha blending - * @param src A pointer to two 32-bit pixels of ARGB format to blit into dst - * @param dst A pointer to two 32-bit pixels of ARGB format to retain visual data for while alpha blending + * @param src_color A pointer to two 32-bit pixels of ARGB format to blit into dst + * @param dst_color A pointer to two 32-bit pixels of ARGB format to retain visual data for while alpha blending * @return A 128-bit wide vector of two alpha-blended pixels in ARGB format */ -__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(const __m128i src, const __m128i dst, const __m128i alphaMask) { - __m128i src_color = _mm_cvtepu8_epi16(src); - __m128i dst_color = _mm_cvtepu8_epi16(dst); +__m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(const __m128i src_color, const __m128i dst_color, const __m128i alphaMask) { /** * Combines a shuffle and an _mm_cvtepu8_epi16 operation into one operation by moving the lower 8 bits of the alpha * channel around to create 16-bit integers. */ - __m128i alpha = _mm_shuffle_epi8(src, alphaMask); + __m128i alpha = _mm_shuffle_epi8(src_color, alphaMask); __m128i sub = _mm_sub_epi16(src_color, dst_color); __m128i mul = _mm_mullo_epi16(sub, alpha); - const __m128i SHUFFLE_REDUCE = _mm_set_epi8( - -1, -1, -1, -1, -1, -1, -1, -1, - 15, 13, 11, 9, 7, 5, 3, 1); - __m128i reduced = _mm_shuffle_epi8(mul, SHUFFLE_REDUCE); + mul = _mm_srli_epi16(mul, 8); - return _mm_add_epi8(reduced, dst); + return _mm_add_epi8(mul, dst_color); } Uint32 AlignPixelToSDL_PixelFormat(Uint32 color, const SDL_PixelFormat* srcfmt, const SDL_PixelFormat* dstfmt) { @@ -107,23 +102,22 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { SDL_PixelFormat *dstfmt = info->dst_fmt; int chunks = width / 4; - Uint8 *buffer = (Uint8*)SDL_malloc(chunks * 16 * sizeof(Uint8)); const __m128i colorShiftMask = GetSDL_PixelFormatShuffleMask(srcfmt, dstfmt); const __m128i alphaMask = GetSDL_PixelFormatAlphaMask_SSE4_1(dstfmt); while (height--) { /* Process 4-wide chunks of source color data that may be in wrong format into buffer */ for (int i = 0; i < chunks; i += 1) { - __m128i colors = _mm_loadu_si128((__m128i*)(src + i * 16)); - _mm_storeu_si128((__m128i*)(buffer + i * 16), _mm_shuffle_epi8(colors, colorShiftMask)); - } - - /* Alpha-blend in 2-wide chunks from buffer into destination */ - for (int i = 0; i < chunks * 2; i += 1) { - __m128i c_src = _mm_loadu_si64((buffer + (i * 8))); - __m128i c_dst = _mm_loadu_si64((dst + i * 8)); - __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, alphaMask); - _mm_storeu_si64(dst + i * 8, c_mix); + __m128i c_src = _mm_loadu_si128((__m128i*)(src + i * 16)); + c_src = _mm_shuffle_epi8(c_src, colorShiftMask); + __m128i c_dst = _mm_loadu_si128((__m128i*)(dst + i * 16)); + __m128i src_lo = _mm_unpacklo_epi8(c_src, _mm_setzero_si128()); + __m128i dst_lo = _mm_unpacklo_epi8(c_dst, _mm_setzero_si128()); + __m128i mix_lo = MixRGBA_SSE4_1(src_lo, dst_lo, alphaMask); + __m128i src_hi = _mm_unpackhi_epi8(c_src, _mm_setzero_si128()); + __m128i dst_hi = _mm_unpackhi_epi8(c_dst, _mm_setzero_si128()); + __m128i mix_hi = MixRGBA_SSE4_1(src_hi, dst_hi, alphaMask); + _mm_storeu_si128((__m128i*)(dst + i * 16), _mm_packus_epi16(mix_lo, mix_hi)); } /* Handle remaining pixels when width is not a multiple of 4 */ @@ -134,9 +128,10 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { Uint32 *src_ptr = ((Uint32*)(src + (offset * 4))); Uint32 *dst_ptr = ((Uint32*)(dst + (offset * 4))); __m128i c_src = _mm_loadu_si64(src_ptr); - c_src = _mm_shuffle_epi8(c_src, colorShiftMask); - __m128i c_dst = _mm_loadu_si64(dst_ptr); + c_src = _mm_unpacklo_epi8(_mm_shuffle_epi8(c_src, colorShiftMask), _mm_setzero_si128()); + __m128i c_dst = _mm_unpacklo_epi8(_mm_loadu_si64(dst_ptr), _mm_setzero_si128()); __m128i c_mix = MixRGBA_SSE4_1(c_src, c_dst, alphaMask); + c_mix = _mm_packus_epi16(c_mix, _mm_setzero_si128()); _mm_storeu_si64(dst_ptr, c_mix); remaining_pixels -= 2; offset += 2; @@ -153,7 +148,11 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { __m128i c_src = _mm_loadu_si32(&pixel); __m128i c_dst = _mm_loadu_si32(dst_ptr); #endif + c_src = _mm_unpacklo_epi8(c_src, _mm_setzero_si128()); + c_dst = _mm_unpacklo_epi8(c_dst, _mm_setzero_si128()); __m128i mixed_pixel = MixRGBA_SSE4_1(c_src, c_dst, alphaMask); + mixed_pixel = _mm_srli_epi16(mixed_pixel, 8); + mixed_pixel = _mm_unpacklo_epi8(mixed_pixel, _mm_setzero_si128()); /* Old GCC has bad or no _mm_storeu_si32 */ #if defined(__GNUC__) && (__GNUC__ < 11) *dst_ptr = _mm_extract_epi32(mixed_pixel, 0); @@ -169,7 +168,6 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo* info) { src += srcskip; dst += dstskip; } - SDL_free(buffer); } #endif