diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c index 96e1714da0..4421aed0e2 100644 --- a/src/video/SDL_blit_A_avx2.c +++ b/src/video/SDL_blit_A_avx2.c @@ -6,48 +6,6 @@ #include "SDL_blit.h" -// Using the AVX2 instruction set, blit sixteen pixels into eight with alpha blending -SDL_FORCE_INLINE __m256i SDL_TARGETING("avx2") MixRGBA_AVX2( - __m256i src, __m256i dst, - const __m256i alpha_shuffle, const __m256i alpha_saturate) -{ - // SIMD implementation of blend_mul2. - // dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA)) - // dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA)) - - // Splat the alpha into all channels for each pixel - __m256i srca = _mm256_shuffle_epi8(src, alpha_shuffle); - - // Set the alpha channels of src to 255 - src = _mm256_or_si256(src, alpha_saturate); - - __m256i src_lo = _mm256_unpacklo_epi8(src, _mm256_setzero_si256()); - __m256i src_hi = _mm256_unpackhi_epi8(src, _mm256_setzero_si256()); - - __m256i dst_lo = _mm256_unpacklo_epi8(dst, _mm256_setzero_si256()); - __m256i dst_hi = _mm256_unpackhi_epi8(dst, _mm256_setzero_si256()); - - __m256i srca_lo = _mm256_unpacklo_epi8(srca, _mm256_setzero_si256()); - __m256i srca_hi = _mm256_unpackhi_epi8(srca, _mm256_setzero_si256()); - - // dst = ((src - dst) * srcA) + ((dst << 8) - dst) - dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo), - _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo)); - dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi), - _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi)); - - // dst += 0x1U (use 0x80 to round instead of floor) - dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1)); - dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1)); - - // dst += dst >> 8 - dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8); - dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8); - - dst = _mm256_packus_epi16(dst_lo, dst_hi); - return dst; -} - void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) { int width = info->dst_w; @@ -59,32 +17,64 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) SDL_PixelFormat *srcfmt = info->src_fmt; SDL_PixelFormat *dstfmt = info->dst_fmt; + // The byte offsets for the start of each pixel const __m256i mask_offsets = _mm256_set_epi8( 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); - const __m256i shift_mask = _mm256_add_epi32( + const __m256i convert_mask = _mm256_add_epi32( _mm256_set1_epi32( ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | - ((srcfmt->Bshift >> 3) << dstfmt->Bshift) | - ((srcfmt->Ashift >> 3) << dstfmt->Ashift)), + ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), mask_offsets); - const __m256i splat_mask = _mm256_add_epi8(_mm256_set1_epi8(dstfmt->Ashift >> 3), mask_offsets); - const __m256i saturate_mask = _mm256_set1_epi32((int)dstfmt->Amask); + const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); + const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask); while (height--) { int i = 0; for (; i + 8 <= width; i += 8) { - // Load 8 src pixels and shuffle into the dst format - __m256i c_src = _mm256_shuffle_epi8(_mm256_loadu_si256((__m256i *)src), shift_mask); + // Load 8 src pixels + __m256i src256 = _mm256_loadu_si256((__m256i *)src); // Load 8 dst pixels - __m256i c_dst = _mm256_loadu_si256((__m256i *)dst); + __m256i dst256 = _mm256_loadu_si256((__m256i *)dst); + + // Extract the alpha from each pixel and splat it into all the channels + __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask); + + // Convert to dst format + src256 = _mm256_shuffle_epi8(src256, convert_mask); + + // Set the alpha channels of src to 255 + src256 = _mm256_or_si256(src256, alpha_fill_mask); + + __m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256()); + __m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256()); + + __m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256()); + __m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256()); + + __m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256()); + __m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256()); + + // dst = ((src - dst) * srcA) + ((dst << 8) - dst) + dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo), + _mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo)); + dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi), + _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi)); + + // dst += 0x1U (use 0x80 to round instead of floor) + dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1)); + dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1)); + + // dst += dst >> 8 + dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8); + dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8); // Blend the pixels together and save the result - _mm256_storeu_si256((__m256i *)dst, MixRGBA_AVX2(c_src, c_dst, splat_mask, saturate_mask)); + _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi)); src += 32; dst += 32; @@ -94,12 +84,29 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info) Uint32 src32 = *(Uint32 *)src; Uint32 dst32 = *(Uint32 *)dst; + Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF; + src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) | (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) | (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) | - (((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift); + dstfmt->Amask; - ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift); + Uint32 srcRB = src32 & 0x00FF00FF; + Uint32 dstRB = dst32 & 0x00FF00FF; + + Uint32 srcGA = (src32 >> 8) & 0x00FF00FF; + Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF; + + Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB; + resRB += 0x00010001; + resRB += (resRB >> 8) & 0x00FF00FF; + resRB = (resRB >> 8) & 0x00FF00FF; + + Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA; + resGA += 0x00010001; + resGA += (resGA >> 8) & 0x00FF00FF; + resGA &= 0xFF00FF00; + dst32 = resRB | resGA; *(Uint32 *)dst = dst32; diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c index 3e68819431..425f5f0281 100644 --- a/src/video/SDL_blit_A_sse4_1.c +++ b/src/video/SDL_blit_A_sse4_1.c @@ -6,48 +6,6 @@ #include "SDL_blit.h" -// Using the SSE4.1 instruction set, blit eight pixels into four with alpha blending -SDL_FORCE_INLINE __m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1( - __m128i src, __m128i dst, - const __m128i alpha_shuffle, const __m128i alpha_saturate) -{ - // SIMD implementation of blend_mul2. - // dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA)) - // dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA)) - - // Splat the alpha into all channels for each pixel - __m128i srca = _mm_shuffle_epi8(src, alpha_shuffle); - - // Set the alpha channels of src to 255 - src = _mm_or_si128(src, alpha_saturate); - - __m128i src_lo = _mm_unpacklo_epi8(src, _mm_setzero_si128()); - __m128i src_hi = _mm_unpackhi_epi8(src, _mm_setzero_si128()); - - __m128i dst_lo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); - __m128i dst_hi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); - - __m128i srca_lo = _mm_unpacklo_epi8(srca, _mm_setzero_si128()); - __m128i srca_hi = _mm_unpackhi_epi8(srca, _mm_setzero_si128()); - - // dst = ((src - dst) * srcA) + ((dst << 8) - dst) - dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo), - _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo)); - dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi), - _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi)); - - // dst += 0x1U (use 0x80 to round instead of floor) - dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1)); - dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1)); - - // dst += dst >> 8 - dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8); - dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8); - - dst = _mm_packus_epi16(dst_lo, dst_hi); - return dst; -} - void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info) { int width = info->dst_w; @@ -59,32 +17,64 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info) SDL_PixelFormat *srcfmt = info->src_fmt; SDL_PixelFormat *dstfmt = info->dst_fmt; + // The byte offsets for the start of each pixel const __m128i mask_offsets = _mm_set_epi8( 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); - const __m128i shift_mask = _mm_add_epi32( + const __m128i convert_mask = _mm_add_epi32( _mm_set1_epi32( ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | - ((srcfmt->Bshift >> 3) << dstfmt->Bshift) | - ((srcfmt->Ashift >> 3) << dstfmt->Ashift)), + ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), mask_offsets); - const __m128i splat_mask = _mm_add_epi8(_mm_set1_epi8(dstfmt->Ashift >> 3), mask_offsets); - const __m128i saturate_mask = _mm_set1_epi32((int)dstfmt->Amask); + const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); + const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask); while (height--) { int i = 0; for (; i + 4 <= width; i += 4) { - // Load 4 src pixels and shuffle into the dst format - __m128i c_src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)src), shift_mask); + // Load 4 src pixels + __m128i src128 = _mm_loadu_si128((__m128i *)src); // Load 4 dst pixels - __m128i c_dst = _mm_loadu_si128((__m128i *)dst); + __m128i dst128 = _mm_loadu_si128((__m128i *)dst); + + // Extract the alpha from each pixel and splat it into all the channels + __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask); + + // Convert to dst format + src128 = _mm_shuffle_epi8(src128, convert_mask); + + // Set the alpha channels of src to 255 + src128 = _mm_or_si128(src128, alpha_fill_mask); + + __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128()); + __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128()); + + __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128()); + __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128()); + + __m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128()); + __m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128()); + + // dst = ((src - dst) * srcA) + ((dst << 8) - dst) + dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo), + _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo)); + dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi), + _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi)); + + // dst += 0x1U (use 0x80 to round instead of floor) + dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1)); + dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1)); + + // dst += dst >> 8 + dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8); + dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8); // Blend the pixels together and save the result - _mm_storeu_si128((__m128i *)dst, MixRGBA_SSE4_1(c_src, c_dst, splat_mask, saturate_mask)); + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi)); src += 16; dst += 16; @@ -94,12 +84,29 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info) Uint32 src32 = *(Uint32 *)src; Uint32 dst32 = *(Uint32 *)dst; + Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF; + src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) | (((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) | (((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) | - (((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift); + dstfmt->Amask; - ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift); + Uint32 srcRB = src32 & 0x00FF00FF; + Uint32 dstRB = dst32 & 0x00FF00FF; + + Uint32 srcGA = (src32 >> 8) & 0x00FF00FF; + Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF; + + Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB; + resRB += 0x00010001; + resRB += (resRB >> 8) & 0x00FF00FF; + resRB = (resRB >> 8) & 0x00FF00FF; + + Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA; + resGA += 0x00010001; + resGA += (resGA >> 8) & 0x00FF00FF; + resGA &= 0xFF00FF00; + dst32 = resRB | resGA; *(Uint32 *)dst = dst32;