Fix BlitNtoNPixelAlpha for formats with no dst alpha

This commit is contained in:
Brick
2024-07-03 16:37:26 +01:00
committed by Sam Lantinga
parent 3c59d3f69c
commit c457dbb629
2 changed files with 120 additions and 106 deletions

View File

@@ -6,48 +6,6 @@
#include "SDL_blit.h"
// Using the AVX2 instruction set, blit sixteen pixels into eight with alpha blending
SDL_FORCE_INLINE __m256i SDL_TARGETING("avx2") MixRGBA_AVX2(
__m256i src, __m256i dst,
const __m256i alpha_shuffle, const __m256i alpha_saturate)
{
// SIMD implementation of blend_mul2.
// dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
// dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA))
// Splat the alpha into all channels for each pixel
__m256i srca = _mm256_shuffle_epi8(src, alpha_shuffle);
// Set the alpha channels of src to 255
src = _mm256_or_si256(src, alpha_saturate);
__m256i src_lo = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
__m256i src_hi = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
__m256i dst_lo = _mm256_unpacklo_epi8(dst, _mm256_setzero_si256());
__m256i dst_hi = _mm256_unpackhi_epi8(dst, _mm256_setzero_si256());
__m256i srca_lo = _mm256_unpacklo_epi8(srca, _mm256_setzero_si256());
__m256i srca_hi = _mm256_unpackhi_epi8(srca, _mm256_setzero_si256());
// dst = ((src - dst) * srcA) + ((dst << 8) - dst)
dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
_mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
_mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
// dst += 0x1U (use 0x80 to round instead of floor)
dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
// dst += dst >> 8
dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
dst = _mm256_packus_epi16(dst_lo, dst_hi);
return dst;
}
void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
{
int width = info->dst_w;
@@ -59,32 +17,64 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
SDL_PixelFormat *srcfmt = info->src_fmt;
SDL_PixelFormat *dstfmt = info->dst_fmt;
// The byte offsets for the start of each pixel
const __m256i mask_offsets = _mm256_set_epi8(
28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
const __m256i shift_mask = _mm256_add_epi32(
const __m256i convert_mask = _mm256_add_epi32(
_mm256_set1_epi32(
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
((srcfmt->Bshift >> 3) << dstfmt->Bshift) |
((srcfmt->Ashift >> 3) << dstfmt->Ashift)),
((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
mask_offsets);
const __m256i splat_mask = _mm256_add_epi8(_mm256_set1_epi8(dstfmt->Ashift >> 3), mask_offsets);
const __m256i saturate_mask = _mm256_set1_epi32((int)dstfmt->Amask);
const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask);
while (height--) {
int i = 0;
for (; i + 8 <= width; i += 8) {
// Load 8 src pixels and shuffle into the dst format
__m256i c_src = _mm256_shuffle_epi8(_mm256_loadu_si256((__m256i *)src), shift_mask);
// Load 8 src pixels
__m256i src256 = _mm256_loadu_si256((__m256i *)src);
// Load 8 dst pixels
__m256i c_dst = _mm256_loadu_si256((__m256i *)dst);
__m256i dst256 = _mm256_loadu_si256((__m256i *)dst);
// Extract the alpha from each pixel and splat it into all the channels
__m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);
// Convert to dst format
src256 = _mm256_shuffle_epi8(src256, convert_mask);
// Set the alpha channels of src to 255
src256 = _mm256_or_si256(src256, alpha_fill_mask);
__m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256());
__m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256());
__m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256());
__m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256());
__m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256());
__m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256());
// dst = ((src - dst) * srcA) + ((dst << 8) - dst)
dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
_mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
_mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
// dst += 0x1U (use 0x80 to round instead of floor)
dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
// dst += dst >> 8
dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
// Blend the pixels together and save the result
_mm256_storeu_si256((__m256i *)dst, MixRGBA_AVX2(c_src, c_dst, splat_mask, saturate_mask));
_mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi));
src += 32;
dst += 32;
@@ -94,12 +84,29 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
Uint32 src32 = *(Uint32 *)src;
Uint32 dst32 = *(Uint32 *)dst;
Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF;
src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
(((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
(((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
(((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift);
dstfmt->Amask;
ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift);
Uint32 srcRB = src32 & 0x00FF00FF;
Uint32 dstRB = dst32 & 0x00FF00FF;
Uint32 srcGA = (src32 >> 8) & 0x00FF00FF;
Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF;
Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
resRB += 0x00010001;
resRB += (resRB >> 8) & 0x00FF00FF;
resRB = (resRB >> 8) & 0x00FF00FF;
Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
resGA += 0x00010001;
resGA += (resGA >> 8) & 0x00FF00FF;
resGA &= 0xFF00FF00;
dst32 = resRB | resGA;
*(Uint32 *)dst = dst32;

View File

@@ -6,48 +6,6 @@
#include "SDL_blit.h"
// Using the SSE4.1 instruction set, blit eight pixels into four with alpha blending
SDL_FORCE_INLINE __m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(
__m128i src, __m128i dst,
const __m128i alpha_shuffle, const __m128i alpha_saturate)
{
// SIMD implementation of blend_mul2.
// dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
// dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA))
// Splat the alpha into all channels for each pixel
__m128i srca = _mm_shuffle_epi8(src, alpha_shuffle);
// Set the alpha channels of src to 255
src = _mm_or_si128(src, alpha_saturate);
__m128i src_lo = _mm_unpacklo_epi8(src, _mm_setzero_si128());
__m128i src_hi = _mm_unpackhi_epi8(src, _mm_setzero_si128());
__m128i dst_lo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
__m128i dst_hi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
__m128i srca_lo = _mm_unpacklo_epi8(srca, _mm_setzero_si128());
__m128i srca_hi = _mm_unpackhi_epi8(srca, _mm_setzero_si128());
// dst = ((src - dst) * srcA) + ((dst << 8) - dst)
dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
_mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
_mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
// dst += 0x1U (use 0x80 to round instead of floor)
dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
// dst += dst >> 8
dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
dst = _mm_packus_epi16(dst_lo, dst_hi);
return dst;
}
void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
{
int width = info->dst_w;
@@ -59,32 +17,64 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
SDL_PixelFormat *srcfmt = info->src_fmt;
SDL_PixelFormat *dstfmt = info->dst_fmt;
// The byte offsets for the start of each pixel
const __m128i mask_offsets = _mm_set_epi8(
12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
const __m128i shift_mask = _mm_add_epi32(
const __m128i convert_mask = _mm_add_epi32(
_mm_set1_epi32(
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
((srcfmt->Bshift >> 3) << dstfmt->Bshift) |
((srcfmt->Ashift >> 3) << dstfmt->Ashift)),
((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
mask_offsets);
const __m128i splat_mask = _mm_add_epi8(_mm_set1_epi8(dstfmt->Ashift >> 3), mask_offsets);
const __m128i saturate_mask = _mm_set1_epi32((int)dstfmt->Amask);
const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask);
while (height--) {
int i = 0;
for (; i + 4 <= width; i += 4) {
// Load 4 src pixels and shuffle into the dst format
__m128i c_src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)src), shift_mask);
// Load 4 src pixels
__m128i src128 = _mm_loadu_si128((__m128i *)src);
// Load 4 dst pixels
__m128i c_dst = _mm_loadu_si128((__m128i *)dst);
__m128i dst128 = _mm_loadu_si128((__m128i *)dst);
// Extract the alpha from each pixel and splat it into all the channels
__m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);
// Convert to dst format
src128 = _mm_shuffle_epi8(src128, convert_mask);
// Set the alpha channels of src to 255
src128 = _mm_or_si128(src128, alpha_fill_mask);
__m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
__m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
__m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
__m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
__m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128());
__m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128());
// dst = ((src - dst) * srcA) + ((dst << 8) - dst)
dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
_mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
_mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
// dst += 0x1U (use 0x80 to round instead of floor)
dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
// dst += dst >> 8
dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
// Blend the pixels together and save the result
_mm_storeu_si128((__m128i *)dst, MixRGBA_SSE4_1(c_src, c_dst, splat_mask, saturate_mask));
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi));
src += 16;
dst += 16;
@@ -94,12 +84,29 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
Uint32 src32 = *(Uint32 *)src;
Uint32 dst32 = *(Uint32 *)dst;
Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF;
src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
(((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
(((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
(((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift);
dstfmt->Amask;
ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift);
Uint32 srcRB = src32 & 0x00FF00FF;
Uint32 dstRB = dst32 & 0x00FF00FF;
Uint32 srcGA = (src32 >> 8) & 0x00FF00FF;
Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF;
Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
resRB += 0x00010001;
resRB += (resRB >> 8) & 0x00FF00FF;
resRB = (resRB >> 8) & 0x00FF00FF;
Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
resGA += 0x00010001;
resGA += (resGA >> 8) & 0x00FF00FF;
resGA &= 0xFF00FF00;
dst32 = resRB | resGA;
*(Uint32 *)dst = dst32;