Allow optimizing memcpy and memset where possible

Modern C runtimes have well-optimized memset and memcpy implementations, so use those instead of dispatching into SDL's versions. In addition, some compilers can analyze memset and memcpy calls and turn them directly into optimized assembly.
Sam Lantinga
2024-01-20 08:12:11 -08:00
parent 8d25c2d260
commit 7a069cc4b0
16 changed files with 390 additions and 220 deletions
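
The diff below removes SDL's hand-rolled fallback implementations from SDL_string.c. As a rough sketch of the dispatch the commit message describes (the names here are illustrative, not the actual changes, which span the 16 files listed above), the idea is to let calls reach the C runtime's memcpy()/memset() so the compiler can treat them as builtins and inline or vectorize them:

#include <string.h> /* memcpy() and memset() from the C runtime */
#include <stddef.h>

/* Hypothetical wrappers for illustration; the commit changes how
   SDL_memcpy()/SDL_memset() are provided rather than adding these names. */
static void *example_memcpy(void *dst, const void *src, size_t len)
{
    /* A plain memcpy() call is visible to the compiler, which can expand
       small or constant-size copies directly into loads and stores. */
    return memcpy(dst, src, len);
}

static void *example_memset(void *dst, int c, size_t len)
{
    return memset(dst, c, len);
}

GCC, Clang, and MSVC all recognize memcpy() and memset() as intrinsics, so for fixed sizes even the call overhead can disappear entirely.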


@@ -510,147 +510,6 @@ int SDL_toupper(int x) { return ((x) >= 'a') && ((x) <= 'z') ? ('A' + ((x) - 'a'
int SDL_tolower(int x) { return ((x) >= 'A') && ((x) <= 'Z') ? ('a' + ((x) - 'A')) : (x); }
#endif
/* This file contains a portable memcpy manipulation function for SDL */
void *SDL_memcpy(SDL_OUT_BYTECAP(len) void *dst, SDL_IN_BYTECAP(len) const void *src, size_t len)
{
#ifdef __GNUC__
/* Presumably this is well tuned for speed.
On my machine this is twice as fast as the C code below.
*/
return __builtin_memcpy(dst, src, len);
#elif defined(HAVE_MEMCPY)
return memcpy(dst, src, len);
#elif defined(HAVE_BCOPY)
bcopy(src, dst, len);
return dst;
#else
/* GCC 4.9.0 with -O3 will generate movaps instructions with the loop
using Uint32* pointers, so we need to make sure the pointers are
aligned before we loop using them.
*/
if (((uintptr_t)src & 0x3) || ((uintptr_t)dst & 0x3)) {
/* Do an unaligned byte copy */
Uint8 *srcp1 = (Uint8 *)src;
Uint8 *dstp1 = (Uint8 *)dst;
while (len--) {
*dstp1++ = *srcp1++;
}
} else {
size_t left = (len % 4);
Uint32 *srcp4, *dstp4;
Uint8 *srcp1, *dstp1;
srcp4 = (Uint32 *)src;
dstp4 = (Uint32 *)dst;
len /= 4;
while (len--) {
*dstp4++ = *srcp4++;
}
srcp1 = (Uint8 *)srcp4;
dstp1 = (Uint8 *)dstp4;
switch (left) {
case 3:
*dstp1++ = *srcp1++;
case 2:
*dstp1++ = *srcp1++;
case 1:
*dstp1++ = *srcp1++;
}
}
return dst;
#endif /* __GNUC__ */
}
void *SDL_memset(SDL_OUT_BYTECAP(len) void *dst, int c, size_t len)
{
#ifdef HAVE_MEMSET
return memset(dst, c, len);
#else
size_t left;
Uint32 *dstp4;
Uint8 *dstp1 = (Uint8 *)dst;
Uint8 value1;
Uint32 value4;
/* The value used in memset() is a byte, passed as an int */
c &= 0xff;
/* The destination pointer needs to be aligned on a 4-byte boundary to
* execute a 32-bit set. Set first bytes manually if needed until it is
* aligned. */
value1 = (Uint8)c;
while ((uintptr_t)dstp1 & 0x3) {
if (len--) {
*dstp1++ = value1;
} else {
return dst;
}
}
value4 = ((Uint32)c | ((Uint32)c << 8) | ((Uint32)c << 16) | ((Uint32)c << 24));
dstp4 = (Uint32 *)dstp1;
left = (len % 4);
len /= 4;
while (len--) {
*dstp4++ = value4;
}
dstp1 = (Uint8 *)dstp4;
switch (left) {
case 3:
*dstp1++ = value1;
case 2:
*dstp1++ = value1;
case 1:
*dstp1++ = value1;
}
return dst;
#endif /* HAVE_MEMSET */
}
/* Note that memset() is a byte assignment and this is a 32-bit assignment, so they're not directly equivalent. */
void *SDL_memset4(void *dst, Uint32 val, size_t dwords)
{
#if defined(__APPLE__) && defined(HAVE_STRING_H)
memset_pattern4(dst, &val, dwords * 4);
#elif defined(__GNUC__) && defined(__i386__)
int u0, u1, u2;
__asm__ __volatile__(
"cld \n\t"
"rep ; stosl \n\t"
: "=&D"(u0), "=&a"(u1), "=&c"(u2)
: "0"(dst), "1"(val), "2"(SDL_static_cast(Uint32, dwords))
: "memory");
#else
size_t _n = (dwords + 3) / 4;
Uint32 *_p = SDL_static_cast(Uint32 *, dst);
Uint32 _val = (val);
if (dwords == 0) {
return dst;
}
switch (dwords % 4) {
case 0:
do {
*_p++ = _val;
SDL_FALLTHROUGH;
case 3:
*_p++ = _val;
SDL_FALLTHROUGH;
case 2:
*_p++ = _val;
SDL_FALLTHROUGH;
case 1:
*_p++ = _val;
} while (--_n);
}
#endif
return dst;
}
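/* Illustration (not part of the removed code above): because SDL_memset4()
 * stores whole 32-bit words, it matches SDL_memset()/memset() only when all
 * four bytes of the value are equal. For example, filling a pixel buffer:
 *
 *     Uint32 pixels[64];
 *     SDL_memset4(pixels, 0xFF00FF00u, SDL_arraysize(pixels)); // 64 dwords, one 32-bit value per pixel
 *     SDL_memset(pixels, 0, sizeof(pixels));                   // 256 bytes, one byte value repeated
 */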
#if defined(HAVE_CTYPE_H) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
int SDL_isblank(int x)
{