stdlib: Improve Unicode support and consistency in string comparison functions.

SDL_strcasecmp (even when calling into a C runtime) does not work with
Unicode chars, and depending on the user's locale, might not work with
even basic ASCII strings.

This implements the function from scratch, using "case-folding,"
which is a more robust method that deals with various languages. It
involves a hashtable of a few hundred codepoints that are "uppercase" and
how to map them to lowercase equivalents (possibly increasing the size of
the string in the process). The vast majority of human languages (and
Unicode) do not have letters with different cases, but still, this static
table takes about 10 kilobytes on a 64-bit machine.

Even this will fail in one known case: the Turkish 'i' folds differently
if you're writing in Turkish vs other languages. Generally this is seen as
unfortunate collateral damage in cases where you can't specify the language
in use.

In addition to case-folding the codepoints, the new functions also know how
to decode the various formats to turn them into codepoints in the first
place, instead of blindly stepping by one byte (or one wchar_t) per
character.

Also included is casefolding.txt from the Unicode Consortium and a perl
script to generate the hashtable from that text file, so we can trivially
update this if new languages are added in the future.

A simple test using the new function:

```c
 #include <SDL3/SDL.h>

 int main(void)
 {
     const char *a = "α ε η";
     const char *b = "Α Ε Η";
     SDL_Log("    strcasecmp(\"%s\", \"%s\") == %d\n", a, b, strcasecmp(a, b));
     SDL_Log("SDL_strcasecmp(\"%s\", \"%s\") == %d\n", a, b, SDL_strcasecmp(a, b));
     return 0;
 }
```

Produces:

```
INFO:     strcasecmp("α ε η", "Α Ε Η") == 32
INFO: SDL_strcasecmp("α ε η", "Α Ε Η") == 0
```

glibc strcasecmp() fails to compare a Greek lowercase string to its uppercase
equivalent, even with a UTF-8 locale, but SDL_strcasecmp() works.

Other SDL_stdinc.h functions are changed to be more consistent, which is to
say they now ignore any C runtime and often dictate that only English-based
low-ASCII works with them.

Fixes Issue #9313.
This commit is contained in:
Ryan C. Gordon
2024-03-26 13:22:38 -04:00
parent 4659a84bd1
commit a5c892d2c3
17 changed files with 4971 additions and 210 deletions

View File

@@ -28,6 +28,205 @@
#include <psp2/kernel/clib.h>
#endif
#include "SDL_casefolding.h"
// this is the Unicode REPLACEMENT CHARACTER, used for invalid codepoint values.
#define INVALID_UNICODE_CODEPOINT 0xFFFD
// SDL_SIZEOF_WCHAR_T is the size of wchar_t in bytes, as a preprocessor
// constant so it can be tested with #if (sizeof() can't be used there).
// Order of preference: the __SIZEOF_WCHAR_T__ macro when it is defined,
// then 2 on Windows, then a guess of 4 everywhere else.
#if defined(__SIZEOF_WCHAR_T__)
#define SDL_SIZEOF_WCHAR_T __SIZEOF_WCHAR_T__
#elif defined(SDL_PLATFORM_WINDOWS)
#define SDL_SIZEOF_WCHAR_T 2
#else // assume everything else is UTF-32 (add more tests if compiler-assert fails below!)
#define SDL_SIZEOF_WCHAR_T 4
#endif
// Fail the build if the value chosen above doesn't match the real wchar_t.
SDL_COMPILE_TIME_ASSERT(sizeof_wchar_t, sizeof(wchar_t) == SDL_SIZEOF_WCHAR_T);
// Case-fold a single codepoint. Both `from` and the values written to `to`
// are UTF-32.
//
// A fold can expand to as many as three codepoints, so `to` must have room
// for three Uint32 values. Returns the number of codepoints written (1 to 3).
// A codepoint with no entry in the folding tables is copied through as-is.
static int SDL_UnicodeCaseFold(const Uint32 from, Uint32 *to)
{
    // !!! FIXME: since the hashtable is static, maybe we should binary
    // !!! FIXME: search it instead of walking the whole bucket.

    if (from < 128) {
        // low-ASCII never needs the tables: only 'A'..'Z' change.
        const int is_upper = (from >= 'A') && (from <= 'Z');
        *to = is_upper ? (from + ('a' - 'A')) : from;
        return 1;
    }

    // Every table is indexed by (a masked-down piece of) this same hash.
    const Uint8 bucket_index = (Uint8) ((from ^ (from >> 8)) & 0xFF);

    if (from > 0xFFFF) {
        // Outside the Basic Multilingual Plane: only 1:1 mappings exist.
        const CaseFoldHashBucket1_32 *wide = &case_fold_hash1_32[bucket_index & 15];
        const int total = (int) wide->count;
        int idx = 0;
        while (idx < total) {
            const CaseFoldMapping1_32 *entry = &wide->list[idx];
            if (entry->from == from) {
                *to = entry->to0;
                return 1;
            }
            idx++;
        }
    } else {
        // Basic Multilingual Plane: try 1:1 first (most common), then the
        // mappings that expand to two and to three codepoints.
        const Uint16 key = (Uint16) from;
        const CaseFoldHashBucket1_16 *single = &case_fold_hash1_16[bucket_index];
        const CaseFoldHashBucket2_16 *pairs = &case_fold_hash2_16[bucket_index & 15];
        const CaseFoldHashBucket3_16 *triples = &case_fold_hash3_16[bucket_index & 3];
        int total;
        int idx;

        total = (int) single->count;
        for (idx = 0; idx < total; idx++) {
            const CaseFoldMapping1_16 *entry = &single->list[idx];
            if (entry->from == key) {
                *to = entry->to0;
                return 1;
            }
        }

        total = (int) pairs->count;
        for (idx = 0; idx < total; idx++) {
            const CaseFoldMapping2_16 *entry = &pairs->list[idx];
            if (entry->from == key) {
                to[0] = entry->to0;
                to[1] = entry->to1;
                return 2;
            }
        }

        total = (int) triples->count;
        for (idx = 0; idx < total; idx++) {
            const CaseFoldMapping3_16 *entry = &triples->list[idx];
            if (entry->from == key) {
                to[0] = entry->to0;
                to[1] = entry->to1;
                to[2] = entry->to2;
                return 3;
            }
        }
    }

    // Not in any table: this codepoint folds to itself.
    *to = from;
    return 1;
}
// Body of a case-insensitive string compare, shared by the char and wchar_t
// variants. It must be expanded inside a function that has `str1` and `str2`
// pointer variables in scope; the macro returns from that function with
// -1, 1, or 0.
//
//  bits          8, 16 or 32: selects SDL_StepUTF##bits and the Uint##bits
//                type used for `str1start`/`str2start`.
//  slen1, slen2  expressions handed to the step function as the remaining
//                length of each string.
//  update_slen1, update_slen2
//                statements run right after each step; they may use
//                `str1start`/`str2start` (the position before the step) to
//                adjust the remaining length.
//
// Each decoded codepoint is case-folded into a 3-entry buffer (a fold can
// produce up to three codepoints). `head` is how many entries the last fold
// produced and `tail` is the next one to consume, so a new codepoint is only
// decoded once the previous fold's output has been used up. Folded
// codepoints are compared one at a time; two equal zero codepoints end the
// loop as a complete match.
#define UNICODE_STRCASECMP(bits, slen1, slen2, update_slen1, update_slen2) \
Uint32 folded1[3], folded2[3]; \
int head1 = 0, tail1 = 0, head2 = 0, tail2 = 0; \
while (SDL_TRUE) { \
Uint32 cp1, cp2; \
if (head1 != tail1) { \
cp1 = folded1[tail1++]; \
} else { \
const Uint##bits *str1start = (const Uint##bits *) str1; \
head1 = SDL_UnicodeCaseFold(SDL_StepUTF##bits(&str1, slen1), folded1); \
update_slen1; \
cp1 = folded1[0]; \
tail1 = 1; \
} \
if (head2 != tail2) { \
cp2 = folded2[tail2++]; \
} else { \
const Uint##bits *str2start = (const Uint##bits *) str2; \
head2 = SDL_UnicodeCaseFold(SDL_StepUTF##bits(&str2, slen2), folded2); \
update_slen2; \
cp2 = folded2[0]; \
tail2 = 1; \
} \
if (cp1 < cp2) { \
return -1; \
} else if (cp1 > cp2) { \
return 1; \
} else if (cp1 == 0) { \
break; /* complete match. */ \
} \
} \
return 0
static Uint32 SDL_StepUTF8(const char **_str, const size_t slen)
{
const char *str = *_str;
const Uint32 octet = (Uint32) (slen ? ((Uint8) *str) : 0);
// !!! FIXME: this could have _way_ more error checking! Illegal surrogate codepoints, unexpected bit patterns, etc.
if (octet == 0) { // null terminator, end of string.
return 0; // don't advance `*_str`.
} else if ((octet & 0x80) == 0) { // 0xxxxxxx: one byte codepoint.
(*_str)++;
return octet;
} else if (((octet & 0xE0) == 0xC0) && (slen >= 2)) { // 110xxxxx 10xxxxxx: two byte codepoint.
if (slen >= 2) {
*_str += 2;
return ((octet & 0x1F) << 6) | (((Uint8) str[1]) & 0x3F);
}
} else if (((octet & 0xF0) == 0xE0) && (slen >= 3)) { // 1110xxxx 10xxxxxx 10xxxxxx: three byte codepoint.
*_str += 3;
const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 6;
const Uint32 octet3 = (Uint32) (((Uint8) str[2]) & 0x3F);
return ((octet & 0x0F) << 12) | octet2 | octet3;
} else if (((octet & 0xF8) == 0xF0) && (slen >= 4)) { // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx: four byte codepoint.
*_str += 4;
const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 12;
const Uint32 octet3 = ((Uint32) (((Uint8) str[2]) & 0x3F)) << 6;
const Uint32 octet4 = (Uint32) (((Uint8) str[3]) & 0x3F);
return ((octet & 0x07) << 18) | octet2 | octet3 | octet4;
}
// bogus byte, skip ahead, return a REPLACEMENT CHARACTER.
(*_str)++;
return INVALID_UNICODE_CODEPOINT;
}
#if (SDL_SIZEOF_WCHAR_T == 2)
// Decode one codepoint from a UTF-16 string and advance `*_str` past it.
//
//  _str   in/out: points at the next 16-bit unit to decode; moved forward by
//         the number of units consumed (one, or two for a surrogate pair).
//  slen   how many 16-bit units may be read starting at `*_str`. A value of
//         0 is treated as end of string, matching SDL_StepUTF32.
//
// Returns the decoded codepoint, or 0 at the end of the string (in which
// case `*_str` is not advanced). An orphaned surrogate consumes one unit and
// yields INVALID_UNICODE_CODEPOINT.
static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen)
{
    if (!slen) {
        return 0;  // nothing left that we're allowed to read.
    }

    const Uint16 *str = *_str;
    Uint32 cp = (Uint32) *(str++);
    if (cp == 0) {
        return 0;  // don't advance string pointer.
    } else if ((cp >= 0xDC00) && (cp <= 0xDFFF)) {
        cp = INVALID_UNICODE_CODEPOINT;  // Orphaned second half of surrogate pair
    } else if ((cp >= 0xD800) && (cp <= 0xDBFF)) {  // start of surrogate pair!
        // Only look at the next unit if the caller said it's there; treat a
        // missing unit like a null terminator (which is never a valid trail).
        const Uint32 pair = (slen >= 2) ? ((Uint32) *str) : 0;
        if ((pair < 0xDC00) || (pair > 0xDFFF)) {
            cp = INVALID_UNICODE_CODEPOINT;  // first half without a second half.
        } else {
            str++;  // eat the other surrogate.
            cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00));
        }
    }

    *_str = str;
    return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp;
}
#elif (SDL_SIZEOF_WCHAR_T == 4)
// Read one codepoint from a UTF-32 string and advance `*_str` past it.
//
//  _str   in/out: points at the next 32-bit value; moved forward by one
//         element when a non-zero value is consumed.
//  slen   how many elements may be read; 0 is treated as end of string.
//
// Returns the codepoint, 0 at the end of the string (pointer untouched), or
// INVALID_UNICODE_CODEPOINT for a value above 0x10FFFF.
static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen)
{
    Uint32 codepoint = 0;

    if (slen) {
        codepoint = **_str;
        if (codepoint != 0) {  // a null terminator is reported but not stepped over.
            *_str += 1;
            if (codepoint > 0x10FFFF) {
                codepoint = INVALID_UNICODE_CODEPOINT;
            }
        }
    }

    return codepoint;
}
#endif
#if !defined(HAVE_VSSCANF) || !defined(HAVE_STRTOL) || !defined(HAVE_WCSTOL) || !defined(HAVE_STRTOUL) || !defined(HAVE_STRTOD) || !defined(HAVE_STRTOLL) || !defined(HAVE_STRTOULL)
#define SDL_isupperhex(X) (((X) >= 'A') && ((X) <= 'F'))
#define SDL_islowerhex(X) (((X) >= 'a') && ((X) <= 'f'))
@@ -507,83 +706,41 @@ int SDL_wcsncmp(const wchar_t *str1, const wchar_t *str2, size_t maxlen)
#endif /* HAVE_WCSNCMP */
}
int SDL_wcscasecmp(const wchar_t *str1, const wchar_t *str2)
int SDL_wcscasecmp(const wchar_t *wstr1, const wchar_t *wstr2)
{
#ifdef HAVE_WCSCASECMP
return wcscasecmp(str1, str2);
#elif defined(HAVE__WCSICMP)
return _wcsicmp(str1, str2);
#if (SDL_SIZEOF_WCHAR_T == 2)
const Uint16 *str1 = (const Uint16 *) wstr1;
const Uint16 *str2 = (const Uint16 *) wstr2;
UNICODE_STRCASECMP(16, 2, 2, (void) str1start, (void) str2start); // always NULL-terminated, no need to adjust lengths.
#elif (SDL_SIZEOF_WCHAR_T == 4)
const Uint32 *str1 = (const Uint32 *) wstr1;
const Uint32 *str2 = (const Uint32 *) wstr2;
UNICODE_STRCASECMP(32, 1, 1, (void) str1start, (void) str2start); // always NULL-terminated, no need to adjust lengths.
#else
wchar_t a = 0;
wchar_t b = 0;
while (*str1 && *str2) {
/* FIXME: This doesn't actually support wide characters */
if (*str1 >= 0x80 || *str2 >= 0x80) {
a = *str1;
b = *str2;
} else {
a = (wchar_t)SDL_toupper((unsigned char)*str1);
b = (wchar_t)SDL_toupper((unsigned char)*str2);
}
if (a != b) {
break;
}
++str1;
++str2;
}
#error Unexpected wchar_t size
#endif
/* FIXME: This doesn't actually support wide characters */
if (*str1 >= 0x80 || *str2 >= 0x80) {
a = *str1;
b = *str2;
} else {
a = (wchar_t)SDL_toupper((unsigned char)*str1);
b = (wchar_t)SDL_toupper((unsigned char)*str2);
}
return (int)((unsigned int)a - (unsigned int)b);
#endif /* HAVE__WCSICMP */
return -1;
}
int SDL_wcsncasecmp(const wchar_t *str1, const wchar_t *str2, size_t maxlen)
int SDL_wcsncasecmp(const wchar_t *wstr1, const wchar_t *wstr2, size_t maxlen)
{
#ifdef HAVE_WCSNCASECMP
return wcsncasecmp(str1, str2, maxlen);
#elif defined(HAVE__WCSNICMP)
return _wcsnicmp(str1, str2, maxlen);
#else
wchar_t a = 0;
wchar_t b = 0;
while (*str1 && *str2 && maxlen) {
/* FIXME: This doesn't actually support wide characters */
if (*str1 >= 0x80 || *str2 >= 0x80) {
a = *str1;
b = *str2;
} else {
a = (wchar_t)SDL_toupper((unsigned char)*str1);
b = (wchar_t)SDL_toupper((unsigned char)*str2);
}
if (a != b) {
break;
}
++str1;
++str2;
--maxlen;
}
size_t slen1 = maxlen;
size_t slen2 = maxlen;
if (maxlen == 0) {
return 0;
} else {
/* FIXME: This doesn't actually support wide characters */
if (*str1 >= 0x80 || *str2 >= 0x80) {
a = *str1;
b = *str2;
} else {
a = (wchar_t)SDL_toupper((unsigned char)*str1);
b = (wchar_t)SDL_toupper((unsigned char)*str2);
}
return (int)((unsigned int)a - (unsigned int)b);
}
#endif /* HAVE__WCSNICMP */
#if (SDL_SIZEOF_WCHAR_T == 2)
const Uint16 *str1 = (const Uint16 *) wstr1;
const Uint16 *str2 = (const Uint16 *) wstr2;
UNICODE_STRCASECMP(16, slen1, slen2, slen1 -= (size_t) (str1 - str1start), slen2 -= (size_t) (str2 - str2start));
#elif (SDL_SIZEOF_WCHAR_T == 4)
const Uint32 *str1 = (const Uint32 *) wstr1;
const Uint32 *str2 = (const Uint32 *) wstr2;
UNICODE_STRCASECMP(32, slen1, slen2, slen1 -= (size_t) (str1 - str1start), slen2 -= (size_t) (str2 - str2start));
#else
#error Unexpected wchar_t size
#endif
return -1;
}
long SDL_wcstol(const wchar_t *string, wchar_t **endp, int base)
@@ -733,7 +890,7 @@ char *SDL_strrev(char *string)
char *b = &string[len - 1];
len /= 2;
while (len--) {
char c = *a; /* NOLINT(clang-analyzer-core.uninitialized.Assign) */
const char c = *a; /* NOLINT(clang-analyzer-core.uninitialized.Assign) */
*a++ = *b;
*b-- = c;
}
@@ -743,30 +900,22 @@ char *SDL_strrev(char *string)
char *SDL_strupr(char *string)
{
#ifdef HAVE__STRUPR
return _strupr(string);
#else
char *bufp = string;
while (*bufp) {
*bufp = (char)SDL_toupper((unsigned char)*bufp);
++bufp;
}
return string;
#endif /* HAVE__STRUPR */
}
char *SDL_strlwr(char *string)
{
#ifdef HAVE__STRLWR
return _strlwr(string);
#else
char *bufp = string;
while (*bufp) {
*bufp = (char)SDL_tolower((unsigned char)*bufp);
++bufp;
}
return string;
#endif /* HAVE__STRLWR */
}
char *SDL_strchr(const char *string, int c)
@@ -838,18 +987,14 @@ char *SDL_strstr(const char *haystack, const char *needle)
char *SDL_strcasestr(const char *haystack, const char *needle)
{
#ifdef HAVE_STRCASESTR
return SDL_const_cast(char *, strcasestr(haystack, needle));
#else
size_t length = SDL_strlen(needle);
while (*haystack) {
const size_t length = SDL_strlen(needle);
do {
if (SDL_strncasecmp(haystack, needle, length) == 0) {
return (char *)haystack;
}
++haystack;
}
} while (SDL_StepUTF8(&haystack, 4)); // move ahead by a full codepoint at a time, regardless of bytes.
return NULL;
#endif /* HAVE_STRCASESTR */
}
#if !defined(HAVE__LTOA) || !defined(HAVE__I64TOA) || \
@@ -1079,8 +1224,7 @@ Uint64 SDL_strtoull(const char *string, char **endp, int base)
#endif /* HAVE_STRTOULL */
}
double
SDL_strtod(const char *string, char **endp)
double SDL_strtod(const char *string, char **endp)
{
#ifdef HAVE_STRTOD
return strtod(string, endp);
@@ -1137,49 +1281,14 @@ int SDL_strncmp(const char *str1, const char *str2, size_t maxlen)
int SDL_strcasecmp(const char *str1, const char *str2)
{
#ifdef HAVE_STRCASECMP
return strcasecmp(str1, str2);
#elif defined(HAVE__STRICMP)
return _stricmp(str1, str2);
#else
int a, b, result;
while (1) {
a = SDL_toupper((unsigned char)*str1);
b = SDL_toupper((unsigned char)*str2);
result = a - b;
if (result != 0 || a == 0 /*&& b == 0*/) {
break;
}
++str1;
++str2;
}
return result;
#endif /* HAVE_STRCASECMP */
UNICODE_STRCASECMP(8, 4, 4, (void) str1start, (void) str2start); // always NULL-terminated, no need to adjust lengths.
}
int SDL_strncasecmp(const char *str1, const char *str2, size_t maxlen)
{
#ifdef HAVE_STRNCASECMP
return strncasecmp(str1, str2, maxlen);
#elif defined(HAVE__STRNICMP)
return _strnicmp(str1, str2, maxlen);
#else
int a, b, result = 0;
while (maxlen) {
a = SDL_tolower((unsigned char)*str1);
b = SDL_tolower((unsigned char)*str2);
result = a - b;
if (result != 0 || a == 0 /*&& b == 0*/) {
break;
}
++str1;
++str2;
--maxlen;
}
return result;
#endif /* HAVE_STRNCASECMP */
size_t slen1 = maxlen;
size_t slen2 = maxlen;
UNICODE_STRCASECMP(8, slen1, slen2, slen1 -= (size_t) (str1 - ((const char *) str1start)), slen2 -= (size_t) (str2 - ((const char *) str2start)));
}
int SDL_sscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, ...)