stdlib: Improve Unicode support and consistency in string comparison functions.

SDL_strcasecmp (even when calling into a C runtime) does not work with Unicode chars, and depending on the user's locale, might not work with even basic ASCII strings. This implements the function from scratch, using "case-folding," which is a more robust method that deals with various languages. It involves a hashtable of a few hundred codepoints that are "uppercase" and how to map them to lowercase equivalents (possibly increasing the size of the string in the process). The vast majority of human languages (and Unicode) do not have letters with different cases, but still, this static table takes about 10 kilobytes on a 64-bit machine. Even this will fail in one known case: the Turkish 'i' folds differently if you're writing in Turkish vs other languages. Generally this is seen as unfortunate collateral damage in cases where you can't specify the language in use. In addition to case-folding the codepoints, the new functions also know how to decode the various formats to turn them into codepoints in the first place, instead of blindly stepping by one byte (or one wchar_t) per character. Also included is casefolding.txt from the Unicode Consortium and a perl script to generate the hashtable from that text file, so we can trivially update this if new languages are added in the future. A simple test using the new function:

```c
#include <SDL3/SDL.h>

int main(void)
{
    const char *a = "α ε η";
    const char *b = "Α Ε Η";
    SDL_Log(" strcasecmp(\"%s\", \"%s\") == %d\n", a, b, strcasecmp(a, b));
    SDL_Log("SDL_strcasecmp(\"%s\", \"%s\") == %d\n", a, b, SDL_strcasecmp(a, b));
    return 0;
}
```

Produces:

```
INFO: strcasecmp("α ε η", "Α Ε Η") == 32
INFO: SDL_strcasecmp("α ε η", "Α Ε Η") == 0
```

glibc strcasecmp() fails to compare a Greek lowercase string to its uppercase equivalent, even with a UTF-8 locale, but SDL_strcasecmp() works.
Other SDL_stdinc.h functions are changed to be more consistent, which is to say they now ignore any C runtime and often dictate that only English-based low-ASCII works with them. Fixes Issue #9313.
2025-09-25 20:48:29 +00:00 · 2024-03-26 13:22:38 -04:00
parent 4659a84bd1
commit a5c892d2c3
17 changed files with 4971 additions and 210 deletions
--- a/src/stdlib/SDL_string.c
+++ b/src/stdlib/SDL_string.c
@@ -28,6 +28,205 @@
 #include <psp2/kernel/clib.h>
 #endif

+#include "SDL_casefolding.h"
+
+// this is the Unicode REPLACEMENT CHARACTER, used for invalid codepoint values.
+#define INVALID_UNICODE_CODEPOINT 0xFFFD
+
+// Work out the size of wchar_t at preprocessing time, so the wide-string
+// functions in this file can pick a UTF-16 or UTF-32 decoder with #if.
+// The compile-time assert below catches a wrong guess.
+#if defined(__SIZEOF_WCHAR_T__)
+#define SDL_SIZEOF_WCHAR_T __SIZEOF_WCHAR_T__
+#elif defined(SDL_PLATFORM_WINDOWS)
+#define SDL_SIZEOF_WCHAR_T 2
+#else  // assume everything else is UTF-32 (add more tests if compiler-assert fails below!)
+#define SDL_SIZEOF_WCHAR_T 4
+#endif
+SDL_COMPILE_TIME_ASSERT(sizeof_wchar_t, sizeof(wchar_t) == SDL_SIZEOF_WCHAR_T);
+
+
+// this expects `from` and `to` to be UTF-32 encoding!
+//
+// Case-folds a single codepoint. The folded result is written to `to` and
+// the number of codepoints written (1, 2 or 3) is returned, so `to` must have
+// room for three values. A codepoint with no folding entry is copied through
+// unchanged and 1 is returned.
+static int SDL_UnicodeCaseFold(const Uint32 from, Uint32 *to)
+{
+    // !!! FIXME: since the hashtable is static, maybe we should binary
+    // !!! FIXME: search it instead of walking the whole bucket.
+
+    if (from < 128) {   // low-ASCII, easy!
+        if ((from >= 'A') && (from <= 'Z')) {
+            *to = 'a' + (from - 'A');
+            return 1;
+        }
+        // any other low-ASCII value falls out to the "not found" case at the bottom.
+    } else if (from <= 0xFFFF) {  // the Basic Multilingual Plane.
+        // 8-bit hash of the codepoint, used to pick a bucket in each table.
+        // NOTE(review): presumably this must match the hash used when SDL_casefolding.h was generated -- confirm.
+        const Uint8 hash = ((from ^ (from >> 8)) & 0xFF);
+        const Uint16 from16 = (Uint16) from;
+
+        // see if it maps to a single char (most common)...
+        {
+            const CaseFoldHashBucket1_16 *bucket = &case_fold_hash1_16[hash];
+            const int count = (int) bucket->count;
+            for (int i = 0; i < count; i++) {
+                const CaseFoldMapping1_16 *mapping = &bucket->list[i];
+                if (mapping->from == from16) {
+                    *to = mapping->to0;
+                    return 1;
+                }
+            }
+        }
+
+        // see if it folds down to two chars...
+        {
+            // this table is indexed by only the low 4 bits of the hash.
+            const CaseFoldHashBucket2_16 *bucket = &case_fold_hash2_16[hash & 15];
+            const int count = (int) bucket->count;
+            for (int i = 0; i < count; i++) {
+                const CaseFoldMapping2_16 *mapping = &bucket->list[i];
+                if (mapping->from == from16) {
+                    to[0] = mapping->to0;
+                    to[1] = mapping->to1;
+                    return 2;
+                }
+            }
+        }
+
+        // okay, maybe it's _three_ characters!
+        {
+            // this table is indexed by only the low 2 bits of the hash.
+            const CaseFoldHashBucket3_16 *bucket = &case_fold_hash3_16[hash & 3];
+            const int count = (int) bucket->count;
+            for (int i = 0; i < count; i++) {
+                const CaseFoldMapping3_16 *mapping = &bucket->list[i];
+                if (mapping->from == from16) {
+                    to[0] = mapping->to0;
+                    to[1] = mapping->to1;
+                    to[2] = mapping->to2;
+                    return 3;
+                }
+            }
+        }
+
+    } else {  // codepoint that doesn't fit in 16 bits.
+        // only single-codepoint foldings are looked up for values above 0xFFFF.
+        const Uint8 hash = ((from ^ (from >> 8)) & 0xFF);
+        const CaseFoldHashBucket1_32 *bucket = &case_fold_hash1_32[hash & 15];
+        const int count = (int) bucket->count;
+        for (int i = 0; i < count; i++) {
+            const CaseFoldMapping1_32 *mapping = &bucket->list[i];
+            if (mapping->from == from) {
+                *to = mapping->to0;
+                return 1;
+            }
+        }
+    }
+
+    // Not found...there's no folding needed for this codepoint.
+    *to = from;
+    return 1;
+}
+
+// Shared body of the case-insensitive comparison functions. This expands to
+// declarations, a loop and `return` statements, so it must be the last thing
+// in the calling function, which needs `str1` and `str2` in scope as pointers
+// that SDL_StepUTF##bits can advance.
+//
+//  bits: 8, 16 or 32; selects SDL_StepUTF##bits and the Uint##bits type of
+//        the `str1start`/`str2start` temporaries.
+//  slen1, slen2: expressions giving the remaining length of each string;
+//        passed to the decoder on every step.
+//  update_slen1, update_slen2: expressions evaluated right after each decode.
+//        `str1start`/`str2start` hold the string position from before the
+//        decode, so a caller can subtract what was consumed (or just cast
+//        them to void when lengths don't need tracking).
+//
+// A single codepoint can fold to as many as three, so each side keeps a tiny
+// queue (`folded`, with `head` = count available and `tail` = next to read)
+// and the comparison runs one folded codepoint at a time. The function
+// returns -1 or 1 at the first difference, or 0 once both sides produce a 0
+// codepoint together.
+#define UNICODE_STRCASECMP(bits, slen1, slen2, update_slen1, update_slen2) \
+    Uint32 folded1[3], folded2[3]; \
+    int head1 = 0, tail1 = 0, head2 = 0, tail2 = 0; \
+    while (SDL_TRUE) { \
+        Uint32 cp1, cp2; \
+        if (head1 != tail1) { \
+            cp1 = folded1[tail1++]; \
+        } else { \
+            const Uint##bits *str1start = (const Uint##bits *) str1; \
+            head1 = SDL_UnicodeCaseFold(SDL_StepUTF##bits(&str1, slen1), folded1); \
+            update_slen1; \
+            cp1 = folded1[0]; \
+            tail1 = 1; \
+        } \
+        if (head2 != tail2) { \
+            cp2 = folded2[tail2++]; \
+        } else { \
+            const Uint##bits *str2start = (const Uint##bits *) str2; \
+            head2 = SDL_UnicodeCaseFold(SDL_StepUTF##bits(&str2, slen2), folded2); \
+            update_slen2; \
+            cp2 = folded2[0]; \
+            tail2 = 1; \
+        } \
+        if (cp1 < cp2) { \
+            return -1; \
+        } else if (cp1 > cp2) { \
+            return 1; \
+        } else if (cp1 == 0) { \
+            break;  /* complete match. */ \
+        } \
+    } \
+    return 0
+
+
+// Decode one UTF-8 sequence from `*_str`, looking at no more than `slen` bytes.
+//
+// Returns the decoded codepoint and advances `*_str` past the sequence.
+// Returns 0, without advancing, at a null terminator or when `slen` is 0.
+// A malformed or truncated sequence consumes a single byte and yields
+// INVALID_UNICODE_CODEPOINT, as does a four-byte sequence that decodes to a
+// value above 0x10FFFF (which consumes all four bytes).
+//
+// Callers with null-terminated strings pass a nominal `slen` (such as 4)
+// instead of the real length, so each continuation byte is checked for the
+// 10xxxxxx pattern before the next one is read. That keeps a truncated
+// sequence at the end of a string from stepping over the terminator.
+static Uint32 SDL_StepUTF8(const char **_str, const size_t slen)
+{
+    const char *str = *_str;
+    const Uint32 octet = (Uint32) (slen ? ((Uint8) *str) : 0);
+
+    // !!! FIXME: this could have more error checking! Illegal surrogate codepoints, overlong encodings, etc.
+
+    if (octet == 0) {  // null terminator, end of string.
+        return 0;  // don't advance `*_str`.
+    } else if ((octet & 0x80) == 0) {  // 0xxxxxxx: one byte codepoint.
+        (*_str)++;
+        return octet;
+    } else if (((octet & 0xE0) == 0xC0) && (slen >= 2)) {  // 110xxxxx 10xxxxxx: two byte codepoint.
+        const Uint8 byte2 = (Uint8) str[1];
+        if ((byte2 & 0xC0) == 0x80) {
+            *_str += 2;
+            return ((octet & 0x1F) << 6) | ((Uint32) (byte2 & 0x3F));
+        }
+    } else if (((octet & 0xF0) == 0xE0) && (slen >= 3)) {  // 1110xxxx 10xxxxxx 10xxxxxx: three byte codepoint.
+        const Uint8 byte2 = (Uint8) str[1];
+        if ((byte2 & 0xC0) == 0x80) {
+            const Uint8 byte3 = (Uint8) str[2];
+            if ((byte3 & 0xC0) == 0x80) {
+                // every continuation byte carries six payload bits, hence the 0x3F masks.
+                const Uint32 octet2 = ((Uint32) (byte2 & 0x3F)) << 6;
+                const Uint32 octet3 = (Uint32) (byte3 & 0x3F);
+                *_str += 3;
+                return ((octet & 0x0F) << 12) | octet2 | octet3;
+            }
+        }
+    } else if (((octet & 0xF8) == 0xF0) && (slen >= 4)) {  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: four byte codepoint.
+        const Uint8 byte2 = (Uint8) str[1];
+        if ((byte2 & 0xC0) == 0x80) {
+            const Uint8 byte3 = (Uint8) str[2];
+            if ((byte3 & 0xC0) == 0x80) {
+                const Uint8 byte4 = (Uint8) str[3];
+                if ((byte4 & 0xC0) == 0x80) {
+                    const Uint32 octet2 = ((Uint32) (byte2 & 0x3F)) << 12;
+                    const Uint32 octet3 = ((Uint32) (byte3 & 0x3F)) << 6;
+                    const Uint32 octet4 = (Uint32) (byte4 & 0x3F);
+                    const Uint32 cp = ((octet & 0x07) << 18) | octet2 | octet3 | octet4;
+                    *_str += 4;
+                    return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp;
+                }
+            }
+        }
+    }
+
+    // bogus byte, skip ahead, return a REPLACEMENT CHARACTER.
+    (*_str)++;
+    return INVALID_UNICODE_CODEPOINT;
+}
+
+#if (SDL_SIZEOF_WCHAR_T == 2)
+// Decode one UTF-16 codepoint from `*_str`, looking at no more than `slen`
+// 16-bit units.
+//
+// Returns the decoded codepoint and advances `*_str` past it (two units for
+// a valid surrogate pair, one otherwise). Returns 0, without advancing, at a
+// null terminator or when `slen` is 0. An orphaned or unpaired surrogate
+// consumes one unit and yields INVALID_UNICODE_CODEPOINT.
+static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen)
+{
+    if (!slen) {
+        return 0;  // out of input; treat it like the end of the string, don't advance.
+    }
+
+    const Uint16 *str = *_str;
+    Uint32 cp = (Uint32) *(str++);
+    if (cp == 0) {
+        return 0;  // don't advance string pointer.
+    } else if ((cp >= 0xDC00) && (cp <= 0xDFFF)) {
+        cp = INVALID_UNICODE_CODEPOINT;  // Orphaned second half of surrogate pair
+    } else if ((cp >= 0xD800) && (cp <= 0xDBFF)) {  // start of surrogate pair!
+        // only look at the second unit if the caller's limit allows it; 0 is never a valid low surrogate.
+        const Uint32 pair = (slen >= 2) ? ((Uint32) *str) : 0;
+        if ((pair < 0xDC00) || (pair > 0xDFFF)) {
+            cp = INVALID_UNICODE_CODEPOINT;
+        } else {
+            str++;  // eat the other surrogate.
+            cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00));
+        }
+    }
+
+    *_str = str;
+    return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp;
+}
+#elif (SDL_SIZEOF_WCHAR_T == 4)
+// Read one UTF-32 codepoint from `*_str`, honoring a limit of `slen` units.
+//
+// Returns 0 and leaves `*_str` alone when `slen` is 0 or the current unit is
+// a null terminator. Otherwise steps `*_str` forward by one unit and returns
+// the value read, or INVALID_UNICODE_CODEPOINT if it is above 0x10FFFF.
+static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen)
+{
+    // an exhausted length is handled exactly like a terminator (and nothing is read).
+    const Uint32 codepoint = (slen > 0) ? **_str : 0;
+    if (codepoint == 0) {
+        return 0;  // string pointer stays where it is.
+    }
+
+    *_str += 1;
+    if (codepoint > 0x10FFFF) {
+        return INVALID_UNICODE_CODEPOINT;
+    }
+    return codepoint;
+}
+#endif
+
 #if !defined(HAVE_VSSCANF) || !defined(HAVE_STRTOL) || !defined(HAVE_WCSTOL) || !defined(HAVE_STRTOUL) || !defined(HAVE_STRTOD) || !defined(HAVE_STRTOLL) || !defined(HAVE_STRTOULL)
 #define SDL_isupperhex(X) (((X) >= 'A') && ((X) <= 'F'))
 #define SDL_islowerhex(X) (((X) >= 'a') && ((X) <= 'f'))
@@ -507,83 +706,41 @@ int SDL_wcsncmp(const wchar_t *str1, const wchar_t *str2, size_t maxlen)
 #endif /* HAVE_WCSNCMP */
 }

-int SDL_wcscasecmp(const wchar_t *str1, const wchar_t *str2)
+// Case-insensitive comparison of two null-terminated wide strings. The
+// strings are decoded as UTF-16 or UTF-32, depending on the size of wchar_t,
+// and compared one case-folded codepoint at a time. Returns -1, 0 or 1.
+int SDL_wcscasecmp(const wchar_t *wstr1, const wchar_t *wstr2)
 {
-#ifdef HAVE_WCSCASECMP
-    return wcscasecmp(str1, str2);
-#elif defined(HAVE__WCSICMP)
-    return _wcsicmp(str1, str2);
+#if (SDL_SIZEOF_WCHAR_T == 2)
+    const Uint16 *str1 = (const Uint16 *) wstr1;
+    const Uint16 *str2 = (const Uint16 *) wstr2;
+    // the fixed length of 2 is enough for one codepoint: a surrogate pair is two units.
+    UNICODE_STRCASECMP(16, 2, 2, (void) str1start, (void) str2start);  // always NULL-terminated, no need to adjust lengths.
+#elif (SDL_SIZEOF_WCHAR_T == 4)
+    const Uint32 *str1 = (const Uint32 *) wstr1;
+    const Uint32 *str2 = (const Uint32 *) wstr2;
+    // one UTF-32 unit is always one codepoint, so a fixed length of 1 is enough.
+    UNICODE_STRCASECMP(32, 1, 1, (void) str1start, (void) str2start);  // always NULL-terminated, no need to adjust lengths.
 #else
-    wchar_t a = 0;
-    wchar_t b = 0;
-    while (*str1 && *str2) {
-        /* FIXME: This doesn't actually support wide characters */
-        if (*str1 >= 0x80 || *str2 >= 0x80) {
-            a = *str1;
-            b = *str2;
-        } else {
-            a = (wchar_t)SDL_toupper((unsigned char)*str1);
-            b = (wchar_t)SDL_toupper((unsigned char)*str2);
-        }
-        if (a != b) {
-            break;
-        }
-        ++str1;
-        ++str2;
-    }
+    #error Unexpected wchar_t size
+#endif
+
-    /* FIXME: This doesn't actually support wide characters */
-    if (*str1 >= 0x80 || *str2 >= 0x80) {
-        a = *str1;
-        b = *str2;
-    } else {
-        a = (wchar_t)SDL_toupper((unsigned char)*str1);
-        b = (wchar_t)SDL_toupper((unsigned char)*str2);
-    }
-    return (int)((unsigned int)a - (unsigned int)b);
-#endif /* HAVE__WCSICMP */
+    return -1;  // not reached: UNICODE_STRCASECMP always returns from the function.
 }

-int SDL_wcsncasecmp(const wchar_t *str1, const wchar_t *str2, size_t maxlen)
+// Case-insensitive comparison of two wide strings, looking at no more than
+// `maxlen` wchar_t units of each. The strings are decoded as UTF-16 or
+// UTF-32, depending on the size of wchar_t, and compared one case-folded
+// codepoint at a time. Returns -1, 0 or 1.
+int SDL_wcsncasecmp(const wchar_t *wstr1, const wchar_t *wstr2, size_t maxlen)
 {
-#ifdef HAVE_WCSNCASECMP
-    return wcsncasecmp(str1, str2, maxlen);
-#elif defined(HAVE__WCSNICMP)
-    return _wcsnicmp(str1, str2, maxlen);
-#else
-    wchar_t a = 0;
-    wchar_t b = 0;
-    while (*str1 && *str2 && maxlen) {
-        /* FIXME: This doesn't actually support wide characters */
-        if (*str1 >= 0x80 || *str2 >= 0x80) {
-            a = *str1;
-            b = *str2;
-        } else {
-            a = (wchar_t)SDL_toupper((unsigned char)*str1);
-            b = (wchar_t)SDL_toupper((unsigned char)*str2);
-        }
-        if (a != b) {
-            break;
-        }
-        ++str1;
-        ++str2;
-        --maxlen;
-    }
+    // each string gets its own remaining-length counter, since one decode can consume a different number of units on each side.
+    size_t slen1 = maxlen;
+    size_t slen2 = maxlen;
 
-    if (maxlen == 0) {
-        return 0;
-    } else {
-        /* FIXME: This doesn't actually support wide characters */
-        if (*str1 >= 0x80 || *str2 >= 0x80) {
-            a = *str1;
-            b = *str2;
-        } else {
-            a = (wchar_t)SDL_toupper((unsigned char)*str1);
-            b = (wchar_t)SDL_toupper((unsigned char)*str2);
-        }
-        return (int)((unsigned int)a - (unsigned int)b);
-    }
-#endif /* HAVE__WCSNICMP */
+#if (SDL_SIZEOF_WCHAR_T == 2)
+    const Uint16 *str1 = (const Uint16 *) wstr1;
+    const Uint16 *str2 = (const Uint16 *) wstr2;
+    // after every decode, subtract the number of units the decoder consumed.
+    UNICODE_STRCASECMP(16, slen1, slen2, slen1 -= (size_t) (str1 - str1start), slen2 -= (size_t) (str2 - str2start));
+#elif (SDL_SIZEOF_WCHAR_T == 4)
+    const Uint32 *str1 = (const Uint32 *) wstr1;
+    const Uint32 *str2 = (const Uint32 *) wstr2;
+    // after every decode, subtract the number of units the decoder consumed.
+    UNICODE_STRCASECMP(32, slen1, slen2, slen1 -= (size_t) (str1 - str1start), slen2 -= (size_t) (str2 - str2start));
+#else
+    #error Unexpected wchar_t size
+#endif
+
+    return -1;  // not reached: UNICODE_STRCASECMP always returns from the function.
 }

 long SDL_wcstol(const wchar_t *string, wchar_t **endp, int base)
@@ -733,7 +890,7 @@ char *SDL_strrev(char *string)
    char *b = &string[len - 1];
    len /= 2;
    while (len--) {
-        char c = *a; /* NOLINT(clang-analyzer-core.uninitialized.Assign) */
+        const char c = *a; /* NOLINT(clang-analyzer-core.uninitialized.Assign) */
        *a++ = *b;
        *b-- = c;
    }
@@ -743,30 +900,22 @@ char *SDL_strrev(char *string)

+// Converts `string` to uppercase in place, passing each byte through
+// SDL_toupper() until the null terminator, and returns `string`. The C
+// runtime's _strupr() is no longer used.
 char *SDL_strupr(char *string)
 {
-#ifdef HAVE__STRUPR
-    return _strupr(string);
-#else
    char *bufp = string;
    while (*bufp) {
        *bufp = (char)SDL_toupper((unsigned char)*bufp);
        ++bufp;
    }
    return string;
-#endif /* HAVE__STRUPR */
 }

+// Converts `string` to lowercase in place, passing each byte through
+// SDL_tolower() until the null terminator, and returns `string`. The C
+// runtime's _strlwr() is no longer used.
 char *SDL_strlwr(char *string)
 {
-#ifdef HAVE__STRLWR
-    return _strlwr(string);
-#else
    char *bufp = string;
    while (*bufp) {
        *bufp = (char)SDL_tolower((unsigned char)*bufp);
        ++bufp;
    }
    return string;
-#endif /* HAVE__STRLWR */
 }

 char *SDL_strchr(const char *string, int c)
@@ -838,18 +987,14 @@ char *SDL_strstr(const char *haystack, const char *needle)

+// Case-insensitive search for `needle` inside `haystack`. Returns a pointer
+// to the start of the first match, or NULL if there isn't one. Each
+// candidate position is compared with SDL_strncasecmp(), limited to the
+// byte length of `needle`.
 char *SDL_strcasestr(const char *haystack, const char *needle)
 {
-#ifdef HAVE_STRCASESTR
-    return SDL_const_cast(char *, strcasestr(haystack, needle));
-#else
-    size_t length = SDL_strlen(needle);
-    while (*haystack) {
+    const size_t length = SDL_strlen(needle);
+    // do/while, so the comparison runs at least once, even if `haystack` is empty.
+    do {
        if (SDL_strncasecmp(haystack, needle, length) == 0) {
            return (char *)haystack;
        }
-        ++haystack;
-    }
+    // the 4 is a nominal length (the longest UTF-8 sequence); the stepper returns 0 at the null terminator, ending the loop.
+    } while (SDL_StepUTF8(&haystack, 4));  // move ahead by a full codepoint at a time, regardless of bytes.
+
    return NULL;
-#endif /* HAVE_STRCASESTR */
 }

 #if !defined(HAVE__LTOA) || !defined(HAVE__I64TOA) || \
@@ -1079,8 +1224,7 @@ Uint64 SDL_strtoull(const char *string, char **endp, int base)
 #endif /* HAVE_STRTOULL */
 }

-double
-SDL_strtod(const char *string, char **endp)
+double SDL_strtod(const char *string, char **endp)
 {
 #ifdef HAVE_STRTOD
    return strtod(string, endp);
@@ -1137,49 +1281,14 @@ int SDL_strncmp(const char *str1, const char *str2, size_t maxlen)

+// Case-insensitive comparison of two null-terminated UTF-8 strings, done one
+// case-folded codepoint at a time. Returns -1, 0 or 1. The C runtime's
+// strcasecmp()/_stricmp() are no longer used.
 int SDL_strcasecmp(const char *str1, const char *str2)
 {
-#ifdef HAVE_STRCASECMP
-    return strcasecmp(str1, str2);
-#elif defined(HAVE__STRICMP)
-    return _stricmp(str1, str2);
-#else
-    int a, b, result;
-
-    while (1) {
-        a = SDL_toupper((unsigned char)*str1);
-        b = SDL_toupper((unsigned char)*str2);
-        result = a - b;
-        if (result != 0 || a == 0 /*&& b == 0*/) {
-            break;
-        }
-        ++str1;
-        ++str2;
-    }
-    return result;
-#endif /* HAVE_STRCASECMP */
+    // the 4 is a nominal length: the longest UTF-8 sequence the decoder handles is four bytes.
+    UNICODE_STRCASECMP(8, 4, 4, (void) str1start, (void) str2start);  // always NULL-terminated, no need to adjust lengths.
 }

+// Case-insensitive comparison of two UTF-8 strings, looking at no more than
+// `maxlen` bytes of each, done one case-folded codepoint at a time. Returns
+// -1, 0 or 1. The C runtime's strncasecmp()/_strnicmp() are no longer used.
 int SDL_strncasecmp(const char *str1, const char *str2, size_t maxlen)
 {
-#ifdef HAVE_STRNCASECMP
-    return strncasecmp(str1, str2, maxlen);
-#elif defined(HAVE__STRNICMP)
-    return _strnicmp(str1, str2, maxlen);
-#else
-    int a, b, result = 0;
-
-    while (maxlen) {
-        a = SDL_tolower((unsigned char)*str1);
-        b = SDL_tolower((unsigned char)*str2);
-        result = a - b;
-        if (result != 0 || a == 0 /*&& b == 0*/) {
-            break;
-        }
-        ++str1;
-        ++str2;
-        --maxlen;
-    }
-    return result;
-#endif /* HAVE_STRNCASECMP */
+    // each string gets its own remaining-byte counter, since one codepoint can take a different number of bytes on each side.
+    size_t slen1 = maxlen;
+    size_t slen2 = maxlen;
+    // after every decode, subtract the number of bytes the decoder consumed.
+    UNICODE_STRCASECMP(8, slen1, slen2, slen1 -= (size_t) (str1 - ((const char *) str1start)), slen2 -= (size_t) (str2 - ((const char *) str2start)));
 }

 int SDL_sscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, ...)