stdlib: Improve Unicode support and consistency in string comparison functions.

SDL_strcasecmp (even when calling into a C runtime) does not work with
Unicode chars, and depending on the user's locale, might not work with
even basic ASCII strings.

This implements the function from scratch, using "case-folding,"
which is a more robust method that deals with various languages. It
involves a hashtable of a few hundred codepoints that are "uppercase" and
how to map them to lowercase equivalents (possibly increasing the size of
the string in the process). The vast majority of human languages (and
of Unicode codepoints) do not have letters with different cases, but still,
this static table takes about 10 kilobytes on a 64-bit machine.

Even this will fail in one known case: the Turkish 'i' folds differently
if you're writing in Turkish vs other languages. Generally this is seen as
unfortunate collateral damage in cases where you can't specify the language
in use.

In addition to case-folding the codepoints, the new functions also know how
to decode the various formats to turn them into codepoints in the first
place, instead of blindly stepping by one byte (or one wchar_t) per
character.

Also included is casefolding.txt from the Unicode Consortium and a perl
script to generate the hashtable from that text file, so we can trivially
update this if new languages are added in the future.

A simple test using the new function:

```c
 #include <SDL3/SDL.h>

 int main(void)
 {
     const char *a = "α ε η";
     const char *b = "Α Ε Η";
     SDL_Log("    strcasecmp(\"%s\", \"%s\") == %d\n", a, b, strcasecmp(a, b));
     SDL_Log("SDL_strcasecmp(\"%s\", \"%s\") == %d\n", a, b, SDL_strcasecmp(a, b));
     return 0;
 }
```

Produces:

```
INFO:     strcasecmp("α ε η", "Α Ε Η") == 32
INFO: SDL_strcasecmp("α ε η", "Α Ε Η") == 0
```

glibc strcasecmp() fails to compare a Greek lowercase string to its uppercase
equivalent, even with a UTF-8 locale, but SDL_strcasecmp() works.

Other SDL_stdinc.h functions are changed to be more consistent, which is to
say they now ignore any C runtime and often dictate that only English-based
low-ASCII works with them.

Fixes Issue #9313.
This commit is contained in:
Ryan C. Gordon
2024-03-26 13:22:38 -04:00
parent 4659a84bd1
commit a5c892d2c3
17 changed files with 4971 additions and 210 deletions

View File

@@ -474,28 +474,7 @@ int SDL_abs(int x)
#endif
}
#ifdef HAVE_CTYPE_H
/* Thin wrapper over the C runtime's isalpha(): the argument is forwarded
   unchanged and the runtime's result is returned as-is (nonzero means
   "alphabetic"). */
int SDL_isalpha(int x)
{
    const int is_alpha = isalpha(x);
    return is_alpha;
}
/* The remaining character-class and case-conversion wrappers. Each one
   forwards its argument unchanged to the <ctype.h> function of the same
   name (minus the SDL_ prefix) and returns that function's result. */
int SDL_isalnum(int x)
{
    return isalnum(x);
}

int SDL_isdigit(int x)
{
    return isdigit(x);
}

int SDL_isxdigit(int x)
{
    return isxdigit(x);
}

int SDL_ispunct(int x)
{
    return ispunct(x);
}

int SDL_isspace(int x)
{
    return isspace(x);
}

int SDL_isupper(int x)
{
    return isupper(x);
}

int SDL_islower(int x)
{
    return islower(x);
}

int SDL_isprint(int x)
{
    return isprint(x);
}

int SDL_isgraph(int x)
{
    return isgraph(x);
}

int SDL_iscntrl(int x)
{
    return iscntrl(x);
}

int SDL_toupper(int x)
{
    return toupper(x);
}

int SDL_tolower(int x)
{
    return tolower(x);
}
#else
/* Fallback used when the C runtime's <ctype.h> is not available: a value is
   alphabetic when SDL_isupper() or SDL_islower() accepts it.
   Returns 1 for alphabetic, 0 otherwise. */
int SDL_isalpha(int x)
{
    if (SDL_isupper(x)) {
        return 1;
    }
    return SDL_islower(x) ? 1 : 0;
}
/* Alphabetic means "uppercase or lowercase", as judged by SDL_isupper() and
   SDL_islower(). The result is always 0 or 1. */
int SDL_isalpha(int x)
{
    /* De Morgan form of (upper || lower); SDL_islower() is still only
       consulted when SDL_isupper() rejects the value. */
    return !(!SDL_isupper(x) && !SDL_islower(x));
}
/* Alphanumeric means SDL_isalpha() or SDL_isdigit() accepts the value.
   Returns 1 if so, 0 otherwise. */
int SDL_isalnum(int x)
{
    if (SDL_isalpha(x)) {
        return 1;
    }
    return SDL_isdigit(x) ? 1 : 0;
}
/* Returns 1 when x lies between the character constants '0' and '9'
   inclusive, 0 for every other value. */
int SDL_isdigit(int x)
{
    if (x < '0' || x > '9') {
        return 0;
    }
    return 1;
}
/* Returns 1 when x is a hexadecimal digit: 'A'..'F', 'a'..'f', or anything
   SDL_isdigit() accepts. Returns 0 otherwise. */
int SDL_isxdigit(int x)
{
    if (x >= 'A' && x <= 'F') {
        return 1;
    }
    if (x >= 'a' && x <= 'f') {
        return 1;
    }
    return SDL_isdigit(x) ? 1 : 0;
}
@@ -508,19 +487,7 @@ int SDL_isgraph(int x) { return (SDL_isprint(x)) && ((x) != ' '); }
/* Returns 1 for control characters: the range '\0'..'\x1f' plus '\x7f'.
   Every other value yields 0. */
int SDL_iscntrl(int x)
{
    if (x == '\x7f') {
        return 1;
    }
    return (x >= '\0' && x <= '\x1f') ? 1 : 0;
}
/* Maps 'a'..'z' onto 'A'..'Z' by offset; any value outside that range is
   returned unchanged. */
int SDL_toupper(int x)
{
    if (x < 'a' || x > 'z') {
        return x;
    }
    return 'A' + (x - 'a');
}
/* Maps 'A'..'Z' onto 'a'..'z' by offset; any value outside that range is
   returned unchanged. */
int SDL_tolower(int x)
{
    if (x < 'A' || x > 'Z') {
        return x;
    }
    return 'a' + (x - 'A');
}
#endif
#if defined(HAVE_CTYPE_H) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* Thin wrapper over the C99 runtime's isblank(): the argument is forwarded
   unchanged and the runtime's result is returned as-is (nonzero means
   "blank"). */
int SDL_isblank(int x)
{
    const int is_blank = isblank(x);
    return is_blank;
}
#else
/* Fallback blank test: returns 1 only for a space or a horizontal tab,
   0 for every other value. */
int SDL_isblank(int x)
{
    switch (x) {
    case ' ':
    case '\t':
        return 1;
    default:
        return 0;
    }
}
#endif
/* Returns 1 when x is a space (' ') or a horizontal tab ('\t'), and 0 for
   every other value. */
int SDL_isblank(int x)
{
    if (x != ' ' && x != '\t') {
        return 0;
    }
    return 1;
}
void *SDL_aligned_alloc(size_t alignment, size_t size)
{