stdinc: Drastically improve SDL_StepUTF8() and make it a public API.

Fixes #10105.
2025-09-30 23:18:29 +00:00 · 2024-06-26 21:32:45 -04:00
parent 9b8c5f642f
commit a9cfcf6bde
8 changed files with 211 additions and 58 deletions
--- a/include/SDL3/SDL_stdinc.h
+++ b/include/SDL3/SDL_stdinc.h
@@ -1250,6 +1250,67 @@ extern SDL_DECLSPEC int SDLCALL SDL_strcasecmp(const char *str1, const char *str
 */
 extern SDL_DECLSPEC int SDLCALL SDL_strncasecmp(const char *str1, const char *str2, size_t maxlen);

+/**
+ * The Unicode REPLACEMENT CHARACTER codepoint (U+FFFD).
+ *
+ * SDL_StepUTF8() reports this codepoint when it encounters a UTF-8 string
+ * with encoding errors.
+ *
+ * This tends to render as something like a question mark in most places.
+ *
+ * \since This macro is available since SDL 3.0.0.
+ *
+ * \sa SDL_StepUTF8
+ */
+#define SDL_INVALID_UNICODE_CODEPOINT 0xFFFD
+
+/**
+ * Decode a UTF-8 string, one Unicode codepoint at a time.
+ *
+ * This will return the first Unicode codepoint in the UTF-8 encoded
+ * string in `*pstr`, and then advance `*pstr` past any consumed bytes
+ * before returning.
+ *
+ * It will not access more than `*pslen` bytes from the string.
+ * `*pslen` will be adjusted, as well, subtracting the number of
+ * bytes consumed.
+ *
+ * `pslen` is allowed to be NULL, in which case the string _must_ be
+ * null-terminated, as the function will blindly read until it sees
+ * the null terminator.
+ *
+ * If `*pslen` is zero, it assumes the end of string is reached and
+ * returns a zero codepoint regardless of the contents of the string
+ * buffer.
+ *
+ * If the resulting codepoint is zero (a null terminator), or `*pslen`
+ * is zero, it will not advance `*pstr` or `*pslen` at all.
+ *
+ * Generally this function is called in a loop until it returns zero,
+ * adjusting its parameters each iteration.
+ *
+ * If an invalid UTF-8 sequence is encountered, this function returns
+ * SDL_INVALID_UNICODE_CODEPOINT and advances the string/length by one
+ * byte (which is to say, a multibyte sequence might produce several
+ * SDL_INVALID_UNICODE_CODEPOINT returns before it syncs to the next
+ * valid UTF-8 sequence).
+ *
+ * Several things can generate invalid UTF-8 sequences, including
+ * overlong encodings, the use of UTF-16 surrogate values, and
+ * truncated data. Please refer to
+ * [RFC3629](https://www.ietf.org/rfc/rfc3629.txt) for details.
+ *
+ * \param pstr a pointer to a UTF-8 string pointer to be read and adjusted.
+ * \param pslen a pointer to the number of bytes in the string, to be read
+ *              and adjusted. NULL is allowed.
+ * \returns the first Unicode codepoint in the string, zero at the end of
+ *          the string, or SDL_INVALID_UNICODE_CODEPOINT if an invalid
+ *          UTF-8 sequence is encountered.
+ *
+ * \threadsafety It is safe to call this function from any thread.
+ *
+ * \since This function is available since SDL 3.0.0.
+ */
+extern SDL_DECLSPEC Uint32 SDLCALL SDL_StepUTF8(const char **pstr, size_t *pslen);
+
 extern SDL_DECLSPEC int SDLCALL SDL_sscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, ...) SDL_SCANF_VARARG_FUNC(2);
 extern SDL_DECLSPEC int SDLCALL SDL_vsscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, va_list ap) SDL_SCANF_VARARG_FUNCV(2);
 extern SDL_DECLSPEC int SDLCALL SDL_snprintf(SDL_OUT_Z_CAP(maxlen) char *text, size_t maxlen, SDL_PRINTF_FORMAT_STRING const char *fmt, ... ) SDL_PRINTF_VARARG_FUNC(3);