From ca481dc52d86a6856abba7481bc849a864f0183d Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:51:03 -0400 Subject: [PATCH 1/6] Fix displaying error on wrong line with token at EOL Previously, this would get a token on text like "\n*\n" where `*` is the token's position, and it would advance off that line. --- src/parser.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/parser.cpp b/src/parser.cpp index 0cd96f5b5..468f4749f 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -51,6 +51,12 @@ gb_internal gbString get_file_line_as_string(TokenPos const &pos, i32 *offset_) u8 *line_start = pos_offset; u8 *line_end = pos_offset; + + if (offset > 0 && *line_start == '\n') { + // Prevent an error token that starts at the boundary of a line that + // leads to an empty line from advancing off its line. + line_start -= 1; + } while (line_start >= start) { if (*line_start == '\n') { line_start += 1; From 8626d38db10f134f97ea36d34ceda072a137e779 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:02:54 -0400 Subject: [PATCH 2/6] Fix displaying emptiness when error is on first line --- src/parser.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/parser.cpp b/src/parser.cpp index 468f4749f..2f764c753 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -64,6 +64,11 @@ gb_internal gbString get_file_line_as_string(TokenPos const &pos, i32 *offset_) } line_start -= 1; } + if (line_start == start - 1) { + // Prevent an error on the first line from stepping behind the boundary + // of the text. + line_start += 1; + } while (line_end < end) { if (*line_end == '\n') { From 582154f20d44ebf724301316e8cf27b1c768b7ee Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 14 Jun 2024 15:44:50 -0400 Subject: [PATCH 3/6] Remove unused code from tokenizer --- src/tokenizer.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index f7751d840..236bc84a8 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -777,7 +777,6 @@ gb_internal void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { case '`': // Raw String Literal case '"': // String Literal { - bool has_carriage_return = false; i32 success; Rune quote = curr_rune; token->kind = Token_String; @@ -807,9 +806,6 @@ gb_internal void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { if (r == quote) { break; } - if (r == '\r') { - has_carriage_return = true; - } } } token->string.len = t->curr - token->string.text; From d4e2fa037797c026a1b7803f1514a87792af8888 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 14 Jun 2024 16:50:28 -0400 Subject: [PATCH 4/6] Refactor `show_error_on_line` This should adequately solve any issues with Unicode alignment by sidestepping the issue entirely. With this change, we make use of the built-in ANSI facilities of the terminal to underline the text. If the terminal does not support underlining, there are still the fallback bold markers at the start and end of error locations. --- src/check_stmt.cpp | 18 +-- src/error.cpp | 328 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 259 insertions(+), 87 deletions(-) diff --git a/src/check_stmt.cpp b/src/check_stmt.cpp index f2e3b0242..c369ba098 100644 --- a/src/check_stmt.cpp +++ b/src/check_stmt.cpp @@ -582,28 +582,20 @@ gb_internal Type *check_assignment_variable(CheckerContext *ctx, Operand *lhs, O isize offset = show_error_on_line(e->token.pos, token_pos_end(e->token)); if (offset < 0) { if (is_type_map(e->type)) { - error_line("\tSuggestion: Did you mean? 'for key, &%.*s in ...'\n", LIT(e->token.string)); + error_line("\t\tSuggestion: Did you mean? 'for key, &%.*s in ...'\n", LIT(e->token.string)); } else { - error_line("\tSuggestion: Did you mean? 'for &%.*s in ...'\n", LIT(e->token.string)); + error_line("\t\tSuggestion: Did you mean? 'for &%.*s in ...'\n", LIT(e->token.string)); } } else { - error_line("\t"); - for (isize i = 0; i < offset-1; i++) { - error_line(" "); - } - error_line("'%.*s' is immutable, declare it as '&%.*s' to make it mutable\n", LIT(e->token.string), LIT(e->token.string)); + error_line("\t\t'%.*s' is immutable, declare it as '&%.*s' to make it mutable\n", LIT(e->token.string), LIT(e->token.string)); } } else if (e && e->flags & EntityFlag_SwitchValue) { isize offset = show_error_on_line(e->token.pos, token_pos_end(e->token)); if (offset < 0) { - error_line("\tSuggestion: Did you mean? 'switch &%.*s in ...'\n", LIT(e->token.string)); + error_line("\t\tSuggestion: Did you mean? 'switch &%.*s in ...'\n", LIT(e->token.string)); } else { - error_line("\t"); - for (isize i = 0; i < offset-1; i++) { - error_line(" "); - } - error_line("'%.*s' is immutable, declare it as '&%.*s' to make it mutable\n", LIT(e->token.string), LIT(e->token.string)); + error_line("\t\t'%.*s' is immutable, declare it as '&%.*s' to make it mutable\n", LIT(e->token.string), LIT(e->token.string)); } } } diff --git a/src/error.cpp b/src/error.cpp index 03d96219b..26b4106a0 100644 --- a/src/error.cpp +++ b/src/error.cpp @@ -237,6 +237,7 @@ enum TerminalColour { TerminalColour_Blue, TerminalColour_Purple, TerminalColour_Black, + TerminalColour_Grey, }; gb_internal void terminal_set_colours(TerminalStyle style, TerminalColour foreground) { @@ -256,6 +257,7 @@ gb_internal void terminal_set_colours(TerminalStyle style, TerminalColour foregr case TerminalColour_Blue: error_out("\x1b[%s;34m", ss); break; case TerminalColour_Purple: error_out("\x1b[%s;35m", ss); break; case TerminalColour_Black: error_out("\x1b[%s;30m", ss); break; + case TerminalColour_Grey: error_out("\x1b[%s;90m", ss); break; } } } @@ -272,85 +274,263 @@ gb_internal isize show_error_on_line(TokenPos const &pos, TokenPos end) { return -1; } - i32 offset = 0; - gbString the_line = get_file_line_as_string(pos, &offset); + i32 error_start_index_bytes = 0; + gbString the_line = get_file_line_as_string(pos, &error_start_index_bytes); defer (gb_string_free(the_line)); - if (the_line != nullptr) { - char const *line_text = the_line; - isize line_len = gb_string_length(the_line); - - // TODO(bill): This assumes ASCII - - enum { - MAX_LINE_LENGTH = 80, - MAX_TAB_WIDTH = 8, - ELLIPSIS_PADDING = 8, // `... ...` - MAX_LINE_LENGTH_PADDED = MAX_LINE_LENGTH-MAX_TAB_WIDTH-ELLIPSIS_PADDING, - }; - - i32 error_length = gb_max(end.offset - pos.offset, 1); - - error_out("\t"); - - terminal_set_colours(TerminalStyle_Bold, TerminalColour_White); - - - isize squiggle_extra = 0; - - if (line_len > MAX_LINE_LENGTH_PADDED) { - i32 left = MAX_TAB_WIDTH; - i32 diff = gb_max(offset-left, 0); - if (diff > 0) { - line_text += diff; - line_len -= diff; - offset = left + ELLIPSIS_PADDING/2; - } - if (line_len > MAX_LINE_LENGTH_PADDED) { - line_len = MAX_LINE_LENGTH_PADDED; - if (error_length > line_len-left) { - error_length = cast(i32)line_len - left; - squiggle_extra = 1; - } - } - if (diff > 0) { - error_out("... %.*s ...", cast(i32)line_len, line_text); - } else { - error_out("%.*s ...", cast(i32)line_len, line_text); - } - } else { - error_out("%.*s", cast(i32)line_len, line_text); - } - error_out("\n\t"); - - for (i32 i = 0; i < offset; i++) { - error_out(" "); - } - - terminal_set_colours(TerminalStyle_Bold, TerminalColour_Green); - - error_out("^"); - if (end.file_id == pos.file_id) { - if (end.line > pos.line) { - for (i32 i = offset; i < line_len; i++) { - error_out("~"); - } - } else if (end.line == pos.line && end.column > pos.column) { - for (i32 i = 1; i < error_length-1+squiggle_extra; i++) { - error_out("~"); - } - if (error_length > 1 && squiggle_extra == 0) { - error_out("^"); - } - } - } - + if (the_line == nullptr || gb_string_length(the_line) == 0) { + terminal_set_colours(TerminalStyle_Normal, TerminalColour_Grey); + error_out("\t( empty line )\n"); terminal_reset_colours(); - error_out("\n"); - return offset; + // Preserve the old return behaviour. Even if we can't guarantee the + // exact visual space offset, there are two places that check this to + // change what sort of suggestion they offer. + if (the_line == nullptr) { + return -1; + } else { + return cast(isize)error_start_index_bytes; + } } - return -1; + + // Specfically use basic ASCII arrows here, in case the terminal + // doesn't support anything fancy. This is meant to be a good fallback. + char const *mark_error_sign = "><"; + char const *open_error_sign = ">>"; + char const *close_error_sign = "<<"; + const TerminalColour marker_colour = TerminalColour_Yellow; + + // ANSI SGR: + // 0 = Reset. + // 58:5:2 = Underline colour, 8-bit, green. (non-standard) + // 4:3 = Wiggly underline. (non-standard) + char const *wiggly_underline_sgr = ""; + char const *disable_underline_sgr = ""; + if (has_ansi_terminal_colours()) { + wiggly_underline_sgr = "\x1b[0;58:5:2;4:3m"; + disable_underline_sgr = "\x1b[24m"; + } + + // These two will be used like an Odin slice later. + char const *line_text = the_line; + i32 line_length_bytes = cast(i32)gb_string_length(the_line); + + // NOTE(Feoramund): The numbers below are in Unicode codepoints + // (or runes), not visual glyph width. Calculating the visual width of + // a cluster of Unicode codepoints is vexing, and `utf8proc_charwidth` + // is inadequate. + // + // We're counting codepoints here so we don't slice one down the + // middle during truncation. It will still look strange if we slice + // a cluster down the middle. (i.e. a letter and a combining diacritic) + // + // Luckily, if our assumption about 1 codepoint == 1 glyph is wrong, + // we only suffer a shorter or longer line displayed in total, but all + // of our highlighting and marking will be precise. + // (Unless there's an invalid Unicode codepoint, in which case, no guarantees.) + // + // The line will be longer if a codepoint occupies more than one space + // (CJK in most cases) and shorter if a codepoint is invisible or is + // a type of joiner or combining codepoint. + // + // If we get a complete Unicode glyph counter, it would be as simple as + // replacing `utf8_decode` below to make all of this work perfectly. + + enum { + MAX_LINE_LENGTH = 80, + MAX_TAB_WIDTH = 8, + ELLIPSIS_PADDING = 8, // `... ...` + MAX_MARK_WIDTH = 4, // `><` or `>>` and `<<` + MIN_LEFT_VIEW = 8, + + // A rough estimate of how many characters we'll insert, at most: + MAX_INSERTED_WIDTH = MAX_TAB_WIDTH + ELLIPSIS_PADDING + MAX_MARK_WIDTH, + + MAX_LINE_LENGTH_PADDED = MAX_LINE_LENGTH - MAX_INSERTED_WIDTH, + }; + + // For the purposes of truncating long lines, we calculate how many + // runes the line is composed of, first. We'll take note of at which + // rune index the error starts, too. + i32 error_start_index_runes = 0; + + i32 line_length_runes = 0; + for (i32 i = 0; i < line_length_bytes; /**/) { + Rune rune; + + if (i == error_start_index_bytes) { + error_start_index_runes = line_length_runes; + } + + i32 bytes_read = cast(i32)utf8_decode(cast(const u8 *)line_text + i, line_length_bytes - i, &rune); + if (rune == GB_RUNE_INVALID || bytes_read <= 0) { + // Bail out; we won't even try to truncate the line later. + line_length_runes = 0; + break; + } + + line_length_runes += 1; + i += bytes_read; + } + + if (error_start_index_runes == 0 && error_start_index_bytes != 0 && line_length_runes != 0) { + // The error index in runes was not found, but we did find a valid Unicode string. + // + // This is an edge case where the error is sitting on a newline or the + // end of the line, as that is the only location we could not have checked. + error_start_index_runes = line_length_runes; + } + + error_out("\t"); + + bool show_right_ellipsis = false; + + if (line_length_runes > MAX_LINE_LENGTH_PADDED) { + // Now that we know the line is over the length limit, we have to + // compose a runic window in which to display the error. + i32 window_width = MAX_LINE_LENGTH_PADDED; + + i32 extend_right = 0; + i32 extend_left = 0; + if (error_start_index_runes + window_width > line_length_runes - 1) { + // Trade space from the right to the left. + extend_right = line_length_runes - error_start_index_runes; + extend_left = window_width - extend_right; + } else if (MIN_LEFT_VIEW - error_start_index_runes > 0) { + // Trade space from the left to the right. + extend_left = error_start_index_runes; + extend_right = window_width - extend_left; + } else { + // Square in the middle somewhere. + extend_left = MIN_LEFT_VIEW; + extend_right = window_width - extend_left; + } + + i32 window_right_runes = gb_min(error_start_index_runes + extend_right, line_length_runes); + i32 window_left_runes = gb_max(0, error_start_index_runes - extend_left); + + i32 window_right_bytes = 0; + i32 window_left_bytes = 0; + + i32 i_runes = 0; + for (i32 i = 0; i < line_length_bytes; /**/) { + if (i_runes == window_left_runes ) { window_left_bytes = i; } + if (i_runes == window_right_runes) { window_right_bytes = i; } + + // No need for error-checking. + // + // We've already validated the string at this point, otherwise + // `line_length_runes` would be 0, and we would not have + // entered this block. + i32 bytes_read = cast(i32)utf8_decode(cast(const u8 *)line_text + i, line_length_bytes - i, nullptr); + + i_runes += 1; + i += bytes_read; + } + + if (window_right_bytes == 0) { + // The end of the window is the end of the line. + window_right_bytes = line_length_bytes; + } + + GB_ASSERT_MSG(window_right_runes >= window_left_runes, "Error line truncation window has wrong rune indices. (left, right: %i, %i)", window_left_runes, window_right_runes); + GB_ASSERT_MSG(window_right_bytes >= window_left_bytes, "Error line truncation window has wrong byte indices. (left, right: %i, %i)", window_left_bytes, window_right_bytes); + + if (window_right_bytes != line_length_bytes) { + show_right_ellipsis = true; + } + + // The text will advance; all indices and lengths will become relative. + // We must keep our other iterators in sync. + // NOTE: Uncomment the rune versions if they ever get used beyond this point. + + // Close the window, going left. + line_length_bytes = window_right_bytes; + + // Adjust the slice of text. In Odin, this would be: + // `line_text = line_text[window_left_bytes:]` + line_text += window_left_bytes; + line_length_bytes -= window_left_bytes; + // line_length_runes -= window_left_runes; + GB_ASSERT_MSG(line_length_bytes >= 0, "Bounds-checking error: line_length_bytes"); + + // Part of advancing `line_text`: + error_start_index_bytes -= window_left_bytes; + // error_start_index_runes -= window_left_runes; + GB_ASSERT_MSG(error_start_index_bytes >= 0, "Bounds-checking error: error_start_index_bytes"); + + if (window_left_bytes > 0) { + error_out("... "); + } + } + + // Start printing code. + + terminal_set_colours(TerminalStyle_Normal, TerminalColour_White); + error_out("%.*s", error_start_index_bytes, line_text); + + // Odin-like: `line_text = line_text[error_start_index_bytes:]` + line_text += error_start_index_bytes; + line_length_bytes -= error_start_index_bytes; + GB_ASSERT_MSG(line_length_bytes >= 0, "Bounds-checking error: line_length_bytes"); + + if (end.file_id == pos.file_id) { + // The error has an endpoint. + terminal_set_colours(TerminalStyle_Bold, marker_colour); + error_out(open_error_sign); + + if (end.line > pos.line) { + // Error goes to next line. + error_out(wiggly_underline_sgr); + error_out("%.*s", line_length_bytes, line_text); + + error_out(disable_underline_sgr); + + // Always show the ellipsis in this case + show_right_ellipsis = true; + + } else if (end.line == pos.line && end.column > pos.column) { + // Error terminates before line end. + i32 error_length_bytes = gb_min(end.column - pos.column, line_length_bytes); + + error_out(wiggly_underline_sgr); + error_out("%.*s", error_length_bytes, line_text); + line_text += error_length_bytes; + line_length_bytes -= error_length_bytes; + GB_ASSERT_MSG(line_length_bytes >= 0, "Bounds-checking error: line_length_bytes"); + + error_out(disable_underline_sgr); + + if (!show_right_ellipsis) { + // The line hasn't been truncated; show the end marker. + terminal_set_colours(TerminalStyle_Bold, marker_colour); + error_out(close_error_sign); + } + + terminal_set_colours(TerminalStyle_Normal, TerminalColour_White); + error_out("%.*s", line_length_bytes, line_text); + } + + } else { + // The error is at one spot; no range known. + terminal_set_colours(TerminalStyle_Bold, marker_colour); + error_out(mark_error_sign); + + terminal_set_colours(TerminalStyle_Normal, TerminalColour_White); + error_out("%.*s", line_length_bytes, line_text); + } + + if (show_right_ellipsis) { + error_out(" ..."); + } + + // NOTE(Feoramund): Specifically print a newline, then reset colours, + // instead of the other way around. Otherwise the printing mechanism + // will collapse the newline for reasons currently beyond my ken. + error_out("\n"); + terminal_reset_colours(); + + return error_start_index_bytes; } gb_internal void error_out_empty(void) { From 8b305a4c67b0ddab5c1133c6e0efce85c98c46c5 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sat, 29 Jun 2024 18:03:32 -0400 Subject: [PATCH 5/6] Add `UCG` library to Odin compiler --- src/ucg/ucg.c | 686 +++++++++++ src/ucg/ucg_tables.h | 2629 ++++++++++++++++++++++++++++++++++++++++++ src/unicode.cpp | 5 + 3 files changed, 3320 insertions(+) create mode 100644 src/ucg/ucg.c create mode 100644 src/ucg/ucg_tables.h diff --git a/src/ucg/ucg.c b/src/ucg/ucg.c new file mode 100644 index 000000000..c3e270e1a --- /dev/null +++ b/src/ucg/ucg.c @@ -0,0 +1,686 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Feoramund + * SPDX-License-Identifier: BSD-3-Clause + */ + + +// +// NOTE(Feoramund): This is my UCG library, adapted for use within the Odin compiler. +// Most of the comments have been let alone and may not strictly apply anymore. +// +// 1. The UCG allocator interface was replaced by gbAllocator. +// 2. The UCG UTF-8 decoder was replaced with the one already in the compiler. +// 3. Non-essential code was stripped. +// 4. Some types were changed for compatibility. +// + + +/* This is the data that is allocated when an allocator is passed to + * ucg_decode_grapheme_clusters. */ +typedef struct { + i32 byte_index; + i32 rune_index; + i32 width; +} ucg_grapheme; + + +/* #include "ucg.h" */ +#include "ucg_tables.h" + +#define UCG_TABLE_LEN(t) (sizeof(t) / sizeof(int32_t)) + +#define ZERO_WIDTH_SPACE 0x200B +#define ZERO_WIDTH_NON_JOINER 0x200C +#define ZERO_WIDTH_JOINER 0x200D +#define WORD_JOINER 0x2060 + +int ucg_binary_search(int32_t value, const int32_t* table, int length, int stride) { + GB_ASSERT(table != NULL); + GB_ASSERT(length > 0); + GB_ASSERT(stride > 0); + + int n = length; + int t = 0; + for (/**/; n > 1; /**/) { + int m = n / 2; + int p = t + m * stride; + if (value >= table[p]) { + t = p; + n = n - m; + } else { + n = m; + } + } + if (n != 0 && value >= table[t]) { + return t; + } + return -1; +} + +// +// The procedures below are accurate as of Unicode 15.1.0. +// + +bool ucg_is_control(int32_t r) { + if (r <= 0x1F || (0x7F <= r && r <= 0x9F)) { + return true; + } + return false; +} + +// Emoji_Modifier +bool ucg_is_emoji_modifier(int32_t r) { + return 0x1F3FB <= r && r <= 0x1F3FF; +} + +// Regional_Indicator +bool ucg_is_regional_indicator(int32_t r) { + return 0x1F1E6 <= r && r <= 0x1F1FF; +} + +// General_Category=Enclosing_Mark +bool ucg_is_enclosing_mark(int32_t r) { + switch (r) { + case 0x0488: + case 0x0489: + case 0x1ABE: + return true; + } + + if (0x20DD <= r && r <= 0x20E0) { return true; } + if (0x20E2 <= r && r <= 0x20E4) { return true; } + if (0xA670 <= r && r <= 0xA672) { return true; } + + return false; +} + +// Prepended_Concatenation_Mark +bool ucg_is_prepended_concatenation_mark(int32_t r) { + switch (r) { + case 0x006DD: + case 0x0070F: + case 0x008E2: + case 0x110BD: + case 0x110CD: + return true; + } + + if (0x00600 <= r && r <= 0x00605) { return true; } + if (0x00890 <= r && r <= 0x00891) { return true; } + + return false; +} + +// General_Category=Spacing_Mark +bool ucg_is_spacing_mark(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_spacing_mark_ranges, UCG_TABLE_LEN(ucg_spacing_mark_ranges)/2, 2); + if (p >= 0 && ucg_spacing_mark_ranges[p] <= r && r <= ucg_spacing_mark_ranges[p+1]) { + return true; + } + return false; +} + +// General_Category=Nonspacing_Mark +bool ucg_is_nonspacing_mark(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_nonspacing_mark_ranges, UCG_TABLE_LEN(ucg_nonspacing_mark_ranges)/2, 2); + if (p >= 0 && ucg_nonspacing_mark_ranges[p] <= r && r <= ucg_nonspacing_mark_ranges[p+1]) { + return true; + } + return false; +} + +// Extended_Pictographic +bool ucg_is_emoji_extended_pictographic(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_emoji_extended_pictographic_ranges, UCG_TABLE_LEN(ucg_emoji_extended_pictographic_ranges)/2, 2); + if (p >= 0 && ucg_emoji_extended_pictographic_ranges[p] <= r && r <= ucg_emoji_extended_pictographic_ranges[p+1]) { + return true; + } + return false; +} + +// Grapheme_Extend +bool ucg_is_grapheme_extend(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_grapheme_extend_ranges, UCG_TABLE_LEN(ucg_grapheme_extend_ranges)/2, 2); + if (p >= 0 && ucg_grapheme_extend_ranges[p] <= r && r <= ucg_grapheme_extend_ranges[p+1]) { + return true; + } + return false; +} + + +// Hangul_Syllable_Type=Leading_Jamo +bool ucg_is_hangul_syllable_leading(int32_t r) { + return (0x1100 <= r && r <= 0x115F) || (0xA960 <= r && r <= 0xA97C); +} + +// Hangul_Syllable_Type=Vowel_Jamo +bool ucg_is_hangul_syllable_vowel(int32_t r) { + return (0x1160 <= r && r <= 0x11A7) || (0xD7B0 <= r && r <= 0xD7C6); +} + +// Hangul_Syllable_Type=Trailing_Jamo +bool ucg_is_hangul_syllable_trailing(int32_t r) { + return (0x11A8 <= r && r <= 0x11FF) || (0xD7CB <= r && r <= 0xD7FB); +} + +// Hangul_Syllable_Type=LV_Syllable +bool ucg_is_hangul_syllable_lv(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_hangul_syllable_lv_singlets, UCG_TABLE_LEN(ucg_hangul_syllable_lv_singlets), 1); + if (p >= 0 && r == ucg_hangul_syllable_lv_singlets[p]) { + return true; + } + return false; +} + +// Hangul_Syllable_Type=LVT_Syllable +bool ucg_is_hangul_syllable_lvt(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_hangul_syllable_lvt_ranges, UCG_TABLE_LEN(ucg_hangul_syllable_lvt_ranges)/2, 2); + if (p >= 0 && ucg_hangul_syllable_lvt_ranges[p] <= r && r <= ucg_hangul_syllable_lvt_ranges[p+1]) { + return true; + } + return false; +} + + +// Indic_Syllabic_Category=Consonant_Preceding_Repha +bool ucg_is_indic_consonant_preceding_repha(int32_t r) { + switch (r) { + case 0x00D4E: + case 0x11941: + case 0x11D46: + case 0x11F02: + return true; + } + return false; +} + +// Indic_Syllabic_Category=Consonant_Prefixed +bool ucg_is_indic_consonant_prefixed(int32_t r) { + switch (r) { + case 0x1193F: + case 0x11A3A: + return true; + } + + if (0x111C2 <= r && r <= 0x111C3) { return true; } + if (0x11A84 <= r && r <= 0x11A89) { return true; } + + return false; +} + +// Indic_Conjunct_Break=Linker +bool ucg_is_indic_conjunct_break_linker(int32_t r) { + switch (r) { + case 0x094D: + case 0x09CD: + case 0x0ACD: + case 0x0B4D: + case 0x0C4D: + case 0x0D4D: + return true; + } + return false; +} + +// Indic_Conjunct_Break=Consonant +bool ucg_is_indic_conjunct_break_consonant(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_indic_conjunct_break_consonant_ranges, UCG_TABLE_LEN(ucg_indic_conjunct_break_consonant_ranges)/2, 2); + if (p >= 0 && ucg_indic_conjunct_break_consonant_ranges[p] <= r && r <= ucg_indic_conjunct_break_consonant_ranges[p+1]) { + return true; + } + return false; +} + +// Indic_Conjunct_Break=Extend +bool ucg_is_indic_conjunct_break_extend(int32_t r) { + intptr_t p = ucg_binary_search(r, ucg_indic_conjunct_break_extend_ranges, UCG_TABLE_LEN(ucg_indic_conjunct_break_extend_ranges)/2, 2); + if (p >= 0 && ucg_indic_conjunct_break_extend_ranges[p] <= r && r <= ucg_indic_conjunct_break_extend_ranges[p+1]) { + return true; + } + return false; +} + + +/* +``` +Indic_Syllabic_Category = Consonant_Preceding_Repha, or +Indic_Syllabic_Category = Consonant_Prefixed, or +Prepended_Concatenation_Mark = Yes +``` +*/ +bool ucg_is_gcb_prepend_class(int32_t r) { + return ucg_is_indic_consonant_preceding_repha(r) || ucg_is_indic_consonant_prefixed(r) || ucg_is_prepended_concatenation_mark(r); +} + +/* +``` +Grapheme_Extend = Yes, or +Emoji_Modifier = Yes + +This includes: +General_Category = Nonspacing_Mark +General_Category = Enclosing_Mark +U+200C ZERO WIDTH NON-JOINER + +plus a few General_Category = Spacing_Mark needed for canonical equivalence. +``` +*/ +bool ucg_is_gcb_extend_class(int32_t r) { + return ucg_is_grapheme_extend(r) || ucg_is_emoji_modifier(r); +} + +// Return values: +// +// - 2 if East_Asian_Width=F or W, or +// - 0 if non-printable / zero-width, or +// - 1 in all other cases. +// +int ucg_normalized_east_asian_width(int32_t r) { + if (ucg_is_control(r)) { + return 0; + } else if (r <= 0x10FF) { + // Easy early out for low runes. + return 1; + } + + switch (r) { + // This is a different interpretation of the BOM which occurs in the middle of text. + case 0xFEFF: /* ZERO_WIDTH_NO_BREAK_SPACE */ + case ZERO_WIDTH_SPACE: + case ZERO_WIDTH_NON_JOINER: + case ZERO_WIDTH_JOINER: + case WORD_JOINER: + return 0; + } + + intptr_t p = ucg_binary_search(r, ucg_normalized_east_asian_width_ranges, UCG_TABLE_LEN(ucg_normalized_east_asian_width_ranges)/3, 3); + if (p >= 0 && ucg_normalized_east_asian_width_ranges[p] <= r && r <= ucg_normalized_east_asian_width_ranges[p+1]) { + return (int)ucg_normalized_east_asian_width_ranges[p+2]; + } + return 1; +} + +// +// End of Unicode 15.1.0 block. +// + +enum grapheme_cluster_sequence { + None, + Indic, + Emoji, + Regional, +}; + +typedef struct { + ucg_grapheme* graphemes; + i32 rune_count; + i32 grapheme_count; + i32 width; + + int32_t last_rune; + bool last_rune_breaks_forward; + + i32 last_width; + i32 last_grapheme_count; + + bool bypass_next_rune; + + int regional_indicator_counter; + + enum grapheme_cluster_sequence current_sequence; + bool continue_sequence; +} ucg_decoder_state; + + +void _ucg_decode_grapheme_clusters_deferred_step( + gbAllocator allocator, + ucg_decoder_state* state, + i32 byte_index, + int32_t this_rune +) { + // "Break at the start and end of text, unless the text is empty." + // + // GB1: sot ÷ Any + // GB2: Any ÷ eot + if (state->rune_count == 0 && state->grapheme_count == 0) { + state->grapheme_count += 1; + } + + if (state->grapheme_count > state->last_grapheme_count) { + state->width += ucg_normalized_east_asian_width(this_rune); + + /* if (allocator != NULL) { */ + state->graphemes = (ucg_grapheme*)gb_resize(allocator, + state->graphemes, + sizeof(ucg_grapheme) * (state->grapheme_count), + sizeof(ucg_grapheme) * (1 + state->grapheme_count)); + + ucg_grapheme append = { + byte_index, + state->rune_count, + state->width - state->last_width, + }; + + state->graphemes[state->grapheme_count - 1] = append; + /* } */ + + state->last_grapheme_count = state->grapheme_count; + state->last_width = state->width; + } + + state->last_rune = this_rune; + state->rune_count += 1; + + if (!state->continue_sequence) { + state->current_sequence = None; + state->regional_indicator_counter = 0; + } + state->continue_sequence = false; +} + +int ucg_decode_grapheme_clusters( + gbAllocator allocator, + const uint8_t* str, + int str_len, + + ucg_grapheme** out_graphemes, + i32* out_rune_count, + i32* out_grapheme_count, + i32* out_width +) { + // The following procedure implements text segmentation by breaking on + // Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from + // the Unicode® Standard Annex #29, entitled: + // + // UNICODE TEXT SEGMENTATION + // + // Version: Unicode 15.1.0 + // Date: 2023-08-16 + // Revision: 43 + // + // This procedure is conformant[4] to UAX29-C1-1, otherwise known as the + // extended, non-legacy ruleset. + // + // Please see the references for more information. + // + // + // [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries + // [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table + // [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules + // [4]: https://www.unicode.org/reports/tr29/#Conformance + + // Additionally, this procedure takes into account Standard Annex #11, + // in order to estimate how visually wide the string will appear on a + // monospaced display. This can only ever be a rough guess, as this tends + // to be an implementation detail relating to which fonts are being used, + // how codepoints are interpreted and drawn, if codepoint sequences are + // interpreted correctly, and et cetera. + // + // For example, a program may not properly interpret an emoji modifier + // sequence and print the component glyphs instead of one whole glyph. + // + // See here for more information: https://www.unicode.org/reports/tr11/ + // + // NOTE: There is no explicit mention of what to do with zero-width spaces + // as far as grapheme cluster segmentation goes, therefore this + // implementation may count and return graphemes with a `width` of zero. + // + // Treat them as any other space. + + ucg_decoder_state state = {0}; + +#define UCG_DEFERRED_DECODE_STEP() (_ucg_decode_grapheme_clusters_deferred_step(allocator, &state, byte_index, this_rune)) + + for (i32 byte_index = 0, bytes_advanced = 0; byte_index < str_len; byte_index += bytes_advanced) { + int32_t this_rune = GB_RUNE_INVALID; + bytes_advanced = (i32)(utf8_decode(str+byte_index, str_len-byte_index, &this_rune)); + if (this_rune == GB_RUNE_INVALID || bytes_advanced == 0) { + // There was a Unicode parsing error; bail out. + if (out_graphemes != NULL) { *out_graphemes = state.graphemes; } + if (out_rune_count != NULL) { *out_rune_count = state.rune_count; } + if (out_grapheme_count != NULL) { *out_grapheme_count = state.grapheme_count; } + if (out_width != NULL) { *out_width = state.width; } + + // Return an error code. + return -1; + } + + // "Do not break between a CR and LF. Otherwise, break before and after controls." + // + // GB3: CR × LF + // GB4: (Control | CR | LF) ÷ + // GB5: ÷ (Control | CR | LF) + if (this_rune == '\n' && state.last_rune == '\r') { + state.last_rune_breaks_forward = false; + state.bypass_next_rune = false; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_control(this_rune)) { + state.grapheme_count += 1; + state.last_rune_breaks_forward = true; + state.bypass_next_rune = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // (This check is for rules that work forwards, instead of backwards.) + if (state.bypass_next_rune) { + if (state.last_rune_breaks_forward) { + state.grapheme_count += 1; + state.last_rune_breaks_forward = false; + } + + state.bypass_next_rune = false; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // (Optimization 1: Prevent low runes from proceeding further.) + // + // * 0xA9 and 0xAE are in the Extended_Pictographic range, + // which is checked later in GB11. + if (this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF) { + state.grapheme_count += 1; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // (Optimization 2: Check if the rune is in the Hangul space before getting specific.) + if (0x1100 <= this_rune && this_rune <= 0xD7FB) { + // "Do not break Hangul syllable sequences." + // + // GB6: L × (L | V | LV | LVT) + // GB7: (LV | V) × (V | T) + // GB8: (LVT | T) × T + if (ucg_is_hangul_syllable_leading(this_rune) || + ucg_is_hangul_syllable_lv(this_rune) || + ucg_is_hangul_syllable_lvt(this_rune)) + { + if (!ucg_is_hangul_syllable_leading(state.last_rune)) { + state.grapheme_count += 1; + } + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_hangul_syllable_vowel(this_rune)) { + if (ucg_is_hangul_syllable_leading(state.last_rune) || + ucg_is_hangul_syllable_vowel(state.last_rune) || + ucg_is_hangul_syllable_lv(state.last_rune)) + { + UCG_DEFERRED_DECODE_STEP(); continue; + } + state.grapheme_count += 1; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_hangul_syllable_trailing(this_rune)) { + if (ucg_is_hangul_syllable_trailing(state.last_rune) || + ucg_is_hangul_syllable_lvt(state.last_rune) || + ucg_is_hangul_syllable_lv(state.last_rune) || + ucg_is_hangul_syllable_vowel(state.last_rune)) + { + UCG_DEFERRED_DECODE_STEP(); continue; + } + state.grapheme_count += 1; + UCG_DEFERRED_DECODE_STEP(); continue; + } + } + + // "Do not break before extending characters or ZWJ." + // + // GB9: × (Extend | ZWJ) + if (this_rune == ZERO_WIDTH_JOINER) { + state.continue_sequence = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_gcb_extend_class(this_rune)) { + // (Support for GB9c.) + if (state.current_sequence == Indic) { + if (ucg_is_indic_conjunct_break_extend(this_rune) && ( + ucg_is_indic_conjunct_break_linker(state.last_rune) || + ucg_is_indic_conjunct_break_consonant(state.last_rune) )) + { + state.continue_sequence = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_indic_conjunct_break_linker(this_rune) && ( + ucg_is_indic_conjunct_break_linker(state.last_rune) || + ucg_is_indic_conjunct_break_extend(state.last_rune) || + ucg_is_indic_conjunct_break_consonant(state.last_rune) )) + { + state.continue_sequence = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // (Support for GB11.) + if (state.current_sequence == Emoji && ( + ucg_is_gcb_extend_class(state.last_rune) || + ucg_is_emoji_extended_pictographic(state.last_rune) )) + { + state.continue_sequence = true; + } + + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // _The GB9a and GB9b rules only apply to extended grapheme clusters:_ + // "Do not break before SpacingMarks, or after Prepend characters." + // + // GB9a: × SpacingMark + // GB9b: Prepend × + if (ucg_is_spacing_mark(this_rune)) { + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_gcb_prepend_class(this_rune)) { + state.grapheme_count += 1; + state.bypass_next_rune = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // _The GB9c rule only applies to extended grapheme clusters:_ + // "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker." + // + // GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant} + if (ucg_is_indic_conjunct_break_consonant(this_rune)) { + if (state.current_sequence == Indic) { + if (state.last_rune == ZERO_WIDTH_JOINER || + ucg_is_indic_conjunct_break_linker(state.last_rune)) + { + state.continue_sequence = true; + } else { + state.grapheme_count += 1; + } + } else { + state.grapheme_count += 1; + state.current_sequence = Indic; + state.continue_sequence = true; + } + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_indic_conjunct_break_extend(this_rune)) { + if (state.current_sequence == Indic) { + if (ucg_is_indic_conjunct_break_consonant(state.last_rune) || + ucg_is_indic_conjunct_break_linker(state.last_rune)) + { + state.continue_sequence = true; + } else { + state.grapheme_count += 1; + } + } + UCG_DEFERRED_DECODE_STEP(); continue; + } + + if (ucg_is_indic_conjunct_break_linker(this_rune)) { + if (state.current_sequence == Indic) { + if (ucg_is_indic_conjunct_break_extend(state.last_rune) || + ucg_is_indic_conjunct_break_linker(state.last_rune)) + { + state.continue_sequence = true; + } else { + state.grapheme_count += 1; + } + } + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // + // (Curiously, there is no GB10.) + // + + // "Do not break within emoji modifier sequences or emoji zwj sequences." + // + // GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic} + if (ucg_is_emoji_extended_pictographic(this_rune)) { + if (state.current_sequence != Emoji || state.last_rune != ZERO_WIDTH_JOINER) { + state.grapheme_count += 1; + } + state.current_sequence = Emoji; + state.continue_sequence = true; + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // "Do not break within emoji flag sequences. + // That is, do not break between regional indicator (RI) symbols + // if there is an odd number of RI characters before the break point." + // + // GB12: sot (RI RI)* RI × RI + // GB13: [^RI] (RI RI)* RI × RI + if (ucg_is_regional_indicator(this_rune)) { + if ((state.regional_indicator_counter & 1) == 0) { + state.grapheme_count += 1; + } + + state.current_sequence = Regional; + state.continue_sequence = true; + state.regional_indicator_counter += 1; + + UCG_DEFERRED_DECODE_STEP(); continue; + } + + // "Otherwise, break everywhere." + // + // GB999: Any ÷ Any + state.grapheme_count += 1; + UCG_DEFERRED_DECODE_STEP(); + } + +#undef UCG_DEFERRED_DECODE_STEP + + if (out_graphemes != NULL) { *out_graphemes = state.graphemes; } + if (out_rune_count != NULL) { *out_rune_count = state.rune_count; } + if (out_grapheme_count != NULL) { *out_grapheme_count = state.grapheme_count; } + if (out_width != NULL) { *out_width = state.width; } + + return 0; +} + +#undef UCG_TABLE_LEN +#undef ZERO_WIDTH_SPACE +#undef ZERO_WIDTH_NON_JOINER +#undef ZERO_WIDTH_JOINER +#undef WORD_JOINER diff --git a/src/ucg/ucg_tables.h b/src/ucg/ucg_tables.h new file mode 100644 index 000000000..a33f9f898 --- /dev/null +++ b/src/ucg/ucg_tables.h @@ -0,0 +1,2629 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Feoramund + * SPDX-License-Identifier: BSD-3-Clause + */ +#ifndef _UCG_TABLES_INCLUDED +#define _UCG_TABLES_INCLUDED + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +// +// The tables below are accurate as of Unicode 15.1.0. +// + +static const int32_t ucg_spacing_mark_ranges[] = { + 0x0903, 0x0903, + 0x093B, 0x093B, + 0x093E, 0x0940, + 0x0949, 0x094C, + 0x094E, 0x094F, + 0x0982, 0x0983, + 0x09BE, 0x09C0, + 0x09C7, 0x09C8, + 0x09CB, 0x09CC, + 0x09D7, 0x09D7, + 0x0A03, 0x0A03, + 0x0A3E, 0x0A40, + 0x0A83, 0x0A83, + 0x0ABE, 0x0AC0, + 0x0AC9, 0x0AC9, + 0x0ACB, 0x0ACC, + 0x0B02, 0x0B03, + 0x0B3E, 0x0B3E, + 0x0B40, 0x0B40, + 0x0B47, 0x0B48, + 0x0B4B, 0x0B4C, + 0x0B57, 0x0B57, + 0x0BBE, 0x0BBF, + 0x0BC1, 0x0BC2, + 0x0BC6, 0x0BC8, + 0x0BCA, 0x0BCC, + 0x0BD7, 0x0BD7, + 0x0C01, 0x0C03, + 0x0C41, 0x0C44, + 0x0C82, 0x0C83, + 0x0CBE, 0x0CBE, + 0x0CC0, 0x0CC4, + 0x0CC7, 0x0CC8, + 0x0CCA, 0x0CCB, + 0x0CD5, 0x0CD6, + 0x0CF3, 0x0CF3, + 0x0D02, 0x0D03, + 0x0D3E, 0x0D40, + 0x0D46, 0x0D48, + 0x0D4A, 0x0D4C, + 0x0D57, 0x0D57, + 0x0D82, 0x0D83, + 0x0DCF, 0x0DD1, + 0x0DD8, 0x0DDF, + 0x0DF2, 0x0DF3, + 0x0F3E, 0x0F3F, + 0x0F7F, 0x0F7F, + 0x102B, 0x102C, + 0x1031, 0x1031, + 0x1038, 0x1038, + 0x103B, 0x103C, + 0x1056, 0x1057, + 0x1062, 0x1064, + 0x1067, 0x106D, + 0x1083, 0x1084, + 0x1087, 0x108C, + 0x108F, 0x108F, + 0x109A, 0x109C, + 0x1715, 0x1715, + 0x1734, 0x1734, + 0x17B6, 0x17B6, + 0x17BE, 0x17C5, + 0x17C7, 0x17C8, + 0x1923, 0x1926, + 0x1929, 0x192B, + 0x1930, 0x1931, + 0x1933, 0x1938, + 0x1A19, 0x1A1A, + 0x1A55, 0x1A55, + 0x1A57, 0x1A57, + 0x1A61, 0x1A61, + 0x1A63, 0x1A64, + 0x1A6D, 0x1A72, + 0x1B04, 0x1B04, + 0x1B35, 0x1B35, + 0x1B3B, 0x1B3B, + 0x1B3D, 0x1B41, + 0x1B43, 0x1B44, + 0x1B82, 0x1B82, + 0x1BA1, 0x1BA1, + 0x1BA6, 0x1BA7, + 0x1BAA, 0x1BAA, + 0x1BE7, 0x1BE7, + 0x1BEA, 0x1BEC, + 0x1BEE, 0x1BEE, + 0x1BF2, 0x1BF3, + 0x1C24, 0x1C2B, + 0x1C34, 0x1C35, + 0x1CE1, 0x1CE1, + 0x1CF7, 0x1CF7, + 0x302E, 0x302F, + 0xA823, 0xA824, + 0xA827, 0xA827, + 0xA880, 0xA881, + 0xA8B4, 0xA8C3, + 0xA952, 0xA953, + 0xA983, 0xA983, + 0xA9B4, 0xA9B5, + 0xA9BA, 0xA9BB, + 0xA9BE, 0xA9C0, + 0xAA2F, 0xAA30, + 0xAA33, 0xAA34, + 0xAA4D, 0xAA4D, + 0xAA7B, 0xAA7B, + 0xAA7D, 0xAA7D, + 0xAAEB, 0xAAEB, + 0xAAEE, 0xAAEF, + 0xAAF5, 0xAAF5, + 0xABE3, 0xABE4, + 0xABE6, 0xABE7, + 0xABE9, 0xABEA, + 0xABEC, 0xABEC, + 0x11000, 0x11000, + 0x11002, 0x11002, + 0x11082, 0x11082, + 0x110B0, 0x110B2, + 0x110B7, 0x110B8, + 0x1112C, 0x1112C, + 0x11145, 0x11146, + 0x11182, 0x11182, + 0x111B3, 0x111B5, + 0x111BF, 0x111C0, + 0x111CE, 0x111CE, + 0x1122C, 0x1122E, + 0x11232, 0x11233, + 0x11235, 0x11235, + 0x112E0, 0x112E2, + 0x11302, 0x11303, + 0x1133E, 0x1133F, + 0x11341, 0x11344, + 0x11347, 0x11348, + 0x1134B, 0x1134D, + 0x11357, 0x11357, + 0x11362, 0x11363, + 0x11435, 0x11437, + 0x11440, 0x11441, + 0x11445, 0x11445, + 0x114B0, 0x114B2, + 0x114B9, 0x114B9, + 0x114BB, 0x114BE, + 0x114C1, 0x114C1, + 0x115AF, 0x115B1, + 0x115B8, 0x115BB, + 0x115BE, 0x115BE, + 0x11630, 0x11632, + 0x1163B, 0x1163C, + 0x1163E, 0x1163E, + 0x116AC, 0x116AC, + 0x116AE, 0x116AF, + 0x116B6, 0x116B6, + 0x11720, 0x11721, + 0x11726, 0x11726, + 0x1182C, 0x1182E, + 0x11838, 0x11838, + 0x11930, 0x11935, + 0x11937, 0x11938, + 0x1193D, 0x1193D, + 0x11940, 0x11940, + 0x11942, 0x11942, + 0x119D1, 0x119D3, + 0x119DC, 0x119DF, + 0x119E4, 0x119E4, + 0x11A39, 0x11A39, + 0x11A57, 0x11A58, + 0x11A97, 0x11A97, + 0x11C2F, 0x11C2F, + 0x11C3E, 0x11C3E, + 0x11CA9, 0x11CA9, + 0x11CB1, 0x11CB1, + 0x11CB4, 0x11CB4, + 0x11D8A, 0x11D8E, + 0x11D93, 0x11D94, + 0x11D96, 0x11D96, + 0x11EF5, 0x11EF6, + 0x11F03, 0x11F03, + 0x11F34, 0x11F35, + 0x11F3E, 0x11F3F, + 0x11F41, 0x11F41, + 0x16F51, 0x16F87, + 0x16FF0, 0x16FF1, + 0x1D165, 0x1D166, + 0x1D16D, 0x1D172, +}; + +static const int32_t ucg_nonspacing_mark_ranges[] = { + 0x0300, 0x036F, + 0x0483, 0x0487, + 0x0591, 0x05BD, + 0x05BF, 0x05BF, + 0x05C1, 0x05C2, + 0x05C4, 0x05C5, + 0x05C7, 0x05C7, + 0x0610, 0x061A, + 0x064B, 0x065F, + 0x0670, 0x0670, + 0x06D6, 0x06DC, + 0x06DF, 0x06E4, + 0x06E7, 0x06E8, + 0x06EA, 0x06ED, + 0x0711, 0x0711, + 0x0730, 0x074A, + 0x07A6, 0x07B0, + 0x07EB, 0x07F3, + 0x07FD, 0x07FD, + 0x0816, 0x0819, + 0x081B, 0x0823, + 0x0825, 0x0827, + 0x0829, 0x082D, + 0x0859, 0x085B, + 0x0898, 0x089F, + 0x08CA, 0x08E1, + 0x08E3, 0x0902, + 0x093A, 0x093A, + 0x093C, 0x093C, + 0x0941, 0x0948, + 0x094D, 0x094D, + 0x0951, 0x0957, + 0x0962, 0x0963, + 0x0981, 0x0981, + 0x09BC, 0x09BC, + 0x09C1, 0x09C4, + 0x09CD, 0x09CD, + 0x09E2, 0x09E3, + 0x09FE, 0x09FE, + 0x0A01, 0x0A02, + 0x0A3C, 0x0A3C, + 0x0A41, 0x0A42, + 0x0A47, 0x0A48, + 0x0A4B, 0x0A4D, + 0x0A51, 0x0A51, + 0x0A70, 0x0A71, + 0x0A75, 0x0A75, + 0x0A81, 0x0A82, + 0x0ABC, 0x0ABC, + 0x0AC1, 0x0AC5, + 0x0AC7, 0x0AC8, + 0x0ACD, 0x0ACD, + 0x0AE2, 0x0AE3, + 0x0AFA, 0x0AFF, + 0x0B01, 0x0B01, + 0x0B3C, 0x0B3C, + 0x0B3F, 0x0B3F, + 0x0B41, 0x0B44, + 0x0B4D, 0x0B4D, + 0x0B55, 0x0B56, + 0x0B62, 0x0B63, + 0x0B82, 0x0B82, + 0x0BC0, 0x0BC0, + 0x0BCD, 0x0BCD, + 0x0C00, 0x0C00, + 0x0C04, 0x0C04, + 0x0C3C, 0x0C3C, + 0x0C3E, 0x0C40, + 0x0C46, 0x0C48, + 0x0C4A, 0x0C4D, + 0x0C55, 0x0C56, + 0x0C62, 0x0C63, + 0x0C81, 0x0C81, + 0x0CBC, 0x0CBC, + 0x0CBF, 0x0CBF, + 0x0CC6, 0x0CC6, + 0x0CCC, 0x0CCD, + 0x0CE2, 0x0CE3, + 0x0D00, 0x0D01, + 0x0D3B, 0x0D3C, + 0x0D41, 0x0D44, + 0x0D4D, 0x0D4D, + 0x0D62, 0x0D63, + 0x0D81, 0x0D81, + 0x0DCA, 0x0DCA, + 0x0DD2, 0x0DD4, + 0x0DD6, 0x0DD6, + 0x0E31, 0x0E31, + 0x0E34, 0x0E3A, + 0x0E47, 0x0E4E, + 0x0EB1, 0x0EB1, + 0x0EB4, 0x0EBC, + 0x0EC8, 0x0ECE, + 0x0F18, 0x0F19, + 0x0F35, 0x0F35, + 0x0F37, 0x0F37, + 0x0F39, 0x0F39, + 0x0F71, 0x0F7E, + 0x0F80, 0x0F84, + 0x0F86, 0x0F87, + 0x0F8D, 0x0F97, + 0x0F99, 0x0FBC, + 0x0FC6, 0x0FC6, + 0x102D, 0x1030, + 0x1032, 0x1037, + 0x1039, 0x103A, + 0x103D, 0x103E, + 0x1058, 0x1059, + 0x105E, 0x1060, + 0x1071, 0x1074, + 0x1082, 0x1082, + 0x1085, 0x1086, + 0x108D, 0x108D, + 0x109D, 0x109D, + 0x135D, 0x135F, + 0x1712, 0x1714, + 0x1732, 0x1733, + 0x1752, 0x1753, + 0x1772, 0x1773, + 0x17B4, 0x17B5, + 0x17B7, 0x17BD, + 0x17C6, 0x17C6, + 0x17C9, 0x17D3, + 0x17DD, 0x17DD, + 0x180B, 0x180D, + 0x180F, 0x180F, + 0x1885, 0x1886, + 0x18A9, 0x18A9, + 0x1920, 0x1922, + 0x1927, 0x1928, + 0x1932, 0x1932, + 0x1939, 0x193B, + 0x1A17, 0x1A18, + 0x1A1B, 0x1A1B, + 0x1A56, 0x1A56, + 0x1A58, 0x1A5E, + 0x1A60, 0x1A60, + 0x1A62, 0x1A62, + 0x1A65, 0x1A6C, + 0x1A73, 0x1A7C, + 0x1A7F, 0x1A7F, + 0x1AB0, 0x1ABD, + 0x1ABF, 0x1ACE, + 0x1B00, 0x1B03, + 0x1B34, 0x1B34, + 0x1B36, 0x1B3A, + 0x1B3C, 0x1B3C, + 0x1B42, 0x1B42, + 0x1B6B, 0x1B73, + 0x1B80, 0x1B81, + 0x1BA2, 0x1BA5, + 0x1BA8, 0x1BA9, + 0x1BAB, 0x1BAD, + 0x1BE6, 0x1BE6, + 0x1BE8, 0x1BE9, + 0x1BED, 0x1BED, + 0x1BEF, 0x1BF1, + 0x1C2C, 0x1C33, + 0x1C36, 0x1C37, + 0x1CD0, 0x1CD2, + 0x1CD4, 0x1CE0, + 0x1CE2, 0x1CE8, + 0x1CED, 0x1CED, + 0x1CF4, 0x1CF4, + 0x1CF8, 0x1CF9, + 0x1DC0, 0x1DFF, + 0x20D0, 0x20DC, + 0x20E1, 0x20E1, + 0x20E5, 0x20F0, + 0x2CEF, 0x2CF1, + 0x2D7F, 0x2D7F, + 0x2DE0, 0x2DFF, + 0x302A, 0x302D, + 0x3099, 0x309A, + 0xA66F, 0xA66F, + 0xA674, 0xA67D, + 0xA69E, 0xA69F, + 0xA6F0, 0xA6F1, + 0xA802, 0xA802, + 0xA806, 0xA806, + 0xA80B, 0xA80B, + 0xA825, 0xA826, + 0xA82C, 0xA82C, + 0xA8C4, 0xA8C5, + 0xA8E0, 0xA8F1, + 0xA8FF, 0xA8FF, + 0xA926, 0xA92D, + 0xA947, 0xA951, + 0xA980, 0xA982, + 0xA9B3, 0xA9B3, + 0xA9B6, 0xA9B9, + 0xA9BC, 0xA9BD, + 0xA9E5, 0xA9E5, + 0xAA29, 0xAA2E, + 0xAA31, 0xAA32, + 0xAA35, 0xAA36, + 0xAA43, 0xAA43, + 0xAA4C, 0xAA4C, + 0xAA7C, 0xAA7C, + 0xAAB0, 0xAAB0, + 0xAAB2, 0xAAB4, + 0xAAB7, 0xAAB8, + 0xAABE, 0xAABF, + 0xAAC1, 0xAAC1, + 0xAAEC, 0xAAED, + 0xAAF6, 0xAAF6, + 0xABE5, 0xABE5, + 0xABE8, 0xABE8, + 0xABED, 0xABED, + 0xFB1E, 0xFB1E, + 0xFE00, 0xFE0F, + 0xFE20, 0xFE2F, + 0x101FD, 0x101FD, + 0x102E0, 0x102E0, + 0x10376, 0x1037A, + 0x10A01, 0x10A03, + 0x10A05, 0x10A06, + 0x10A0C, 0x10A0F, + 0x10A38, 0x10A3A, + 0x10A3F, 0x10A3F, + 0x10AE5, 0x10AE6, + 0x10D24, 0x10D27, + 0x10EAB, 0x10EAC, + 0x10EFD, 0x10EFF, + 0x10F46, 0x10F50, + 0x10F82, 0x10F85, + 0x11001, 0x11001, + 0x11038, 0x11046, + 0x11070, 0x11070, + 0x11073, 0x11074, + 0x1107F, 0x11081, + 0x110B3, 0x110B6, + 0x110B9, 0x110BA, + 0x110C2, 0x110C2, + 0x11100, 0x11102, + 0x11127, 0x1112B, + 0x1112D, 0x11134, + 0x11173, 0x11173, + 0x11180, 0x11181, + 0x111B6, 0x111BE, + 0x111C9, 0x111CC, + 0x111CF, 0x111CF, + 0x1122F, 0x11231, + 0x11234, 0x11234, + 0x11236, 0x11237, + 0x1123E, 0x1123E, + 0x11241, 0x11241, + 0x112DF, 0x112DF, + 0x112E3, 0x112EA, + 0x11300, 0x11301, + 0x1133B, 0x1133C, + 0x11340, 0x11340, + 0x11366, 0x1136C, + 0x11370, 0x11374, + 0x11438, 0x1143F, + 0x11442, 0x11444, + 0x11446, 0x11446, + 0x1145E, 0x1145E, + 0x114B3, 0x114B8, + 0x114BA, 0x114BA, + 0x114BF, 0x114C0, + 0x114C2, 0x114C3, + 0x115B2, 0x115B5, + 0x115BC, 0x115BD, + 0x115BF, 0x115C0, + 0x115DC, 0x115DD, + 0x11633, 0x1163A, + 0x1163D, 0x1163D, + 0x1163F, 0x11640, + 0x116AB, 0x116AB, + 0x116AD, 0x116AD, + 0x116B0, 0x116B5, + 0x116B7, 0x116B7, + 0x1171D, 0x1171F, + 0x11722, 0x11725, + 0x11727, 0x1172B, + 0x1182F, 0x11837, + 0x11839, 0x1183A, + 0x1193B, 0x1193C, + 0x1193E, 0x1193E, + 0x11943, 0x11943, + 0x119D4, 0x119D7, + 0x119DA, 0x119DB, + 0x119E0, 0x119E0, + 0x11A01, 0x11A0A, + 0x11A33, 0x11A38, + 0x11A3B, 0x11A3E, + 0x11A47, 0x11A47, + 0x11A51, 0x11A56, + 0x11A59, 0x11A5B, + 0x11A8A, 0x11A96, + 0x11A98, 0x11A99, + 0x11C30, 0x11C36, + 0x11C38, 0x11C3D, + 0x11C3F, 0x11C3F, + 0x11C92, 0x11CA7, + 0x11CAA, 0x11CB0, + 0x11CB2, 0x11CB3, + 0x11CB5, 0x11CB6, + 0x11D31, 0x11D36, + 0x11D3A, 0x11D3A, + 0x11D3C, 0x11D3D, + 0x11D3F, 0x11D45, + 0x11D47, 0x11D47, + 0x11D90, 0x11D91, + 0x11D95, 0x11D95, + 0x11D97, 0x11D97, + 0x11EF3, 0x11EF4, + 0x11F00, 0x11F01, + 0x11F36, 0x11F3A, + 0x11F40, 0x11F40, + 0x11F42, 0x11F42, + 0x13440, 0x13440, + 0x13447, 0x13455, + 0x16AF0, 0x16AF4, + 0x16B30, 0x16B36, + 0x16F4F, 0x16F4F, + 0x16F8F, 0x16F92, + 0x16FE4, 0x16FE4, + 0x1BC9D, 0x1BC9E, + 0x1CF00, 0x1CF2D, + 0x1CF30, 0x1CF46, + 0x1D167, 0x1D169, + 0x1D17B, 0x1D182, + 0x1D185, 0x1D18B, + 0x1D1AA, 0x1D1AD, + 0x1D242, 0x1D244, + 0x1DA00, 0x1DA36, + 0x1DA3B, 0x1DA6C, + 0x1DA75, 0x1DA75, + 0x1DA84, 0x1DA84, + 0x1DA9B, 0x1DA9F, + 0x1DAA1, 0x1DAAF, + 0x1E000, 0x1E006, + 0x1E008, 0x1E018, + 0x1E01B, 0x1E021, + 0x1E023, 0x1E024, + 0x1E026, 0x1E02A, + 0x1E08F, 0x1E08F, + 0x1E130, 0x1E136, + 0x1E2AE, 0x1E2AE, + 0x1E2EC, 0x1E2EF, + 0x1E4EC, 0x1E4EF, + 0x1E8D0, 0x1E8D6, + 0x1E944, 0x1E94A, + 0xE0100, 0xE01EF, +}; + +static const int32_t ucg_emoji_extended_pictographic_ranges[] = { + 0x00A9, 0x00A9, + 0x00AE, 0x00AE, + 0x203C, 0x203C, + 0x2049, 0x2049, + 0x2122, 0x2122, + 0x2139, 0x2139, + 0x2194, 0x2199, + 0x21A9, 0x21AA, + 0x231A, 0x231B, + 0x2328, 0x2328, + 0x2388, 0x2388, + 0x23CF, 0x23CF, + 0x23E9, 0x23EC, + 0x23ED, 0x23EE, + 0x23EF, 0x23EF, + 0x23F0, 0x23F0, + 0x23F1, 0x23F2, + 0x23F3, 0x23F3, + 0x23F8, 0x23FA, + 0x24C2, 0x24C2, + 0x25AA, 0x25AB, + 0x25B6, 0x25B6, + 0x25C0, 0x25C0, + 0x25FB, 0x25FE, + 0x2600, 0x2601, + 0x2602, 0x2603, + 0x2604, 0x2604, + 0x2605, 0x2605, + 0x2607, 0x260D, + 0x260E, 0x260E, + 0x260F, 0x2610, + 0x2611, 0x2611, + 0x2612, 0x2612, + 0x2614, 0x2615, + 0x2616, 0x2617, + 0x2618, 0x2618, + 0x2619, 0x261C, + 0x261D, 0x261D, + 0x261E, 0x261F, + 0x2620, 0x2620, + 0x2621, 0x2621, + 0x2622, 0x2623, + 0x2624, 0x2625, + 0x2626, 0x2626, + 0x2627, 0x2629, + 0x262A, 0x262A, + 0x262B, 0x262D, + 0x262E, 0x262E, + 0x262F, 0x262F, + 0x2630, 0x2637, + 0x2638, 0x2639, + 0x263A, 0x263A, + 0x263B, 0x263F, + 0x2640, 0x2640, + 0x2641, 0x2641, + 0x2642, 0x2642, + 0x2643, 0x2647, + 0x2648, 0x2653, + 0x2654, 0x265E, + 0x265F, 0x265F, + 0x2660, 0x2660, + 0x2661, 0x2662, + 0x2663, 0x2663, + 0x2664, 0x2664, + 0x2665, 0x2666, + 0x2667, 0x2667, + 0x2668, 0x2668, + 0x2669, 0x267A, + 0x267B, 0x267B, + 0x267C, 0x267D, + 0x267E, 0x267E, + 0x267F, 0x267F, + 0x2680, 0x2685, + 0x2690, 0x2691, + 0x2692, 0x2692, + 0x2693, 0x2693, + 0x2694, 0x2694, + 0x2695, 0x2695, + 0x2696, 0x2697, + 0x2698, 0x2698, + 0x2699, 0x2699, + 0x269A, 0x269A, + 0x269B, 0x269C, + 0x269D, 0x269F, + 0x26A0, 0x26A1, + 0x26A2, 0x26A6, + 0x26A7, 0x26A7, + 0x26A8, 0x26A9, + 0x26AA, 0x26AB, + 0x26AC, 0x26AF, + 0x26B0, 0x26B1, + 0x26B2, 0x26BC, + 0x26BD, 0x26BE, + 0x26BF, 0x26C3, + 0x26C4, 0x26C5, + 0x26C6, 0x26C7, + 0x26C8, 0x26C8, + 0x26C9, 0x26CD, + 0x26CE, 0x26CE, + 0x26CF, 0x26CF, + 0x26D0, 0x26D0, + 0x26D1, 0x26D1, + 0x26D2, 0x26D2, + 0x26D3, 0x26D3, + 0x26D4, 0x26D4, + 0x26D5, 0x26E8, + 0x26E9, 0x26E9, + 0x26EA, 0x26EA, + 0x26EB, 0x26EF, + 0x26F0, 0x26F1, + 0x26F2, 0x26F3, + 0x26F4, 0x26F4, + 0x26F5, 0x26F5, + 0x26F6, 0x26F6, + 0x26F7, 0x26F9, + 0x26FA, 0x26FA, + 0x26FB, 0x26FC, + 0x26FD, 0x26FD, + 0x26FE, 0x2701, + 0x2702, 0x2702, + 0x2703, 0x2704, + 0x2705, 0x2705, + 0x2708, 0x270C, + 0x270D, 0x270D, + 0x270E, 0x270E, + 0x270F, 0x270F, + 0x2710, 0x2711, + 0x2712, 0x2712, + 0x2714, 0x2714, + 0x2716, 0x2716, + 0x271D, 0x271D, + 0x2721, 0x2721, + 0x2728, 0x2728, + 0x2733, 0x2734, + 0x2744, 0x2744, + 0x2747, 0x2747, + 0x274C, 0x274C, + 0x274E, 0x274E, + 0x2753, 0x2755, + 0x2757, 0x2757, + 0x2763, 0x2763, + 0x2764, 0x2764, + 0x2765, 0x2767, + 0x2795, 0x2797, + 0x27A1, 0x27A1, + 0x27B0, 0x27B0, + 0x27BF, 0x27BF, + 0x2934, 0x2935, + 0x2B05, 0x2B07, + 0x2B1B, 0x2B1C, + 0x2B50, 0x2B50, + 0x2B55, 0x2B55, + 0x3030, 0x3030, + 0x303D, 0x303D, + 0x3297, 0x3297, + 0x3299, 0x3299, + 0x1F000, 0x1F003, + 0x1F004, 0x1F004, + 0x1F005, 0x1F0CE, + 0x1F0CF, 0x1F0CF, + 0x1F0D0, 0x1F0FF, + 0x1F10D, 0x1F10F, + 0x1F12F, 0x1F12F, + 0x1F16C, 0x1F16F, + 0x1F170, 0x1F171, + 0x1F17E, 0x1F17F, + 0x1F18E, 0x1F18E, + 0x1F191, 0x1F19A, + 0x1F1AD, 0x1F1E5, + 0x1F201, 0x1F202, + 0x1F203, 0x1F20F, + 0x1F21A, 0x1F21A, + 0x1F22F, 0x1F22F, + 0x1F232, 0x1F23A, + 0x1F23C, 0x1F23F, + 0x1F249, 0x1F24F, + 0x1F250, 0x1F251, + 0x1F252, 0x1F2FF, + 0x1F300, 0x1F30C, + 0x1F30D, 0x1F30E, + 0x1F30F, 0x1F30F, + 0x1F310, 0x1F310, + 0x1F311, 0x1F311, + 0x1F312, 0x1F312, + 0x1F313, 0x1F315, + 0x1F316, 0x1F318, + 0x1F319, 0x1F319, + 0x1F31A, 0x1F31A, + 0x1F31B, 0x1F31B, + 0x1F31C, 0x1F31C, + 0x1F31D, 0x1F31E, + 0x1F31F, 0x1F320, + 0x1F321, 0x1F321, + 0x1F322, 0x1F323, + 0x1F324, 0x1F32C, + 0x1F32D, 0x1F32F, + 0x1F330, 0x1F331, + 0x1F332, 0x1F333, + 0x1F334, 0x1F335, + 0x1F336, 0x1F336, + 0x1F337, 0x1F34A, + 0x1F34B, 0x1F34B, + 0x1F34C, 0x1F34F, + 0x1F350, 0x1F350, + 0x1F351, 0x1F37B, + 0x1F37C, 0x1F37C, + 0x1F37D, 0x1F37D, + 0x1F37E, 0x1F37F, + 0x1F380, 0x1F393, + 0x1F394, 0x1F395, + 0x1F396, 0x1F397, + 0x1F398, 0x1F398, + 0x1F399, 0x1F39B, + 0x1F39C, 0x1F39D, + 0x1F39E, 0x1F39F, + 0x1F3A0, 0x1F3C4, + 0x1F3C5, 0x1F3C5, + 0x1F3C6, 0x1F3C6, + 0x1F3C7, 0x1F3C7, + 0x1F3C8, 0x1F3C8, + 0x1F3C9, 0x1F3C9, + 0x1F3CA, 0x1F3CA, + 0x1F3CB, 0x1F3CE, + 0x1F3CF, 0x1F3D3, + 0x1F3D4, 0x1F3DF, + 0x1F3E0, 0x1F3E3, + 0x1F3E4, 0x1F3E4, + 0x1F3E5, 0x1F3F0, + 0x1F3F1, 0x1F3F2, + 0x1F3F3, 0x1F3F3, + 0x1F3F4, 0x1F3F4, + 0x1F3F5, 0x1F3F5, + 0x1F3F6, 0x1F3F6, + 0x1F3F7, 0x1F3F7, + 0x1F3F8, 0x1F3FA, + 0x1F400, 0x1F407, + 0x1F408, 0x1F408, + 0x1F409, 0x1F40B, + 0x1F40C, 0x1F40E, + 0x1F40F, 0x1F410, + 0x1F411, 0x1F412, + 0x1F413, 0x1F413, + 0x1F414, 0x1F414, + 0x1F415, 0x1F415, + 0x1F416, 0x1F416, + 0x1F417, 0x1F429, + 0x1F42A, 0x1F42A, + 0x1F42B, 0x1F43E, + 0x1F43F, 0x1F43F, + 0x1F440, 0x1F440, + 0x1F441, 0x1F441, + 0x1F442, 0x1F464, + 0x1F465, 0x1F465, + 0x1F466, 0x1F46B, + 0x1F46C, 0x1F46D, + 0x1F46E, 0x1F4AC, + 0x1F4AD, 0x1F4AD, + 0x1F4AE, 0x1F4B5, + 0x1F4B6, 0x1F4B7, + 0x1F4B8, 0x1F4EB, + 0x1F4EC, 0x1F4ED, + 0x1F4EE, 0x1F4EE, + 0x1F4EF, 0x1F4EF, + 0x1F4F0, 0x1F4F4, + 0x1F4F5, 0x1F4F5, + 0x1F4F6, 0x1F4F7, + 0x1F4F8, 0x1F4F8, + 0x1F4F9, 0x1F4FC, + 0x1F4FD, 0x1F4FD, + 0x1F4FE, 0x1F4FE, + 0x1F4FF, 0x1F502, + 0x1F503, 0x1F503, + 0x1F504, 0x1F507, + 0x1F508, 0x1F508, + 0x1F509, 0x1F509, + 0x1F50A, 0x1F514, + 0x1F515, 0x1F515, + 0x1F516, 0x1F52B, + 0x1F52C, 0x1F52D, + 0x1F52E, 0x1F53D, + 0x1F546, 0x1F548, + 0x1F549, 0x1F54A, + 0x1F54B, 0x1F54E, + 0x1F54F, 0x1F54F, + 0x1F550, 0x1F55B, + 0x1F55C, 0x1F567, + 0x1F568, 0x1F56E, + 0x1F56F, 0x1F570, + 0x1F571, 0x1F572, + 0x1F573, 0x1F579, + 0x1F57A, 0x1F57A, + 0x1F57B, 0x1F586, + 0x1F587, 0x1F587, + 0x1F588, 0x1F589, + 0x1F58A, 0x1F58D, + 0x1F58E, 0x1F58F, + 0x1F590, 0x1F590, + 0x1F591, 0x1F594, + 0x1F595, 0x1F596, + 0x1F597, 0x1F5A3, + 0x1F5A4, 0x1F5A4, + 0x1F5A5, 0x1F5A5, + 0x1F5A6, 0x1F5A7, + 0x1F5A8, 0x1F5A8, + 0x1F5A9, 0x1F5B0, + 0x1F5B1, 0x1F5B2, + 0x1F5B3, 0x1F5BB, + 0x1F5BC, 0x1F5BC, + 0x1F5BD, 0x1F5C1, + 0x1F5C2, 0x1F5C4, + 0x1F5C5, 0x1F5D0, + 0x1F5D1, 0x1F5D3, + 0x1F5D4, 0x1F5DB, + 0x1F5DC, 0x1F5DE, + 0x1F5DF, 0x1F5E0, + 0x1F5E1, 0x1F5E1, + 0x1F5E2, 0x1F5E2, + 0x1F5E3, 0x1F5E3, + 0x1F5E4, 0x1F5E7, + 0x1F5E8, 0x1F5E8, + 0x1F5E9, 0x1F5EE, + 0x1F5EF, 0x1F5EF, + 0x1F5F0, 0x1F5F2, + 0x1F5F3, 0x1F5F3, + 0x1F5F4, 0x1F5F9, + 0x1F5FA, 0x1F5FA, + 0x1F5FB, 0x1F5FF, + 0x1F600, 0x1F600, + 0x1F601, 0x1F606, + 0x1F607, 0x1F608, + 0x1F609, 0x1F60D, + 0x1F60E, 0x1F60E, + 0x1F60F, 0x1F60F, + 0x1F610, 0x1F610, + 0x1F611, 0x1F611, + 0x1F612, 0x1F614, + 0x1F615, 0x1F615, + 0x1F616, 0x1F616, + 0x1F617, 0x1F617, + 0x1F618, 0x1F618, + 0x1F619, 0x1F619, + 0x1F61A, 0x1F61A, + 0x1F61B, 0x1F61B, + 0x1F61C, 0x1F61E, + 0x1F61F, 0x1F61F, + 0x1F620, 0x1F625, + 0x1F626, 0x1F627, + 0x1F628, 0x1F62B, + 0x1F62C, 0x1F62C, + 0x1F62D, 0x1F62D, + 0x1F62E, 0x1F62F, + 0x1F630, 0x1F633, + 0x1F634, 0x1F634, + 0x1F635, 0x1F635, + 0x1F636, 0x1F636, + 0x1F637, 0x1F640, + 0x1F641, 0x1F644, + 0x1F645, 0x1F64F, + 0x1F680, 0x1F680, + 0x1F681, 0x1F682, + 0x1F683, 0x1F685, + 0x1F686, 0x1F686, + 0x1F687, 0x1F687, + 0x1F688, 0x1F688, + 0x1F689, 0x1F689, + 0x1F68A, 0x1F68B, + 0x1F68C, 0x1F68C, + 0x1F68D, 0x1F68D, + 0x1F68E, 0x1F68E, + 0x1F68F, 0x1F68F, + 0x1F690, 0x1F690, + 0x1F691, 0x1F693, + 0x1F694, 0x1F694, + 0x1F695, 0x1F695, + 0x1F696, 0x1F696, + 0x1F697, 0x1F697, + 0x1F698, 0x1F698, + 0x1F699, 0x1F69A, + 0x1F69B, 0x1F6A1, + 0x1F6A2, 0x1F6A2, + 0x1F6A3, 0x1F6A3, + 0x1F6A4, 0x1F6A5, + 0x1F6A6, 0x1F6A6, + 0x1F6A7, 0x1F6AD, + 0x1F6AE, 0x1F6B1, + 0x1F6B2, 0x1F6B2, + 0x1F6B3, 0x1F6B5, + 0x1F6B6, 0x1F6B6, + 0x1F6B7, 0x1F6B8, + 0x1F6B9, 0x1F6BE, + 0x1F6BF, 0x1F6BF, + 0x1F6C0, 0x1F6C0, + 0x1F6C1, 0x1F6C5, + 0x1F6C6, 0x1F6CA, + 0x1F6CB, 0x1F6CB, + 0x1F6CC, 0x1F6CC, + 0x1F6CD, 0x1F6CF, + 0x1F6D0, 0x1F6D0, + 0x1F6D1, 0x1F6D2, + 0x1F6D3, 0x1F6D4, + 0x1F6D5, 0x1F6D5, + 0x1F6D6, 0x1F6D7, + 0x1F6D8, 0x1F6DB, + 0x1F6DC, 0x1F6DC, + 0x1F6DD, 0x1F6DF, + 0x1F6E0, 0x1F6E5, + 0x1F6E6, 0x1F6E8, + 0x1F6E9, 0x1F6E9, + 0x1F6EA, 0x1F6EA, + 0x1F6EB, 0x1F6EC, + 0x1F6ED, 0x1F6EF, + 0x1F6F0, 0x1F6F0, + 0x1F6F1, 0x1F6F2, + 0x1F6F3, 0x1F6F3, + 0x1F6F4, 0x1F6F6, + 0x1F6F7, 0x1F6F8, + 0x1F6F9, 0x1F6F9, + 0x1F6FA, 0x1F6FA, + 0x1F6FB, 0x1F6FC, + 0x1F6FD, 0x1F6FF, + 0x1F774, 0x1F77F, + 0x1F7D5, 0x1F7DF, + 0x1F7E0, 0x1F7EB, + 0x1F7EC, 0x1F7EF, + 0x1F7F0, 0x1F7F0, + 0x1F7F1, 0x1F7FF, + 0x1F80C, 0x1F80F, + 0x1F848, 0x1F84F, + 0x1F85A, 0x1F85F, + 0x1F888, 0x1F88F, + 0x1F8AE, 0x1F8FF, + 0x1F90C, 0x1F90C, + 0x1F90D, 0x1F90F, + 0x1F910, 0x1F918, + 0x1F919, 0x1F91E, + 0x1F91F, 0x1F91F, + 0x1F920, 0x1F927, + 0x1F928, 0x1F92F, + 0x1F930, 0x1F930, + 0x1F931, 0x1F932, + 0x1F933, 0x1F93A, + 0x1F93C, 0x1F93E, + 0x1F93F, 0x1F93F, + 0x1F940, 0x1F945, + 0x1F947, 0x1F94B, + 0x1F94C, 0x1F94C, + 0x1F94D, 0x1F94F, + 0x1F950, 0x1F95E, + 0x1F95F, 0x1F96B, + 0x1F96C, 0x1F970, + 0x1F971, 0x1F971, + 0x1F972, 0x1F972, + 0x1F973, 0x1F976, + 0x1F977, 0x1F978, + 0x1F979, 0x1F979, + 0x1F97A, 0x1F97A, + 0x1F97B, 0x1F97B, + 0x1F97C, 0x1F97F, + 0x1F980, 0x1F984, + 0x1F985, 0x1F991, + 0x1F992, 0x1F997, + 0x1F998, 0x1F9A2, + 0x1F9A3, 0x1F9A4, + 0x1F9A5, 0x1F9AA, + 0x1F9AB, 0x1F9AD, + 0x1F9AE, 0x1F9AF, + 0x1F9B0, 0x1F9B9, + 0x1F9BA, 0x1F9BF, + 0x1F9C0, 0x1F9C0, + 0x1F9C1, 0x1F9C2, + 0x1F9C3, 0x1F9CA, + 0x1F9CB, 0x1F9CB, + 0x1F9CC, 0x1F9CC, + 0x1F9CD, 0x1F9CF, + 0x1F9D0, 0x1F9E6, + 0x1F9E7, 0x1F9FF, + 0x1FA00, 0x1FA6F, + 0x1FA70, 0x1FA73, + 0x1FA74, 0x1FA74, + 0x1FA75, 0x1FA77, + 0x1FA78, 0x1FA7A, + 0x1FA7B, 0x1FA7C, + 0x1FA7D, 0x1FA7F, + 0x1FA80, 0x1FA82, + 0x1FA83, 0x1FA86, + 0x1FA87, 0x1FA88, + 0x1FA89, 0x1FA8F, + 0x1FA90, 0x1FA95, + 0x1FA96, 0x1FAA8, + 0x1FAA9, 0x1FAAC, + 0x1FAAD, 0x1FAAF, + 0x1FAB0, 0x1FAB6, + 0x1FAB7, 0x1FABA, + 0x1FABB, 0x1FABD, + 0x1FABE, 0x1FABE, + 0x1FABF, 0x1FABF, + 0x1FAC0, 0x1FAC2, + 0x1FAC3, 0x1FAC5, + 0x1FAC6, 0x1FACD, + 0x1FACE, 0x1FACF, + 0x1FAD0, 0x1FAD6, + 0x1FAD7, 0x1FAD9, + 0x1FADA, 0x1FADB, + 0x1FADC, 0x1FADF, + 0x1FAE0, 0x1FAE7, + 0x1FAE8, 0x1FAE8, + 0x1FAE9, 0x1FAEF, + 0x1FAF0, 0x1FAF6, + 0x1FAF7, 0x1FAF8, + 0x1FAF9, 0x1FAFF, + 0x1FC00, 0x1FFFD, +}; + +static const int32_t ucg_grapheme_extend_ranges[] = { + 0x0300, 0x036F, + 0x0483, 0x0487, + 0x0488, 0x0489, + 0x0591, 0x05BD, + 0x05BF, 0x05BF, + 0x05C1, 0x05C2, + 0x05C4, 0x05C5, + 0x05C7, 0x05C7, + 0x0610, 0x061A, + 0x064B, 0x065F, + 0x0670, 0x0670, + 0x06D6, 0x06DC, + 0x06DF, 0x06E4, + 0x06E7, 0x06E8, + 0x06EA, 0x06ED, + 0x0711, 0x0711, + 0x0730, 0x074A, + 0x07A6, 0x07B0, + 0x07EB, 0x07F3, + 0x07FD, 0x07FD, + 0x0816, 0x0819, + 0x081B, 0x0823, + 0x0825, 0x0827, + 0x0829, 0x082D, + 0x0859, 0x085B, + 0x0898, 0x089F, + 0x08CA, 0x08E1, + 0x08E3, 0x0902, + 0x093A, 0x093A, + 0x093C, 0x093C, + 0x0941, 0x0948, + 0x094D, 0x094D, + 0x0951, 0x0957, + 0x0962, 0x0963, + 0x0981, 0x0981, + 0x09BC, 0x09BC, + 0x09BE, 0x09BE, + 0x09C1, 0x09C4, + 0x09CD, 0x09CD, + 0x09D7, 0x09D7, + 0x09E2, 0x09E3, + 0x09FE, 0x09FE, + 0x0A01, 0x0A02, + 0x0A3C, 0x0A3C, + 0x0A41, 0x0A42, + 0x0A47, 0x0A48, + 0x0A4B, 0x0A4D, + 0x0A51, 0x0A51, + 0x0A70, 0x0A71, + 0x0A75, 0x0A75, + 0x0A81, 0x0A82, + 0x0ABC, 0x0ABC, + 0x0AC1, 0x0AC5, + 0x0AC7, 0x0AC8, + 0x0ACD, 0x0ACD, + 0x0AE2, 0x0AE3, + 0x0AFA, 0x0AFF, + 0x0B01, 0x0B01, + 0x0B3C, 0x0B3C, + 0x0B3E, 0x0B3E, + 0x0B3F, 0x0B3F, + 0x0B41, 0x0B44, + 0x0B4D, 0x0B4D, + 0x0B55, 0x0B56, + 0x0B57, 0x0B57, + 0x0B62, 0x0B63, + 0x0B82, 0x0B82, + 0x0BBE, 0x0BBE, + 0x0BC0, 0x0BC0, + 0x0BCD, 0x0BCD, + 0x0BD7, 0x0BD7, + 0x0C00, 0x0C00, + 0x0C04, 0x0C04, + 0x0C3C, 0x0C3C, + 0x0C3E, 0x0C40, + 0x0C46, 0x0C48, + 0x0C4A, 0x0C4D, + 0x0C55, 0x0C56, + 0x0C62, 0x0C63, + 0x0C81, 0x0C81, + 0x0CBC, 0x0CBC, + 0x0CBF, 0x0CBF, + 0x0CC2, 0x0CC2, + 0x0CC6, 0x0CC6, + 0x0CCC, 0x0CCD, + 0x0CD5, 0x0CD6, + 0x0CE2, 0x0CE3, + 0x0D00, 0x0D01, + 0x0D3B, 0x0D3C, + 0x0D3E, 0x0D3E, + 0x0D41, 0x0D44, + 0x0D4D, 0x0D4D, + 0x0D57, 0x0D57, + 0x0D62, 0x0D63, + 0x0D81, 0x0D81, + 0x0DCA, 0x0DCA, + 0x0DCF, 0x0DCF, + 0x0DD2, 0x0DD4, + 0x0DD6, 0x0DD6, + 0x0DDF, 0x0DDF, + 0x0E31, 0x0E31, + 0x0E34, 0x0E3A, + 0x0E47, 0x0E4E, + 0x0EB1, 0x0EB1, + 0x0EB4, 0x0EBC, + 0x0EC8, 0x0ECE, + 0x0F18, 0x0F19, + 0x0F35, 0x0F35, + 0x0F37, 0x0F37, + 0x0F39, 0x0F39, + 0x0F71, 0x0F7E, + 0x0F80, 0x0F84, + 0x0F86, 0x0F87, + 0x0F8D, 0x0F97, + 0x0F99, 0x0FBC, + 0x0FC6, 0x0FC6, + 0x102D, 0x1030, + 0x1032, 0x1037, + 0x1039, 0x103A, + 0x103D, 0x103E, + 0x1058, 0x1059, + 0x105E, 0x1060, + 0x1071, 0x1074, + 0x1082, 0x1082, + 0x1085, 0x1086, + 0x108D, 0x108D, + 0x109D, 0x109D, + 0x135D, 0x135F, + 0x1712, 0x1714, + 0x1732, 0x1733, + 0x1752, 0x1753, + 0x1772, 0x1773, + 0x17B4, 0x17B5, + 0x17B7, 0x17BD, + 0x17C6, 0x17C6, + 0x17C9, 0x17D3, + 0x17DD, 0x17DD, + 0x180B, 0x180D, + 0x180F, 0x180F, + 0x1885, 0x1886, + 0x18A9, 0x18A9, + 0x1920, 0x1922, + 0x1927, 0x1928, + 0x1932, 0x1932, + 0x1939, 0x193B, + 0x1A17, 0x1A18, + 0x1A1B, 0x1A1B, + 0x1A56, 0x1A56, + 0x1A58, 0x1A5E, + 0x1A60, 0x1A60, + 0x1A62, 0x1A62, + 0x1A65, 0x1A6C, + 0x1A73, 0x1A7C, + 0x1A7F, 0x1A7F, + 0x1AB0, 0x1ABD, + 0x1ABE, 0x1ABE, + 0x1ABF, 0x1ACE, + 0x1B00, 0x1B03, + 0x1B34, 0x1B34, + 0x1B35, 0x1B35, + 0x1B36, 0x1B3A, + 0x1B3C, 0x1B3C, + 0x1B42, 0x1B42, + 0x1B6B, 0x1B73, + 0x1B80, 0x1B81, + 0x1BA2, 0x1BA5, + 0x1BA8, 0x1BA9, + 0x1BAB, 0x1BAD, + 0x1BE6, 0x1BE6, + 0x1BE8, 0x1BE9, + 0x1BED, 0x1BED, + 0x1BEF, 0x1BF1, + 0x1C2C, 0x1C33, + 0x1C36, 0x1C37, + 0x1CD0, 0x1CD2, + 0x1CD4, 0x1CE0, + 0x1CE2, 0x1CE8, + 0x1CED, 0x1CED, + 0x1CF4, 0x1CF4, + 0x1CF8, 0x1CF9, + 0x1DC0, 0x1DFF, + 0x200C, 0x200C, + 0x20D0, 0x20DC, + 0x20DD, 0x20E0, + 0x20E1, 0x20E1, + 0x20E2, 0x20E4, + 0x20E5, 0x20F0, + 0x2CEF, 0x2CF1, + 0x2D7F, 0x2D7F, + 0x2DE0, 0x2DFF, + 0x302A, 0x302D, + 0x302E, 0x302F, + 0x3099, 0x309A, + 0xA66F, 0xA66F, + 0xA670, 0xA672, + 0xA674, 0xA67D, + 0xA69E, 0xA69F, + 0xA6F0, 0xA6F1, + 0xA802, 0xA802, + 0xA806, 0xA806, + 0xA80B, 0xA80B, + 0xA825, 0xA826, + 0xA82C, 0xA82C, + 0xA8C4, 0xA8C5, + 0xA8E0, 0xA8F1, + 0xA8FF, 0xA8FF, + 0xA926, 0xA92D, + 0xA947, 0xA951, + 0xA980, 0xA982, + 0xA9B3, 0xA9B3, + 0xA9B6, 0xA9B9, + 0xA9BC, 0xA9BD, + 0xA9E5, 0xA9E5, + 0xAA29, 0xAA2E, + 0xAA31, 0xAA32, + 0xAA35, 0xAA36, + 0xAA43, 0xAA43, + 0xAA4C, 0xAA4C, + 0xAA7C, 0xAA7C, + 0xAAB0, 0xAAB0, + 0xAAB2, 0xAAB4, + 0xAAB7, 0xAAB8, + 0xAABE, 0xAABF, + 0xAAC1, 0xAAC1, + 0xAAEC, 0xAAED, + 0xAAF6, 0xAAF6, + 0xABE5, 0xABE5, + 0xABE8, 0xABE8, + 0xABED, 0xABED, + 0xFB1E, 0xFB1E, + 0xFE00, 0xFE0F, + 0xFE20, 0xFE2F, + 0xFF9E, 0xFF9F, + 0x101FD, 0x101FD, + 0x102E0, 0x102E0, + 0x10376, 0x1037A, + 0x10A01, 0x10A03, + 0x10A05, 0x10A06, + 0x10A0C, 0x10A0F, + 0x10A38, 0x10A3A, + 0x10A3F, 0x10A3F, + 0x10AE5, 0x10AE6, + 0x10D24, 0x10D27, + 0x10EAB, 0x10EAC, + 0x10EFD, 0x10EFF, + 0x10F46, 0x10F50, + 0x10F82, 0x10F85, + 0x11001, 0x11001, + 0x11038, 0x11046, + 0x11070, 0x11070, + 0x11073, 0x11074, + 0x1107F, 0x11081, + 0x110B3, 0x110B6, + 0x110B9, 0x110BA, + 0x110C2, 0x110C2, + 0x11100, 0x11102, + 0x11127, 0x1112B, + 0x1112D, 0x11134, + 0x11173, 0x11173, + 0x11180, 0x11181, + 0x111B6, 0x111BE, + 0x111C9, 0x111CC, + 0x111CF, 0x111CF, + 0x1122F, 0x11231, + 0x11234, 0x11234, + 0x11236, 0x11237, + 0x1123E, 0x1123E, + 0x11241, 0x11241, + 0x112DF, 0x112DF, + 0x112E3, 0x112EA, + 0x11300, 0x11301, + 0x1133B, 0x1133C, + 0x1133E, 0x1133E, + 0x11340, 0x11340, + 0x11357, 0x11357, + 0x11366, 0x1136C, + 0x11370, 0x11374, + 0x11438, 0x1143F, + 0x11442, 0x11444, + 0x11446, 0x11446, + 0x1145E, 0x1145E, + 0x114B0, 0x114B0, + 0x114B3, 0x114B8, + 0x114BA, 0x114BA, + 0x114BD, 0x114BD, + 0x114BF, 0x114C0, + 0x114C2, 0x114C3, + 0x115AF, 0x115AF, + 0x115B2, 0x115B5, + 0x115BC, 0x115BD, + 0x115BF, 0x115C0, + 0x115DC, 0x115DD, + 0x11633, 0x1163A, + 0x1163D, 0x1163D, + 0x1163F, 0x11640, + 0x116AB, 0x116AB, + 0x116AD, 0x116AD, + 0x116B0, 0x116B5, + 0x116B7, 0x116B7, + 0x1171D, 0x1171F, + 0x11722, 0x11725, + 0x11727, 0x1172B, + 0x1182F, 0x11837, + 0x11839, 0x1183A, + 0x11930, 0x11930, + 0x1193B, 0x1193C, + 0x1193E, 0x1193E, + 0x11943, 0x11943, + 0x119D4, 0x119D7, + 0x119DA, 0x119DB, + 0x119E0, 0x119E0, + 0x11A01, 0x11A0A, + 0x11A33, 0x11A38, + 0x11A3B, 0x11A3E, + 0x11A47, 0x11A47, + 0x11A51, 0x11A56, + 0x11A59, 0x11A5B, + 0x11A8A, 0x11A96, + 0x11A98, 0x11A99, + 0x11C30, 0x11C36, + 0x11C38, 0x11C3D, + 0x11C3F, 0x11C3F, + 0x11C92, 0x11CA7, + 0x11CAA, 0x11CB0, + 0x11CB2, 0x11CB3, + 0x11CB5, 0x11CB6, + 0x11D31, 0x11D36, + 0x11D3A, 0x11D3A, + 0x11D3C, 0x11D3D, + 0x11D3F, 0x11D45, + 0x11D47, 0x11D47, + 0x11D90, 0x11D91, + 0x11D95, 0x11D95, + 0x11D97, 0x11D97, + 0x11EF3, 0x11EF4, + 0x11F00, 0x11F01, + 0x11F36, 0x11F3A, + 0x11F40, 0x11F40, + 0x11F42, 0x11F42, + 0x13440, 0x13440, + 0x13447, 0x13455, + 0x16AF0, 0x16AF4, + 0x16B30, 0x16B36, + 0x16F4F, 0x16F4F, + 0x16F8F, 0x16F92, + 0x16FE4, 0x16FE4, + 0x1BC9D, 0x1BC9E, + 0x1CF00, 0x1CF2D, + 0x1CF30, 0x1CF46, + 0x1D165, 0x1D165, + 0x1D167, 0x1D169, + 0x1D16E, 0x1D172, + 0x1D17B, 0x1D182, + 0x1D185, 0x1D18B, + 0x1D1AA, 0x1D1AD, + 0x1D242, 0x1D244, + 0x1DA00, 0x1DA36, + 0x1DA3B, 0x1DA6C, + 0x1DA75, 0x1DA75, + 0x1DA84, 0x1DA84, + 0x1DA9B, 0x1DA9F, + 0x1DAA1, 0x1DAAF, + 0x1E000, 0x1E006, + 0x1E008, 0x1E018, + 0x1E01B, 0x1E021, + 0x1E023, 0x1E024, + 0x1E026, 0x1E02A, + 0x1E08F, 0x1E08F, + 0x1E130, 0x1E136, + 0x1E2AE, 0x1E2AE, + 0x1E2EC, 0x1E2EF, + 0x1E4EC, 0x1E4EF, + 0x1E8D0, 0x1E8D6, + 0x1E944, 0x1E94A, + 0xE0020, 0xE007F, + 0xE0100, 0xE01EF, +}; + +static const int32_t ucg_hangul_syllable_lv_singlets[] = { + 0xAC00, + 0xAC1C, + 0xAC38, + 0xAC54, + 0xAC70, + 0xAC8C, + 0xACA8, + 0xACC4, + 0xACE0, + 0xACFC, + 0xAD18, + 0xAD34, + 0xAD50, + 0xAD6C, + 0xAD88, + 0xADA4, + 0xADC0, + 0xADDC, + 0xADF8, + 0xAE14, + 0xAE30, + 0xAE4C, + 0xAE68, + 0xAE84, + 0xAEA0, + 0xAEBC, + 0xAED8, + 0xAEF4, + 0xAF10, + 0xAF2C, + 0xAF48, + 0xAF64, + 0xAF80, + 0xAF9C, + 0xAFB8, + 0xAFD4, + 0xAFF0, + 0xB00C, + 0xB028, + 0xB044, + 0xB060, + 0xB07C, + 0xB098, + 0xB0B4, + 0xB0D0, + 0xB0EC, + 0xB108, + 0xB124, + 0xB140, + 0xB15C, + 0xB178, + 0xB194, + 0xB1B0, + 0xB1CC, + 0xB1E8, + 0xB204, + 0xB220, + 0xB23C, + 0xB258, + 0xB274, + 0xB290, + 0xB2AC, + 0xB2C8, + 0xB2E4, + 0xB300, + 0xB31C, + 0xB338, + 0xB354, + 0xB370, + 0xB38C, + 0xB3A8, + 0xB3C4, + 0xB3E0, + 0xB3FC, + 0xB418, + 0xB434, + 0xB450, + 0xB46C, + 0xB488, + 0xB4A4, + 0xB4C0, + 0xB4DC, + 0xB4F8, + 0xB514, + 0xB530, + 0xB54C, + 0xB568, + 0xB584, + 0xB5A0, + 0xB5BC, + 0xB5D8, + 0xB5F4, + 0xB610, + 0xB62C, + 0xB648, + 0xB664, + 0xB680, + 0xB69C, + 0xB6B8, + 0xB6D4, + 0xB6F0, + 0xB70C, + 0xB728, + 0xB744, + 0xB760, + 0xB77C, + 0xB798, + 0xB7B4, + 0xB7D0, + 0xB7EC, + 0xB808, + 0xB824, + 0xB840, + 0xB85C, + 0xB878, + 0xB894, + 0xB8B0, + 0xB8CC, + 0xB8E8, + 0xB904, + 0xB920, + 0xB93C, + 0xB958, + 0xB974, + 0xB990, + 0xB9AC, + 0xB9C8, + 0xB9E4, + 0xBA00, + 0xBA1C, + 0xBA38, + 0xBA54, + 0xBA70, + 0xBA8C, + 0xBAA8, + 0xBAC4, + 0xBAE0, + 0xBAFC, + 0xBB18, + 0xBB34, + 0xBB50, + 0xBB6C, + 0xBB88, + 0xBBA4, + 0xBBC0, + 0xBBDC, + 0xBBF8, + 0xBC14, + 0xBC30, + 0xBC4C, + 0xBC68, + 0xBC84, + 0xBCA0, + 0xBCBC, + 0xBCD8, + 0xBCF4, + 0xBD10, + 0xBD2C, + 0xBD48, + 0xBD64, + 0xBD80, + 0xBD9C, + 0xBDB8, + 0xBDD4, + 0xBDF0, + 0xBE0C, + 0xBE28, + 0xBE44, + 0xBE60, + 0xBE7C, + 0xBE98, + 0xBEB4, + 0xBED0, + 0xBEEC, + 0xBF08, + 0xBF24, + 0xBF40, + 0xBF5C, + 0xBF78, + 0xBF94, + 0xBFB0, + 0xBFCC, + 0xBFE8, + 0xC004, + 0xC020, + 0xC03C, + 0xC058, + 0xC074, + 0xC090, + 0xC0AC, + 0xC0C8, + 0xC0E4, + 0xC100, + 0xC11C, + 0xC138, + 0xC154, + 0xC170, + 0xC18C, + 0xC1A8, + 0xC1C4, + 0xC1E0, + 0xC1FC, + 0xC218, + 0xC234, + 0xC250, + 0xC26C, + 0xC288, + 0xC2A4, + 0xC2C0, + 0xC2DC, + 0xC2F8, + 0xC314, + 0xC330, + 0xC34C, + 0xC368, + 0xC384, + 0xC3A0, + 0xC3BC, + 0xC3D8, + 0xC3F4, + 0xC410, + 0xC42C, + 0xC448, + 0xC464, + 0xC480, + 0xC49C, + 0xC4B8, + 0xC4D4, + 0xC4F0, + 0xC50C, + 0xC528, + 0xC544, + 0xC560, + 0xC57C, + 0xC598, + 0xC5B4, + 0xC5D0, + 0xC5EC, + 0xC608, + 0xC624, + 0xC640, + 0xC65C, + 0xC678, + 0xC694, + 0xC6B0, + 0xC6CC, + 0xC6E8, + 0xC704, + 0xC720, + 0xC73C, + 0xC758, + 0xC774, + 0xC790, + 0xC7AC, + 0xC7C8, + 0xC7E4, + 0xC800, + 0xC81C, + 0xC838, + 0xC854, + 0xC870, + 0xC88C, + 0xC8A8, + 0xC8C4, + 0xC8E0, + 0xC8FC, + 0xC918, + 0xC934, + 0xC950, + 0xC96C, + 0xC988, + 0xC9A4, + 0xC9C0, + 0xC9DC, + 0xC9F8, + 0xCA14, + 0xCA30, + 0xCA4C, + 0xCA68, + 0xCA84, + 0xCAA0, + 0xCABC, + 0xCAD8, + 0xCAF4, + 0xCB10, + 0xCB2C, + 0xCB48, + 0xCB64, + 0xCB80, + 0xCB9C, + 0xCBB8, + 0xCBD4, + 0xCBF0, + 0xCC0C, + 0xCC28, + 0xCC44, + 0xCC60, + 0xCC7C, + 0xCC98, + 0xCCB4, + 0xCCD0, + 0xCCEC, + 0xCD08, + 0xCD24, + 0xCD40, + 0xCD5C, + 0xCD78, + 0xCD94, + 0xCDB0, + 0xCDCC, + 0xCDE8, + 0xCE04, + 0xCE20, + 0xCE3C, + 0xCE58, + 0xCE74, + 0xCE90, + 0xCEAC, + 0xCEC8, + 0xCEE4, + 0xCF00, + 0xCF1C, + 0xCF38, + 0xCF54, + 0xCF70, + 0xCF8C, + 0xCFA8, + 0xCFC4, + 0xCFE0, + 0xCFFC, + 0xD018, + 0xD034, + 0xD050, + 0xD06C, + 0xD088, + 0xD0A4, + 0xD0C0, + 0xD0DC, + 0xD0F8, + 0xD114, + 0xD130, + 0xD14C, + 0xD168, + 0xD184, + 0xD1A0, + 0xD1BC, + 0xD1D8, + 0xD1F4, + 0xD210, + 0xD22C, + 0xD248, + 0xD264, + 0xD280, + 0xD29C, + 0xD2B8, + 0xD2D4, + 0xD2F0, + 0xD30C, + 0xD328, + 0xD344, + 0xD360, + 0xD37C, + 0xD398, + 0xD3B4, + 0xD3D0, + 0xD3EC, + 0xD408, + 0xD424, + 0xD440, + 0xD45C, + 0xD478, + 0xD494, + 0xD4B0, + 0xD4CC, + 0xD4E8, + 0xD504, + 0xD520, + 0xD53C, + 0xD558, + 0xD574, + 0xD590, + 0xD5AC, + 0xD5C8, + 0xD5E4, + 0xD600, + 0xD61C, + 0xD638, + 0xD654, + 0xD670, + 0xD68C, + 0xD6A8, + 0xD6C4, + 0xD6E0, + 0xD6FC, + 0xD718, + 0xD734, + 0xD750, + 0xD76C, + 0xD788, +}; + +static const int32_t ucg_hangul_syllable_lvt_ranges[] = { + 0xAC01, 0xAC1B, + 0xAC1D, 0xAC37, + 0xAC39, 0xAC53, + 0xAC55, 0xAC6F, + 0xAC71, 0xAC8B, + 0xAC8D, 0xACA7, + 0xACA9, 0xACC3, + 0xACC5, 0xACDF, + 0xACE1, 0xACFB, + 0xACFD, 0xAD17, + 0xAD19, 0xAD33, + 0xAD35, 0xAD4F, + 0xAD51, 0xAD6B, + 0xAD6D, 0xAD87, + 0xAD89, 0xADA3, + 0xADA5, 0xADBF, + 0xADC1, 0xADDB, + 0xADDD, 0xADF7, + 0xADF9, 0xAE13, + 0xAE15, 0xAE2F, + 0xAE31, 0xAE4B, + 0xAE4D, 0xAE67, + 0xAE69, 0xAE83, + 0xAE85, 0xAE9F, + 0xAEA1, 0xAEBB, + 0xAEBD, 0xAED7, + 0xAED9, 0xAEF3, + 0xAEF5, 0xAF0F, + 0xAF11, 0xAF2B, + 0xAF2D, 0xAF47, + 0xAF49, 0xAF63, + 0xAF65, 0xAF7F, + 0xAF81, 0xAF9B, + 0xAF9D, 0xAFB7, + 0xAFB9, 0xAFD3, + 0xAFD5, 0xAFEF, + 0xAFF1, 0xB00B, + 0xB00D, 0xB027, + 0xB029, 0xB043, + 0xB045, 0xB05F, + 0xB061, 0xB07B, + 0xB07D, 0xB097, + 0xB099, 0xB0B3, + 0xB0B5, 0xB0CF, + 0xB0D1, 0xB0EB, + 0xB0ED, 0xB107, + 0xB109, 0xB123, + 0xB125, 0xB13F, + 0xB141, 0xB15B, + 0xB15D, 0xB177, + 0xB179, 0xB193, + 0xB195, 0xB1AF, + 0xB1B1, 0xB1CB, + 0xB1CD, 0xB1E7, + 0xB1E9, 0xB203, + 0xB205, 0xB21F, + 0xB221, 0xB23B, + 0xB23D, 0xB257, + 0xB259, 0xB273, + 0xB275, 0xB28F, + 0xB291, 0xB2AB, + 0xB2AD, 0xB2C7, + 0xB2C9, 0xB2E3, + 0xB2E5, 0xB2FF, + 0xB301, 0xB31B, + 0xB31D, 0xB337, + 0xB339, 0xB353, + 0xB355, 0xB36F, + 0xB371, 0xB38B, + 0xB38D, 0xB3A7, + 0xB3A9, 0xB3C3, + 0xB3C5, 0xB3DF, + 0xB3E1, 0xB3FB, + 0xB3FD, 0xB417, + 0xB419, 0xB433, + 0xB435, 0xB44F, + 0xB451, 0xB46B, + 0xB46D, 0xB487, + 0xB489, 0xB4A3, + 0xB4A5, 0xB4BF, + 0xB4C1, 0xB4DB, + 0xB4DD, 0xB4F7, + 0xB4F9, 0xB513, + 0xB515, 0xB52F, + 0xB531, 0xB54B, + 0xB54D, 0xB567, + 0xB569, 0xB583, + 0xB585, 0xB59F, + 0xB5A1, 0xB5BB, + 0xB5BD, 0xB5D7, + 0xB5D9, 0xB5F3, + 0xB5F5, 0xB60F, + 0xB611, 0xB62B, + 0xB62D, 0xB647, + 0xB649, 0xB663, + 0xB665, 0xB67F, + 0xB681, 0xB69B, + 0xB69D, 0xB6B7, + 0xB6B9, 0xB6D3, + 0xB6D5, 0xB6EF, + 0xB6F1, 0xB70B, + 0xB70D, 0xB727, + 0xB729, 0xB743, + 0xB745, 0xB75F, + 0xB761, 0xB77B, + 0xB77D, 0xB797, + 0xB799, 0xB7B3, + 0xB7B5, 0xB7CF, + 0xB7D1, 0xB7EB, + 0xB7ED, 0xB807, + 0xB809, 0xB823, + 0xB825, 0xB83F, + 0xB841, 0xB85B, + 0xB85D, 0xB877, + 0xB879, 0xB893, + 0xB895, 0xB8AF, + 0xB8B1, 0xB8CB, + 0xB8CD, 0xB8E7, + 0xB8E9, 0xB903, + 0xB905, 0xB91F, + 0xB921, 0xB93B, + 0xB93D, 0xB957, + 0xB959, 0xB973, + 0xB975, 0xB98F, + 0xB991, 0xB9AB, + 0xB9AD, 0xB9C7, + 0xB9C9, 0xB9E3, + 0xB9E5, 0xB9FF, + 0xBA01, 0xBA1B, + 0xBA1D, 0xBA37, + 0xBA39, 0xBA53, + 0xBA55, 0xBA6F, + 0xBA71, 0xBA8B, + 0xBA8D, 0xBAA7, + 0xBAA9, 0xBAC3, + 0xBAC5, 0xBADF, + 0xBAE1, 0xBAFB, + 0xBAFD, 0xBB17, + 0xBB19, 0xBB33, + 0xBB35, 0xBB4F, + 0xBB51, 0xBB6B, + 0xBB6D, 0xBB87, + 0xBB89, 0xBBA3, + 0xBBA5, 0xBBBF, + 0xBBC1, 0xBBDB, + 0xBBDD, 0xBBF7, + 0xBBF9, 0xBC13, + 0xBC15, 0xBC2F, + 0xBC31, 0xBC4B, + 0xBC4D, 0xBC67, + 0xBC69, 0xBC83, + 0xBC85, 0xBC9F, + 0xBCA1, 0xBCBB, + 0xBCBD, 0xBCD7, + 0xBCD9, 0xBCF3, + 0xBCF5, 0xBD0F, + 0xBD11, 0xBD2B, + 0xBD2D, 0xBD47, + 0xBD49, 0xBD63, + 0xBD65, 0xBD7F, + 0xBD81, 0xBD9B, + 0xBD9D, 0xBDB7, + 0xBDB9, 0xBDD3, + 0xBDD5, 0xBDEF, + 0xBDF1, 0xBE0B, + 0xBE0D, 0xBE27, + 0xBE29, 0xBE43, + 0xBE45, 0xBE5F, + 0xBE61, 0xBE7B, + 0xBE7D, 0xBE97, + 0xBE99, 0xBEB3, + 0xBEB5, 0xBECF, + 0xBED1, 0xBEEB, + 0xBEED, 0xBF07, + 0xBF09, 0xBF23, + 0xBF25, 0xBF3F, + 0xBF41, 0xBF5B, + 0xBF5D, 0xBF77, + 0xBF79, 0xBF93, + 0xBF95, 0xBFAF, + 0xBFB1, 0xBFCB, + 0xBFCD, 0xBFE7, + 0xBFE9, 0xC003, + 0xC005, 0xC01F, + 0xC021, 0xC03B, + 0xC03D, 0xC057, + 0xC059, 0xC073, + 0xC075, 0xC08F, + 0xC091, 0xC0AB, + 0xC0AD, 0xC0C7, + 0xC0C9, 0xC0E3, + 0xC0E5, 0xC0FF, + 0xC101, 0xC11B, + 0xC11D, 0xC137, + 0xC139, 0xC153, + 0xC155, 0xC16F, + 0xC171, 0xC18B, + 0xC18D, 0xC1A7, + 0xC1A9, 0xC1C3, + 0xC1C5, 0xC1DF, + 0xC1E1, 0xC1FB, + 0xC1FD, 0xC217, + 0xC219, 0xC233, + 0xC235, 0xC24F, + 0xC251, 0xC26B, + 0xC26D, 0xC287, + 0xC289, 0xC2A3, + 0xC2A5, 0xC2BF, + 0xC2C1, 0xC2DB, + 0xC2DD, 0xC2F7, + 0xC2F9, 0xC313, + 0xC315, 0xC32F, + 0xC331, 0xC34B, + 0xC34D, 0xC367, + 0xC369, 0xC383, + 0xC385, 0xC39F, + 0xC3A1, 0xC3BB, + 0xC3BD, 0xC3D7, + 0xC3D9, 0xC3F3, + 0xC3F5, 0xC40F, + 0xC411, 0xC42B, + 0xC42D, 0xC447, + 0xC449, 0xC463, + 0xC465, 0xC47F, + 0xC481, 0xC49B, + 0xC49D, 0xC4B7, + 0xC4B9, 0xC4D3, + 0xC4D5, 0xC4EF, + 0xC4F1, 0xC50B, + 0xC50D, 0xC527, + 0xC529, 0xC543, + 0xC545, 0xC55F, + 0xC561, 0xC57B, + 0xC57D, 0xC597, + 0xC599, 0xC5B3, + 0xC5B5, 0xC5CF, + 0xC5D1, 0xC5EB, + 0xC5ED, 0xC607, + 0xC609, 0xC623, + 0xC625, 0xC63F, + 0xC641, 0xC65B, + 0xC65D, 0xC677, + 0xC679, 0xC693, + 0xC695, 0xC6AF, + 0xC6B1, 0xC6CB, + 0xC6CD, 0xC6E7, + 0xC6E9, 0xC703, + 0xC705, 0xC71F, + 0xC721, 0xC73B, + 0xC73D, 0xC757, + 0xC759, 0xC773, + 0xC775, 0xC78F, + 0xC791, 0xC7AB, + 0xC7AD, 0xC7C7, + 0xC7C9, 0xC7E3, + 0xC7E5, 0xC7FF, + 0xC801, 0xC81B, + 0xC81D, 0xC837, + 0xC839, 0xC853, + 0xC855, 0xC86F, + 0xC871, 0xC88B, + 0xC88D, 0xC8A7, + 0xC8A9, 0xC8C3, + 0xC8C5, 0xC8DF, + 0xC8E1, 0xC8FB, + 0xC8FD, 0xC917, + 0xC919, 0xC933, + 0xC935, 0xC94F, + 0xC951, 0xC96B, + 0xC96D, 0xC987, + 0xC989, 0xC9A3, + 0xC9A5, 0xC9BF, + 0xC9C1, 0xC9DB, + 0xC9DD, 0xC9F7, + 0xC9F9, 0xCA13, + 0xCA15, 0xCA2F, + 0xCA31, 0xCA4B, + 0xCA4D, 0xCA67, + 0xCA69, 0xCA83, + 0xCA85, 0xCA9F, + 0xCAA1, 0xCABB, + 0xCABD, 0xCAD7, + 0xCAD9, 0xCAF3, + 0xCAF5, 0xCB0F, + 0xCB11, 0xCB2B, + 0xCB2D, 0xCB47, + 0xCB49, 0xCB63, + 0xCB65, 0xCB7F, + 0xCB81, 0xCB9B, + 0xCB9D, 0xCBB7, + 0xCBB9, 0xCBD3, + 0xCBD5, 0xCBEF, + 0xCBF1, 0xCC0B, + 0xCC0D, 0xCC27, + 0xCC29, 0xCC43, + 0xCC45, 0xCC5F, + 0xCC61, 0xCC7B, + 0xCC7D, 0xCC97, + 0xCC99, 0xCCB3, + 0xCCB5, 0xCCCF, + 0xCCD1, 0xCCEB, + 0xCCED, 0xCD07, + 0xCD09, 0xCD23, + 0xCD25, 0xCD3F, + 0xCD41, 0xCD5B, + 0xCD5D, 0xCD77, + 0xCD79, 0xCD93, + 0xCD95, 0xCDAF, + 0xCDB1, 0xCDCB, + 0xCDCD, 0xCDE7, + 0xCDE9, 0xCE03, + 0xCE05, 0xCE1F, + 0xCE21, 0xCE3B, + 0xCE3D, 0xCE57, + 0xCE59, 0xCE73, + 0xCE75, 0xCE8F, + 0xCE91, 0xCEAB, + 0xCEAD, 0xCEC7, + 0xCEC9, 0xCEE3, + 0xCEE5, 0xCEFF, + 0xCF01, 0xCF1B, + 0xCF1D, 0xCF37, + 0xCF39, 0xCF53, + 0xCF55, 0xCF6F, + 0xCF71, 0xCF8B, + 0xCF8D, 0xCFA7, + 0xCFA9, 0xCFC3, + 0xCFC5, 0xCFDF, + 0xCFE1, 0xCFFB, + 0xCFFD, 0xD017, + 0xD019, 0xD033, + 0xD035, 0xD04F, + 0xD051, 0xD06B, + 0xD06D, 0xD087, + 0xD089, 0xD0A3, + 0xD0A5, 0xD0BF, + 0xD0C1, 0xD0DB, + 0xD0DD, 0xD0F7, + 0xD0F9, 0xD113, + 0xD115, 0xD12F, + 0xD131, 0xD14B, + 0xD14D, 0xD167, + 0xD169, 0xD183, + 0xD185, 0xD19F, + 0xD1A1, 0xD1BB, + 0xD1BD, 0xD1D7, + 0xD1D9, 0xD1F3, + 0xD1F5, 0xD20F, + 0xD211, 0xD22B, + 0xD22D, 0xD247, + 0xD249, 0xD263, + 0xD265, 0xD27F, + 0xD281, 0xD29B, + 0xD29D, 0xD2B7, + 0xD2B9, 0xD2D3, + 0xD2D5, 0xD2EF, + 0xD2F1, 0xD30B, + 0xD30D, 0xD327, + 0xD329, 0xD343, + 0xD345, 0xD35F, + 0xD361, 0xD37B, + 0xD37D, 0xD397, + 0xD399, 0xD3B3, + 0xD3B5, 0xD3CF, + 0xD3D1, 0xD3EB, + 0xD3ED, 0xD407, + 0xD409, 0xD423, + 0xD425, 0xD43F, + 0xD441, 0xD45B, + 0xD45D, 0xD477, + 0xD479, 0xD493, + 0xD495, 0xD4AF, + 0xD4B1, 0xD4CB, + 0xD4CD, 0xD4E7, + 0xD4E9, 0xD503, + 0xD505, 0xD51F, + 0xD521, 0xD53B, + 0xD53D, 0xD557, + 0xD559, 0xD573, + 0xD575, 0xD58F, + 0xD591, 0xD5AB, + 0xD5AD, 0xD5C7, + 0xD5C9, 0xD5E3, + 0xD5E5, 0xD5FF, + 0xD601, 0xD61B, + 0xD61D, 0xD637, + 0xD639, 0xD653, + 0xD655, 0xD66F, + 0xD671, 0xD68B, + 0xD68D, 0xD6A7, + 0xD6A9, 0xD6C3, + 0xD6C5, 0xD6DF, + 0xD6E1, 0xD6FB, + 0xD6FD, 0xD717, + 0xD719, 0xD733, + 0xD735, 0xD74F, + 0xD751, 0xD76B, + 0xD76D, 0xD787, + 0xD789, 0xD7A3, +}; + +static const int32_t ucg_indic_conjunct_break_consonant_ranges[] = { + 0x0915, 0x0939, + 0x0958, 0x095F, + 0x0978, 0x097F, + 0x0995, 0x09A8, + 0x09AA, 0x09B0, + 0x09B2, 0x09B2, + 0x09B6, 0x09B9, + 0x09DC, 0x09DD, + 0x09DF, 0x09DF, + 0x09F0, 0x09F1, + 0x0A95, 0x0AA8, + 0x0AAA, 0x0AB0, + 0x0AB2, 0x0AB3, + 0x0AB5, 0x0AB9, + 0x0AF9, 0x0AF9, + 0x0B15, 0x0B28, + 0x0B2A, 0x0B30, + 0x0B32, 0x0B33, + 0x0B35, 0x0B39, + 0x0B5C, 0x0B5D, + 0x0B5F, 0x0B5F, + 0x0B71, 0x0B71, + 0x0C15, 0x0C28, + 0x0C2A, 0x0C39, + 0x0C58, 0x0C5A, + 0x0D15, 0x0D3A, +}; + +static const int32_t ucg_indic_conjunct_break_extend_ranges[] = { + 0x0300, 0x034E, + 0x0350, 0x036F, + 0x0483, 0x0487, + 0x0591, 0x05BD, + 0x05BF, 0x05BF, + 0x05C1, 0x05C2, + 0x05C4, 0x05C5, + 0x05C7, 0x05C7, + 0x0610, 0x061A, + 0x064B, 0x065F, + 0x0670, 0x0670, + 0x06D6, 0x06DC, + 0x06DF, 0x06E4, + 0x06E7, 0x06E8, + 0x06EA, 0x06ED, + 0x0711, 0x0711, + 0x0730, 0x074A, + 0x07EB, 0x07F3, + 0x07FD, 0x07FD, + 0x0816, 0x0819, + 0x081B, 0x0823, + 0x0825, 0x0827, + 0x0829, 0x082D, + 0x0859, 0x085B, + 0x0898, 0x089F, + 0x08CA, 0x08E1, + 0x08E3, 0x08FF, + 0x093C, 0x093C, + 0x0951, 0x0954, + 0x09BC, 0x09BC, + 0x09FE, 0x09FE, + 0x0A3C, 0x0A3C, + 0x0ABC, 0x0ABC, + 0x0B3C, 0x0B3C, + 0x0C3C, 0x0C3C, + 0x0C55, 0x0C56, + 0x0CBC, 0x0CBC, + 0x0D3B, 0x0D3C, + 0x0E38, 0x0E3A, + 0x0E48, 0x0E4B, + 0x0EB8, 0x0EBA, + 0x0EC8, 0x0ECB, + 0x0F18, 0x0F19, + 0x0F35, 0x0F35, + 0x0F37, 0x0F37, + 0x0F39, 0x0F39, + 0x0F71, 0x0F72, + 0x0F74, 0x0F74, + 0x0F7A, 0x0F7D, + 0x0F80, 0x0F80, + 0x0F82, 0x0F84, + 0x0F86, 0x0F87, + 0x0FC6, 0x0FC6, + 0x1037, 0x1037, + 0x1039, 0x103A, + 0x108D, 0x108D, + 0x135D, 0x135F, + 0x1714, 0x1714, + 0x17D2, 0x17D2, + 0x17DD, 0x17DD, + 0x18A9, 0x18A9, + 0x1939, 0x193B, + 0x1A17, 0x1A18, + 0x1A60, 0x1A60, + 0x1A75, 0x1A7C, + 0x1A7F, 0x1A7F, + 0x1AB0, 0x1ABD, + 0x1ABF, 0x1ACE, + 0x1B34, 0x1B34, + 0x1B6B, 0x1B73, + 0x1BAB, 0x1BAB, + 0x1BE6, 0x1BE6, + 0x1C37, 0x1C37, + 0x1CD0, 0x1CD2, + 0x1CD4, 0x1CE0, + 0x1CE2, 0x1CE8, + 0x1CED, 0x1CED, + 0x1CF4, 0x1CF4, + 0x1CF8, 0x1CF9, + 0x1DC0, 0x1DFF, + 0x200D, 0x200D, + 0x20D0, 0x20DC, + 0x20E1, 0x20E1, + 0x20E5, 0x20F0, + 0x2CEF, 0x2CF1, + 0x2D7F, 0x2D7F, + 0x2DE0, 0x2DFF, + 0x302A, 0x302D, + 0x302E, 0x302F, + 0x3099, 0x309A, + 0xA66F, 0xA66F, + 0xA674, 0xA67D, + 0xA69E, 0xA69F, + 0xA6F0, 0xA6F1, + 0xA82C, 0xA82C, + 0xA8E0, 0xA8F1, + 0xA92B, 0xA92D, + 0xA9B3, 0xA9B3, + 0xAAB0, 0xAAB0, + 0xAAB2, 0xAAB4, + 0xAAB7, 0xAAB8, + 0xAABE, 0xAABF, + 0xAAC1, 0xAAC1, + 0xAAF6, 0xAAF6, + 0xABED, 0xABED, + 0xFB1E, 0xFB1E, + 0xFE20, 0xFE2F, + 0x101FD, 0x101FD, + 0x102E0, 0x102E0, + 0x10376, 0x1037A, + 0x10A0D, 0x10A0D, + 0x10A0F, 0x10A0F, + 0x10A38, 0x10A3A, + 0x10A3F, 0x10A3F, + 0x10AE5, 0x10AE6, + 0x10D24, 0x10D27, + 0x10EAB, 0x10EAC, + 0x10EFD, 0x10EFF, + 0x10F46, 0x10F50, + 0x10F82, 0x10F85, + 0x11070, 0x11070, + 0x1107F, 0x1107F, + 0x110BA, 0x110BA, + 0x11100, 0x11102, + 0x11133, 0x11134, + 0x11173, 0x11173, + 0x111CA, 0x111CA, + 0x11236, 0x11236, + 0x112E9, 0x112EA, + 0x1133B, 0x1133C, + 0x11366, 0x1136C, + 0x11370, 0x11374, + 0x11446, 0x11446, + 0x1145E, 0x1145E, + 0x114C3, 0x114C3, + 0x115C0, 0x115C0, + 0x116B7, 0x116B7, + 0x1172B, 0x1172B, + 0x1183A, 0x1183A, + 0x1193E, 0x1193E, + 0x11943, 0x11943, + 0x11A34, 0x11A34, + 0x11A47, 0x11A47, + 0x11A99, 0x11A99, + 0x11D42, 0x11D42, + 0x11D44, 0x11D45, + 0x11D97, 0x11D97, + 0x11F42, 0x11F42, + 0x16AF0, 0x16AF4, + 0x16B30, 0x16B36, + 0x1BC9E, 0x1BC9E, + 0x1D165, 0x1D165, + 0x1D167, 0x1D169, + 0x1D16E, 0x1D172, + 0x1D17B, 0x1D182, + 0x1D185, 0x1D18B, + 0x1D1AA, 0x1D1AD, + 0x1D242, 0x1D244, + 0x1E000, 0x1E006, + 0x1E008, 0x1E018, + 0x1E01B, 0x1E021, + 0x1E023, 0x1E024, + 0x1E026, 0x1E02A, + 0x1E08F, 0x1E08F, + 0x1E130, 0x1E136, + 0x1E2AE, 0x1E2AE, + 0x1E2EC, 0x1E2EF, + 0x1E4EC, 0x1E4EF, + 0x1E8D0, 0x1E8D6, + 0x1E944, 0x1E94A, +}; + +// Fullwidth (F) and Wide (W) are counted as 2. +// Everything else is 1. +// +// Derived from: https://unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt +static const int32_t ucg_normalized_east_asian_width_ranges[] = { + 0x0000, 0x10FF, 1, + 0x1100, 0x115F, 2, + 0x1160, 0x2319, 1, + 0x231A, 0x231B, 2, + 0x231C, 0x2328, 1, + 0x2329, 0x232A, 2, + 0x232B, 0x23E8, 1, + 0x23E9, 0x23EC, 2, + 0x23ED, 0x23EF, 1, + 0x23F0, 0x23F0, 2, + 0x23F1, 0x23F2, 1, + 0x23F3, 0x23F3, 2, + 0x23F4, 0x25FC, 1, + 0x25FD, 0x25FE, 2, + 0x25FF, 0x2613, 1, + 0x2614, 0x2615, 2, + 0x2616, 0x2647, 1, + 0x2648, 0x2653, 2, + 0x2654, 0x267E, 1, + 0x267F, 0x267F, 2, + 0x2680, 0x2692, 1, + 0x2693, 0x2693, 2, + 0x2694, 0x26A0, 1, + 0x26A1, 0x26A1, 2, + 0x26A2, 0x26A9, 1, + 0x26AA, 0x26AB, 2, + 0x26AC, 0x26BC, 1, + 0x26BD, 0x26BE, 2, + 0x26BF, 0x26C3, 1, + 0x26C4, 0x26C5, 2, + 0x26C6, 0x26CD, 1, + 0x26CE, 0x26CE, 2, + 0x26CF, 0x26D3, 1, + 0x26D4, 0x26D4, 2, + 0x26D5, 0x26E9, 1, + 0x26EA, 0x26EA, 2, + 0x26EB, 0x26F1, 1, + 0x26F2, 0x26F3, 2, + 0x26F4, 0x26F4, 1, + 0x26F5, 0x26F5, 2, + 0x26F6, 0x26F9, 1, + 0x26FA, 0x26FA, 2, + 0x26FB, 0x26FC, 1, + 0x26FD, 0x26FD, 2, + 0x26FE, 0x2704, 1, + 0x2705, 0x2705, 2, + 0x2706, 0x2709, 1, + 0x270A, 0x270B, 2, + 0x270C, 0x2727, 1, + 0x2728, 0x2728, 2, + 0x2729, 0x274B, 1, + 0x274C, 0x274C, 2, + 0x274D, 0x274D, 1, + 0x274E, 0x274E, 2, + 0x274F, 0x2752, 1, + 0x2753, 0x2755, 2, + 0x2756, 0x2756, 1, + 0x2757, 0x2757, 2, + 0x2758, 0x2794, 1, + 0x2795, 0x2797, 2, + 0x2798, 0x27AF, 1, + 0x27B0, 0x27B0, 2, + 0x27B1, 0x27BE, 1, + 0x27BF, 0x27BF, 2, + 0x27C0, 0x2B1A, 1, + 0x2B1B, 0x2B1C, 2, + 0x2B1D, 0x2B4F, 1, + 0x2B50, 0x2B50, 2, + 0x2B51, 0x2B54, 1, + 0x2B55, 0x2B55, 2, + 0x2B56, 0x2E5D, 1, + 0x2E80, 0x303E, 2, + 0x303F, 0x303F, 1, + 0x3041, 0x3247, 2, + 0x3248, 0x324F, 1, + 0x3250, 0x4DBF, 2, + 0x4DC0, 0x4DFF, 1, + 0x4E00, 0xA4C6, 2, + 0xA4D0, 0xA95F, 1, + 0xA960, 0xA97C, 2, + 0xA980, 0xABF9, 1, + 0xAC00, 0xD7A3, 2, + 0xD7B0, 0xF8FF, 1, + 0xF900, 0xFAFF, 2, + 0xFB00, 0xFE0F, 1, + 0xFE10, 0xFE19, 2, + 0xFE20, 0xFE2F, 1, + 0xFE30, 0xFE6B, 2, + 0xFE70, 0xFEFF, 1, + 0xFF01, 0xFF60, 2, + 0xFF61, 0xFFDC, 1, + 0xFFE0, 0xFFE6, 2, + 0xFFE8, 0x16F9F, 1, + 0x16FE0, 0x1B2FB, 2, + 0x1BC00, 0x1F003, 1, + 0x1F004, 0x1F004, 2, + 0x1F005, 0x1F0CE, 1, + 0x1F0CF, 0x1F0CF, 2, + 0x1F0D1, 0x1F18D, 1, + 0x1F18E, 0x1F18E, 2, + 0x1F18F, 0x1F190, 1, + 0x1F191, 0x1F19A, 2, + 0x1F19B, 0x1F1FF, 1, + 0x1F200, 0x1F320, 2, + 0x1F321, 0x1F32C, 1, + 0x1F32D, 0x1F335, 2, + 0x1F336, 0x1F336, 1, + 0x1F337, 0x1F37C, 2, + 0x1F37D, 0x1F37D, 1, + 0x1F37E, 0x1F393, 2, + 0x1F394, 0x1F39F, 1, + 0x1F3A0, 0x1F3CA, 2, + 0x1F3CB, 0x1F3CE, 1, + 0x1F3CF, 0x1F3D3, 2, + 0x1F3D4, 0x1F3DF, 1, + 0x1F3E0, 0x1F3F0, 2, + 0x1F3F1, 0x1F3F3, 1, + 0x1F3F4, 0x1F3F4, 2, + 0x1F3F5, 0x1F3F7, 1, + 0x1F3F8, 0x1F43E, 2, + 0x1F43F, 0x1F43F, 1, + 0x1F440, 0x1F440, 2, + 0x1F441, 0x1F441, 1, + 0x1F442, 0x1F4FC, 2, + 0x1F4FD, 0x1F4FE, 1, + 0x1F4FF, 0x1F53D, 2, + 0x1F53E, 0x1F54A, 1, + 0x1F54B, 0x1F54E, 2, + 0x1F54F, 0x1F54F, 1, + 0x1F550, 0x1F567, 2, + 0x1F568, 0x1F579, 1, + 0x1F57A, 0x1F57A, 2, + 0x1F57B, 0x1F594, 1, + 0x1F595, 0x1F596, 2, + 0x1F597, 0x1F5A3, 1, + 0x1F5A4, 0x1F5A4, 2, + 0x1F5A5, 0x1F5FA, 1, + 0x1F5FB, 0x1F64F, 2, + 0x1F650, 0x1F67F, 1, + 0x1F680, 0x1F6C5, 2, + 0x1F6C6, 0x1F6CB, 1, + 0x1F6CC, 0x1F6CC, 2, + 0x1F6CD, 0x1F6CF, 1, + 0x1F6D0, 0x1F6D2, 2, + 0x1F6D3, 0x1F6D4, 1, + 0x1F6D5, 0x1F6DF, 2, + 0x1F6E0, 0x1F6EA, 1, + 0x1F6EB, 0x1F6EC, 2, + 0x1F6F0, 0x1F6F3, 1, + 0x1F6F4, 0x1F6FC, 2, + 0x1F700, 0x1F7D9, 1, + 0x1F7E0, 0x1F7F0, 2, + 0x1F800, 0x1F90B, 1, + 0x1F90C, 0x1F93A, 2, + 0x1F93B, 0x1F93B, 1, + 0x1F93C, 0x1F945, 2, + 0x1F946, 0x1F946, 1, + 0x1F947, 0x1F9FF, 2, + 0x1FA00, 0x1FA6D, 1, + 0x1FA70, 0x1FAF8, 2, + 0x1FB00, 0x1FBF9, 1, + 0x20000, 0x3FFFD, 2, + 0xE0001, 0x10FFFD, 1, +}; + +// +// End of Unicode 15.1.0 block. +// + +#ifdef __cplusplus +} +#endif + +#endif /* _UCG_TABLES_INCLUDED */ diff --git a/src/unicode.cpp b/src/unicode.cpp index c244a323c..665d5b182 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -162,3 +162,8 @@ end: if (codepoint_out) *codepoint_out = codepoint; return width; } + +// NOTE(Feoramund): It's down here because I made UCG use the utf8_decode above to avoid duplicating code. +extern "C" { +#include "ucg/ucg.c" +} From 8ed5cb283b5039693e96d6f47eea6213194d6cbe Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sat, 29 Jun 2024 18:17:44 -0400 Subject: [PATCH 6/6] Re-implement the error squiggles with visual width --- src/check_stmt.cpp | 18 ++- src/error.cpp | 291 ++++++++++++++++++++------------------------- 2 files changed, 144 insertions(+), 165 deletions(-) diff --git a/src/check_stmt.cpp b/src/check_stmt.cpp index c369ba098..f2e3b0242 100644 --- a/src/check_stmt.cpp +++ b/src/check_stmt.cpp @@ -582,20 +582,28 @@ gb_internal Type *check_assignment_variable(CheckerContext *ctx, Operand *lhs, O isize offset = show_error_on_line(e->token.pos, token_pos_end(e->token)); if (offset < 0) { if (is_type_map(e->type)) { - error_line("\t\tSuggestion: Did you mean? 'for key, &%.*s in ...'\n", LIT(e->token.string)); + error_line("\tSuggestion: Did you mean? 'for key, &%.*s in ...'\n", LIT(e->token.string)); } else { - error_line("\t\tSuggestion: Did you mean? 'for &%.*s in ...'\n", LIT(e->token.string)); + error_line("\tSuggestion: Did you mean? 'for &%.*s in ...'\n", LIT(e->token.string)); } } else { - error_line("\t\t'%.*s' is immutable, declare it as '&%.*s' to make it mutable\n", LIT(e->token.string), LIT(e->token.string)); + error_line("\t"); + for (isize i = 0; i < offset-1; i++) { + error_line(" "); + } + error_line("'%.*s' is immutable, declare it as '&%.*s' to make it mutable\n", LIT(e->token.string), LIT(e->token.string)); } } else if (e && e->flags & EntityFlag_SwitchValue) { isize offset = show_error_on_line(e->token.pos, token_pos_end(e->token)); if (offset < 0) { - error_line("\t\tSuggestion: Did you mean? 'switch &%.*s in ...'\n", LIT(e->token.string)); + error_line("\tSuggestion: Did you mean? 'switch &%.*s in ...'\n", LIT(e->token.string)); } else { - error_line("\t\t'%.*s' is immutable, declare it as '&%.*s' to make it mutable\n", LIT(e->token.string), LIT(e->token.string)); + error_line("\t"); + for (isize i = 0; i < offset-1; i++) { + error_line(" "); + } + error_line("'%.*s' is immutable, declare it as '&%.*s' to make it mutable\n", LIT(e->token.string), LIT(e->token.string)); } } } diff --git a/src/error.cpp b/src/error.cpp index 26b4106a0..f95123f15 100644 --- a/src/error.cpp +++ b/src/error.cpp @@ -283,9 +283,6 @@ gb_internal isize show_error_on_line(TokenPos const &pos, TokenPos end) { error_out("\t( empty line )\n"); terminal_reset_colours(); - // Preserve the old return behaviour. Even if we can't guarantee the - // exact visual space offset, there are two places that check this to - // change what sort of suggestion they offer. if (the_line == nullptr) { return -1; } else { @@ -293,244 +290,218 @@ gb_internal isize show_error_on_line(TokenPos const &pos, TokenPos end) { } } - // Specfically use basic ASCII arrows here, in case the terminal - // doesn't support anything fancy. This is meant to be a good fallback. - char const *mark_error_sign = "><"; - char const *open_error_sign = ">>"; - char const *close_error_sign = "<<"; - const TerminalColour marker_colour = TerminalColour_Yellow; - - // ANSI SGR: - // 0 = Reset. - // 58:5:2 = Underline colour, 8-bit, green. (non-standard) - // 4:3 = Wiggly underline. (non-standard) - char const *wiggly_underline_sgr = ""; - char const *disable_underline_sgr = ""; - if (has_ansi_terminal_colours()) { - wiggly_underline_sgr = "\x1b[0;58:5:2;4:3m"; - disable_underline_sgr = "\x1b[24m"; - } - // These two will be used like an Odin slice later. char const *line_text = the_line; i32 line_length_bytes = cast(i32)gb_string_length(the_line); - // NOTE(Feoramund): The numbers below are in Unicode codepoints - // (or runes), not visual glyph width. Calculating the visual width of - // a cluster of Unicode codepoints is vexing, and `utf8proc_charwidth` - // is inadequate. - // - // We're counting codepoints here so we don't slice one down the - // middle during truncation. It will still look strange if we slice - // a cluster down the middle. (i.e. a letter and a combining diacritic) - // - // Luckily, if our assumption about 1 codepoint == 1 glyph is wrong, - // we only suffer a shorter or longer line displayed in total, but all - // of our highlighting and marking will be precise. - // (Unless there's an invalid Unicode codepoint, in which case, no guarantees.) - // - // The line will be longer if a codepoint occupies more than one space - // (CJK in most cases) and shorter if a codepoint is invisible or is - // a type of joiner or combining codepoint. - // - // If we get a complete Unicode glyph counter, it would be as simple as - // replacing `utf8_decode` below to make all of this work perfectly. + ucg_grapheme* graphemes; + i32 line_length_runes = 0; + i32 line_length_graphemes = 0; + i32 line_width = 0; + int ucg_result = ucg_decode_grapheme_clusters( + permanent_allocator(), (const uint8_t*)line_text, line_length_bytes, + &graphemes, &line_length_runes, &line_length_graphemes, &line_width); + + if (ucg_result < 0) { + // There was a UTF-8 parsing error. + // Insert a dummy grapheme so the start of the invalid rune can be pointed at. + graphemes = (ucg_grapheme*)gb_resize(permanent_allocator(), + graphemes, + sizeof(ucg_grapheme) * (line_length_graphemes), + sizeof(ucg_grapheme) * (1 + line_length_graphemes)); + + ucg_grapheme append = { + error_start_index_bytes, + line_length_runes, + 1, + }; + + graphemes[line_length_graphemes] = append; + } + + // The units below are counted in visual, monospace cells. enum { MAX_LINE_LENGTH = 80, MAX_TAB_WIDTH = 8, ELLIPSIS_PADDING = 8, // `... ...` - MAX_MARK_WIDTH = 4, // `><` or `>>` and `<<` MIN_LEFT_VIEW = 8, // A rough estimate of how many characters we'll insert, at most: - MAX_INSERTED_WIDTH = MAX_TAB_WIDTH + ELLIPSIS_PADDING + MAX_MARK_WIDTH, + MAX_INSERTED_WIDTH = MAX_TAB_WIDTH + ELLIPSIS_PADDING, MAX_LINE_LENGTH_PADDED = MAX_LINE_LENGTH - MAX_INSERTED_WIDTH, }; - // For the purposes of truncating long lines, we calculate how many - // runes the line is composed of, first. We'll take note of at which - // rune index the error starts, too. - i32 error_start_index_runes = 0; - - i32 line_length_runes = 0; - for (i32 i = 0; i < line_length_bytes; /**/) { - Rune rune; - - if (i == error_start_index_bytes) { - error_start_index_runes = line_length_runes; - } - - i32 bytes_read = cast(i32)utf8_decode(cast(const u8 *)line_text + i, line_length_bytes - i, &rune); - if (rune == GB_RUNE_INVALID || bytes_read <= 0) { - // Bail out; we won't even try to truncate the line later. - line_length_runes = 0; + i32 error_start_index_graphemes = 0; + for (i32 i = 0; i < line_length_graphemes; i += 1) { + if (graphemes[i].byte_index == error_start_index_bytes) { + error_start_index_graphemes = i; break; } - - line_length_runes += 1; - i += bytes_read; } - if (error_start_index_runes == 0 && error_start_index_bytes != 0 && line_length_runes != 0) { - // The error index in runes was not found, but we did find a valid Unicode string. + if (error_start_index_graphemes == 0 && error_start_index_bytes != 0 && line_length_graphemes != 0) { + // The error index in graphemes was not found, but we did find a valid Unicode string. // // This is an edge case where the error is sitting on a newline or the // end of the line, as that is the only location we could not have checked. - error_start_index_runes = line_length_runes; + error_start_index_graphemes = line_length_graphemes; } error_out("\t"); bool show_right_ellipsis = false; - if (line_length_runes > MAX_LINE_LENGTH_PADDED) { + i32 squiggle_padding = 0; + i32 window_open_bytes = 0; + i32 window_close_bytes = 0; + if (line_width > MAX_LINE_LENGTH_PADDED) { // Now that we know the line is over the length limit, we have to - // compose a runic window in which to display the error. - i32 window_width = MAX_LINE_LENGTH_PADDED; + // compose a visual window in which to display the error. + i32 window_size_left = 0; + i32 window_size_right = 0; + i32 window_open_graphemes = 0; - i32 extend_right = 0; - i32 extend_left = 0; - if (error_start_index_runes + window_width > line_length_runes - 1) { - // Trade space from the right to the left. - extend_right = line_length_runes - error_start_index_runes; - extend_left = window_width - extend_right; - } else if (MIN_LEFT_VIEW - error_start_index_runes > 0) { - // Trade space from the left to the right. - extend_left = error_start_index_runes; - extend_right = window_width - extend_left; - } else { - // Square in the middle somewhere. - extend_left = MIN_LEFT_VIEW; - extend_right = window_width - extend_left; + for (i32 i = error_start_index_graphemes - 1; i > 0; i -= 1) { + window_size_left += graphemes[i].width; + if (window_size_left >= MIN_LEFT_VIEW) { + window_open_graphemes = i; + window_open_bytes = graphemes[i].byte_index; + break; + } } - i32 window_right_runes = gb_min(error_start_index_runes + extend_right, line_length_runes); - i32 window_left_runes = gb_max(0, error_start_index_runes - extend_left); - - i32 window_right_bytes = 0; - i32 window_left_bytes = 0; - - i32 i_runes = 0; - for (i32 i = 0; i < line_length_bytes; /**/) { - if (i_runes == window_left_runes ) { window_left_bytes = i; } - if (i_runes == window_right_runes) { window_right_bytes = i; } - - // No need for error-checking. - // - // We've already validated the string at this point, otherwise - // `line_length_runes` would be 0, and we would not have - // entered this block. - i32 bytes_read = cast(i32)utf8_decode(cast(const u8 *)line_text + i, line_length_bytes - i, nullptr); - - i_runes += 1; - i += bytes_read; + for (i32 i = error_start_index_graphemes; i < line_length_graphemes; i += 1) { + window_size_right += graphemes[i].width; + if (window_size_right >= MAX_LINE_LENGTH_PADDED - MIN_LEFT_VIEW) { + window_close_bytes = graphemes[i].byte_index; + break; + } + } + if (window_close_bytes == 0) { + // The window ends at the end of the line. + window_close_bytes = line_length_bytes; } - if (window_right_bytes == 0) { - // The end of the window is the end of the line. - window_right_bytes = line_length_bytes; + if (window_size_right < MAX_LINE_LENGTH_PADDED - MIN_LEFT_VIEW) { + // Hit the end of the string early on the right side; expand backwards. + for (i32 i = window_open_graphemes - 1; i > 0; i -= 1) { + window_size_left += graphemes[i].width; + if (window_size_left + window_size_right >= MAX_LINE_LENGTH_PADDED) { + window_open_graphemes = i; + window_open_bytes = graphemes[i].byte_index; + break; + } + } } - GB_ASSERT_MSG(window_right_runes >= window_left_runes, "Error line truncation window has wrong rune indices. (left, right: %i, %i)", window_left_runes, window_right_runes); - GB_ASSERT_MSG(window_right_bytes >= window_left_bytes, "Error line truncation window has wrong byte indices. (left, right: %i, %i)", window_left_bytes, window_right_bytes); + GB_ASSERT_MSG(window_close_bytes >= window_open_bytes, "Error line truncation window has wrong byte indices. (open, close: %i, %i)", window_open_bytes, window_close_bytes); - if (window_right_bytes != line_length_bytes) { + if (window_close_bytes != line_length_bytes) { show_right_ellipsis = true; } - // The text will advance; all indices and lengths will become relative. - // We must keep our other iterators in sync. - // NOTE: Uncomment the rune versions if they ever get used beyond this point. - // Close the window, going left. - line_length_bytes = window_right_bytes; + line_length_bytes = window_close_bytes; // Adjust the slice of text. In Odin, this would be: // `line_text = line_text[window_left_bytes:]` - line_text += window_left_bytes; - line_length_bytes -= window_left_bytes; - // line_length_runes -= window_left_runes; + line_text += window_open_bytes; + line_length_bytes -= window_open_bytes; GB_ASSERT_MSG(line_length_bytes >= 0, "Bounds-checking error: line_length_bytes"); - // Part of advancing `line_text`: - error_start_index_bytes -= window_left_bytes; - // error_start_index_runes -= window_left_runes; - GB_ASSERT_MSG(error_start_index_bytes >= 0, "Bounds-checking error: error_start_index_bytes"); - - if (window_left_bytes > 0) { + if (window_open_bytes > 0) { error_out("... "); + squiggle_padding += 4; } + } else { + // No truncation needed. + window_open_bytes = 0; + window_close_bytes = line_length_bytes; + } + + for (i32 i = error_start_index_graphemes; i > 0; i -= 1) { + if (graphemes[i].byte_index == window_open_bytes) { + break; + } + squiggle_padding += graphemes[i].width; } // Start printing code. terminal_set_colours(TerminalStyle_Normal, TerminalColour_White); - error_out("%.*s", error_start_index_bytes, line_text); + error_out("%.*s", line_length_bytes, line_text); - // Odin-like: `line_text = line_text[error_start_index_bytes:]` - line_text += error_start_index_bytes; - line_length_bytes -= error_start_index_bytes; - GB_ASSERT_MSG(line_length_bytes >= 0, "Bounds-checking error: line_length_bytes"); + i32 squiggle_length = 0; + bool trailing_squiggle = false; if (end.file_id == pos.file_id) { // The error has an endpoint. - terminal_set_colours(TerminalStyle_Bold, marker_colour); - error_out(open_error_sign); if (end.line > pos.line) { // Error goes to next line. - error_out(wiggly_underline_sgr); - error_out("%.*s", line_length_bytes, line_text); - - error_out(disable_underline_sgr); - // Always show the ellipsis in this case show_right_ellipsis = true; - } else if (end.line == pos.line && end.column > pos.column) { - // Error terminates before line end. - i32 error_length_bytes = gb_min(end.column - pos.column, line_length_bytes); - - error_out(wiggly_underline_sgr); - error_out("%.*s", error_length_bytes, line_text); - line_text += error_length_bytes; - line_length_bytes -= error_length_bytes; - GB_ASSERT_MSG(line_length_bytes >= 0, "Bounds-checking error: line_length_bytes"); - - error_out(disable_underline_sgr); - - if (!show_right_ellipsis) { - // The line hasn't been truncated; show the end marker. - terminal_set_colours(TerminalStyle_Bold, marker_colour); - error_out(close_error_sign); + for (i32 i = error_start_index_graphemes; i < line_length_graphemes; i += 1) { + squiggle_length += graphemes[i].width; + trailing_squiggle = true; } - terminal_set_colours(TerminalStyle_Normal, TerminalColour_White); - error_out("%.*s", line_length_bytes, line_text); - } + } else if (end.line == pos.line && end.column > pos.column) { + // Error terminates before line end. + i32 adjusted_end_index = graphemes[error_start_index_graphemes].byte_index + end.column - pos.column; + for (i32 i = error_start_index_graphemes; i < line_length_graphemes; i += 1) { + if (graphemes[i].byte_index >= adjusted_end_index) { + break; + } else if (graphemes[i].byte_index >= window_close_bytes) { + trailing_squiggle = true; + break; + } + squiggle_length += graphemes[i].width; + } + } } else { // The error is at one spot; no range known. - terminal_set_colours(TerminalStyle_Bold, marker_colour); - error_out(mark_error_sign); - - terminal_set_colours(TerminalStyle_Normal, TerminalColour_White); - error_out("%.*s", line_length_bytes, line_text); + squiggle_length = 1; } if (show_right_ellipsis) { error_out(" ..."); } + error_out("\n\t"); + + for (i32 i = squiggle_padding; i > 0; i -= 1) { + error_out(" "); + } + + terminal_set_colours(TerminalStyle_Bold, TerminalColour_Green); + + if (squiggle_length > 0) { + error_out("^"); + squiggle_length -= 1; + } + for (/**/; squiggle_length > 1; squiggle_length -= 1) { + error_out("~"); + } + if (squiggle_length > 0) { + if (trailing_squiggle) { + error_out("~ ..."); + } else { + error_out("^"); + } + } + // NOTE(Feoramund): Specifically print a newline, then reset colours, // instead of the other way around. Otherwise the printing mechanism // will collapse the newline for reasons currently beyond my ken. error_out("\n"); terminal_reset_colours(); - return error_start_index_bytes; + return squiggle_padding; } gb_internal void error_out_empty(void) {