Merge pull request #3760 from Feoramund/refactor-show-error-on-line

Refactor `show_error_on_line`
This commit is contained in:
gingerBill
2024-07-01 11:03:29 +01:00
committed by GitHub
6 changed files with 3556 additions and 78 deletions

View File

@@ -237,6 +237,7 @@ enum TerminalColour {
TerminalColour_Blue,
TerminalColour_Purple,
TerminalColour_Black,
TerminalColour_Grey,
};
gb_internal void terminal_set_colours(TerminalStyle style, TerminalColour foreground) {
@@ -256,6 +257,7 @@ gb_internal void terminal_set_colours(TerminalStyle style, TerminalColour foregr
case TerminalColour_Blue: error_out("\x1b[%s;34m", ss); break;
case TerminalColour_Purple: error_out("\x1b[%s;35m", ss); break;
case TerminalColour_Black: error_out("\x1b[%s;30m", ss); break;
case TerminalColour_Grey: error_out("\x1b[%s;90m", ss); break;
}
}
}
@@ -272,85 +274,234 @@ gb_internal isize show_error_on_line(TokenPos const &pos, TokenPos end) {
return -1;
}
i32 offset = 0;
gbString the_line = get_file_line_as_string(pos, &offset);
i32 error_start_index_bytes = 0;
gbString the_line = get_file_line_as_string(pos, &error_start_index_bytes);
defer (gb_string_free(the_line));
if (the_line != nullptr) {
char const *line_text = the_line;
isize line_len = gb_string_length(the_line);
// TODO(bill): This assumes ASCII
enum {
MAX_LINE_LENGTH = 80,
MAX_TAB_WIDTH = 8,
ELLIPSIS_PADDING = 8, // `... ...`
MAX_LINE_LENGTH_PADDED = MAX_LINE_LENGTH-MAX_TAB_WIDTH-ELLIPSIS_PADDING,
};
i32 error_length = gb_max(end.offset - pos.offset, 1);
error_out("\t");
terminal_set_colours(TerminalStyle_Bold, TerminalColour_White);
isize squiggle_extra = 0;
if (line_len > MAX_LINE_LENGTH_PADDED) {
i32 left = MAX_TAB_WIDTH;
i32 diff = gb_max(offset-left, 0);
if (diff > 0) {
line_text += diff;
line_len -= diff;
offset = left + ELLIPSIS_PADDING/2;
}
if (line_len > MAX_LINE_LENGTH_PADDED) {
line_len = MAX_LINE_LENGTH_PADDED;
if (error_length > line_len-left) {
error_length = cast(i32)line_len - left;
squiggle_extra = 1;
}
}
if (diff > 0) {
error_out("... %.*s ...", cast(i32)line_len, line_text);
} else {
error_out("%.*s ...", cast(i32)line_len, line_text);
}
} else {
error_out("%.*s", cast(i32)line_len, line_text);
}
error_out("\n\t");
for (i32 i = 0; i < offset; i++) {
error_out(" ");
}
terminal_set_colours(TerminalStyle_Bold, TerminalColour_Green);
error_out("^");
if (end.file_id == pos.file_id) {
if (end.line > pos.line) {
for (i32 i = offset; i < line_len; i++) {
error_out("~");
}
} else if (end.line == pos.line && end.column > pos.column) {
for (i32 i = 1; i < error_length-1+squiggle_extra; i++) {
error_out("~");
}
if (error_length > 1 && squiggle_extra == 0) {
error_out("^");
}
}
}
if (the_line == nullptr || gb_string_length(the_line) == 0) {
terminal_set_colours(TerminalStyle_Normal, TerminalColour_Grey);
error_out("\t( empty line )\n");
terminal_reset_colours();
error_out("\n");
return offset;
if (the_line == nullptr) {
return -1;
} else {
return cast(isize)error_start_index_bytes;
}
}
return -1;
// These two will be used like an Odin slice later.
char const *line_text = the_line;
i32 line_length_bytes = cast(i32)gb_string_length(the_line);
ucg_grapheme* graphemes;
i32 line_length_runes = 0;
i32 line_length_graphemes = 0;
i32 line_width = 0;
int ucg_result = ucg_decode_grapheme_clusters(
permanent_allocator(), (const uint8_t*)line_text, line_length_bytes,
&graphemes, &line_length_runes, &line_length_graphemes, &line_width);
if (ucg_result < 0) {
// There was a UTF-8 parsing error.
// Insert a dummy grapheme so the start of the invalid rune can be pointed at.
graphemes = (ucg_grapheme*)gb_resize(permanent_allocator(),
graphemes,
sizeof(ucg_grapheme) * (line_length_graphemes),
sizeof(ucg_grapheme) * (1 + line_length_graphemes));
ucg_grapheme append = {
error_start_index_bytes,
line_length_runes,
1,
};
graphemes[line_length_graphemes] = append;
}
// The units below are counted in visual, monospace cells.
enum {
MAX_LINE_LENGTH = 80,
MAX_TAB_WIDTH = 8,
ELLIPSIS_PADDING = 8, // `... ...`
MIN_LEFT_VIEW = 8,
// A rough estimate of how many characters we'll insert, at most:
MAX_INSERTED_WIDTH = MAX_TAB_WIDTH + ELLIPSIS_PADDING,
MAX_LINE_LENGTH_PADDED = MAX_LINE_LENGTH - MAX_INSERTED_WIDTH,
};
i32 error_start_index_graphemes = 0;
for (i32 i = 0; i < line_length_graphemes; i += 1) {
if (graphemes[i].byte_index == error_start_index_bytes) {
error_start_index_graphemes = i;
break;
}
}
if (error_start_index_graphemes == 0 && error_start_index_bytes != 0 && line_length_graphemes != 0) {
// The error index in graphemes was not found, but we did find a valid Unicode string.
//
// This is an edge case where the error is sitting on a newline or the
// end of the line, as that is the only location we could not have checked.
error_start_index_graphemes = line_length_graphemes;
}
error_out("\t");
bool show_right_ellipsis = false;
i32 squiggle_padding = 0;
i32 window_open_bytes = 0;
i32 window_close_bytes = 0;
if (line_width > MAX_LINE_LENGTH_PADDED) {
// Now that we know the line is over the length limit, we have to
// compose a visual window in which to display the error.
i32 window_size_left = 0;
i32 window_size_right = 0;
i32 window_open_graphemes = 0;
for (i32 i = error_start_index_graphemes - 1; i > 0; i -= 1) {
window_size_left += graphemes[i].width;
if (window_size_left >= MIN_LEFT_VIEW) {
window_open_graphemes = i;
window_open_bytes = graphemes[i].byte_index;
break;
}
}
for (i32 i = error_start_index_graphemes; i < line_length_graphemes; i += 1) {
window_size_right += graphemes[i].width;
if (window_size_right >= MAX_LINE_LENGTH_PADDED - MIN_LEFT_VIEW) {
window_close_bytes = graphemes[i].byte_index;
break;
}
}
if (window_close_bytes == 0) {
// The window ends at the end of the line.
window_close_bytes = line_length_bytes;
}
if (window_size_right < MAX_LINE_LENGTH_PADDED - MIN_LEFT_VIEW) {
// Hit the end of the string early on the right side; expand backwards.
for (i32 i = window_open_graphemes - 1; i > 0; i -= 1) {
window_size_left += graphemes[i].width;
if (window_size_left + window_size_right >= MAX_LINE_LENGTH_PADDED) {
window_open_graphemes = i;
window_open_bytes = graphemes[i].byte_index;
break;
}
}
}
GB_ASSERT_MSG(window_close_bytes >= window_open_bytes, "Error line truncation window has wrong byte indices. (open, close: %i, %i)", window_open_bytes, window_close_bytes);
if (window_close_bytes != line_length_bytes) {
show_right_ellipsis = true;
}
// Close the window, going left.
line_length_bytes = window_close_bytes;
// Adjust the slice of text. In Odin, this would be:
// `line_text = line_text[window_left_bytes:]`
line_text += window_open_bytes;
line_length_bytes -= window_open_bytes;
GB_ASSERT_MSG(line_length_bytes >= 0, "Bounds-checking error: line_length_bytes");
if (window_open_bytes > 0) {
error_out("... ");
squiggle_padding += 4;
}
} else {
// No truncation needed.
window_open_bytes = 0;
window_close_bytes = line_length_bytes;
}
for (i32 i = error_start_index_graphemes; i > 0; i -= 1) {
if (graphemes[i].byte_index == window_open_bytes) {
break;
}
squiggle_padding += graphemes[i].width;
}
// Start printing code.
terminal_set_colours(TerminalStyle_Normal, TerminalColour_White);
error_out("%.*s", line_length_bytes, line_text);
i32 squiggle_length = 0;
bool trailing_squiggle = false;
if (end.file_id == pos.file_id) {
// The error has an endpoint.
if (end.line > pos.line) {
// Error goes to next line.
// Always show the ellipsis in this case
show_right_ellipsis = true;
for (i32 i = error_start_index_graphemes; i < line_length_graphemes; i += 1) {
squiggle_length += graphemes[i].width;
trailing_squiggle = true;
}
} else if (end.line == pos.line && end.column > pos.column) {
// Error terminates before line end.
i32 adjusted_end_index = graphemes[error_start_index_graphemes].byte_index + end.column - pos.column;
for (i32 i = error_start_index_graphemes; i < line_length_graphemes; i += 1) {
if (graphemes[i].byte_index >= adjusted_end_index) {
break;
} else if (graphemes[i].byte_index >= window_close_bytes) {
trailing_squiggle = true;
break;
}
squiggle_length += graphemes[i].width;
}
}
} else {
// The error is at one spot; no range known.
squiggle_length = 1;
}
if (show_right_ellipsis) {
error_out(" ...");
}
error_out("\n\t");
for (i32 i = squiggle_padding; i > 0; i -= 1) {
error_out(" ");
}
terminal_set_colours(TerminalStyle_Bold, TerminalColour_Green);
if (squiggle_length > 0) {
error_out("^");
squiggle_length -= 1;
}
for (/**/; squiggle_length > 1; squiggle_length -= 1) {
error_out("~");
}
if (squiggle_length > 0) {
if (trailing_squiggle) {
error_out("~ ...");
} else {
error_out("^");
}
}
// NOTE(Feoramund): Specifically print a newline, then reset colours,
// instead of the other way around. Otherwise the printing mechanism
// will collapse the newline for reasons currently beyond my ken.
error_out("\n");
terminal_reset_colours();
return squiggle_padding;
}
gb_internal void error_out_empty(void) {

View File

@@ -71,6 +71,12 @@ gb_internal gbString get_file_line_as_string(TokenPos const &pos, i32 *offset_)
u8 *line_start = pos_offset;
u8 *line_end = pos_offset;
if (offset > 0 && *line_start == '\n') {
// Prevent an error token that starts at the boundary of a line that
// leads to an empty line from advancing off its line.
line_start -= 1;
}
while (line_start >= start) {
if (*line_start == '\n') {
line_start += 1;
@@ -78,6 +84,11 @@ gb_internal gbString get_file_line_as_string(TokenPos const &pos, i32 *offset_)
}
line_start -= 1;
}
if (line_start == start - 1) {
// Prevent an error on the first line from stepping behind the boundary
// of the text.
line_start += 1;
}
while (line_end < end) {
if (*line_end == '\n') {

View File

@@ -786,7 +786,6 @@ gb_internal void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
case '`': // Raw String Literal
case '"': // String Literal
{
bool has_carriage_return = false;
i32 success;
Rune quote = curr_rune;
token->kind = Token_String;
@@ -816,9 +815,6 @@ gb_internal void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
if (r == quote) {
break;
}
if (r == '\r') {
has_carriage_return = true;
}
}
}
token->string.len = t->curr - token->string.text;

686
src/ucg/ucg.c Normal file
View File

@@ -0,0 +1,686 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Feoramund
* SPDX-License-Identifier: BSD-3-Clause
*/
//
// NOTE(Feoramund): This is my UCG library, adapted for use within the Odin compiler.
// Most of the comments have been let alone and may not strictly apply anymore.
//
// 1. The UCG allocator interface was replaced by gbAllocator.
// 2. The UCG UTF-8 decoder was replaced with the one already in the compiler.
// 3. Non-essential code was stripped.
// 4. Some types were changed for compatibility.
//
/* This is the data that is allocated when an allocator is passed to
* ucg_decode_grapheme_clusters. */
typedef struct {
i32 byte_index;
i32 rune_index;
i32 width;
} ucg_grapheme;
/* #include "ucg.h" */
#include "ucg_tables.h"
#define UCG_TABLE_LEN(t) (sizeof(t) / sizeof(int32_t))
#define ZERO_WIDTH_SPACE 0x200B
#define ZERO_WIDTH_NON_JOINER 0x200C
#define ZERO_WIDTH_JOINER 0x200D
#define WORD_JOINER 0x2060
int ucg_binary_search(int32_t value, const int32_t* table, int length, int stride) {
GB_ASSERT(table != NULL);
GB_ASSERT(length > 0);
GB_ASSERT(stride > 0);
int n = length;
int t = 0;
for (/**/; n > 1; /**/) {
int m = n / 2;
int p = t + m * stride;
if (value >= table[p]) {
t = p;
n = n - m;
} else {
n = m;
}
}
if (n != 0 && value >= table[t]) {
return t;
}
return -1;
}
//
// The procedures below are accurate as of Unicode 15.1.0.
//
bool ucg_is_control(int32_t r) {
if (r <= 0x1F || (0x7F <= r && r <= 0x9F)) {
return true;
}
return false;
}
// Emoji_Modifier
bool ucg_is_emoji_modifier(int32_t r) {
return 0x1F3FB <= r && r <= 0x1F3FF;
}
// Regional_Indicator
bool ucg_is_regional_indicator(int32_t r) {
return 0x1F1E6 <= r && r <= 0x1F1FF;
}
// General_Category=Enclosing_Mark
bool ucg_is_enclosing_mark(int32_t r) {
switch (r) {
case 0x0488:
case 0x0489:
case 0x1ABE:
return true;
}
if (0x20DD <= r && r <= 0x20E0) { return true; }
if (0x20E2 <= r && r <= 0x20E4) { return true; }
if (0xA670 <= r && r <= 0xA672) { return true; }
return false;
}
// Prepended_Concatenation_Mark
bool ucg_is_prepended_concatenation_mark(int32_t r) {
switch (r) {
case 0x006DD:
case 0x0070F:
case 0x008E2:
case 0x110BD:
case 0x110CD:
return true;
}
if (0x00600 <= r && r <= 0x00605) { return true; }
if (0x00890 <= r && r <= 0x00891) { return true; }
return false;
}
// General_Category=Spacing_Mark
bool ucg_is_spacing_mark(int32_t r) {
intptr_t p = ucg_binary_search(r, ucg_spacing_mark_ranges, UCG_TABLE_LEN(ucg_spacing_mark_ranges)/2, 2);
if (p >= 0 && ucg_spacing_mark_ranges[p] <= r && r <= ucg_spacing_mark_ranges[p+1]) {
return true;
}
return false;
}
// General_Category=Nonspacing_Mark
bool ucg_is_nonspacing_mark(int32_t r) {
intptr_t p = ucg_binary_search(r, ucg_nonspacing_mark_ranges, UCG_TABLE_LEN(ucg_nonspacing_mark_ranges)/2, 2);
if (p >= 0 && ucg_nonspacing_mark_ranges[p] <= r && r <= ucg_nonspacing_mark_ranges[p+1]) {
return true;
}
return false;
}
// Extended_Pictographic
bool ucg_is_emoji_extended_pictographic(int32_t r) {
intptr_t p = ucg_binary_search(r, ucg_emoji_extended_pictographic_ranges, UCG_TABLE_LEN(ucg_emoji_extended_pictographic_ranges)/2, 2);
if (p >= 0 && ucg_emoji_extended_pictographic_ranges[p] <= r && r <= ucg_emoji_extended_pictographic_ranges[p+1]) {
return true;
}
return false;
}
// Grapheme_Extend
bool ucg_is_grapheme_extend(int32_t r) {
intptr_t p = ucg_binary_search(r, ucg_grapheme_extend_ranges, UCG_TABLE_LEN(ucg_grapheme_extend_ranges)/2, 2);
if (p >= 0 && ucg_grapheme_extend_ranges[p] <= r && r <= ucg_grapheme_extend_ranges[p+1]) {
return true;
}
return false;
}
// Hangul_Syllable_Type=Leading_Jamo
bool ucg_is_hangul_syllable_leading(int32_t r) {
return (0x1100 <= r && r <= 0x115F) || (0xA960 <= r && r <= 0xA97C);
}
// Hangul_Syllable_Type=Vowel_Jamo
bool ucg_is_hangul_syllable_vowel(int32_t r) {
return (0x1160 <= r && r <= 0x11A7) || (0xD7B0 <= r && r <= 0xD7C6);
}
// Hangul_Syllable_Type=Trailing_Jamo
bool ucg_is_hangul_syllable_trailing(int32_t r) {
return (0x11A8 <= r && r <= 0x11FF) || (0xD7CB <= r && r <= 0xD7FB);
}
// Hangul_Syllable_Type=LV_Syllable
bool ucg_is_hangul_syllable_lv(int32_t r) {
intptr_t p = ucg_binary_search(r, ucg_hangul_syllable_lv_singlets, UCG_TABLE_LEN(ucg_hangul_syllable_lv_singlets), 1);
if (p >= 0 && r == ucg_hangul_syllable_lv_singlets[p]) {
return true;
}
return false;
}
// Hangul_Syllable_Type=LVT_Syllable
bool ucg_is_hangul_syllable_lvt(int32_t r) {
intptr_t p = ucg_binary_search(r, ucg_hangul_syllable_lvt_ranges, UCG_TABLE_LEN(ucg_hangul_syllable_lvt_ranges)/2, 2);
if (p >= 0 && ucg_hangul_syllable_lvt_ranges[p] <= r && r <= ucg_hangul_syllable_lvt_ranges[p+1]) {
return true;
}
return false;
}
// Indic_Syllabic_Category=Consonant_Preceding_Repha
bool ucg_is_indic_consonant_preceding_repha(int32_t r) {
switch (r) {
case 0x00D4E:
case 0x11941:
case 0x11D46:
case 0x11F02:
return true;
}
return false;
}
// Indic_Syllabic_Category=Consonant_Prefixed
bool ucg_is_indic_consonant_prefixed(int32_t r) {
switch (r) {
case 0x1193F:
case 0x11A3A:
return true;
}
if (0x111C2 <= r && r <= 0x111C3) { return true; }
if (0x11A84 <= r && r <= 0x11A89) { return true; }
return false;
}
// Indic_Conjunct_Break=Linker
bool ucg_is_indic_conjunct_break_linker(int32_t r) {
switch (r) {
case 0x094D:
case 0x09CD:
case 0x0ACD:
case 0x0B4D:
case 0x0C4D:
case 0x0D4D:
return true;
}
return false;
}
// Indic_Conjunct_Break=Consonant
bool ucg_is_indic_conjunct_break_consonant(int32_t r) {
intptr_t p = ucg_binary_search(r, ucg_indic_conjunct_break_consonant_ranges, UCG_TABLE_LEN(ucg_indic_conjunct_break_consonant_ranges)/2, 2);
if (p >= 0 && ucg_indic_conjunct_break_consonant_ranges[p] <= r && r <= ucg_indic_conjunct_break_consonant_ranges[p+1]) {
return true;
}
return false;
}
// Indic_Conjunct_Break=Extend
bool ucg_is_indic_conjunct_break_extend(int32_t r) {
intptr_t p = ucg_binary_search(r, ucg_indic_conjunct_break_extend_ranges, UCG_TABLE_LEN(ucg_indic_conjunct_break_extend_ranges)/2, 2);
if (p >= 0 && ucg_indic_conjunct_break_extend_ranges[p] <= r && r <= ucg_indic_conjunct_break_extend_ranges[p+1]) {
return true;
}
return false;
}
/*
```
Indic_Syllabic_Category = Consonant_Preceding_Repha, or
Indic_Syllabic_Category = Consonant_Prefixed, or
Prepended_Concatenation_Mark = Yes
```
*/
bool ucg_is_gcb_prepend_class(int32_t r) {
return ucg_is_indic_consonant_preceding_repha(r) || ucg_is_indic_consonant_prefixed(r) || ucg_is_prepended_concatenation_mark(r);
}
/*
```
Grapheme_Extend = Yes, or
Emoji_Modifier = Yes
This includes:
General_Category = Nonspacing_Mark
General_Category = Enclosing_Mark
U+200C ZERO WIDTH NON-JOINER
plus a few General_Category = Spacing_Mark needed for canonical equivalence.
```
*/
bool ucg_is_gcb_extend_class(int32_t r) {
return ucg_is_grapheme_extend(r) || ucg_is_emoji_modifier(r);
}
// Return values:
//
// - 2 if East_Asian_Width=F or W, or
// - 0 if non-printable / zero-width, or
// - 1 in all other cases.
//
int ucg_normalized_east_asian_width(int32_t r) {
if (ucg_is_control(r)) {
return 0;
} else if (r <= 0x10FF) {
// Easy early out for low runes.
return 1;
}
switch (r) {
// This is a different interpretation of the BOM which occurs in the middle of text.
case 0xFEFF: /* ZERO_WIDTH_NO_BREAK_SPACE */
case ZERO_WIDTH_SPACE:
case ZERO_WIDTH_NON_JOINER:
case ZERO_WIDTH_JOINER:
case WORD_JOINER:
return 0;
}
intptr_t p = ucg_binary_search(r, ucg_normalized_east_asian_width_ranges, UCG_TABLE_LEN(ucg_normalized_east_asian_width_ranges)/3, 3);
if (p >= 0 && ucg_normalized_east_asian_width_ranges[p] <= r && r <= ucg_normalized_east_asian_width_ranges[p+1]) {
return (int)ucg_normalized_east_asian_width_ranges[p+2];
}
return 1;
}
//
// End of Unicode 15.1.0 block.
//
enum grapheme_cluster_sequence {
None,
Indic,
Emoji,
Regional,
};
typedef struct {
ucg_grapheme* graphemes;
i32 rune_count;
i32 grapheme_count;
i32 width;
int32_t last_rune;
bool last_rune_breaks_forward;
i32 last_width;
i32 last_grapheme_count;
bool bypass_next_rune;
int regional_indicator_counter;
enum grapheme_cluster_sequence current_sequence;
bool continue_sequence;
} ucg_decoder_state;
void _ucg_decode_grapheme_clusters_deferred_step(
gbAllocator allocator,
ucg_decoder_state* state,
i32 byte_index,
int32_t this_rune
) {
// "Break at the start and end of text, unless the text is empty."
//
// GB1: sot ÷ Any
// GB2: Any ÷ eot
if (state->rune_count == 0 && state->grapheme_count == 0) {
state->grapheme_count += 1;
}
if (state->grapheme_count > state->last_grapheme_count) {
state->width += ucg_normalized_east_asian_width(this_rune);
/* if (allocator != NULL) { */
state->graphemes = (ucg_grapheme*)gb_resize(allocator,
state->graphemes,
sizeof(ucg_grapheme) * (state->grapheme_count),
sizeof(ucg_grapheme) * (1 + state->grapheme_count));
ucg_grapheme append = {
byte_index,
state->rune_count,
state->width - state->last_width,
};
state->graphemes[state->grapheme_count - 1] = append;
/* } */
state->last_grapheme_count = state->grapheme_count;
state->last_width = state->width;
}
state->last_rune = this_rune;
state->rune_count += 1;
if (!state->continue_sequence) {
state->current_sequence = None;
state->regional_indicator_counter = 0;
}
state->continue_sequence = false;
}
int ucg_decode_grapheme_clusters(
gbAllocator allocator,
const uint8_t* str,
int str_len,
ucg_grapheme** out_graphemes,
i32* out_rune_count,
i32* out_grapheme_count,
i32* out_width
) {
// The following procedure implements text segmentation by breaking on
// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
// the Unicode® Standard Annex #29, entitled:
//
// UNICODE TEXT SEGMENTATION
//
// Version: Unicode 15.1.0
// Date: 2023-08-16
// Revision: 43
//
// This procedure is conformant[4] to UAX29-C1-1, otherwise known as the
// extended, non-legacy ruleset.
//
// Please see the references for more information.
//
//
// [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
// [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
// [4]: https://www.unicode.org/reports/tr29/#Conformance
// Additionally, this procedure takes into account Standard Annex #11,
// in order to estimate how visually wide the string will appear on a
// monospaced display. This can only ever be a rough guess, as this tends
// to be an implementation detail relating to which fonts are being used,
// how codepoints are interpreted and drawn, if codepoint sequences are
// interpreted correctly, and et cetera.
//
// For example, a program may not properly interpret an emoji modifier
// sequence and print the component glyphs instead of one whole glyph.
//
// See here for more information: https://www.unicode.org/reports/tr11/
//
// NOTE: There is no explicit mention of what to do with zero-width spaces
// as far as grapheme cluster segmentation goes, therefore this
// implementation may count and return graphemes with a `width` of zero.
//
// Treat them as any other space.
ucg_decoder_state state = {0};
#define UCG_DEFERRED_DECODE_STEP() (_ucg_decode_grapheme_clusters_deferred_step(allocator, &state, byte_index, this_rune))
for (i32 byte_index = 0, bytes_advanced = 0; byte_index < str_len; byte_index += bytes_advanced) {
int32_t this_rune = GB_RUNE_INVALID;
bytes_advanced = (i32)(utf8_decode(str+byte_index, str_len-byte_index, &this_rune));
if (this_rune == GB_RUNE_INVALID || bytes_advanced == 0) {
// There was a Unicode parsing error; bail out.
if (out_graphemes != NULL) { *out_graphemes = state.graphemes; }
if (out_rune_count != NULL) { *out_rune_count = state.rune_count; }
if (out_grapheme_count != NULL) { *out_grapheme_count = state.grapheme_count; }
if (out_width != NULL) { *out_width = state.width; }
// Return an error code.
return -1;
}
// "Do not break between a CR and LF. Otherwise, break before and after controls."
//
// GB3: CR × LF
// GB4: (Control | CR | LF) ÷
// GB5: ÷ (Control | CR | LF)
if (this_rune == '\n' && state.last_rune == '\r') {
state.last_rune_breaks_forward = false;
state.bypass_next_rune = false;
UCG_DEFERRED_DECODE_STEP(); continue;
}
if (ucg_is_control(this_rune)) {
state.grapheme_count += 1;
state.last_rune_breaks_forward = true;
state.bypass_next_rune = true;
UCG_DEFERRED_DECODE_STEP(); continue;
}
// (This check is for rules that work forwards, instead of backwards.)
if (state.bypass_next_rune) {
if (state.last_rune_breaks_forward) {
state.grapheme_count += 1;
state.last_rune_breaks_forward = false;
}
state.bypass_next_rune = false;
UCG_DEFERRED_DECODE_STEP(); continue;
}
// (Optimization 1: Prevent low runes from proceeding further.)
//
// * 0xA9 and 0xAE are in the Extended_Pictographic range,
// which is checked later in GB11.
if (this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF) {
state.grapheme_count += 1;
UCG_DEFERRED_DECODE_STEP(); continue;
}
// (Optimization 2: Check if the rune is in the Hangul space before getting specific.)
if (0x1100 <= this_rune && this_rune <= 0xD7FB) {
// "Do not break Hangul syllable sequences."
//
// GB6: L × (L | V | LV | LVT)
// GB7: (LV | V) × (V | T)
// GB8: (LVT | T) × T
if (ucg_is_hangul_syllable_leading(this_rune) ||
ucg_is_hangul_syllable_lv(this_rune) ||
ucg_is_hangul_syllable_lvt(this_rune))
{
if (!ucg_is_hangul_syllable_leading(state.last_rune)) {
state.grapheme_count += 1;
}
UCG_DEFERRED_DECODE_STEP(); continue;
}
if (ucg_is_hangul_syllable_vowel(this_rune)) {
if (ucg_is_hangul_syllable_leading(state.last_rune) ||
ucg_is_hangul_syllable_vowel(state.last_rune) ||
ucg_is_hangul_syllable_lv(state.last_rune))
{
UCG_DEFERRED_DECODE_STEP(); continue;
}
state.grapheme_count += 1;
UCG_DEFERRED_DECODE_STEP(); continue;
}
if (ucg_is_hangul_syllable_trailing(this_rune)) {
if (ucg_is_hangul_syllable_trailing(state.last_rune) ||
ucg_is_hangul_syllable_lvt(state.last_rune) ||
ucg_is_hangul_syllable_lv(state.last_rune) ||
ucg_is_hangul_syllable_vowel(state.last_rune))
{
UCG_DEFERRED_DECODE_STEP(); continue;
}
state.grapheme_count += 1;
UCG_DEFERRED_DECODE_STEP(); continue;
}
}
// "Do not break before extending characters or ZWJ."
//
// GB9: × (Extend | ZWJ)
if (this_rune == ZERO_WIDTH_JOINER) {
state.continue_sequence = true;
UCG_DEFERRED_DECODE_STEP(); continue;
}
if (ucg_is_gcb_extend_class(this_rune)) {
// (Support for GB9c.)
if (state.current_sequence == Indic) {
if (ucg_is_indic_conjunct_break_extend(this_rune) && (
ucg_is_indic_conjunct_break_linker(state.last_rune) ||
ucg_is_indic_conjunct_break_consonant(state.last_rune) ))
{
state.continue_sequence = true;
UCG_DEFERRED_DECODE_STEP(); continue;
}
if (ucg_is_indic_conjunct_break_linker(this_rune) && (
ucg_is_indic_conjunct_break_linker(state.last_rune) ||
ucg_is_indic_conjunct_break_extend(state.last_rune) ||
ucg_is_indic_conjunct_break_consonant(state.last_rune) ))
{
state.continue_sequence = true;
UCG_DEFERRED_DECODE_STEP(); continue;
}
UCG_DEFERRED_DECODE_STEP(); continue;
}
// (Support for GB11.)
if (state.current_sequence == Emoji && (
ucg_is_gcb_extend_class(state.last_rune) ||
ucg_is_emoji_extended_pictographic(state.last_rune) ))
{
state.continue_sequence = true;
}
UCG_DEFERRED_DECODE_STEP(); continue;
}
// _The GB9a and GB9b rules only apply to extended grapheme clusters:_
// "Do not break before SpacingMarks, or after Prepend characters."
//
// GB9a: × SpacingMark
// GB9b: Prepend ×
if (ucg_is_spacing_mark(this_rune)) {
UCG_DEFERRED_DECODE_STEP(); continue;
}
if (ucg_is_gcb_prepend_class(this_rune)) {
state.grapheme_count += 1;
state.bypass_next_rune = true;
UCG_DEFERRED_DECODE_STEP(); continue;
}
// _The GB9c rule only applies to extended grapheme clusters:_
// "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker."
//
// GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant}
if (ucg_is_indic_conjunct_break_consonant(this_rune)) {
if (state.current_sequence == Indic) {
if (state.last_rune == ZERO_WIDTH_JOINER ||
ucg_is_indic_conjunct_break_linker(state.last_rune))
{
state.continue_sequence = true;
} else {
state.grapheme_count += 1;
}
} else {
state.grapheme_count += 1;
state.current_sequence = Indic;
state.continue_sequence = true;
}
UCG_DEFERRED_DECODE_STEP(); continue;
}
if (ucg_is_indic_conjunct_break_extend(this_rune)) {
if (state.current_sequence == Indic) {
if (ucg_is_indic_conjunct_break_consonant(state.last_rune) ||
ucg_is_indic_conjunct_break_linker(state.last_rune))
{
state.continue_sequence = true;
} else {
state.grapheme_count += 1;
}
}
UCG_DEFERRED_DECODE_STEP(); continue;
}
if (ucg_is_indic_conjunct_break_linker(this_rune)) {
if (state.current_sequence == Indic) {
if (ucg_is_indic_conjunct_break_extend(state.last_rune) ||
ucg_is_indic_conjunct_break_linker(state.last_rune))
{
state.continue_sequence = true;
} else {
state.grapheme_count += 1;
}
}
UCG_DEFERRED_DECODE_STEP(); continue;
}
//
// (Curiously, there is no GB10.)
//
// "Do not break within emoji modifier sequences or emoji zwj sequences."
//
// GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
if (ucg_is_emoji_extended_pictographic(this_rune)) {
if (state.current_sequence != Emoji || state.last_rune != ZERO_WIDTH_JOINER) {
state.grapheme_count += 1;
}
state.current_sequence = Emoji;
state.continue_sequence = true;
UCG_DEFERRED_DECODE_STEP(); continue;
}
// "Do not break within emoji flag sequences.
// That is, do not break between regional indicator (RI) symbols
// if there is an odd number of RI characters before the break point."
//
// GB12: sot (RI RI)* RI × RI
// GB13: [^RI] (RI RI)* RI × RI
if (ucg_is_regional_indicator(this_rune)) {
if ((state.regional_indicator_counter & 1) == 0) {
state.grapheme_count += 1;
}
state.current_sequence = Regional;
state.continue_sequence = true;
state.regional_indicator_counter += 1;
UCG_DEFERRED_DECODE_STEP(); continue;
}
// "Otherwise, break everywhere."
//
// GB999: Any ÷ Any
state.grapheme_count += 1;
UCG_DEFERRED_DECODE_STEP();
}
#undef UCG_DEFERRED_DECODE_STEP
if (out_graphemes != NULL) { *out_graphemes = state.graphemes; }
if (out_rune_count != NULL) { *out_rune_count = state.rune_count; }
if (out_grapheme_count != NULL) { *out_grapheme_count = state.grapheme_count; }
if (out_width != NULL) { *out_width = state.width; }
return 0;
}
#undef UCG_TABLE_LEN
#undef ZERO_WIDTH_SPACE
#undef ZERO_WIDTH_NON_JOINER
#undef ZERO_WIDTH_JOINER
#undef WORD_JOINER

2629
src/ucg/ucg_tables.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -162,3 +162,8 @@ end:
if (codepoint_out) *codepoint_out = codepoint;
return width;
}
// NOTE(Feoramund): It's down here because I made UCG use the utf8_decode above to avoid duplicating code.
extern "C" {
#include "ucg/ucg.c"
}