diff --git a/src/checker.cpp b/src/checker.cpp
index de69bd854..f81017c38 100644
--- a/src/checker.cpp
+++ b/src/checker.cpp
@@ -3512,7 +3512,7 @@ bool is_string_an_identifier(String s) {
 	while (offset < s.len) {
 		bool ok = false;
 		Rune r = -1;
-		isize size = gb_utf8_decode(s.text+offset, s.len-offset, &r);
+		isize size = utf8_decode(s.text+offset, s.len-offset, &r);
 		if (offset == 0) {
 			ok = rune_is_letter(r);
 		} else {
diff --git a/src/exact_value.cpp b/src/exact_value.cpp
index cd5ba6579..c8e1ae99f 100644
--- a/src/exact_value.cpp
+++ b/src/exact_value.cpp
@@ -317,7 +317,7 @@ ExactValue exact_value_from_basic_literal(Token token) {
 	}
 	case Token_Rune: {
 		Rune r = GB_RUNE_INVALID;
-		gb_utf8_decode(token.string.text, token.string.len, &r);
+		utf8_decode(token.string.text, token.string.len, &r);
 		return exact_value_i64(r);
 	}
 	default:
diff --git a/src/gb/gb.h b/src/gb/gb.h
index 187b6628b..848ad8afc 100644
--- a/src/gb/gb.h
+++ b/src/gb/gb.h
@@ -8232,7 +8232,9 @@ gbFileContents gb_file_read_contents(gbAllocator a, b32 zero_terminate, char con
 	if (gb_file_open(&file, filepath) == gbFileError_None) {
 		isize file_size = cast(isize)gb_file_size(&file);
 		if (file_size > 0) {
-			result.data = gb_alloc(a, zero_terminate ? file_size+1 : file_size);
+			isize total_size = file_size + !!zero_terminate;
+			total_size = (total_size+7)&~7;
+			result.data = gb_alloc(a, total_size);
 			result.size = file_size;
 			gb_file_read_at(&file, result.data, result.size, 0);
 			if (zero_terminate) {
diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp
index a59e1baa9..743d465e8 100644
--- a/src/llvm_backend.cpp
+++ b/src/llvm_backend.cpp
@@ -4712,7 +4712,7 @@ void lb_build_unroll_range_stmt(lbProcedure *p, AstUnrollRangeStmt *rs, Scope *s
 				Rune codepoint = 0;
 				isize offset = 0;
 				do {
-					isize width = gb_utf8_decode(str.text+offset, str.len-offset, &codepoint);
+					isize width = utf8_decode(str.text+offset, str.len-offset, &codepoint);
 					if (val0_type) lb_addr_store(p, val0_addr, lb_const_value(m, val0_type, exact_value_i64(codepoint)));
 					if (val1_type) lb_addr_store(p, val1_addr, lb_const_value(m, val1_type, exact_value_i64(offset)));
 					lb_build_stmt(p, rs->body);
@@ -6563,7 +6563,7 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
 				LLVMValueRef *elems = gb_alloc_array(permanent_allocator(), LLVMValueRef, cast(isize)count);
 
 				for (i64 i = 0; i < count && offset < s.len; i++) {
-					width = gb_utf8_decode(s.text+offset, s.len-offset, &rune);
+					width = utf8_decode(s.text+offset, s.len-offset, &rune);
 					offset += width;
 
 					elems[i] = LLVMConstInt(et, rune, true);
diff --git a/src/main.cpp b/src/main.cpp
index 27d69d428..3b3cb49d3 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -561,7 +561,7 @@ bool string_is_valid_identifier(String str) {
 	isize offset = 0;
 	while (offset < str.len) {
 		Rune r = 0;
-		w = gb_utf8_decode(str.text, str.len, &r);
+		w = utf8_decode(str.text, str.len, &r);
 		if (r == GB_RUNE_INVALID) {
 			return false;
 		}
diff --git a/src/parser.cpp b/src/parser.cpp
index f20c29558..911082b01 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -1705,7 +1705,7 @@ bool is_foreign_name_valid(String name) {
 	while (offset < name.len) {
 		Rune rune;
 		isize remaining = name.len - offset;
-		isize width = gb_utf8_decode(name.text+offset, remaining, &rune);
+		isize width = utf8_decode(name.text+offset, remaining, &rune);
 		if (rune == GB_RUNE_INVALID && width == 1) {
 			return false;
 		} else if (rune == GB_RUNE_BOM && remaining > 0) {
@@ -4612,7 +4612,7 @@ ParseFileError init_ast_file(AstFile *f, String fullpath, TokenPos *err_pos) {
 
 	u64 start = time_stamp_time_now();
 
-	while (f->curr_token.kind != Token_EOF) {
+	for (;;) {
 		Token *token = array_add_and_get(&f->tokens);
 		tokenizer_get_token(&f->tokenizer, token);
 		if (token->kind == Token_Invalid) {
@@ -4887,7 +4887,7 @@ bool is_import_path_valid(String path) {
 		isize width = 1;
 		Rune r = *curr;
 		if (r >= 0x80) {
-			width = gb_utf8_decode(curr, end-curr, &r);
+			width = utf8_decode(curr, end-curr, &r);
 			if (r == GB_RUNE_INVALID && width == 1) {
 				return false;
 			}
@@ -4920,7 +4920,7 @@ bool is_build_flag_path_valid(String path) {
 		isize width = 1;
 		Rune r = *curr;
 		if (r >= 0x80) {
-			width = gb_utf8_decode(curr, end-curr, &r);
+			width = utf8_decode(curr, end-curr, &r);
 			if (r == GB_RUNE_INVALID && width == 1) {
 				return false;
 			}
@@ -5170,7 +5170,7 @@ String build_tag_get_token(String s, String *out) {
 	isize n = 0;
 	while (n < s.len) {
 		Rune rune = 0;
-		isize width = gb_utf8_decode(&s[n], s.len-n, &rune);
+		isize width = utf8_decode(&s[n], s.len-n, &rune);
 		if (n == 0 && rune == '!') {
 
 		} else if (!rune_is_letter(rune) && !rune_is_digit(rune)) {
diff --git a/src/string.cpp b/src/string.cpp
index 4e8273f60..0764009bc 100644
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -500,7 +500,7 @@ String quote_to_ascii(gbAllocator a, String str, u8 quote='"') {
 		Rune r = cast(Rune)s[0];
 		width = 1;
 		if (r >= 0x80) {
-			width = gb_utf8_decode(s, n, &r);
+			width = utf8_decode(s, n, &r);
 		}
 		if (width == 1 && r == GB_RUNE_INVALID) {
 			array_add(&buf, cast(u8)'\\');
@@ -576,7 +576,7 @@ bool unquote_char(String s, u8 quote, Rune *rune, bool *multiple_bytes, String *
 		return false;
 	} else if (s[0] >= 0x80) {
 		Rune r = -1;
-		isize size = gb_utf8_decode(s.text, s.len, &r);
+		isize size = utf8_decode(s.text, s.len, &r);
 		*rune = r;
 		*multiple_bytes = true;
 		*tail_string = make_string(s.text+size, s.len-size);
@@ -736,7 +736,7 @@ i32 unquote_string(gbAllocator a, String *s_, u8 quote=0, bool has_carriage_retu
 		return 1;
 	} else if (quote == '\'') {
 		Rune r = GB_RUNE_INVALID;
-		isize size = gb_utf8_decode(s.text, s.len, &r);
+		isize size = utf8_decode(s.text, s.len, &r);
 		if ((size == s.len) && (r != -1 || size != 1)) {
 			*s_ = s;
 			return 1;
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index a073abc37..d375ca05d 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -789,26 +789,27 @@ void tokenizer_err(Tokenizer *t, TokenPos const &pos, char const *msg, ...) {
 
 void advance_to_next_rune(Tokenizer *t) {
 	if (t->read_curr < t->end) {
-		Rune rune;
-		isize width = 1;
-
 		t->curr = t->read_curr;
 		if (t->curr_rune == '\n') {
 			t->line = t->curr;
 			t->line_count++;
 		}
-		rune = *t->read_curr;
+
+		Rune rune = *t->read_curr;
 		if (rune == 0) {
 			tokenizer_err(t, "Illegal character NUL");
-		} else if (rune >= 0x80) { // not ASCII
-			width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
+			t->read_curr++;
+		} else if (rune & 0x80) { // not ASCII
+			isize width = utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
+			t->read_curr += width;
 			if (rune == GB_RUNE_INVALID && width == 1) {
 				tokenizer_err(t, "Illegal UTF-8 encoding");
 			} else if (rune == GB_RUNE_BOM && t->curr-t->start > 0){
 				tokenizer_err(t, "Illegal byte order mark");
 			}
+		} else {
+			t->read_curr++;
 		}
-		t->read_curr += width;
 		t->curr_rune = rune;
 	} else {
 		t->curr = t->end;
@@ -820,7 +821,28 @@ void advance_to_next_rune(Tokenizer *t) {
 	}
 }
 
-TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath, TokenizerFlags flags = TokenizerFlag_None) {
+void init_tokenizer_with_file_contents(Tokenizer *t, String const &fullpath, gbFileContents *fc, TokenizerFlags flags) {
+	t->flags = flags;
+	t->fullpath = fullpath;
+	t->line_count = 1;
+
+	t->start = cast(u8 *)fc->data;
+	t->line = t->read_curr = t->curr = t->start;
+	t->end = t->start + fc->size;
+
+	advance_to_next_rune(t);
+	if (t->curr_rune == GB_RUNE_BOM) {
+		advance_to_next_rune(t); // Ignore BOM at file beginning
+	}
+
+	if (t->allocated_strings.count != 0) {
+		array_clear(&t->allocated_strings);
+	} else {
+		array_init(&t->allocated_strings, heap_allocator());
+	}
+}
+
+TokenizerInitError init_tokenizer(Tokenizer *t, String const &fullpath, TokenizerFlags flags = TokenizerFlag_None) {
 	TokenizerInitError err = TokenizerInit_None;
 
 	char *c_str = alloc_cstring(heap_allocator(), fullpath);
@@ -829,25 +851,18 @@ TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath, TokenizerFlags
 	// TODO(bill): Memory map rather than copy contents
 	gbFileContents fc = gb_file_read_contents(heap_allocator(), true, c_str);
 
-	t->flags = flags;
-	t->fullpath = fullpath;
-	t->line_count = 1;
-
 	if (fc.size > I32_MAX) {
+		t->flags = flags;
+		t->fullpath = fullpath;
+		t->line_count = 1;
 		err = TokenizerInit_FileTooLarge;
 		gb_file_free_contents(&fc);
 	} else if (fc.data != nullptr) {
-		t->start = cast(u8 *)fc.data;
-		t->line = t->read_curr = t->curr = t->start;
-		t->end = t->start + fc.size;
-
-		advance_to_next_rune(t);
-		if (t->curr_rune == GB_RUNE_BOM) {
-			advance_to_next_rune(t); // Ignore BOM at file beginning
-		}
-
-		array_init(&t->allocated_strings, heap_allocator());
+		init_tokenizer_with_file_contents(t, fullpath, &fc, flags);
 	} else {
+		t->flags = flags;
+		t->fullpath = fullpath;
+		t->line_count = 1;
 		gbFile f = {};
 		gbFileError file_err = gb_file_open(&f, c_str);
 		defer (gb_file_close(&f));
@@ -1093,8 +1108,24 @@ bool scan_escape(Tokenizer *t) {
 }
 
 
-void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
+gb_inline void tokenizer_skip_line(Tokenizer *t) {
+#if 0
+	while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
+		advance_to_next_rune(t);
+	}
+#else
+	while (t->read_curr != t->end && t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
+		t->curr = t->read_curr;
+		t->curr_rune = *t->read_curr;
+		if (t->curr_rune == 0) {
+			tokenizer_err(t, "Illegal character NUL");
+		}
+		t->read_curr++;
+	}
+#endif
+}
 
+void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
 	// Skip whitespace
 	if (t->flags & TokenizerFlag_InsertSemicolon && t->insert_semicolon) {
 		for (;;) {
@@ -1405,10 +1436,7 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
 			token->kind = Token_Hash;
 			if (t->curr_rune == '!') {
 				token->kind = Token_Comment;
-
-				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
-					advance_to_next_rune(t);
-				}
+				tokenizer_skip_line(t);
 			}
 			break;
 		case '/':
@@ -1416,9 +1444,7 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
 			switch (t->curr_rune) {
 			case '/':
 				token->kind = Token_Comment;
-				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
-					advance_to_next_rune(t);
-				}
+				tokenizer_skip_line(t);
 				break;
 			case '*':
 				token->kind = Token_Comment;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 83aa8deef..e33fb793b 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -65,3 +65,106 @@ bool rune_is_whitespace(Rune r) {
 	}
 	return false;
 }
+
+
+// Lead-byte classification table used by utf8_decode.
+// Low 3 bits: total sequence length; high 4 bits: index into global__utf8_accept_ranges
+// for the second byte. 0xf0 marks ASCII and 0xf1 marks a byte that cannot start a sequence.
+gb_global u8 const global__utf8_first[256] = {
+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0F
+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1F
+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2F
+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3F
+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4F
+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5F
+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6F
+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7F
+	0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8F
+	0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9F
+	0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xA0-0xAF
+	0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xB0-0xBF
+	0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xC0-0xCF
+	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xD0-0xDF
+	0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xE0-0xEF
+	0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xF0-0xFF
+};
+
+// Inclusive range of values accepted for the second byte of a multi-byte sequence.
+typedef struct Utf8AcceptRange {
+	u8 lo, hi;
+} Utf8AcceptRange;
+
+gb_global Utf8AcceptRange const global__utf8_accept_ranges[] = {
+	{0x80, 0xbf},
+	{0xa0, 0xbf},
+	{0x80, 0x9f},
+	{0x90, 0xbf},
+	{0x80, 0x8f},
+};
+
+
+// Decodes the first UTF-8 sequence found in str[0 .. str_len).
+// Stores the decoded rune in *codepoint_out (when non-null) and returns the number of bytes consumed.
+// An invalid or truncated sequence yields GB_RUNE_INVALID with a width of 1;
+// an empty input yields GB_RUNE_INVALID with a width of 0.
+isize utf8_decode(u8 const *str, isize str_len, Rune *codepoint_out) {
+	isize width = 0;
+	Rune codepoint = GB_RUNE_INVALID;
+
+	if (str_len > 0) {
+		u8 s0 = str[0];
+		u8 x = global__utf8_first[s0], sz;
+		u8 b1, b2, b3;
+		Utf8AcceptRange accept;
+		if (x >= 0xf0) {
+			// 0xf0 is ASCII and 0xf1 is an invalid lead byte: test the low bit directly
+			// instead of shifting a signed Rune into its sign bit to build a mask.
+			codepoint = (x & 1) ? GB_RUNE_INVALID : cast(Rune)s0;
+			width = 1;
+			goto end;
+		}
+
+		sz = x&7;
+		accept = global__utf8_accept_ranges[x>>4];
+		// Compare against the sequence length itself: gb_size_of(sz) is always 1 and
+		// would let the reads below run past the end of a truncated sequence.
+		if (str_len < sz)
+			goto invalid_codepoint;
+
+		b1 = str[1];
+		if (b1 < accept.lo || accept.hi < b1)
+			goto invalid_codepoint;
+
+		if (sz == 2) {
+			codepoint = (cast(Rune)s0&0x1f)<<6 | (cast(Rune)b1&0x3f);
+			width = 2;
+			goto end;
+		}
+
+		b2 = str[2];
+		if (!gb_is_between(b2, 0x80, 0xbf))
+			goto invalid_codepoint;
+
+		if (sz == 3) {
+			codepoint = (cast(Rune)s0&0x1f)<<12 | (cast(Rune)b1&0x3f)<<6 | (cast(Rune)b2&0x3f);
+			width = 3;
+			goto end;
+		}
+
+		b3 = str[3];
+		if (!gb_is_between(b3, 0x80, 0xbf))
+			goto invalid_codepoint;
+
+		codepoint = (cast(Rune)s0&0x07)<<18 | (cast(Rune)b1&0x3f)<<12 | (cast(Rune)b2&0x3f)<<6 | (cast(Rune)b3&0x3f);
+		width = 4;
+		goto end;
+
+	invalid_codepoint:
+		codepoint = GB_RUNE_INVALID;
+		width = 1;
+	}
+
+end:
+	if (codepoint_out) *codepoint_out = codepoint;
+	return width;
+}