From 876820789e9dedaa6198c4cd145702485e3bd21c Mon Sep 17 00:00:00 2001 From: gingerBill Date: Wed, 27 May 2020 12:54:11 +0100 Subject: [PATCH] Add `rune_is_letter_or_digit` for tokenizer --- src/tokenizer.cpp | 2 +- src/unicode.cpp | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 8361e62a7..a7205f664 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -948,7 +948,7 @@ Token tokenizer_get_token(Tokenizer *t) { Rune curr_rune = t->curr_rune; if (rune_is_letter(curr_rune)) { token.kind = Token_Ident; - while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune)) { + while (rune_is_letter_or_digit(t->curr_rune)) { advance_to_next_rune(t); } diff --git a/src/unicode.cpp b/src/unicode.cpp index b988155f7..83aa8deef 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -32,6 +32,29 @@ bool rune_is_digit(Rune r) { return utf8proc_category(r) == UTF8PROC_CATEGORY_ND; } +bool rune_is_letter_or_digit(Rune r) { /* Returns true when r is '_', an ASCII letter, an ASCII digit, or a rune at or above 0x80 whose utf8proc category is LU, LL, LT, LM, LO or ND. The tokenizer calls this while scanning the rest of a Token_Ident, in place of separate rune_is_letter and rune_is_digit calls. */ + if (r < 0x80) { /* values below 0x80 are decided here without calling utf8proc_category */ + if (r == '_') { + return true; + } + if (((cast(u32)r | 0x20) - 0x61) < 26) { /* OR-ing 0x20 maps 'A'..'Z' onto 'a'..'z'; subtracting 0x61 ('a') leaves 0..25 for letters only. Presumably u32 is unsigned, so anything below 'a' wraps to a large value and fails the test - TODO confirm. */ + return true; + } + return (cast(u32)r - '0') < 10; /* same single-comparison range test, here for '0'..'9' */ + } + switch (utf8proc_category(r)) { /* r >= 0x80: classify by the category utf8proc reports */ + case UTF8PROC_CATEGORY_LU: + case UTF8PROC_CATEGORY_LL: + case UTF8PROC_CATEGORY_LT: + case UTF8PROC_CATEGORY_LM: + case UTF8PROC_CATEGORY_LO: /* the five L* cases above share this one return */ + return true; + case UTF8PROC_CATEGORY_ND: /* the category rune_is_digit compares against */ + return true; + } + return false; /* no default case: every other category leaves the switch and lands here */ +} + bool rune_is_whitespace(Rune r) { switch (r) { case ' ':