From 49e140f4db1f9fffa541c4d58efa91b7128c4ff4 Mon Sep 17 00:00:00 2001
From: gingerBill
Date: Tue, 10 Nov 2020 16:47:56 +0000
Subject: [PATCH] Add utf8.full_rune

---
 core/unicode/utf8/utf8.odin | 41 +++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/core/unicode/utf8/utf8.odin b/core/unicode/utf8/utf8.odin
index f008c3881..50d24d562 100644
--- a/core/unicode/utf8/utf8.odin
+++ b/core/unicode/utf8/utf8.odin
@@ -350,3 +350,44 @@ rune_size :: proc(r: rune) -> int {
 	}
 	return -1;
 }
+
+// full_rune reports whether the bytes in b begin with a full utf-8 encoding of a rune; an empty b is never a full rune
+// An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR)
+full_rune :: proc(b: []byte) -> bool {
+	n := len(b);
+	if n == 0 {
+		return false;
+	}
+	x := _first[b[0]]; // packed info for the lead byte: low 3 bits = size, high nibble = accept_ranges index
+	if n >= int(x & 7) { // x & 7 is 0 or 1 for ascii/invalid lead bytes, so any non-empty b passes here
+		return true;
+	}
+	accept := accept_ranges[x>>4]; // allowed lo..hi range for the second byte of this sequence
+	if n > 1 && (b[1] < accept.lo || accept.hi < b[1]) { // second byte out of range: invalid, so treated as full
+		return true;
+	} else if n > 2 && (b[2] < LOCB || HICB < b[2]) { // third byte outside LOCB..HICB: invalid, so treated as full
+		return true;
+	}
+	return false; // fewer bytes than the lead byte requires and nothing invalid seen so far
+}
+
+// full_rune_in_string reports whether the bytes in s begin with a full utf-8 encoding of a rune (string form of full_rune)
+// An invalid encoding is considered a full rune since it will convert as an error rune of width 1 (RUNE_ERROR)
+full_rune_in_string :: proc(s: string) -> bool {
+	return full_rune(transmute([]byte)s);
+}
+
+// _first is indexed by a lead byte; full_rune compares the low 3 bits (x & 7) against len(b) and uses the high nibble (x >> 4) to index accept_ranges
+_first := [256]u8{
+	0x00..0x7f = 0xf0, // ascii, size 1
+	0x80..0xc1 = 0xf1, // invalid, size 1
+	0xc2..0xdf = 0x02, // accept 0, size 2
+	0xe0       = 0x13, // accept 1, size 3
+	0xe1..0xec = 0x03, // accept 0, size 3
+	0xed       = 0x23, // accept 2, size 3
+	0xee..0xef = 0x03, // accept 0, size 3
+	0xf0       = 0x34, // accept 3, size 4
+	0xf1..0xf3 = 0x04, // accept 0, size 4
+	0xf4       = 0x44, // accept 4, size 4
+	0xf5..0xff = 0xf1, // invalid, size 1
+};