mirror of
https://github.com/odin-lang/Odin.git
synced 2026-01-04 12:07:45 +00:00
269 lines
6.4 KiB
Odin
269 lines
6.4 KiB
Odin
package utf8
|
|
|
|
RUNE_ERROR :: '\ufffd';
|
|
RUNE_SELF :: 0x80;
|
|
RUNE_BOM :: 0xfeff;
|
|
RUNE_EOF :: ~rune(0);
|
|
MAX_RUNE :: '\U0010ffff';
|
|
UTF_MAX :: 4;
|
|
|
|
SURROGATE_MIN :: 0xd800;
|
|
SURROGATE_MAX :: 0xdfff;
|
|
|
|
T1 :: 0b0000_0000;
|
|
TX :: 0b1000_0000;
|
|
T2 :: 0b1100_0000;
|
|
T3 :: 0b1110_0000;
|
|
T4 :: 0b1111_0000;
|
|
T5 :: 0b1111_1000;
|
|
|
|
MASKX :: 0b0011_1111;
|
|
MASK2 :: 0b0001_1111;
|
|
MASK3 :: 0b0000_1111;
|
|
MASK4 :: 0b0000_0111;
|
|
|
|
RUNE1_MAX :: 1<<7 - 1;
|
|
RUNE2_MAX :: 1<<11 - 1;
|
|
RUNE3_MAX :: 1<<16 - 1;
|
|
|
|
// The default lowest and highest continuation byte.
|
|
LOCB :: 0b1000_0000;
|
|
HICB :: 0b1011_1111;
|
|
|
|
Accept_Range :: struct {lo, hi: u8};
|
|
|
|
accept_ranges := [5]Accept_Range{
|
|
{0x80, 0xbf},
|
|
{0xa0, 0xbf},
|
|
{0x80, 0x9f},
|
|
{0x90, 0xbf},
|
|
{0x80, 0x8f},
|
|
};
|
|
|
|
accept_sizes := [256]u8{
|
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0f
|
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1f
|
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2f
|
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3f
|
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4f
|
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5f
|
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6f
|
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7f
|
|
|
|
0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8f
|
|
0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9f
|
|
0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xa0-0xaf
|
|
0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xb0-0xbf
|
|
0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xc0-0xcf
|
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xd0-0xdf
|
|
0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xe0-0xef
|
|
0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xf0-0xff
|
|
};
|
|
|
|
encode_rune :: proc(r: rune) -> ([4]u8, int) {
|
|
buf: [4]u8;
|
|
i := u32(r);
|
|
mask :: u8(0x3f);
|
|
if i <= 1<<7-1 {
|
|
buf[0] = u8(r);
|
|
return buf, 1;
|
|
}
|
|
if i <= 1<<11-1 {
|
|
buf[0] = 0xc0 | u8(r>>6);
|
|
buf[1] = 0x80 | u8(r) & mask;
|
|
return buf, 2;
|
|
}
|
|
|
|
// Invalid or Surrogate range
|
|
if i > 0x0010ffff ||
|
|
(0xd800 <= i && i <= 0xdfff) {
|
|
r = 0xfffd;
|
|
}
|
|
|
|
if i <= 1<<16-1 {
|
|
buf[0] = 0xe0 | u8(r>>12);
|
|
buf[1] = 0x80 | u8(r>>6) & mask;
|
|
buf[2] = 0x80 | u8(r) & mask;
|
|
return buf, 3;
|
|
}
|
|
|
|
buf[0] = 0xf0 | u8(r>>18);
|
|
buf[1] = 0x80 | u8(r>>12) & mask;
|
|
buf[2] = 0x80 | u8(r>>6) & mask;
|
|
buf[3] = 0x80 | u8(r) & mask;
|
|
return buf, 4;
|
|
}
|
|
|
|
decode_rune_from_string :: inline proc(s: string) -> (rune, int) do return decode_rune(cast([]u8)s);
|
|
decode_rune :: proc(s: []u8) -> (rune, int) {
|
|
n := len(s);
|
|
if n < 1 {
|
|
return RUNE_ERROR, 0;
|
|
}
|
|
s0 := s[0];
|
|
x := accept_sizes[s0];
|
|
if x >= 0xF0 {
|
|
mask := rune(x) << 31 >> 31; // NOTE(bill): Create 0x0000 or 0xffff.
|
|
return rune(s[0])&~mask | RUNE_ERROR&mask, 1;
|
|
}
|
|
sz := x & 7;
|
|
accept := accept_ranges[x>>4];
|
|
if n < int(sz) {
|
|
return RUNE_ERROR, 1;
|
|
}
|
|
b1 := s[1];
|
|
if b1 < accept.lo || accept.hi < b1 {
|
|
return RUNE_ERROR, 1;
|
|
}
|
|
if sz == 2 {
|
|
return rune(s0&MASK2)<<6 | rune(b1&MASKX), 2;
|
|
}
|
|
b2 := s[2];
|
|
if b2 < LOCB || HICB < b2 {
|
|
return RUNE_ERROR, 1;
|
|
}
|
|
if sz == 3 {
|
|
return rune(s0&MASK3)<<12 | rune(b1&MASKX)<<6 | rune(b2&MASKX), 3;
|
|
}
|
|
b3 := s[3];
|
|
if b3 < LOCB || HICB < b3 {
|
|
return RUNE_ERROR, 1;
|
|
}
|
|
return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4;
|
|
}
|
|
|
|
|
|
|
|
decode_last_rune_from_string :: inline proc(s: string) -> (rune, int) do return decode_last_rune(cast([]u8)s);
|
|
decode_last_rune :: proc(s: []u8) -> (rune, int) {
|
|
r: rune;
|
|
size: int;
|
|
start, end, limit: int;
|
|
|
|
end = len(s);
|
|
if end == 0 {
|
|
return RUNE_ERROR, 0;
|
|
}
|
|
start = end-1;
|
|
r = rune(s[start]);
|
|
if r < RUNE_SELF {
|
|
return r, 1;
|
|
}
|
|
|
|
|
|
limit = max(end - UTF_MAX, 0);
|
|
|
|
for start-=1; start >= limit; start-=1 {
|
|
if rune_start(s[start]) do break;
|
|
}
|
|
|
|
start = max(start, 0);
|
|
r, size = decode_rune(s[start..end]);
|
|
if start+size != end {
|
|
return RUNE_ERROR, 1;
|
|
}
|
|
return r, size;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
valid_rune :: proc(r: rune) -> bool {
|
|
if r < 0 {
|
|
return false;
|
|
} else if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
|
|
return false;
|
|
} else if r > MAX_RUNE {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
valid_string :: proc(s: string) -> bool {
|
|
n := len(s);
|
|
for i := 0; i < n; {
|
|
si := s[i];
|
|
if si < RUNE_SELF { // ascii
|
|
i += 1;
|
|
continue;
|
|
}
|
|
x := accept_sizes[si];
|
|
if x == 0xf1 {
|
|
return false;
|
|
}
|
|
size := int(x & 7);
|
|
if i+size > n {
|
|
return false;
|
|
}
|
|
ar := accept_ranges[x>>4];
|
|
if b := s[i+1]; b < ar.lo || ar.hi < b {
|
|
return false;
|
|
} else if size == 2 {
|
|
// Okay
|
|
} else if b := s[i+2]; b < 0x80 || 0xbf < b {
|
|
return false;
|
|
} else if size == 3 {
|
|
// Okay
|
|
} else if b := s[i+3]; b < 0x80 || 0xbf < b {
|
|
return false;
|
|
}
|
|
i += size;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
rune_start :: inline proc(b: u8) -> bool do return b&0xc0 != 0x80;
|
|
|
|
rune_count_from_string :: inline proc(s: string) -> int do return rune_count(cast([]u8)s);
|
|
rune_count :: proc(s: []u8) -> int {
|
|
count := 0;
|
|
n := len(s);
|
|
|
|
for i := 0; i < n; {
|
|
defer count += 1;
|
|
si := s[i];
|
|
if si < RUNE_SELF { // ascii
|
|
i += 1;
|
|
continue;
|
|
}
|
|
x := accept_sizes[si];
|
|
if x == 0xf1 {
|
|
i += 1;
|
|
continue;
|
|
}
|
|
size := int(x & 7);
|
|
if i+size > n {
|
|
i += 1;
|
|
continue;
|
|
}
|
|
ar := accept_ranges[x>>4];
|
|
if b := s[i+1]; b < ar.lo || ar.hi < b {
|
|
size = 1;
|
|
} else if size == 2 {
|
|
// Okay
|
|
} else if b := s[i+2]; b < 0x80 || 0xbf < b {
|
|
size = 1;
|
|
} else if size == 3 {
|
|
// Okay
|
|
} else if b := s[i+3]; b < 0x80 || 0xbf < b {
|
|
size = 1;
|
|
}
|
|
i += size;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
|
|
rune_size :: proc(r: rune) -> int {
|
|
switch {
|
|
case r < 0: return -1;
|
|
case r <= 1<<7 - 1: return 1;
|
|
case r <= 1<<11 - 1: return 2;
|
|
case SURROGATE_MIN <= r && r <= SURROGATE_MAX: return -1;
|
|
case r <= 1<<16 - 1: return 3;
|
|
case r <= MAX_RUNE: return 4;
|
|
}
|
|
return -1;
|
|
}
|