Files
Odin/core/unicode/utf8/utf8.odin
2020-01-26 21:10:56 +00:00

343 lines
6.3 KiB
Odin

package utf8
// Replacement character (U+FFFD); returned by the decoders in this package on malformed input.
RUNE_ERROR :: '\ufffd';
// Bytes/runes below this value are plain ASCII and are encoded as a single byte.
RUNE_SELF :: 0x80;
// Byte order mark code point (U+FEFF).
RUNE_BOM :: 0xfeff;
// Rune with every bit set (bitwise complement of zero).
// NOTE(review): presumably an end-of-input sentinel; it is not used in this part of the file.
RUNE_EOF :: ~rune(0);
// Largest rune accepted as valid (U+10FFFF).
MAX_RUNE :: '\U0010ffff';
// Maximum number of bytes in one encoded rune.
UTF_MAX :: 4;
// Inclusive bounds of the surrogate range, which is rejected as invalid.
SURROGATE_MIN :: 0xd800;
SURROGATE_MAX :: 0xdfff;
// Leading-bit patterns of UTF-8 bytes: TX is the continuation-byte prefix (10xxxxxx);
// T2, T3 and T4 are the lead-byte prefixes of 2-, 3- and 4-byte sequences.
// NOTE(review): T1 and T5 are not referenced in this part of the file.
T1 :: 0b0000_0000;
TX :: 0b1000_0000;
T2 :: 0b1100_0000;
T3 :: 0b1110_0000;
T4 :: 0b1111_0000;
T5 :: 0b1111_1000;
// Payload masks: MASKX extracts the 6 data bits of a continuation byte;
// MASK2, MASK3 and MASK4 extract the data bits of a 2-, 3- and 4-byte lead byte.
MASKX :: 0b0011_1111;
MASK2 :: 0b0001_1111;
MASK3 :: 0b0000_1111;
MASK4 :: 0b0000_0111;
// Largest rune values that fit in a 1-, 2- and 3-byte encoding respectively.
RUNE1_MAX :: 1<<7 - 1;
RUNE2_MAX :: 1<<11 - 1;
RUNE3_MAX :: 1<<16 - 1;
// The default lowest and highest continuation byte.
LOCB :: 0b1000_0000;
HICB :: 0b1011_1111;
// Inclusive range [lo, hi] of byte values allowed as the second byte of a multi-byte sequence.
Accept_Range :: struct {lo, hi: u8};
// Second-byte ranges, indexed by the high nibble of the matching accept_sizes entry:
//   0: default continuation range (0x80..0xbf)
//   1: after lead byte 0xe0 - a lower second byte would be an overlong 3-byte encoding
//   2: after lead byte 0xed - a higher second byte would encode a surrogate (U+D800..U+DFFF)
//   3: after lead byte 0xf0 - a lower second byte would be an overlong 4-byte encoding
//   4: after lead byte 0xf4 - a higher second byte would encode a value above U+10FFFF
accept_ranges := [5]Accept_Range{
{0x80, 0xbf},
{0xa0, 0xbf},
{0x80, 0x9f},
{0x90, 0xbf},
{0x80, 0x8f},
};
// Per-lead-byte classification table, indexed by the first byte of a sequence.
// For multi-byte lead bytes the low bits (entry & 7) hold the sequence length in bytes and the
// high nibble (entry >> 4) is the index into accept_ranges for validating the second byte.
// Two special values are used for everything else:
//   0xf0 - the byte is ASCII and stands for itself
//   0xf1 - the byte can never start a valid sequence (stray continuation bytes, 0xc0, 0xc1, 0xf5..0xff)
accept_sizes := [256]u8{
0x00..0x7f = 0xf0,
0x80..0xc1 = 0xf1,
0xc2..0xdf = 0x02,
0xe0 = 0x13,
0xe1..0xec = 0x03,
0xed = 0x23,
0xee..0xef = 0x03,
0xf0 = 0x34,
0xf1..0xf3 = 0x04,
0xf4 = 0x44,
0xf5..0xff = 0xf1,
};
// Encodes the rune `c` as UTF-8.
// Returns a 4-byte buffer together with the number of leading bytes that are in use (1..4);
// the remaining bytes of the buffer are zero.
// Runes that cannot be encoded (negative, above U+10FFFF, or inside the surrogate range
// U+D800..U+DFFF) are replaced by U+FFFD and therefore always produce the 3 bytes EF BF BD.
encode_rune :: proc(c: rune) -> ([4]u8, int) {
	r := c;

	buf: [4]u8;
	// Work on the unsigned value so that negative runes land in the "too large" branch.
	i := u32(r);
	mask :: u8(0x3f);
	if i <= 1<<7-1 {
		buf[0] = u8(r);
		return buf, 1;
	}
	if i <= 1<<11-1 {
		buf[0] = 0xc0 | u8(r>>6);
		buf[1] = 0x80 | u8(r) & mask;
		return buf, 2;
	}

	// Invalid or Surrogate range
	if i > 0x0010ffff ||
	   (0xd800 <= i && i <= 0xdfff) {
		r = 0xfffd;
		// Keep the size selector in sync with the replacement rune; otherwise an
		// out-of-range input would be emitted as an overlong 4-byte form of U+FFFD.
		i = 0xfffd;
	}

	if i <= 1<<16-1 {
		buf[0] = 0xe0 | u8(r>>12);
		buf[1] = 0x80 | u8(r>>6) & mask;
		buf[2] = 0x80 | u8(r) & mask;
		return buf, 3;
	}

	buf[0] = 0xf0 | u8(r>>18);
	buf[1] = 0x80 | u8(r>>12) & mask;
	buf[2] = 0x80 | u8(r>>6) & mask;
	buf[3] = 0x80 | u8(r) & mask;
	return buf, 4;
}
// String flavour of decode_rune: reinterprets the bytes of `s` as a []u8 and decodes the first rune.
// Returns the rune and the number of bytes it occupies.
decode_rune_in_string :: inline proc(s: string) -> (rune, int) {
	return decode_rune(transmute([]u8)s);
}
// Decodes the first UTF-8 sequence in `s`.
// Returns the rune and the number of bytes consumed.
// An empty slice yields (RUNE_ERROR, 0); a malformed or truncated sequence yields (RUNE_ERROR, 1).
decode_rune :: proc(s: []u8) -> (rune, int) {
	length := len(s);
	if length < 1 {
		return RUNE_ERROR, 0;
	}

	lead := s[0];
	info := accept_sizes[lead];
	if info >= 0xF0 {
		// Bit 0 separates the two special table entries:
		// 0xf0 (ASCII, the byte is the rune) and 0xf1 (invalid lead byte).
		if info&1 != 0 {
			return RUNE_ERROR, 1;
		}
		return rune(lead), 1;
	}

	width := int(info & 7);
	if length < width {
		// Not enough bytes left for the announced sequence length.
		return RUNE_ERROR, 1;
	}

	// The second byte has a lead-specific range; later bytes use the default LOCB..HICB range.
	second := accept_ranges[info>>4];
	b1 := s[1];
	if !(second.lo <= b1 && b1 <= second.hi) {
		return RUNE_ERROR, 1;
	}
	if width == 2 {
		return rune(lead&MASK2)<<6 | rune(b1&MASKX), 2;
	}

	b2 := s[2];
	if !(LOCB <= b2 && b2 <= HICB) {
		return RUNE_ERROR, 1;
	}
	if width == 3 {
		return rune(lead&MASK3)<<12 | rune(b1&MASKX)<<6 | rune(b2&MASKX), 3;
	}

	b3 := s[3];
	if !(LOCB <= b3 && b3 <= HICB) {
		return RUNE_ERROR, 1;
	}
	return rune(lead&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4;
}
// Converts `s` into a newly allocated slice holding one element per rune of the string.
// The slice is allocated with `allocator`; its length comes from rune_count_in_string.
string_to_runes :: proc(s: string, allocator := context.allocator) -> (runes: []rune) {
	runes = make([]rune, rune_count_in_string(s), allocator);

	next := 0;
	for ch in s {
		runes[next] = ch;
		next += 1;
	}
	return runes;
}
// Encodes `runes` as UTF-8 into a newly allocated string.
// Works in two passes: the first sums the encoded widths so the buffer can be allocated once
// with `allocator`, the second writes each encoded rune into place.
runes_to_string :: proc(runes: []rune, allocator := context.allocator) -> string {
	total := 0;
	for r in runes {
		_, width := encode_rune(r);
		total += width;
	}

	bytes := make([]byte, total, allocator);

	cursor := 0;
	for r in runes {
		encoded, width := encode_rune(r);
		copy(bytes[cursor:], encoded[:width]);
		cursor += width;
	}

	return string(bytes);
}
// String flavour of decode_last_rune: reinterprets the bytes of `s` as a []u8 and decodes the final rune.
// Returns the rune and the number of bytes it occupies.
decode_last_rune_in_string :: inline proc(s: string) -> (rune, int) {
	return decode_last_rune(transmute([]u8)s);
}
// Decodes the last UTF-8 sequence in `s`.
// Returns the rune and its width in bytes.
// An empty slice yields (RUNE_ERROR, 0); if the trailing bytes do not form exactly one
// well-formed sequence ending at the end of `s`, the result is (RUNE_ERROR, 1).
decode_last_rune :: proc(s: []u8) -> (rune, int) {
	end := len(s);
	if end == 0 {
		return RUNE_ERROR, 0;
	}

	// Fast path: a trailing ASCII byte stands for itself.
	last := rune(s[end-1]);
	if last < RUNE_SELF {
		return last, 1;
	}

	// Walk backwards, at most UTF_MAX bytes from the end, looking for a byte that can start a rune.
	lowest := max(end - UTF_MAX, 0);
	begin := end - 2;
	for begin >= lowest && !rune_start(s[begin]) {
		begin -= 1;
	}
	begin = max(begin, 0);

	// The candidate only counts if decoding it consumes everything up to the end.
	decoded, width := decode_rune(s[begin:end]);
	if begin+width != end {
		return RUNE_ERROR, 1;
	}
	return decoded, width;
}
// Returns the rune at rune index `pos` (0-based, counted in runes rather than bytes).
// Returns RUNE_ERROR when `pos` is negative or past the last rune of `s`.
rune_at_pos :: proc(s: string, pos: int) -> rune {
	if pos < 0 {
		return RUNE_ERROR;
	}

	// Count down instead of up: the wanted rune is reached when nothing remains to skip.
	remaining := pos;
	for ch in s {
		if remaining == 0 {
			return ch;
		}
		remaining -= 1;
	}
	return RUNE_ERROR;
}
// Returns the substring of `s` that holds the rune at rune index `pos` (0-based, counted in runes).
// Returns "" when `pos` is negative or past the last rune of `s`.
// The result aliases `s`; nothing is allocated.
rune_string_at_pos :: proc(s: string, pos: int) -> string {
	if pos < 0 {
		return "";
	}

	i := 0;
	for _, offset in s {
		if i == pos {
			// Take the width from the bytes actually present rather than from rune_size of the
			// decoded rune: an invalid byte iterates as RUNE_ERROR but occupies a single byte,
			// while rune_size(RUNE_ERROR) is 3, which would over-slice or run out of bounds.
			_, w := decode_rune_in_string(s[offset:]);
			return s[offset:][:w];
		}
		i += 1;
	}
	return "";
}
// Decodes and returns the rune that starts at byte offset `byte_index` of `s`.
// The width of the rune is discarded.
rune_at :: proc(s: string, byte_index: int) -> rune {
	tail := s[byte_index:];
	decoded, _ := decode_rune_in_string(tail);
	return decoded;
}
// Finds the byte offset (relative to the beginning of `s`) of the rune with index `pos`,
// where runes are counted from the byte position `start` (0 by default).
// Yields -1 when `pos` is negative or the string ends before that rune is reached.
rune_offset :: proc(s: string, pos: int, start: int = 0) -> int {
	if pos < 0 {
		return -1;
	}

	remaining := pos;
	for _, rel in s[start:] {
		if remaining == 0 {
			// `rel` is relative to the sub-string, so shift it back by `start`.
			return start + rel;
		}
		remaining -= 1;
	}
	return -1;
}
// Reports whether `r` is acceptable as a rune value: non-negative, not above MAX_RUNE,
// and outside the surrogate range SURROGATE_MIN..SURROGATE_MAX.
valid_rune :: proc(r: rune) -> bool {
	if r < 0 || r > MAX_RUNE {
		return false;
	}
	is_surrogate := SURROGATE_MIN <= r && r <= SURROGATE_MAX;
	return !is_surrogate;
}
// Reports whether `s` consists entirely of well-formed UTF-8 sequences.
// Returns false at the first invalid lead byte, truncated sequence, or out-of-range
// continuation byte; an empty string is valid.
valid_string :: proc(s: string) -> bool {
	n := len(s);
	for i := 0; i < n; {
		si := s[i];
		if si < RUNE_SELF { // ascii
			i += 1;
			continue;
		}
		x := accept_sizes[si];
		if x == 0xf1 {
			// This byte can never start a sequence.
			return false;
		}
		size := int(x & 7);
		if i+size > n {
			// The sequence is cut off by the end of the string.
			return false;
		}
		// The second byte has a lead-specific range; the third and fourth bytes must be
		// ordinary continuation bytes (0x80..0xbf).
		ar := accept_ranges[x>>4];
		if b := s[i+1]; b < ar.lo || ar.hi < b {
			return false;
		} else if size == 2 {
			// Okay
		} else if c := s[i+2]; c < 0x80 || 0xbf < c {
			return false;
		} else if size == 3 {
			// Okay
		} else if d := s[i+3]; d < 0x80 || 0xbf < d {
			// Both bounds must test the fourth byte `d`; testing `b` for the lower bound
			// would let a fourth byte below 0x80 slip through.
			return false;
		}
		i += size;
	}
	return true;
}
// Reports whether `b` could be the first byte of an encoded rune,
// i.e. its top two bits are anything other than the continuation prefix 10.
rune_start :: inline proc(b: u8) -> bool {
	return (b >> 6) != 0b10;
}
// String flavour of rune_count: reinterprets the bytes of `s` as a []u8 and counts its runes.
rune_count_in_string :: inline proc(s: string) -> int {
	return rune_count(transmute([]u8)s);
}
// Counts the runes in `s`.
// Every well-formed sequence counts as one rune; every byte that is not part of a
// well-formed sequence (invalid lead byte, truncated or malformed sequence) counts as one rune of width 1.
rune_count :: proc(s: []u8) -> int {
	total := 0;
	n := len(s);

	for i := 0; i < n; {
		// Each pass through the loop accounts for exactly one rune.
		total += 1;

		// Bytes consumed by this rune; stays 1 unless a complete valid sequence is found.
		width := 1;

		lead := s[i];
		if lead >= RUNE_SELF {
			info := accept_sizes[lead];
			size := int(info & 7);
			if info != 0xf1 && i+size <= n {
				second := accept_ranges[info>>4];
				b := s[i+1];
				ok := second.lo <= b && b <= second.hi;
				if ok && size >= 3 {
					c := s[i+2];
					ok = 0x80 <= c && c <= 0xbf;
				}
				if ok && size >= 4 {
					d := s[i+3];
					ok = 0x80 <= d && d <= 0xbf;
				}
				if ok {
					width = size;
				}
			}
		}

		i += width;
	}
	return total;
}
// Returns the number of bytes needed to encode `r` as UTF-8 (1..4),
// or -1 when `r` is negative, a surrogate, or greater than MAX_RUNE.
rune_size :: proc(r: rune) -> int {
	if r < 0 {
		return -1;
	}
	if r <= 1<<7 - 1 {
		return 1;
	}
	if r <= 1<<11 - 1 {
		return 2;
	}
	// Surrogates sit inside the 3-byte range, so they have to be rejected before it.
	if SURROGATE_MIN <= r && r <= SURROGATE_MAX {
		return -1;
	}
	if r <= 1<<16 - 1 {
		return 3;
	}
	if r <= MAX_RUNE {
		return 4;
	}
	return -1;
}