mirror of
https://github.com/odin-lang/Odin.git
synced 2026-01-01 02:42:09 +00:00
358 lines
6.2 KiB
Odin
358 lines
6.2 KiB
Odin
package regex_tokenizer
|
|
|
|
/*
|
|
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
|
Made available under Odin's BSD-3 license.
|
|
|
|
List of contributors:
|
|
Feoramund: Initial implementation.
|
|
*/
|
|
|
|
import "core:text/regex/common"
|
|
import "core:unicode/utf8"
|
|
|
|
Token_Kind :: enum {
|
|
Invalid,
|
|
EOF,
|
|
|
|
Rune,
|
|
Wildcard,
|
|
|
|
Alternate,
|
|
|
|
Concatenate,
|
|
|
|
Repeat_Zero,
|
|
Repeat_Zero_Non_Greedy,
|
|
Repeat_One,
|
|
Repeat_One_Non_Greedy,
|
|
|
|
Repeat_N,
|
|
|
|
Optional,
|
|
Optional_Non_Greedy,
|
|
|
|
Rune_Class,
|
|
|
|
Open_Paren,
|
|
Open_Paren_Non_Capture,
|
|
Close_Paren,
|
|
|
|
Anchor_Start,
|
|
Anchor_End,
|
|
|
|
Word_Boundary,
|
|
Non_Word_Boundary,
|
|
}
|
|
|
|
Token :: struct {
|
|
kind: Token_Kind,
|
|
text: string,
|
|
pos: int,
|
|
}
|
|
|
|
Tokenizer :: struct {
|
|
flags: common.Flags,
|
|
src: string,
|
|
|
|
ch: rune,
|
|
offset: int,
|
|
read_offset: int,
|
|
|
|
last_token_kind: Token_Kind,
|
|
held_token: Token,
|
|
error_state: Error,
|
|
paren_depth: int,
|
|
}
|
|
|
|
Error :: enum {
|
|
None,
|
|
Illegal_Null_Character,
|
|
Illegal_Codepoint,
|
|
Illegal_Byte_Order_Mark,
|
|
}
|
|
|
|
init :: proc(t: ^Tokenizer, str: string, flags: common.Flags) {
|
|
t.src = str
|
|
t.flags = flags
|
|
t.error_state = advance_rune(t)
|
|
}
|
|
|
|
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
|
|
if t.read_offset+offset < len(t.src) {
|
|
return t.src[t.read_offset+offset]
|
|
}
|
|
return 0
|
|
}
|
|
|
|
advance_rune :: proc(t: ^Tokenizer) -> (err: Error) {
|
|
if t.error_state != nil {
|
|
return t.error_state
|
|
}
|
|
|
|
if t.read_offset < len(t.src) {
|
|
t.offset = t.read_offset
|
|
r, w := rune(t.src[t.read_offset]), 1
|
|
switch {
|
|
case r == 0:
|
|
err = .Illegal_Null_Character
|
|
case r >= utf8.RUNE_SELF:
|
|
r, w = utf8.decode_rune(t.src[t.read_offset:])
|
|
if r == utf8.RUNE_ERROR && w == 1 {
|
|
err = .Illegal_Codepoint
|
|
} else if r == utf8.RUNE_BOM && t.offset > 0 {
|
|
err = .Illegal_Byte_Order_Mark
|
|
}
|
|
}
|
|
t.read_offset += w
|
|
t.ch = r
|
|
} else {
|
|
t.offset = len(t.src)
|
|
t.ch = -1
|
|
}
|
|
|
|
t.error_state = err
|
|
|
|
return
|
|
}
|
|
|
|
@require_results
|
|
scan_class :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
|
|
start := t.read_offset
|
|
|
|
for {
|
|
advance_rune(t)
|
|
if t.ch == -1 || t.error_state != nil {
|
|
return "", false
|
|
}
|
|
|
|
if t.ch == '\\' {
|
|
advance_rune(t)
|
|
continue
|
|
}
|
|
|
|
if t.ch == ']' {
|
|
return t.src[start:t.offset], true
|
|
}
|
|
}
|
|
|
|
unreachable()
|
|
}
|
|
|
|
@require_results
|
|
scan_repeat :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
|
|
start := t.read_offset
|
|
|
|
for {
|
|
advance_rune(t)
|
|
if t.ch == -1 {
|
|
return "", false
|
|
}
|
|
if t.ch == '}' {
|
|
return t.src[start:t.offset], true
|
|
}
|
|
}
|
|
|
|
unreachable()
|
|
}
|
|
|
|
@require_results
|
|
scan_non_greedy :: proc(t: ^Tokenizer) -> bool {
|
|
if peek_byte(t) == '?' {
|
|
advance_rune(t)
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
scan_comment :: proc(t: ^Tokenizer) {
|
|
for {
|
|
advance_rune(t)
|
|
switch t.ch {
|
|
case -1:
|
|
return
|
|
case '\n':
|
|
// UNIX newline.
|
|
advance_rune(t)
|
|
return
|
|
case '\r':
|
|
// Mac newline.
|
|
advance_rune(t)
|
|
if t.ch == '\n' {
|
|
// Windows newline.
|
|
advance_rune(t)
|
|
}
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
@require_results
|
|
scan_non_capture_group :: proc(t: ^Tokenizer) -> bool {
|
|
if peek_byte(t) == '?' && peek_byte(t, 1) == ':' {
|
|
advance_rune(t)
|
|
advance_rune(t)
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
@require_results
|
|
scan :: proc(t: ^Tokenizer) -> (token: Token) {
|
|
kind: Token_Kind
|
|
lit: string
|
|
pos := t.offset
|
|
|
|
defer {
|
|
t.last_token_kind = token.kind
|
|
}
|
|
|
|
if t.error_state != nil {
|
|
t.error_state = nil
|
|
return { .Invalid, "", pos }
|
|
}
|
|
|
|
if t.held_token != {} {
|
|
popped := t.held_token
|
|
t.held_token = {}
|
|
|
|
return popped
|
|
}
|
|
|
|
ch_loop: for {
|
|
switch t.ch {
|
|
case -1:
|
|
return { .EOF, "", pos }
|
|
|
|
case '\\':
|
|
advance_rune(t)
|
|
|
|
if t.ch == -1 {
|
|
return { .EOF, "", pos }
|
|
}
|
|
|
|
pos = t.offset
|
|
|
|
// @MetaCharacter
|
|
// NOTE: These must be kept in sync with the compiler.
|
|
DIGIT_CLASS :: "0-9"
|
|
SPACE_CLASS :: "\t\n\f\r "
|
|
WORD_CLASS :: "0-9A-Z_a-z"
|
|
|
|
switch t.ch {
|
|
case 'b': kind = .Word_Boundary
|
|
case 'B': kind = .Non_Word_Boundary
|
|
|
|
case 'f': kind = .Rune; lit = "\f"
|
|
case 'n': kind = .Rune; lit = "\n"
|
|
case 'r': kind = .Rune; lit = "\r"
|
|
case 't': kind = .Rune; lit = "\t"
|
|
|
|
case 'd': kind = .Rune_Class; lit = DIGIT_CLASS
|
|
case 's': kind = .Rune_Class; lit = SPACE_CLASS
|
|
case 'w': kind = .Rune_Class; lit = WORD_CLASS
|
|
case 'D': kind = .Rune_Class; lit = "^" + DIGIT_CLASS
|
|
case 'S': kind = .Rune_Class; lit = "^" + SPACE_CLASS
|
|
case 'W': kind = .Rune_Class; lit = "^" + WORD_CLASS
|
|
case:
|
|
kind = .Rune
|
|
lit = t.src[t.offset:t.read_offset]
|
|
}
|
|
|
|
case '.':
|
|
kind = .Wildcard
|
|
|
|
case '|': kind = .Alternate
|
|
|
|
case '*': kind = .Repeat_Zero_Non_Greedy if scan_non_greedy(t) else .Repeat_Zero
|
|
case '+': kind = .Repeat_One_Non_Greedy if scan_non_greedy(t) else .Repeat_One
|
|
case '?': kind = .Optional_Non_Greedy if scan_non_greedy(t) else .Optional
|
|
|
|
case '[':
|
|
if text, ok := scan_class(t); ok {
|
|
kind = .Rune_Class
|
|
lit = text
|
|
} else {
|
|
kind = .EOF
|
|
}
|
|
|
|
case '{':
|
|
if text, ok := scan_repeat(t); ok {
|
|
kind = .Repeat_N
|
|
lit = text
|
|
} else {
|
|
kind = .EOF
|
|
}
|
|
|
|
case '(':
|
|
kind = .Open_Paren_Non_Capture if scan_non_capture_group(t) else .Open_Paren
|
|
t.paren_depth += 1
|
|
case ')':
|
|
kind = .Close_Paren
|
|
t.paren_depth -= 1
|
|
|
|
case '^': kind = .Anchor_Start
|
|
case '$':
|
|
kind = .Anchor_End
|
|
|
|
case:
|
|
if .Ignore_Whitespace in t.flags {
|
|
switch t.ch {
|
|
case ' ', '\r', '\n', '\t', '\f':
|
|
advance_rune(t)
|
|
continue ch_loop
|
|
case:
|
|
break
|
|
}
|
|
}
|
|
if t.ch == '#' && t.paren_depth == 0 {
|
|
scan_comment(t)
|
|
continue ch_loop
|
|
}
|
|
|
|
kind = .Rune
|
|
lit = t.src[t.offset:t.read_offset]
|
|
}
|
|
|
|
break ch_loop
|
|
}
|
|
|
|
if t.error_state != nil {
|
|
t.error_state = nil
|
|
return { .Invalid, "", pos }
|
|
}
|
|
|
|
advance_rune(t)
|
|
|
|
// The following set of rules dictate where Concatenate tokens are
|
|
// automatically inserted.
|
|
#partial switch kind {
|
|
case
|
|
.Close_Paren,
|
|
.Alternate,
|
|
.Optional, .Optional_Non_Greedy,
|
|
.Repeat_Zero, .Repeat_Zero_Non_Greedy,
|
|
.Repeat_One, .Repeat_One_Non_Greedy,
|
|
.Repeat_N:
|
|
// Never prepend a Concatenate before these tokens.
|
|
break
|
|
case:
|
|
#partial switch t.last_token_kind {
|
|
case
|
|
.Invalid,
|
|
.Open_Paren, .Open_Paren_Non_Capture,
|
|
.Alternate:
|
|
// Never prepend a Concatenate token when the _last token_ was one
|
|
// of these.
|
|
break
|
|
case:
|
|
t.held_token = { kind, lit, pos }
|
|
return { .Concatenate, "", pos }
|
|
}
|
|
}
|
|
|
|
return { kind, lit, pos }
|
|
}
|