// Tokenizes regular expressions.
package regex_tokenizer

/*
	(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
	Made available under Odin's license.

	List of contributors:
		Feoramund: Initial implementation.
*/

import "core:text/regex/common"
import "core:unicode/utf8"
Token_Kind :: enum {
	Invalid,
	EOF,

	Rune,
	Wildcard,

	Alternate,

	Concatenate,

	Repeat_Zero,
	Repeat_Zero_Non_Greedy,
	Repeat_One,
	Repeat_One_Non_Greedy,

	Repeat_N,

	Optional,
	Optional_Non_Greedy,

	Rune_Class,

	Open_Paren,
	Open_Paren_Non_Capture,
	Close_Paren,

	Anchor_Start,
	Anchor_End,

	Word_Boundary,
	Non_Word_Boundary,
}
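// As scanned below, the repeat and optional kinds map onto their usual regex
// spellings: '*' yields Repeat_Zero, '+' Repeat_One, '?' Optional, and
// '{...}' Repeat_N, with a trailing '?' selecting the *_Non_Greedy variant.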
Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  int,
}
Tokenizer :: struct {
	flags: common.Flags,
	src:   string,

	ch:          rune,
	offset:      int,
	read_offset: int,

	last_token_kind: Token_Kind,
	held_token:      Token,
	error_state:     Error,
	paren_depth:     int,
}
Error :: enum {
	None,
	Illegal_Null_Character,
	Illegal_Codepoint,
	Illegal_Byte_Order_Mark,
}
init :: proc(t: ^Tokenizer, str: string, flags: common.Flags) {
	t.src = str
	t.flags = flags
	t.error_state = advance_rune(t)
}
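/*
	A minimal usage sketch (illustrative only; the pattern and the empty
	flag set below are assumptions, not part of this file):

		t: Tokenizer
		init(&t, "a+b", {})
		for tok := scan(&t); tok.kind != .EOF; tok = scan(&t) {
			// tok.kind, tok.text, and tok.pos describe each token.
		}
*/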
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset]
	}
	return 0
}
advance_rune :: proc(t: ^Tokenizer) -> (err: Error) {
	if t.error_state != nil {
		return t.error_state
	}

	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			err = .Illegal_Null_Character
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				err = .Illegal_Codepoint
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				err = .Illegal_Byte_Order_Mark
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		t.ch = -1
	}

	t.error_state = err

	return
}
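// For example, advancing over "a\x00b" reads 'a' normally, then latches
// `error_state` to .Illegal_Null_Character at the NUL byte; `scan` surfaces
// a latched error as a single .Invalid token before clearing it.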
@require_results
scan_class :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset

	for {
		advance_rune(t)
		if t.ch == -1 || t.error_state != nil {
			return "", false
		}

		if t.ch == '\\' {
			advance_rune(t)
			continue
		}

		if t.ch == ']' {
			return t.src[start:t.offset], true
		}
	}

	unreachable()
}
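// For example, with the remaining input `a-z\]]` after an opening '[',
// this returns the class body `a-z\]` (brackets excluded, escape preserved),
// which the caller emits as a single Rune_Class token.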
@require_results
scan_repeat :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset

	for {
		advance_rune(t)
		if t.ch == -1 {
			return "", false
		}
		if t.ch == '}' {
			return t.src[start:t.offset], true
		}
	}

	unreachable()
}
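// For example, "{2,5}" is returned as the inner text "2,5"; parsing and
// validating the bounds is left to the caller.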
@require_results
scan_non_greedy :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) == '?' {
		advance_rune(t)
		return true
	}

	return false
}
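// For example, in "a*?" the '?' following '*' is consumed here, so the
// caller emits .Repeat_Zero_Non_Greedy instead of .Repeat_Zero.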
scan_comment :: proc(t: ^Tokenizer) {
	for {
		advance_rune(t)
		switch t.ch {
		case -1:
			return
		case '\n':
			// UNIX newline.
			advance_rune(t)
			return
		case '\r':
			// Mac newline.
			advance_rune(t)
			if t.ch == '\n' {
				// Windows newline.
				advance_rune(t)
			}
			return
		}
	}
}
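// Note that `scan` only enters this path for a '#' at paren_depth == 0, so
// comments are recognized at the top level of a pattern only.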
@require_results
scan_non_capture_group :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) == '?' && peek_byte(t, 1) == ':' {
		advance_rune(t)
		advance_rune(t)
		return true
	}

	return false
}
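// For example, on "(?:x)" the '?' and ':' after the opening parenthesis are
// consumed here, and `scan` emits .Open_Paren_Non_Capture instead of
// .Open_Paren.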
@require_results
scan :: proc(t: ^Tokenizer) -> (token: Token) {
	kind: Token_Kind
	lit: string
	pos := t.offset

	defer {
		t.last_token_kind = token.kind
	}

	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	if t.held_token != {} {
		popped := t.held_token
		t.held_token = {}

		return popped
	}

	ch_loop: for {
		switch t.ch {
		case -1:
			return { .EOF, "", pos }

		case '\\':
			advance_rune(t)

			if t.ch == -1 {
				return { .EOF, "", pos }
			}

			pos = t.offset

			// @MetaCharacter
			// NOTE: These must be kept in sync with the compiler.
			DIGIT_CLASS :: "0-9"
			SPACE_CLASS :: "\t\n\f\r "
			WORD_CLASS  :: "0-9A-Z_a-z"

			switch t.ch {
			case 'b': kind = .Word_Boundary
			case 'B': kind = .Non_Word_Boundary

			case 'f': kind = .Rune; lit = "\f"
			case 'n': kind = .Rune; lit = "\n"
			case 'r': kind = .Rune; lit = "\r"
			case 't': kind = .Rune; lit = "\t"

			case 'd': kind = .Rune_Class; lit = DIGIT_CLASS
			case 's': kind = .Rune_Class; lit = SPACE_CLASS
			case 'w': kind = .Rune_Class; lit = WORD_CLASS
			case 'D': kind = .Rune_Class; lit = "^" + DIGIT_CLASS
			case 'S': kind = .Rune_Class; lit = "^" + SPACE_CLASS
			case 'W': kind = .Rune_Class; lit = "^" + WORD_CLASS

			case:
				kind = .Rune
				lit = t.src[t.offset:t.read_offset]
			}

		case '.':
			kind = .Wildcard

		case '|': kind = .Alternate

		case '*': kind = .Repeat_Zero_Non_Greedy if scan_non_greedy(t) else .Repeat_Zero
		case '+': kind = .Repeat_One_Non_Greedy  if scan_non_greedy(t) else .Repeat_One
		case '?': kind = .Optional_Non_Greedy    if scan_non_greedy(t) else .Optional

		case '[':
			if text, ok := scan_class(t); ok {
				kind = .Rune_Class
				lit = text
			} else {
				kind = .EOF
			}

		case '{':
			if text, ok := scan_repeat(t); ok {
				kind = .Repeat_N
				lit = text
			} else {
				kind = .EOF
			}

		case '(':
			kind = .Open_Paren_Non_Capture if scan_non_capture_group(t) else .Open_Paren
			t.paren_depth += 1
		case ')':
			kind = .Close_Paren
			t.paren_depth -= 1

		case '^': kind = .Anchor_Start
		case '$':
			kind = .Anchor_End

		case:
			if .Ignore_Whitespace in t.flags {
				switch t.ch {
				case ' ', '\r', '\n', '\t', '\f':
					advance_rune(t)
					continue ch_loop
				case:
					break
				}
			}
			if t.ch == '#' && t.paren_depth == 0 {
				scan_comment(t)
				continue ch_loop
			}

			kind = .Rune
			lit = t.src[t.offset:t.read_offset]
		}

		break ch_loop
	}

	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	advance_rune(t)

	// The following set of rules dictates where Concatenate tokens are
	// automatically inserted.
	#partial switch kind {
	case
		.Close_Paren,
		.Alternate,
		.Optional,    .Optional_Non_Greedy,
		.Repeat_Zero, .Repeat_Zero_Non_Greedy,
		.Repeat_One,  .Repeat_One_Non_Greedy,
		.Repeat_N:
		// Never prepend a Concatenate before these tokens.
		break
	case:
		#partial switch t.last_token_kind {
		case
			.Invalid,
			.Open_Paren, .Open_Paren_Non_Capture,
			.Alternate:
			// Never prepend a Concatenate token when the _last token_ was one
			// of these.
			break
		case:
			t.held_token = { kind, lit, pos }
			return { .Concatenate, "", pos }
		}
	}

	return { kind, lit, pos }
}
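/*
	A worked example of the implicit concatenation rules above: scanning the
	pattern "ab|c" yields the token stream

		Rune("a"), Concatenate, Rune("b"), Alternate, Rune("c"), EOF

	No Concatenate surrounds the Alternate: '|' itself never receives one,
	and no token directly following a '|' receives one either.
*/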