mirror of
https://github.com/odin-lang/Odin.git
synced 2025-12-29 17:34:34 +00:00
591 lines
13 KiB
Odin
591 lines
13 KiB
Odin
package regex_parser
|
|
|
|
/*
|
|
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
|
Made available under Odin's BSD-3 license.
|
|
|
|
List of contributors:
|
|
Feoramund: Initial implementation.
|
|
*/
|
|
|
|
import "base:intrinsics"
|
|
import "core:strconv"
|
|
import "core:strings"
|
|
import "core:text/regex/common"
|
|
import "core:text/regex/tokenizer"
|
|
import "core:unicode"
|
|
import "core:unicode/utf8"
|
|
|
|
Token :: tokenizer.Token
|
|
Token_Kind :: tokenizer.Token_Kind
|
|
Tokenizer :: tokenizer.Tokenizer
|
|
|
|
Rune_Class_Range :: struct {
|
|
lower, upper: rune,
|
|
}
|
|
Rune_Class_Data :: struct {
|
|
runes: [dynamic]rune,
|
|
ranges: [dynamic]Rune_Class_Range,
|
|
}
|
|
|
|
|
|
Node_Rune :: struct {
|
|
data: rune,
|
|
}
|
|
|
|
Node_Rune_Class :: struct {
|
|
negating: bool,
|
|
using data: Rune_Class_Data,
|
|
}
|
|
|
|
Node_Wildcard :: struct {}
|
|
|
|
Node_Alternation :: struct {
|
|
left, right: Node,
|
|
}
|
|
|
|
Node_Concatenation :: struct {
|
|
nodes: [dynamic]Node,
|
|
}
|
|
|
|
Node_Repeat_Zero :: struct {
|
|
inner: Node,
|
|
}
|
|
Node_Repeat_Zero_Non_Greedy :: struct {
|
|
inner: Node,
|
|
}
|
|
Node_Repeat_One :: struct {
|
|
inner: Node,
|
|
}
|
|
Node_Repeat_One_Non_Greedy :: struct {
|
|
inner: Node,
|
|
}
|
|
|
|
Node_Repeat_N :: struct {
|
|
inner: Node,
|
|
lower, upper: int,
|
|
}
|
|
|
|
Node_Optional :: struct {
|
|
inner: Node,
|
|
}
|
|
Node_Optional_Non_Greedy :: struct {
|
|
inner: Node,
|
|
}
|
|
|
|
Node_Group :: struct {
|
|
inner: Node,
|
|
capture_id: int,
|
|
capture: bool,
|
|
}
|
|
|
|
Node_Anchor :: struct {
|
|
start: bool,
|
|
}
|
|
Node_Word_Boundary :: struct {
|
|
non_word: bool,
|
|
}
|
|
|
|
Node_Match_All_And_Escape :: struct {}
|
|
|
|
Node :: union {
|
|
^Node_Rune,
|
|
^Node_Rune_Class,
|
|
^Node_Wildcard,
|
|
^Node_Concatenation,
|
|
^Node_Alternation,
|
|
^Node_Repeat_Zero,
|
|
^Node_Repeat_Zero_Non_Greedy,
|
|
^Node_Repeat_One,
|
|
^Node_Repeat_One_Non_Greedy,
|
|
^Node_Repeat_N,
|
|
^Node_Optional,
|
|
^Node_Optional_Non_Greedy,
|
|
^Node_Group,
|
|
^Node_Anchor,
|
|
^Node_Word_Boundary,
|
|
|
|
// Optimized nodes (not created by the Parser):
|
|
^Node_Match_All_And_Escape,
|
|
}
|
|
|
|
|
|
left_binding_power :: proc(kind: Token_Kind) -> int {
|
|
#partial switch kind {
|
|
case .Alternate: return 1
|
|
case .Concatenate: return 2
|
|
case .Repeat_Zero, .Repeat_One,
|
|
.Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy,
|
|
.Repeat_N: return 3
|
|
case .Optional,
|
|
.Optional_Non_Greedy: return 4
|
|
case .Open_Paren,
|
|
.Open_Paren_Non_Capture: return 9
|
|
}
|
|
return 0
|
|
}
|
|
|
|
|
|
Expected_Token :: struct {
|
|
pos: int,
|
|
kind: Token_Kind,
|
|
}
|
|
|
|
Invalid_Repetition :: struct {
|
|
pos: int,
|
|
}
|
|
|
|
Invalid_Token :: struct {
|
|
pos: int,
|
|
kind: Token_Kind,
|
|
}
|
|
|
|
Invalid_Unicode :: struct {
|
|
pos: int,
|
|
}
|
|
|
|
Too_Many_Capture_Groups :: struct {
|
|
pos: int,
|
|
}
|
|
|
|
Unexpected_EOF :: struct {
|
|
pos: int,
|
|
}
|
|
|
|
Error :: union {
|
|
Expected_Token,
|
|
Invalid_Repetition,
|
|
Invalid_Token,
|
|
Invalid_Unicode,
|
|
Too_Many_Capture_Groups,
|
|
Unexpected_EOF,
|
|
}
|
|
|
|
|
|
Parser :: struct {
|
|
flags: common.Flags,
|
|
t: Tokenizer,
|
|
|
|
cur_token: Token,
|
|
|
|
groups: int,
|
|
}
|
|
|
|
|
|
@require_results
|
|
advance :: proc(p: ^Parser) -> Error {
|
|
p.cur_token = tokenizer.scan(&p.t)
|
|
if p.cur_token.kind == .Invalid {
|
|
return Invalid_Unicode { pos = 0 }
|
|
}
|
|
return nil
|
|
}
|
|
|
|
expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) {
|
|
if p.cur_token.kind == kind {
|
|
advance(p) or_return
|
|
return
|
|
}
|
|
|
|
return Expected_Token{
|
|
pos = p.t.offset,
|
|
kind = kind,
|
|
}
|
|
}
|
|
|
|
null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) {
|
|
#partial switch token.kind {
|
|
case .Rune:
|
|
r: rune
|
|
for ru in token.text {
|
|
r = ru
|
|
break
|
|
}
|
|
assert(r != 0, "Parsed an empty Rune token.")
|
|
|
|
if .Case_Insensitive in p.flags {
|
|
lower := unicode.to_lower(r)
|
|
upper := unicode.to_upper(r)
|
|
if lower != upper {
|
|
node := new(Node_Rune_Class)
|
|
append(&node.runes, lower)
|
|
append(&node.runes, upper)
|
|
return node, nil
|
|
}
|
|
}
|
|
|
|
node := new(Node_Rune)
|
|
node ^= { r }
|
|
return node, nil
|
|
|
|
case .Rune_Class:
|
|
if len(token.text) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
node := new(Node_Rune_Class)
|
|
|
|
#no_bounds_check for i := 0; i < len(token.text); /**/ {
|
|
r, size := utf8.decode_rune(token.text[i:])
|
|
if i == 0 && r == '^' {
|
|
node.negating = true
|
|
i += size
|
|
continue
|
|
}
|
|
i += size
|
|
|
|
assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
|
|
|
|
if r == '\\' {
|
|
next_r, next_size := utf8.decode_rune(token.text[i:])
|
|
i += next_size
|
|
assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
|
|
|
|
// @MetaCharacter
|
|
// NOTE: These must be kept in sync with the tokenizer.
|
|
switch next_r {
|
|
case 'f': append(&node.runes, '\f')
|
|
case 'n': append(&node.runes, '\n')
|
|
case 'r': append(&node.runes, '\r')
|
|
case 't': append(&node.runes, '\t')
|
|
|
|
case 'd':
|
|
append(&node.ranges, Rune_Class_Range{ '0', '9' })
|
|
case 's':
|
|
append(&node.runes, '\t')
|
|
append(&node.runes, '\n')
|
|
append(&node.runes, '\f')
|
|
append(&node.runes, '\r')
|
|
append(&node.runes, ' ')
|
|
case 'w':
|
|
append(&node.ranges, Rune_Class_Range{ '0', '9' })
|
|
append(&node.ranges, Rune_Class_Range{ 'A', 'Z' })
|
|
append(&node.runes, '_')
|
|
append(&node.ranges, Rune_Class_Range{ 'a', 'z' })
|
|
case 'D':
|
|
append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
|
|
append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) })
|
|
case 'S':
|
|
append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 })
|
|
// \t and \n are adjacent.
|
|
append(&node.runes, '\x0b') // Vertical Tab
|
|
append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 })
|
|
append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) })
|
|
case 'W':
|
|
append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
|
|
append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 })
|
|
append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 })
|
|
append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 })
|
|
append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) })
|
|
case:
|
|
append(&node.runes, next_r)
|
|
}
|
|
continue
|
|
}
|
|
|
|
if r == '-' && len(node.runes) > 0 {
|
|
next_r, next_size := utf8.decode_rune(token.text[i:])
|
|
if next_size > 0 {
|
|
last := pop(&node.runes)
|
|
i += next_size
|
|
|
|
append(&node.ranges, Rune_Class_Range{ last, next_r })
|
|
continue
|
|
}
|
|
}
|
|
|
|
append(&node.runes, r)
|
|
}
|
|
|
|
if .Case_Insensitive in p.flags {
|
|
// These two loops cannot be in the form of `for x in y` because
|
|
// they append to the data that they iterate over.
|
|
length := len(node.runes)
|
|
#no_bounds_check for i := 0; i < length; i += 1 {
|
|
r := node.runes[i]
|
|
lower := unicode.to_lower(r)
|
|
upper := unicode.to_upper(r)
|
|
|
|
if lower != upper {
|
|
if lower != r {
|
|
append(&node.runes, lower)
|
|
} else {
|
|
append(&node.runes, upper)
|
|
}
|
|
}
|
|
}
|
|
|
|
length = len(node.ranges)
|
|
#no_bounds_check for i := 0; i < length; i += 1 {
|
|
range := &node.ranges[i]
|
|
|
|
min_lower := unicode.to_lower(range.lower)
|
|
max_lower := unicode.to_lower(range.upper)
|
|
|
|
min_upper := unicode.to_upper(range.lower)
|
|
max_upper := unicode.to_upper(range.upper)
|
|
|
|
if min_lower != min_upper && max_lower != max_upper {
|
|
range.lower = min_lower
|
|
range.upper = max_lower
|
|
append(&node.ranges, Rune_Class_Range{ min_upper, max_upper })
|
|
}
|
|
}
|
|
}
|
|
|
|
result = node
|
|
|
|
case .Wildcard:
|
|
node := new(Node_Wildcard)
|
|
result = node
|
|
|
|
case .Open_Paren:
|
|
// Because of the recursive nature of the token parser, we take the
|
|
// group number first instead of afterwards, in order to construct
|
|
// group matches from the outside in.
|
|
p.groups += 1
|
|
if p.groups == common.MAX_CAPTURE_GROUPS {
|
|
return nil, Too_Many_Capture_Groups{ pos = token.pos }
|
|
}
|
|
this_group := p.groups
|
|
|
|
node := new(Node_Group)
|
|
node.capture = true
|
|
node.capture_id = this_group
|
|
|
|
node.inner = parse_expression(p, 0) or_return
|
|
expect(p, .Close_Paren) or_return
|
|
result = node
|
|
case .Open_Paren_Non_Capture:
|
|
node := new(Node_Group)
|
|
node.inner = parse_expression(p, 0) or_return
|
|
expect(p, .Close_Paren) or_return
|
|
result = node
|
|
case .Close_Paren:
|
|
node := new(Node_Rune)
|
|
node ^= { ')' }
|
|
return node, nil
|
|
|
|
case .Anchor_Start:
|
|
node := new(Node_Anchor)
|
|
node.start = true
|
|
result = node
|
|
case .Anchor_End:
|
|
node := new(Node_Anchor)
|
|
result = node
|
|
case .Word_Boundary:
|
|
node := new(Node_Word_Boundary)
|
|
result = node
|
|
case .Non_Word_Boundary:
|
|
node := new(Node_Word_Boundary)
|
|
node.non_word = true
|
|
result = node
|
|
|
|
case .Alternate:
|
|
// A unary alternation with a left-side empty path, i.e. `|a`.
|
|
right, right_err := parse_expression(p, left_binding_power(.Alternate))
|
|
#partial switch specific in right_err {
|
|
case Unexpected_EOF:
|
|
// This token is a NOP, i.e. `|`.
|
|
break
|
|
case nil:
|
|
break
|
|
case:
|
|
return nil, right_err
|
|
}
|
|
|
|
node := new(Node_Alternation)
|
|
node.right = right
|
|
result = node
|
|
|
|
case .EOF:
|
|
return nil, Unexpected_EOF{ pos = token.pos }
|
|
|
|
case:
|
|
return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) {
|
|
#partial switch token.kind {
|
|
case .Alternate:
|
|
if p.cur_token.kind == .Close_Paren {
|
|
// `(a|)`
|
|
// parse_expression will fail, so intervene here.
|
|
node := new(Node_Alternation)
|
|
node.left = left
|
|
return node, nil
|
|
}
|
|
|
|
right, right_err := parse_expression(p, left_binding_power(.Alternate))
|
|
|
|
#partial switch specific in right_err {
|
|
case nil:
|
|
break
|
|
case Unexpected_EOF:
|
|
// EOF is okay in an alternation; it's an edge case in the way of
|
|
// expressing an optional such as `a|`.
|
|
break
|
|
case:
|
|
return nil, right_err
|
|
}
|
|
|
|
node := new(Node_Alternation)
|
|
node.left = left
|
|
node.right = right
|
|
result = node
|
|
|
|
case .Concatenate:
|
|
right := parse_expression(p, left_binding_power(.Concatenate)) or_return
|
|
|
|
// There should be no need to check if right is Node_Concatenation, due
|
|
// to how the parsing direction works.
|
|
#partial switch specific in left {
|
|
case ^Node_Concatenation:
|
|
append(&specific.nodes, right)
|
|
result = specific
|
|
case:
|
|
node := new(Node_Concatenation)
|
|
append(&node.nodes, left)
|
|
append(&node.nodes, right)
|
|
result = node
|
|
}
|
|
|
|
case .Repeat_Zero:
|
|
node := new(Node_Repeat_Zero)
|
|
node.inner = left
|
|
result = node
|
|
case .Repeat_Zero_Non_Greedy:
|
|
node := new(Node_Repeat_Zero_Non_Greedy)
|
|
node.inner = left
|
|
result = node
|
|
case .Repeat_One:
|
|
node := new(Node_Repeat_One)
|
|
node.inner = left
|
|
result = node
|
|
case .Repeat_One_Non_Greedy:
|
|
node := new(Node_Repeat_One_Non_Greedy)
|
|
node.inner = left
|
|
result = node
|
|
|
|
case .Repeat_N:
|
|
node := new(Node_Repeat_N)
|
|
node.inner = left
|
|
|
|
comma := strings.index_byte(token.text, ',')
|
|
|
|
switch comma {
|
|
case -1: // {N}
|
|
exact, ok := strconv.parse_u64_of_base(token.text, base = 10)
|
|
if !ok {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
if exact == 0 {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
|
|
node.lower = cast(int)exact
|
|
node.upper = cast(int)exact
|
|
|
|
case 0: // {,M}
|
|
upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10)
|
|
if !ok {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
if upper == 0 {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
|
|
node.lower = -1
|
|
node.upper = cast(int)upper
|
|
|
|
case len(token.text) - 1: // {N,}
|
|
lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
|
|
if !ok {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
|
|
node.lower = cast(int)lower
|
|
node.upper = -1
|
|
|
|
case: // {N,M}
|
|
lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
|
|
if !lower_ok {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10)
|
|
if !upper_ok {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
if lower > upper {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
if upper == 0 {
|
|
return nil, Invalid_Repetition{ pos = token.pos }
|
|
}
|
|
|
|
node.lower = cast(int)lower
|
|
node.upper = cast(int)upper
|
|
}
|
|
|
|
result = node
|
|
|
|
case .Optional:
|
|
node := new(Node_Optional)
|
|
node.inner = left
|
|
result = node
|
|
case .Optional_Non_Greedy:
|
|
node := new(Node_Optional_Non_Greedy)
|
|
node.inner = left
|
|
result = node
|
|
|
|
case .EOF:
|
|
return nil, Unexpected_EOF{ pos = token.pos }
|
|
|
|
case:
|
|
return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) {
|
|
token := p.cur_token
|
|
|
|
advance(p) or_return
|
|
left := null_denotation(p, token) or_return
|
|
|
|
token = p.cur_token
|
|
for rbp < left_binding_power(token.kind) {
|
|
advance(p) or_return
|
|
left = left_denotation(p, token, left) or_return
|
|
token = p.cur_token
|
|
}
|
|
|
|
return left, nil
|
|
}
|
|
|
|
parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) {
|
|
if len(str) == 0 {
|
|
node := new(Node_Group)
|
|
return node, nil
|
|
}
|
|
|
|
p: Parser
|
|
p.flags = flags
|
|
|
|
tokenizer.init(&p.t, str, flags)
|
|
|
|
p.cur_token = tokenizer.scan(&p.t)
|
|
if p.cur_token.kind == .Invalid {
|
|
return nil, Invalid_Unicode { pos = 0 }
|
|
}
|
|
|
|
node := parse_expression(&p, 0) or_return
|
|
result = node
|
|
|
|
return
|
|
}
|