Files
Odin/core/text/regex/parser/parser.odin
gingerBill 842cfee0f3 Change Odin's LICENSE to zlib from BSD 3-clause
This change was made in order to allow things produced with Odin and using Odin's core library, to not require the LICENSE to also be distributed alongside the binary form.
2025-10-28 14:38:25 +00:00

591 lines
13 KiB
Odin

package regex_parser
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
import "core:strconv"
import "core:strings"
import "core:text/regex/common"
import "core:text/regex/tokenizer"
import "core:unicode"
import "core:unicode/utf8"
Token :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer :: tokenizer.Tokenizer
Rune_Class_Range :: struct {
lower, upper: rune,
}
Rune_Class_Data :: struct {
runes: [dynamic]rune,
ranges: [dynamic]Rune_Class_Range,
}
Node_Rune :: struct {
data: rune,
}
Node_Rune_Class :: struct {
negating: bool,
using data: Rune_Class_Data,
}
Node_Wildcard :: struct {}
Node_Alternation :: struct {
left, right: Node,
}
Node_Concatenation :: struct {
nodes: [dynamic]Node,
}
Node_Repeat_Zero :: struct {
inner: Node,
}
Node_Repeat_Zero_Non_Greedy :: struct {
inner: Node,
}
Node_Repeat_One :: struct {
inner: Node,
}
Node_Repeat_One_Non_Greedy :: struct {
inner: Node,
}
Node_Repeat_N :: struct {
inner: Node,
lower, upper: int,
}
Node_Optional :: struct {
inner: Node,
}
Node_Optional_Non_Greedy :: struct {
inner: Node,
}
Node_Group :: struct {
inner: Node,
capture_id: int,
capture: bool,
}
Node_Anchor :: struct {
start: bool,
}
Node_Word_Boundary :: struct {
non_word: bool,
}
Node_Match_All_And_Escape :: struct {}
Node :: union {
^Node_Rune,
^Node_Rune_Class,
^Node_Wildcard,
^Node_Concatenation,
^Node_Alternation,
^Node_Repeat_Zero,
^Node_Repeat_Zero_Non_Greedy,
^Node_Repeat_One,
^Node_Repeat_One_Non_Greedy,
^Node_Repeat_N,
^Node_Optional,
^Node_Optional_Non_Greedy,
^Node_Group,
^Node_Anchor,
^Node_Word_Boundary,
// Optimized nodes (not created by the Parser):
^Node_Match_All_And_Escape,
}
left_binding_power :: proc(kind: Token_Kind) -> int {
#partial switch kind {
case .Alternate: return 1
case .Concatenate: return 2
case .Repeat_Zero, .Repeat_One,
.Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy,
.Repeat_N: return 3
case .Optional,
.Optional_Non_Greedy: return 4
case .Open_Paren,
.Open_Paren_Non_Capture: return 9
}
return 0
}
Expected_Token :: struct {
pos: int,
kind: Token_Kind,
}
Invalid_Repetition :: struct {
pos: int,
}
Invalid_Token :: struct {
pos: int,
kind: Token_Kind,
}
Invalid_Unicode :: struct {
pos: int,
}
Too_Many_Capture_Groups :: struct {
pos: int,
}
Unexpected_EOF :: struct {
pos: int,
}
Error :: union {
Expected_Token,
Invalid_Repetition,
Invalid_Token,
Invalid_Unicode,
Too_Many_Capture_Groups,
Unexpected_EOF,
}
Parser :: struct {
flags: common.Flags,
t: Tokenizer,
cur_token: Token,
groups: int,
}
@require_results
advance :: proc(p: ^Parser) -> Error {
p.cur_token = tokenizer.scan(&p.t)
if p.cur_token.kind == .Invalid {
return Invalid_Unicode { pos = 0 }
}
return nil
}
expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) {
if p.cur_token.kind == kind {
advance(p) or_return
return
}
return Expected_Token{
pos = p.t.offset,
kind = kind,
}
}
null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) {
#partial switch token.kind {
case .Rune:
r: rune
for ru in token.text {
r = ru
break
}
assert(r != 0, "Parsed an empty Rune token.")
if .Case_Insensitive in p.flags {
lower := unicode.to_lower(r)
upper := unicode.to_upper(r)
if lower != upper {
node := new(Node_Rune_Class)
append(&node.runes, lower)
append(&node.runes, upper)
return node, nil
}
}
node := new(Node_Rune)
node ^= { r }
return node, nil
case .Rune_Class:
if len(token.text) == 0 {
return nil, nil
}
node := new(Node_Rune_Class)
#no_bounds_check for i := 0; i < len(token.text); /**/ {
r, size := utf8.decode_rune(token.text[i:])
if i == 0 && r == '^' {
node.negating = true
i += size
continue
}
i += size
assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
if r == '\\' {
next_r, next_size := utf8.decode_rune(token.text[i:])
i += next_size
assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
// @MetaCharacter
// NOTE: These must be kept in sync with the tokenizer.
switch next_r {
case 'f': append(&node.runes, '\f')
case 'n': append(&node.runes, '\n')
case 'r': append(&node.runes, '\r')
case 't': append(&node.runes, '\t')
case 'd':
append(&node.ranges, Rune_Class_Range{ '0', '9' })
case 's':
append(&node.runes, '\t')
append(&node.runes, '\n')
append(&node.runes, '\f')
append(&node.runes, '\r')
append(&node.runes, ' ')
case 'w':
append(&node.ranges, Rune_Class_Range{ '0', '9' })
append(&node.ranges, Rune_Class_Range{ 'A', 'Z' })
append(&node.runes, '_')
append(&node.ranges, Rune_Class_Range{ 'a', 'z' })
case 'D':
append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) })
case 'S':
append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 })
// \t and \n are adjacent.
append(&node.runes, '\x0b') // Vertical Tab
append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 })
append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) })
case 'W':
append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 })
append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 })
append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 })
append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) })
case:
append(&node.runes, next_r)
}
continue
}
if r == '-' && len(node.runes) > 0 {
next_r, next_size := utf8.decode_rune(token.text[i:])
if next_size > 0 {
last := pop(&node.runes)
i += next_size
append(&node.ranges, Rune_Class_Range{ last, next_r })
continue
}
}
append(&node.runes, r)
}
if .Case_Insensitive in p.flags {
// These two loops cannot be in the form of `for x in y` because
// they append to the data that they iterate over.
length := len(node.runes)
#no_bounds_check for i := 0; i < length; i += 1 {
r := node.runes[i]
lower := unicode.to_lower(r)
upper := unicode.to_upper(r)
if lower != upper {
if lower != r {
append(&node.runes, lower)
} else {
append(&node.runes, upper)
}
}
}
length = len(node.ranges)
#no_bounds_check for i := 0; i < length; i += 1 {
range := &node.ranges[i]
min_lower := unicode.to_lower(range.lower)
max_lower := unicode.to_lower(range.upper)
min_upper := unicode.to_upper(range.lower)
max_upper := unicode.to_upper(range.upper)
if min_lower != min_upper && max_lower != max_upper {
range.lower = min_lower
range.upper = max_lower
append(&node.ranges, Rune_Class_Range{ min_upper, max_upper })
}
}
}
result = node
case .Wildcard:
node := new(Node_Wildcard)
result = node
case .Open_Paren:
// Because of the recursive nature of the token parser, we take the
// group number first instead of afterwards, in order to construct
// group matches from the outside in.
p.groups += 1
if p.groups == common.MAX_CAPTURE_GROUPS {
return nil, Too_Many_Capture_Groups{ pos = token.pos }
}
this_group := p.groups
node := new(Node_Group)
node.capture = true
node.capture_id = this_group
node.inner = parse_expression(p, 0) or_return
expect(p, .Close_Paren) or_return
result = node
case .Open_Paren_Non_Capture:
node := new(Node_Group)
node.inner = parse_expression(p, 0) or_return
expect(p, .Close_Paren) or_return
result = node
case .Close_Paren:
node := new(Node_Rune)
node ^= { ')' }
return node, nil
case .Anchor_Start:
node := new(Node_Anchor)
node.start = true
result = node
case .Anchor_End:
node := new(Node_Anchor)
result = node
case .Word_Boundary:
node := new(Node_Word_Boundary)
result = node
case .Non_Word_Boundary:
node := new(Node_Word_Boundary)
node.non_word = true
result = node
case .Alternate:
// A unary alternation with a left-side empty path, i.e. `|a`.
right, right_err := parse_expression(p, left_binding_power(.Alternate))
#partial switch specific in right_err {
case Unexpected_EOF:
// This token is a NOP, i.e. `|`.
break
case nil:
break
case:
return nil, right_err
}
node := new(Node_Alternation)
node.right = right
result = node
case .EOF:
return nil, Unexpected_EOF{ pos = token.pos }
case:
return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
}
return
}
left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) {
#partial switch token.kind {
case .Alternate:
if p.cur_token.kind == .Close_Paren {
// `(a|)`
// parse_expression will fail, so intervene here.
node := new(Node_Alternation)
node.left = left
return node, nil
}
right, right_err := parse_expression(p, left_binding_power(.Alternate))
#partial switch specific in right_err {
case nil:
break
case Unexpected_EOF:
// EOF is okay in an alternation; it's an edge case in the way of
// expressing an optional such as `a|`.
break
case:
return nil, right_err
}
node := new(Node_Alternation)
node.left = left
node.right = right
result = node
case .Concatenate:
right := parse_expression(p, left_binding_power(.Concatenate)) or_return
// There should be no need to check if right is Node_Concatenation, due
// to how the parsing direction works.
#partial switch specific in left {
case ^Node_Concatenation:
append(&specific.nodes, right)
result = specific
case:
node := new(Node_Concatenation)
append(&node.nodes, left)
append(&node.nodes, right)
result = node
}
case .Repeat_Zero:
node := new(Node_Repeat_Zero)
node.inner = left
result = node
case .Repeat_Zero_Non_Greedy:
node := new(Node_Repeat_Zero_Non_Greedy)
node.inner = left
result = node
case .Repeat_One:
node := new(Node_Repeat_One)
node.inner = left
result = node
case .Repeat_One_Non_Greedy:
node := new(Node_Repeat_One_Non_Greedy)
node.inner = left
result = node
case .Repeat_N:
node := new(Node_Repeat_N)
node.inner = left
comma := strings.index_byte(token.text, ',')
switch comma {
case -1: // {N}
exact, ok := strconv.parse_u64_of_base(token.text, base = 10)
if !ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
if exact == 0 {
return nil, Invalid_Repetition{ pos = token.pos }
}
node.lower = cast(int)exact
node.upper = cast(int)exact
case 0: // {,M}
upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10)
if !ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
if upper == 0 {
return nil, Invalid_Repetition{ pos = token.pos }
}
node.lower = -1
node.upper = cast(int)upper
case len(token.text) - 1: // {N,}
lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
if !ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
node.lower = cast(int)lower
node.upper = -1
case: // {N,M}
lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
if !lower_ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10)
if !upper_ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
if lower > upper {
return nil, Invalid_Repetition{ pos = token.pos }
}
if upper == 0 {
return nil, Invalid_Repetition{ pos = token.pos }
}
node.lower = cast(int)lower
node.upper = cast(int)upper
}
result = node
case .Optional:
node := new(Node_Optional)
node.inner = left
result = node
case .Optional_Non_Greedy:
node := new(Node_Optional_Non_Greedy)
node.inner = left
result = node
case .EOF:
return nil, Unexpected_EOF{ pos = token.pos }
case:
return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
}
return
}
parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) {
token := p.cur_token
advance(p) or_return
left := null_denotation(p, token) or_return
token = p.cur_token
for rbp < left_binding_power(token.kind) {
advance(p) or_return
left = left_denotation(p, token, left) or_return
token = p.cur_token
}
return left, nil
}
parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) {
if len(str) == 0 {
node := new(Node_Group)
return node, nil
}
p: Parser
p.flags = flags
tokenizer.init(&p.t, str, flags)
p.cur_token = tokenizer.scan(&p.t)
if p.cur_token.kind == .Invalid {
return nil, Invalid_Unicode { pos = 0 }
}
node := parse_expression(&p, 0) or_return
result = node
return
}