// Odin/core/text/regex/parser/parser.odin
package regex_parser
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
import "core:strconv"
import "core:strings"
import "core:text/regex/common"
import "core:text/regex/tokenizer"
import "core:unicode"
import "core:unicode/utf8"
Token :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer :: tokenizer.Tokenizer
Rune_Class_Range :: struct {
lower, upper: rune,
}
Rune_Class_Data :: struct {
runes: [dynamic]rune,
ranges: [dynamic]Rune_Class_Range,
}
Node_Rune :: struct {
data: rune,
}
Node_Rune_Class :: struct {
negating: bool,
using data: Rune_Class_Data,
}
Node_Wildcard :: struct {}
Node_Alternation :: struct {
left, right: Node,
}
Node_Concatenation :: struct {
nodes: [dynamic]Node,
}
Node_Repeat_Zero :: struct {
inner: Node,
}
Node_Repeat_Zero_Non_Greedy :: struct {
inner: Node,
}
Node_Repeat_One :: struct {
inner: Node,
}
Node_Repeat_One_Non_Greedy :: struct {
inner: Node,
}
Node_Repeat_N :: struct {
inner: Node,
lower, upper: int,
}
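// NOTE: A `lower` or `upper` of -1 denotes an open bound, as produced by
// repetitions of the form `{N,}` or `{,M}`.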
Node_Optional :: struct {
inner: Node,
}
Node_Optional_Non_Greedy :: struct {
inner: Node,
}
Node_Group :: struct {
inner: Node,
capture_id: int,
capture: bool,
}
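// NOTE: `capture` is true for capturing groups and false for non-capturing
// ones (.Open_Paren_Non_Capture); `capture_id` is only meaningful when
// `capture` is set.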
Node_Anchor :: struct {
start: bool,
}
Node_Word_Boundary :: struct {
non_word: bool,
}
Node_Match_All_And_Escape :: struct {}
Node :: union {
^Node_Rune,
^Node_Rune_Class,
^Node_Wildcard,
^Node_Concatenation,
^Node_Alternation,
^Node_Repeat_Zero,
^Node_Repeat_Zero_Non_Greedy,
^Node_Repeat_One,
^Node_Repeat_One_Non_Greedy,
^Node_Repeat_N,
^Node_Optional,
^Node_Optional_Non_Greedy,
^Node_Group,
^Node_Anchor,
^Node_Word_Boundary,
// Optimized nodes (not created by the Parser):
^Node_Match_All_And_Escape,
}
left_binding_power :: proc(kind: Token_Kind) -> int {
#partial switch kind {
case .Alternate: return 1
case .Concatenate: return 2
case .Repeat_Zero, .Repeat_One,
.Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy,
.Repeat_N: return 3
case .Optional,
.Optional_Non_Greedy: return 4
case .Open_Paren,
.Open_Paren_Non_Capture: return 9
}
return 0
}
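// With these powers, repetition and optionals bind more tightly than
// concatenation, which binds more tightly than alternation; for example,
// `ab*|c` parses as `(a(b*))|c`.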
Expected_Token :: struct {
pos: int,
kind: Token_Kind,
}
Invalid_Repetition :: struct {
pos: int,
}
Invalid_Token :: struct {
pos: int,
kind: Token_Kind,
}
Invalid_Unicode :: struct {
pos: int,
}
Too_Many_Capture_Groups :: struct {
pos: int,
}
Unexpected_EOF :: struct {
pos: int,
}
Error :: union {
Expected_Token,
Invalid_Repetition,
Invalid_Token,
Invalid_Unicode,
Too_Many_Capture_Groups,
Unexpected_EOF,
}
Parser :: struct {
flags: common.Flags,
t: Tokenizer,
cur_token: Token,
groups: int,
}
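// NOTE: `groups` counts the capture groups opened so far and supplies each
// new capturing group's `capture_id`.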
@require_results
advance :: proc(p: ^Parser) -> Error {
p.cur_token = tokenizer.scan(&p.t)
if p.cur_token.kind == .Invalid {
return Invalid_Unicode { pos = 0 }
}
return nil
}
expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) {
if p.cur_token.kind == kind {
advance(p) or_return
return
}
return Expected_Token{
pos = p.t.offset,
kind = kind,
}
}
null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) {
#partial switch token.kind {
case .Rune:
r: rune
for ru in token.text {
r = ru
break
}
assert(r != 0, "Parsed an empty Rune token.")
if .Case_Insensitive in p.flags {
lower := unicode.to_lower(r)
upper := unicode.to_upper(r)
if lower != upper {
node := new(Node_Rune_Class)
append(&node.runes, lower)
append(&node.runes, upper)
return node, nil
}
}
node := new(Node_Rune)
node^ = { r }
return node, nil
case .Rune_Class:
if len(token.text) == 0 {
return nil, nil
}
node := new(Node_Rune_Class)
#no_bounds_check for i := 0; i < len(token.text); /**/ {
r, size := utf8.decode_rune(token.text[i:])
if i == 0 && r == '^' {
node.negating = true
i += size
continue
}
i += size
assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
if r == '\\' {
next_r, next_size := utf8.decode_rune(token.text[i:])
i += next_size
assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
// @MetaCharacter
// NOTE: These must be kept in sync with the tokenizer.
switch next_r {
case 'f': append(&node.runes, '\f')
case 'n': append(&node.runes, '\n')
case 'r': append(&node.runes, '\r')
case 't': append(&node.runes, '\t')
case 'd':
append(&node.ranges, Rune_Class_Range{ '0', '9' })
case 's':
append(&node.runes, '\t')
append(&node.runes, '\n')
append(&node.runes, '\f')
append(&node.runes, '\r')
append(&node.runes, ' ')
case 'w':
append(&node.ranges, Rune_Class_Range{ '0', '9' })
append(&node.ranges, Rune_Class_Range{ 'A', 'Z' })
append(&node.runes, '_')
append(&node.ranges, Rune_Class_Range{ 'a', 'z' })
case 'D':
append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) })
case 'S':
append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 })
// \t (9) and \n (10) are adjacent, so one gap excludes both; \v (11) is not
// in the \s set, while \f (12) and \r (13) are skipped by the next range.
append(&node.runes, '\x0b') // Vertical Tab
append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 })
append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) })
case 'W':
append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 })
append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 })
append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 })
append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) })
case:
append(&node.runes, next_r)
}
continue
}
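// A '-' that directly follows at least one plain rune begins a range
// (e.g. `[a-z]` pops 'a' and records the range 'a'..'z'); a '-' that is
// first, last, or escaped is matched literally.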
if r == '-' && len(node.runes) > 0 {
next_r, next_size := utf8.decode_rune(token.text[i:])
if next_size > 0 {
last := pop(&node.runes)
i += next_size
append(&node.ranges, Rune_Class_Range{ last, next_r })
continue
}
}
append(&node.runes, r)
}
if .Case_Insensitive in p.flags {
// These two loops cannot be in the form of `for x in y` because
// they append to the data that they iterate over.
length := len(node.runes)
#no_bounds_check for i := 0; i < length; i += 1 {
r := node.runes[i]
lower := unicode.to_lower(r)
upper := unicode.to_upper(r)
if lower != upper {
if lower != r {
append(&node.runes, lower)
} else {
append(&node.runes, upper)
}
}
}
length = len(node.ranges)
#no_bounds_check for i := 0; i < length; i += 1 {
range := &node.ranges[i]
min_lower := unicode.to_lower(range.lower)
max_lower := unicode.to_lower(range.upper)
min_upper := unicode.to_upper(range.lower)
max_upper := unicode.to_upper(range.upper)
if min_lower != min_upper && max_lower != max_upper {
range.lower = min_lower
range.upper = max_lower
append(&node.ranges, Rune_Class_Range{ min_upper, max_upper })
}
}
}
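// For example, the .Case_Insensitive expansion above gives `[a-d]` the extra
// range 'A'..'D', and `[xy]` the extra runes 'X' and 'Y'.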
result = node
case .Wildcard:
node := new(Node_Wildcard)
result = node
case .Open_Paren:
// Because parsing is recursive, the group number is claimed before the
// group's contents are parsed, so capture groups are numbered from the
// outside in.
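// For example, in `(a(b))`, the outer group receives capture_id 1 and the
// inner group capture_id 2.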
p.groups += 1
if p.groups == common.MAX_CAPTURE_GROUPS {
return nil, Too_Many_Capture_Groups{ pos = token.pos }
}
this_group := p.groups
node := new(Node_Group)
node.capture = true
node.capture_id = this_group
node.inner = parse_expression(p, 0) or_return
expect(p, .Close_Paren) or_return
result = node
case .Open_Paren_Non_Capture:
node := new(Node_Group)
node.inner = parse_expression(p, 0) or_return
expect(p, .Close_Paren) or_return
result = node
case .Close_Paren:
node := new(Node_Rune)
node^ = { ')' }
return node, nil
case .Anchor_Start:
node := new(Node_Anchor)
node.start = true
result = node
case .Anchor_End:
node := new(Node_Anchor)
result = node
case .Word_Boundary:
node := new(Node_Word_Boundary)
result = node
case .Non_Word_Boundary:
node := new(Node_Word_Boundary)
node.non_word = true
result = node
case .Alternate:
// A unary alternation with a left-side empty path, i.e. `|a`.
right, right_err := parse_expression(p, left_binding_power(.Alternate))
#partial switch specific in right_err {
case Unexpected_EOF:
// This token is a NOP, i.e. `|`.
break
case nil:
break
case:
return nil, right_err
}
node := new(Node_Alternation)
node.right = right
result = node
case .EOF:
return nil, Unexpected_EOF{ pos = token.pos }
case:
return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
}
return
}
left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) {
#partial switch token.kind {
case .Alternate:
if p.cur_token.kind == .Close_Paren {
// `(a|)`
// parse_expression will fail, so intervene here.
node := new(Node_Alternation)
node.left = left
return node, nil
}
right, right_err := parse_expression(p, left_binding_power(.Alternate))
#partial switch specific in right_err {
case nil:
break
case Unexpected_EOF:
// EOF is okay in an alternation; `a|` is an edge-case way of expressing
// an optional, with the right side left empty.
break
case:
return nil, right_err
}
node := new(Node_Alternation)
node.left = left
node.right = right
result = node
case .Concatenate:
right := parse_expression(p, left_binding_power(.Concatenate)) or_return
// `right` can never itself be a Node_Concatenation: the recursive call
// uses the Concatenate binding power, so it stops before folding in
// another concatenation, and longer chains are built by appending to
// the existing node below.
#partial switch specific in left {
case ^Node_Concatenation:
append(&specific.nodes, right)
result = specific
case:
node := new(Node_Concatenation)
append(&node.nodes, left)
append(&node.nodes, right)
result = node
}
case .Repeat_Zero:
node := new(Node_Repeat_Zero)
node.inner = left
result = node
case .Repeat_Zero_Non_Greedy:
node := new(Node_Repeat_Zero_Non_Greedy)
node.inner = left
result = node
case .Repeat_One:
node := new(Node_Repeat_One)
node.inner = left
result = node
case .Repeat_One_Non_Greedy:
node := new(Node_Repeat_One_Non_Greedy)
node.inner = left
result = node
case .Repeat_N:
node := new(Node_Repeat_N)
node.inner = left
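// `token.text` holds only what appeared between the braces, e.g. "2,5" for
// `{2,5}`. The accepted forms map onto (lower, upper) as follows, with -1
// marking an open bound:
//   {N}   -> (N, N)
//   {,M}  -> (-1, M)
//   {N,}  -> (N, -1)
//   {N,M} -> (N, M)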
comma := strings.index_byte(token.text, ',')
switch comma {
case -1: // {N}
exact, ok := strconv.parse_u64_of_base(token.text, base = 10)
if !ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
if exact == 0 {
return nil, Invalid_Repetition{ pos = token.pos }
}
node.lower = cast(int)exact
node.upper = cast(int)exact
case 0: // {,M}
upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10)
if !ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
if upper == 0 {
return nil, Invalid_Repetition{ pos = token.pos }
}
node.lower = -1
node.upper = cast(int)upper
case len(token.text) - 1: // {N,}
lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
if !ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
node.lower = cast(int)lower
node.upper = -1
case: // {N,M}
lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
if !lower_ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10)
if !upper_ok {
return nil, Invalid_Repetition{ pos = token.pos }
}
if lower > upper {
return nil, Invalid_Repetition{ pos = token.pos }
}
if upper == 0 {
return nil, Invalid_Repetition{ pos = token.pos }
}
node.lower = cast(int)lower
node.upper = cast(int)upper
}
result = node
case .Optional:
node := new(Node_Optional)
node.inner = left
result = node
case .Optional_Non_Greedy:
node := new(Node_Optional_Non_Greedy)
node.inner = left
result = node
case .EOF:
return nil, Unexpected_EOF{ pos = token.pos }
case:
return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
}
return
}
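// parse_expression implements the core Pratt parsing loop: null_denotation
// consumes one prefix item, then, while the next token binds more tightly
// than `rbp`, left_denotation folds it into the left-hand side.
//
// For example, parsing `a|b` with rbp = 0: null_denotation consumes `a`;
// `|` has binding power 1 > 0, so left_denotation builds the alternation,
// recursing with rbp = 1 to parse the right-hand `b`.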
parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) {
token := p.cur_token
advance(p) or_return
left := null_denotation(p, token) or_return
token = p.cur_token
for rbp < left_binding_power(token.kind) {
advance(p) or_return
left = left_denotation(p, token, left) or_return
token = p.cur_token
}
return left, nil
}
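// parse builds a parse tree for the pattern in `str`. A minimal usage sketch
// (error handling is up to the caller):
//
//	ast, err := parse("a(b|c)*", {})
//	if err != nil {
//		// Report or propagate the parse error.
//	}
//
// Nodes are allocated with `new` and `append`, i.e. on the context allocator.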
parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) {
if len(str) == 0 {
node := new(Node_Group)
return node, nil
}
p: Parser
p.flags = flags
tokenizer.init(&p.t, str, flags)
p.cur_token = tokenizer.scan(&p.t)
if p.cur_token.kind == .Invalid {
return nil, Invalid_Unicode { pos = 0 }
}
node := parse_expression(&p, 0) or_return
result = node
return
}