mirror of
https://github.com/odin-lang/Odin.git
synced 2025-12-28 17:04:34 +00:00
This change was made in order to allow things produced with Odin and using Odin's core library, to not require the LICENSE to also be distributed alongside the binary form.
531 lines
14 KiB
Odin
531 lines
14 KiB
Odin
package regex_optimizer
|
|
|
|
/*
|
|
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
|
Made available under Odin's license.
|
|
|
|
List of contributors:
|
|
Feoramund: Initial implementation.
|
|
*/
|
|
|
|
import "base:intrinsics"
|
|
@require import "core:io"
|
|
import "core:slice"
|
|
import "core:text/regex/common"
|
|
import "core:text/regex/parser"
|
|
|
|
Rune_Class_Range :: parser.Rune_Class_Range
|
|
|
|
Node :: parser.Node
|
|
Node_Rune :: parser.Node_Rune
|
|
Node_Rune_Class :: parser.Node_Rune_Class
|
|
Node_Wildcard :: parser.Node_Wildcard
|
|
Node_Concatenation :: parser.Node_Concatenation
|
|
Node_Alternation :: parser.Node_Alternation
|
|
Node_Repeat_Zero :: parser.Node_Repeat_Zero
|
|
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
|
|
Node_Repeat_One :: parser.Node_Repeat_One
|
|
Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
|
|
Node_Repeat_N :: parser.Node_Repeat_N
|
|
Node_Optional :: parser.Node_Optional
|
|
Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
|
|
Node_Group :: parser.Node_Group
|
|
Node_Anchor :: parser.Node_Anchor
|
|
Node_Word_Boundary :: parser.Node_Word_Boundary
|
|
Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape
|
|
|
|
|
|
class_range_sorter :: proc(i, j: Rune_Class_Range) -> bool {
|
|
return i.lower < j.lower
|
|
}
|
|
|
|
optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
|
|
if tree == nil {
|
|
return nil, 0
|
|
}
|
|
|
|
result = tree
|
|
|
|
switch specific in tree {
|
|
// No direct optimization possible on these nodes:
|
|
case ^Node_Rune: break
|
|
case ^Node_Wildcard: break
|
|
case ^Node_Anchor: break
|
|
case ^Node_Word_Boundary: break
|
|
case ^Node_Match_All_And_Escape: break
|
|
|
|
case ^Node_Concatenation:
|
|
// * Composition: Consume All to Anchored End
|
|
//
|
|
// DO: `.*$` => <special opcode>
|
|
// DO: `.+$` => `.` <special opcode>
|
|
if .Multiline not_in flags && len(specific.nodes) >= 2 {
|
|
i := len(specific.nodes) - 2
|
|
wrza: {
|
|
subnode := specific.nodes[i].(^Node_Repeat_Zero) or_break wrza
|
|
_ = subnode.inner.(^Node_Wildcard) or_break wrza
|
|
next_node := specific.nodes[i+1].(^Node_Anchor) or_break wrza
|
|
if next_node.start == false {
|
|
specific.nodes[i] = new(Node_Match_All_And_Escape)
|
|
ordered_remove(&specific.nodes, i + 1)
|
|
changes += 1
|
|
break
|
|
}
|
|
}
|
|
wroa: {
|
|
subnode := specific.nodes[i].(^Node_Repeat_One) or_break wroa
|
|
subsubnode := subnode.inner.(^Node_Wildcard) or_break wroa
|
|
next_node := specific.nodes[i+1].(^Node_Anchor) or_break wroa
|
|
if next_node.start == false {
|
|
specific.nodes[i] = subsubnode
|
|
specific.nodes[i+1] = new(Node_Match_All_And_Escape)
|
|
changes += 1
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Only recursive optimizations:
|
|
#no_bounds_check for i := 0; i < len(specific.nodes); i += 1 {
|
|
subnode, subnode_changes := optimize_subtree(specific.nodes[i], flags)
|
|
changes += subnode_changes
|
|
if subnode == nil {
|
|
ordered_remove(&specific.nodes, i)
|
|
i -= 1
|
|
changes += 1
|
|
} else {
|
|
specific.nodes[i] = subnode
|
|
}
|
|
}
|
|
|
|
if len(specific.nodes) == 1 {
|
|
result = specific.nodes[0]
|
|
changes += 1
|
|
} else if len(specific.nodes) == 0 {
|
|
return nil, changes + 1
|
|
}
|
|
|
|
case ^Node_Repeat_Zero:
|
|
specific.inner, changes = optimize_subtree(specific.inner, flags)
|
|
if specific.inner == nil {
|
|
return nil, changes + 1
|
|
}
|
|
case ^Node_Repeat_Zero_Non_Greedy:
|
|
specific.inner, changes = optimize_subtree(specific.inner, flags)
|
|
if specific.inner == nil {
|
|
return nil, changes + 1
|
|
}
|
|
case ^Node_Repeat_One:
|
|
specific.inner, changes = optimize_subtree(specific.inner, flags)
|
|
if specific.inner == nil {
|
|
return nil, changes + 1
|
|
}
|
|
case ^Node_Repeat_One_Non_Greedy:
|
|
specific.inner, changes = optimize_subtree(specific.inner, flags)
|
|
if specific.inner == nil {
|
|
return nil, changes + 1
|
|
}
|
|
case ^Node_Repeat_N:
|
|
specific.inner, changes = optimize_subtree(specific.inner, flags)
|
|
if specific.inner == nil {
|
|
return nil, changes + 1
|
|
}
|
|
case ^Node_Optional:
|
|
specific.inner, changes = optimize_subtree(specific.inner, flags)
|
|
if specific.inner == nil {
|
|
return nil, changes + 1
|
|
}
|
|
case ^Node_Optional_Non_Greedy:
|
|
specific.inner, changes = optimize_subtree(specific.inner, flags)
|
|
if specific.inner == nil {
|
|
return nil, changes + 1
|
|
}
|
|
|
|
case ^Node_Group:
|
|
specific.inner, changes = optimize_subtree(specific.inner, flags)
|
|
|
|
if specific.inner == nil {
|
|
return nil, changes + 1
|
|
}
|
|
|
|
if !specific.capture {
|
|
result = specific.inner
|
|
changes += 1
|
|
}
|
|
|
|
// Full optimization:
|
|
case ^Node_Rune_Class:
|
|
// * Class Simplification
|
|
//
|
|
// DO: `[aab]` => `[ab]`
|
|
// DO: `[aa]` => `[a]`
|
|
runes_seen: map[rune]bool
|
|
|
|
for r in specific.runes {
|
|
runes_seen[r] = true
|
|
}
|
|
|
|
if len(runes_seen) != len(specific.runes) {
|
|
clear(&specific.runes)
|
|
for key in runes_seen {
|
|
append(&specific.runes, key)
|
|
}
|
|
changes += 1
|
|
}
|
|
|
|
// * Class Reduction
|
|
//
|
|
// DO: `[a]` => `a`
|
|
if !specific.negating && len(specific.runes) == 1 && len(specific.ranges) == 0 {
|
|
only_rune := specific.runes[0]
|
|
|
|
node := new(Node_Rune)
|
|
node.data = only_rune
|
|
|
|
return node, changes + 1
|
|
}
|
|
|
|
// * Range Construction
|
|
//
|
|
// DO: `[abc]` => `[a-c]`
|
|
slice.sort(specific.runes[:])
|
|
if len(specific.runes) > 1 {
|
|
new_range: Rune_Class_Range
|
|
new_range.lower = specific.runes[0]
|
|
new_range.upper = specific.runes[0]
|
|
|
|
#no_bounds_check for i := 1; i < len(specific.runes); i += 1 {
|
|
r := specific.runes[i]
|
|
if new_range.lower == -1 {
|
|
new_range = { r, r }
|
|
continue
|
|
}
|
|
|
|
if r == new_range.lower - 1 {
|
|
new_range.lower -= 1
|
|
ordered_remove(&specific.runes, i)
|
|
i -= 1
|
|
changes += 1
|
|
} else if r == new_range.upper + 1 {
|
|
new_range.upper += 1
|
|
ordered_remove(&specific.runes, i)
|
|
i -= 1
|
|
changes += 1
|
|
} else if new_range.lower != new_range.upper {
|
|
append(&specific.ranges, new_range)
|
|
new_range = { -1, -1 }
|
|
changes += 1
|
|
}
|
|
}
|
|
|
|
if new_range.lower != new_range.upper {
|
|
append(&specific.ranges, new_range)
|
|
changes += 1
|
|
}
|
|
}
|
|
|
|
// * Rune Merging into Range
|
|
//
|
|
// DO: `[aa-c]` => `[a-c]`
|
|
for range in specific.ranges {
|
|
#no_bounds_check for i := 0; i < len(specific.runes); i += 1 {
|
|
r := specific.runes[i]
|
|
if range.lower <= r && r <= range.upper {
|
|
ordered_remove(&specific.runes, i)
|
|
i -= 1
|
|
changes += 1
|
|
}
|
|
}
|
|
}
|
|
|
|
// * Range Merging
|
|
//
|
|
// DO: `[a-cc-e]` => `[a-e]`
|
|
// DO: `[a-cd-e]` => `[a-e]`
|
|
// DO: `[a-cb-e]` => `[a-e]`
|
|
slice.sort_by(specific.ranges[:], class_range_sorter)
|
|
#no_bounds_check for i := 0; i < len(specific.ranges) - 1; i += 1 {
|
|
for j := i + 1; j < len(specific.ranges); j += 1 {
|
|
left_range := &specific.ranges[i]
|
|
right_range := specific.ranges[j]
|
|
|
|
if left_range.upper == right_range.lower ||
|
|
left_range.upper == right_range.lower - 1 ||
|
|
left_range.lower <= right_range.lower && right_range.lower <= left_range.upper {
|
|
left_range.upper = max(left_range.upper, right_range.upper)
|
|
ordered_remove(&specific.ranges, j)
|
|
j -= 1
|
|
changes += 1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(specific.ranges) == 0 {
|
|
specific.ranges = {}
|
|
}
|
|
if len(specific.runes) == 0 {
|
|
specific.runes = {}
|
|
}
|
|
|
|
// * NOP
|
|
//
|
|
// DO: `[]` => <nil>
|
|
if len(specific.ranges) + len(specific.runes) == 0 {
|
|
return nil, 1
|
|
}
|
|
|
|
slice.sort(specific.runes[:])
|
|
slice.sort_by(specific.ranges[:], class_range_sorter)
|
|
|
|
case ^Node_Alternation:
|
|
// Perform recursive optimization first.
|
|
left_changes, right_changes: int
|
|
specific.left, left_changes = optimize_subtree(specific.left, flags)
|
|
specific.right, right_changes = optimize_subtree(specific.right, flags)
|
|
changes += left_changes + right_changes
|
|
|
|
// * Alternation to Optional
|
|
//
|
|
// DO: `a|` => `a?`
|
|
if specific.left != nil && specific.right == nil {
|
|
node := new(Node_Optional)
|
|
node.inner = specific.left
|
|
return node, 1
|
|
}
|
|
|
|
// * Alternation to Optional Non-Greedy
|
|
//
|
|
// DO: `|a` => `a??`
|
|
if specific.right != nil && specific.left == nil {
|
|
node := new(Node_Optional_Non_Greedy)
|
|
node.inner = specific.right
|
|
return node, 1
|
|
}
|
|
|
|
// * NOP
|
|
//
|
|
// DO: `|` => <nil>
|
|
if specific.left == nil && specific.right == nil {
|
|
return nil, 1
|
|
}
|
|
|
|
left_rune, left_is_rune := specific.left.(^Node_Rune)
|
|
right_rune, right_is_rune := specific.right.(^Node_Rune)
|
|
|
|
if left_is_rune && right_is_rune {
|
|
if left_rune.data == right_rune.data {
|
|
// * Alternation Reduction
|
|
//
|
|
// DO: `a|a` => `a`
|
|
return left_rune, 1
|
|
} else {
|
|
// * Alternation to Class
|
|
//
|
|
// DO: `a|b` => `[ab]`
|
|
node := new(Node_Rune_Class)
|
|
append(&node.runes, left_rune.data)
|
|
append(&node.runes, right_rune.data)
|
|
return node, 1
|
|
}
|
|
}
|
|
|
|
left_wildcard, left_is_wildcard := specific.left.(^Node_Wildcard)
|
|
right_wildcard, right_is_wildcard := specific.right.(^Node_Wildcard)
|
|
|
|
// * Class Union
|
|
//
|
|
// DO: `[a0]|[b1]` => `[a0b1]`
|
|
left_class, left_is_class := specific.left.(^Node_Rune_Class)
|
|
right_class, right_is_class := specific.right.(^Node_Rune_Class)
|
|
if left_is_class && right_is_class {
|
|
for r in right_class.runes {
|
|
append(&left_class.runes, r)
|
|
}
|
|
for range in right_class.ranges {
|
|
append(&left_class.ranges, range)
|
|
}
|
|
return left_class, 1
|
|
}
|
|
|
|
// * Class Union
|
|
//
|
|
// DO: `[a-b]|c` => `[a-bc]`
|
|
if left_is_class && right_is_rune {
|
|
append(&left_class.runes, right_rune.data)
|
|
return left_class, 1
|
|
}
|
|
|
|
// * Class Union
|
|
//
|
|
// DO: `a|[b-c]` => `[b-ca]`
|
|
if left_is_rune && right_is_class {
|
|
append(&right_class.runes, left_rune.data)
|
|
return right_class, 1
|
|
}
|
|
|
|
// * Wildcard Reduction
|
|
//
|
|
// DO: `a|.` => `.`
|
|
if left_is_rune && right_is_wildcard {
|
|
return right_wildcard, 1
|
|
}
|
|
|
|
// * Wildcard Reduction
|
|
//
|
|
// DO: `.|a` => `.`
|
|
if left_is_wildcard && right_is_rune {
|
|
return left_wildcard, 1
|
|
}
|
|
|
|
// * Wildcard Reduction
|
|
//
|
|
// DO: `[ab]|.` => `.`
|
|
if left_is_class && right_is_wildcard {
|
|
return right_wildcard, 1
|
|
}
|
|
|
|
// * Wildcard Reduction
|
|
//
|
|
// DO: `.|[ab]` => `.`
|
|
if left_is_wildcard && right_is_class {
|
|
return left_wildcard, 1
|
|
}
|
|
|
|
left_concatenation, left_is_concatenation := specific.left.(^Node_Concatenation)
|
|
right_concatenation, right_is_concatenation := specific.right.(^Node_Concatenation)
|
|
|
|
// * Common Suffix Elimination
|
|
//
|
|
// DO: `blueberry|strawberry` => `(?:blue|straw)berry`
|
|
if left_is_concatenation && right_is_concatenation {
|
|
// Remember that a concatenation could contain any node, not just runes.
|
|
left_len := len(left_concatenation.nodes)
|
|
right_len := len(right_concatenation.nodes)
|
|
least_len := min(left_len, right_len)
|
|
same_len := 0
|
|
for i := 1; i <= least_len; i += 1 {
|
|
left_subrune, left_is_subrune := left_concatenation.nodes[left_len - i].(^Node_Rune)
|
|
right_subrune, right_is_subrune := right_concatenation.nodes[right_len - i].(^Node_Rune)
|
|
|
|
if !left_is_subrune || !right_is_subrune {
|
|
// One of the nodes isn't a rune; there's nothing more we can do.
|
|
break
|
|
}
|
|
|
|
if left_subrune.data == right_subrune.data {
|
|
same_len += 1
|
|
} else {
|
|
// No more similarities.
|
|
break
|
|
}
|
|
}
|
|
|
|
if same_len > 0 {
|
|
// Dissolve this alternation into a concatenation.
|
|
cat_node := new(Node_Concatenation)
|
|
group_node := new(Node_Group)
|
|
append(&cat_node.nodes, group_node)
|
|
|
|
// Turn the concatenation into the common suffix.
|
|
for i := left_len - same_len; i < left_len; i += 1 {
|
|
append(&cat_node.nodes, left_concatenation.nodes[i])
|
|
}
|
|
|
|
// Construct the group of alternating prefixes.
|
|
for i := same_len; i > 0; i -= 1 {
|
|
pop(&left_concatenation.nodes)
|
|
pop(&right_concatenation.nodes)
|
|
}
|
|
|
|
// (Re-using this alternation node.)
|
|
alter_node := specific
|
|
alter_node.left = left_concatenation
|
|
alter_node.right = right_concatenation
|
|
group_node.inner = alter_node
|
|
|
|
return cat_node, 1
|
|
}
|
|
}
|
|
|
|
// * Common Prefix Elimination
|
|
//
|
|
// DO: `abi|abe` => `ab(?:i|e)`
|
|
if left_is_concatenation && right_is_concatenation {
|
|
// Try to identify a common prefix.
|
|
// Remember that a concatenation could contain any node, not just runes.
|
|
least_len := min(len(left_concatenation.nodes), len(right_concatenation.nodes))
|
|
same_len := 0
|
|
for i := 0; i < least_len; i += 1 {
|
|
left_subrune, left_is_subrune := left_concatenation.nodes[i].(^Node_Rune)
|
|
right_subrune, right_is_subrune := right_concatenation.nodes[i].(^Node_Rune)
|
|
|
|
if !left_is_subrune || !right_is_subrune {
|
|
// One of the nodes isn't a rune; there's nothing more we can do.
|
|
break
|
|
}
|
|
|
|
if left_subrune.data == right_subrune.data {
|
|
same_len = i + 1
|
|
} else {
|
|
// No more similarities.
|
|
break
|
|
}
|
|
}
|
|
|
|
if same_len > 0 {
|
|
cat_node := new(Node_Concatenation)
|
|
for i := 0; i < same_len; i += 1 {
|
|
append(&cat_node.nodes, left_concatenation.nodes[i])
|
|
}
|
|
for i := same_len; i > 0; i -= 1 {
|
|
ordered_remove(&left_concatenation.nodes, 0)
|
|
ordered_remove(&right_concatenation.nodes, 0)
|
|
}
|
|
|
|
group_node := new(Node_Group)
|
|
// (Re-using this alternation node.)
|
|
alter_node := specific
|
|
alter_node.left = left_concatenation
|
|
alter_node.right = right_concatenation
|
|
group_node.inner = alter_node
|
|
|
|
append(&cat_node.nodes, group_node)
|
|
return cat_node, 1
|
|
}
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
optimize :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
|
|
result = tree
|
|
new_changes := 0
|
|
|
|
when common.ODIN_DEBUG_REGEX {
|
|
io.write_string(common.debug_stream, "AST before Optimizer: ")
|
|
parser.write_node(common.debug_stream, tree)
|
|
io.write_byte(common.debug_stream, '\n')
|
|
}
|
|
|
|
// Keep optimizing until no more changes are seen.
|
|
for {
|
|
result, new_changes = optimize_subtree(result, flags)
|
|
changes += new_changes
|
|
if new_changes == 0 {
|
|
break
|
|
}
|
|
}
|
|
|
|
when common.ODIN_DEBUG_REGEX {
|
|
io.write_string(common.debug_stream, "AST after Optimizer: ")
|
|
parser.write_node(common.debug_stream, result)
|
|
io.write_byte(common.debug_stream, '\n')
|
|
}
|
|
|
|
|
|
return
|
|
}
|