Files
Odin/core/text/regex/optimizer/optimizer.odin
gingerBill 842cfee0f3 Change Odin's LICENSE to zlib from BSD 3-clause
This change was made in order to allow things produced with Odin and using Odin's core library, to not require the LICENSE to also be distributed alongside the binary form.
2025-10-28 14:38:25 +00:00

531 lines
14 KiB
Odin

package regex_optimizer
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
@require import "core:io"
import "core:slice"
import "core:text/regex/common"
import "core:text/regex/parser"
Rune_Class_Range :: parser.Rune_Class_Range
Node :: parser.Node
Node_Rune :: parser.Node_Rune
Node_Rune_Class :: parser.Node_Rune_Class
Node_Wildcard :: parser.Node_Wildcard
Node_Concatenation :: parser.Node_Concatenation
Node_Alternation :: parser.Node_Alternation
Node_Repeat_Zero :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N :: parser.Node_Repeat_N
Node_Optional :: parser.Node_Optional
Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
Node_Group :: parser.Node_Group
Node_Anchor :: parser.Node_Anchor
Node_Word_Boundary :: parser.Node_Word_Boundary
Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape
class_range_sorter :: proc(i, j: Rune_Class_Range) -> bool {
return i.lower < j.lower
}
optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
if tree == nil {
return nil, 0
}
result = tree
switch specific in tree {
// No direct optimization possible on these nodes:
case ^Node_Rune: break
case ^Node_Wildcard: break
case ^Node_Anchor: break
case ^Node_Word_Boundary: break
case ^Node_Match_All_And_Escape: break
case ^Node_Concatenation:
// * Composition: Consume All to Anchored End
//
// DO: `.*$` => <special opcode>
// DO: `.+$` => `.` <special opcode>
if .Multiline not_in flags && len(specific.nodes) >= 2 {
i := len(specific.nodes) - 2
wrza: {
subnode := specific.nodes[i].(^Node_Repeat_Zero) or_break wrza
_ = subnode.inner.(^Node_Wildcard) or_break wrza
next_node := specific.nodes[i+1].(^Node_Anchor) or_break wrza
if next_node.start == false {
specific.nodes[i] = new(Node_Match_All_And_Escape)
ordered_remove(&specific.nodes, i + 1)
changes += 1
break
}
}
wroa: {
subnode := specific.nodes[i].(^Node_Repeat_One) or_break wroa
subsubnode := subnode.inner.(^Node_Wildcard) or_break wroa
next_node := specific.nodes[i+1].(^Node_Anchor) or_break wroa
if next_node.start == false {
specific.nodes[i] = subsubnode
specific.nodes[i+1] = new(Node_Match_All_And_Escape)
changes += 1
break
}
}
}
// Only recursive optimizations:
#no_bounds_check for i := 0; i < len(specific.nodes); i += 1 {
subnode, subnode_changes := optimize_subtree(specific.nodes[i], flags)
changes += subnode_changes
if subnode == nil {
ordered_remove(&specific.nodes, i)
i -= 1
changes += 1
} else {
specific.nodes[i] = subnode
}
}
if len(specific.nodes) == 1 {
result = specific.nodes[0]
changes += 1
} else if len(specific.nodes) == 0 {
return nil, changes + 1
}
case ^Node_Repeat_Zero:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_Zero_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_One:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_One_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_N:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Optional:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Optional_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Group:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
if !specific.capture {
result = specific.inner
changes += 1
}
// Full optimization:
case ^Node_Rune_Class:
// * Class Simplification
//
// DO: `[aab]` => `[ab]`
// DO: `[aa]` => `[a]`
runes_seen: map[rune]bool
for r in specific.runes {
runes_seen[r] = true
}
if len(runes_seen) != len(specific.runes) {
clear(&specific.runes)
for key in runes_seen {
append(&specific.runes, key)
}
changes += 1
}
// * Class Reduction
//
// DO: `[a]` => `a`
if !specific.negating && len(specific.runes) == 1 && len(specific.ranges) == 0 {
only_rune := specific.runes[0]
node := new(Node_Rune)
node.data = only_rune
return node, changes + 1
}
// * Range Construction
//
// DO: `[abc]` => `[a-c]`
slice.sort(specific.runes[:])
if len(specific.runes) > 1 {
new_range: Rune_Class_Range
new_range.lower = specific.runes[0]
new_range.upper = specific.runes[0]
#no_bounds_check for i := 1; i < len(specific.runes); i += 1 {
r := specific.runes[i]
if new_range.lower == -1 {
new_range = { r, r }
continue
}
if r == new_range.lower - 1 {
new_range.lower -= 1
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
} else if r == new_range.upper + 1 {
new_range.upper += 1
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
} else if new_range.lower != new_range.upper {
append(&specific.ranges, new_range)
new_range = { -1, -1 }
changes += 1
}
}
if new_range.lower != new_range.upper {
append(&specific.ranges, new_range)
changes += 1
}
}
// * Rune Merging into Range
//
// DO: `[aa-c]` => `[a-c]`
for range in specific.ranges {
#no_bounds_check for i := 0; i < len(specific.runes); i += 1 {
r := specific.runes[i]
if range.lower <= r && r <= range.upper {
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
}
}
}
// * Range Merging
//
// DO: `[a-cc-e]` => `[a-e]`
// DO: `[a-cd-e]` => `[a-e]`
// DO: `[a-cb-e]` => `[a-e]`
slice.sort_by(specific.ranges[:], class_range_sorter)
#no_bounds_check for i := 0; i < len(specific.ranges) - 1; i += 1 {
for j := i + 1; j < len(specific.ranges); j += 1 {
left_range := &specific.ranges[i]
right_range := specific.ranges[j]
if left_range.upper == right_range.lower ||
left_range.upper == right_range.lower - 1 ||
left_range.lower <= right_range.lower && right_range.lower <= left_range.upper {
left_range.upper = max(left_range.upper, right_range.upper)
ordered_remove(&specific.ranges, j)
j -= 1
changes += 1
} else {
break
}
}
}
if len(specific.ranges) == 0 {
specific.ranges = {}
}
if len(specific.runes) == 0 {
specific.runes = {}
}
// * NOP
//
// DO: `[]` => <nil>
if len(specific.ranges) + len(specific.runes) == 0 {
return nil, 1
}
slice.sort(specific.runes[:])
slice.sort_by(specific.ranges[:], class_range_sorter)
case ^Node_Alternation:
// Perform recursive optimization first.
left_changes, right_changes: int
specific.left, left_changes = optimize_subtree(specific.left, flags)
specific.right, right_changes = optimize_subtree(specific.right, flags)
changes += left_changes + right_changes
// * Alternation to Optional
//
// DO: `a|` => `a?`
if specific.left != nil && specific.right == nil {
node := new(Node_Optional)
node.inner = specific.left
return node, 1
}
// * Alternation to Optional Non-Greedy
//
// DO: `|a` => `a??`
if specific.right != nil && specific.left == nil {
node := new(Node_Optional_Non_Greedy)
node.inner = specific.right
return node, 1
}
// * NOP
//
// DO: `|` => <nil>
if specific.left == nil && specific.right == nil {
return nil, 1
}
left_rune, left_is_rune := specific.left.(^Node_Rune)
right_rune, right_is_rune := specific.right.(^Node_Rune)
if left_is_rune && right_is_rune {
if left_rune.data == right_rune.data {
// * Alternation Reduction
//
// DO: `a|a` => `a`
return left_rune, 1
} else {
// * Alternation to Class
//
// DO: `a|b` => `[ab]`
node := new(Node_Rune_Class)
append(&node.runes, left_rune.data)
append(&node.runes, right_rune.data)
return node, 1
}
}
left_wildcard, left_is_wildcard := specific.left.(^Node_Wildcard)
right_wildcard, right_is_wildcard := specific.right.(^Node_Wildcard)
// * Class Union
//
// DO: `[a0]|[b1]` => `[a0b1]`
left_class, left_is_class := specific.left.(^Node_Rune_Class)
right_class, right_is_class := specific.right.(^Node_Rune_Class)
if left_is_class && right_is_class {
for r in right_class.runes {
append(&left_class.runes, r)
}
for range in right_class.ranges {
append(&left_class.ranges, range)
}
return left_class, 1
}
// * Class Union
//
// DO: `[a-b]|c` => `[a-bc]`
if left_is_class && right_is_rune {
append(&left_class.runes, right_rune.data)
return left_class, 1
}
// * Class Union
//
// DO: `a|[b-c]` => `[b-ca]`
if left_is_rune && right_is_class {
append(&right_class.runes, left_rune.data)
return right_class, 1
}
// * Wildcard Reduction
//
// DO: `a|.` => `.`
if left_is_rune && right_is_wildcard {
return right_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `.|a` => `.`
if left_is_wildcard && right_is_rune {
return left_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `[ab]|.` => `.`
if left_is_class && right_is_wildcard {
return right_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `.|[ab]` => `.`
if left_is_wildcard && right_is_class {
return left_wildcard, 1
}
left_concatenation, left_is_concatenation := specific.left.(^Node_Concatenation)
right_concatenation, right_is_concatenation := specific.right.(^Node_Concatenation)
// * Common Suffix Elimination
//
// DO: `blueberry|strawberry` => `(?:blue|straw)berry`
if left_is_concatenation && right_is_concatenation {
// Remember that a concatenation could contain any node, not just runes.
left_len := len(left_concatenation.nodes)
right_len := len(right_concatenation.nodes)
least_len := min(left_len, right_len)
same_len := 0
for i := 1; i <= least_len; i += 1 {
left_subrune, left_is_subrune := left_concatenation.nodes[left_len - i].(^Node_Rune)
right_subrune, right_is_subrune := right_concatenation.nodes[right_len - i].(^Node_Rune)
if !left_is_subrune || !right_is_subrune {
// One of the nodes isn't a rune; there's nothing more we can do.
break
}
if left_subrune.data == right_subrune.data {
same_len += 1
} else {
// No more similarities.
break
}
}
if same_len > 0 {
// Dissolve this alternation into a concatenation.
cat_node := new(Node_Concatenation)
group_node := new(Node_Group)
append(&cat_node.nodes, group_node)
// Turn the concatenation into the common suffix.
for i := left_len - same_len; i < left_len; i += 1 {
append(&cat_node.nodes, left_concatenation.nodes[i])
}
// Construct the group of alternating prefixes.
for i := same_len; i > 0; i -= 1 {
pop(&left_concatenation.nodes)
pop(&right_concatenation.nodes)
}
// (Re-using this alternation node.)
alter_node := specific
alter_node.left = left_concatenation
alter_node.right = right_concatenation
group_node.inner = alter_node
return cat_node, 1
}
}
// * Common Prefix Elimination
//
// DO: `abi|abe` => `ab(?:i|e)`
if left_is_concatenation && right_is_concatenation {
// Try to identify a common prefix.
// Remember that a concatenation could contain any node, not just runes.
least_len := min(len(left_concatenation.nodes), len(right_concatenation.nodes))
same_len := 0
for i := 0; i < least_len; i += 1 {
left_subrune, left_is_subrune := left_concatenation.nodes[i].(^Node_Rune)
right_subrune, right_is_subrune := right_concatenation.nodes[i].(^Node_Rune)
if !left_is_subrune || !right_is_subrune {
// One of the nodes isn't a rune; there's nothing more we can do.
break
}
if left_subrune.data == right_subrune.data {
same_len = i + 1
} else {
// No more similarities.
break
}
}
if same_len > 0 {
cat_node := new(Node_Concatenation)
for i := 0; i < same_len; i += 1 {
append(&cat_node.nodes, left_concatenation.nodes[i])
}
for i := same_len; i > 0; i -= 1 {
ordered_remove(&left_concatenation.nodes, 0)
ordered_remove(&right_concatenation.nodes, 0)
}
group_node := new(Node_Group)
// (Re-using this alternation node.)
alter_node := specific
alter_node.left = left_concatenation
alter_node.right = right_concatenation
group_node.inner = alter_node
append(&cat_node.nodes, group_node)
return cat_node, 1
}
}
}
return
}
optimize :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
result = tree
new_changes := 0
when common.ODIN_DEBUG_REGEX {
io.write_string(common.debug_stream, "AST before Optimizer: ")
parser.write_node(common.debug_stream, tree)
io.write_byte(common.debug_stream, '\n')
}
// Keep optimizing until no more changes are seen.
for {
result, new_changes = optimize_subtree(result, flags)
changes += new_changes
if new_changes == 0 {
break
}
}
when common.ODIN_DEBUG_REGEX {
io.write_string(common.debug_stream, "AST after Optimizer: ")
parser.write_node(common.debug_stream, result)
io.write_byte(common.debug_stream, '\n')
}
return
}