Files
Odin/core/text/regex/optimizer/optimizer.odin
2024-08-05 03:49:29 -04:00

531 lines
14 KiB
Odin

package regex_optimizer
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
@require import "core:io"
import "core:slice"
import "core:text/regex/common"
import "core:text/regex/parser"
Rune_Class_Range :: parser.Rune_Class_Range
Node :: parser.Node
Node_Rune :: parser.Node_Rune
Node_Rune_Class :: parser.Node_Rune_Class
Node_Wildcard :: parser.Node_Wildcard
Node_Concatenation :: parser.Node_Concatenation
Node_Alternation :: parser.Node_Alternation
Node_Repeat_Zero :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N :: parser.Node_Repeat_N
Node_Optional :: parser.Node_Optional
Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
Node_Group :: parser.Node_Group
Node_Anchor :: parser.Node_Anchor
Node_Word_Boundary :: parser.Node_Word_Boundary
Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape
class_range_sorter :: proc(i, j: Rune_Class_Range) -> bool {
return i.lower < j.lower
}
optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
if tree == nil {
return nil, 0
}
result = tree
switch specific in tree {
// No direct optimization possible on these nodes:
case ^Node_Rune: break
case ^Node_Wildcard: break
case ^Node_Anchor: break
case ^Node_Word_Boundary: break
case ^Node_Match_All_And_Escape: break
case ^Node_Concatenation:
// * Composition: Consume All to Anchored End
//
// DO: `.*$` => <special opcode>
// DO: `.+$` => `.` <special opcode>
if .Multiline not_in flags && len(specific.nodes) >= 2 {
i := len(specific.nodes) - 2
wrza: {
subnode := specific.nodes[i].(^Node_Repeat_Zero) or_break wrza
_ = subnode.inner.(^Node_Wildcard) or_break wrza
next_node := specific.nodes[i+1].(^Node_Anchor) or_break wrza
if next_node.start == false {
specific.nodes[i] = new(Node_Match_All_And_Escape)
ordered_remove(&specific.nodes, i + 1)
changes += 1
break
}
}
wroa: {
subnode := specific.nodes[i].(^Node_Repeat_One) or_break wroa
subsubnode := subnode.inner.(^Node_Wildcard) or_break wroa
next_node := specific.nodes[i+1].(^Node_Anchor) or_break wroa
if next_node.start == false {
specific.nodes[i] = subsubnode
specific.nodes[i+1] = new(Node_Match_All_And_Escape)
changes += 1
break
}
}
}
// Only recursive optimizations:
#no_bounds_check for i := 0; i < len(specific.nodes); i += 1 {
subnode, subnode_changes := optimize_subtree(specific.nodes[i], flags)
changes += subnode_changes
if subnode == nil {
ordered_remove(&specific.nodes, i)
i -= 1
changes += 1
} else {
specific.nodes[i] = subnode
}
}
if len(specific.nodes) == 1 {
result = specific.nodes[0]
changes += 1
} else if len(specific.nodes) == 0 {
return nil, changes + 1
}
case ^Node_Repeat_Zero:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_Zero_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_One:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_One_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_N:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Optional:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Optional_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Group:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
if !specific.capture {
result = specific.inner
changes += 1
}
// Full optimization:
case ^Node_Rune_Class:
// * Class Simplification
//
// DO: `[aab]` => `[ab]`
// DO: `[aa]` => `[a]`
runes_seen: map[rune]bool
for r in specific.runes {
runes_seen[r] = true
}
if len(runes_seen) != len(specific.runes) {
clear(&specific.runes)
for key in runes_seen {
append(&specific.runes, key)
}
changes += 1
}
// * Class Reduction
//
// DO: `[a]` => `a`
if !specific.negating && len(specific.runes) == 1 && len(specific.ranges) == 0 {
only_rune := specific.runes[0]
node := new(Node_Rune)
node.data = only_rune
return node, changes + 1
}
// * Range Construction
//
// DO: `[abc]` => `[a-c]`
slice.sort(specific.runes[:])
if len(specific.runes) > 1 {
new_range: Rune_Class_Range
new_range.lower = specific.runes[0]
new_range.upper = specific.runes[0]
#no_bounds_check for i := 1; i < len(specific.runes); i += 1 {
r := specific.runes[i]
if new_range.lower == -1 {
new_range = { r, r }
continue
}
if r == new_range.lower - 1 {
new_range.lower -= 1
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
} else if r == new_range.upper + 1 {
new_range.upper += 1
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
} else if new_range.lower != new_range.upper {
append(&specific.ranges, new_range)
new_range = { -1, -1 }
changes += 1
}
}
if new_range.lower != new_range.upper {
append(&specific.ranges, new_range)
changes += 1
}
}
// * Rune Merging into Range
//
// DO: `[aa-c]` => `[a-c]`
for range in specific.ranges {
#no_bounds_check for i := 0; i < len(specific.runes); i += 1 {
r := specific.runes[i]
if range.lower <= r && r <= range.upper {
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
}
}
}
// * Range Merging
//
// DO: `[a-cc-e]` => `[a-e]`
// DO: `[a-cd-e]` => `[a-e]`
// DO: `[a-cb-e]` => `[a-e]`
slice.sort_by(specific.ranges[:], class_range_sorter)
#no_bounds_check for i := 0; i < len(specific.ranges) - 1; i += 1 {
for j := i + 1; j < len(specific.ranges); j += 1 {
left_range := &specific.ranges[i]
right_range := specific.ranges[j]
if left_range.upper == right_range.lower ||
left_range.upper == right_range.lower - 1 ||
left_range.lower <= right_range.lower && right_range.lower <= left_range.upper {
left_range.upper = max(left_range.upper, right_range.upper)
ordered_remove(&specific.ranges, j)
j -= 1
changes += 1
} else {
break
}
}
}
if len(specific.ranges) == 0 {
specific.ranges = {}
}
if len(specific.runes) == 0 {
specific.runes = {}
}
// * NOP
//
// DO: `[]` => <nil>
if len(specific.ranges) + len(specific.runes) == 0 {
return nil, 1
}
slice.sort(specific.runes[:])
slice.sort_by(specific.ranges[:], class_range_sorter)
case ^Node_Alternation:
// Perform recursive optimization first.
left_changes, right_changes: int
specific.left, left_changes = optimize_subtree(specific.left, flags)
specific.right, right_changes = optimize_subtree(specific.right, flags)
changes += left_changes + right_changes
// * Alternation to Optional
//
// DO: `a|` => `a?`
if specific.left != nil && specific.right == nil {
node := new(Node_Optional)
node.inner = specific.left
return node, 1
}
// * Alternation to Optional Non-Greedy
//
// DO: `|a` => `a??`
if specific.right != nil && specific.left == nil {
node := new(Node_Optional_Non_Greedy)
node.inner = specific.right
return node, 1
}
// * NOP
//
// DO: `|` => <nil>
if specific.left == nil && specific.right == nil {
return nil, 1
}
left_rune, left_is_rune := specific.left.(^Node_Rune)
right_rune, right_is_rune := specific.right.(^Node_Rune)
if left_is_rune && right_is_rune {
if left_rune.data == right_rune.data {
// * Alternation Reduction
//
// DO: `a|a` => `a`
return left_rune, 1
} else {
// * Alternation to Class
//
// DO: `a|b` => `[ab]`
node := new(Node_Rune_Class)
append(&node.runes, left_rune.data)
append(&node.runes, right_rune.data)
return node, 1
}
}
left_wildcard, left_is_wildcard := specific.left.(^Node_Wildcard)
right_wildcard, right_is_wildcard := specific.right.(^Node_Wildcard)
// * Class Union
//
// DO: `[a0]|[b1]` => `[a0b1]`
left_class, left_is_class := specific.left.(^Node_Rune_Class)
right_class, right_is_class := specific.right.(^Node_Rune_Class)
if left_is_class && right_is_class {
for r in right_class.runes {
append(&left_class.runes, r)
}
for range in right_class.ranges {
append(&left_class.ranges, range)
}
return left_class, 1
}
// * Class Union
//
// DO: `[a-b]|c` => `[a-bc]`
if left_is_class && right_is_rune {
append(&left_class.runes, right_rune.data)
return left_class, 1
}
// * Class Union
//
// DO: `a|[b-c]` => `[b-ca]`
if left_is_rune && right_is_class {
append(&right_class.runes, left_rune.data)
return right_class, 1
}
// * Wildcard Reduction
//
// DO: `a|.` => `.`
if left_is_rune && right_is_wildcard {
return right_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `.|a` => `.`
if left_is_wildcard && right_is_rune {
return left_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `[ab]|.` => `.`
if left_is_class && right_is_wildcard {
return right_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `.|[ab]` => `.`
if left_is_wildcard && right_is_class {
return left_wildcard, 1
}
left_concatenation, left_is_concatenation := specific.left.(^Node_Concatenation)
right_concatenation, right_is_concatenation := specific.right.(^Node_Concatenation)
// * Common Suffix Elimination
//
// DO: `blueberry|strawberry` => `(?:blue|straw)berry`
if left_is_concatenation && right_is_concatenation {
// Remember that a concatenation could contain any node, not just runes.
left_len := len(left_concatenation.nodes)
right_len := len(right_concatenation.nodes)
least_len := min(left_len, right_len)
same_len := 0
for i := 1; i <= least_len; i += 1 {
left_subrune, left_is_subrune := left_concatenation.nodes[left_len - i].(^Node_Rune)
right_subrune, right_is_subrune := right_concatenation.nodes[right_len - i].(^Node_Rune)
if !left_is_subrune || !right_is_subrune {
// One of the nodes isn't a rune; there's nothing more we can do.
break
}
if left_subrune.data == right_subrune.data {
same_len += 1
} else {
// No more similarities.
break
}
}
if same_len > 0 {
// Dissolve this alternation into a concatenation.
cat_node := new(Node_Concatenation)
group_node := new(Node_Group)
append(&cat_node.nodes, group_node)
// Turn the concatenation into the common suffix.
for i := left_len - same_len; i < left_len; i += 1 {
append(&cat_node.nodes, left_concatenation.nodes[i])
}
// Construct the group of alternating prefixes.
for i := same_len; i > 0; i -= 1 {
pop(&left_concatenation.nodes)
pop(&right_concatenation.nodes)
}
// (Re-using this alternation node.)
alter_node := specific
alter_node.left = left_concatenation
alter_node.right = right_concatenation
group_node.inner = alter_node
return cat_node, 1
}
}
// * Common Prefix Elimination
//
// DO: `abi|abe` => `ab(?:i|e)`
if left_is_concatenation && right_is_concatenation {
// Try to identify a common prefix.
// Remember that a concatenation could contain any node, not just runes.
least_len := min(len(left_concatenation.nodes), len(right_concatenation.nodes))
same_len := 0
for i := 0; i < least_len; i += 1 {
left_subrune, left_is_subrune := left_concatenation.nodes[i].(^Node_Rune)
right_subrune, right_is_subrune := right_concatenation.nodes[i].(^Node_Rune)
if !left_is_subrune || !right_is_subrune {
// One of the nodes isn't a rune; there's nothing more we can do.
break
}
if left_subrune.data == right_subrune.data {
same_len = i + 1
} else {
// No more similarities.
break
}
}
if same_len > 0 {
cat_node := new(Node_Concatenation)
for i := 0; i < same_len; i += 1 {
append(&cat_node.nodes, left_concatenation.nodes[i])
}
for i := same_len; i > 0; i -= 1 {
ordered_remove(&left_concatenation.nodes, 0)
ordered_remove(&right_concatenation.nodes, 0)
}
group_node := new(Node_Group)
// (Re-using this alternation node.)
alter_node := specific
alter_node.left = left_concatenation
alter_node.right = right_concatenation
group_node.inner = alter_node
append(&cat_node.nodes, group_node)
return cat_node, 1
}
}
}
return
}
optimize :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
result = tree
new_changes := 0
when common.ODIN_DEBUG_REGEX {
io.write_string(common.debug_stream, "AST before Optimizer: ")
parser.write_node(common.debug_stream, tree)
io.write_byte(common.debug_stream, '\n')
}
// Keep optimizing until no more changes are seen.
for {
result, new_changes = optimize_subtree(result, flags)
changes += new_changes
if new_changes == 0 {
break
}
}
when common.ODIN_DEBUG_REGEX {
io.write_string(common.debug_stream, "AST after Optimizer: ")
parser.write_node(common.debug_stream, result)
io.write_byte(common.debug_stream, '\n')
}
return
}