Make RegEx VM restartable and fix iterator infinite loop

This commit is contained in:
Feoramund
2025-05-23 20:20:59 -04:00
parent a2c0720fb0
commit fedb9efb41
3 changed files with 68 additions and 10 deletions

View File

@@ -77,6 +77,8 @@ Match_Iterator :: struct {
vm: virtual_machine.Machine,
idx: int,
temp: runtime.Allocator,
threads: int,
done: bool,
}
/*
@@ -101,7 +103,6 @@ create :: proc(
permanent_allocator := context.allocator,
temporary_allocator := context.temp_allocator,
) -> (result: Regular_Expression, err: Error) {
// For the sake of speed and simplicity, we first run all the intermediate
// processes such as parsing and compilation through the temporary
// allocator.
@@ -294,6 +295,7 @@ create_iterator :: proc(
result.temp = temporary_allocator
result.vm = virtual_machine.create(result.regex.program, str)
result.vm.class_data = result.regex.class_data
result.threads = max(1, virtual_machine.opcode_count(result.vm.code) - 1)
return
}
@@ -457,8 +459,27 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
assert(len(it.capture.pos) >= common.MAX_CAPTURE_GROUPS,
"Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
// Guard against situations in which the iterator should finish.
if it.done {
return
}
runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
if it.idx > 0 {
// Reset the state needed to `virtual_machine.run` again.
it.vm.top_thread = 0
it.vm.current_rune = rune(0)
it.vm.current_rune_size = 0
for i in 0..<it.threads {
it.vm.threads[i] = {}
it.vm.next_threads[i] = {}
}
}
// Take note of where the string pointer is before we start.
sp_before := it.vm.string_pointer
saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
{
context.allocator = it.temp
@@ -469,6 +490,28 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
}
}
if !ok {
// Match failed, bail out.
return
}
if it.vm.string_pointer == sp_before {
// The string pointer did not move, but there was a match.
//
// At this point, the pattern supplied to the iterator will infinitely
// loop if we do not intervene.
it.done = true
}
if it.vm.string_pointer == len(it.vm.memory) {
// The VM hit the end of the string.
//
// We do not check at the start, because a match of pattern `$`
// against string "" is valid and must return a match.
//
// This check prevents a double-match of `$` against a non-empty string.
it.done = true
}
str := string(it.vm.memory)
num_groups: int
@@ -488,9 +531,7 @@ match_iterator :: proc(it: ^Match_Iterator) -> (result: Capture, index: int, ok:
num_groups = n
}
defer if ok {
it.idx += 1
}
defer it.idx += 1
if num_groups > 0 {
result = {it.capture.pos[:num_groups], it.capture.groups[:num_groups]}
@@ -504,8 +545,24 @@ match :: proc {
match_iterator,
}
/*
Reset an iterator, allowing it to be run again as if new.

This clears the iterator's own bookkeeping (`done`, `idx`) and the virtual
machine state that a run depends on: the string pointer, the top thread index,
the current rune and its size, and the first `it.threads` entries of both
thread lists.

Inputs:
- it: The iterator to reset.
*/
reset :: proc(it: ^Match_Iterator) {
	// Iterator-level state: not finished, and no matches handed out yet.
	it.done = false
	it.idx = 0

	// Put the VM's string pointer back to zero and clear its per-run state.
	it.vm.string_pointer = 0
	it.vm.top_thread = 0
	it.vm.current_rune = rune(0)
	it.vm.current_rune_size = 0

	// Zero every thread slot counted by `it.threads` in both thread lists.
	for i in 0..<it.threads {
		it.vm.threads[i] = {}
		it.vm.next_threads[i] = {}
	}
}
/*

View File

@@ -329,10 +329,10 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc:
run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check {
when UNICODE_MODE {
vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory)
vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer:])
} else {
if len(vm.memory) > 0 {
vm.next_rune = cast(rune)vm.memory[0]
vm.next_rune = cast(rune)vm.memory[vm.string_pointer]
vm.next_rune_size = 1
}
}
@@ -652,4 +652,4 @@ destroy :: proc(vm: Machine, allocator := context.allocator) {
delete(vm.busy_map)
free(vm.threads)
free(vm.next_threads)
}
}

View File

@@ -1119,7 +1119,7 @@ iterator_vectors := []Iterator_Test{
@test
test_match_iterator :: proc(t: ^testing.T) {
for test in iterator_vectors {
vector: for test in iterator_vectors {
it, err := regex.create_iterator(test.haystack, test.pattern, test.flags)
defer regex.destroy(it)
@@ -1128,7 +1128,8 @@ test_match_iterator :: proc(t: ^testing.T) {
for capture, idx in regex.match(&it) {
if idx >= len(test.expected) {
break
log.errorf("got more than expected number of captures for matching string %q against pattern %q\n\tidx %i = %v", test.haystack, test.pattern, idx, capture)
continue vector
}
check_capture(t, capture, test.expected[idx])
}