diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin index 07ace7b5d..2f0f183e9 100644 --- a/core/text/regex/compiler/compiler.odin +++ b/core/text/regex/compiler/compiler.odin @@ -195,8 +195,12 @@ generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) { case ^Node_Anchor: if .Multiline in c.flags { - append(&code, Opcode.Multiline_Open) - append(&code, Opcode.Multiline_Close) + if specific.start { + append(&code, Opcode.Assert_Start_Multiline) + } else { + append(&code, Opcode.Multiline_Open) + append(&code, Opcode.Multiline_Close) + } } else { if specific.start { append(&code, Opcode.Assert_Start) @@ -439,7 +443,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: case .Save: continue - case .Assert_Start: + case .Assert_Start, .Assert_Start_Multiline: break optimize_opening case: diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index 94a4b163a..7456634ac 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -282,10 +282,6 @@ create_iterator :: proc( temporary_allocator := context.temp_allocator, ) -> (result: Match_Iterator, err: Error) { - if .Multiline in flags { - return {}, .Unsupported_Flag - } - result.regex = create(pattern, flags, permanent_allocator, temporary_allocator) or_return result.capture = preallocate_capture() result.temp = temporary_allocator @@ -555,6 +551,7 @@ reset :: proc(it: ^Match_Iterator) { it.vm.top_thread = 0 it.vm.current_rune = rune(0) it.vm.current_rune_size = 0 + it.vm.last_rune = rune(0) for i in 0.. (opcode: Opcode, pc: int, ok: case .Split: iter.pc += size_of(Opcode) + 2 * size_of(u16) case .Save: iter.pc += size_of(Opcode) + size_of(u8) case .Assert_Start: iter.pc += size_of(Opcode) + case .Assert_Start_Multiline: iter.pc += size_of(Opcode) case .Assert_End: iter.pc += size_of(Opcode) case .Assert_Word_Boundary: iter.pc += size_of(Opcode) case .Assert_Non_Word_Boundary: iter.pc += size_of(Opcode) @@ -64,6 +65,7 @@ opcode_to_name :: proc(opcode: Opcode) -> (str: string) { case .Split: str = "Split" case .Save: str = "Save" case .Assert_Start: str = "Assert_Start" + case .Assert_Start_Multiline: str = "Assert_Start_Multiline" case .Assert_End: str = "Assert_End" case .Assert_Word_Boundary: str = "Assert_Word_Boundary" case .Assert_Non_Word_Boundary: str = "Assert_Non_Word_Boundary" diff --git a/core/text/regex/virtual_machine/virtual_machine.odin b/core/text/regex/virtual_machine/virtual_machine.odin index 32b772802..c292b0e99 100644 --- a/core/text/regex/virtual_machine/virtual_machine.odin +++ b/core/text/regex/virtual_machine/virtual_machine.odin @@ -37,16 +37,17 @@ Opcode :: enum u8 { Split = 0x08, // | u16, u16 Save = 0x09, // | u8 Assert_Start = 0x0A, // | - Assert_End = 0x0B, // | - Assert_Word_Boundary = 0x0C, // | - Assert_Non_Word_Boundary = 0x0D, // | - Multiline_Open = 0x0E, // | - Multiline_Close = 0x0F, // | - Wait_For_Byte = 0x10, // | u8 - Wait_For_Rune = 0x11, // | i32 - Wait_For_Rune_Class = 0x12, // | u8 - Wait_For_Rune_Class_Negated = 0x13, // | u8 - Match_All_And_Escape = 0x14, // | + Assert_Start_Multiline = 0x0B, // | + Assert_End = 0x0C, // | + Assert_Word_Boundary = 0x0D, // | + Assert_Non_Word_Boundary = 0x0E, // | + Multiline_Open = 0x0F, // | + Multiline_Close = 0x10, // | + Wait_For_Byte = 0x11, // | u8 + Wait_For_Rune = 0x12, // | i32 + Wait_For_Rune_Class = 0x13, // | u8 + Wait_For_Rune_Class_Negated = 0x14, // | u8 + Match_All_And_Escape = 0x15, // | } Thread :: struct { @@ -77,6 +78,8 @@ Machine :: struct { current_rune_size: int, next_rune: rune, next_rune_size: int, + + last_rune: rune, } @@ -169,6 +172,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: pc += size_of(Opcode) continue } + case .Assert_Start_Multiline: + sp := vm.string_pointer+vm.current_rune_size + if sp == 0 || vm.last_rune == '\n' || vm.last_rune == '\r' { + pc += size_of(Opcode) + continue + } case .Assert_End: sp := vm.string_pointer+vm.current_rune_size if sp == len(vm.memory) { @@ -177,24 +186,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: } case .Multiline_Open: sp := vm.string_pointer+vm.current_rune_size - if sp == 0 || sp == len(vm.memory) { - if vm.next_rune == '\r' || vm.next_rune == '\n' { - // The VM is currently on a newline at the string boundary, - // so consume the newline next frame. - when common.ODIN_DEBUG_REGEX { - io.write_string(common.debug_stream, "*** New thread added [PC:") - common.write_padded_hex(common.debug_stream, pc, 4) - io.write_string(common.debug_stream, "]\n") - } - vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved } - vm.top_thread += 1 - } else { - // Skip the `Multiline_Close` opcode. - pc += 2 * size_of(Opcode) - continue - } + if sp == len(vm.memory) { + // Skip the `Multiline_Close` opcode. + pc += 2 * size_of(Opcode) + continue } else { - // Not on a string boundary. + // Not at the end of the string. // Try to consume a newline next frame in the other opcode loop. when common.ODIN_DEBUG_REGEX { io.write_string(common.debug_stream, "*** New thread added [PC:") @@ -613,6 +610,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU break } + vm.last_rune = vm.current_rune vm.string_pointer += vm.current_rune_size } diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index aed3091e1..8369444b9 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -699,15 +699,15 @@ test_case_insensitive :: proc(t: ^testing.T) { test_multiline :: proc(t: ^testing.T) { { EXPR :: `^hellope$world$` - check_expression(t, EXPR, "\nhellope\nworld\n", "\nhellope\nworld\n", extra_flags = { .Multiline }) + check_expression(t, EXPR, "hellope\nworld\n", "hellope\nworld\n", extra_flags = { .Multiline }) check_expression(t, EXPR, "hellope\nworld", "hellope\nworld", extra_flags = { .Multiline }) check_expression(t, EXPR, "hellope\rworld", "hellope\rworld", extra_flags = { .Multiline }) check_expression(t, EXPR, "hellope\r\nworld", "hellope\r\nworld", extra_flags = { .Multiline }) } { - EXPR :: `^?.$` - check_expression(t, EXPR, "\nh", "\nh", extra_flags = { .Multiline }) + EXPR :: `^.$` check_expression(t, EXPR, "h", "h", extra_flags = { .Multiline }) + check_expression(t, EXPR, "h\n", "h\n", extra_flags = { .Multiline }) } { EXPR :: `^$` @@ -1219,6 +1219,57 @@ iterator_vectors := []Iterator_Test{ {pos = {{3, 3}}, groups = {""}}, }, }, + // Multiline iteration is supported, but it must follow the `^...$` scheme. + // + // Any usage outside of this strict syntax will produce predictable but + // unusual outputs, as `^` is defined to assert the start of a string or + // that a newline sequence was previously consumed, and `$` consumes a + // newline sequence or asserts the end of the string. + { + "foo1\nfoo2\r\nfoo3\rfoo4", `^foo.$`, {.Multiline}, + { + {pos = {{0, 5}}, groups = {"foo1\n"}}, + {pos = {{5, 11}}, groups = {"foo2\r\n"}}, + {pos = {{11, 16}}, groups = {"foo3\r"}}, + {pos = {{16, 20}}, groups = {"foo4"}}, + }, + }, + { + "a\nb\n\r", `^$`, {.Multiline}, + {}, + }, + { + "a\nb\n", `^$`, {.Multiline}, + {}, + }, + { + "a\nb", `^$`, {.Multiline}, + {}, + }, + // Multiline anchors must work within groups, as people are going to end up + // using them in there and we do not forbid it. + { + "a\nb\na\nb", `(?:^a$|^b$)`, {.Multiline}, + { + {pos = {{0, 2}}, groups = {"a\n"}}, + {pos = {{2, 4}}, groups = {"b\n"}}, + {pos = {{4, 6}}, groups = {"a\n"}}, + {pos = {{6, 7}}, groups = {"b"}}, + }, + }, + // The following patterns are valid uses of optional anchors and must match. + { + "a\nb\na\nb", `^a(?:b|$)`, {.Multiline}, + { + {pos = {{0, 2}}, groups = {"a\n"}}, + }, + }, + { + "a\nb\na\nb", `^ab?$?`, {.Multiline}, + { + {pos = {{0, 2}}, groups = {"a\n"}}, + }, + }, } @test