Merge pull request #3962 from Feoramund/regex

Add `core:text/regex`
This commit is contained in:
gingerBill
2024-08-21 13:55:11 +01:00
committed by GitHub
21 changed files with 5141 additions and 0 deletions

View File

@@ -0,0 +1,46 @@
// This package helps break dependency cycles.
package regex_common
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
// VM limitations

// Maximum number of capture groups tracked by the virtual machine.
// Configurable through ODIN_REGEX_MAX_CAPTURE_GROUPS, but clamped so it is
// never below the default of 10.
MAX_CAPTURE_GROUPS :: max(#config(ODIN_REGEX_MAX_CAPTURE_GROUPS, 10), 10)
// Compiled programs are addressed with 16-bit signed jump operands, which
// caps how large a program may grow.
MAX_PROGRAM_SIZE :: int(max(i16))
// Rune classes are referenced by a single-byte operand, capping their count.
MAX_CLASSES :: int(max(u8))

// Options that alter how a pattern is compiled and matched.
Flag :: enum u8 {
	// Global: try to match the pattern anywhere in the string.
	Global,
	// Multiline: treat `^` and `$` as if they also match newlines.
	Multiline,
	// Case Insensitive: treat `a-z` as if it was also `A-Z`.
	Case_Insensitive,
	// Ignore Whitespace: bypass unescaped whitespace outside of classes.
	Ignore_Whitespace,
	// Unicode: let the compiler and virtual machine know to expect Unicode strings.
	Unicode,
	// No Capture: avoid saving capture group data entirely.
	No_Capture,
	// No Optimization: do not pass the pattern through the optimizer; for debugging.
	No_Optimization,
}

// A set of flags, packed into a single byte.
Flags :: bit_set[Flag; u8]

// Maps each flag to the single-letter form used when a pattern's flags are
// rendered as text.
@(rodata)
Flag_To_Letter := #sparse[Flag]u8 {
	.Global            = 'g',
	.Multiline         = 'm',
	.Case_Insensitive  = 'i',
	.Ignore_Whitespace = 'x',
	.Unicode           = 'u',
	.No_Capture        = 'n',
	.No_Optimization   = '-',
}

View File

@@ -0,0 +1,33 @@
package regex_common
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
@require import "core:os"
import "core:io"
import "core:strings"
// Compile-time switch for regex debugging output, settable with
// `-define:ODIN_DEBUG_REGEX=true`.
ODIN_DEBUG_REGEX :: #config(ODIN_DEBUG_REGEX, false)

when ODIN_DEBUG_REGEX {
	// Debug output is written to stderr so it does not mix with stdout.
	debug_stream := os.stream_from_handle(os.stderr)
}
// Writes `n` to `w` as hexadecimal, prefixed with "0x" and left-padded with
// '0' characters until at least `zeroes` digits have been written.
write_padded_hex :: proc(w: io.Writer, #any_int n, zeroes: int) {
	// Render the digits into a scratch builder first, purely to learn how
	// many characters the number occupies in base 16.
	scratch := strings.builder_make()
	defer strings.builder_destroy(&scratch)
	io.write_int(strings.to_writer(&scratch), n, 0x10)

	io.write_string(w, "0x")

	padding := zeroes - strings.builder_len(scratch)
	for padding > 0 {
		io.write_byte(w, '0')
		padding -= 1
	}

	io.write_int(w, n, 0x10)
}

View File

@@ -0,0 +1,548 @@
package regex_compiler
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
import "core:text/regex/common"
import "core:text/regex/parser"
import "core:text/regex/tokenizer"
import "core:text/regex/virtual_machine"
import "core:unicode"
// Convenience aliases re-exported from the tokenizer and parser packages.
Token :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer :: tokenizer.Tokenizer
Rune_Class_Range :: parser.Rune_Class_Range
Rune_Class_Data :: parser.Rune_Class_Data
Node :: parser.Node
Node_Rune :: parser.Node_Rune
Node_Rune_Class :: parser.Node_Rune_Class
Node_Wildcard :: parser.Node_Wildcard
Node_Concatenation :: parser.Node_Concatenation
Node_Alternation :: parser.Node_Alternation
Node_Repeat_Zero :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N :: parser.Node_Repeat_N
Node_Optional :: parser.Node_Optional
Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
Node_Group :: parser.Node_Group
Node_Anchor :: parser.Node_Anchor
Node_Word_Boundary :: parser.Node_Word_Boundary
Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape

Opcode :: virtual_machine.Opcode
// A compiled program is a flat stream of opcodes with inline operands.
Program :: [dynamic]Opcode

// Encoded sizes of the control-flow instructions: one opcode byte plus one
// (Jump) or two (Split) 16-bit jump operands.
JUMP_SIZE :: size_of(Opcode) + 1 * size_of(u16)
SPLIT_SIZE :: size_of(Opcode) + 2 * size_of(u16)

// Per-compilation state threaded through `generate_code`.
Compiler :: struct {
	// Options that alter code generation (Unicode, Multiline, No_Capture, ...).
	flags: common.Flags,
	// Deduplicated rune class data; class opcodes refer to it by index.
	class_data: [dynamic]Rune_Class_Data,
}

// Errors reportable by `compile`.
Error :: enum {
	None,
	// The program grew past common.MAX_PROGRAM_SIZE (i16 jump addressing).
	Program_Too_Big,
	// More distinct rune classes than common.MAX_CLASSES (u8 operand index).
	Too_Many_Classes,
}
// Reports whether two rune classes describe exactly the same runes and
// ranges, element for element, in the same order.
classes_are_exact :: proc(q, w: ^Rune_Class_Data) -> bool #no_bounds_check {
	assert(q != nil)
	assert(w != nil)

	// The same underlying data is trivially an exact match.
	if q == w {
		return true
	}

	// Differing lengths can never compare equal.
	if len(q.runes) != len(w.runes) || len(q.ranges) != len(w.ranges) {
		return false
	}

	for i in 0..<len(q.runes) {
		if q.runes[i] != w.runes[i] {
			return false
		}
	}

	for i in 0..<len(q.ranges) {
		a, b := q.ranges[i], w.ranges[i]
		if a.lower != b.lower || a.upper != b.upper {
			return false
		}
	}

	return true
}
// Walks the AST rooted at `tree` and gathers every distinct rune class into
// `collection`, skipping classes that compare exactly equal to one already
// collected.
map_all_classes :: proc(tree: Node, collection: ^[dynamic]Rune_Class_Data) {
	if tree == nil {
		return
	}

	switch specific in tree {
	// Leaf nodes that carry no class data:
	case ^Node_Rune:                 break
	case ^Node_Wildcard:             break
	case ^Node_Anchor:               break
	case ^Node_Word_Boundary:        break
	case ^Node_Match_All_And_Escape: break

	// Compound nodes only require recursion:
	case ^Node_Concatenation:
		for child in specific.nodes {
			map_all_classes(child, collection)
		}
	case ^Node_Alternation:
		map_all_classes(specific.left, collection)
		map_all_classes(specific.right, collection)
	case ^Node_Repeat_Zero:            map_all_classes(specific.inner, collection)
	case ^Node_Repeat_Zero_Non_Greedy: map_all_classes(specific.inner, collection)
	case ^Node_Repeat_One:             map_all_classes(specific.inner, collection)
	case ^Node_Repeat_One_Non_Greedy:  map_all_classes(specific.inner, collection)
	case ^Node_Repeat_N:               map_all_classes(specific.inner, collection)
	case ^Node_Optional:               map_all_classes(specific.inner, collection)
	case ^Node_Optional_Non_Greedy:    map_all_classes(specific.inner, collection)
	case ^Node_Group:                  map_all_classes(specific.inner, collection)

	// The one node type that actually contributes class data:
	case ^Node_Rune_Class:
		// Bail out early if an exact duplicate was already collected.
		for &known in collection {
			if classes_are_exact(&specific.data, &known) {
				return
			}
		}
		append(collection, specific.data)
	}
}
// Appends the raw bytes of `data` to the program, one opcode slot per byte.
append_raw :: #force_inline proc(code: ^Program, data: $T) {
	// NOTE: This is system-dependent endian.
	raw := transmute([size_of(T)]byte)data
	for i in 0..<size_of(T) {
		append(code, Opcode(raw[i]))
	}
}
// Inserts the raw bytes of `data` into the program at `start`, shifting any
// later opcodes forward.
inject_raw :: #force_inline proc(code: ^Program, start: int, data: $T) {
	// NOTE: This is system-dependent endian.
	raw := transmute([size_of(T)]byte)data
	for i in 0..<size_of(T) {
		inject_at(code, start + i, Opcode(raw[i]))
	}
}
// Recursively emits bytecode for a single AST node into a fresh program
// fragment. Jump/Split operands are written here as *relative* i16 offsets,
// measured from the start of the instruction; `compile` later rewrites them
// into absolute addresses.
@require_results
generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
	if node == nil {
		return
	}

	// NOTE: For Jump/Split arguments, we write as i16 and will reinterpret
	// this later when relative jumps are turned into absolute jumps.

	switch specific in node {
	// Atomic Nodes:
	case ^Node_Rune:
		// Runes that fit in one byte use the cheaper Byte opcode; otherwise
		// the full rune is stored inline as an operand.
		if .Unicode not_in c.flags || specific.data < unicode.MAX_LATIN1 {
			append(&code, Opcode.Byte)
			append(&code, cast(Opcode)specific.data)
		} else {
			append(&code, Opcode.Rune)
			append_raw(&code, specific.data)
		}
	case ^Node_Rune_Class:
		if specific.negating {
			append(&code, Opcode.Rune_Class_Negated)
		} else {
			append(&code, Opcode.Rune_Class)
		}
		// The operand is the index into the class data that was collected
		// earlier by `map_all_classes`.
		index := -1
		for &data, i in c.class_data {
			if classes_are_exact(&data, &specific.data) {
				index = i
				break
			}
		}
		assert(index != -1, "Unable to find collected Rune_Class_Data index.")
		append(&code, Opcode(index))
	case ^Node_Wildcard:
		append(&code, Opcode.Wildcard)
	case ^Node_Anchor:
		if .Multiline in c.flags {
			// NOTE(review): both Multiline_Open and Multiline_Close are
			// emitted here regardless of `specific.start` — confirm this is
			// intended and not a lost `if` branch.
			append(&code, Opcode.Multiline_Open)
			append(&code, Opcode.Multiline_Close)
		} else {
			if specific.start {
				append(&code, Opcode.Assert_Start)
			} else {
				append(&code, Opcode.Assert_End)
			}
		}
	case ^Node_Word_Boundary:
		if specific.non_word {
			append(&code, Opcode.Assert_Non_Word_Boundary)
		} else {
			append(&code, Opcode.Assert_Word_Boundary)
		}

	// Compound Nodes:
	case ^Node_Group:
		code = generate_code(c, specific.inner)
		if specific.capture && .No_Capture not_in c.flags {
			// Wrap the inner code in Save instructions: slot 2N opens
			// capture group N, slot 2N+1 closes it.
			inject_at(&code, 0, Opcode.Save)
			inject_at(&code, 1, Opcode(2 * specific.capture_id))
			append(&code, Opcode.Save)
			append(&code, Opcode(2 * specific.capture_id + 1))
		}
	case ^Node_Alternation:
		// `a|b` compiles to: Split L, R; L: <left>; Jump end; R: <right>
		left := generate_code(c, specific.left)
		right := generate_code(c, specific.right)
		left_len := len(left)
		// Avoiding duplicate allocation by reusing `left`.
		code = left
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte), i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + left_len + JUMP_SIZE))
		append(&code, Opcode.Jump)
		append_raw(&code, i16(len(right) + JUMP_SIZE))
		for opcode in right {
			append(&code, opcode)
		}
	case ^Node_Concatenation:
		// Subnode fragments are emitted back to back.
		for subnode in specific.nodes {
			subnode_code := generate_code(c, subnode)
			for opcode in subnode_code {
				append(&code, opcode)
			}
		}
	case ^Node_Repeat_Zero:
		// `e*`: Split body, exit; <body>; Jump back to the Split.
		code = generate_code(c, specific.inner)
		original_len := len(code)
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte), i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len + JUMP_SIZE))
		append(&code, Opcode.Jump)
		append_raw(&code, i16(-original_len - SPLIT_SIZE))
	case ^Node_Repeat_Zero_Non_Greedy:
		// `e*?`: identical to `e*` but the Split prefers the exit branch.
		code = generate_code(c, specific.inner)
		original_len := len(code)
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte), i16(SPLIT_SIZE + original_len + JUMP_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
		append(&code, Opcode.Jump)
		append_raw(&code, i16(-original_len - SPLIT_SIZE))
	case ^Node_Repeat_One:
		// `e+`: <body>; Split back-to-body, exit.
		code = generate_code(c, specific.inner)
		original_len := len(code)
		append(&code, Opcode.Split)
		append_raw(&code, i16(-original_len))
		append_raw(&code, i16(SPLIT_SIZE))
	case ^Node_Repeat_One_Non_Greedy:
		// `e+?`: as `e+` but the Split prefers the exit branch.
		code = generate_code(c, specific.inner)
		original_len := len(code)
		append(&code, Opcode.Split)
		append_raw(&code, i16(SPLIT_SIZE))
		append_raw(&code, i16(-original_len))
	case ^Node_Repeat_N:
		// Bounded repetition is unrolled from copies of the inner fragment.
		inside := generate_code(c, specific.inner)
		original_len := len(inside)
		if specific.lower == specific.upper { // {N}
			// e{N} ... evaluates to ... e^N
			for i := 0; i < specific.upper; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else if specific.lower == -1 && specific.upper > 0 { // {,M}
			// e{,M} ... evaluates to ... e?^M
			for i := 0; i < specific.upper; i += 1 {
				append(&code, Opcode.Split)
				append_raw(&code, i16(SPLIT_SIZE))
				append_raw(&code, i16(SPLIT_SIZE + original_len))
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else if specific.lower >= 0 && specific.upper == -1 { // {N,}
			// e{N,} ... evaluates to ... e^N e*
			for i := 0; i < specific.lower; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
			append(&code, Opcode.Split)
			append_raw(&code, i16(SPLIT_SIZE))
			append_raw(&code, i16(SPLIT_SIZE + original_len + JUMP_SIZE))
			for opcode in inside {
				append(&code, opcode)
			}
			append(&code, Opcode.Jump)
			append_raw(&code, i16(-original_len - SPLIT_SIZE))
		} else if specific.lower >= 0 && specific.upper > 0 {
			// e{N,M} evaluates to ... e^N e?^(M-N)
			for i := 0; i < specific.lower; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
			for i := 0; i < specific.upper - specific.lower; i += 1 {
				append(&code, Opcode.Split)
				append_raw(&code, i16(SPLIT_SIZE + original_len))
				append_raw(&code, i16(SPLIT_SIZE))
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else {
			panic("RegEx compiler received invalid repetition group.")
		}
	case ^Node_Optional:
		// `e?`: Split body, skip; <body>.
		code = generate_code(c, specific.inner)
		original_len := len(code)
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte), i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len))
	case ^Node_Optional_Non_Greedy:
		// `e??`: as `e?` but the Split prefers skipping the body.
		code = generate_code(c, specific.inner)
		original_len := len(code)
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte), i16(SPLIT_SIZE + original_len))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
	case ^Node_Match_All_And_Escape:
		append(&code, Opcode.Match_All_And_Escape)
	}

	return
}
// Compiles a parsed AST into a bytecode program plus its deduplicated rune
// class data, applying the Global/No_Capture flags and post-compile jump
// optimizations. Fails if the program or the class count exceeds the VM's
// limits.
@require_results
compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: [dynamic]Rune_Class_Data, err: Error) {
	if tree == nil {
		// An empty pattern matches the empty string.
		if .No_Capture not_in flags {
			append(&code, Opcode.Save); append(&code, Opcode(0x00))
			append(&code, Opcode.Save); append(&code, Opcode(0x01))
			append(&code, Opcode.Match)
		} else {
			append(&code, Opcode.Match_And_Exit)
		}
		return
	}

	c: Compiler
	c.flags = flags

	// Collect every distinct class up front so class opcodes can refer to
	// them by a one-byte index.
	map_all_classes(tree, &class_data)
	if len(class_data) >= common.MAX_CLASSES {
		err = .Too_Many_Classes
		return
	}
	c.class_data = class_data

	code = generate_code(&c, tree)

	// `pc_open` tracks how many bytes of prologue have been injected at the
	// front of the program so far.
	pc_open := 0
	add_global: if .Global in flags {
		// Check if the opening to the pattern is predictable.
		// If so, use one of the optimized Wait opcodes.
		iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
		seek_loop: for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
			#partial switch opcode {
			case .Byte:
				inject_at(&code, pc_open, Opcode.Wait_For_Byte)
				pc_open += size_of(Opcode)
				// Copy the operand of the original instruction, whose
				// position has shifted by the bytes injected so far.
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Rune:
				operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
				inject_at(&code, pc_open, Opcode.Wait_For_Rune)
				pc_open += size_of(Opcode)
				inject_raw(&code, pc_open, operand)
				pc_open += size_of(rune)
				break add_global
			case .Rune_Class:
				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Rune_Class_Negated:
				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Save:
				// Skip over capture bookkeeping to find the first real
				// matching instruction.
				continue
			case:
				// Unpredictable opening; fall back to the generic prologue.
				break seek_loop
			}
		}
		// `.*?` — a non-greedy consume-anything loop in front of the pattern.
		inject_at(&code, pc_open, Opcode.Split)
		pc_open += size_of(byte)
		inject_raw(&code, pc_open, i16(SPLIT_SIZE + size_of(byte) + JUMP_SIZE))
		pc_open += size_of(i16)
		inject_raw(&code, pc_open, i16(SPLIT_SIZE))
		pc_open += size_of(i16)
		inject_at(&code, pc_open, Opcode.Wildcard)
		pc_open += size_of(byte)
		inject_at(&code, pc_open, Opcode.Jump)
		pc_open += size_of(byte)
		inject_raw(&code, pc_open, i16(-size_of(byte) - SPLIT_SIZE))
		pc_open += size_of(i16)
	}

	if .No_Capture not_in flags {
		// `(` <generated code>
		inject_at(&code, pc_open, Opcode.Save)
		inject_at(&code, pc_open + size_of(byte), Opcode(0x00))
		// `)`
		append(&code, Opcode.Save); append(&code, Opcode(0x01))
		append(&code, Opcode.Match)
	} else {
		append(&code, Opcode.Match_And_Exit)
	}

	if len(code) >= common.MAX_PROGRAM_SIZE {
		err = .Program_Too_Big
		return
	}

	// NOTE: No further opcode addition beyond this point, as we've already
	// checked the program size. Removal or transformation is fine.

	// Post-Compile Optimizations:

	// * Jump Extension
	//
	// A:RelJmp(1) -> B:RelJmp(2) => A:RelJmp(2)
	if .No_Optimization not_in flags {
		// Keep re-running the pass while it still finds jump chains to
		// collapse.
		for passes_left := 1; passes_left > 0; passes_left -= 1 {
			do_another_pass := false
			iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
			for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
				#partial switch opcode {
				case .Jump:
					jmp := cast(^i16)&code[pc+size_of(Opcode)]
					jmp_value := intrinsics.unaligned_load(jmp)
					if code[cast(i16)pc+jmp_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp, jmp_value + next_jmp)
						do_another_pass = true
					}
				case .Split:
					// Collapse chains on both branches of the Split.
					jmp_x := cast(^i16)&code[pc+size_of(Opcode)]
					jmp_x_value := intrinsics.unaligned_load(jmp_x)
					if code[cast(i16)pc+jmp_x_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp_x, jmp_x_value + next_jmp)
						do_another_pass = true
					}
					jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)]
					jmp_y_value := intrinsics.unaligned_load(jmp_y)
					if code[cast(i16)pc+jmp_y_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp_y, jmp_y_value + next_jmp)
						do_another_pass = true
					}
				}
			}
			if do_another_pass {
				passes_left += 1
			}
		}
	}

	// * Relative Jump to Absolute Jump
	//
	// RelJmp{PC +/- N} => AbsJmp{M}
	iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
	for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
		// NOTE: The virtual machine implementation depends on this.
		#partial switch opcode {
		case .Jump:
			jmp := cast(^u16)&code[pc+size_of(Opcode)]
			intrinsics.unaligned_store(jmp, intrinsics.unaligned_load(jmp) + cast(u16)pc)
		case .Split:
			jmp_x := cast(^u16)&code[pc+size_of(Opcode)]
			intrinsics.unaligned_store(jmp_x, intrinsics.unaligned_load(jmp_x) + cast(u16)pc)
			jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)]
			intrinsics.unaligned_store(jmp_y, intrinsics.unaligned_load(jmp_y) + cast(u16)pc)
		}
	}

	return
}

View File

@@ -0,0 +1,93 @@
package regex_compiler
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
import "core:io"
import "core:text/regex/common"
import "core:text/regex/virtual_machine"
// Builds a map from jump destination PC to the PC of an instruction that
// jumps there, used by `trace` to annotate incoming jump edges.
get_jump_targets :: proc(code: []Opcode) -> (jump_targets: map[int]int) {
	// Reads one absolute u16 jump operand at byte offset `at`.
	read_target :: proc(code: []Opcode, at: int) -> int {
		return cast(int)intrinsics.unaligned_load(cast(^u16)&code[at])
	}

	iter := virtual_machine.Opcode_Iterator{ code, 0 }
	for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
		#partial switch opcode {
		case .Jump:
			jump_targets[read_target(code, pc+1)] = pc
		case .Split:
			jump_targets[read_target(code, pc+1)] = pc
			jump_targets[read_target(code, pc+3)] = pc
		}
	}
	return
}
// Writes a human-readable disassembly of a compiled program to `w`, one
// instruction per line, with incoming jump edges annotated on the left.
trace :: proc(w: io.Writer, code: []Opcode) {
	jump_targets := get_jump_targets(code)
	defer delete(jump_targets)

	iter := virtual_machine.Opcode_Iterator{ code, 0 }
	for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
		// Mark instructions that are the destination of a Jump or Split.
		if src, ok := jump_targets[pc]; ok {
			io.write_string(w, "--")
			common.write_padded_hex(w, src, 4)
			io.write_string(w, "--> ")
		} else {
			// NOTE(review): this padding should visually align with the
			// "--XXXX--> " prefix above — confirm the intended width.
			io.write_string(w, " ")
		}
		io.write_string(w, "[PC: ")
		common.write_padded_hex(w, pc, 4)
		io.write_string(w, "] ")
		io.write_string(w, virtual_machine.opcode_to_name(opcode))
		io.write_byte(w, ' ')

		// Decode and print the operand, if the opcode has one.
		#partial switch opcode {
		case .Byte:
			operand := cast(rune)code[pc+1]
			io.write_encoded_rune(w, operand)
		case .Rune:
			operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
			io.write_encoded_rune(w, operand)
		case .Rune_Class, .Rune_Class_Negated:
			operand := cast(u8)code[pc+1]
			common.write_padded_hex(w, operand, 2)
		case .Jump:
			jmp := intrinsics.unaligned_load(cast(^u16)&code[pc+1])
			io.write_string(w, "-> $")
			common.write_padded_hex(w, jmp, 4)
		case .Split:
			jmp_x := intrinsics.unaligned_load(cast(^u16)&code[pc+1])
			jmp_y := intrinsics.unaligned_load(cast(^u16)&code[pc+3])
			io.write_string(w, "=> $")
			common.write_padded_hex(w, jmp_x, 4)
			io.write_string(w, ", $")
			common.write_padded_hex(w, jmp_y, 4)
		case .Save:
			operand := cast(u8)code[pc+1]
			common.write_padded_hex(w, operand, 2)
		case .Wait_For_Byte:
			operand := cast(rune)code[pc+1]
			io.write_encoded_rune(w, operand)
		case .Wait_For_Rune:
			// FIX: the operand was previously read with a plain pointer
			// dereference, which is an unaligned load; use unaligned_load
			// as every other multi-byte operand read does.
			operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
			io.write_encoded_rune(w, operand)
		case .Wait_For_Rune_Class, .Wait_For_Rune_Class_Negated:
			operand := cast(u8)code[pc+1]
			common.write_padded_hex(w, operand, 2)
		}
		io.write_byte(w, '\n')
	}
}

View File

@@ -0,0 +1,9 @@
/*
package regex_compiler implements a bytecode compiler for the virtual machine
included alongside it.
Operands larger than u8 are written in system endian order.
More details can be found in the documentation for the virtual machine.
*/
package regex_compiler

97
core/text/regex/doc.odin Normal file
View File

@@ -0,0 +1,97 @@
/*
package regex implements a complete suite for using Regular Expressions to
match and capture text.
Regular expressions are used to describe how a piece of text can match to
another, using a pattern language.
Odin's regex library implements the following features:
Alternation: `apple|cherry`
Classes: `[0-9_]`
Classes, negated: `[^0-9_]`
Shorthands: `\d\s\w`
Shorthands, negated: `\D\S\W`
Wildcards: `.`
Repeat, optional: `a*`
Repeat, at least once: `a+`
Repetition: `a{1,2}`
Optional: `a?`
Group, capture: `([0-9])`
Group, non-capture: `(?:[0-9])`
Start & End Anchors: `^hello$`
Word Boundaries: `\bhello\b`
Non-Word Boundaries: `hello\B`
These specifiers can be composed together, such as an optional group:
`(?:hello)?`
This package also supports the non-greedy variants of the repeating and
optional specifiers by appending a `?` to them.
Of the shorthand classes that are supported, they are all ASCII-based, even
when compiling in Unicode mode. This is for the sake of general performance and
simplicity, as there are thousands of Unicode codepoints which would qualify as
either a digit, space, or word character which could be irrelevant depending on
what is being matched.
Here are the shorthand class equivalencies:
\d: [0-9]
\s: [\t\n\f\r ]
\w: [0-9A-Z_a-z]
If you need your own shorthands, you can compose strings together like so:
MY_HEX :: "[0-9A-Fa-f]"
PATTERN :: MY_HEX + "-" + MY_HEX
The compiler will handle turning multiple identical classes into references to
the same set of matching runes, so there's no penalty for doing it like this.
``Some people, when confronted with a problem, think
"I know, I'll use regular expressions." Now they have two problems.''
- Jamie Zawinski
Regular expressions have gathered a reputation over the decades for often being
chosen as the wrong tool for the job. Here, we will clarify a few cases in
which RegEx might be good or bad.
**When is it a good time to use RegEx?**
- You don't know at compile-time what patterns of text the program will need to
match when it's running.
- As an example, you are making a client which can be configured by the user to
trigger on certain text patterns received from a server.
- For another example, you need a way for users of a text editor to compose
matching strings that are more intricate than a simple substring lookup.
- The text you're matching against is small (< 64 KiB) and your patterns aren't
overly complicated with branches (alternations, repeats, and optionals).
- If none of the above general impressions apply but your project doesn't
warrant long-term maintenance.
**When is it a bad time to use RegEx?**
- You know at compile-time the grammar you're parsing; a hand-made parser has
the potential to be more maintainable and readable.
- The grammar you're parsing has certain validation steps that lend itself to
forming complicated expressions, such as e-mail addresses, URIs, dates,
postal codes, credit cards, et cetera. Using RegEx to validate these
structures is almost always a bad sign.
- The text you're matching against is big (> 1 MiB); you would be better served
by first dividing the text into manageable chunks and using some heuristic to
locate the most likely location of a match before applying RegEx against it.
- You value high performance and low memory usage; RegEx will always have a
certain overhead which increases with the complexity of the pattern.
The implementation of this package has been optimized, but it will never be as
thoroughly performant as a hand-made parser. In comparison, there are just too
many intermediate steps, assumptions, and generalizations in what it takes to
handle a regular expression.
*/
package regex

View File

@@ -0,0 +1,58 @@
/*
package regex_optimizer implements an optimizer which acts upon the AST of a
parsed regular expression pattern, transforming it in-place without moving to a
compilation step.
Where possible, it aims to reduce branching as much as possible in the
expression by reducing usage of `|`.
Here is a summary of the optimizations that it will do:
* Class Simplification : `[aab]` => `[ab]`
`[aa]` => `[a]`
* Class Reduction : `[a]` => `a`
* Range Construction : `[abc]` => `[a-c]`
* Rune Merging into Range : `[aa-c]` => `[a-c]`
* Range Merging : `[a-cc-e]` => `[a-e]`
`[a-cd-e]` => `[a-e]`
`[a-cb-e]` => `[a-e]`
* Alternation to Optional : `a|` => `a?`
* Alternation to Optional Non-Greedy : `|a` => `a??`
* Alternation Reduction : `a|a` => `a`
* Alternation to Class : `a|b` => `[ab]`
* Class Union : `[a0]|[b1]` => `[a0b1]`
`[a-b]|c` => `[a-bc]`
`a|[b-c]` => `[b-ca]`
* Wildcard Reduction : `a|.` => `.`
`.|a` => `.`
`[ab]|.` => `.`
`.|[ab]` => `.`
* Common Suffix Elimination : `blueberry|strawberry` => `(?:blue|straw)berry`
* Common Prefix Elimination : `abi|abe` => `ab(?:i|e)`
* Composition: Consume All to Anchored End
`.*$` => <special opcode>
`.+$` => `.` <special opcode>
Possible future improvements:
- Change the AST of alternations to be a list instead of a tree, so that
constructions such as `(ab|bb|cb)` can be considered in whole by the affix
elimination optimizations.
- Introduce specialized opcodes for certain classes of repetition.
- Add Common Infix Elimination.
- Measure the precise finite minimum and maximum of a pattern, if available,
and check against that on any strings before running the virtual machine.
*/
package regex_optimizer

View File

@@ -0,0 +1,530 @@
package regex_optimizer
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
@require import "core:io"
import "core:slice"
import "core:text/regex/common"
import "core:text/regex/parser"
Rune_Class_Range :: parser.Rune_Class_Range
Node :: parser.Node
Node_Rune :: parser.Node_Rune
Node_Rune_Class :: parser.Node_Rune_Class
Node_Wildcard :: parser.Node_Wildcard
Node_Concatenation :: parser.Node_Concatenation
Node_Alternation :: parser.Node_Alternation
Node_Repeat_Zero :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N :: parser.Node_Repeat_N
Node_Optional :: parser.Node_Optional
Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
Node_Group :: parser.Node_Group
Node_Anchor :: parser.Node_Anchor
Node_Word_Boundary :: parser.Node_Word_Boundary
Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape
// Orders rune class ranges ascending by lower bound, for slice.sort_by.
class_range_sorter :: proc(i, j: Rune_Class_Range) -> bool {
	return i.lower < j.lower
}
optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
if tree == nil {
return nil, 0
}
result = tree
switch specific in tree {
// No direct optimization possible on these nodes:
case ^Node_Rune: break
case ^Node_Wildcard: break
case ^Node_Anchor: break
case ^Node_Word_Boundary: break
case ^Node_Match_All_And_Escape: break
case ^Node_Concatenation:
// * Composition: Consume All to Anchored End
//
// DO: `.*$` => <special opcode>
// DO: `.+$` => `.` <special opcode>
if .Multiline not_in flags && len(specific.nodes) >= 2 {
i := len(specific.nodes) - 2
wrza: {
subnode := specific.nodes[i].(^Node_Repeat_Zero) or_break wrza
_ = subnode.inner.(^Node_Wildcard) or_break wrza
next_node := specific.nodes[i+1].(^Node_Anchor) or_break wrza
if next_node.start == false {
specific.nodes[i] = new(Node_Match_All_And_Escape)
ordered_remove(&specific.nodes, i + 1)
changes += 1
break
}
}
wroa: {
subnode := specific.nodes[i].(^Node_Repeat_One) or_break wroa
subsubnode := subnode.inner.(^Node_Wildcard) or_break wroa
next_node := specific.nodes[i+1].(^Node_Anchor) or_break wroa
if next_node.start == false {
specific.nodes[i] = subsubnode
specific.nodes[i+1] = new(Node_Match_All_And_Escape)
changes += 1
break
}
}
}
// Only recursive optimizations:
#no_bounds_check for i := 0; i < len(specific.nodes); i += 1 {
subnode, subnode_changes := optimize_subtree(specific.nodes[i], flags)
changes += subnode_changes
if subnode == nil {
ordered_remove(&specific.nodes, i)
i -= 1
changes += 1
} else {
specific.nodes[i] = subnode
}
}
if len(specific.nodes) == 1 {
result = specific.nodes[0]
changes += 1
} else if len(specific.nodes) == 0 {
return nil, changes + 1
}
case ^Node_Repeat_Zero:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_Zero_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_One:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_One_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Repeat_N:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Optional:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Optional_Non_Greedy:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
case ^Node_Group:
specific.inner, changes = optimize_subtree(specific.inner, flags)
if specific.inner == nil {
return nil, changes + 1
}
if !specific.capture {
result = specific.inner
changes += 1
}
// Full optimization:
case ^Node_Rune_Class:
// * Class Simplification
//
// DO: `[aab]` => `[ab]`
// DO: `[aa]` => `[a]`
runes_seen: map[rune]bool
for r in specific.runes {
runes_seen[r] = true
}
if len(runes_seen) != len(specific.runes) {
clear(&specific.runes)
for key in runes_seen {
append(&specific.runes, key)
}
changes += 1
}
// * Class Reduction
//
// DO: `[a]` => `a`
if !specific.negating && len(specific.runes) == 1 && len(specific.ranges) == 0 {
only_rune := specific.runes[0]
node := new(Node_Rune)
node.data = only_rune
return node, changes + 1
}
// * Range Construction
//
// DO: `[abc]` => `[a-c]`
slice.sort(specific.runes[:])
if len(specific.runes) > 1 {
new_range: Rune_Class_Range
new_range.lower = specific.runes[0]
new_range.upper = specific.runes[0]
#no_bounds_check for i := 1; i < len(specific.runes); i += 1 {
r := specific.runes[i]
if new_range.lower == -1 {
new_range = { r, r }
continue
}
if r == new_range.lower - 1 {
new_range.lower -= 1
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
} else if r == new_range.upper + 1 {
new_range.upper += 1
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
} else if new_range.lower != new_range.upper {
append(&specific.ranges, new_range)
new_range = { -1, -1 }
changes += 1
}
}
if new_range.lower != new_range.upper {
append(&specific.ranges, new_range)
changes += 1
}
}
// * Rune Merging into Range
//
// DO: `[aa-c]` => `[a-c]`
for range in specific.ranges {
#no_bounds_check for i := 0; i < len(specific.runes); i += 1 {
r := specific.runes[i]
if range.lower <= r && r <= range.upper {
ordered_remove(&specific.runes, i)
i -= 1
changes += 1
}
}
}
// * Range Merging
//
// DO: `[a-cc-e]` => `[a-e]`
// DO: `[a-cd-e]` => `[a-e]`
// DO: `[a-cb-e]` => `[a-e]`
slice.sort_by(specific.ranges[:], class_range_sorter)
#no_bounds_check for i := 0; i < len(specific.ranges) - 1; i += 1 {
for j := i + 1; j < len(specific.ranges); j += 1 {
left_range := &specific.ranges[i]
right_range := specific.ranges[j]
if left_range.upper == right_range.lower ||
left_range.upper == right_range.lower - 1 ||
left_range.lower <= right_range.lower && right_range.lower <= left_range.upper {
left_range.upper = max(left_range.upper, right_range.upper)
ordered_remove(&specific.ranges, j)
j -= 1
changes += 1
} else {
break
}
}
}
if len(specific.ranges) == 0 {
specific.ranges = {}
}
if len(specific.runes) == 0 {
specific.runes = {}
}
// * NOP
//
// DO: `[]` => <nil>
if len(specific.ranges) + len(specific.runes) == 0 {
return nil, 1
}
slice.sort(specific.runes[:])
slice.sort_by(specific.ranges[:], class_range_sorter)
case ^Node_Alternation:
// Perform recursive optimization first.
left_changes, right_changes: int
specific.left, left_changes = optimize_subtree(specific.left, flags)
specific.right, right_changes = optimize_subtree(specific.right, flags)
changes += left_changes + right_changes
// * Alternation to Optional
//
// DO: `a|` => `a?`
if specific.left != nil && specific.right == nil {
node := new(Node_Optional)
node.inner = specific.left
return node, 1
}
// * Alternation to Optional Non-Greedy
//
// DO: `|a` => `a??`
if specific.right != nil && specific.left == nil {
node := new(Node_Optional_Non_Greedy)
node.inner = specific.right
return node, 1
}
// * NOP
//
// DO: `|` => <nil>
if specific.left == nil && specific.right == nil {
return nil, 1
}
left_rune, left_is_rune := specific.left.(^Node_Rune)
right_rune, right_is_rune := specific.right.(^Node_Rune)
if left_is_rune && right_is_rune {
if left_rune.data == right_rune.data {
// * Alternation Reduction
//
// DO: `a|a` => `a`
return left_rune, 1
} else {
// * Alternation to Class
//
// DO: `a|b` => `[ab]`
node := new(Node_Rune_Class)
append(&node.runes, left_rune.data)
append(&node.runes, right_rune.data)
return node, 1
}
}
left_wildcard, left_is_wildcard := specific.left.(^Node_Wildcard)
right_wildcard, right_is_wildcard := specific.right.(^Node_Wildcard)
// * Class Union
//
// DO: `[a0]|[b1]` => `[a0b1]`
left_class, left_is_class := specific.left.(^Node_Rune_Class)
right_class, right_is_class := specific.right.(^Node_Rune_Class)
if left_is_class && right_is_class {
for r in right_class.runes {
append(&left_class.runes, r)
}
for range in right_class.ranges {
append(&left_class.ranges, range)
}
return left_class, 1
}
// * Class Union
//
// DO: `[a-b]|c` => `[a-bc]`
if left_is_class && right_is_rune {
append(&left_class.runes, right_rune.data)
return left_class, 1
}
// * Class Union
//
// DO: `a|[b-c]` => `[b-ca]`
if left_is_rune && right_is_class {
append(&right_class.runes, left_rune.data)
return right_class, 1
}
// * Wildcard Reduction
//
// DO: `a|.` => `.`
if left_is_rune && right_is_wildcard {
return right_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `.|a` => `.`
if left_is_wildcard && right_is_rune {
return left_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `[ab]|.` => `.`
if left_is_class && right_is_wildcard {
return right_wildcard, 1
}
// * Wildcard Reduction
//
// DO: `.|[ab]` => `.`
if left_is_wildcard && right_is_class {
return left_wildcard, 1
}
left_concatenation, left_is_concatenation := specific.left.(^Node_Concatenation)
right_concatenation, right_is_concatenation := specific.right.(^Node_Concatenation)
// * Common Suffix Elimination
//
// DO: `blueberry|strawberry` => `(?:blue|straw)berry`
if left_is_concatenation && right_is_concatenation {
// Remember that a concatenation could contain any node, not just runes.
left_len := len(left_concatenation.nodes)
right_len := len(right_concatenation.nodes)
least_len := min(left_len, right_len)
same_len := 0
for i := 1; i <= least_len; i += 1 {
left_subrune, left_is_subrune := left_concatenation.nodes[left_len - i].(^Node_Rune)
right_subrune, right_is_subrune := right_concatenation.nodes[right_len - i].(^Node_Rune)
if !left_is_subrune || !right_is_subrune {
// One of the nodes isn't a rune; there's nothing more we can do.
break
}
if left_subrune.data == right_subrune.data {
same_len += 1
} else {
// No more similarities.
break
}
}
if same_len > 0 {
// Dissolve this alternation into a concatenation.
cat_node := new(Node_Concatenation)
group_node := new(Node_Group)
append(&cat_node.nodes, group_node)
// Turn the concatenation into the common suffix.
for i := left_len - same_len; i < left_len; i += 1 {
append(&cat_node.nodes, left_concatenation.nodes[i])
}
// Construct the group of alternating prefixes.
for i := same_len; i > 0; i -= 1 {
pop(&left_concatenation.nodes)
pop(&right_concatenation.nodes)
}
// (Re-using this alternation node.)
alter_node := specific
alter_node.left = left_concatenation
alter_node.right = right_concatenation
group_node.inner = alter_node
return cat_node, 1
}
}
// * Common Prefix Elimination
//
// DO: `abi|abe` => `ab(?:i|e)`
if left_is_concatenation && right_is_concatenation {
// Try to identify a common prefix.
// Remember that a concatenation could contain any node, not just runes.
least_len := min(len(left_concatenation.nodes), len(right_concatenation.nodes))
same_len := 0
for i := 0; i < least_len; i += 1 {
left_subrune, left_is_subrune := left_concatenation.nodes[i].(^Node_Rune)
right_subrune, right_is_subrune := right_concatenation.nodes[i].(^Node_Rune)
if !left_is_subrune || !right_is_subrune {
// One of the nodes isn't a rune; there's nothing more we can do.
break
}
if left_subrune.data == right_subrune.data {
same_len = i + 1
} else {
// No more similarities.
break
}
}
if same_len > 0 {
cat_node := new(Node_Concatenation)
for i := 0; i < same_len; i += 1 {
append(&cat_node.nodes, left_concatenation.nodes[i])
}
for i := same_len; i > 0; i -= 1 {
ordered_remove(&left_concatenation.nodes, 0)
ordered_remove(&right_concatenation.nodes, 0)
}
group_node := new(Node_Group)
// (Re-using this alternation node.)
alter_node := specific
alter_node.left = left_concatenation
alter_node.right = right_concatenation
group_node.inner = alter_node
append(&cat_node.nodes, group_node)
return cat_node, 1
}
}
}
return
}
// Run the subtree optimizer over `tree` until it reaches a fixed point.
//
// Returns the rewritten AST along with the total number of rewrites that
// were applied. When `ODIN_DEBUG_REGEX` is enabled, the AST is dumped to
// the debug stream both before and after optimization.
optimize :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
	when common.ODIN_DEBUG_REGEX {
		io.write_string(common.debug_stream, "AST before Optimizer: ")
		parser.write_node(common.debug_stream, tree)
		io.write_byte(common.debug_stream, '\n')
	}
	result = tree
	// Each pass can expose further rewriting opportunities, so iterate
	// until a whole pass reports zero changes.
	for {
		pass_changes: int
		result, pass_changes = optimize_subtree(result, flags)
		if pass_changes == 0 {
			break
		}
		changes += pass_changes
	}
	when common.ODIN_DEBUG_REGEX {
		io.write_string(common.debug_stream, "AST after Optimizer: ")
		parser.write_node(common.debug_stream, result)
		io.write_byte(common.debug_stream, '\n')
	}
	return
}

View File

@@ -0,0 +1,111 @@
package regex_parser
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "core:io"
// Write a human-readable rendering of an AST node to `w`, mainly for
// debugging the parser and optimizer.
//
// Composite nodes are wrapped in fullwidth CJK-style brackets so that the
// tree structure is visible without ambiguity: concatenation in 「」,
// alternation in 《》, optionals in 〈〉. Leaf nodes render roughly as
// their regex syntax.
write_node :: proc(w: io.Writer, node: Node) {
	switch specific in node {
	case ^Node_Rune:
		io.write_rune(w, specific.data)
	case ^Node_Rune_Class:
		io.write_byte(w, '[')
		if specific.negating {
			io.write_byte(w, '^')
		}
		for r in specific.data.runes {
			io.write_rune(w, r)
		}
		for range in specific.data.ranges {
			io.write_rune(w, range.lower)
			io.write_byte(w, '-')
			io.write_rune(w, range.upper)
		}
		io.write_byte(w, ']')
	case ^Node_Wildcard:
		io.write_byte(w, '.')
	case ^Node_Concatenation:
		io.write_rune(w, '「')
		for subnode, i in specific.nodes {
			if i != 0 {
				// Separate adjacent terms with a dot operator.
				io.write_rune(w, '⋅')
			}
			write_node(w, subnode)
		}
		io.write_rune(w, '」')
	case ^Node_Repeat_Zero:
		write_node(w, specific.inner)
		io.write_byte(w, '*')
	case ^Node_Repeat_Zero_Non_Greedy:
		write_node(w, specific.inner)
		io.write_string(w, "*?")
	case ^Node_Repeat_One:
		write_node(w, specific.inner)
		io.write_byte(w, '+')
	case ^Node_Repeat_One_Non_Greedy:
		write_node(w, specific.inner)
		io.write_string(w, "+?")
	case ^Node_Repeat_N:
		write_node(w, specific.inner)
		// An `upper` of -1 means unbounded; `{0,}` and `{1,}` print as
		// their `*` and `+` shorthands.
		if specific.lower == 0 && specific.upper == -1 {
			io.write_byte(w, '*')
		} else if specific.lower == 1 && specific.upper == -1 {
			io.write_byte(w, '+')
		} else {
			io.write_byte(w, '{')
			io.write_int(w, specific.lower)
			io.write_byte(w, ',')
			io.write_int(w, specific.upper)
			io.write_byte(w, '}')
		}
	case ^Node_Alternation:
		io.write_rune(w, '《')
		write_node(w, specific.left)
		io.write_byte(w, '|')
		write_node(w, specific.right)
		io.write_rune(w, '》')
	case ^Node_Optional:
		io.write_rune(w, '〈')
		write_node(w, specific.inner)
		io.write_byte(w, '?')
		io.write_rune(w, '〉')
	case ^Node_Optional_Non_Greedy:
		io.write_rune(w, '〈')
		write_node(w, specific.inner)
		io.write_string(w, "??")
		io.write_rune(w, '〉')
	case ^Node_Group:
		io.write_byte(w, '(')
		if !specific.capture {
			io.write_string(w, "?:")
		}
		write_node(w, specific.inner)
		io.write_byte(w, ')')
	case ^Node_Anchor:
		io.write_byte(w, '^' if specific.start else '$')
	case ^Node_Word_Boundary:
		io.write_string(w, `\B` if specific.non_word else `\b`)
	case ^Node_Match_All_And_Escape:
		io.write_string(w, "《.*$》")
	case nil:
		io.write_string(w, "<nil>")
	}
}

View File

@@ -0,0 +1,10 @@
/*
package regex_parser implements a Pratt parser, also known as a Top-Down
Operator Precedence parser, for parsing tokenized regular expression patterns.
References:
- https://dl.acm.org/doi/10.1145/512927.512931
- https://tdop.github.io/
- http://crockford.com/javascript/tdop/tdop.html
*/
package regex_parser

View File

@@ -0,0 +1,590 @@
package regex_parser
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
import "core:strconv"
import "core:strings"
import "core:text/regex/common"
import "core:text/regex/tokenizer"
import "core:unicode"
import "core:unicode/utf8"
// Re-exported tokenizer types for convenience within the parser.
Token :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer :: tokenizer.Tokenizer

// An inclusive range of runes, as produced by `[a-z]` class syntax.
Rune_Class_Range :: struct {
	lower, upper: rune,
}

// The contents of a character class: individual runes plus rune ranges.
Rune_Class_Data :: struct {
	runes: [dynamic]rune,
	ranges: [dynamic]Rune_Class_Range,
}

// A single literal rune, e.g. `a`.
Node_Rune :: struct {
	data: rune,
}
// A character class, e.g. `[a-z0-9_]`, or its negated `[^...]` form.
Node_Rune_Class :: struct {
	negating: bool,
	using data: Rune_Class_Data,
}
// The wildcard `.`.
Node_Wildcard :: struct {}
// A two-way alternation `left|right`; either side may be nil.
Node_Alternation :: struct {
	left, right: Node,
}
// A sequence of adjacent terms.
Node_Concatenation :: struct {
	nodes: [dynamic]Node,
}
// `x*`
Node_Repeat_Zero :: struct {
	inner: Node,
}
// `x*?`
Node_Repeat_Zero_Non_Greedy :: struct {
	inner: Node,
}
// `x+`
Node_Repeat_One :: struct {
	inner: Node,
}
// `x+?`
Node_Repeat_One_Non_Greedy :: struct {
	inner: Node,
}
// `x{N,M}`; `lower` is -1 when no lower bound was written (`{,M}`) and
// `upper` is -1 when the repetition is unbounded above (`{N,}`).
Node_Repeat_N :: struct {
	inner: Node,
	lower, upper: int,
}
// `x?`
Node_Optional :: struct {
	inner: Node,
}
// `x??`
Node_Optional_Non_Greedy :: struct {
	inner: Node,
}
// A group, `(...)` or `(?:...)`; `capture_id` is only meaningful when
// `capture` is true.
Node_Group :: struct {
	inner: Node,
	capture_id: int,
	capture: bool,
}
// `^` when `start` is true, otherwise `$`.
Node_Anchor :: struct {
	start: bool,
}
// `\b`, or `\B` when `non_word` is true.
Node_Word_Boundary :: struct {
	non_word: bool,
}
// Matches everything through to the end of input; produced only by the
// optimizer, never by the parser.
Node_Match_All_And_Escape :: struct {}

// One node of the parsed regular expression AST. A nil `Node` represents
// an empty pattern or an absent alternation branch.
Node :: union {
	^Node_Rune,
	^Node_Rune_Class,
	^Node_Wildcard,
	^Node_Concatenation,
	^Node_Alternation,
	^Node_Repeat_Zero,
	^Node_Repeat_Zero_Non_Greedy,
	^Node_Repeat_One,
	^Node_Repeat_One_Non_Greedy,
	^Node_Repeat_N,
	^Node_Optional,
	^Node_Optional_Non_Greedy,
	^Node_Group,
	^Node_Anchor,
	^Node_Word_Boundary,
	// Optimized nodes (not created by the Parser):
	^Node_Match_All_And_Escape,
}
// Binding power of each operator token for the Pratt parser.
//
// Higher values bind more tightly: grouping > optionals > repetitions >
// concatenation > alternation. Non-operator tokens yield 0, which stops
// the `parse_expression` loop.
left_binding_power :: proc(kind: Token_Kind) -> int {
	#partial switch kind {
	case .Open_Paren,
	     .Open_Paren_Non_Capture:
		return 9
	case .Optional,
	     .Optional_Non_Greedy:
		return 4
	case .Repeat_Zero, .Repeat_One,
	     .Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy,
	     .Repeat_N:
		return 3
	case .Concatenate:
		return 2
	case .Alternate:
		return 1
	}
	return 0
}
// A specific token was required at `pos` but something else was found.
Expected_Token :: struct {
	pos: int,
	kind: Token_Kind,
}
// A `{...}` repetition was malformed (unparsable, zero, or inverted bounds).
Invalid_Repetition :: struct {
	pos: int,
}
// A token appeared in a position where it is not allowed.
Invalid_Token :: struct {
	pos: int,
	kind: Token_Kind,
}
// The pattern contained invalid UTF-8 or an illegal codepoint.
Invalid_Unicode :: struct {
	pos: int,
}
// The pattern used more capture groups than the virtual machine supports.
Too_Many_Capture_Groups :: struct {
	pos: int,
}
// The pattern ended in the middle of an expression.
Unexpected_EOF :: struct {
	pos: int,
}
// Any error the parser can produce; nil means success.
Error :: union {
	Expected_Token,
	Invalid_Repetition,
	Invalid_Token,
	Invalid_Unicode,
	Too_Many_Capture_Groups,
	Unexpected_EOF,
}
// Parser state: the pattern flags, the underlying tokenizer, a one-token
// lookahead, and a running count of capture groups seen so far.
Parser :: struct {
	flags: common.Flags,
	t: Tokenizer,
	cur_token: Token,
	groups: int,
}
// Pull the next token from the tokenizer into the parser's lookahead.
//
// Returns `Invalid_Unicode` if the tokenizer produced an `.Invalid` token.
// NOTE(review): the reported position is hardcoded to 0 here; presumably
// the tokenizer's current offset could be reported instead — confirm
// upstream intent before changing.
@require_results
advance :: proc(p: ^Parser) -> Error {
	p.cur_token = tokenizer.scan(&p.t)
	if p.cur_token.kind != .Invalid {
		return nil
	}
	return Invalid_Unicode { pos = 0 }
}
// Consume the current token if it is of the given kind; otherwise report
// an `Expected_Token` error at the tokenizer's current offset.
expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) {
	if p.cur_token.kind != kind {
		return Expected_Token{
			pos = p.t.offset,
			kind = kind,
		}
	}
	return advance(p)
}
// Parse a token in prefix ("null denotation") position.
//
// Handles literal runes, character classes, wildcards, groups, anchors,
// word boundaries, and an alternation whose left side is empty (`|a`).
// Returns the resulting AST node, or an error for tokens that cannot
// begin an expression.
null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) {
	#partial switch token.kind {
	case .Rune:
		// Pull the first (and only) rune out of the token's text.
		r: rune
		for ru in token.text {
			r = ru
			break
		}
		assert(r != 0, "Parsed an empty Rune token.")
		if .Case_Insensitive in p.flags {
			lower := unicode.to_lower(r)
			upper := unicode.to_upper(r)
			if lower != upper {
				// A caseable rune becomes a two-rune class holding both cases.
				node := new(Node_Rune_Class)
				append(&node.runes, lower)
				append(&node.runes, upper)
				return node, nil
			}
		}
		node := new(Node_Rune)
		node ^= { r }
		return node, nil
	case .Rune_Class:
		if len(token.text) == 0 {
			// An empty class is represented as a nil node.
			return nil, nil
		}
		node := new(Node_Rune_Class)
		#no_bounds_check for i := 0; i < len(token.text); /**/ {
			r, size := utf8.decode_rune(token.text[i:])
			// A leading `^` negates the entire class.
			if i == 0 && r == '^' {
				node.negating = true
				i += size
				continue
			}
			i += size
			assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
			if r == '\\' {
				next_r, next_size := utf8.decode_rune(token.text[i:])
				i += next_size
				assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")
				// @MetaCharacter
				// NOTE: These must be kept in sync with the tokenizer.
				switch next_r {
				case 'f': append(&node.runes, '\f')
				case 'n': append(&node.runes, '\n')
				case 'r': append(&node.runes, '\r')
				case 't': append(&node.runes, '\t')
				case 'd':
					append(&node.ranges, Rune_Class_Range{ '0', '9' })
				case 's':
					append(&node.runes, '\t')
					append(&node.runes, '\n')
					append(&node.runes, '\f')
					append(&node.runes, '\r')
					append(&node.runes, ' ')
				case 'w':
					append(&node.ranges, Rune_Class_Range{ '0', '9' })
					append(&node.ranges, Rune_Class_Range{ 'A', 'Z' })
					append(&node.runes, '_')
					append(&node.ranges, Rune_Class_Range{ 'a', 'z' })
				case 'D':
					// Negated metaclasses expand to the complementary ranges.
					append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
					append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) })
				case 'S':
					append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 })
					// \t and \n are adjacent.
					append(&node.runes, '\x0b') // Vertical Tab
					append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 })
					append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) })
				case 'W':
					append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
					append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 })
					append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 })
					append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 })
					append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) })
				case:
					// Any other escaped rune is taken literally.
					append(&node.runes, next_r)
				}
				continue
			}
			if r == '-' && len(node.runes) > 0 {
				// `-` following at least one rune builds a range from the
				// previous rune to the next one, e.g. `a-z`.
				next_r, next_size := utf8.decode_rune(token.text[i:])
				if next_size > 0 {
					last := pop(&node.runes)
					i += next_size
					append(&node.ranges, Rune_Class_Range{ last, next_r })
					continue
				}
			}
			append(&node.runes, r)
		}
		if .Case_Insensitive in p.flags {
			// These two loops cannot be in the form of `for x in y` because
			// they append to the data that they iterate over.
			length := len(node.runes)
			#no_bounds_check for i := 0; i < length; i += 1 {
				r := node.runes[i]
				lower := unicode.to_lower(r)
				upper := unicode.to_upper(r)
				if lower != upper {
					// Add whichever case form isn't already present.
					if lower != r {
						append(&node.runes, lower)
					} else {
						append(&node.runes, upper)
					}
				}
			}
			length = len(node.ranges)
			#no_bounds_check for i := 0; i < length; i += 1 {
				range := &node.ranges[i]
				min_lower := unicode.to_lower(range.lower)
				max_lower := unicode.to_lower(range.upper)
				min_upper := unicode.to_upper(range.lower)
				max_upper := unicode.to_upper(range.upper)
				if min_lower != min_upper && max_lower != max_upper {
					// Fold the range to lowercase and add an uppercase twin.
					range.lower = min_lower
					range.upper = max_lower
					append(&node.ranges, Rune_Class_Range{ min_upper, max_upper })
				}
			}
		}
		result = node
	case .Wildcard:
		node := new(Node_Wildcard)
		result = node
	case .Open_Paren:
		// Because of the recursive nature of the token parser, we take the
		// group number first instead of afterwards, in order to construct
		// group matches from the outside in.
		p.groups += 1
		if p.groups == common.MAX_CAPTURE_GROUPS {
			return nil, Too_Many_Capture_Groups{ pos = token.pos }
		}
		this_group := p.groups
		node := new(Node_Group)
		node.capture = true
		node.capture_id = this_group
		node.inner = parse_expression(p, 0) or_return
		expect(p, .Close_Paren) or_return
		result = node
	case .Open_Paren_Non_Capture:
		node := new(Node_Group)
		node.inner = parse_expression(p, 0) or_return
		expect(p, .Close_Paren) or_return
		result = node
	case .Close_Paren:
		// A stray `)` with no matching `(` is treated as a literal rune.
		node := new(Node_Rune)
		node ^= { ')' }
		return node, nil
	case .Anchor_Start:
		node := new(Node_Anchor)
		node.start = true
		result = node
	case .Anchor_End:
		node := new(Node_Anchor)
		result = node
	case .Word_Boundary:
		node := new(Node_Word_Boundary)
		result = node
	case .Non_Word_Boundary:
		node := new(Node_Word_Boundary)
		node.non_word = true
		result = node
	case .Alternate:
		// A unary alternation with a left-side empty path, i.e. `|a`.
		right, right_err := parse_expression(p, left_binding_power(.Alternate))
		#partial switch specific in right_err {
		case Unexpected_EOF:
			// This token is a NOP, i.e. `|`.
			break
		case nil:
			break
		case:
			return nil, right_err
		}
		node := new(Node_Alternation)
		node.right = right
		result = node
	case .EOF:
		return nil, Unexpected_EOF{ pos = token.pos }
	case:
		return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
	}
	return
}
// Parse a token in infix/postfix ("left denotation") position, combining
// it with the already-parsed `left` operand. Handles alternation,
// concatenation, the repetition operators, and the optionals.
left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) {
	#partial switch token.kind {
	case .Alternate:
		if p.cur_token.kind == .Close_Paren {
			// `(a|)`
			// parse_expression will fail, so intervene here.
			node := new(Node_Alternation)
			node.left = left
			return node, nil
		}
		right, right_err := parse_expression(p, left_binding_power(.Alternate))
		#partial switch specific in right_err {
		case nil:
			break
		case Unexpected_EOF:
			// EOF is okay in an alternation; it's an edge case in the way of
			// expressing an optional such as `a|`.
			break
		case:
			return nil, right_err
		}
		node := new(Node_Alternation)
		node.left = left
		node.right = right
		result = node
	case .Concatenate:
		right := parse_expression(p, left_binding_power(.Concatenate)) or_return
		// There should be no need to check if right is Node_Concatenation, due
		// to how the parsing direction works.
		#partial switch specific in left {
		case ^Node_Concatenation:
			// Flatten into the existing concatenation instead of nesting.
			append(&specific.nodes, right)
			result = specific
		case:
			node := new(Node_Concatenation)
			append(&node.nodes, left)
			append(&node.nodes, right)
			result = node
		}
	case .Repeat_Zero:
		node := new(Node_Repeat_Zero)
		node.inner = left
		result = node
	case .Repeat_Zero_Non_Greedy:
		node := new(Node_Repeat_Zero_Non_Greedy)
		node.inner = left
		result = node
	case .Repeat_One:
		node := new(Node_Repeat_One)
		node.inner = left
		result = node
	case .Repeat_One_Non_Greedy:
		node := new(Node_Repeat_One_Non_Greedy)
		node.inner = left
		result = node
	case .Repeat_N:
		node := new(Node_Repeat_N)
		node.inner = left
		// `token.text` is the interior of `{...}`; the comma's position
		// distinguishes the four accepted forms.
		comma := strings.index_byte(token.text, ',')
		switch comma {
		case -1: // {N}
			exact, ok := strconv.parse_u64_of_base(token.text, base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if exact == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			node.lower = cast(int)exact
			node.upper = cast(int)exact
		case 0: // {,M}
			upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if upper == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			// -1 marks an unspecified lower bound.
			node.lower = -1
			node.upper = cast(int)upper
		case len(token.text) - 1: // {N,}
			lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			node.lower = cast(int)lower
			// -1 marks an unbounded upper limit.
			node.upper = -1
		case: // {N,M}
			lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
			if !lower_ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10)
			if !upper_ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if lower > upper {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if upper == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			node.lower = cast(int)lower
			node.upper = cast(int)upper
		}
		result = node
	case .Optional:
		node := new(Node_Optional)
		node.inner = left
		result = node
	case .Optional_Non_Greedy:
		node := new(Node_Optional_Non_Greedy)
		node.inner = left
		result = node
	case .EOF:
		return nil, Unexpected_EOF{ pos = token.pos }
	case:
		return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
	}
	return
}
// Core Pratt-parsing loop.
//
// Parses the leading token in prefix position, then keeps folding
// operators into the left-hand side for as long as they bind more
// tightly than `rbp` (the right binding power of the caller's context).
parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) {
	head := p.cur_token
	advance(p) or_return
	result = null_denotation(p, head) or_return
	for {
		op := p.cur_token
		if left_binding_power(op.kind) <= rbp {
			break
		}
		advance(p) or_return
		result = left_denotation(p, op, result) or_return
	}
	return result, nil
}
// Parse a regular expression pattern into an AST.
//
// An empty pattern parses to a lone, non-capturing `Node_Group`.
// Returns `Invalid_Unicode` if the pattern is not valid UTF-8.
parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) {
	if str == "" {
		return new(Node_Group), nil
	}
	p := Parser{ flags = flags }
	tokenizer.init(&p.t, str, flags)
	// Prime the one-token lookahead by hand before entering the
	// expression parser.
	p.cur_token = tokenizer.scan(&p.t)
	if p.cur_token.kind == .Invalid {
		return nil, Invalid_Unicode { pos = 0 }
	}
	node := parse_expression(&p, 0) or_return
	return node, nil
}

450
core/text/regex/regex.odin Normal file
View File

@@ -0,0 +1,450 @@
package regex
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "core:text/regex/common"
import "core:text/regex/compiler"
import "core:text/regex/optimizer"
import "core:text/regex/parser"
import "core:text/regex/virtual_machine"
// Convenient re-exports from the subpackages, so users only need to
// import `core:text/regex`.
Flag :: common.Flag
Flags :: common.Flags
Parser_Error :: parser.Error
Compiler_Error :: compiler.Error

// Errors specific to `create_by_user`'s delimited-pattern handling.
Creation_Error :: enum {
	None,
	// A `\` was supplied as the delimiter to `create_by_user`.
	Bad_Delimiter,
	// A pair of delimiters for `create_by_user` was not found.
	Expected_Delimiter,
	// An unknown letter was supplied to `create_by_user` after the last delimiter.
	Unknown_Flag,
}

// Any error the `create*` procedures can return; nil when all succeed.
Error :: union #shared_nil {
	// An error that can occur in the pattern parsing phase.
	//
	// Most of these are regular expression syntax errors and are either
	// context-dependent as to what they mean or have self-explanatory names.
	Parser_Error,
	// An error that can occur in the pattern compiling phase.
	//
	// Of the two that can be returned, they have to do with exceeding the
	// limitations of the Virtual Machine.
	Compiler_Error,
	// An error that occurs only for `create_by_user`.
	Creation_Error,
}

/*
This struct corresponds to a set of string captures from a RegEx match.

`pos` will contain the start and end positions for each string in `groups`,
such that `str[pos[0][0]:pos[0][1]] == groups[0]`.
*/
Capture :: struct {
	pos: [][2]int,
	groups: []string,
}

/*
A compiled Regular Expression value, to be used with the `match_*` procedures.
*/
Regular_Expression :: struct {
	flags: Flags `fmt:"-"`,
	class_data: []virtual_machine.Rune_Class_Data `fmt:"-"`,
	program: []virtual_machine.Opcode `fmt:"-"`,
}
/*
Create a regular expression from a string pattern and a set of flags.

*Allocates Using Provided Allocators*

Inputs:
- pattern: The pattern to compile.
- flags: A `bit_set` of RegEx flags.
- permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
- temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)

Returns:
- result: The regular expression.
- err: An error, if one occurred.
*/
@require_results
create :: proc(
	pattern: string,
	flags: Flags = {},
	permanent_allocator := context.allocator,
	temporary_allocator := context.temp_allocator,
) -> (result: Regular_Expression, err: Error) {
	// For the sake of speed and simplicity, we first run all the intermediate
	// processes such as parsing and compilation through the temporary
	// allocator.
	//
	// `---` deliberately leaves these uninitialized; they are always
	// assigned inside the block below before being read.
	program: [dynamic]virtual_machine.Opcode = ---
	class_data: [dynamic]parser.Rune_Class_Data = ---
	{
		context.allocator = temporary_allocator
		ast := parser.parse(pattern, flags) or_return
		if .No_Optimization not_in flags {
			ast, _ = optimizer.optimize(ast, flags)
		}
		program, class_data = compiler.compile(ast, flags) or_return
	}
	// When that's successful, re-allocate all at once with the permanent
	// allocator so everything can be tightly packed.
	context.allocator = permanent_allocator
	result.flags = flags
	if len(class_data) > 0 {
		result.class_data = make([]virtual_machine.Rune_Class_Data, len(class_data))
	}
	for data, i in class_data {
		// Copy each class's runes and ranges into fixed-size slices.
		if len(data.runes) > 0 {
			result.class_data[i].runes = make([]rune, len(data.runes))
			copy(result.class_data[i].runes, data.runes[:])
		}
		if len(data.ranges) > 0 {
			result.class_data[i].ranges = make([]virtual_machine.Rune_Class_Range, len(data.ranges))
			copy(result.class_data[i].ranges, data.ranges[:])
		}
	}
	result.program = make([]virtual_machine.Opcode, len(program))
	copy(result.program, program[:])
	return
}
/*
Create a regular expression from a delimited string pattern, such as one
provided by users of a program or those found in a configuration file.

They are in the form of:

	[DELIMITER] [regular expression] [DELIMITER] [flags]

For example, the following strings are valid:

	/hellope/i
	#hellope#i

The delimiter is determined by the very first rune in the string, so almost
any rune may be used.

The only restriction is that the delimiter cannot be `\`, as that rune is used
to escape the delimiter if found in the middle of the string.

All runes after the closing delimiter will be parsed as flags:

- 'g': Global
- 'm': Multiline
- 'i': Case_Insensitive
- 'x': Ignore_Whitespace
- 'u': Unicode
- 'n': No_Capture
- '-': No_Optimization

*Allocates Using Provided Allocators*

Inputs:
- pattern: The delimited pattern with optional flags to compile.
- permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
- temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)

Returns:
- result: The regular expression.
- err: An error, if one occurred.
*/
@require_results
create_by_user :: proc(
	pattern: string,
	permanent_allocator := context.allocator,
	temporary_allocator := context.temp_allocator,
) -> (result: Regular_Expression, err: Error) {
	if len(pattern) == 0 {
		err = .Expected_Delimiter
		return
	}
	delimiter: rune
	// Byte offsets of the pattern body, exclusive of the delimiters.
	start := -1
	end := -1
	flags: Flags
	escaping: bool
	parse_loop: for r, i in pattern {
		if delimiter == 0 {
			// The first rune picks the delimiter; `\` is reserved for
			// escaping and cannot serve.
			if r == '\\' {
				err = .Bad_Delimiter
				return
			}
			delimiter = r
			continue parse_loop
		}
		if start == -1 {
			// First rune after the opening delimiter. Note that `start`
			// also equals the delimiter's byte size.
			start = i
		}
		if escaping {
			escaping = false
			continue parse_loop
		}
		switch r {
		case '\\':
			escaping = true
		case delimiter:
			end = i
			break parse_loop
		}
	}
	if end == -1 {
		err = .Expected_Delimiter
		return
	}
	// `start` is also the size of the delimiter, which is why it's being added
	// to `end` here.
	for r in pattern[start + end:] {
		switch r {
		case 'g': flags += { .Global }
		case 'm': flags += { .Multiline }
		case 'i': flags += { .Case_Insensitive }
		case 'x': flags += { .Ignore_Whitespace }
		case 'u': flags += { .Unicode }
		case 'n': flags += { .No_Capture }
		case '-': flags += { .No_Optimization }
		case:
			err = .Unknown_Flag
			return
		}
	}
	return create(pattern[start:end], flags, permanent_allocator, temporary_allocator)
}
/*
Match a regular expression against a string and allocate the results into the
returned `capture` structure.

The resulting capture strings will be slices to the string `str`, not wholly
copied strings, so they won't need to be individually deleted.

*Allocates Using Provided Allocators*

Inputs:
- regex: The regular expression.
- str: The string to match against.
- permanent_allocator: The allocator to use for the capture results. (default: context.allocator)
- temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)

Returns:
- capture: The capture groups found in the string.
- success: True if the regex matched the string.
*/
@require_results
match_and_allocate_capture :: proc(
	regex: Regular_Expression,
	str: string,
	permanent_allocator := context.allocator,
	temporary_allocator := context.temp_allocator,
) -> (capture: Capture, success: bool) {
	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
	{
		// The virtual machine's working state only lives for the duration
		// of this call, so it goes through the temporary allocator.
		context.allocator = temporary_allocator
		vm := virtual_machine.create(regex.program, str)
		vm.class_data = regex.class_data
		if .Unicode in regex.flags {
			saved, success = virtual_machine.run(&vm, true)
		} else {
			saved, success = virtual_machine.run(&vm, false)
		}
	}
	if saved != nil {
		context.allocator = permanent_allocator
		// First pass: count the groups that participated in the match.
		// `saved` holds start/end offset pairs; -1 on either side marks a
		// group that did not match.
		num_groups := 0
		#no_bounds_check for i := 0; i < len(saved); i += 2 {
			a, b := saved[i], saved[i + 1]
			if a == -1 || b == -1 {
				continue
			}
			num_groups += 1
		}
		if num_groups > 0 {
			capture.groups = make([]string, num_groups)
			capture.pos = make([][2]int, num_groups)
			// Second pass: fill in the capture slices and positions.
			n := 0
			#no_bounds_check for i := 0; i < len(saved); i += 2 {
				a, b := saved[i], saved[i + 1]
				if a == -1 || b == -1 {
					continue
				}
				capture.groups[n] = str[a:b]
				capture.pos[n] = {a, b}
				n += 1
			}
		}
	}
	return
}
/*
Match a regular expression against a string and save the capture results into
the provided `capture` structure.

The resulting capture strings will be slices to the string `str`, not wholly
copied strings, so they won't need to be individually deleted.

*Allocates Using Provided Allocator*

Inputs:
- regex: The regular expression.
- str: The string to match against.
- capture: A pointer to a Capture structure with `groups` and `pos` already allocated.
- temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)

Returns:
- num_groups: The number of capture groups set into `capture`.
- success: True if the regex matched the string.
*/
@require_results
match_with_preallocated_capture :: proc(
	regex: Regular_Expression,
	str: string,
	capture: ^Capture,
	temporary_allocator := context.temp_allocator,
) -> (num_groups: int, success: bool) {
	assert(capture != nil, "Pre-allocated RegEx capture must not be nil.")
	assert(len(capture.groups) >= common.MAX_CAPTURE_GROUPS,
		"Pre-allocated RegEx capture `groups` must be at least 10 elements long.")
	assert(len(capture.pos) >= common.MAX_CAPTURE_GROUPS,
		"Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
	{
		// The virtual machine's working state only lives for the duration
		// of this call; only `capture`'s pre-allocated buffers persist.
		context.allocator = temporary_allocator
		vm := virtual_machine.create(regex.program, str)
		vm.class_data = regex.class_data
		if .Unicode in regex.flags {
			saved, success = virtual_machine.run(&vm, true)
		} else {
			saved, success = virtual_machine.run(&vm, false)
		}
	}
	if saved != nil {
		// Each saved pair holds the start/end offsets of one group; -1 on
		// either side marks a group that did not participate.
		n := 0
		#no_bounds_check for i := 0; i < len(saved); i += 2 {
			a, b := saved[i], saved[i + 1]
			if a == -1 || b == -1 {
				continue
			}
			capture.groups[n] = str[a:b]
			capture.pos[n] = {a, b}
			n += 1
		}
		// BUG FIX: `num_groups` was never assigned, so callers always saw
		// zero groups even though `capture` had been filled in.
		num_groups = n
	}
	return
}
// Overload set: the allocating variant is chosen when no pre-allocated
// `Capture` is passed, the pre-allocated variant otherwise.
match :: proc {
	match_and_allocate_capture,
	match_with_preallocated_capture,
}
/*
Allocate a `Capture` in advance for use with `match`. This can save some time
if you plan on performing several matches at once and only need the results
between matches.

Inputs:
- allocator: (default: context.allocator)

Returns:
- result: The `Capture` with the maximum number of groups allocated.
*/
@require_results
preallocate_capture :: proc(allocator := context.allocator) -> (result: Capture) {
	result.groups = make([]string, common.MAX_CAPTURE_GROUPS, allocator)
	result.pos = make([][2]int, common.MAX_CAPTURE_GROUPS, allocator)
	return
}
/*
Free all data allocated by the `create*` procedures.

*Frees Using Provided Allocator*

Inputs:
- regex: A regular expression.
- allocator: (default: context.allocator)
*/
destroy_regex :: proc(regex: Regular_Expression, allocator := context.allocator) {
	context.allocator = allocator
	// Each class-data entry owns two slices; release those before the
	// backing slice itself.
	for data in regex.class_data {
		delete(data.ranges)
		delete(data.runes)
	}
	delete(regex.class_data)
	delete(regex.program)
}
/*
Free all data allocated by the `match_and_allocate_capture` procedure.

*Frees Using Provided Allocator*

Inputs:
- capture: A Capture.
- allocator: (default: context.allocator)
*/
destroy_capture :: proc(capture: Capture, allocator := context.allocator) {
	delete(capture.pos, allocator)
	delete(capture.groups, allocator)
}

// Overload set over the two `destroy_*` procedures above.
destroy :: proc {
	destroy_regex,
	destroy_capture,
}

View File

@@ -0,0 +1,357 @@
package regex_tokenizer
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "core:text/regex/common"
import "core:unicode/utf8"
// Every lexical element the tokenizer can produce from a pattern.
Token_Kind :: enum {
	Invalid,
	EOF,
	Rune,
	Wildcard,
	Alternate,
	Concatenate,
	Repeat_Zero,
	Repeat_Zero_Non_Greedy,
	Repeat_One,
	Repeat_One_Non_Greedy,
	Repeat_N,
	Optional,
	Optional_Non_Greedy,
	Rune_Class,
	Open_Paren,
	Open_Paren_Non_Capture,
	Close_Paren,
	Anchor_Start,
	Anchor_End,
	Word_Boundary,
	Non_Word_Boundary,
}
// A single token: its kind, the source text it carries (where relevant,
// e.g. class interiors and repetition bounds), and its byte offset.
Token :: struct {
	kind: Token_Kind,
	text: string,
	pos: int,
}
// Tokenizer state over a pattern string.
Tokenizer :: struct {
	flags: common.Flags,
	src: string,
	// The most recently decoded rune; -1 once the source is exhausted.
	ch: rune,
	// Byte offset of `ch` within `src`.
	offset: int,
	// Byte offset of the next rune to decode.
	read_offset: int,
	// Kind of the token most recently returned by `scan`.
	last_token_kind: Token_Kind,
	// A token buffered to be returned by the next call to `scan`.
	held_token: Token,
	// A decoding error latched by `advance_rune`, pending report by `scan`.
	error_state: Error,
	paren_depth: int,
}
// Errors the tokenizer can hit while decoding the pattern's UTF-8.
Error :: enum {
	None,
	Illegal_Null_Character,
	Illegal_Codepoint,
	Illegal_Byte_Order_Mark,
}
// Prepare `t` to tokenize `str`, priming it with the first rune.
//
// A decoding error on that first rune is parked in `error_state` so the
// first call to `scan` can report it.
init :: proc(t: ^Tokenizer, str: string, flags: common.Flags) {
	t.flags = flags
	t.src = str
	t.error_state = advance_rune(t)
}
// Look at the byte `offset` positions past the read cursor without
// consuming anything; yields 0 once past the end of the source.
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	i := t.read_offset + offset
	if i >= len(t.src) {
		return 0
	}
	return t.src[i]
}
// Decode the next rune from the source into `t.ch`, advancing the offsets.
//
// Sets `t.ch` to -1 at end of input. Decoding problems (NUL bytes, invalid
// UTF-8, a BOM anywhere but the very start) are returned and latched into
// `t.error_state`; once latched, every subsequent call returns the same
// error until it is cleared (see `scan`).
advance_rune :: proc(t: ^Tokenizer) -> (err: Error) {
	// Stay in the error state until someone clears it.
	if t.error_state != nil {
		return t.error_state
	}
	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		// Fast path: assume one ASCII byte; anything >= RUNE_SELF needs a
		// full UTF-8 decode.
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			err = .Illegal_Null_Character
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				err = .Illegal_Codepoint
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				// A BOM is only tolerated as the very first rune.
				err = .Illegal_Byte_Order_Mark
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		// -1 signals end of input to the scanning procedures.
		t.ch = -1
	}
	t.error_state = err
	return
}
// Scan a character class, entered just after `[`, up to the matching
// unescaped `]`. Returns the class text without the brackets.
// Fails if the input ends (or a decoding error occurs) before the
// class is closed.
@require_results
scan_class :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	begin := t.read_offset
	for {
		advance_rune(t)
		switch {
		case t.ch == -1 || t.error_state != nil:
			// Pattern ended, or decoding failed, inside the class.
			return
		case t.ch == '\\':
			// Skip the escaped rune so `\]` does not close the class.
			advance_rune(t)
		case t.ch == ']':
			return t.src[begin:t.offset], true
		}
	}
	unreachable()
}
// Scan a repetition specifier, entered just after `{`, up to the
// closing `}`. Returns the text between the braces, or fails if the
// pattern ends first.
@require_results
scan_repeat :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	begin := t.read_offset
	for {
		advance_rune(t)
		switch t.ch {
		case -1:
			// Pattern ended before the repetition was closed.
			return
		case '}':
			return t.src[begin:t.offset], true
		}
	}
	unreachable()
}
// Consume a trailing `?` (the non-greedy modifier) if present,
// reporting whether one was found.
@require_results
scan_non_greedy :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) != '?' {
		return false
	}
	advance_rune(t)
	return true
}
// Consume characters up to and including the next newline — UNIX
// (`\n`), Mac (`\r`), or Windows (`\r\n`) — or to the end of the
// pattern. Used for `#` comments when `.Ignore_Whitespace` is on.
scan_comment :: proc(t: ^Tokenizer) {
	for {
		advance_rune(t)
		switch t.ch {
		case -1:
			return
		case '\n':
			// UNIX newline.
			advance_rune(t)
			return
		case '\r':
			// Mac newline.
			advance_rune(t)
			if t.ch == '\n' {
				// Windows newline.
				advance_rune(t)
			}
			return
		}
	}
}
// Consume a `?:` pair (marking a non-capturing group) if it follows,
// reporting whether one was found.
@require_results
scan_non_capture_group :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) != '?' || peek_byte(t, 1) != ':' {
		return false
	}
	advance_rune(t)
	advance_rune(t)
	return true
}
// Produce the next token from the pattern.
//
// Implicit `Concatenate` tokens are inserted between adjacent atoms
// (see the rule tables at the bottom); when one is inserted, the real
// token is stashed in `t.held_token` and returned on the next call.
//
// A pending decoding error surfaces as a single `.Invalid` token,
// after which the error state is cleared.
@require_results
scan :: proc(t: ^Tokenizer) -> (token: Token) {
	kind: Token_Kind
	lit: string
	pos := t.offset
	defer {
		t.last_token_kind = token.kind
	}
	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}
	// Deliver the token that was deferred behind an inserted Concatenate.
	if t.held_token != {} {
		popped := t.held_token
		t.held_token = {}
		return popped
	}
	ch_loop: for {
		switch t.ch {
		case -1:
			return { .EOF, "", pos }
		case '\\':
			advance_rune(t)
			if t.ch == -1 {
				// A trailing backslash; treat as end of input.
				return { .EOF, "", pos }
			}
			pos = t.offset
			// @MetaCharacter
			// NOTE: These must be kept in sync with the compiler.
			DIGIT_CLASS :: "0-9"
			SPACE_CLASS :: "\t\n\f\r "
			WORD_CLASS :: "0-9A-Z_a-z"
			switch t.ch {
			case 'b': kind = .Word_Boundary
			case 'B': kind = .Non_Word_Boundary
			case 'f': kind = .Rune; lit = "\f"
			case 'n': kind = .Rune; lit = "\n"
			case 'r': kind = .Rune; lit = "\r"
			case 't': kind = .Rune; lit = "\t"
			case 'd': kind = .Rune_Class; lit = DIGIT_CLASS
			case 's': kind = .Rune_Class; lit = SPACE_CLASS
			case 'w': kind = .Rune_Class; lit = WORD_CLASS
			case 'D': kind = .Rune_Class; lit = "^" + DIGIT_CLASS
			case 'S': kind = .Rune_Class; lit = "^" + SPACE_CLASS
			case 'W': kind = .Rune_Class; lit = "^" + WORD_CLASS
			case:
				// Any other escaped rune is taken literally.
				kind = .Rune
				lit = t.src[t.offset:t.read_offset]
			}
		case '.':
			kind = .Wildcard
		case '|': kind = .Alternate
		case '*': kind = .Repeat_Zero_Non_Greedy if scan_non_greedy(t) else .Repeat_Zero
		case '+': kind = .Repeat_One_Non_Greedy if scan_non_greedy(t) else .Repeat_One
		case '?': kind = .Optional_Non_Greedy if scan_non_greedy(t) else .Optional
		case '[':
			if text, ok := scan_class(t); ok {
				kind = .Rune_Class
				lit = text
			} else {
				// Unterminated class; report end of input.
				kind = .EOF
			}
		case '{':
			if text, ok := scan_repeat(t); ok {
				kind = .Repeat_N
				lit = text
			} else {
				// Unterminated repetition; report end of input.
				kind = .EOF
			}
		case '(':
			kind = .Open_Paren_Non_Capture if scan_non_capture_group(t) else .Open_Paren
			t.paren_depth += 1
		case ')':
			kind = .Close_Paren
			t.paren_depth -= 1
		case '^': kind = .Anchor_Start
		case '$':
			kind = .Anchor_End
		case:
			if .Ignore_Whitespace in t.flags {
				switch t.ch {
				case ' ', '\r', '\n', '\t', '\f':
					// Unescaped whitespace is skipped in this mode.
					advance_rune(t)
					continue ch_loop
				case:
					break
				}
			}
			if t.ch == '#' && t.paren_depth == 0 {
				// `#` at the top level starts a comment.
				scan_comment(t)
				continue ch_loop
			}
			kind = .Rune
			lit = t.src[t.offset:t.read_offset]
		}
		break ch_loop
	}
	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}
	advance_rune(t)
	// The following set of rules dictate where Concatenate tokens are
	// automatically inserted.
	#partial switch kind {
	case
		.Close_Paren,
		.Alternate,
		.Optional, .Optional_Non_Greedy,
		.Repeat_Zero, .Repeat_Zero_Non_Greedy,
		.Repeat_One, .Repeat_One_Non_Greedy,
		.Repeat_N:
		// Never prepend a Concatenate before these tokens.
		break
	case:
		#partial switch t.last_token_kind {
		case
			.Invalid,
			.Open_Paren, .Open_Paren_Non_Capture,
			.Alternate:
			// Never prepend a Concatenate token when the _last token_ was one
			// of these.
			break
		case:
			// Hold the real token and emit the implicit Concatenate first.
			t.held_token = { kind, lit, pos }
			return { .Concatenate, "", pos }
		}
	}
	return { kind, lit, pos }
}

View File

@@ -0,0 +1,175 @@
/*
package regex_vm implements a threaded virtual machine for interpreting
regular expressions, based on the designs described by Russ Cox and attributed
to both Ken Thompson and Rob Pike.
The virtual machine executes all threads in lock step, i.e. the string pointer
does not advance until all threads have finished processing the current rune.
The algorithm does not look backwards.
Threads merge when splitting or jumping to positions already visited by another
thread, based on the observation that each thread having visited one PC
(Program Counter) state will execute identically to the previous thread.
Each thread keeps a save state of its capture groups, and thread priority is
used to allow higher precedence operations to complete first with correct save
states, such as greedy versus non-greedy repetition.
For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
**Implementation Details:**
- Each opcode is 8 bits in size, and most instructions have no operands.
- All operands larger than `u8` are read in system endian order.
- Jump and Split instructions operate on absolute positions in `u16` operands.
- Classes such as `[0-9]` are stored in a RegEx-specific slice of structs which
are then dereferenced by a `u8` index from the `Rune_Class` instructions.
- Each Byte and Rune opcode have their operands stored inline after the opcode,
sized `u8` and `i32` respectively.
- A bitmap is used to determine which PC positions are occupied by a thread to
perform merging. The bitmap is cleared with every new frame.
- The VM supports two modes: ASCII and Unicode, decided by a compile-time
boolean constant argument provided to `run`. The procedure differs only in
string decoding. This was done for the sake of performance.
- No allocations are ever freed; the VM expects an arena or temporary allocator
to be used in the context preceding it.
**Opcode Reference:**
(0x00) Match
The terminal opcode which ends a thread. This always comes at the end of
the program.
(0x01) Match_And_Exit
A modified version of Match which stops the virtual machine entirely. It is
only compiled for `No_Capture` expressions, as those expressions do not
need to determine which thread may have saved the most appropriate capture
groups.
(0x02) Byte
Consumes one byte from the text using its operand, which is also a byte.
(0x03) Rune
Consumes one Unicode codepoint from the text using its operand, which is
four bytes long in a system-dependent endian order.
(0x04) Rune_Class
Consumes one character (which may be an ASCII byte or Unicode codepoint,
wholly dependent on which mode the virtual machine is running in) from the
text.
The actual data storing what runes and ranges of runes apply to the class
are stored alongside the program in the Regular_Expression structure and
the operand for this opcode is a single byte which indexes into a
collection of these data structures.
(0x05) Rune_Class_Negated
A modified version of Rune_Class that functions the same, save for how it
returns the opposite of what Rune_Class matches.
(0x06) Wildcard
Consumes one byte or one Unicode codepoint, depending on the VM mode.
(0x07) Jump
Sets the Program Counter of a VM thread to the operand, which is a u16.
This opcode is used to implement Alternation (coming at the end of the left
choice) and Repeat_Zero (to cause the thread to loop backwards).
(0x08) Split
Spawns a new thread for the X operand and causes the current thread to jump
to the Y operand. This opcode is used to implement Alternation, all the
Repeat variations, and the Optional nodes.
Splitting threads is how the virtual machine is able to execute optional
control flow paths, letting it evaluate different possible ways to match
text.
(0x09) Save
Saves the current string index to a slot on the thread dictated by the
operand. These values will be used later to reconstruct capture groups.
(0x0A) Assert_Start
Asserts that the thread is at the beginning of a string.
(0x0B) Assert_End
Asserts that the thread is at the end of a string.
(0x0C) Assert_Word_Boundary
Asserts that the thread is on a word boundary, which can be the start or
end of the text. This examines both the current rune and the next rune.
(0x0D) Assert_Non_Word_Boundary
A modified version of Assert_Word_Boundary that returns the opposite value.
(0x0E) Multiline_Open
This opcode is compiled in only when the `Multiline` flag is present, and
it replaces both `^` and `$` text anchors.
It asserts that either the current thread is on one of the string
boundaries, or it consumes a `\n` or `\r` character.
If a `\r` character is consumed, the PC will be advanced to the sibling
`Multiline_Close` opcode to optionally consume a `\n` character on the next
frame.
(0x0F) Multiline_Close
This opcode is always present after `Multiline_Open`.
It handles consuming the second half of a complete newline, if necessary.
For example, Windows newlines are represented by the characters `\r\n`,
whereas UNIX newlines are `\n` and Macintosh newlines are `\r`.
(0x10) Wait_For_Byte
(0x11) Wait_For_Rune
(0x12) Wait_For_Rune_Class
(0x13) Wait_For_Rune_Class_Negated
These opcodes are an optimization around restarting threads on failed
matches when the beginning of a pattern is predictable and the Global flag
is set.
They will cause the VM to wait for the next rune to match before splitting,
as would happen in the un-optimized version.
(0x14) Match_All_And_Escape
This opcode is an optimized version of `.*$` or `.+$` that causes the
active thread to immediately work on escaping the program by following all
Jumps out to the end.
While running through the rest of the program, the thread will trigger on
every Save instruction it passes to store the length of the string.
This way, any time a program hits one of these `.*$` constructs, the
virtual machine can exit early, vastly improving processing times.
Be aware, this opcode is not compiled in if the `Multiline` flag is on, as
the meaning of `$` changes with that flag.
*/
package regex_vm

View File

@@ -0,0 +1,81 @@
package regex_vm
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
// Walks a compiled program instruction by instruction, accounting for
// each opcode's inline operand size.
Opcode_Iterator :: struct {
	code: Program,
	pc: int, // Byte offset of the next instruction.
}
// Yield the opcode at the iterator's current position together with
// its program counter, then step past the instruction and its
// operands. `ok` becomes false at the end of the program.
iterate_opcodes :: proc(iter: ^Opcode_Iterator) -> (opcode: Opcode, pc: int, ok: bool) {
	if iter.pc >= len(iter.code) {
		return
	}
	opcode = iter.code[iter.pc]
	pc = iter.pc
	ok = true

	// Every instruction is one Opcode byte plus a fixed-size operand.
	operand_size: int
	switch opcode {
	case .Match, .Match_And_Exit, .Wildcard,
	     .Assert_Start, .Assert_End,
	     .Assert_Word_Boundary, .Assert_Non_Word_Boundary,
	     .Multiline_Open, .Multiline_Close,
	     .Match_All_And_Escape:
		operand_size = 0
	case .Byte, .Rune_Class, .Rune_Class_Negated, .Save,
	     .Wait_For_Byte, .Wait_For_Rune_Class, .Wait_For_Rune_Class_Negated:
		operand_size = size_of(u8)
	case .Rune, .Wait_For_Rune:
		operand_size = size_of(rune)
	case .Jump:
		operand_size = size_of(u16)
	case .Split:
		operand_size = 2 * size_of(u16)
	case:
		panic("Invalid opcode found in RegEx program.")
	}
	iter.pc += size_of(Opcode) + operand_size
	return
}
// Human-readable mnemonic for an opcode, used in debug output.
opcode_to_name :: proc(opcode: Opcode) -> (str: string) {
	switch opcode {
	case .Match:                       return "Match"
	case .Match_And_Exit:              return "Match_And_Exit"
	case .Byte:                        return "Byte"
	case .Rune:                        return "Rune"
	case .Rune_Class:                  return "Rune_Class"
	case .Rune_Class_Negated:          return "Rune_Class_Negated"
	case .Wildcard:                    return "Wildcard"
	case .Jump:                        return "Jump"
	case .Split:                       return "Split"
	case .Save:                        return "Save"
	case .Assert_Start:                return "Assert_Start"
	case .Assert_End:                  return "Assert_End"
	case .Assert_Word_Boundary:        return "Assert_Word_Boundary"
	case .Assert_Non_Word_Boundary:    return "Assert_Non_Word_Boundary"
	case .Multiline_Open:              return "Multiline_Open"
	case .Multiline_Close:             return "Multiline_Close"
	case .Wait_For_Byte:               return "Wait_For_Byte"
	case .Wait_For_Rune:               return "Wait_For_Rune"
	case .Wait_For_Rune_Class:         return "Wait_For_Rune_Class"
	case .Wait_For_Rune_Class_Negated: return "Wait_For_Rune_Class_Negated"
	case .Match_All_And_Escape:        return "Match_All_And_Escape"
	case:
		// Invalid byte patterns fall through to the unknown marker.
	}
	return "<UNKNOWN>"
}

View File

@@ -0,0 +1,646 @@
package regex_vm
/*
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
Made available under Odin's BSD-3 license.
List of contributors:
Feoramund: Initial implementation.
*/
import "base:intrinsics"
@require import "core:io"
import "core:slice"
import "core:text/regex/common"
import "core:text/regex/parser"
import "core:unicode/utf8"
// Convenience alias for the parser's rune range type.
Rune_Class_Range :: parser.Rune_Class_Range

// NOTE: This structure differs intentionally from the one in `regex/parser`,
// as this data doesn't need to be a dynamic array once it hits the VM.
Rune_Class_Data :: struct {
	runes: []rune,              // Individual runes in the class.
	ranges: []Rune_Class_Range, // Rune ranges in the class.
}

// Bytecode instruction set. See the package documentation for the
// full description of each opcode's semantics.
Opcode :: enum u8 {
	// | [ operands ]
	Match = 0x00, // |
	Match_And_Exit = 0x01, // |
	Byte = 0x02, // | u8
	Rune = 0x03, // | i32
	Rune_Class = 0x04, // | u8
	Rune_Class_Negated = 0x05, // | u8
	Wildcard = 0x06, // |
	Jump = 0x07, // | u16
	Split = 0x08, // | u16, u16
	Save = 0x09, // | u8
	Assert_Start = 0x0A, // |
	Assert_End = 0x0B, // |
	Assert_Word_Boundary = 0x0C, // |
	Assert_Non_Word_Boundary = 0x0D, // |
	Multiline_Open = 0x0E, // |
	Multiline_Close = 0x0F, // |
	Wait_For_Byte = 0x10, // | u8
	Wait_For_Rune = 0x11, // | i32
	Wait_For_Rune_Class = 0x12, // | u8
	Wait_For_Rune_Class_Negated = 0x13, // | u8
	Match_All_And_Escape = 0x14, // |
}

// One VM thread: a program counter plus its private capture slots
// (a start/end index pair per capture group).
Thread :: struct {
	pc: int,
	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int,
}

// A compiled program is a flat byte stream of opcodes and operands.
Program :: []Opcode

// Complete virtual machine state for one `run` invocation.
Machine :: struct {
	// Program state
	memory: string, // The text being matched against.
	class_data: []Rune_Class_Data,
	code: Program,

	// Thread state
	top_thread: int, // Number of threads queued in `next_threads`.
	threads: [^]Thread,
	next_threads: [^]Thread,
	// The busy map is used to merge threads based on their program counters.
	busy_map: []u64,

	// Global state
	string_pointer: int, // Byte index of the current rune in `memory`.
	current_rune: rune,
	current_rune_size: int,
	next_rune: rune,
	next_rune_size: int,
}
// @MetaCharacter
// NOTE: This must be kept in sync with the compiler & tokenizer.
// Report whether `r` is a word character: an ASCII digit, letter, or
// the underscore.
is_word_class :: #force_inline proc "contextless" (r: rune) -> bool {
	return ('0' <= r && r <= '9') ||
	       ('A' <= r && r <= 'Z') ||
	       r == '_' ||
	       ('a' <= r && r <= 'z')
}
// Mark `pc` as visited in the busy map for this frame.
// Returns true when the bit was newly set, false when another thread
// already claimed this program counter.
set_busy_map :: #force_inline proc "contextless" (vm: ^Machine, pc: int) -> bool #no_bounds_check {
	word := cast(u64)pc >> 6
	mask := u64(1) << (cast(u64)pc & 0x3F)
	already_set := vm.busy_map[word] & mask != 0
	vm.busy_map[word] |= mask
	return !already_set
}
// Report whether a thread has already visited `pc` during this frame.
check_busy_map :: #force_inline proc "contextless" (vm: ^Machine, pc: int) -> bool #no_bounds_check {
	return vm.busy_map[cast(u64)pc >> 6] & (u64(1) << (cast(u64)pc & 0x3F)) != 0
}
// Queue a thread at program counter `pc` into the next frame's thread
// list, first resolving every non-consuming opcode (Jump, Split, Save,
// and the assertions) inline so that only consuming opcodes are ever
// parked in the queue.
//
// A thread whose PC was already visited this frame is merged away: it
// would behave identically to the earlier, higher-priority thread.
// A failed assertion simply falls out of the loop, killing the thread.
add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: int) #no_bounds_check {
	if check_busy_map(vm, pc) {
		return
	}
	saved := saved
	pc := pc
	resolution_loop: for {
		if !set_busy_map(vm, pc) {
			return
		}
		when common.ODIN_DEBUG_REGEX {
			io.write_string(common.debug_stream, "Thread [PC:")
			common.write_padded_hex(common.debug_stream, pc, 4)
			io.write_string(common.debug_stream, "] thinking about ")
			io.write_string(common.debug_stream, opcode_to_name(vm.code[pc]))
			io.write_rune(common.debug_stream, '\n')
		}
		#partial switch vm.code[pc] {
		case .Jump:
			pc = cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode)])
			continue
		case .Split:
			// Spawn a thread for the X branch; this thread takes Y.
			jmp_x := cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode)])
			jmp_y := cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode) + size_of(u16)])
			add_thread(vm, saved, jmp_x)
			pc = jmp_y
			continue
		case .Save:
			// Copy-on-write of the capture slots: each divergent path
			// gets its own save state.
			new_saved := new([2 * common.MAX_CAPTURE_GROUPS]int)
			new_saved ^= saved^
			saved = new_saved
			index := vm.code[pc + size_of(Opcode)]
			sp := vm.string_pointer+vm.current_rune_size
			saved[index] = sp
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "Thread [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "] saving state: (slot ")
				io.write_int(common.debug_stream, cast(int)index)
				io.write_string(common.debug_stream, " = ")
				io.write_int(common.debug_stream, sp)
				io.write_string(common.debug_stream, ")\n")
			}
			pc += size_of(Opcode) + size_of(u8)
			continue
		case .Assert_Start:
			sp := vm.string_pointer+vm.current_rune_size
			if sp == 0 {
				pc += size_of(Opcode)
				continue
			}
		case .Assert_End:
			sp := vm.string_pointer+vm.current_rune_size
			if sp == len(vm.memory) {
				pc += size_of(Opcode)
				continue
			}
		case .Multiline_Open:
			sp := vm.string_pointer+vm.current_rune_size
			if sp == 0 || sp == len(vm.memory) {
				if vm.next_rune == '\r' || vm.next_rune == '\n' {
					// The VM is currently on a newline at the string boundary,
					// so consume the newline next frame.
					when common.ODIN_DEBUG_REGEX {
						io.write_string(common.debug_stream, "*** New thread added [PC:")
						common.write_padded_hex(common.debug_stream, pc, 4)
						io.write_string(common.debug_stream, "]\n")
					}
					vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
					vm.top_thread += 1
				} else {
					// Skip the `Multiline_Close` opcode.
					pc += 2 * size_of(Opcode)
					continue
				}
			} else {
				// Not on a string boundary.
				// Try to consume a newline next frame in the other opcode loop.
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
				vm.top_thread += 1
			}
		case .Assert_Word_Boundary:
			sp := vm.string_pointer+vm.current_rune_size
			if sp == 0 || sp == len(vm.memory) {
				pc += size_of(Opcode)
				continue
			} else {
				// A boundary is where exactly one side is a word rune.
				last_rune_is_wc := is_word_class(vm.current_rune)
				this_rune_is_wc := is_word_class(vm.next_rune)
				if last_rune_is_wc && !this_rune_is_wc || !last_rune_is_wc && this_rune_is_wc {
					pc += size_of(Opcode)
					continue
				}
			}
		case .Assert_Non_Word_Boundary:
			sp := vm.string_pointer+vm.current_rune_size
			if sp != 0 && sp != len(vm.memory) {
				last_rune_is_wc := is_word_class(vm.current_rune)
				this_rune_is_wc := is_word_class(vm.next_rune)
				if last_rune_is_wc && this_rune_is_wc || !last_rune_is_wc && !this_rune_is_wc {
					pc += size_of(Opcode)
					continue
				}
			}
		case .Wait_For_Byte:
			// Split only when the next rune can match; otherwise keep
			// waiting in place next frame.
			operand := cast(rune)vm.code[pc + size_of(Opcode)]
			if vm.next_rune == operand {
				add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
			}
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1
		case .Wait_For_Rune:
			operand := intrinsics.unaligned_load(cast(^rune)&vm.code[pc + size_of(Opcode)])
			if vm.next_rune == operand {
				add_thread(vm, saved, pc + size_of(Opcode) + size_of(rune))
			}
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1
		case .Wait_For_Rune_Class:
			operand := cast(u8)vm.code[pc + size_of(Opcode)]
			class_data := vm.class_data[operand]
			next_rune := vm.next_rune
			check: {
				for r in class_data.runes {
					if next_rune == r {
						add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
						break check
					}
				}
				for range in class_data.ranges {
					if range.lower <= next_rune && next_rune <= range.upper {
						add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
						break check
					}
				}
			}
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1
		case .Wait_For_Rune_Class_Negated:
			operand := cast(u8)vm.code[pc + size_of(Opcode)]
			class_data := vm.class_data[operand]
			next_rune := vm.next_rune
			check_negated: {
				for r in class_data.runes {
					if next_rune == r {
						break check_negated
					}
				}
				for range in class_data.ranges {
					if range.lower <= next_rune && next_rune <= range.upper {
						break check_negated
					}
				}
				add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
			}
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1
		case:
			// A consuming opcode: park the thread for the next frame.
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1
		}
		break resolution_loop
	}
	return
}
// Execute the compiled program over `vm.memory`.
//
// All live threads advance in lock step, one rune per frame; the
// string pointer never moves backwards. `UNICODE_MODE` selects UTF-8
// rune decoding versus raw byte stepping at compile time.
//
// Returns the winning thread's capture slots (nil for programs ending
// in `Match_And_Exit`, i.e. `No_Capture` expressions) and whether a
// match occurred.
run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check {
	when UNICODE_MODE {
		vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory)
	} else {
		if len(vm.memory) > 0 {
			vm.next_rune = cast(rune)vm.memory[0]
			vm.next_rune_size = 1
		}
	}
	when common.ODIN_DEBUG_REGEX {
		io.write_string(common.debug_stream, "### Adding initial thread.\n")
	}
	{
		// All capture slots start at -1 (unset).
		starter_saved := new([2 * common.MAX_CAPTURE_GROUPS]int)
		starter_saved ^= -1
		add_thread(vm, starter_saved, 0)
	}
	// `add_thread` adds to `next_threads` by default, but we need to put this
	// thread in the current thread buffer.
	vm.threads, vm.next_threads = vm.next_threads, vm.threads
	when common.ODIN_DEBUG_REGEX {
		io.write_string(common.debug_stream, "### VM starting.\n")
		defer io.write_string(common.debug_stream, "### VM finished.\n")
	}
	for {
		// Fresh frame: reset thread-merge bookkeeping.
		slice.zero(vm.busy_map[:])
		assert(vm.string_pointer <= len(vm.memory), "VM string pointer went out of bounds.")
		current_rune := vm.next_rune
		vm.current_rune = current_rune
		vm.current_rune_size = vm.next_rune_size
		when UNICODE_MODE {
			vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer+vm.current_rune_size:])
		} else {
			if vm.string_pointer+size_of(u8) < len(vm.memory) {
				vm.next_rune = cast(rune)vm.memory[vm.string_pointer+size_of(u8)]
				vm.next_rune_size = size_of(u8)
			} else {
				vm.next_rune = 0
				vm.next_rune_size = 0
			}
		}
		when common.ODIN_DEBUG_REGEX {
			io.write_string(common.debug_stream, ">>> Dispatching rune: ")
			io.write_encoded_rune(common.debug_stream, current_rune)
			io.write_byte(common.debug_stream, '\n')
		}
		thread_count := vm.top_thread
		vm.top_thread = 0
		thread_loop: for i := 0; i < thread_count; i += 1 {
			t := vm.threads[i]
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "Thread [PC:")
				common.write_padded_hex(common.debug_stream, t.pc, 4)
				io.write_string(common.debug_stream, "] stepping on ")
				io.write_string(common.debug_stream, opcode_to_name(vm.code[t.pc]))
				io.write_byte(common.debug_stream, '\n')
			}
			#partial opcode: switch vm.code[t.pc] {
			case .Match:
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "Thread matched!\n")
				}
				// Threads are ordered by priority; the first match in
				// the frame wins, so stop scanning the rest.
				saved = t.saved
				ok = true
				break thread_loop
			case .Match_And_Exit:
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "Thread matched! (Exiting)\n")
				}
				return nil, true
			case .Byte:
				operand := cast(rune)vm.code[t.pc + size_of(Opcode)]
				if current_rune == operand {
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
				}
			case .Rune:
				operand := intrinsics.unaligned_load(cast(^rune)&vm.code[t.pc + size_of(Opcode)])
				if current_rune == operand {
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(rune))
				}
			case .Rune_Class:
				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
				class_data := vm.class_data[operand]
				for r in class_data.runes {
					if current_rune == r {
						add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
						break opcode
					}
				}
				for range in class_data.ranges {
					if range.lower <= current_rune && current_rune <= range.upper {
						add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
						break opcode
					}
				}
			case .Rune_Class_Negated:
				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
				class_data := vm.class_data[operand]
				for r in class_data.runes {
					if current_rune == r {
						break opcode
					}
				}
				for range in class_data.ranges {
					if range.lower <= current_rune && current_rune <= range.upper {
						break opcode
					}
				}
				add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
			case .Wildcard:
				add_thread(vm, t.saved, t.pc + size_of(Opcode))
			case .Multiline_Open:
				if current_rune == '\n' {
					// UNIX newline.
					add_thread(vm, t.saved, t.pc + 2 * size_of(Opcode))
				} else if current_rune == '\r' {
					if vm.next_rune == '\n' {
						// Windows newline. (1/2)
						add_thread(vm, t.saved, t.pc + size_of(Opcode))
					} else {
						// Mac newline.
						add_thread(vm, t.saved, t.pc + 2 * size_of(Opcode))
					}
				}
			case .Multiline_Close:
				if current_rune == '\n' {
					// Windows newline. (2/2)
					add_thread(vm, t.saved, t.pc + size_of(Opcode))
				}
			case .Wait_For_Byte:
				// Only split ahead when the upcoming rune can match;
				// the waiting thread itself persists into next frame.
				operand := cast(rune)vm.code[t.pc + size_of(Opcode)]
				if vm.next_rune == operand {
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
				}
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, t.pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
				vm.top_thread += 1
			case .Wait_For_Rune:
				operand := intrinsics.unaligned_load(cast(^rune)&vm.code[t.pc + size_of(Opcode)])
				if vm.next_rune == operand {
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(rune))
				}
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, t.pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
				vm.top_thread += 1
			case .Wait_For_Rune_Class:
				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
				class_data := vm.class_data[operand]
				next_rune := vm.next_rune
				check: {
					for r in class_data.runes {
						if next_rune == r {
							add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
							break check
						}
					}
					for range in class_data.ranges {
						if range.lower <= next_rune && next_rune <= range.upper {
							add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
							break check
						}
					}
				}
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, t.pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
				vm.top_thread += 1
			case .Wait_For_Rune_Class_Negated:
				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
				class_data := vm.class_data[operand]
				next_rune := vm.next_rune
				check_negated: {
					for r in class_data.runes {
						if next_rune == r {
							break check_negated
						}
					}
					for range in class_data.ranges {
						if range.lower <= next_rune && next_rune <= range.upper {
							break check_negated
						}
					}
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
				}
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, t.pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
				vm.top_thread += 1
			case .Match_All_And_Escape:
				t.pc += size_of(Opcode)
				// The point of this loop is to walk out of wherever this
				// opcode lives to the end of the program, while saving the
				// index to the length of the string at each pass on the way.
				escape_loop: for {
					#partial switch vm.code[t.pc] {
					case .Match, .Match_And_Exit:
						break escape_loop
					case .Jump:
						t.pc = cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[t.pc + size_of(Opcode)])
					case .Save:
						index := vm.code[t.pc + size_of(Opcode)]
						t.saved[index] = len(vm.memory)
						t.pc += size_of(Opcode) + size_of(u8)
					case .Match_All_And_Escape:
						// Layering these is fine.
						t.pc += size_of(Opcode)
					// If the loop has to process any opcode not listed above,
					// it means someone did something odd like `a(.*$)b`, in
					// which case, just fail. Technically, the expression makes
					// no sense.
					case:
						break opcode
					}
				}
				saved = t.saved
				ok = true
				return
			case:
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "Opcode: ")
					io.write_int(common.debug_stream, cast(int)vm.code[t.pc])
					io.write_string(common.debug_stream, "\n")
				}
				panic("Invalid opcode in RegEx thread loop.")
			}
		}
		// The next frame's queue becomes the current one.
		vm.threads, vm.next_threads = vm.next_threads, vm.threads
		when common.ODIN_DEBUG_REGEX {
			io.write_string(common.debug_stream, "<<< Frame ended. (Threads: ")
			io.write_int(common.debug_stream, vm.top_thread)
			io.write_string(common.debug_stream, ")\n")
		}
		if vm.string_pointer == len(vm.memory) || vm.top_thread == 0 {
			break
		}
		vm.string_pointer += vm.current_rune_size
	}
	return
}
// Count how many instructions (not bytes) make up `code`.
opcode_count :: proc(code: Program) -> (opcodes: int) {
	iter := Opcode_Iterator{ code = code }
	for {
		_, _, found := iterate_opcodes(&iter)
		if !found {
			break
		}
		opcodes += 1
	}
	return
}
// Set up a Machine to run `code` against `str`.
//
// Allocates the busy map (one bit per program byte, packed into u64
// words) and both thread buffers, sized to `max(1, opcode_count - 1)`
// — the most threads that can be live at once, as the PC-merging in
// `add_thread` keeps threads on distinct program counters.
// Nothing allocated here is freed by the VM; callers are expected to
// provide an arena or temporary allocator in the context.
create :: proc(code: Program, str: string) -> (vm: Machine) {
	assert(len(code) > 0, "RegEx VM has no instructions.")
	vm.memory = str
	vm.code = code
	// Round the bitmap size up to the next whole u64 word.
	sizing := len(code) >> 6 + (1 if len(code) & 0x3F > 0 else 0)
	assert(sizing > 0)
	vm.busy_map = make([]u64, sizing)
	max_possible_threads := max(1, opcode_count(vm.code) - 1)
	vm.threads = make([^]Thread, max_possible_threads)
	vm.next_threads = make([^]Thread, max_possible_threads)
	return
}

View File

@@ -128,6 +128,7 @@ import testing "core:testing"
import edit "core:text/edit"
import i18n "core:text/i18n"
import match "core:text/match"
import regex "core:text/regex"
import scanner "core:text/scanner"
import table "core:text/table"
@@ -251,6 +252,7 @@ _ :: testing
_ :: scanner
_ :: i18n
_ :: match
_ :: regex
_ :: table
_ :: edit
_ :: thread

View File

@@ -3,3 +3,4 @@ package benchmarks
@(require) import "bytes"
@(require) import "crypto"
@(require) import "hash"
@(require) import "text/regex"

View File

@@ -0,0 +1,258 @@
package benchmark_core_text_regex
import "core:fmt"
import "core:log"
import "core:math/rand"
import "core:mem"
import "core:testing"
import "core:text/regex"
import "core:time"
import "core:unicode/utf8"
// Fill `data` with random printable ASCII, from ' ' up to (but not
// including) 0x7F.
randomize_ascii :: proc(data: []u8) {
	for _, i in data {
		data[i] = ' ' + u8(rand.int_max(0x7F - ' '))
	}
}
// Fill `data` completely with random, valid UTF-8.
// Runes are redrawn until one is a valid codepoint that also fits in
// the space remaining, so the tail of the buffer is finished off with
// smaller codepoints.
randomize_unicode :: proc(data: []u8) {
	for i := 0; i < len(data); /**/ {
		check_rune_loop: for {
			r := cast(rune)rand.int_max(utf8.MAX_RUNE)
			if !utf8.valid_rune(r) {
				// Invalid scalar value (e.g. surrogate); redraw.
				continue
			}
			if utf8.rune_size(r) > len(data) - i {
				// Not enough room left to encode this rune; redraw.
				continue
			}
			r_data, size := utf8.encode_rune(r)
			for j in 0..<size {
				data[i+j] = r_data[j]
			}
			i += size
			break check_rune_loop
		}
	}
}
// Haystack sizes each benchmark below sweeps over.
sizes := [?]int {
	2 * mem.Kilobyte,
	32 * mem.Kilobyte,
	64 * mem.Kilobyte,
	256 * mem.Kilobyte,
	0.50 * mem.Megabyte,
	1.00 * mem.Megabyte,
	2.00 * mem.Megabyte,
}
// Benchmark: the classic backtracking killer `a?^n a^n` matched
// against `a^n`, which is exponential for backtracking engines but
// polynomial for this VM. Reports a rate per pattern size.
@test
expensive_for_backtrackers :: proc(t: ^testing.T) {
	counts := [?]int {
		8,
		16,
		32,
		64,
	}
	report: string
	for count in counts {
		data := make([]u8, count)
		pattern := make([]u8, 2 * count + count)
		defer {
			delete(data)
			delete(pattern)
		}
		// Build `a?` repeated `count` times...
		for i in 0..<2 * count {
			pattern[i] = 'a' if i & 1 == 0 else '?'
		}
		// ...followed by `a` repeated `count` times.
		for i in 2 * count..<2 * count + count {
			pattern[i] = 'a'
		}
		// The haystack is `count` `a`s.
		for i in 0..<count {
			data[i] = 'a'
		}
		rex, err := regex.create(cast(string)pattern)
		if !testing.expect_value(t, err, nil) {
			return
		}
		defer regex.destroy(rex)
		str := cast(string)data
		log.debug(rex, str)
		start := time.now()
		capture, ok := regex.match(rex, str)
		done := time.since(start)
		defer regex.destroy(capture)
		if !testing.expect_value(t, ok, true) {
			continue
		}
		testing.expect_value(t, capture.pos[0], [2]int{0, count})
		rate := cast(int)(cast(f64)(count / 2) / (cast(f64)done / 1e9))
		report = fmt.tprintf("%s\n +++ [%i : %v : %M/s] Matched `a?^%ia^%i` against `a^%i`.", report, count, done, rate, count, count, count)
	}
	log.info(report)
}
// Benchmark a global (search-anywhere) match of a literal phrase planted at
// the very end of a block of random printable ASCII, forcing the VM to scan
// the entire haystack before it can succeed.
@test
global_capture_end_word :: proc(t: ^testing.T) {
	EXPR :: `Hellope World!`
	rex, err := regex.create(EXPR, { .Global })
	if !testing.expect_value(t, err, nil) {
		return
	}
	defer regex.destroy(rex)
	report := fmt.tprintf("Matching %q over a block of random ASCII text.", EXPR)
	for size in sizes {
		data := make([]u8, size)
		defer delete(data)
		randomize_ascii(data[:])
		// Overwrite the tail of the haystack with the needle; EXPR is pure
		// ASCII, so a byte-per-rune copy is safe here.
		for r, i in EXPR {
			data[len(data) - len(EXPR) + i] = cast(u8)r
		}
		str := cast(string)data
		start := time.now()
		capture, ok := regex.match(rex, str)
		done := time.since(start)
		defer regex.destroy(capture)
		if !testing.expect_value(t, ok, true) {
			continue
		}
		// The match must span exactly the trailing needle.
		testing.expect_value(t, capture.pos[0], [2]int{size - len(EXPR), size})
		// Throughput in haystack bytes per second (done is in nanoseconds).
		rate := cast(int)(cast(f64)size / (cast(f64)done / 1e9))
		report = fmt.tprintf("%s\n +++ [%M : %v : %M/s]", report, size, done, rate)
	}
	log.info(report)
}
// Benchmark a global search for a literal Unicode phrase planted at the end
// of a block of random Unicode text, with the engine in Unicode mode so it
// decodes runes rather than bytes.
@test
global_capture_end_word_unicode :: proc(t: ^testing.T) {
	EXPR :: `こにちは`
	needle := string(EXPR)
	rex, err := regex.create(EXPR, { .Global, .Unicode })
	if !testing.expect_value(t, err, nil) {
		return
	}
	defer regex.destroy(rex)
	report := fmt.tprintf("Matching %q over a block of random Unicode text.", EXPR)
	for size in sizes {
		data := make([]u8, size)
		defer delete(data)
		// Randomize only the prefix; the needle occupies the final bytes, so
		// the two regions together fill the buffer exactly.
		randomize_unicode(data[:size - len(needle)])
		// Byte-wise copy of the UTF-8-encoded needle into the tail.
		for i := 0; i < len(needle); i += 1 {
			data[len(data) - len(needle) + i] = needle[i]
		}
		str := cast(string)data
		start := time.now()
		capture, ok := regex.match(rex, str)
		done := time.since(start)
		defer regex.destroy(capture)
		if !testing.expect_value(t, ok, true) {
			continue
		}
		// The whole-match capture group should equal the needle text.
		testing.expect_value(t, capture.groups[0], needle)
		// Throughput in haystack bytes per second (done is in nanoseconds).
		rate := cast(int)(cast(f64)size / (cast(f64)done / 1e9))
		report = fmt.tprintf("%s\n +++ [%M : %v : %M/s]", report, size, done, rate)
	}
	log.info(report)
}
// Benchmark a five-way alternation that can never complete: every `a` in the
// haystack starts a potential match, but no branch's second character ever
// follows, so the engine must fail at every position.
@test
alternations :: proc(t: ^testing.T) {
	EXPR :: `a(?:bb|cc|dd|ee|ff)`
	rex, err := regex.create(EXPR, { .No_Capture, .Global })
	if !testing.expect_value(t, err, nil) {
		return
	}
	defer regex.destroy(rex)
	report := fmt.tprintf("Matching %q over a text block of only `a`s.", EXPR)
	for size in sizes {
		haystack := make([]u8, size)
		defer delete(haystack)
		for &b in haystack {
			b = 'a'
		}
		subject := cast(string)haystack
		begin := time.now()
		_, ok := regex.match(rex, subject)
		elapsed := time.since(begin)
		// No match should ever be found.
		testing.expect_value(t, ok, false)
		// Throughput in haystack bytes per second (elapsed is in nanoseconds).
		bytes_per_second := cast(int)(cast(f64)size / (cast(f64)elapsed / 1e9))
		report = fmt.tprintf("%s\n +++ [%M : %v : %M/s]", report, size, elapsed, bytes_per_second)
	}
	log.info(report)
}
// Benchmark character-class matching: `[\w\d]+` must skip over a haystack of
// spaces (which match neither class) and find the hex digits parked at the
// very end.
@test
classes :: proc(t: ^testing.T) {
	EXPR :: `[\w\d]+`
	NEEDLE :: "0123456789abcdef"
	rex, err := regex.create(EXPR, { .Global })
	if !testing.expect_value(t, err, nil) {
		return
	}
	defer regex.destroy(rex)
	report := fmt.tprintf("Matching %q over a string of spaces with %q at the end.", EXPR, NEEDLE)
	for size in sizes {
		haystack := make([]u8, size)
		defer delete(haystack)
		for &b in haystack {
			b = ' '
		}
		// Plant the needle in the final bytes of the haystack.
		copy(haystack[size - len(NEEDLE):], NEEDLE)
		subject := cast(string)haystack
		begin := time.now()
		capture, ok := regex.match(rex, subject)
		elapsed := time.since(begin)
		defer regex.destroy(capture)
		if !testing.expect_value(t, ok, true) {
			continue
		}
		// The match must span exactly the trailing needle.
		testing.expect_value(t, capture.pos[0], [2]int{size - len(NEEDLE), size})
		// Throughput in haystack bytes per second (elapsed is in nanoseconds).
		bytes_per_second := cast(int)(cast(f64)size / (cast(f64)elapsed / 1e9))
		report = fmt.tprintf("%s\n +++ [%M : %v : %M/s]", report, size, elapsed, bytes_per_second)
	}
	log.info(report)
}

View File

@@ -42,6 +42,7 @@ download_assets :: proc() {
@(require) import "sys/windows"
@(require) import "text/i18n"
@(require) import "text/match"
@(require) import "text/regex"
@(require) import "thread"
@(require) import "time"
@(require) import "unicode"

File diff suppressed because it is too large Load Diff