mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-22 22:35:19 +00:00
46
core/text/regex/common/common.odin
Normal file
46
core/text/regex/common/common.odin
Normal file
@@ -0,0 +1,46 @@
|
||||
// This package helps break dependency cycles.
|
||||
package regex_common
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
// VM limitations
|
||||
MAX_CAPTURE_GROUPS :: max(#config(ODIN_REGEX_MAX_CAPTURE_GROUPS, 10), 10)
|
||||
MAX_PROGRAM_SIZE :: int(max(i16))
|
||||
MAX_CLASSES :: int(max(u8))
|
||||
|
||||
Flag :: enum u8 {
|
||||
// Global: try to match the pattern anywhere in the string.
|
||||
Global,
|
||||
// Multiline: treat `^` and `$` as if they also match newlines.
|
||||
Multiline,
|
||||
// Case Insensitive: treat `a-z` as if it was also `A-Z`.
|
||||
Case_Insensitive,
|
||||
// Ignore Whitespace: bypass unescaped whitespace outside of classes.
|
||||
Ignore_Whitespace,
|
||||
// Unicode: let the compiler and virtual machine know to expect Unicode strings.
|
||||
Unicode,
|
||||
|
||||
// No Capture: avoid saving capture group data entirely.
|
||||
No_Capture,
|
||||
// No Optimization: do not pass the pattern through the optimizer; for debugging.
|
||||
No_Optimization,
|
||||
}
|
||||
|
||||
Flags :: bit_set[Flag; u8]
|
||||
|
||||
@(rodata)
|
||||
Flag_To_Letter := #sparse[Flag]u8 {
|
||||
.Global = 'g',
|
||||
.Multiline = 'm',
|
||||
.Case_Insensitive = 'i',
|
||||
.Ignore_Whitespace = 'x',
|
||||
.Unicode = 'u',
|
||||
.No_Capture = 'n',
|
||||
.No_Optimization = '-',
|
||||
}
|
||||
33
core/text/regex/common/debugging.odin
Normal file
33
core/text/regex/common/debugging.odin
Normal file
@@ -0,0 +1,33 @@
|
||||
package regex_common
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
@require import "core:os"
|
||||
import "core:io"
|
||||
import "core:strings"
|
||||
|
||||
ODIN_DEBUG_REGEX :: #config(ODIN_DEBUG_REGEX, false)
|
||||
|
||||
when ODIN_DEBUG_REGEX {
|
||||
debug_stream := os.stream_from_handle(os.stderr)
|
||||
}
|
||||
|
||||
write_padded_hex :: proc(w: io.Writer, #any_int n, zeroes: int) {
|
||||
sb := strings.builder_make()
|
||||
defer strings.builder_destroy(&sb)
|
||||
|
||||
sbw := strings.to_writer(&sb)
|
||||
io.write_int(sbw, n, 0x10)
|
||||
|
||||
io.write_string(w, "0x")
|
||||
for _ in 0..<max(0, zeroes - strings.builder_len(sb)) {
|
||||
io.write_byte(w, '0')
|
||||
}
|
||||
io.write_int(w, n, 0x10)
|
||||
}
|
||||
548
core/text/regex/compiler/compiler.odin
Normal file
548
core/text/regex/compiler/compiler.odin
Normal file
@@ -0,0 +1,548 @@
|
||||
package regex_compiler
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:text/regex/common"
|
||||
import "core:text/regex/parser"
|
||||
import "core:text/regex/tokenizer"
|
||||
import "core:text/regex/virtual_machine"
|
||||
import "core:unicode"
|
||||
|
||||
Token :: tokenizer.Token
|
||||
Token_Kind :: tokenizer.Token_Kind
|
||||
Tokenizer :: tokenizer.Tokenizer
|
||||
|
||||
Rune_Class_Range :: parser.Rune_Class_Range
|
||||
Rune_Class_Data :: parser.Rune_Class_Data
|
||||
|
||||
Node :: parser.Node
|
||||
Node_Rune :: parser.Node_Rune
|
||||
Node_Rune_Class :: parser.Node_Rune_Class
|
||||
Node_Wildcard :: parser.Node_Wildcard
|
||||
Node_Concatenation :: parser.Node_Concatenation
|
||||
Node_Alternation :: parser.Node_Alternation
|
||||
Node_Repeat_Zero :: parser.Node_Repeat_Zero
|
||||
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
|
||||
Node_Repeat_One :: parser.Node_Repeat_One
|
||||
Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
|
||||
Node_Repeat_N :: parser.Node_Repeat_N
|
||||
Node_Optional :: parser.Node_Optional
|
||||
Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
|
||||
Node_Group :: parser.Node_Group
|
||||
Node_Anchor :: parser.Node_Anchor
|
||||
Node_Word_Boundary :: parser.Node_Word_Boundary
|
||||
Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape
|
||||
|
||||
Opcode :: virtual_machine.Opcode
|
||||
Program :: [dynamic]Opcode
|
||||
|
||||
JUMP_SIZE :: size_of(Opcode) + 1 * size_of(u16)
|
||||
SPLIT_SIZE :: size_of(Opcode) + 2 * size_of(u16)
|
||||
|
||||
|
||||
Compiler :: struct {
|
||||
flags: common.Flags,
|
||||
class_data: [dynamic]Rune_Class_Data,
|
||||
}
|
||||
|
||||
|
||||
Error :: enum {
|
||||
None,
|
||||
Program_Too_Big,
|
||||
Too_Many_Classes,
|
||||
}
|
||||
|
||||
classes_are_exact :: proc(q, w: ^Rune_Class_Data) -> bool #no_bounds_check {
|
||||
assert(q != nil)
|
||||
assert(w != nil)
|
||||
|
||||
if q == w {
|
||||
return true
|
||||
}
|
||||
|
||||
if len(q.runes) != len(w.runes) || len(q.ranges) != len(w.ranges) {
|
||||
return false
|
||||
}
|
||||
|
||||
for r, i in q.runes {
|
||||
if r != w.runes[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
for r, i in q.ranges {
|
||||
if r.lower != w.ranges[i].lower || r.upper != w.ranges[i].upper {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
map_all_classes :: proc(tree: Node, collection: ^[dynamic]Rune_Class_Data) {
|
||||
if tree == nil {
|
||||
return
|
||||
}
|
||||
|
||||
switch specific in tree {
|
||||
case ^Node_Rune: break
|
||||
case ^Node_Wildcard: break
|
||||
case ^Node_Anchor: break
|
||||
case ^Node_Word_Boundary: break
|
||||
case ^Node_Match_All_And_Escape: break
|
||||
|
||||
case ^Node_Concatenation:
|
||||
for subnode in specific.nodes {
|
||||
map_all_classes(subnode, collection)
|
||||
}
|
||||
|
||||
case ^Node_Repeat_Zero:
|
||||
map_all_classes(specific.inner, collection)
|
||||
case ^Node_Repeat_Zero_Non_Greedy:
|
||||
map_all_classes(specific.inner, collection)
|
||||
case ^Node_Repeat_One:
|
||||
map_all_classes(specific.inner, collection)
|
||||
case ^Node_Repeat_One_Non_Greedy:
|
||||
map_all_classes(specific.inner, collection)
|
||||
case ^Node_Repeat_N:
|
||||
map_all_classes(specific.inner, collection)
|
||||
case ^Node_Optional:
|
||||
map_all_classes(specific.inner, collection)
|
||||
case ^Node_Optional_Non_Greedy:
|
||||
map_all_classes(specific.inner, collection)
|
||||
case ^Node_Group:
|
||||
map_all_classes(specific.inner, collection)
|
||||
|
||||
case ^Node_Alternation:
|
||||
map_all_classes(specific.left, collection)
|
||||
map_all_classes(specific.right, collection)
|
||||
|
||||
case ^Node_Rune_Class:
|
||||
unseen := true
|
||||
for &value in collection {
|
||||
if classes_are_exact(&specific.data, &value) {
|
||||
unseen = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if unseen {
|
||||
append(collection, specific.data)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
append_raw :: #force_inline proc(code: ^Program, data: $T) {
|
||||
// NOTE: This is system-dependent endian.
|
||||
for b in transmute([size_of(T)]byte)data {
|
||||
append(code, cast(Opcode)b)
|
||||
}
|
||||
}
|
||||
inject_raw :: #force_inline proc(code: ^Program, start: int, data: $T) {
|
||||
// NOTE: This is system-dependent endian.
|
||||
for b, i in transmute([size_of(T)]byte)data {
|
||||
inject_at(code, start + i, cast(Opcode)b)
|
||||
}
|
||||
}
|
||||
|
||||
@require_results
|
||||
generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
|
||||
if node == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// NOTE: For Jump/Split arguments, we write as i16 and will reinterpret
|
||||
// this later when relative jumps are turned into absolute jumps.
|
||||
|
||||
switch specific in node {
|
||||
// Atomic Nodes:
|
||||
case ^Node_Rune:
|
||||
if .Unicode not_in c.flags || specific.data < unicode.MAX_LATIN1 {
|
||||
append(&code, Opcode.Byte)
|
||||
append(&code, cast(Opcode)specific.data)
|
||||
} else {
|
||||
append(&code, Opcode.Rune)
|
||||
append_raw(&code, specific.data)
|
||||
}
|
||||
|
||||
case ^Node_Rune_Class:
|
||||
if specific.negating {
|
||||
append(&code, Opcode.Rune_Class_Negated)
|
||||
} else {
|
||||
append(&code, Opcode.Rune_Class)
|
||||
}
|
||||
|
||||
index := -1
|
||||
for &data, i in c.class_data {
|
||||
if classes_are_exact(&data, &specific.data) {
|
||||
index = i
|
||||
break
|
||||
}
|
||||
}
|
||||
assert(index != -1, "Unable to find collected Rune_Class_Data index.")
|
||||
|
||||
append(&code, Opcode(index))
|
||||
|
||||
case ^Node_Wildcard:
|
||||
append(&code, Opcode.Wildcard)
|
||||
|
||||
case ^Node_Anchor:
|
||||
if .Multiline in c.flags {
|
||||
append(&code, Opcode.Multiline_Open)
|
||||
append(&code, Opcode.Multiline_Close)
|
||||
} else {
|
||||
if specific.start {
|
||||
append(&code, Opcode.Assert_Start)
|
||||
} else {
|
||||
append(&code, Opcode.Assert_End)
|
||||
}
|
||||
}
|
||||
case ^Node_Word_Boundary:
|
||||
if specific.non_word {
|
||||
append(&code, Opcode.Assert_Non_Word_Boundary)
|
||||
} else {
|
||||
append(&code, Opcode.Assert_Word_Boundary)
|
||||
}
|
||||
|
||||
// Compound Nodes:
|
||||
case ^Node_Group:
|
||||
code = generate_code(c, specific.inner)
|
||||
|
||||
if specific.capture && .No_Capture not_in c.flags {
|
||||
inject_at(&code, 0, Opcode.Save)
|
||||
inject_at(&code, 1, Opcode(2 * specific.capture_id))
|
||||
|
||||
append(&code, Opcode.Save)
|
||||
append(&code, Opcode(2 * specific.capture_id + 1))
|
||||
}
|
||||
|
||||
case ^Node_Alternation:
|
||||
left := generate_code(c, specific.left)
|
||||
right := generate_code(c, specific.right)
|
||||
|
||||
left_len := len(left)
|
||||
|
||||
// Avoiding duplicate allocation by reusing `left`.
|
||||
code = left
|
||||
|
||||
inject_at(&code, 0, Opcode.Split)
|
||||
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
|
||||
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + left_len + JUMP_SIZE))
|
||||
|
||||
append(&code, Opcode.Jump)
|
||||
append_raw(&code, i16(len(right) + JUMP_SIZE))
|
||||
|
||||
for opcode in right {
|
||||
append(&code, opcode)
|
||||
}
|
||||
|
||||
case ^Node_Concatenation:
|
||||
for subnode in specific.nodes {
|
||||
subnode_code := generate_code(c, subnode)
|
||||
for opcode in subnode_code {
|
||||
append(&code, opcode)
|
||||
}
|
||||
}
|
||||
|
||||
case ^Node_Repeat_Zero:
|
||||
code = generate_code(c, specific.inner)
|
||||
original_len := len(code)
|
||||
|
||||
inject_at(&code, 0, Opcode.Split)
|
||||
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
|
||||
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len + JUMP_SIZE))
|
||||
|
||||
append(&code, Opcode.Jump)
|
||||
append_raw(&code, i16(-original_len - SPLIT_SIZE))
|
||||
|
||||
case ^Node_Repeat_Zero_Non_Greedy:
|
||||
code = generate_code(c, specific.inner)
|
||||
original_len := len(code)
|
||||
|
||||
inject_at(&code, 0, Opcode.Split)
|
||||
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len + JUMP_SIZE))
|
||||
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
|
||||
|
||||
append(&code, Opcode.Jump)
|
||||
append_raw(&code, i16(-original_len - SPLIT_SIZE))
|
||||
|
||||
case ^Node_Repeat_One:
|
||||
code = generate_code(c, specific.inner)
|
||||
original_len := len(code)
|
||||
|
||||
append(&code, Opcode.Split)
|
||||
append_raw(&code, i16(-original_len))
|
||||
append_raw(&code, i16(SPLIT_SIZE))
|
||||
|
||||
case ^Node_Repeat_One_Non_Greedy:
|
||||
code = generate_code(c, specific.inner)
|
||||
original_len := len(code)
|
||||
|
||||
append(&code, Opcode.Split)
|
||||
append_raw(&code, i16(SPLIT_SIZE))
|
||||
append_raw(&code, i16(-original_len))
|
||||
|
||||
case ^Node_Repeat_N:
|
||||
inside := generate_code(c, specific.inner)
|
||||
original_len := len(inside)
|
||||
|
||||
if specific.lower == specific.upper { // {N}
|
||||
// e{N} ... evaluates to ... e^N
|
||||
for i := 0; i < specific.upper; i += 1 {
|
||||
for opcode in inside {
|
||||
append(&code, opcode)
|
||||
}
|
||||
}
|
||||
|
||||
} else if specific.lower == -1 && specific.upper > 0 { // {,M}
|
||||
// e{,M} ... evaluates to ... e?^M
|
||||
for i := 0; i < specific.upper; i += 1 {
|
||||
append(&code, Opcode.Split)
|
||||
append_raw(&code, i16(SPLIT_SIZE))
|
||||
append_raw(&code, i16(SPLIT_SIZE + original_len))
|
||||
for opcode in inside {
|
||||
append(&code, opcode)
|
||||
}
|
||||
}
|
||||
|
||||
} else if specific.lower >= 0 && specific.upper == -1 { // {N,}
|
||||
// e{N,} ... evaluates to ... e^N e*
|
||||
for i := 0; i < specific.lower; i += 1 {
|
||||
for opcode in inside {
|
||||
append(&code, opcode)
|
||||
}
|
||||
}
|
||||
|
||||
append(&code, Opcode.Split)
|
||||
append_raw(&code, i16(SPLIT_SIZE))
|
||||
append_raw(&code, i16(SPLIT_SIZE + original_len + JUMP_SIZE))
|
||||
|
||||
for opcode in inside {
|
||||
append(&code, opcode)
|
||||
}
|
||||
|
||||
append(&code, Opcode.Jump)
|
||||
append_raw(&code, i16(-original_len - SPLIT_SIZE))
|
||||
|
||||
} else if specific.lower >= 0 && specific.upper > 0 {
|
||||
// e{N,M} evaluates to ... e^N e?^(M-N)
|
||||
for i := 0; i < specific.lower; i += 1 {
|
||||
for opcode in inside {
|
||||
append(&code, opcode)
|
||||
}
|
||||
}
|
||||
for i := 0; i < specific.upper - specific.lower; i += 1 {
|
||||
append(&code, Opcode.Split)
|
||||
append_raw(&code, i16(SPLIT_SIZE + original_len))
|
||||
append_raw(&code, i16(SPLIT_SIZE))
|
||||
for opcode in inside {
|
||||
append(&code, opcode)
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
panic("RegEx compiler received invalid repetition group.")
|
||||
}
|
||||
|
||||
case ^Node_Optional:
|
||||
code = generate_code(c, specific.inner)
|
||||
original_len := len(code)
|
||||
|
||||
inject_at(&code, 0, Opcode.Split)
|
||||
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
|
||||
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len))
|
||||
|
||||
case ^Node_Optional_Non_Greedy:
|
||||
code = generate_code(c, specific.inner)
|
||||
original_len := len(code)
|
||||
|
||||
inject_at(&code, 0, Opcode.Split)
|
||||
inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len))
|
||||
inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
|
||||
|
||||
case ^Node_Match_All_And_Escape:
|
||||
append(&code, Opcode.Match_All_And_Escape)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@require_results
|
||||
compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: [dynamic]Rune_Class_Data, err: Error) {
|
||||
if tree == nil {
|
||||
if .No_Capture not_in flags {
|
||||
append(&code, Opcode.Save); append(&code, Opcode(0x00))
|
||||
append(&code, Opcode.Save); append(&code, Opcode(0x01))
|
||||
append(&code, Opcode.Match)
|
||||
} else {
|
||||
append(&code, Opcode.Match_And_Exit)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
c: Compiler
|
||||
c.flags = flags
|
||||
|
||||
map_all_classes(tree, &class_data)
|
||||
if len(class_data) >= common.MAX_CLASSES {
|
||||
err = .Too_Many_Classes
|
||||
return
|
||||
}
|
||||
c.class_data = class_data
|
||||
|
||||
code = generate_code(&c, tree)
|
||||
|
||||
pc_open := 0
|
||||
|
||||
add_global: if .Global in flags {
|
||||
// Check if the opening to the pattern is predictable.
|
||||
// If so, use one of the optimized Wait opcodes.
|
||||
iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
|
||||
seek_loop: for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
|
||||
#partial switch opcode {
|
||||
case .Byte:
|
||||
inject_at(&code, pc_open, Opcode.Wait_For_Byte)
|
||||
pc_open += size_of(Opcode)
|
||||
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
|
||||
pc_open += size_of(u8)
|
||||
break add_global
|
||||
|
||||
case .Rune:
|
||||
operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
|
||||
inject_at(&code, pc_open, Opcode.Wait_For_Rune)
|
||||
pc_open += size_of(Opcode)
|
||||
inject_raw(&code, pc_open, operand)
|
||||
pc_open += size_of(rune)
|
||||
break add_global
|
||||
|
||||
case .Rune_Class:
|
||||
inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
|
||||
pc_open += size_of(Opcode)
|
||||
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
|
||||
pc_open += size_of(u8)
|
||||
break add_global
|
||||
|
||||
case .Rune_Class_Negated:
|
||||
inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
|
||||
pc_open += size_of(Opcode)
|
||||
inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
|
||||
pc_open += size_of(u8)
|
||||
break add_global
|
||||
|
||||
case .Save:
|
||||
continue
|
||||
case:
|
||||
break seek_loop
|
||||
}
|
||||
}
|
||||
|
||||
// `.*?`
|
||||
inject_at(&code, pc_open, Opcode.Split)
|
||||
pc_open += size_of(byte)
|
||||
inject_raw(&code, pc_open, i16(SPLIT_SIZE + size_of(byte) + JUMP_SIZE))
|
||||
pc_open += size_of(i16)
|
||||
inject_raw(&code, pc_open, i16(SPLIT_SIZE))
|
||||
pc_open += size_of(i16)
|
||||
|
||||
inject_at(&code, pc_open, Opcode.Wildcard)
|
||||
pc_open += size_of(byte)
|
||||
|
||||
inject_at(&code, pc_open, Opcode.Jump)
|
||||
pc_open += size_of(byte)
|
||||
inject_raw(&code, pc_open, i16(-size_of(byte) - SPLIT_SIZE))
|
||||
pc_open += size_of(i16)
|
||||
|
||||
}
|
||||
|
||||
if .No_Capture not_in flags {
|
||||
// `(` <generated code>
|
||||
inject_at(&code, pc_open, Opcode.Save)
|
||||
inject_at(&code, pc_open + size_of(byte), Opcode(0x00))
|
||||
|
||||
// `)`
|
||||
append(&code, Opcode.Save); append(&code, Opcode(0x01))
|
||||
|
||||
append(&code, Opcode.Match)
|
||||
} else {
|
||||
append(&code, Opcode.Match_And_Exit)
|
||||
}
|
||||
|
||||
if len(code) >= common.MAX_PROGRAM_SIZE {
|
||||
err = .Program_Too_Big
|
||||
return
|
||||
}
|
||||
|
||||
// NOTE: No further opcode addition beyond this point, as we've already
|
||||
// checked the program size. Removal or transformation is fine.
|
||||
|
||||
// Post-Compile Optimizations:
|
||||
|
||||
// * Jump Extension
|
||||
//
|
||||
// A:RelJmp(1) -> B:RelJmp(2) => A:RelJmp(2)
|
||||
if .No_Optimization not_in flags {
|
||||
for passes_left := 1; passes_left > 0; passes_left -= 1 {
|
||||
do_another_pass := false
|
||||
|
||||
iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
|
||||
for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
|
||||
#partial switch opcode {
|
||||
case .Jump:
|
||||
jmp := cast(^i16)&code[pc+size_of(Opcode)]
|
||||
jmp_value := intrinsics.unaligned_load(jmp)
|
||||
if code[cast(i16)pc+jmp_value] == .Jump {
|
||||
next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_value+size_of(Opcode)])
|
||||
intrinsics.unaligned_store(jmp, jmp_value + next_jmp)
|
||||
do_another_pass = true
|
||||
}
|
||||
case .Split:
|
||||
jmp_x := cast(^i16)&code[pc+size_of(Opcode)]
|
||||
jmp_x_value := intrinsics.unaligned_load(jmp_x)
|
||||
if code[cast(i16)pc+jmp_x_value] == .Jump {
|
||||
next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x_value+size_of(Opcode)])
|
||||
intrinsics.unaligned_store(jmp_x, jmp_x_value + next_jmp)
|
||||
do_another_pass = true
|
||||
}
|
||||
jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)]
|
||||
jmp_y_value := intrinsics.unaligned_load(jmp_y)
|
||||
if code[cast(i16)pc+jmp_y_value] == .Jump {
|
||||
next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y_value+size_of(Opcode)])
|
||||
intrinsics.unaligned_store(jmp_y, jmp_y_value + next_jmp)
|
||||
do_another_pass = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if do_another_pass {
|
||||
passes_left += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// * Relative Jump to Absolute Jump
|
||||
//
|
||||
// RelJmp{PC +/- N} => AbsJmp{M}
|
||||
iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
|
||||
for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
|
||||
// NOTE: The virtual machine implementation depends on this.
|
||||
#partial switch opcode {
|
||||
case .Jump:
|
||||
jmp := cast(^u16)&code[pc+size_of(Opcode)]
|
||||
intrinsics.unaligned_store(jmp, intrinsics.unaligned_load(jmp) + cast(u16)pc)
|
||||
case .Split:
|
||||
jmp_x := cast(^u16)&code[pc+size_of(Opcode)]
|
||||
intrinsics.unaligned_store(jmp_x, intrinsics.unaligned_load(jmp_x) + cast(u16)pc)
|
||||
jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)]
|
||||
intrinsics.unaligned_store(jmp_y, intrinsics.unaligned_load(jmp_y) + cast(u16)pc)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
93
core/text/regex/compiler/debugging.odin
Normal file
93
core/text/regex/compiler/debugging.odin
Normal file
@@ -0,0 +1,93 @@
|
||||
package regex_compiler
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:io"
|
||||
import "core:text/regex/common"
|
||||
import "core:text/regex/virtual_machine"
|
||||
|
||||
get_jump_targets :: proc(code: []Opcode) -> (jump_targets: map[int]int) {
|
||||
iter := virtual_machine.Opcode_Iterator{ code, 0 }
|
||||
for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
|
||||
#partial switch opcode {
|
||||
case .Jump:
|
||||
jmp := cast(int)intrinsics.unaligned_load(cast(^u16)&code[pc+1])
|
||||
jump_targets[jmp] = pc
|
||||
case .Split:
|
||||
jmp_x := cast(int)intrinsics.unaligned_load(cast(^u16)&code[pc+1])
|
||||
jmp_y := cast(int)intrinsics.unaligned_load(cast(^u16)&code[pc+3])
|
||||
jump_targets[jmp_x] = pc
|
||||
jump_targets[jmp_y] = pc
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
trace :: proc(w: io.Writer, code: []Opcode) {
|
||||
jump_targets := get_jump_targets(code)
|
||||
defer delete(jump_targets)
|
||||
|
||||
iter := virtual_machine.Opcode_Iterator{ code, 0 }
|
||||
for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
|
||||
if src, ok := jump_targets[pc]; ok {
|
||||
io.write_string(w, "--")
|
||||
common.write_padded_hex(w, src, 4)
|
||||
io.write_string(w, "--> ")
|
||||
} else {
|
||||
io.write_string(w, " ")
|
||||
}
|
||||
|
||||
io.write_string(w, "[PC: ")
|
||||
common.write_padded_hex(w, pc, 4)
|
||||
io.write_string(w, "] ")
|
||||
io.write_string(w, virtual_machine.opcode_to_name(opcode))
|
||||
io.write_byte(w, ' ')
|
||||
|
||||
#partial switch opcode {
|
||||
case .Byte:
|
||||
operand := cast(rune)code[pc+1]
|
||||
io.write_encoded_rune(w, operand)
|
||||
case .Rune:
|
||||
operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
|
||||
io.write_encoded_rune(w, operand)
|
||||
case .Rune_Class, .Rune_Class_Negated:
|
||||
operand := cast(u8)code[pc+1]
|
||||
common.write_padded_hex(w, operand, 2)
|
||||
case .Jump:
|
||||
jmp := intrinsics.unaligned_load(cast(^u16)&code[pc+1])
|
||||
io.write_string(w, "-> $")
|
||||
common.write_padded_hex(w, jmp, 4)
|
||||
case .Split:
|
||||
jmp_x := intrinsics.unaligned_load(cast(^u16)&code[pc+1])
|
||||
jmp_y := intrinsics.unaligned_load(cast(^u16)&code[pc+3])
|
||||
io.write_string(w, "=> $")
|
||||
common.write_padded_hex(w, jmp_x, 4)
|
||||
io.write_string(w, ", $")
|
||||
common.write_padded_hex(w, jmp_y, 4)
|
||||
case .Save:
|
||||
operand := cast(u8)code[pc+1]
|
||||
common.write_padded_hex(w, operand, 2)
|
||||
case .Wait_For_Byte:
|
||||
operand := cast(rune)code[pc+1]
|
||||
io.write_encoded_rune(w, operand)
|
||||
case .Wait_For_Rune:
|
||||
operand := (cast(^rune)&code[pc+1])^
|
||||
io.write_encoded_rune(w, operand)
|
||||
case .Wait_For_Rune_Class:
|
||||
operand := cast(u8)code[pc+1]
|
||||
common.write_padded_hex(w, operand, 2)
|
||||
case .Wait_For_Rune_Class_Negated:
|
||||
operand := cast(u8)code[pc+1]
|
||||
common.write_padded_hex(w, operand, 2)
|
||||
}
|
||||
|
||||
io.write_byte(w, '\n')
|
||||
}
|
||||
}
|
||||
9
core/text/regex/compiler/doc.odin
Normal file
9
core/text/regex/compiler/doc.odin
Normal file
@@ -0,0 +1,9 @@
|
||||
/*
|
||||
package regex_compiler implements a bytecode compiler for the virtual machine
|
||||
included alongside it.
|
||||
|
||||
Operands larger than u8 are written in system endian order.
|
||||
|
||||
More details can be found in the documentation for the virtual machine.
|
||||
*/
|
||||
package regex_compiler
|
||||
97
core/text/regex/doc.odin
Normal file
97
core/text/regex/doc.odin
Normal file
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
package regex implements a complete suite for using Regular Expressions to
|
||||
match and capture text.
|
||||
|
||||
Regular expressions are used to describe how a piece of text can match to
|
||||
another, using a pattern language.
|
||||
|
||||
Odin's regex library implements the following features:
|
||||
|
||||
Alternation: `apple|cherry`
|
||||
Classes: `[0-9_]`
|
||||
Classes, negated: `[^0-9_]`
|
||||
Shorthands: `\d\s\w`
|
||||
Shorthands, negated: `\D\S\W`
|
||||
Wildcards: `.`
|
||||
Repeat, optional: `a*`
|
||||
Repeat, at least once: `a+`
|
||||
Repetition: `a{1,2}`
|
||||
Optional: `a?`
|
||||
Group, capture: `([0-9])`
|
||||
Group, non-capture: `(?:[0-9])`
|
||||
Start & End Anchors: `^hello$`
|
||||
Word Boundaries: `\bhello\b`
|
||||
Non-Word Boundaries: `hello\B`
|
||||
|
||||
These specifiers can be composed together, such as an optional group:
|
||||
`(?:hello)?`
|
||||
|
||||
This package also supports the non-greedy variants of the repeating and
|
||||
optional specifiers by appending a `?` to them.
|
||||
|
||||
Of the shorthand classes that are supported, they are all ASCII-based, even
|
||||
when compiling in Unicode mode. This is for the sake of general performance and
|
||||
simplicity, as there are thousands of Unicode codepoints which would qualify as
|
||||
either a digit, space, or word character which could be irrelevant depending on
|
||||
what is being matched.
|
||||
|
||||
Here are the shorthand class equivalencies:
|
||||
\d: [0-9]
|
||||
\s: [\t\n\f\r ]
|
||||
\w: [0-9A-Z_a-z]
|
||||
|
||||
If you need your own shorthands, you can compose strings together like so:
|
||||
MY_HEX :: "[0-9A-Fa-f]"
|
||||
PATTERN :: MY_HEX + "-" + MY_HEX
|
||||
|
||||
The compiler will handle turning multiple identical classes into references to
|
||||
the same set of matching runes, so there's no penalty for doing it like this.
|
||||
|
||||
|
||||
|
||||
``Some people, when confronted with a problem, think
|
||||
"I know, I'll use regular expressions." Now they have two problems.''
|
||||
|
||||
- Jamie Zawinski
|
||||
|
||||
|
||||
Regular expressions have gathered a reputation over the decades for often being
|
||||
chosen as the wrong tool for the job. Here, we will clarify a few cases in
|
||||
which RegEx might be good or bad.
|
||||
|
||||
|
||||
**When is it a good time to use RegEx?**
|
||||
|
||||
- You don't know at compile-time what patterns of text the program will need to
|
||||
match when it's running.
|
||||
- As an example, you are making a client which can be configured by the user to
|
||||
trigger on certain text patterns received from a server.
|
||||
- For another example, you need a way for users of a text editor to compose
|
||||
matching strings that are more intricate than a simple substring lookup.
|
||||
- The text you're matching against is small (< 64 KiB) and your patterns aren't
|
||||
overly complicated with branches (alternations, repeats, and optionals).
|
||||
- If none of the above general impressions apply but your project doesn't
|
||||
warrant long-term maintenance.
|
||||
|
||||
**When is it a bad time to use RegEx?**
|
||||
|
||||
- You know at compile-time the grammar you're parsing; a hand-made parser has
|
||||
the potential to be more maintainable and readable.
|
||||
- The grammar you're parsing has certain validation steps that lend itself to
|
||||
forming complicated expressions, such as e-mail addresses, URIs, dates,
|
||||
postal codes, credit cards, et cetera. Using RegEx to validate these
|
||||
structures is almost always a bad sign.
|
||||
- The text you're matching against is big (> 1 MiB); you would be better served
|
||||
by first dividing the text into manageable chunks and using some heuristic to
|
||||
locate the most likely location of a match before applying RegEx against it.
|
||||
- You value high performance and low memory usage; RegEx will always have a
|
||||
certain overhead which increases with the complexity of the pattern.
|
||||
|
||||
|
||||
The implementation of this package has been optimized, but it will never be as
|
||||
thoroughly performant as a hand-made parser. In comparison, there are just too
|
||||
many intermediate steps, assumptions, and generalizations in what it takes to
|
||||
handle a regular expression.
|
||||
|
||||
*/
|
||||
package regex
|
||||
58
core/text/regex/optimizer/doc.odin
Normal file
58
core/text/regex/optimizer/doc.odin
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
package regex_optimizer implements an optimizer which acts upon the AST of a
|
||||
parsed regular expression pattern, transforming it in-place without moving to a
|
||||
compilation step.
|
||||
|
||||
Where possible, it aims to reduce branching as much as possible in the
|
||||
expression by reducing usage of `|`.
|
||||
|
||||
|
||||
Here is a summary of the optimizations that it will do:
|
||||
|
||||
* Class Simplification : `[aab]` => `[ab]`
|
||||
`[aa]` => `[a]`
|
||||
|
||||
* Class Reduction : `[a]` => `a`
|
||||
* Range Construction : `[abc]` => `[a-c]`
|
||||
* Rune Merging into Range : `[aa-c]` => `[a-c]`
|
||||
|
||||
* Range Merging : `[a-cc-e]` => `[a-e]`
|
||||
`[a-cd-e]` => `[a-e]`
|
||||
`[a-cb-e]` => `[a-e]`
|
||||
|
||||
* Alternation to Optional : `a|` => `a?`
|
||||
* Alternation to Optional Non-Greedy : `|a` => `a??`
|
||||
* Alternation Reduction : `a|a` => `a`
|
||||
* Alternation to Class : `a|b` => `[ab]`
|
||||
* Class Union : `[a0]|[b1]` => `[a0b1]`
|
||||
`[a-b]|c` => `[a-bc]`
|
||||
`a|[b-c]` => `[b-ca]`
|
||||
|
||||
* Wildcard Reduction : `a|.` => `.`
|
||||
`.|a` => `.`
|
||||
`[ab]|.` => `.`
|
||||
`.|[ab]` => `.`
|
||||
|
||||
* Common Suffix Elimination : `blueberry|strawberry` => `(?:blue|straw)berry`
|
||||
* Common Prefix Elimination : `abi|abe` => `ab(?:i|e)`
|
||||
|
||||
* Composition: Consume All to Anchored End
|
||||
`.*$` => <special opcode>
|
||||
`.+$` => `.` <special opcode>
|
||||
|
||||
|
||||
Possible future improvements:
|
||||
|
||||
- Change the AST of alternations to be a list instead of a tree, so that
|
||||
constructions such as `(ab|bb|cb)` can be considered in whole by the affix
|
||||
elimination optimizations.
|
||||
|
||||
- Introduce specialized opcodes for certain classes of repetition.
|
||||
|
||||
- Add Common Infix Elimination.
|
||||
|
||||
- Measure the precise finite minimum and maximum of a pattern, if available,
|
||||
and check against that on any strings before running the virtual machine.
|
||||
|
||||
*/
|
||||
package regex_optimizer
|
||||
530
core/text/regex/optimizer/optimizer.odin
Normal file
530
core/text/regex/optimizer/optimizer.odin
Normal file
@@ -0,0 +1,530 @@
|
||||
package regex_optimizer
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
import "base:intrinsics"
|
||||
@require import "core:io"
|
||||
import "core:slice"
|
||||
import "core:text/regex/common"
|
||||
import "core:text/regex/parser"
|
||||
|
||||
// Aliases re-exported from `core:text/regex/parser` so the optimizer can
// pattern-match on AST node types without qualifying every name.
Rune_Class_Range :: parser.Rune_Class_Range

Node                        :: parser.Node
Node_Rune                   :: parser.Node_Rune
Node_Rune_Class             :: parser.Node_Rune_Class
Node_Wildcard               :: parser.Node_Wildcard
Node_Concatenation          :: parser.Node_Concatenation
Node_Alternation            :: parser.Node_Alternation
Node_Repeat_Zero            :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One             :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy  :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N               :: parser.Node_Repeat_N
Node_Optional               :: parser.Node_Optional
Node_Optional_Non_Greedy    :: parser.Node_Optional_Non_Greedy
Node_Group                  :: parser.Node_Group
Node_Anchor                 :: parser.Node_Anchor
Node_Word_Boundary          :: parser.Node_Word_Boundary
Node_Match_All_And_Escape   :: parser.Node_Match_All_And_Escape
|
||||
|
||||
|
||||
// Comparator for `slice.sort_by`: orders rune class ranges ascending by
// their inclusive lower bound.
class_range_sorter :: proc(i, j: Rune_Class_Range) -> bool {
	lhs, rhs := i.lower, j.lower
	return lhs < rhs
}
|
||||
|
||||
/*
Apply one pass of AST-level optimizations to `tree`, recursing into child
nodes first where relevant.

Inputs:
- tree:  the AST node to optimize; may be nil.
- flags: compilation flags. Some rewrites are disabled by flags
         (e.g. `.Multiline` suppresses the anchored-end composition).

Returns:
- result:  the replacement for `tree` — possibly `tree` itself (mutated in
           place), a newly allocated node, or nil when the subtree
           optimized away entirely.
- changes: how many rewrites were applied in this pass. The caller
           (`optimize`) loops until this reaches zero.
*/
optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
	if tree == nil {
		return nil, 0
	}

	result = tree

	switch specific in tree {
	// No direct optimization possible on these nodes:
	case ^Node_Rune: break
	case ^Node_Wildcard: break
	case ^Node_Anchor: break
	case ^Node_Word_Boundary: break
	case ^Node_Match_All_And_Escape: break

	case ^Node_Concatenation:
		// * Composition: Consume All to Anchored End
		//
		// DO: `.*$` => <special opcode>
		// DO: `.+$` => `.` <special opcode>
		if .Multiline not_in flags && len(specific.nodes) >= 2 {
			// `i` indexes the second-to-last node: the repeat preceding the anchor.
			i := len(specific.nodes) - 2
			wrza: {
				// Wildcard-Repeat-Zero followed by end Anchor.
				subnode := specific.nodes[i].(^Node_Repeat_Zero) or_break wrza
				_ = subnode.inner.(^Node_Wildcard) or_break wrza
				next_node := specific.nodes[i+1].(^Node_Anchor) or_break wrza
				if next_node.start == false {
					specific.nodes[i] = new(Node_Match_All_And_Escape)
					ordered_remove(&specific.nodes, i + 1)
					changes += 1
					break
				}
			}
			wroa: {
				// Wildcard-Repeat-One followed by end Anchor.
				subnode := specific.nodes[i].(^Node_Repeat_One) or_break wroa
				subsubnode := subnode.inner.(^Node_Wildcard) or_break wroa
				next_node := specific.nodes[i+1].(^Node_Anchor) or_break wroa
				if next_node.start == false {
					// Keep one mandatory `.`, then match-all-and-escape.
					specific.nodes[i] = subsubnode
					specific.nodes[i+1] = new(Node_Match_All_And_Escape)
					changes += 1
					break
				}
			}
		}

		// Only recursive optimizations:
		#no_bounds_check for i := 0; i < len(specific.nodes); i += 1 {
			subnode, subnode_changes := optimize_subtree(specific.nodes[i], flags)
			changes += subnode_changes
			if subnode == nil {
				// Child optimized away entirely; drop it from the sequence.
				ordered_remove(&specific.nodes, i)
				i -= 1
				changes += 1
			} else {
				specific.nodes[i] = subnode
			}
		}

		// Collapse single-element or empty concatenations.
		if len(specific.nodes) == 1 {
			result = specific.nodes[0]
			changes += 1
		} else if len(specific.nodes) == 0 {
			return nil, changes + 1
		}

	// For the wrapper nodes below, `changes` is still 0 on entry to the
	// case, so plain assignment (rather than `+=`) is safe.
	case ^Node_Repeat_Zero:
		specific.inner, changes = optimize_subtree(specific.inner, flags)
		if specific.inner == nil {
			return nil, changes + 1
		}
	case ^Node_Repeat_Zero_Non_Greedy:
		specific.inner, changes = optimize_subtree(specific.inner, flags)
		if specific.inner == nil {
			return nil, changes + 1
		}
	case ^Node_Repeat_One:
		specific.inner, changes = optimize_subtree(specific.inner, flags)
		if specific.inner == nil {
			return nil, changes + 1
		}
	case ^Node_Repeat_One_Non_Greedy:
		specific.inner, changes = optimize_subtree(specific.inner, flags)
		if specific.inner == nil {
			return nil, changes + 1
		}
	case ^Node_Repeat_N:
		specific.inner, changes = optimize_subtree(specific.inner, flags)
		if specific.inner == nil {
			return nil, changes + 1
		}
	case ^Node_Optional:
		specific.inner, changes = optimize_subtree(specific.inner, flags)
		if specific.inner == nil {
			return nil, changes + 1
		}
	case ^Node_Optional_Non_Greedy:
		specific.inner, changes = optimize_subtree(specific.inner, flags)
		if specific.inner == nil {
			return nil, changes + 1
		}

	case ^Node_Group:
		specific.inner, changes = optimize_subtree(specific.inner, flags)

		if specific.inner == nil {
			return nil, changes + 1
		}

		// Non-capturing groups are transparent; replace with their contents.
		if !specific.capture {
			result = specific.inner
			changes += 1
		}

	// Full optimization:
	case ^Node_Rune_Class:
		// * Class Simplification
		//
		// DO: `[aab]` => `[ab]`
		// DO: `[aa]` => `[a]`
		runes_seen: map[rune]bool

		for r in specific.runes {
			runes_seen[r] = true
		}

		if len(runes_seen) != len(specific.runes) {
			// Duplicates found: rebuild the rune list from the set.
			clear(&specific.runes)
			for key in runes_seen {
				append(&specific.runes, key)
			}
			changes += 1
		}

		// * Class Reduction
		//
		// DO: `[a]` => `a`
		if !specific.negating && len(specific.runes) == 1 && len(specific.ranges) == 0 {
			only_rune := specific.runes[0]

			node := new(Node_Rune)
			node.data = only_rune

			return node, changes + 1
		}

		// * Range Construction
		//
		// DO: `[abc]` => `[a-c]`
		slice.sort(specific.runes[:])
		if len(specific.runes) > 1 {
			// `{-1, -1}` acts as the "no open range" sentinel below.
			new_range: Rune_Class_Range
			new_range.lower = specific.runes[0]
			new_range.upper = specific.runes[0]

			#no_bounds_check for i := 1; i < len(specific.runes); i += 1 {
				r := specific.runes[i]
				if new_range.lower == -1 {
					new_range = { r, r }
					continue
				}

				if r == new_range.lower - 1 {
					new_range.lower -= 1
					ordered_remove(&specific.runes, i)
					i -= 1
					changes += 1
				} else if r == new_range.upper + 1 {
					new_range.upper += 1
					ordered_remove(&specific.runes, i)
					i -= 1
					changes += 1
				} else if new_range.lower != new_range.upper {
					// Non-adjacent rune: flush the accumulated range.
					append(&specific.ranges, new_range)
					new_range = { -1, -1 }
					changes += 1
				}
			}

			// Flush a trailing multi-rune range (single runes stay as runes).
			if new_range.lower != new_range.upper {
				append(&specific.ranges, new_range)
				changes += 1
			}
		}

		// * Rune Merging into Range
		//
		// DO: `[aa-c]` => `[a-c]`
		for range in specific.ranges {
			#no_bounds_check for i := 0; i < len(specific.runes); i += 1 {
				r := specific.runes[i]
				if range.lower <= r && r <= range.upper {
					ordered_remove(&specific.runes, i)
					i -= 1
					changes += 1
				}
			}
		}

		// * Range Merging
		//
		// DO: `[a-cc-e]` => `[a-e]`
		// DO: `[a-cd-e]` => `[a-e]`
		// DO: `[a-cb-e]` => `[a-e]`
		slice.sort_by(specific.ranges[:], class_range_sorter)
		#no_bounds_check for i := 0; i < len(specific.ranges) - 1; i += 1 {
			for j := i + 1; j < len(specific.ranges); j += 1 {
				left_range := &specific.ranges[i]
				right_range := specific.ranges[j]

				// Merge ranges that touch, are adjacent, or overlap.
				if left_range.upper == right_range.lower ||
				   left_range.upper == right_range.lower - 1 ||
				   left_range.lower <= right_range.lower && right_range.lower <= left_range.upper {
					left_range.upper = max(left_range.upper, right_range.upper)
					ordered_remove(&specific.ranges, j)
					j -= 1
					changes += 1
				} else {
					// Sorted order means no later range can merge either.
					break
				}
			}
		}

		if len(specific.ranges) == 0 {
			specific.ranges = {}
		}
		if len(specific.runes) == 0 {
			specific.runes = {}
		}

		// * NOP
		//
		// DO: `[]` => <nil>
		if len(specific.ranges) + len(specific.runes) == 0 {
			return nil, 1
		}

		// Keep the class canonically sorted for downstream stages.
		slice.sort(specific.runes[:])
		slice.sort_by(specific.ranges[:], class_range_sorter)

	case ^Node_Alternation:
		// Perform recursive optimization first.
		left_changes, right_changes: int
		specific.left, left_changes = optimize_subtree(specific.left, flags)
		specific.right, right_changes = optimize_subtree(specific.right, flags)
		changes += left_changes + right_changes

		// * Alternation to Optional
		//
		// DO: `a|` => `a?`
		if specific.left != nil && specific.right == nil {
			node := new(Node_Optional)
			node.inner = specific.left
			return node, 1
		}

		// * Alternation to Optional Non-Greedy
		//
		// DO: `|a` => `a??`
		if specific.right != nil && specific.left == nil {
			node := new(Node_Optional_Non_Greedy)
			node.inner = specific.right
			return node, 1
		}

		// * NOP
		//
		// DO: `|` => <nil>
		if specific.left == nil && specific.right == nil {
			return nil, 1
		}

		left_rune, left_is_rune := specific.left.(^Node_Rune)
		right_rune, right_is_rune := specific.right.(^Node_Rune)

		if left_is_rune && right_is_rune {
			if left_rune.data == right_rune.data {
				// * Alternation Reduction
				//
				// DO: `a|a` => `a`
				return left_rune, 1
			} else {
				// * Alternation to Class
				//
				// DO: `a|b` => `[ab]`
				node := new(Node_Rune_Class)
				append(&node.runes, left_rune.data)
				append(&node.runes, right_rune.data)
				return node, 1
			}
		}

		left_wildcard, left_is_wildcard := specific.left.(^Node_Wildcard)
		right_wildcard, right_is_wildcard := specific.right.(^Node_Wildcard)

		// * Class Union
		//
		// DO: `[a0]|[b1]` => `[a0b1]`
		left_class, left_is_class := specific.left.(^Node_Rune_Class)
		right_class, right_is_class := specific.right.(^Node_Rune_Class)
		if left_is_class && right_is_class {
			for r in right_class.runes {
				append(&left_class.runes, r)
			}
			for range in right_class.ranges {
				append(&left_class.ranges, range)
			}
			return left_class, 1
		}

		// * Class Union
		//
		// DO: `[a-b]|c` => `[a-bc]`
		if left_is_class && right_is_rune {
			append(&left_class.runes, right_rune.data)
			return left_class, 1
		}

		// * Class Union
		//
		// DO: `a|[b-c]` => `[b-ca]`
		if left_is_rune && right_is_class {
			append(&right_class.runes, left_rune.data)
			return right_class, 1
		}

		// * Wildcard Reduction
		//
		// DO: `a|.` => `.`
		if left_is_rune && right_is_wildcard {
			return right_wildcard, 1
		}

		// * Wildcard Reduction
		//
		// DO: `.|a` => `.`
		if left_is_wildcard && right_is_rune {
			return left_wildcard, 1
		}

		// * Wildcard Reduction
		//
		// DO: `[ab]|.` => `.`
		if left_is_class && right_is_wildcard {
			return right_wildcard, 1
		}

		// * Wildcard Reduction
		//
		// DO: `.|[ab]` => `.`
		if left_is_wildcard && right_is_class {
			return left_wildcard, 1
		}

		left_concatenation, left_is_concatenation := specific.left.(^Node_Concatenation)
		right_concatenation, right_is_concatenation := specific.right.(^Node_Concatenation)

		// * Common Suffix Elimination
		//
		// DO: `blueberry|strawberry` => `(?:blue|straw)berry`
		if left_is_concatenation && right_is_concatenation {
			// Remember that a concatenation could contain any node, not just runes.
			left_len := len(left_concatenation.nodes)
			right_len := len(right_concatenation.nodes)
			least_len := min(left_len, right_len)
			same_len := 0
			// Walk backwards from the tails, counting identical runes.
			for i := 1; i <= least_len; i += 1 {
				left_subrune, left_is_subrune := left_concatenation.nodes[left_len - i].(^Node_Rune)
				right_subrune, right_is_subrune := right_concatenation.nodes[right_len - i].(^Node_Rune)

				if !left_is_subrune || !right_is_subrune {
					// One of the nodes isn't a rune; there's nothing more we can do.
					break
				}

				if left_subrune.data == right_subrune.data {
					same_len += 1
				} else {
					// No more similarities.
					break
				}
			}

			if same_len > 0 {
				// Dissolve this alternation into a concatenation.
				cat_node := new(Node_Concatenation)
				group_node := new(Node_Group)
				append(&cat_node.nodes, group_node)

				// Turn the concatenation into the common suffix.
				for i := left_len - same_len; i < left_len; i += 1 {
					append(&cat_node.nodes, left_concatenation.nodes[i])
				}

				// Construct the group of alternating prefixes.
				for i := same_len; i > 0; i -= 1 {
					pop(&left_concatenation.nodes)
					pop(&right_concatenation.nodes)
				}

				// (Re-using this alternation node.)
				alter_node := specific
				alter_node.left = left_concatenation
				alter_node.right = right_concatenation
				group_node.inner = alter_node

				return cat_node, 1
			}
		}

		// * Common Prefix Elimination
		//
		// DO: `abi|abe` => `ab(?:i|e)`
		if left_is_concatenation && right_is_concatenation {
			// Try to identify a common prefix.
			// Remember that a concatenation could contain any node, not just runes.
			least_len := min(len(left_concatenation.nodes), len(right_concatenation.nodes))
			same_len := 0
			for i := 0; i < least_len; i += 1 {
				left_subrune, left_is_subrune := left_concatenation.nodes[i].(^Node_Rune)
				right_subrune, right_is_subrune := right_concatenation.nodes[i].(^Node_Rune)

				if !left_is_subrune || !right_is_subrune {
					// One of the nodes isn't a rune; there's nothing more we can do.
					break
				}

				if left_subrune.data == right_subrune.data {
					same_len = i + 1
				} else {
					// No more similarities.
					break
				}
			}

			if same_len > 0 {
				// The common prefix, followed by a group alternating the tails.
				cat_node := new(Node_Concatenation)
				for i := 0; i < same_len; i += 1 {
					append(&cat_node.nodes, left_concatenation.nodes[i])
				}
				for i := same_len; i > 0; i -= 1 {
					ordered_remove(&left_concatenation.nodes, 0)
					ordered_remove(&right_concatenation.nodes, 0)
				}

				group_node := new(Node_Group)
				// (Re-using this alternation node.)
				alter_node := specific
				alter_node.left = left_concatenation
				alter_node.right = right_concatenation
				group_node.inner = alter_node

				append(&cat_node.nodes, group_node)
				return cat_node, 1
			}
		}
	}

	return
}
|
||||
|
||||
/*
Optimize a parsed AST to a fixed point by repeatedly applying
`optimize_subtree` until a pass produces no further rewrites.

Inputs:
- tree:  the root of the AST produced by the parser; may be nil.
- flags: compilation flags forwarded to each optimization pass.

Returns:
- result:  the fully optimized tree (possibly nil).
- changes: the total number of rewrites applied across all passes.

When `common.ODIN_DEBUG_REGEX` is enabled, the AST is printed to the
debug stream before and after optimization.
*/
optimize :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) {
	result = tree
	new_changes := 0

	when common.ODIN_DEBUG_REGEX {
		io.write_string(common.debug_stream, "AST before Optimizer: ")
		parser.write_node(common.debug_stream, tree)
		io.write_byte(common.debug_stream, '\n')
	}

	// Keep optimizing until no more changes are seen.
	for {
		result, new_changes = optimize_subtree(result, flags)
		changes += new_changes
		if new_changes == 0 {
			break
		}
	}

	when common.ODIN_DEBUG_REGEX {
		io.write_string(common.debug_stream, "AST after Optimizer: ")
		parser.write_node(common.debug_stream, result)
		io.write_byte(common.debug_stream, '\n')
	}

	return
}
|
||||
111
core/text/regex/parser/debugging.odin
Normal file
111
core/text/regex/parser/debugging.odin
Normal file
@@ -0,0 +1,111 @@
|
||||
package regex_parser
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
import "core:io"
|
||||
|
||||
/*
Write a human-readable, debug-oriented rendering of an AST node (and its
children, recursively) to `w`.

The output approximates regex syntax, with CJK brackets marking structure
that plain regex syntax leaves implicit: `「…」` for concatenations,
`《…》` for alternations, and `〈…〉` for optionals.
*/
write_node :: proc(w: io.Writer, node: Node) {
	switch specific in node {
	case ^Node_Rune:
		io.write_rune(w, specific.data)

	case ^Node_Rune_Class:
		io.write_byte(w, '[')
		if specific.negating {
			io.write_byte(w, '^')
		}
		for r in specific.data.runes {
			io.write_rune(w, r)
		}
		for range in specific.data.ranges {
			io.write_rune(w, range.lower)
			io.write_byte(w, '-')
			io.write_rune(w, range.upper)
		}
		io.write_byte(w, ']')

	case ^Node_Wildcard:
		io.write_byte(w, '.')

	case ^Node_Concatenation:
		io.write_rune(w, '「')
		for subnode, i in specific.nodes {
			if i != 0 {
				io.write_rune(w, '⋅')
			}
			write_node(w, subnode)
		}
		io.write_rune(w, '」')

	case ^Node_Repeat_Zero:
		write_node(w, specific.inner)
		io.write_byte(w, '*')
	case ^Node_Repeat_Zero_Non_Greedy:
		write_node(w, specific.inner)
		io.write_string(w, "*?")
	case ^Node_Repeat_One:
		write_node(w, specific.inner)
		io.write_byte(w, '+')
	case ^Node_Repeat_One_Non_Greedy:
		write_node(w, specific.inner)
		io.write_string(w, "+?")

	case ^Node_Repeat_N:
		write_node(w, specific.inner)
		// {0,} and {1,} print in their canonical shorthand forms.
		if specific.lower == 0 && specific.upper == -1 {
			io.write_byte(w, '*')
		} else if specific.lower == 1 && specific.upper == -1 {
			io.write_byte(w, '+')
		} else {
			io.write_byte(w, '{')
			io.write_int(w, specific.lower)
			io.write_byte(w, ',')
			io.write_int(w, specific.upper)
			io.write_byte(w, '}')
		}

	case ^Node_Alternation:
		io.write_rune(w, '《')
		write_node(w, specific.left)
		io.write_byte(w, '|')
		write_node(w, specific.right)
		io.write_rune(w, '》')

	case ^Node_Optional:
		io.write_rune(w, '〈')
		write_node(w, specific.inner)
		io.write_byte(w, '?')
		io.write_rune(w, '〉')
	case ^Node_Optional_Non_Greedy:
		io.write_rune(w, '〈')
		write_node(w, specific.inner)
		io.write_string(w, "??")
		io.write_rune(w, '〉')

	case ^Node_Group:
		io.write_byte(w, '(')
		if !specific.capture {
			io.write_string(w, "?:")
		}
		write_node(w, specific.inner)
		io.write_byte(w, ')')

	case ^Node_Anchor:
		io.write_byte(w, '^' if specific.start else '$')

	case ^Node_Word_Boundary:
		io.write_string(w, `\B` if specific.non_word else `\b`)

	case ^Node_Match_All_And_Escape:
		io.write_string(w, "《.*$》")

	case nil:
		io.write_string(w, "<nil>")
	}
}
|
||||
10
core/text/regex/parser/doc.odin
Normal file
10
core/text/regex/parser/doc.odin
Normal file
@@ -0,0 +1,10 @@
|
||||
/*
package regex_parser implements a Pratt parser, also known as a Top-Down
Operator Precedence parser, for parsing tokenized regular expression patterns.

References:
- https://dl.acm.org/doi/10.1145/512927.512931
- https://tdop.github.io/
- http://crockford.com/javascript/tdop/tdop.html
*/
package regex_parser
|
||||
590
core/text/regex/parser/parser.odin
Normal file
590
core/text/regex/parser/parser.odin
Normal file
@@ -0,0 +1,590 @@
|
||||
package regex_parser
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:strconv"
|
||||
import "core:strings"
|
||||
import "core:text/regex/common"
|
||||
import "core:text/regex/tokenizer"
|
||||
import "core:unicode"
|
||||
import "core:unicode/utf8"
|
||||
|
||||
// Aliases into `core:text/regex/tokenizer` for brevity.
Token      :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer  :: tokenizer.Tokenizer

// An inclusive range of runes, `lower` through `upper`.
Rune_Class_Range :: struct {
	lower, upper: rune,
}
// The contents of a character class: individual runes plus rune ranges.
Rune_Class_Data :: struct {
	runes:  [dynamic]rune,
	ranges: [dynamic]Rune_Class_Range,
}


// A single literal rune, e.g. `a`.
Node_Rune :: struct {
	data: rune,
}

// A character class, e.g. `[a-z_]`; `negating` is true for `[^…]`.
Node_Rune_Class :: struct {
	negating: bool,
	using data: Rune_Class_Data,
}

// The `.` wildcard.
Node_Wildcard :: struct {}

// A binary alternation, e.g. `a|b`; either side may be nil.
Node_Alternation :: struct {
	left, right: Node,
}

// A sequence of nodes matched one after another.
Node_Concatenation :: struct {
	nodes: [dynamic]Node,
}

// `x*`
Node_Repeat_Zero :: struct {
	inner: Node,
}
// `x*?`
Node_Repeat_Zero_Non_Greedy :: struct {
	inner: Node,
}
// `x+`
Node_Repeat_One :: struct {
	inner: Node,
}
// `x+?`
Node_Repeat_One_Non_Greedy :: struct {
	inner: Node,
}

// Bounded repetition `x{n,m}`.
// NOTE(review): -1 appears to act as an "unbounded"/"unset" sentinel for
// `lower`/`upper` (see the parser's `.Repeat_N` handling) — confirm.
Node_Repeat_N :: struct {
	inner: Node,
	lower, upper: int,
}

// `x?`
Node_Optional :: struct {
	inner: Node,
}
// `x??`
Node_Optional_Non_Greedy :: struct {
	inner: Node,
}

// A parenthesized group; `capture` distinguishes `(…)` from `(?:…)`.
Node_Group :: struct {
	inner: Node,
	capture_id: int,
	capture: bool,
}

// `^` when `start` is true, otherwise `$`.
Node_Anchor :: struct {
	start: bool,
}
// `\b` (word boundary) or `\B` when `non_word` is true.
Node_Word_Boundary :: struct {
	non_word: bool,
}

// Synthetic node emitted by the optimizer for `.*$`-style tails.
Node_Match_All_And_Escape :: struct {}

// Any AST node; nil represents an empty/absent subtree.
Node :: union {
	^Node_Rune,
	^Node_Rune_Class,
	^Node_Wildcard,
	^Node_Concatenation,
	^Node_Alternation,
	^Node_Repeat_Zero,
	^Node_Repeat_Zero_Non_Greedy,
	^Node_Repeat_One,
	^Node_Repeat_One_Non_Greedy,
	^Node_Repeat_N,
	^Node_Optional,
	^Node_Optional_Non_Greedy,
	^Node_Group,
	^Node_Anchor,
	^Node_Word_Boundary,

	// Optimized nodes (not created by the Parser):
	^Node_Match_All_And_Escape,
}
|
||||
|
||||
|
||||
/*
Return the left binding power (Pratt precedence) of a token when it
appears in infix/postfix position. Higher values bind tighter; tokens
not listed terminate an expression with a power of 0.
*/
left_binding_power :: proc(kind: Token_Kind) -> int {
	#partial switch kind {
	case .Alternate: return 1
	case .Concatenate: return 2
	case .Repeat_Zero, .Repeat_One,
	     .Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy,
	     .Repeat_N: return 3
	case .Optional,
	     .Optional_Non_Greedy: return 4
	case .Open_Paren,
	     .Open_Paren_Non_Capture: return 9
	}
	return 0
}
|
||||
|
||||
|
||||
// A specific token kind was required at `pos` but something else was found.
Expected_Token :: struct {
	pos: int,
	kind: Token_Kind,
}

// A `{n,m}` repetition at `pos` could not be parsed or was out of range.
Invalid_Repetition :: struct {
	pos: int,
}

// A token of `kind` appeared at `pos` where no parse rule accepts it.
Invalid_Token :: struct {
	pos: int,
	kind: Token_Kind,
}

// The tokenizer produced an `.Invalid` token (malformed Unicode input).
Invalid_Unicode :: struct {
	pos: int,
}

// The pattern exceeded `common.MAX_CAPTURE_GROUPS` capture groups.
Too_Many_Capture_Groups :: struct {
	pos: int,
}

// The pattern ended where more input was required.
Unexpected_EOF :: struct {
	pos: int,
}

// Any error the parser can report; nil means success.
Error :: union {
	Expected_Token,
	Invalid_Repetition,
	Invalid_Token,
	Invalid_Unicode,
	Too_Many_Capture_Groups,
	Unexpected_EOF,
}
|
||||
|
||||
|
||||
// Parser carries the tokenizer, current lookahead token, and the running
// capture-group count while a pattern is parsed.
Parser :: struct {
	flags: common.Flags,
	t: Tokenizer,

	// Single-token lookahead; refreshed by `advance`.
	cur_token: Token,

	// Number of capture groups opened so far; also used to assign IDs.
	groups: int,
}
|
||||
|
||||
|
||||
// Scan the next token from the tokenizer into `p.cur_token`.
//
// Returns `Invalid_Unicode` if the tokenizer produced an `.Invalid` token,
// nil otherwise.
@require_results
advance :: proc(p: ^Parser) -> Error {
	p.cur_token = tokenizer.scan(&p.t)
	if p.cur_token.kind == .Invalid {
		// NOTE(review): the reported position is always 0 here rather than
		// the tokenizer's offset — confirm whether that is intentional.
		return Invalid_Unicode { pos = 0 }
	}
	return nil
}
|
||||
|
||||
// Consume the current token if it is of `kind`; otherwise report an
// `Expected_Token` error at the tokenizer's current offset.
expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) {
	if p.cur_token.kind != kind {
		return Expected_Token{
			pos = p.t.offset,
			kind = kind,
		}
	}
	advance(p) or_return
	return
}
|
||||
|
||||
/*
Parse a token in prefix ("null denotation") position — the Pratt parser's
handler for tokens that can begin an expression: literals, character
classes, wildcards, groups, anchors, word boundaries, and a leading `|`.

Inputs:
- p:     the parser state (tokenizer already advanced past `token`).
- token: the token to interpret in prefix position.

Returns the AST node for the construct (nil for empty constructs such as
an empty class) or an Error for malformed input.
*/
null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) {
	#partial switch token.kind {
	case .Rune:
		// Extract the first (and only) rune of the token's text.
		r: rune
		for ru in token.text {
			r = ru
			break
		}
		assert(r != 0, "Parsed an empty Rune token.")

		if .Case_Insensitive in p.flags {
			// Cased letters become a two-rune class of both casings.
			lower := unicode.to_lower(r)
			upper := unicode.to_upper(r)
			if lower != upper {
				node := new(Node_Rune_Class)
				append(&node.runes, lower)
				append(&node.runes, upper)
				return node, nil
			}
		}

		node := new(Node_Rune)
		node ^= { r }
		return node, nil

	case .Rune_Class:
		if len(token.text) == 0 {
			// Empty class matches nothing; emit no node.
			return nil, nil
		}

		node := new(Node_Rune_Class)

		#no_bounds_check for i := 0; i < len(token.text); /**/ {
			r, size := utf8.decode_rune(token.text[i:])
			// A leading `^` negates the class.
			if i == 0 && r == '^' {
				node.negating = true
				i += size
				continue
			}
			i += size

			assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")

			if r == '\\' {
				// Escaped metacharacter or shorthand class inside `[...]`.
				next_r, next_size := utf8.decode_rune(token.text[i:])
				i += next_size
				assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.")

				// @MetaCharacter
				// NOTE: These must be kept in sync with the tokenizer.
				switch next_r {
				case 'f': append(&node.runes, '\f')
				case 'n': append(&node.runes, '\n')
				case 'r': append(&node.runes, '\r')
				case 't': append(&node.runes, '\t')

				case 'd':
					append(&node.ranges, Rune_Class_Range{ '0', '9' })
				case 's':
					append(&node.runes, '\t')
					append(&node.runes, '\n')
					append(&node.runes, '\f')
					append(&node.runes, '\r')
					append(&node.runes, ' ')
				case 'w':
					append(&node.ranges, Rune_Class_Range{ '0', '9' })
					append(&node.ranges, Rune_Class_Range{ 'A', 'Z' })
					append(&node.runes, '_')
					append(&node.ranges, Rune_Class_Range{ 'a', 'z' })
				case 'D':
					// Complement of \d: everything below '0' and above '9'.
					append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
					append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) })
				case 'S':
					append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 })
					// \t and \n are adjacent.
					append(&node.runes, '\x0b') // Vertical Tab
					append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 })
					append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) })
				case 'W':
					// Complement of \w, split around '0'-'9', 'A'-'Z', '_', 'a'-'z'.
					append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 })
					append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 })
					append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 })
					append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 })
					append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) })
				case:
					// Unknown escape: treat the escaped rune literally.
					append(&node.runes, next_r)
				}
				continue
			}

			if r == '-' && len(node.runes) > 0 {
				// `a-z` style range: pair the previous rune with the next.
				next_r, next_size := utf8.decode_rune(token.text[i:])
				if next_size > 0 {
					last := pop(&node.runes)
					i += next_size

					append(&node.ranges, Rune_Class_Range{ last, next_r })
					continue
				}
			}

			append(&node.runes, r)
		}

		if .Case_Insensitive in p.flags {
			// These two loops cannot be in the form of `for x in y` because
			// they append to the data that they iterate over.
			length := len(node.runes)
			#no_bounds_check for i := 0; i < length; i += 1 {
				r := node.runes[i]
				lower := unicode.to_lower(r)
				upper := unicode.to_upper(r)

				if lower != upper {
					// Add the missing casing of each cased rune.
					if lower != r {
						append(&node.runes, lower)
					} else {
						append(&node.runes, upper)
					}
				}
			}

			length = len(node.ranges)
			#no_bounds_check for i := 0; i < length; i += 1 {
				range := &node.ranges[i]

				min_lower := unicode.to_lower(range.lower)
				max_lower := unicode.to_lower(range.upper)

				min_upper := unicode.to_upper(range.lower)
				max_upper := unicode.to_upper(range.upper)

				if min_lower != min_upper && max_lower != max_upper {
					// Normalize the range to lowercase and mirror it in uppercase.
					range.lower = min_lower
					range.upper = max_lower
					append(&node.ranges, Rune_Class_Range{ min_upper, max_upper })
				}
			}
		}

		result = node

	case .Wildcard:
		node := new(Node_Wildcard)
		result = node

	case .Open_Paren:
		// Because of the recursive nature of the token parser, we take the
		// group number first instead of afterwards, in order to construct
		// group matches from the outside in.
		p.groups += 1
		if p.groups == common.MAX_CAPTURE_GROUPS {
			return nil, Too_Many_Capture_Groups{ pos = token.pos }
		}
		this_group := p.groups

		node := new(Node_Group)
		node.capture = true
		node.capture_id = this_group

		node.inner = parse_expression(p, 0) or_return
		expect(p, .Close_Paren) or_return
		result = node
	case .Open_Paren_Non_Capture:
		node := new(Node_Group)
		node.inner = parse_expression(p, 0) or_return
		expect(p, .Close_Paren) or_return
		result = node
	case .Close_Paren:
		// An unmatched `)` in prefix position is treated as a literal rune.
		node := new(Node_Rune)
		node ^= { ')' }
		return node, nil

	case .Anchor_Start:
		node := new(Node_Anchor)
		node.start = true
		result = node
	case .Anchor_End:
		node := new(Node_Anchor)
		result = node
	case .Word_Boundary:
		node := new(Node_Word_Boundary)
		result = node
	case .Non_Word_Boundary:
		node := new(Node_Word_Boundary)
		node.non_word = true
		result = node

	case .Alternate:
		// A unary alternation with a left-side empty path, i.e. `|a`.
		right, right_err := parse_expression(p, left_binding_power(.Alternate))
		#partial switch specific in right_err {
		case Unexpected_EOF:
			// This token is a NOP, i.e. `|`.
			break
		case nil:
			break
		case:
			return nil, right_err
		}

		node := new(Node_Alternation)
		node.right = right
		result = node

	case .EOF:
		return nil, Unexpected_EOF{ pos = token.pos }

	case:
		return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
	}

	return
}
|
||||
|
||||
// Handle a token in infix/postfix position of the Pratt parser: the
// already-parsed `left` node is combined with what follows the token.
// Covers alternation, concatenation, all repetition forms, and the
// optional operators.
left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) {
	#partial switch token.kind {
	case .Alternate:
		if p.cur_token.kind == .Close_Paren {
			// `(a|)`
			// parse_expression will fail, so intervene here.
			node := new(Node_Alternation)
			node.left = left
			return node, nil
		}

		right, right_err := parse_expression(p, left_binding_power(.Alternate))

		#partial switch specific in right_err {
		case nil:
			break
		case Unexpected_EOF:
			// EOF is okay in an alternation; it's an edge case in the way of
			// expressing an optional such as `a|`.
			break
		case:
			return nil, right_err
		}

		node := new(Node_Alternation)
		node.left = left
		node.right = right
		result = node

	case .Concatenate:
		right := parse_expression(p, left_binding_power(.Concatenate)) or_return

		// There should be no need to check if right is Node_Concatenation, due
		// to how the parsing direction works.
		#partial switch specific in left {
		case ^Node_Concatenation:
			// Fold into the existing concatenation instead of nesting.
			append(&specific.nodes, right)
			result = specific
		case:
			node := new(Node_Concatenation)
			append(&node.nodes, left)
			append(&node.nodes, right)
			result = node
		}

	case .Repeat_Zero:
		node := new(Node_Repeat_Zero)
		node.inner = left
		result = node
	case .Repeat_Zero_Non_Greedy:
		node := new(Node_Repeat_Zero_Non_Greedy)
		node.inner = left
		result = node
	case .Repeat_One:
		node := new(Node_Repeat_One)
		node.inner = left
		result = node
	case .Repeat_One_Non_Greedy:
		node := new(Node_Repeat_One_Non_Greedy)
		node.inner = left
		result = node

	case .Repeat_N:
		// `token.text` holds the text between the braces of one of:
		// `{N}`, `{,M}`, `{N,}`, or `{N,M}`.
		node := new(Node_Repeat_N)
		node.inner = left

		comma := strings.index_byte(token.text, ',')

		switch comma {
		case -1: // {N}
			exact, ok := strconv.parse_u64_of_base(token.text, base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if exact == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}

			node.lower = cast(int)exact
			node.upper = cast(int)exact

		case 0: // {,M}
			upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if upper == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}

			// -1 marks the bound as absent.
			node.lower = -1
			node.upper = cast(int)upper

		case len(token.text) - 1: // {N,}
			lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
			if !ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}

			node.lower = cast(int)lower
			// -1 marks the bound as absent.
			node.upper = -1

		case: // {N,M}
			lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10)
			if !lower_ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10)
			if !upper_ok {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if lower > upper {
				return nil, Invalid_Repetition{ pos = token.pos }
			}
			if upper == 0 {
				return nil, Invalid_Repetition{ pos = token.pos }
			}

			node.lower = cast(int)lower
			node.upper = cast(int)upper
		}

		result = node

	case .Optional:
		node := new(Node_Optional)
		node.inner = left
		result = node
	case .Optional_Non_Greedy:
		node := new(Node_Optional_Non_Greedy)
		node.inner = left
		result = node

	case .EOF:
		return nil, Unexpected_EOF{ pos = token.pos }

	case:
		return nil, Invalid_Token{ pos = token.pos, kind = token.kind }
	}

	return
}
|
||||
|
||||
// Core Pratt-parser loop: consume tokens into an AST node until the next
// token binds no tighter than `rbp` (the right binding power of the caller).
parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) {
	tok := p.cur_token
	advance(p) or_return

	node := null_denotation(p, tok) or_return

	for {
		tok = p.cur_token
		if left_binding_power(tok.kind) <= rbp {
			break
		}
		advance(p) or_return
		node = left_denotation(p, tok, node) or_return
	}

	return node, nil
}
|
||||
|
||||
// Parse a pattern string into an AST.
//
// An empty pattern is represented by an empty group node; any tokenizer
// failure on the very first token is reported as invalid Unicode.
parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) {
	if str == "" {
		return new(Node_Group), nil
	}

	p: Parser
	p.flags = flags

	tokenizer.init(&p.t, str, flags)
	p.cur_token = tokenizer.scan(&p.t)

	if p.cur_token.kind == .Invalid {
		return nil, Invalid_Unicode { pos = 0 }
	}

	return parse_expression(&p, 0)
}
|
||||
450
core/text/regex/regex.odin
Normal file
450
core/text/regex/regex.odin
Normal file
@@ -0,0 +1,450 @@
|
||||
package regex

/*
	(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
	Made available under Odin's BSD-3 license.

	List of contributors:
		Feoramund: Initial implementation.
*/

import "core:text/regex/common"
import "core:text/regex/compiler"
import "core:text/regex/optimizer"
import "core:text/regex/parser"
import "core:text/regex/virtual_machine"

// Re-exported flag types; see `core:text/regex/common` for the flag list.
Flag :: common.Flag
Flags :: common.Flags
// Re-exported error types from the compilation pipeline.
Parser_Error :: parser.Error
Compiler_Error :: compiler.Error

// Errors specific to the `create*` procedures.
Creation_Error :: enum {
	None,
	// A `\` was supplied as the delimiter to `create_by_user`.
	Bad_Delimiter,
	// A pair of delimiters for `create_by_user` was not found.
	Expected_Delimiter,
	// An unknown letter was supplied to `create_by_user` after the last delimiter.
	Unknown_Flag,
}

// Any error that can result from creating a regular expression.
Error :: union #shared_nil {
	// An error that can occur in the pattern parsing phase.
	//
	// Most of these are regular expression syntax errors and are either
	// context-dependent as to what they mean or have self-explanatory names.
	Parser_Error,
	// An error that can occur in the pattern compiling phase.
	//
	// Of the two that can be returned, they have to do with exceeding the
	// limitations of the Virtual Machine.
	Compiler_Error,
	// An error that occurs only for `create_by_user`.
	Creation_Error,
}

/*
This struct corresponds to a set of string captures from a RegEx match.

`pos` will contain the start and end positions for each string in `groups`,
such that `str[pos[0][0]:pos[0][1]] == groups[0]`.
*/
Capture :: struct {
	pos: [][2]int,
	groups: []string,
}

/*
A compiled Regular Expression value, to be used with the `match_*` procedures.
*/
Regular_Expression :: struct {
	// Flags the expression was compiled with.
	flags: Flags `fmt:"-"`,
	// Data backing the `Rune_Class` opcodes in `program`.
	class_data: []virtual_machine.Rune_Class_Data `fmt:"-"`,
	// The compiled opcode stream executed by the virtual machine.
	program: []virtual_machine.Opcode `fmt:"-"`,
}
|
||||
|
||||
|
||||
/*
|
||||
Create a regular expression from a string pattern and a set of flags.
|
||||
|
||||
*Allocates Using Provided Allocators*
|
||||
|
||||
Inputs:
|
||||
- pattern: The pattern to compile.
|
||||
- flags: A `bit_set` of RegEx flags.
|
||||
- permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
|
||||
- temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)
|
||||
|
||||
Returns:
|
||||
- result: The regular expression.
|
||||
- err: An error, if one occurred.
|
||||
*/
|
||||
// Create a regular expression from a string pattern and a set of flags.
// Parsing, optimization, and compilation run in the temporary allocator;
// only the finished program and class data are copied to the permanent
// allocator. See the doc comment above for the full contract.
@require_results
create :: proc(
	pattern: string,
	flags: Flags = {},
	permanent_allocator := context.allocator,
	temporary_allocator := context.temp_allocator,
) -> (result: Regular_Expression, err: Error) {

	// For the sake of speed and simplicity, we first run all the intermediate
	// processes such as parsing and compilation through the temporary
	// allocator.
	program: [dynamic]virtual_machine.Opcode = ---
	class_data: [dynamic]parser.Rune_Class_Data = ---
	{
		context.allocator = temporary_allocator

		ast := parser.parse(pattern, flags) or_return

		// Optimization is on by default and opted out of per-expression.
		if .No_Optimization not_in flags {
			ast, _ = optimizer.optimize(ast, flags)
		}

		program, class_data = compiler.compile(ast, flags) or_return
	}

	// When that's successful, re-allocate all at once with the permanent
	// allocator so everything can be tightly packed.
	context.allocator = permanent_allocator

	result.flags = flags

	if len(class_data) > 0 {
		result.class_data = make([]virtual_machine.Rune_Class_Data, len(class_data))
	}
	for data, i in class_data {
		if len(data.runes) > 0 {
			result.class_data[i].runes = make([]rune, len(data.runes))
			copy(result.class_data[i].runes, data.runes[:])
		}
		if len(data.ranges) > 0 {
			result.class_data[i].ranges = make([]virtual_machine.Rune_Class_Range, len(data.ranges))
			copy(result.class_data[i].ranges, data.ranges[:])
		}
	}

	result.program = make([]virtual_machine.Opcode, len(program))
	copy(result.program, program[:])

	return
}
|
||||
|
||||
/*
|
||||
Create a regular expression from a delimited string pattern, such as one
|
||||
provided by users of a program or those found in a configuration file.
|
||||
|
||||
They are in the form of:
|
||||
|
||||
[DELIMITER] [regular expression] [DELIMITER] [flags]
|
||||
|
||||
For example, the following strings are valid:
|
||||
|
||||
/hellope/i
|
||||
#hellope#i
|
||||
•hellope•i
|
||||
つhellopeつi
|
||||
|
||||
The delimiter is determined by the very first rune in the string.
|
||||
The only restriction is that the delimiter cannot be `\`, as that rune is used
|
||||
to escape the delimiter if found in the middle of the string.
|
||||
|
||||
All runes after the closing delimiter will be parsed as flags:
|
||||
|
||||
- 'g': Global
|
||||
- 'm': Multiline
|
||||
- 'i': Case_Insensitive
|
||||
- 'x': Ignore_Whitespace
|
||||
- 'u': Unicode
|
||||
- 'n': No_Capture
|
||||
- '-': No_Optimization
|
||||
|
||||
|
||||
*Allocates Using Provided Allocators*
|
||||
|
||||
Inputs:
|
||||
- pattern: The delimited pattern with optional flags to compile.
|
||||
- str: The string to match against.
|
||||
- permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
|
||||
- temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)
|
||||
|
||||
Returns:
|
||||
- result: The regular expression.
|
||||
- err: An error, if one occurred.
|
||||
*/
|
||||
// Create a regular expression from a user-supplied delimited pattern of the
// form `<delim> pattern <delim> flags` (e.g. `/hellope/i`); see the doc
// comment above for the full syntax. The first rune chooses the delimiter.
@require_results
create_by_user :: proc(
	pattern: string,
	permanent_allocator := context.allocator,
	temporary_allocator := context.temp_allocator,
) -> (result: Regular_Expression, err: Error) {

	if len(pattern) == 0 {
		err = .Expected_Delimiter
		return
	}

	delimiter: rune
	start := -1 // Byte index just past the opening delimiter.
	end := -1   // Byte index of the closing delimiter.

	flags: Flags

	escaping: bool
	parse_loop: for r, i in pattern {
		if delimiter == 0 {
			// The first rune chooses the delimiter; `\` is reserved for
			// escaping and cannot itself delimit.
			if r == '\\' {
				err = .Bad_Delimiter
				return
			}
			delimiter = r
			continue parse_loop
		}

		if start == -1 {
			start = i
		}

		if escaping {
			// The rune after a `\` is never treated as the closing delimiter.
			escaping = false
			continue parse_loop
		}

		switch r {
		case '\\':
			escaping = true
		case delimiter:
			end = i
			break parse_loop
		}
	}

	if end == -1 {
		err = .Expected_Delimiter
		return
	}

	// `start` is also the size of the delimiter, which is why it's being added
	// to `end` here.
	for r in pattern[start + end:] {
		switch r {
		case 'g': flags += { .Global }
		case 'm': flags += { .Multiline }
		case 'i': flags += { .Case_Insensitive }
		case 'x': flags += { .Ignore_Whitespace }
		case 'u': flags += { .Unicode }
		case 'n': flags += { .No_Capture }
		case '-': flags += { .No_Optimization }
		case:
			err = .Unknown_Flag
			return
		}
	}

	return create(pattern[start:end], flags, permanent_allocator, temporary_allocator)
}
|
||||
|
||||
/*
|
||||
Match a regular expression against a string and allocate the results into the
|
||||
returned `capture` structure.
|
||||
|
||||
The resulting capture strings will be slices to the string `str`, not wholly
|
||||
copied strings, so they won't need to be individually deleted.
|
||||
|
||||
*Allocates Using Provided Allocators*
|
||||
|
||||
Inputs:
|
||||
- regex: The regular expression.
|
||||
- str: The string to match against.
|
||||
- permanent_allocator: The allocator to use for the capture results. (default: context.allocator)
|
||||
- temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)
|
||||
|
||||
Returns:
|
||||
- capture: The capture groups found in the string.
|
||||
- success: True if the regex matched the string.
|
||||
*/
|
||||
// Match `regex` against `str`, allocating the capture results with the
// permanent allocator. The VM itself runs in the temporary allocator.
// Capture strings are slices into `str`, not copies.
@require_results
match_and_allocate_capture :: proc(
	regex: Regular_Expression,
	str: string,
	permanent_allocator := context.allocator,
	temporary_allocator := context.temp_allocator,
) -> (capture: Capture, success: bool) {

	// Save slots filled in by the VM: one (start, end) index pair per
	// capture group, -1 marking an unset slot.
	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int

	{
		context.allocator = temporary_allocator

		vm := virtual_machine.create(regex.program, str)
		vm.class_data = regex.class_data

		// `run` is specialized on a compile-time Unicode/ASCII boolean.
		if .Unicode in regex.flags {
			saved, success = virtual_machine.run(&vm, true)
		} else {
			saved, success = virtual_machine.run(&vm, false)
		}
	}

	if saved != nil {
		context.allocator = permanent_allocator

		// First pass: count the groups that actually matched so the result
		// slices can be allocated exactly.
		num_groups := 0
		#no_bounds_check for i := 0; i < len(saved); i += 2 {
			a, b := saved[i], saved[i + 1]
			if a == -1 || b == -1 {
				continue
			}
			num_groups += 1
		}

		if num_groups > 0 {
			capture.groups = make([]string, num_groups)
			capture.pos = make([][2]int, num_groups)
			n := 0

			// Second pass: record each matched group as a slice of `str`.
			#no_bounds_check for i := 0; i < len(saved); i += 2 {
				a, b := saved[i], saved[i + 1]
				if a == -1 || b == -1 {
					continue
				}

				capture.groups[n] = str[a:b]
				capture.pos[n] = {a, b}
				n += 1
			}
		}
	}

	return
}
|
||||
|
||||
/*
|
||||
Match a regular expression against a string and save the capture results into
|
||||
the provided `capture` structure.
|
||||
|
||||
The resulting capture strings will be slices to the string `str`, not wholly
|
||||
copied strings, so they won't need to be individually deleted.
|
||||
|
||||
*Allocates Using Provided Allocator*
|
||||
|
||||
Inputs:
|
||||
- regex: The regular expression.
|
||||
- str: The string to match against.
|
||||
- capture: A pointer to a Capture structure with `groups` and `pos` already allocated.
|
||||
- temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)
|
||||
|
||||
Returns:
|
||||
- num_groups: The number of capture groups set into `capture`.
|
||||
- success: True if the regex matched the string.
|
||||
*/
|
||||
// Match `regex` against `str`, writing capture results into the
// caller-provided `capture` (see `preallocate_capture`). The VM runs in the
// temporary allocator; this procedure itself allocates nothing.
// Capture strings are slices into `str`, not copies.
//
// Returns the number of groups written and whether the match succeeded.
//
// BUGFIX: the group count was previously accumulated into a local `n` and
// never copied into the named result `num_groups`, so callers always saw 0
// groups. The count is now accumulated directly into `num_groups`.
@require_results
match_with_preallocated_capture :: proc(
	regex: Regular_Expression,
	str: string,
	capture: ^Capture,
	temporary_allocator := context.temp_allocator,
) -> (num_groups: int, success: bool) {

	assert(capture != nil, "Pre-allocated RegEx capture must not be nil.")
	assert(len(capture.groups) >= common.MAX_CAPTURE_GROUPS,
		"Pre-allocated RegEx capture `groups` must be at least 10 elements long.")
	assert(len(capture.pos) >= common.MAX_CAPTURE_GROUPS,
		"Pre-allocated RegEx capture `pos` must be at least 10 elements long.")

	// Save slots filled in by the VM: one (start, end) index pair per
	// capture group, -1 marking an unset slot.
	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int

	{
		context.allocator = temporary_allocator

		vm := virtual_machine.create(regex.program, str)
		vm.class_data = regex.class_data

		// `run` is specialized on a compile-time Unicode/ASCII boolean.
		if .Unicode in regex.flags {
			saved, success = virtual_machine.run(&vm, true)
		} else {
			saved, success = virtual_machine.run(&vm, false)
		}
	}

	if saved != nil {
		#no_bounds_check for i := 0; i < len(saved); i += 2 {
			a, b := saved[i], saved[i + 1]
			if a == -1 || b == -1 {
				continue
			}

			// Accumulate into the named result so the caller receives the
			// true group count.
			capture.groups[num_groups] = str[a:b]
			capture.pos[num_groups] = {a, b}
			num_groups += 1
		}
	}

	return
}
|
||||
|
||||
// Overloaded `match` entry point; resolves to either the allocating or the
// pre-allocated capture variant based on the arguments supplied.
match :: proc {
	match_and_allocate_capture,
	match_with_preallocated_capture,
}
|
||||
|
||||
/*
|
||||
Allocate a `Capture` in advance for use with `match`. This can save some time
|
||||
if you plan on performing several matches at once and only need the results
|
||||
between matches.
|
||||
|
||||
Inputs:
|
||||
- allocator: (default: context.allocator)
|
||||
|
||||
Returns:
|
||||
- result: The `Capture` with the maximum number of groups allocated.
|
||||
*/
|
||||
// Allocate a `Capture` sized for the maximum number of groups, for reuse
// across repeated calls to `match`.
@require_results
preallocate_capture :: proc(allocator := context.allocator) -> (result: Capture) {
	context.allocator = allocator

	result.groups = make([]string, common.MAX_CAPTURE_GROUPS)
	result.pos = make([][2]int, common.MAX_CAPTURE_GROUPS)

	return
}
|
||||
|
||||
/*
|
||||
Free all data allocated by the `create*` procedures.
|
||||
|
||||
*Frees Using Provided Allocator*
|
||||
|
||||
Inputs:
|
||||
- regex: A regular expression.
|
||||
- allocator: (default: context.allocator)
|
||||
*/
|
||||
// Free everything allocated by the `create*` procedures for `regex`,
// using the same allocator that was passed as `permanent_allocator`.
destroy_regex :: proc(regex: Regular_Expression, allocator := context.allocator) {
	context.allocator = allocator

	// Free each class's backing storage before the class slice itself.
	for data in regex.class_data {
		delete(data.runes)
		delete(data.ranges)
	}
	delete(regex.class_data)

	delete(regex.program)
}
|
||||
|
||||
/*
|
||||
Free all data allocated by the `match_and_allocate_capture` procedure.
|
||||
|
||||
*Frees Using Provided Allocator*
|
||||
|
||||
Inputs:
|
||||
- capture: A Capture.
|
||||
- allocator: (default: context.allocator)
|
||||
*/
|
||||
// Free the slices allocated by `match_and_allocate_capture` or
// `preallocate_capture`. The group strings are views into the matched
// string and need no freeing of their own.
destroy_capture :: proc(capture: Capture, allocator := context.allocator) {
	context.allocator = allocator

	delete(capture.pos)
	delete(capture.groups)
}
|
||||
|
||||
// Overloaded `destroy` entry point for both regular expressions and
// captures.
destroy :: proc {
	destroy_regex,
	destroy_capture,
}
|
||||
357
core/text/regex/tokenizer/tokenizer.odin
Normal file
357
core/text/regex/tokenizer/tokenizer.odin
Normal file
@@ -0,0 +1,357 @@
|
||||
package regex_tokenizer

/*
	(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
	Made available under Odin's BSD-3 license.

	List of contributors:
		Feoramund: Initial implementation.
*/

import "core:text/regex/common"
import "core:unicode/utf8"

// The kinds of tokens produced by `scan`.
Token_Kind :: enum {
	Invalid,
	EOF,

	Rune,
	Wildcard,

	Alternate,

	// Inserted automatically between adjacent terms; never written by users.
	Concatenate,

	Repeat_Zero,
	Repeat_Zero_Non_Greedy,
	Repeat_One,
	Repeat_One_Non_Greedy,

	Repeat_N,

	Optional,
	Optional_Non_Greedy,

	Rune_Class,

	Open_Paren,
	Open_Paren_Non_Capture,
	Close_Paren,

	Anchor_Start,
	Anchor_End,

	Word_Boundary,
	Non_Word_Boundary,
}

// A single lexical token: its kind, associated text (if any), and the byte
// position in the pattern it was scanned at.
Token :: struct {
	kind: Token_Kind,
	text: string,
	pos: int,
}

// Tokenizer state over a pattern string.
Tokenizer :: struct {
	flags: common.Flags,
	src: string,

	// ch is the most recently decoded rune; -1 signals end of input.
	ch: rune,
	// offset is the byte index of `ch`; read_offset is the index just past it.
	offset: int,
	read_offset: int,

	// Used to decide where Concatenate tokens are inserted.
	last_token_kind: Token_Kind,
	// A token held back so that a Concatenate can be emitted first.
	held_token: Token,
	// A pending decode error, surfaced by `scan` as an Invalid token.
	error_state: Error,
	// Tracks parenthesis nesting; `#` comments only apply at depth 0.
	paren_depth: int,
}

// Errors that can arise while decoding the pattern text.
Error :: enum {
	None,
	Illegal_Null_Character,
	Illegal_Codepoint,
	Illegal_Byte_Order_Mark,
}
|
||||
|
||||
// Initialize the tokenizer over `str` with `flags` and prime the first rune.
// A decode error on the first rune is held in `error_state` until `scan`.
init :: proc(t: ^Tokenizer, str: string, flags: common.Flags) {
	t.src = str
	t.flags = flags
	t.error_state = advance_rune(t)
}
|
||||
|
||||
// Return the byte `offset` positions past the read cursor without
// advancing, or 0 when that position is past the end of the source.
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	idx := t.read_offset + offset
	if idx >= len(t.src) {
		return 0
	}
	return t.src[idx]
}
|
||||
|
||||
// Decode the next rune of `t.src` into `t.ch`, advancing `offset` and
// `read_offset`. At end of input `t.ch` becomes -1. NUL bytes, invalid
// UTF-8, and an interior byte order mark are recorded in `t.error_state`
// and returned.
advance_rune :: proc(t: ^Tokenizer) -> (err: Error) {
	// A previously recorded error is sticky until `scan` clears it.
	if t.error_state != nil {
		return t.error_state
	}

	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		// Assume a single-byte rune, then fall back to full decoding.
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			err = .Illegal_Null_Character
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				err = .Illegal_Codepoint
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				// A BOM is only tolerated at the very start of the string.
				err = .Illegal_Byte_Order_Mark
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		t.ch = -1
	}

	t.error_state = err

	return
}
|
||||
|
||||
// Scan the interior of a `[...]` character class, starting just after the
// opening bracket. Returns the text between the brackets (exclusive of
// both), or false if the class is unterminated or decoding fails.
@require_results
scan_class :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset

	for {
		advance_rune(t)
		if t.ch == -1 || t.error_state != nil {
			return "", false
		}

		if t.ch == '\\' {
			// Skip the escaped rune so an escaped `]` does not terminate
			// the class.
			advance_rune(t)
			continue
		}

		if t.ch == ']' {
			return t.src[start:t.offset], true
		}
	}

	unreachable()
}
|
||||
|
||||
// Scan the interior of a `{...}` repetition specifier, starting just after
// the opening brace. Returns the text between the braces (exclusive of
// both), or false if the closing brace is never found.
@require_results
scan_repeat :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
	start := t.read_offset

	for {
		advance_rune(t)
		if t.ch == -1 {
			return "", false
		}
		if t.ch == '}' {
			return t.src[start:t.offset], true
		}
	}

	unreachable()
}
|
||||
|
||||
// Consume a trailing `?` (the non-greedy modifier) if present, reporting
// whether one was consumed.
@require_results
scan_non_greedy :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) != '?' {
		return false
	}
	advance_rune(t)
	return true
}
|
||||
|
||||
// Consume a `#` comment through its terminating newline, handling UNIX
// (`\n`), Mac (`\r`), and Windows (`\r\n`) line endings, or end of input.
scan_comment :: proc(t: ^Tokenizer) {
	for {
		advance_rune(t)
		switch t.ch {
		case -1:
			return
		case '\n':
			// UNIX newline.
			advance_rune(t)
			return
		case '\r':
			// Mac newline.
			advance_rune(t)
			if t.ch == '\n' {
				// Windows newline.
				advance_rune(t)
			}
			return
		}
	}
}
|
||||
|
||||
// Consume a `?:` sequence (marking a non-capturing group) if it follows
// the current position, reporting whether one was consumed.
@require_results
scan_non_capture_group :: proc(t: ^Tokenizer) -> bool {
	if peek_byte(t) != '?' || peek_byte(t, 1) != ':' {
		return false
	}
	advance_rune(t)
	advance_rune(t)
	return true
}
|
||||
|
||||
// Produce the next token from the pattern. Handles escape sequences,
// metacharacter classes, operators, comments and whitespace skipping
// (under Ignore_Whitespace), and the automatic insertion of Concatenate
// tokens between adjacent terms.
@require_results
scan :: proc(t: ^Tokenizer) -> (token: Token) {
	kind: Token_Kind
	lit: string
	pos := t.offset

	defer {
		t.last_token_kind = token.kind
	}

	// A decode error from an earlier advance surfaces as a single Invalid
	// token, then is cleared.
	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	// A token held back by Concatenate insertion is emitted first.
	if t.held_token != {} {
		popped := t.held_token
		t.held_token = {}

		return popped
	}

	ch_loop: for {
		switch t.ch {
		case -1:
			return { .EOF, "", pos }

		case '\\':
			advance_rune(t)

			if t.ch == -1 {
				return { .EOF, "", pos }
			}

			pos = t.offset

			// @MetaCharacter
			// NOTE: These must be kept in sync with the compiler.
			DIGIT_CLASS :: "0-9"
			SPACE_CLASS :: "\t\n\f\r "
			WORD_CLASS :: "0-9A-Z_a-z"

			switch t.ch {
			case 'b': kind = .Word_Boundary
			case 'B': kind = .Non_Word_Boundary

			case 'f': kind = .Rune; lit = "\f"
			case 'n': kind = .Rune; lit = "\n"
			case 'r': kind = .Rune; lit = "\r"
			case 't': kind = .Rune; lit = "\t"

			case 'd': kind = .Rune_Class; lit = DIGIT_CLASS
			case 's': kind = .Rune_Class; lit = SPACE_CLASS
			case 'w': kind = .Rune_Class; lit = WORD_CLASS
			case 'D': kind = .Rune_Class; lit = "^" + DIGIT_CLASS
			case 'S': kind = .Rune_Class; lit = "^" + SPACE_CLASS
			case 'W': kind = .Rune_Class; lit = "^" + WORD_CLASS
			case:
				// Any other escaped rune is taken literally.
				kind = .Rune
				lit = t.src[t.offset:t.read_offset]
			}

		case '.':
			kind = .Wildcard

		case '|': kind = .Alternate

		case '*': kind = .Repeat_Zero_Non_Greedy if scan_non_greedy(t) else .Repeat_Zero
		case '+': kind = .Repeat_One_Non_Greedy if scan_non_greedy(t) else .Repeat_One
		case '?': kind = .Optional_Non_Greedy if scan_non_greedy(t) else .Optional

		case '[':
			if text, ok := scan_class(t); ok {
				kind = .Rune_Class
				lit = text
			} else {
				// Unterminated class; treated as end of input.
				kind = .EOF
			}

		case '{':
			if text, ok := scan_repeat(t); ok {
				kind = .Repeat_N
				lit = text
			} else {
				// Unterminated repetition; treated as end of input.
				kind = .EOF
			}

		case '(':
			kind = .Open_Paren_Non_Capture if scan_non_capture_group(t) else .Open_Paren
			t.paren_depth += 1
		case ')':
			kind = .Close_Paren
			t.paren_depth -= 1

		case '^': kind = .Anchor_Start
		case '$':
			kind = .Anchor_End

		case:
			if .Ignore_Whitespace in t.flags {
				switch t.ch {
				case ' ', '\r', '\n', '\t', '\f':
					advance_rune(t)
					continue ch_loop
				case:
					break
				}
			}
			// `#` comments are only recognized outside of parentheses.
			if t.ch == '#' && t.paren_depth == 0 {
				scan_comment(t)
				continue ch_loop
			}

			kind = .Rune
			lit = t.src[t.offset:t.read_offset]
		}

		break ch_loop
	}

	// An error encountered mid-token also surfaces as an Invalid token.
	if t.error_state != nil {
		t.error_state = nil
		return { .Invalid, "", pos }
	}

	advance_rune(t)

	// The following set of rules dictate where Concatenate tokens are
	// automatically inserted.
	#partial switch kind {
	case
		.Close_Paren,
		.Alternate,
		.Optional, .Optional_Non_Greedy,
		.Repeat_Zero, .Repeat_Zero_Non_Greedy,
		.Repeat_One, .Repeat_One_Non_Greedy,
		.Repeat_N:
		// Never prepend a Concatenate before these tokens.
		break
	case:
		#partial switch t.last_token_kind {
		case
			.Invalid,
			.Open_Paren, .Open_Paren_Non_Capture,
			.Alternate:
			// Never prepend a Concatenate token when the _last token_ was one
			// of these.
			break
		case:
			// Hold the real token and emit a Concatenate ahead of it.
			t.held_token = { kind, lit, pos }
			return { .Concatenate, "", pos }
		}
	}

	return { kind, lit, pos }
}
|
||||
175
core/text/regex/virtual_machine/doc.odin
Normal file
175
core/text/regex/virtual_machine/doc.odin
Normal file
@@ -0,0 +1,175 @@
|
||||
/*
|
||||
package regex_vm implements a threaded virtual machine for interpreting
|
||||
regular expressions, based on the designs described by Russ Cox and attributed
|
||||
to both Ken Thompson and Rob Pike.
|
||||
|
||||
The virtual machine executes all threads in lock step, i.e. the string pointer
|
||||
does not advance until all threads have finished processing the current rune.
|
||||
The algorithm does not look backwards.
|
||||
|
||||
Threads merge when splitting or jumping to positions already visited by another
|
||||
thread, based on the observation that each thread having visited one PC
|
||||
(Program Counter) state will execute identically to the previous thread.
|
||||
|
||||
Each thread keeps a save state of its capture groups, and thread priority is
|
||||
used to allow higher precedence operations to complete first with correct save
|
||||
states, such as greedy versus non-greedy repetition.
|
||||
|
||||
For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
|
||||
|
||||
|
||||
**Implementation Details:**
|
||||
|
||||
- Each opcode is 8 bits in size, and most instructions have no operands.
|
||||
|
||||
- All operands larger than `u8` are read in system endian order.
|
||||
|
||||
- Jump and Split instructions operate on absolute positions in `u16` operands.
|
||||
|
||||
- Classes such as `[0-9]` are stored in a RegEx-specific slice of structs which
|
||||
are then dereferenced by a `u8` index from the `Rune_Class` instructions.
|
||||
|
||||
- Each Byte and Rune opcode have their operands stored inline after the opcode,
|
||||
sized `u8` and `i32` respectively.
|
||||
|
||||
- A bitmap is used to determine which PC positions are occupied by a thread to
|
||||
perform merging. The bitmap is cleared with every new frame.
|
||||
|
||||
- The VM supports two modes: ASCII and Unicode, decided by a compile-time
|
||||
boolean constant argument provided to `run`. The procedure differs only in
|
||||
string decoding. This was done for the sake of performance.
|
||||
|
||||
- No allocations are ever freed; the VM expects an arena or temporary allocator
|
||||
to be used in the context preceding it.
|
||||
|
||||
|
||||
**Opcode Reference:**
|
||||
|
||||
(0x00) Match
|
||||
|
||||
The terminal opcode which ends a thread. This always comes at the end of
|
||||
the program.
|
||||
|
||||
(0x01) Match_And_Exit
|
||||
|
||||
A modified version of Match which stops the virtual machine entirely. It is
|
||||
only compiled for `No_Capture` expressions, as those expressions do not
|
||||
need to determine which thread may have saved the most appropriate capture
|
||||
groups.
|
||||
|
||||
(0x02) Byte
|
||||
|
||||
Consumes one byte from the text using its operand, which is also a byte.
|
||||
|
||||
(0x03) Rune
|
||||
|
||||
Consumes one Unicode codepoint from the text using its operand, which is
|
||||
four bytes long in a system-dependent endian order.
|
||||
|
||||
(0x04) Rune_Class
|
||||
|
||||
Consumes one character (which may be an ASCII byte or Unicode codepoint,
|
||||
wholly dependent on which mode the virtual machine is running in) from the
|
||||
text.
|
||||
|
||||
The actual data storing what runes and ranges of runes apply to the class
|
||||
are stored alongside the program in the Regular_Expression structure and
|
||||
the operand for this opcode is a single byte which indexes into a
|
||||
collection of these data structures.
|
||||
|
||||
(0x05) Rune_Class_Negated
|
||||
|
||||
A modified version of Rune_Class that functions the same, save for how it
|
||||
returns the opposite of what Rune_Class matches.
|
||||
|
||||
(0x06) Wildcard
|
||||
|
||||
Consumes one byte or one Unicode codepoint, depending on the VM mode.
|
||||
|
||||
(0x07) Jump
|
||||
|
||||
Sets the Program Counter of a VM thread to the operand, which is a u16.
|
||||
This opcode is used to implement Alternation (coming at the end of the left
|
||||
choice) and Repeat_Zero (to cause the thread to loop backwards).
|
||||
|
||||
(0x08) Split
|
||||
|
||||
Spawns a new thread for the X operand and causes the current thread to jump
|
||||
to the Y operand. This opcode is used to implement Alternation, all the
|
||||
Repeat variations, and the Optional nodes.
|
||||
|
||||
Splitting threads is how the virtual machine is able to execute optional
|
||||
control flow paths, letting it evaluate different possible ways to match
|
||||
text.
|
||||
|
||||
(0x09) Save
|
||||
|
||||
Saves the current string index to a slot on the thread dictated by the
|
||||
operand. These values will be used later to reconstruct capture groups.
|
||||
|
||||
(0x0A) Assert_Start
|
||||
|
||||
Asserts that the thread is at the beginning of a string.
|
||||
|
||||
(0x0B) Assert_End
|
||||
|
||||
Asserts that the thread is at the end of a string.
|
||||
|
||||
(0x0C) Assert_Word_Boundary
|
||||
|
||||
Asserts that the thread is on a word boundary, which can be the start or
|
||||
end of the text. This examines both the current rune and the next rune.
|
||||
|
||||
(0x0D) Assert_Non_Word_Boundary
|
||||
|
||||
A modified version of Assert_Word_Boundary that returns the opposite value.
|
||||
|
||||
(0x0E) Multiline_Open
|
||||
|
||||
This opcode is compiled in only when the `Multiline` flag is present, and
|
||||
it replaces both `^` and `$` text anchors.
|
||||
|
||||
It asserts that either the current thread is on one of the string
|
||||
boundaries, or it consumes a `\n` or `\r` character.
|
||||
|
||||
If a `\r` character is consumed, the PC will be advanced to the sibling
|
||||
`Multiline_Close` opcode to optionally consume a `\n` character on the next
|
||||
frame.
|
||||
|
||||
(0x0F) Multiline_Close
|
||||
|
||||
This opcode is always present after `Multiline_Open`.
|
||||
|
||||
It handles consuming the second half of a complete newline, if necessary.
|
||||
For example, Windows newlines are represented by the characters `\r\n`,
|
||||
whereas UNIX newlines are `\n` and Macintosh newlines are `\r`.
|
||||
|
||||
(0x10) Wait_For_Byte
|
||||
(0x11) Wait_For_Rune
|
||||
(0x12) Wait_For_Rune_Class
|
||||
(0x13) Wait_For_Rune_Class_Negated
|
||||
|
||||
These opcodes are an optimization around restarting threads on failed
|
||||
matches when the beginning to a pattern is predictable and the Global flag
|
||||
is set.
|
||||
|
||||
They will cause the VM to wait for the next rune to match before splitting,
|
||||
as would happen in the un-optimized version.
|
||||
|
||||
(0x14) Match_All_And_Escape
|
||||
|
||||
This opcode is an optimized version of `.*$` or `.+$` that causes the
|
||||
active thread to immediately work on escaping the program by following all
|
||||
Jumps out to the end.
|
||||
|
||||
While running through the rest of the program, the thread will trigger on
|
||||
every Save instruction it passes to store the length of the string.
|
||||
|
||||
This way, any time a program hits one of these `.*$` constructs, the
|
||||
virtual machine can exit early, vastly improving processing times.
|
||||
|
||||
Be aware, this opcode is not compiled in if the `Multiline` flag is on, as
|
||||
the meaning of `$` changes with that flag.
|
||||
|
||||
*/
|
||||
package regex_vm
|
||||
81
core/text/regex/virtual_machine/util.odin
Normal file
81
core/text/regex/virtual_machine/util.odin
Normal file
@@ -0,0 +1,81 @@
|
||||
package regex_vm
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
// Streaming iterator over a compiled program's instructions.
// `pc` is the byte offset of the next opcode to be yielded.
Opcode_Iterator :: struct {
	code: Program,
	pc: int,
}
|
||||
|
||||
// iterate_opcodes yields the opcode at the iterator's current position along
// with its byte offset, then advances past the opcode and its inline
// operands. `ok` is false once the end of the program is reached.
iterate_opcodes :: proc(iter: ^Opcode_Iterator) -> (opcode: Opcode, pc: int, ok: bool) {
	if iter.pc >= len(iter.code) {
		return
	}

	opcode = iter.code[iter.pc]
	pc = iter.pc
	ok = true

	// Advance by the opcode byte itself plus however many operand bytes
	// this instruction carries.
	operand_size: int
	switch opcode {
	case .Match, .Match_And_Exit, .Wildcard,
	     .Assert_Start, .Assert_End,
	     .Assert_Word_Boundary, .Assert_Non_Word_Boundary,
	     .Multiline_Open, .Multiline_Close,
	     .Match_All_And_Escape:
		operand_size = 0
	case .Byte, .Rune_Class, .Rune_Class_Negated, .Save,
	     .Wait_For_Byte, .Wait_For_Rune_Class, .Wait_For_Rune_Class_Negated:
		operand_size = size_of(u8)
	case .Rune, .Wait_For_Rune:
		operand_size = size_of(rune)
	case .Jump:
		operand_size = size_of(u16)
	case .Split:
		operand_size = 2 * size_of(u16)
	case:
		panic("Invalid opcode found in RegEx program.")
	}

	iter.pc += size_of(Opcode) + operand_size
	return
}
|
||||
|
||||
// opcode_to_name returns a human-readable name for `opcode`, used by the
// debug output paths. Unknown values yield "<UNKNOWN>".
opcode_to_name :: proc(opcode: Opcode) -> (str: string) {
	switch opcode {
	case .Match: str = "Match"
	case .Match_And_Exit: str = "Match_And_Exit"
	case .Byte: str = "Byte"
	case .Rune: str = "Rune"
	case .Rune_Class: str = "Rune_Class"
	case .Rune_Class_Negated: str = "Rune_Class_Negated"
	case .Wildcard: str = "Wildcard"
	case .Jump: str = "Jump"
	case .Split: str = "Split"
	case .Save: str = "Save"
	case .Assert_Start: str = "Assert_Start"
	case .Assert_End: str = "Assert_End"
	case .Assert_Word_Boundary: str = "Assert_Word_Boundary"
	case .Assert_Non_Word_Boundary: str = "Assert_Non_Word_Boundary"
	case .Multiline_Open: str = "Multiline_Open"
	case .Multiline_Close: str = "Multiline_Close"
	case .Wait_For_Byte: str = "Wait_For_Byte"
	case .Wait_For_Rune: str = "Wait_For_Rune"
	case .Wait_For_Rune_Class: str = "Wait_For_Rune_Class"
	case .Wait_For_Rune_Class_Negated: str = "Wait_For_Rune_Class_Negated"
	case .Match_All_And_Escape: str = "Match_All_And_Escape"
	case: str = "<UNKNOWN>"
	}

	return
}
|
||||
646
core/text/regex/virtual_machine/virtual_machine.odin
Normal file
646
core/text/regex/virtual_machine/virtual_machine.odin
Normal file
@@ -0,0 +1,646 @@
|
||||
package regex_vm
|
||||
|
||||
/*
|
||||
(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
List of contributors:
|
||||
Feoramund: Initial implementation.
|
||||
*/
|
||||
|
||||
import "base:intrinsics"
|
||||
@require import "core:io"
|
||||
import "core:slice"
|
||||
import "core:text/regex/common"
|
||||
import "core:text/regex/parser"
|
||||
import "core:unicode/utf8"
|
||||
|
||||
Rune_Class_Range :: parser.Rune_Class_Range
|
||||
|
||||
// NOTE: This structure differs intentionally from the one in `regex/parser`,
// as this data doesn't need to be a dynamic array once it hits the VM.
//
// A character class as seen by the VM: a set of individual runes plus a set
// of inclusive rune ranges, indexed by a class opcode's u8 operand.
Rune_Class_Data :: struct {
	runes: []rune,
	ranges: []Rune_Class_Range,
}
|
||||
|
||||
// Opcode is the instruction set of the regex virtual machine. Each opcode is
// one byte, optionally followed inline in the program stream by the operand
// bytes noted below.
Opcode :: enum u8 {
	//                                 | [ operands ]
	Match                       = 0x00, // |
	Match_And_Exit              = 0x01, // |
	Byte                        = 0x02, // | u8
	Rune                        = 0x03, // | i32
	Rune_Class                  = 0x04, // | u8
	Rune_Class_Negated          = 0x05, // | u8
	Wildcard                    = 0x06, // |
	Jump                        = 0x07, // | u16
	Split                       = 0x08, // | u16, u16
	Save                        = 0x09, // | u8
	Assert_Start                = 0x0A, // |
	Assert_End                  = 0x0B, // |
	Assert_Word_Boundary        = 0x0C, // |
	Assert_Non_Word_Boundary    = 0x0D, // |
	Multiline_Open              = 0x0E, // |
	Multiline_Close             = 0x0F, // |
	Wait_For_Byte               = 0x10, // | u8
	Wait_For_Rune               = 0x11, // | i32
	Wait_For_Rune_Class         = 0x12, // | u8
	Wait_For_Rune_Class_Negated = 0x13, // | u8
	Match_All_And_Escape        = 0x14, // |
}
|
||||
|
||||
// One VM thread: a program counter plus its own capture-group buffer
// (pairs of start/end string offsets, hence the factor of 2).
Thread :: struct {
	pc: int,
	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int,
}
|
||||
|
||||
// A compiled regex program: a flat byte stream of opcodes with their inline
// operands.
Program :: []Opcode

// Machine is the complete state of one regex VM execution over `memory`.
Machine :: struct {
	// Program state
	memory: string,
	class_data: []Rune_Class_Data,
	code: Program,

	// Thread state
	top_thread: int,                  // number of live threads in `next_threads`
	threads: [^]Thread,               // threads being stepped this frame
	next_threads: [^]Thread,          // threads queued for the next frame

	// The busy map is used to merge threads based on their program counters.
	busy_map: []u64,

	// Global state
	string_pointer: int,              // byte offset of `current_rune` in `memory`

	current_rune: rune,
	current_rune_size: int,
	next_rune: rune,
	next_rune_size: int,
}
|
||||
|
||||
|
||||
// @MetaCharacter
// NOTE: This must be kept in sync with the compiler & tokenizer.
//
// is_word_class reports whether `r` belongs to the `\w` metacharacter set:
// ASCII letters, digits, and underscore.
is_word_class :: #force_inline proc "contextless" (r: rune) -> bool {
	switch r {
	case 'a'..='z', 'A'..='Z', '0'..='9', '_':
		return true
	}
	return false
}
|
||||
|
||||
// set_busy_map marks program counter `pc` as occupied for this frame.
// Returns true if the bit was newly set, false if `pc` was already claimed
// by another thread (the caller should then drop the duplicate thread).
set_busy_map :: #force_inline proc "contextless" (vm: ^Machine, pc: int) -> bool #no_bounds_check {
	word := cast(u64)pc >> 6
	mask := u64(1) << (cast(u64)pc & 0x3F)
	was_set := vm.busy_map[word] & mask != 0
	// Setting an already-set bit is a no-op, so this is safe either way.
	vm.busy_map[word] |= mask
	return !was_set
}
|
||||
|
||||
// check_busy_map reports whether a thread has already claimed program
// counter `pc` during the current frame, without modifying the map.
check_busy_map :: #force_inline proc "contextless" (vm: ^Machine, pc: int) -> bool #no_bounds_check {
	word := cast(u64)pc >> 6
	mask := u64(1) << (cast(u64)pc & 0x3F)
	return vm.busy_map[word] & mask != 0
}
|
||||
|
||||
// add_thread schedules a thread at `pc` into `vm.next_threads`.
//
// Control-flow and zero-width opcodes (Jump, Split, Save, the assertions)
// are resolved immediately and recursively here, so that only opcodes which
// consume input are ever queued for the next frame. The busy map merges
// threads that land on the same program counter within one frame.
add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: int) #no_bounds_check {
	if check_busy_map(vm, pc) {
		return
	}

	saved := saved
	pc := pc

	resolution_loop: for {
		if !set_busy_map(vm, pc) {
			return
		}

		when common.ODIN_DEBUG_REGEX {
			io.write_string(common.debug_stream, "Thread [PC:")
			common.write_padded_hex(common.debug_stream, pc, 4)
			io.write_string(common.debug_stream, "] thinking about ")
			io.write_string(common.debug_stream, opcode_to_name(vm.code[pc]))
			io.write_rune(common.debug_stream, '\n')
		}

		#partial switch vm.code[pc] {
		case .Jump:
			pc = cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode)])
			continue

		case .Split:
			jmp_x := cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode)])
			jmp_y := cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode) + size_of(u16)])

			add_thread(vm, saved, jmp_x)
			pc = jmp_y
			continue

		case .Save:
			// Copy-on-write: each Save clones the capture buffer so sibling
			// threads cannot clobber each other's capture groups.
			new_saved := new([2 * common.MAX_CAPTURE_GROUPS]int)
			new_saved ^= saved^
			saved = new_saved

			index := vm.code[pc + size_of(Opcode)]
			sp := vm.string_pointer+vm.current_rune_size
			saved[index] = sp

			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "Thread [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "] saving state: (slot ")
				io.write_int(common.debug_stream, cast(int)index)
				io.write_string(common.debug_stream, " = ")
				io.write_int(common.debug_stream, sp)
				io.write_string(common.debug_stream, ")\n")
			}

			pc += size_of(Opcode) + size_of(u8)
			continue

		case .Assert_Start:
			sp := vm.string_pointer+vm.current_rune_size
			if sp == 0 {
				pc += size_of(Opcode)
				continue
			}
			// Assertion failed: fall through and drop the thread.
		case .Assert_End:
			sp := vm.string_pointer+vm.current_rune_size
			if sp == len(vm.memory) {
				pc += size_of(Opcode)
				continue
			}
		case .Multiline_Open:
			sp := vm.string_pointer+vm.current_rune_size
			if sp == 0 || sp == len(vm.memory) {
				if vm.next_rune == '\r' || vm.next_rune == '\n' {
					// The VM is currently on a newline at the string boundary,
					// so consume the newline next frame.
					when common.ODIN_DEBUG_REGEX {
						io.write_string(common.debug_stream, "*** New thread added [PC:")
						common.write_padded_hex(common.debug_stream, pc, 4)
						io.write_string(common.debug_stream, "]\n")
					}
					vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
					vm.top_thread += 1
				} else {
					// Skip the `Multiline_Close` opcode.
					pc += 2 * size_of(Opcode)
					continue
				}
			} else {
				// Not on a string boundary.
				// Try to consume a newline next frame in the other opcode loop.
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
				vm.top_thread += 1
			}
		case .Assert_Word_Boundary:
			sp := vm.string_pointer+vm.current_rune_size
			if sp == 0 || sp == len(vm.memory) {
				pc += size_of(Opcode)
				continue
			} else {
				last_rune_is_wc := is_word_class(vm.current_rune)
				this_rune_is_wc := is_word_class(vm.next_rune)

				// Boundary: exactly one side is a word character.
				if last_rune_is_wc && !this_rune_is_wc || !last_rune_is_wc && this_rune_is_wc {
					pc += size_of(Opcode)
					continue
				}
			}
		case .Assert_Non_Word_Boundary:
			sp := vm.string_pointer+vm.current_rune_size
			if sp != 0 && sp != len(vm.memory) {
				last_rune_is_wc := is_word_class(vm.current_rune)
				this_rune_is_wc := is_word_class(vm.next_rune)

				// Non-boundary: both sides agree on word-ness.
				if last_rune_is_wc && this_rune_is_wc || !last_rune_is_wc && !this_rune_is_wc {
					pc += size_of(Opcode)
					continue
				}
			}

		case .Wait_For_Byte:
			operand := cast(rune)vm.code[pc + size_of(Opcode)]
			if vm.next_rune == operand {
				add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
			}

			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			// Re-queue the wait itself so it keeps watching future runes.
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1

		case .Wait_For_Rune:
			operand := intrinsics.unaligned_load(cast(^rune)&vm.code[pc + size_of(Opcode)])
			if vm.next_rune == operand {
				add_thread(vm, saved, pc + size_of(Opcode) + size_of(rune))
			}

			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1

		case .Wait_For_Rune_Class:
			operand := cast(u8)vm.code[pc + size_of(Opcode)]
			class_data := vm.class_data[operand]
			next_rune := vm.next_rune

			check: {
				for r in class_data.runes {
					if next_rune == r {
						add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
						break check
					}
				}
				for range in class_data.ranges {
					if range.lower <= next_rune && next_rune <= range.upper {
						add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
						break check
					}
				}
			}
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1

		case .Wait_For_Rune_Class_Negated:
			operand := cast(u8)vm.code[pc + size_of(Opcode)]
			class_data := vm.class_data[operand]
			next_rune := vm.next_rune

			check_negated: {
				for r in class_data.runes {
					if next_rune == r {
						break check_negated
					}
				}
				for range in class_data.ranges {
					if range.lower <= next_rune && next_rune <= range.upper {
						break check_negated
					}
				}
				add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
			}
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1

		case:
			// A consuming opcode: queue it for the next frame as-is.
			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "*** New thread added [PC:")
				common.write_padded_hex(common.debug_stream, pc, 4)
				io.write_string(common.debug_stream, "]\n")
			}
			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
			vm.top_thread += 1
		}

		break resolution_loop
	}

	return
}
|
||||
|
||||
// run executes the compiled program in `vm` against `vm.memory`, stepping
// every live thread once per input rune (a Pike-VM style lockstep loop).
//
// UNICODE_MODE selects at compile time between UTF-8 rune decoding and raw
// byte reads. On success, `saved` holds the winning thread's capture-group
// offsets (it may be nil after a `.Match_And_Exit`) and `ok` is true.
run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check {
	// Prime `next_rune` with the first character of the input.
	when UNICODE_MODE {
		vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory)
	} else {
		if len(vm.memory) > 0 {
			vm.next_rune = cast(rune)vm.memory[0]
			vm.next_rune_size = 1
		}
	}

	when common.ODIN_DEBUG_REGEX {
		io.write_string(common.debug_stream, "### Adding initial thread.\n")
	}

	{
		// Capture slots start at -1, meaning "not captured".
		starter_saved := new([2 * common.MAX_CAPTURE_GROUPS]int)
		starter_saved ^= -1

		add_thread(vm, starter_saved, 0)
	}

	// `add_thread` adds to `next_threads` by default, but we need to put this
	// thread in the current thread buffer.
	vm.threads, vm.next_threads = vm.next_threads, vm.threads

	when common.ODIN_DEBUG_REGEX {
		io.write_string(common.debug_stream, "### VM starting.\n")
		defer io.write_string(common.debug_stream, "### VM finished.\n")
	}

	// One iteration per input position ("frame").
	for {
		slice.zero(vm.busy_map[:])

		assert(vm.string_pointer <= len(vm.memory), "VM string pointer went out of bounds.")

		// Shift the lookahead window: last frame's next rune becomes current.
		current_rune := vm.next_rune
		vm.current_rune = current_rune
		vm.current_rune_size = vm.next_rune_size
		when UNICODE_MODE {
			vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer+vm.current_rune_size:])
		} else {
			if vm.string_pointer+size_of(u8) < len(vm.memory) {
				vm.next_rune = cast(rune)vm.memory[vm.string_pointer+size_of(u8)]
				vm.next_rune_size = size_of(u8)
			} else {
				vm.next_rune = 0
				vm.next_rune_size = 0
			}
		}

		when common.ODIN_DEBUG_REGEX {
			io.write_string(common.debug_stream, ">>> Dispatching rune: ")
			io.write_encoded_rune(common.debug_stream, current_rune)
			io.write_byte(common.debug_stream, '\n')
		}

		// Step every thread queued last frame; survivors re-queue themselves
		// into `next_threads` via `add_thread`.
		thread_count := vm.top_thread
		vm.top_thread = 0
		thread_loop: for i := 0; i < thread_count; i += 1 {
			t := vm.threads[i]

			when common.ODIN_DEBUG_REGEX {
				io.write_string(common.debug_stream, "Thread [PC:")
				common.write_padded_hex(common.debug_stream, t.pc, 4)
				io.write_string(common.debug_stream, "] stepping on ")
				io.write_string(common.debug_stream, opcode_to_name(vm.code[t.pc]))
				io.write_byte(common.debug_stream, '\n')
			}

			#partial opcode: switch vm.code[t.pc] {
			case .Match:
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "Thread matched!\n")
				}
				// Keep this match but let lower-priority threads finish the
				// frame is unnecessary: threads are in priority order, so
				// stop the frame here.
				saved = t.saved
				ok = true
				break thread_loop

			case .Match_And_Exit:
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "Thread matched! (Exiting)\n")
				}
				return nil, true

			case .Byte:
				operand := cast(rune)vm.code[t.pc + size_of(Opcode)]
				if current_rune == operand {
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
				}

			case .Rune:
				operand := intrinsics.unaligned_load(cast(^rune)&vm.code[t.pc + size_of(Opcode)])
				if current_rune == operand {
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(rune))
				}

			case .Rune_Class:
				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
				class_data := vm.class_data[operand]

				for r in class_data.runes {
					if current_rune == r {
						add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
						break opcode
					}
				}
				for range in class_data.ranges {
					if range.lower <= current_rune && current_rune <= range.upper {
						add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
						break opcode
					}
				}

			case .Rune_Class_Negated:
				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
				class_data := vm.class_data[operand]
				for r in class_data.runes {
					if current_rune == r {
						break opcode
					}
				}
				for range in class_data.ranges {
					if range.lower <= current_rune && current_rune <= range.upper {
						break opcode
					}
				}
				add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))

			case .Wildcard:
				add_thread(vm, t.saved, t.pc + size_of(Opcode))

			case .Multiline_Open:
				if current_rune == '\n' {
					// UNIX newline.
					add_thread(vm, t.saved, t.pc + 2 * size_of(Opcode))
				} else if current_rune == '\r' {
					if vm.next_rune == '\n' {
						// Windows newline. (1/2)
						add_thread(vm, t.saved, t.pc + size_of(Opcode))
					} else {
						// Mac newline.
						add_thread(vm, t.saved, t.pc + 2 * size_of(Opcode))
					}
				}
			case .Multiline_Close:
				if current_rune == '\n' {
					// Windows newline. (2/2)
					add_thread(vm, t.saved, t.pc + size_of(Opcode))
				}

			case .Wait_For_Byte:
				operand := cast(rune)vm.code[t.pc + size_of(Opcode)]
				if vm.next_rune == operand {
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
				}
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, t.pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				// The wait re-queues itself to keep watching future runes.
				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
				vm.top_thread += 1

			case .Wait_For_Rune:
				operand := intrinsics.unaligned_load(cast(^rune)&vm.code[t.pc + size_of(Opcode)])
				if vm.next_rune == operand {
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(rune))
				}
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, t.pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
				vm.top_thread += 1

			case .Wait_For_Rune_Class:
				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
				class_data := vm.class_data[operand]
				next_rune := vm.next_rune

				check: {
					for r in class_data.runes {
						if next_rune == r {
							add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
							break check
						}
					}
					for range in class_data.ranges {
						if range.lower <= next_rune && next_rune <= range.upper {
							add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
							break check
						}
					}
				}
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, t.pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
				vm.top_thread += 1

			case .Wait_For_Rune_Class_Negated:
				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
				class_data := vm.class_data[operand]
				next_rune := vm.next_rune

				check_negated: {
					for r in class_data.runes {
						if next_rune == r {
							break check_negated
						}
					}
					for range in class_data.ranges {
						if range.lower <= next_rune && next_rune <= range.upper {
							break check_negated
						}
					}
					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
				}
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "*** New thread added [PC:")
					common.write_padded_hex(common.debug_stream, t.pc, 4)
					io.write_string(common.debug_stream, "]\n")
				}
				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
				vm.top_thread += 1

			case .Match_All_And_Escape:
				t.pc += size_of(Opcode)
				// The point of this loop is to walk out of wherever this
				// opcode lives to the end of the program, while saving the
				// index to the length of the string at each pass on the way.
				escape_loop: for {
					#partial switch vm.code[t.pc] {
					case .Match, .Match_And_Exit:
						break escape_loop

					case .Jump:
						t.pc = cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[t.pc + size_of(Opcode)])

					case .Save:
						index := vm.code[t.pc + size_of(Opcode)]
						t.saved[index] = len(vm.memory)
						t.pc += size_of(Opcode) + size_of(u8)

					case .Match_All_And_Escape:
						// Layering these is fine.
						t.pc += size_of(Opcode)

					// If the loop has to process any opcode not listed above,
					// it means someone did something odd like `a(.*$)b`, in
					// which case, just fail. Technically, the expression makes
					// no sense.
					case:
						break opcode
					}
				}

				saved = t.saved
				ok = true
				return

			case:
				when common.ODIN_DEBUG_REGEX {
					io.write_string(common.debug_stream, "Opcode: ")
					io.write_int(common.debug_stream, cast(int)vm.code[t.pc])
					io.write_string(common.debug_stream, "\n")
				}
				panic("Invalid opcode in RegEx thread loop.")
			}
		}

		// Flip buffers: the threads queued this frame run next frame.
		vm.threads, vm.next_threads = vm.next_threads, vm.threads

		when common.ODIN_DEBUG_REGEX {
			io.write_string(common.debug_stream, "<<< Frame ended. (Threads: ")
			io.write_int(common.debug_stream, vm.top_thread)
			io.write_string(common.debug_stream, ")\n")
		}

		// Stop at end of input or when every thread has died.
		if vm.string_pointer == len(vm.memory) || vm.top_thread == 0 {
			break
		}

		vm.string_pointer += vm.current_rune_size
	}

	return
}
|
||||
|
||||
// opcode_count reports how many instructions (not bytes) `code` contains.
opcode_count :: proc(code: Program) -> (count: int) {
	iter := Opcode_Iterator{ code = code }
	for _ in iterate_opcodes(&iter) {
		count += 1
	}
	return
}
|
||||
|
||||
// create initializes a Machine that will run `code` against `str`.
//
// Allocates the busy map and both thread buffers; the caller owns the
// resulting allocations.
create :: proc(code: Program, str: string) -> (vm: Machine) {
	assert(len(code) > 0, "RegEx VM has no instructions.")

	vm.memory = str
	vm.code = code

	// One busy-map bit per byte of code, rounded up to a whole u64 word.
	sizing := len(code) >> 6 + (1 if len(code) & 0x3F > 0 else 0)
	assert(sizing > 0)
	vm.busy_map = make([]u64, sizing)

	// The busy map guarantees at most one thread per opcode; presumably the
	// `- 1` accounts for the terminating Match never spawning a thread —
	// TODO(review): confirm against the compiler's emitted layout.
	max_possible_threads := max(1, opcode_count(vm.code) - 1)

	vm.threads = make([^]Thread, max_possible_threads)
	vm.next_threads = make([^]Thread, max_possible_threads)

	return
}
|
||||
@@ -128,6 +128,7 @@ import testing "core:testing"
|
||||
import edit "core:text/edit"
|
||||
import i18n "core:text/i18n"
|
||||
import match "core:text/match"
|
||||
import regex "core:text/regex"
|
||||
import scanner "core:text/scanner"
|
||||
import table "core:text/table"
|
||||
|
||||
@@ -251,6 +252,7 @@ _ :: testing
|
||||
_ :: scanner
|
||||
_ :: i18n
|
||||
_ :: match
|
||||
_ :: regex
|
||||
_ :: table
|
||||
_ :: edit
|
||||
_ :: thread
|
||||
|
||||
@@ -3,3 +3,4 @@ package benchmarks
|
||||
@(require) import "bytes"
|
||||
@(require) import "crypto"
|
||||
@(require) import "hash"
|
||||
@(require) import "text/regex"
|
||||
|
||||
258
tests/benchmark/text/regex/benchmark_regex.odin
Normal file
258
tests/benchmark/text/regex/benchmark_regex.odin
Normal file
@@ -0,0 +1,258 @@
|
||||
package benchmark_core_text_regex
|
||||
|
||||
import "core:fmt"
|
||||
import "core:log"
|
||||
import "core:math/rand"
|
||||
import "core:mem"
|
||||
import "core:testing"
|
||||
import "core:text/regex"
|
||||
import "core:time"
|
||||
import "core:unicode/utf8"
|
||||
|
||||
// randomize_ascii fills `data` with uniformly random printable ASCII
// bytes in the range [' ', '~'].
randomize_ascii :: proc(data: []u8) {
	for i := 0; i < len(data); i += 1 {
		data[i] = ' ' + cast(u8)rand.int_max(0x7F - ' ')
	}
}
|
||||
|
||||
// randomize_unicode fills `data` with randomly chosen, valid, UTF-8 encoded
// runes. Candidate runes that are invalid or would not fit in the remaining
// space are re-rolled, so the buffer is filled exactly to its end.
randomize_unicode :: proc(data: []u8) {
	for i := 0; i < len(data); /**/ {
		check_rune_loop: for {
			r := cast(rune)rand.int_max(utf8.MAX_RUNE)
			if !utf8.valid_rune(r) {
				continue
			}
			if utf8.rune_size(r) > len(data) - i {
				continue
			}

			r_data, size := utf8.encode_rune(r)
			for j in 0..<size {
				data[i+j] = r_data[j]
			}

			i += size
			break check_rune_loop
		}
	}
}
|
||||
|
||||
// Haystack sizes shared by the benchmarks below.
sizes := [?]int {
	2 * mem.Kilobyte,
	32 * mem.Kilobyte,
	64 * mem.Kilobyte,
	256 * mem.Kilobyte,
	0.50 * mem.Megabyte,
	1.00 * mem.Megabyte,
	2.00 * mem.Megabyte,
}
|
||||
|
||||
// Benchmark the pattern `(a?){n}a{n}` against the string `a{n}` — the classic
// input that takes exponential time on backtracking engines.
@test
expensive_for_backtrackers :: proc(t: ^testing.T) {
	counts := [?]int {
		8,
		16,
		32,
		64,
	}

	report: string

	for count in counts {
		data := make([]u8, count)
		pattern := make([]u8, 2 * count + count)
		defer {
			delete(data)
			delete(pattern)
		}
		// First 2*count bytes of the pattern: `a?` repeated `count` times.
		for i in 0..<2 * count {
			pattern[i] = 'a' if i & 1 == 0 else '?'
		}
		// Followed by `a` repeated `count` times.
		for i in 2 * count..<2 * count + count {
			pattern[i] = 'a'
		}
		// The haystack is just `a` repeated `count` times.
		for i in 0..<count {
			data[i] = 'a'
		}

		rex, err := regex.create(cast(string)pattern)
		if !testing.expect_value(t, err, nil) {
			return
		}
		defer regex.destroy(rex)

		str := cast(string)data

		log.debug(rex, str)

		start := time.now()
		capture, ok := regex.match(rex, str)
		done := time.since(start)
		defer regex.destroy(capture)

		if !testing.expect_value(t, ok, true) {
			continue
		}
		testing.expect_value(t, capture.pos[0], [2]int{0, count})

		rate := cast(int)(cast(f64)(count / 2) / (cast(f64)done / 1e9))
		report = fmt.tprintf("%s\n +++ [%i : %v : %M/s] Matched `a?^%ia^%i` against `a^%i`.", report, count, done, rate, count, count, count)
	}
	log.info(report)
}
|
||||
|
||||
// Benchmark a global literal match where the needle sits at the very end of
// a block of random ASCII text.
@test
global_capture_end_word :: proc(t: ^testing.T) {
	EXPR :: `Hellope World!`

	rex, err := regex.create(EXPR, { .Global })
	if !testing.expect_value(t, err, nil) {
		return
	}
	defer regex.destroy(rex)

	report := fmt.tprintf("Matching %q over a block of random ASCII text.", EXPR)

	for size in sizes {
		data := make([]u8, size)
		defer delete(data)
		randomize_ascii(data[:])

		// Plant the needle at the tail of the haystack.
		for r, i in EXPR {
			data[len(data) - len(EXPR) + i] = cast(u8)r
		}

		str := cast(string)data

		start := time.now()
		capture, ok := regex.match(rex, str)
		done := time.since(start)
		defer regex.destroy(capture)

		if !testing.expect_value(t, ok, true) {
			continue
		}
		testing.expect_value(t, capture.pos[0], [2]int{size - len(EXPR), size})

		rate := cast(int)(cast(f64)size / (cast(f64)done / 1e9))
		report = fmt.tprintf("%s\n +++ [%M : %v : %M/s]", report, size, done, rate)
	}
	log.info(report)
}
|
||||
|
||||
// Benchmark a global Unicode match where the needle sits at the very end of
// a block of random Unicode text.
@test
global_capture_end_word_unicode :: proc(t: ^testing.T) {
	EXPR :: `こにちは`
	needle := string(EXPR)

	rex, err := regex.create(EXPR, { .Global, .Unicode })
	if !testing.expect_value(t, err, nil) {
		return
	}
	defer regex.destroy(rex)

	report := fmt.tprintf("Matching %q over a block of random Unicode text.", EXPR)

	for size in sizes {
		data := make([]u8, size)
		defer delete(data)
		// Leave room for the needle at the tail.
		randomize_unicode(data[:size - len(needle)])

		for i := 0; i < len(needle); i += 1 {
			data[len(data) - len(needle) + i] = needle[i]
		}

		str := cast(string)data

		start := time.now()
		capture, ok := regex.match(rex, str)
		done := time.since(start)
		defer regex.destroy(capture)

		if !testing.expect_value(t, ok, true) {
			continue
		}
		testing.expect_value(t, capture.groups[0], needle)

		rate := cast(int)(cast(f64)size / (cast(f64)done / 1e9))
		report = fmt.tprintf("%s\n +++ [%M : %v : %M/s]", report, size, done, rate)
	}
	log.info(report)
}
|
||||
|
||||
|
||||
// Benchmark a five-way alternation that can never match, against a haystack
// made entirely of the alternation's prefix character.
@test
alternations :: proc(t: ^testing.T) {
	EXPR :: `a(?:bb|cc|dd|ee|ff)`

	rex, err := regex.create(EXPR, { .No_Capture, .Global })
	if !testing.expect_value(t, err, nil) {
		return
	}
	defer regex.destroy(rex)

	report := fmt.tprintf("Matching %q over a text block of only `a`s.", EXPR)

	for size in sizes {
		data := make([]u8, size)
		defer delete(data)
		for i in 0..<size {
			data[i] = 'a'
		}

		str := cast(string)data

		start := time.now()
		_, ok := regex.match(rex, str)
		done := time.since(start)

		// No `b`..`f` ever appears, so the match must fail.
		testing.expect_value(t, ok, false)

		rate := cast(int)(cast(f64)size / (cast(f64)done / 1e9))
		report = fmt.tprintf("%s\n +++ [%M : %v : %M/s]", report, size, done, rate)
	}
	log.info(report)
}
|
||||
|
||||
// Benchmark character-class matching: a `[\w\d]+` run planted at the end of
// a haystack of spaces (which never enter the class).
@test
classes :: proc(t: ^testing.T) {
	EXPR :: `[\w\d]+`
	NEEDLE :: "0123456789abcdef"

	rex, err := regex.create(EXPR, { .Global })
	if !testing.expect_value(t, err, nil) {
		return
	}
	defer regex.destroy(rex)

	report := fmt.tprintf("Matching %q over a string of spaces with %q at the end.", EXPR, NEEDLE)

	for size in sizes {
		data := make([]u8, size)
		defer delete(data)

		for i in 0..<size {
			data[i] = ' '
		}

		// Plant the needle at the tail of the haystack.
		for r, i in NEEDLE {
			data[len(data) - len(NEEDLE) + i] = cast(u8)r
		}

		str := cast(string)data

		start := time.now()
		capture, ok := regex.match(rex, str)
		done := time.since(start)
		defer regex.destroy(capture)

		if !testing.expect_value(t, ok, true) {
			continue
		}
		testing.expect_value(t, capture.pos[0], [2]int{size - len(NEEDLE), size})

		rate := cast(int)(cast(f64)size / (cast(f64)done / 1e9))
		report = fmt.tprintf("%s\n +++ [%M : %v : %M/s]", report, size, done, rate)
	}
	log.info(report)
}
|
||||
@@ -42,6 +42,7 @@ download_assets :: proc() {
|
||||
@(require) import "sys/windows"
|
||||
@(require) import "text/i18n"
|
||||
@(require) import "text/match"
|
||||
@(require) import "text/regex"
|
||||
@(require) import "thread"
|
||||
@(require) import "time"
|
||||
@(require) import "unicode"
|
||||
|
||||
1045
tests/core/text/regex/test_core_text_regex.odin
Normal file
1045
tests/core/text/regex/test_core_text_regex.odin
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user