package regex_compiler

/*
	(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
	Made available under Odin's BSD-3 license.

	List of contributors:
		Feoramund: Initial implementation.
*/

import "base:intrinsics"
import "core:text/regex/common"
import "core:text/regex/parser"
import "core:text/regex/tokenizer"
import "core:text/regex/virtual_machine"
import "core:unicode"

Token      :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer  :: tokenizer.Tokenizer

Rune_Class_Range :: parser.Rune_Class_Range
Rune_Class_Data  :: parser.Rune_Class_Data

Node                        :: parser.Node
Node_Rune                   :: parser.Node_Rune
Node_Rune_Class             :: parser.Node_Rune_Class
Node_Wildcard               :: parser.Node_Wildcard
Node_Concatenation          :: parser.Node_Concatenation
Node_Alternation            :: parser.Node_Alternation
Node_Repeat_Zero            :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One             :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy  :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N               :: parser.Node_Repeat_N
Node_Optional               :: parser.Node_Optional
Node_Optional_Non_Greedy    :: parser.Node_Optional_Non_Greedy
Node_Group                  :: parser.Node_Group
Node_Anchor                 :: parser.Node_Anchor
Node_Word_Boundary          :: parser.Node_Word_Boundary
Node_Match_All_And_Escape   :: parser.Node_Match_All_And_Escape

Opcode  :: virtual_machine.Opcode
Program :: [dynamic]Opcode

JUMP_SIZE  :: size_of(Opcode) + 1 * size_of(u16)
SPLIT_SIZE :: size_of(Opcode) + 2 * size_of(u16)
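
// A Jump is encoded as a one-byte opcode followed by a single u16
// operand; a Split carries two u16 operands. Operands are written as
// relative i16 offsets during generation and rewritten into absolute
// u16 addresses at the end of `compile`.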

Compiler :: struct {
	flags:      common.Flags,
	class_data: [dynamic]Rune_Class_Data,
}

Error :: enum {
	None,
	Program_Too_Big,
	Too_Many_Classes,
}
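
// Report whether two rune classes contain exactly the same runes and
// ranges, in the same order.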
classes_are_exact :: proc(q, w: ^Rune_Class_Data) -> bool #no_bounds_check {
	assert(q != nil)
	assert(w != nil)

	if q == w {
		return true
	}

	if len(q.runes) != len(w.runes) || len(q.ranges) != len(w.ranges) {
		return false
	}

	for r, i in q.runes {
		if r != w.runes[i] {
			return false
		}
	}

	for r, i in q.ranges {
		if r.lower != w.ranges[i].lower || r.upper != w.ranges[i].upper {
			return false
		}
	}

	return true
}
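
// Walk the parse tree and collect every distinct rune class found into
// `collection`, using `classes_are_exact` to skip duplicates.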
map_all_classes :: proc(tree: Node, collection: ^[dynamic]Rune_Class_Data) {
	if tree == nil {
		return
	}

	switch specific in tree {
	case ^Node_Rune: break
	case ^Node_Wildcard: break
	case ^Node_Anchor: break
	case ^Node_Word_Boundary: break
	case ^Node_Match_All_And_Escape: break

	case ^Node_Concatenation:
		for subnode in specific.nodes {
			map_all_classes(subnode, collection)
		}

	case ^Node_Repeat_Zero:
		map_all_classes(specific.inner, collection)
	case ^Node_Repeat_Zero_Non_Greedy:
		map_all_classes(specific.inner, collection)
	case ^Node_Repeat_One:
		map_all_classes(specific.inner, collection)
	case ^Node_Repeat_One_Non_Greedy:
		map_all_classes(specific.inner, collection)
	case ^Node_Repeat_N:
		map_all_classes(specific.inner, collection)
	case ^Node_Optional:
		map_all_classes(specific.inner, collection)
	case ^Node_Optional_Non_Greedy:
		map_all_classes(specific.inner, collection)
	case ^Node_Group:
		map_all_classes(specific.inner, collection)

	case ^Node_Alternation:
		map_all_classes(specific.left, collection)
		map_all_classes(specific.right, collection)

	case ^Node_Rune_Class:
		unseen := true
		for &value in collection {
			if classes_are_exact(&specific.data, &value) {
				unseen = false
				break
			}
		}

		if unseen {
			append(collection, specific.data)
		}
	}
}

append_raw :: #force_inline proc(code: ^Program, data: $T) {
	// NOTE: This is system-dependent endian.
	for b in transmute([size_of(T)]byte)data {
		append(code, cast(Opcode)b)
	}
}

inject_raw :: #force_inline proc(code: ^Program, start: int, data: $T) {
	// NOTE: This is system-dependent endian.
	for b, i in transmute([size_of(T)]byte)data {
		inject_at(code, start + i, cast(Opcode)b)
	}
}
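
// Recursively generate VM bytecode for `node`. Note that Jump and Split
// operands are emitted here as relative i16 offsets; `compile` converts
// them to absolute addresses afterwards.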
@require_results
generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
	if node == nil {
		return
	}

	// NOTE: For Jump/Split arguments, we write as i16 and will reinterpret
	// this later when relative jumps are turned into absolute jumps.

	switch specific in node {
	// Atomic Nodes:
	case ^Node_Rune:
		if .Unicode not_in c.flags || specific.data < unicode.MAX_LATIN1 {
			append(&code, Opcode.Byte)
			append(&code, cast(Opcode)specific.data)
		} else {
			append(&code, Opcode.Rune)
			append_raw(&code, specific.data)
		}

	case ^Node_Rune_Class:
		if specific.negating {
			append(&code, Opcode.Rune_Class_Negated)
		} else {
			append(&code, Opcode.Rune_Class)
		}

		index := -1
		for &data, i in c.class_data {
			if classes_are_exact(&data, &specific.data) {
				index = i
				break
			}
		}
		assert(index != -1, "Unable to find collected Rune_Class_Data index.")

		append(&code, Opcode(index))

	case ^Node_Wildcard:
		append(&code, Opcode.Wildcard)
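
	// In .Multiline mode, `^` asserts the start of the string or that a
	// "\n" or "\r" was just consumed, and `$` either consumes a newline
	// sequence of "\n", "\r", or "\r\n", or asserts the end of the string.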
	case ^Node_Anchor:
		if .Multiline in c.flags {
			if specific.start {
				append(&code, Opcode.Assert_Start_Multiline)
			} else {
				append(&code, Opcode.Multiline_Open)
				append(&code, Opcode.Multiline_Close)
			}
		} else {
			if specific.start {
				append(&code, Opcode.Assert_Start)
			} else {
				append(&code, Opcode.Assert_End)
			}
		}

	case ^Node_Word_Boundary:
		if specific.non_word {
			append(&code, Opcode.Assert_Non_Word_Boundary)
		} else {
			append(&code, Opcode.Assert_Word_Boundary)
		}

	// Compound Nodes:
	case ^Node_Group:
		code = generate_code(c, specific.inner)

		if specific.capture && .No_Capture not_in c.flags {
			inject_at(&code, 0, Opcode.Save)
			inject_at(&code, 1, Opcode(2 * specific.capture_id))

			append(&code, Opcode.Save)
			append(&code, Opcode(2 * specific.capture_id + 1))
		}
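
	// `x|y` compiles to the following layout:
	//
	//        Split L1, L2
	//    L1: <x>
	//        Jump L3
	//    L2: <y>
	//    L3: ...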
	case ^Node_Alternation:
		left := generate_code(c, specific.left)
		right := generate_code(c, specific.right)

		left_len := len(left)

		// Avoiding duplicate allocation by reusing `left`.
		code = left

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + left_len + JUMP_SIZE))

		append(&code, Opcode.Jump)
		append_raw(&code, i16(len(right) + JUMP_SIZE))

		for opcode in right {
			append(&code, opcode)
		}

	case ^Node_Concatenation:
		for subnode in specific.nodes {
			subnode_code := generate_code(c, subnode)
			for opcode in subnode_code {
				append(&code, opcode)
			}
		}
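
	// `e*` compiles to the following layout, with the non-greedy form
	// swapping the Split operands so that the exit branch is preferred:
	//
	//    L1: Split L2, L3
	//    L2: <e>
	//        Jump L1
	//    L3: ...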
	case ^Node_Repeat_Zero:
		code = generate_code(c, specific.inner)
		original_len := len(code)

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len + JUMP_SIZE))

		append(&code, Opcode.Jump)
		append_raw(&code, i16(-original_len - SPLIT_SIZE))

	case ^Node_Repeat_Zero_Non_Greedy:
		code = generate_code(c, specific.inner)
		original_len := len(code)

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len + JUMP_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))

		append(&code, Opcode.Jump)
		append_raw(&code, i16(-original_len - SPLIT_SIZE))
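
	// `e+` compiles to the following layout, again with the non-greedy
	// form preferring the exit branch:
	//
	//    L1: <e>
	//        Split L1, L2
	//    L2: ...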
	case ^Node_Repeat_One:
		code = generate_code(c, specific.inner)
		original_len := len(code)

		append(&code, Opcode.Split)
		append_raw(&code, i16(-original_len))
		append_raw(&code, i16(SPLIT_SIZE))

	case ^Node_Repeat_One_Non_Greedy:
		code = generate_code(c, specific.inner)
		original_len := len(code)

		append(&code, Opcode.Split)
		append_raw(&code, i16(SPLIT_SIZE))
		append_raw(&code, i16(-original_len))
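
	// Bounded repetitions are compiled by expansion, per the comments
	// below. A lower bound of -1 encodes `{,M}`, and an upper bound of
	// -1 encodes `{N,}`.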
	case ^Node_Repeat_N:
		inside := generate_code(c, specific.inner)
		original_len := len(inside)

		if specific.lower == specific.upper { // {N}
			// e{N} ... evaluates to ... e^N
			for i := 0; i < specific.upper; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}

		} else if specific.lower == -1 && specific.upper > 0 { // {,M}
			// e{,M} ... evaluates to ... e?^M
			for i := 0; i < specific.upper; i += 1 {
				append(&code, Opcode.Split)
				append_raw(&code, i16(SPLIT_SIZE))
				append_raw(&code, i16(SPLIT_SIZE + original_len))
				for opcode in inside {
					append(&code, opcode)
				}
			}

		} else if specific.lower >= 0 && specific.upper == -1 { // {N,}
			// e{N,} ... evaluates to ... e^N e*
			for i := 0; i < specific.lower; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}

			append(&code, Opcode.Split)
			append_raw(&code, i16(SPLIT_SIZE))
			append_raw(&code, i16(SPLIT_SIZE + original_len + JUMP_SIZE))

			for opcode in inside {
				append(&code, opcode)
			}

			append(&code, Opcode.Jump)
			append_raw(&code, i16(-original_len - SPLIT_SIZE))

		} else if specific.lower >= 0 && specific.upper > 0 { // {N,M}
			// e{N,M} ... evaluates to ... e^N e?^(M-N)
			for i := 0; i < specific.lower; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
			for i := 0; i < specific.upper - specific.lower; i += 1 {
				append(&code, Opcode.Split)
				append_raw(&code, i16(SPLIT_SIZE + original_len))
				append_raw(&code, i16(SPLIT_SIZE))
				for opcode in inside {
					append(&code, opcode)
				}
			}

		} else {
			panic("RegEx compiler received invalid repetition group.")
		}
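
	// `e?` compiles to a single forward Split over the inner code; the
	// greedy form tries <e> first, while the non-greedy form prefers to
	// skip it.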
	case ^Node_Optional:
		code = generate_code(c, specific.inner)
		original_len := len(code)

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len))

	case ^Node_Optional_Non_Greedy:
		code = generate_code(c, specific.inner)
		original_len := len(code)

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))

	case ^Node_Match_All_And_Escape:
		append(&code, Opcode.Match_All_And_Escape)
	}

	return
}
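
// Compile a parse tree into a VM program, returning the generated code
// along with the deduplicated rune class data that it references.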
@require_results
compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: [dynamic]Rune_Class_Data, err: Error) {
	if tree == nil {
		if .No_Capture not_in flags {
			append(&code, Opcode.Save); append(&code, Opcode(0x00))
			append(&code, Opcode.Save); append(&code, Opcode(0x01))
			append(&code, Opcode.Match)
		} else {
			append(&code, Opcode.Match_And_Exit)
		}
		return
	}

	c: Compiler
	c.flags = flags

	map_all_classes(tree, &class_data)
	if len(class_data) >= common.MAX_CLASSES {
		err = .Too_Many_Classes
		return
	}
	c.class_data = class_data

	code = generate_code(&c, tree)

	pc_open := 0
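
	// The opening of the pattern determines what prefix, if any, the
	// program needs: a predictable literal gets a Wait opcode, anchored
	// patterns get nothing, and everything else gets a lazy `.*?` loop
	// so a match may begin at any position.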
	optimize_opening: {
		// Check if the opening to the pattern is predictable.
		// If so, use one of the optimized Wait opcodes.
		iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
		seek_loop: for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
			#partial switch opcode {
			case .Byte:
				inject_at(&code, pc_open, Opcode.Wait_For_Byte)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break optimize_opening

			case .Rune:
				operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
				inject_at(&code, pc_open, Opcode.Wait_For_Rune)
				pc_open += size_of(Opcode)
				inject_raw(&code, pc_open, operand)
				pc_open += size_of(rune)
				break optimize_opening

			case .Rune_Class:
				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break optimize_opening

			case .Rune_Class_Negated:
				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break optimize_opening

			case .Save:
				continue

			case .Assert_Start, .Assert_Start_Multiline:
				break optimize_opening

			case:
				break seek_loop
			}
		}

		// `.*?`
		inject_at(&code, pc_open, Opcode.Split)
		pc_open += size_of(byte)
		inject_raw(&code, pc_open, i16(SPLIT_SIZE + size_of(byte) + JUMP_SIZE))
		pc_open += size_of(i16)
		inject_raw(&code, pc_open, i16(SPLIT_SIZE))
		pc_open += size_of(i16)

		inject_at(&code, pc_open, Opcode.Wildcard)
		pc_open += size_of(byte)

		inject_at(&code, pc_open, Opcode.Jump)
		pc_open += size_of(byte)
		inject_raw(&code, pc_open, i16(-size_of(byte) - SPLIT_SIZE))
		pc_open += size_of(i16)
	}

	if .No_Capture not_in flags {
		// `(` <generated code>
		inject_at(&code, pc_open, Opcode.Save)
		inject_at(&code, pc_open + size_of(byte), Opcode(0x00))

		// `)`
		append(&code, Opcode.Save); append(&code, Opcode(0x01))

		append(&code, Opcode.Match)
	} else {
		append(&code, Opcode.Match_And_Exit)
	}

	if len(code) >= common.MAX_PROGRAM_SIZE {
		err = .Program_Too_Big
		return
	}

	// NOTE: No further opcode addition beyond this point, as we've already
	// checked the program size. Removal or transformation is fine.

	// Post-Compile Optimizations:

	// * Jump Extension
	//
	// A:RelJmp(1) -> B:RelJmp(2) => A:RelJmp(2)
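	//
	// Each pass rewrites any Jump or Split operand that lands directly on
	// another Jump so that it points at that Jump's destination instead,
	// repeating until a pass makes no further changes.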
	if .No_Optimization not_in flags {
		for passes_left := 1; passes_left > 0; passes_left -= 1 {
			do_another_pass := false

			iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
			for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
				#partial switch opcode {
				case .Jump:
					jmp := cast(^i16)&code[pc+size_of(Opcode)]
					jmp_value := intrinsics.unaligned_load(jmp)
					if code[cast(i16)pc+jmp_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp, jmp_value + next_jmp)
						do_another_pass = true
					}
				case .Split:
					jmp_x := cast(^i16)&code[pc+size_of(Opcode)]
					jmp_x_value := intrinsics.unaligned_load(jmp_x)
					if code[cast(i16)pc+jmp_x_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp_x, jmp_x_value + next_jmp)
						do_another_pass = true
					}
					jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)]
					jmp_y_value := intrinsics.unaligned_load(jmp_y)
					if code[cast(i16)pc+jmp_y_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp_y, jmp_y_value + next_jmp)
						do_another_pass = true
					}
				}
			}

			if do_another_pass {
				passes_left += 1
			}
		}
	}

	// * Relative Jump to Absolute Jump
	//
	// RelJmp{PC +/- N} => AbsJmp{M}
	iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
	for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
		// NOTE: The virtual machine implementation depends on this.
		#partial switch opcode {
		case .Jump:
			jmp := cast(^u16)&code[pc+size_of(Opcode)]
			intrinsics.unaligned_store(jmp, intrinsics.unaligned_load(jmp) + cast(u16)pc)
		case .Split:
			jmp_x := cast(^u16)&code[pc+size_of(Opcode)]
			intrinsics.unaligned_store(jmp_x, intrinsics.unaligned_load(jmp_x) + cast(u16)pc)
			jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)]
			intrinsics.unaligned_store(jmp_y, intrinsics.unaligned_load(jmp_y) + cast(u16)pc)
		}
	}

	return
}