From cb0704d51c6a170bca8206a9bb3e9796c71c6341 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:58:25 -0400 Subject: [PATCH 01/24] Add `core:text/regex` --- core/text/regex/common/common.odin | 27 + core/text/regex/common/debugging.odin | 25 + core/text/regex/compiler/compiler.odin | 538 +++++++++++++++ core/text/regex/compiler/debugging.odin | 84 +++ core/text/regex/compiler/doc.odin | 9 + core/text/regex/doc.odin | 75 ++ core/text/regex/optimizer/doc.odin | 58 ++ core/text/regex/optimizer/optimizer.odin | 522 ++++++++++++++ core/text/regex/parser/debugging.odin | 103 +++ core/text/regex/parser/doc.odin | 10 + core/text/regex/parser/parser.odin | 580 ++++++++++++++++ core/text/regex/regex.odin | 434 ++++++++++++ core/text/regex/tokenizer/tokenizer.odin | 349 ++++++++++ core/text/regex/virtual_machine/doc.odin | 175 +++++ core/text/regex/virtual_machine/util.odin | 73 ++ .../virtual_machine/virtual_machine.odin | 638 ++++++++++++++++++ 16 files changed, 3700 insertions(+) create mode 100644 core/text/regex/common/common.odin create mode 100644 core/text/regex/common/debugging.odin create mode 100644 core/text/regex/compiler/compiler.odin create mode 100644 core/text/regex/compiler/debugging.odin create mode 100644 core/text/regex/compiler/doc.odin create mode 100644 core/text/regex/doc.odin create mode 100644 core/text/regex/optimizer/doc.odin create mode 100644 core/text/regex/optimizer/optimizer.odin create mode 100644 core/text/regex/parser/debugging.odin create mode 100644 core/text/regex/parser/doc.odin create mode 100644 core/text/regex/parser/parser.odin create mode 100644 core/text/regex/regex.odin create mode 100644 core/text/regex/tokenizer/tokenizer.odin create mode 100644 core/text/regex/virtual_machine/doc.odin create mode 100644 core/text/regex/virtual_machine/util.odin create mode 100644 core/text/regex/virtual_machine/virtual_machine.odin diff --git a/core/text/regex/common/common.odin 
b/core/text/regex/common/common.odin new file mode 100644 index 00000000000..f53f043a17c --- /dev/null +++ b/core/text/regex/common/common.odin @@ -0,0 +1,27 @@ +// This package helps break dependency cycles. +package regex_common + +// VM limitations +MAX_CAPTURE_GROUPS :: 10 +MAX_PROGRAM_SIZE :: int(max(i16)) +MAX_CLASSES :: int(max(u8)) + +Flag :: enum u8 { + // Global: try to match the pattern anywhere in the string. + Global, + // Multiline: treat `^` and `$` as if they also match newlines. + Multiline, + // Case Insensitive: treat `a-z` as if it was also `A-Z`. + Case_Insensitive, + // Ignore Whitespace: bypass unescaped whitespace outside of classes. + Ignore_Whitespace, + // Unicode: let the compiler and virtual machine know to expect Unicode strings. + Unicode, + + // No Capture: avoid saving capture group data entirely. + No_Capture, + // No Optimization: do not pass the pattern through the optimizer; for debugging. + No_Optimization, +} + +Flags :: bit_set[Flag; u8] diff --git a/core/text/regex/common/debugging.odin b/core/text/regex/common/debugging.odin new file mode 100644 index 00000000000..062c314ccf2 --- /dev/null +++ b/core/text/regex/common/debugging.odin @@ -0,0 +1,25 @@ +package regex_common + +@require import "core:os" +import "core:io" +import "core:strings" + +ODIN_DEBUG_REGEX :: #config(ODIN_DEBUG_REGEX, false) + +when ODIN_DEBUG_REGEX { + debug_stream := os.stream_from_handle(os.stderr) +} + +write_padded_hex :: proc(w: io.Writer, #any_int n, zeroes: int) { + sb := strings.builder_make() + defer strings.builder_destroy(&sb) + + sbw := strings.to_writer(&sb) + io.write_int(sbw, n, 0x10) + + io.write_string(w, "0x") + for _ in 0.. 
bool #no_bounds_check { + assert(q != nil) + assert(w != nil) + + if q == w { + return true + } + + if len(q.runes) != len(w.runes) || len(q.ranges) != len(w.ranges) { + return false + } + + for r, i in q.runes { + if r != w.runes[i] { + return false + } + } + + for r, i in q.ranges { + if r.lower != w.ranges[i].lower || r.upper != w.ranges[i].upper { + return false + } + } + + return true +} + +map_all_classes :: proc(tree: Node, collection: ^[dynamic]Rune_Class_Data) { + if tree == nil { + return + } + + switch specific in tree { + case ^Node_Rune: break + case ^Node_Wildcard: break + case ^Node_Anchor: break + case ^Node_Word_Boundary: break + case ^Node_Match_All_And_Escape: break + + case ^Node_Concatenation: + for subnode in specific.nodes { + map_all_classes(subnode, collection) + } + + case ^Node_Repeat_Zero: + map_all_classes(specific.inner, collection) + case ^Node_Repeat_Zero_Non_Greedy: + map_all_classes(specific.inner, collection) + case ^Node_Repeat_One: + map_all_classes(specific.inner, collection) + case ^Node_Repeat_One_Non_Greedy: + map_all_classes(specific.inner, collection) + case ^Node_Repeat_N: + map_all_classes(specific.inner, collection) + case ^Node_Optional: + map_all_classes(specific.inner, collection) + case ^Node_Optional_Non_Greedy: + map_all_classes(specific.inner, collection) + case ^Node_Group: + map_all_classes(specific.inner, collection) + + case ^Node_Alternation: + map_all_classes(specific.left, collection) + map_all_classes(specific.right, collection) + + case ^Node_Rune_Class: + unseen := true + for &value in collection { + if classes_are_exact(&specific.data, &value) { + unseen = false + break + } + } + + if unseen { + append(collection, specific.data) + } + } +} + +append_raw :: #force_inline proc(code: ^Program, data: $T) { + // NOTE: This is system-dependent endian. 
+ for b in transmute([size_of(T)]byte)data { + append(code, cast(Opcode)b) + } +} +inject_raw :: #force_inline proc(code: ^Program, start: int, data: $T) { + // NOTE: This is system-dependent endian. + for b, i in transmute([size_of(T)]byte)data { + inject_at(code, start + i, cast(Opcode)b) + } +} + +@require_results +generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) { + if node == nil { + return + } + + // NOTE: For Jump/Split arguments, we write as i16 and will reinterpret + // this later when relative jumps are turned into absolute jumps. + + switch specific in node { + // Atomic Nodes: + case ^Node_Rune: + if .Unicode not_in c.flags || specific.data < unicode.MAX_LATIN1 { + append(&code, Opcode.Byte) + append(&code, cast(Opcode)specific.data) + } else { + append(&code, Opcode.Rune) + append_raw(&code, specific.data) + } + + case ^Node_Rune_Class: + if specific.negating { + append(&code, Opcode.Rune_Class_Negated) + } else { + append(&code, Opcode.Rune_Class) + } + + index := -1 + for &data, i in c.class_data { + if classes_are_exact(&data, &specific.data) { + index = i + break + } + } + assert(index != -1, "Unable to find collected Rune_Class_Data index.") + + append(&code, Opcode(index)) + + case ^Node_Wildcard: + append(&code, Opcode.Wildcard) + + case ^Node_Anchor: + if .Multiline in c.flags { + append(&code, Opcode.Multiline_Open) + append(&code, Opcode.Multiline_Close) + } else { + if specific.start { + c.anchor_start_seen = true + append(&code, Opcode.Assert_Start) + } else { + append(&code, Opcode.Assert_End) + } + } + case ^Node_Word_Boundary: + if specific.non_word { + append(&code, Opcode.Assert_Non_Word_Boundary) + } else { + append(&code, Opcode.Assert_Word_Boundary) + } + + // Compound Nodes: + case ^Node_Group: + code = generate_code(c, specific.inner) + + if specific.capture && .No_Capture not_in c.flags { + inject_at(&code, 0, Opcode.Save) + inject_at(&code, 1, Opcode(2 * specific.capture_id)) + + append(&code, Opcode.Save) + 
append(&code, Opcode(2 * specific.capture_id + 1)) + } + + case ^Node_Alternation: + left := generate_code(c, specific.left) + right := generate_code(c, specific.right) + + left_len := len(left) + + // Avoiding duplicate allocation by reusing `left`. + code = left + + inject_at(&code, 0, Opcode.Split) + inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE)) + inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + left_len + JUMP_SIZE)) + + append(&code, Opcode.Jump) + append_raw(&code, i16(len(right) + JUMP_SIZE)) + + for opcode in right { + append(&code, opcode) + } + + case ^Node_Concatenation: + for subnode in specific.nodes { + subnode_code := generate_code(c, subnode) + for opcode in subnode_code { + append(&code, opcode) + } + } + + case ^Node_Repeat_Zero: + code = generate_code(c, specific.inner) + original_len := len(code) + + inject_at(&code, 0, Opcode.Split) + inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE)) + inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len + JUMP_SIZE)) + + append(&code, Opcode.Jump) + append_raw(&code, i16(-original_len - SPLIT_SIZE)) + + case ^Node_Repeat_Zero_Non_Greedy: + code = generate_code(c, specific.inner) + original_len := len(code) + + inject_at(&code, 0, Opcode.Split) + inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len + JUMP_SIZE)) + inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE)) + + append(&code, Opcode.Jump) + append_raw(&code, i16(-original_len - SPLIT_SIZE)) + + case ^Node_Repeat_One: + code = generate_code(c, specific.inner) + original_len := len(code) + + append(&code, Opcode.Split) + append_raw(&code, i16(-original_len)) + append_raw(&code, i16(SPLIT_SIZE)) + + case ^Node_Repeat_One_Non_Greedy: + code = generate_code(c, specific.inner) + original_len := len(code) + + append(&code, Opcode.Split) + append_raw(&code, i16(SPLIT_SIZE)) + append_raw(&code, i16(-original_len)) + + case ^Node_Repeat_N: + inside := generate_code(c, specific.inner) + 
original_len := len(inside) + + if specific.lower == specific.upper { // {N} + // e{N} ... evaluates to ... e^N + for i := 0; i < specific.upper; i += 1 { + for opcode in inside { + append(&code, opcode) + } + } + + } else if specific.lower == -1 && specific.upper > 0 { // {,M} + // e{,M} ... evaluates to ... e?^M + for i := 0; i < specific.upper; i += 1 { + append(&code, Opcode.Split) + append_raw(&code, i16(SPLIT_SIZE)) + append_raw(&code, i16(SPLIT_SIZE + original_len)) + for opcode in inside { + append(&code, opcode) + } + } + + } else if specific.lower >= 0 && specific.upper == -1 { // {N,} + // e{N,} ... evaluates to ... e^N e* + for i := 0; i < specific.lower; i += 1 { + for opcode in inside { + append(&code, opcode) + } + } + + append(&code, Opcode.Split) + append_raw(&code, i16(SPLIT_SIZE)) + append_raw(&code, i16(SPLIT_SIZE + original_len + JUMP_SIZE)) + + for opcode in inside { + append(&code, opcode) + } + + append(&code, Opcode.Jump) + append_raw(&code, i16(-original_len - SPLIT_SIZE)) + + } else if specific.lower >= 0 && specific.upper > 0 { + // e{N,M} evaluates to ... 
e^N e?^(M-N) + for i := 0; i < specific.lower; i += 1 { + for opcode in inside { + append(&code, opcode) + } + } + for i := 0; i < specific.upper - specific.lower; i += 1 { + append(&code, Opcode.Split) + append_raw(&code, i16(SPLIT_SIZE + original_len)) + append_raw(&code, i16(SPLIT_SIZE)) + for opcode in inside { + append(&code, opcode) + } + } + + } else { + panic("RegEx compiler received invalid repetition group.") + } + + case ^Node_Optional: + code = generate_code(c, specific.inner) + original_len := len(code) + + inject_at(&code, 0, Opcode.Split) + inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE)) + inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len)) + + case ^Node_Optional_Non_Greedy: + code = generate_code(c, specific.inner) + original_len := len(code) + + inject_at(&code, 0, Opcode.Split) + inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len)) + inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE)) + + case ^Node_Match_All_And_Escape: + append(&code, Opcode.Match_All_And_Escape) + } + + return +} + +@require_results +compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: [dynamic]Rune_Class_Data, err: Error) { + if tree == nil { + if .No_Capture not_in flags { + append(&code, Opcode.Save); append(&code, Opcode(0x00)) + append(&code, Opcode.Save); append(&code, Opcode(0x01)) + append(&code, Opcode.Match) + } else { + append(&code, Opcode.Match_And_Exit) + } + return + } + + c: Compiler + c.flags = flags + + map_all_classes(tree, &class_data) + if len(class_data) >= common.MAX_CLASSES { + err = .Too_Many_Classes + return + } + c.class_data = class_data + + code = generate_code(&c, tree) + + pc_open := 0 + + add_global: if .Global in flags { + // Check if the opening to the pattern is predictable. + // If so, use one of the optimized Wait opcodes. 
+ iter := virtual_machine.Opcode_Iterator{ code[:], 0 } + seek_loop: for opcode, pc in virtual_machine.iterate_opcodes(&iter) { + #partial switch opcode { + case .Byte: + inject_at(&code, pc_open, Opcode.Wait_For_Byte) + pc_open += size_of(Opcode) + inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open])) + pc_open += size_of(u8) + break add_global + + case .Rune: + operand := (cast(^rune)&code[pc+1])^ + inject_at(&code, pc_open, Opcode.Wait_For_Rune) + pc_open += size_of(Opcode) + inject_raw(&code, pc_open, operand) + pc_open += size_of(rune) + break add_global + + case .Rune_Class: + inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class) + pc_open += size_of(Opcode) + inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open])) + pc_open += size_of(u8) + break add_global + + case .Rune_Class_Negated: + inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated) + pc_open += size_of(Opcode) + inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open])) + pc_open += size_of(u8) + break add_global + + case .Save: + continue + case: + break seek_loop + } + } + + // `.*?` + inject_at(&code, pc_open, Opcode.Split) + pc_open += size_of(byte) + inject_raw(&code, pc_open, i16(SPLIT_SIZE + size_of(byte) + JUMP_SIZE)) + pc_open += size_of(i16) + inject_raw(&code, pc_open, i16(SPLIT_SIZE)) + pc_open += size_of(i16) + + inject_at(&code, pc_open, Opcode.Wildcard) + pc_open += size_of(byte) + + inject_at(&code, pc_open, Opcode.Jump) + pc_open += size_of(byte) + inject_raw(&code, pc_open, i16(-size_of(byte) - SPLIT_SIZE)) + pc_open += size_of(i16) + + } + + if .No_Capture not_in flags { + // `(` + inject_at(&code, pc_open, Opcode.Save) + inject_at(&code, pc_open + size_of(byte), Opcode(0x00)) + + // `)` + append(&code, Opcode.Save); append(&code, Opcode(0x01)) + + append(&code, Opcode.Match) + } else { + append(&code, Opcode.Match_And_Exit) + } + + if len(code) >= common.MAX_PROGRAM_SIZE { + err = .Program_Too_Big + return + } + + // NOTE: 
No further opcode addition beyond this point, as we've already + // checked the program size. Removal or transformation is fine. + + // Post-Compile Optimizations: + + // * Jump Extension + // + // A:RelJmp(1) -> B:RelJmp(2) => A:RelJmp(2) + if .No_Optimization not_in flags { + for passes_left := 1; passes_left > 0; passes_left -= 1 { + do_another_pass := false + + iter := virtual_machine.Opcode_Iterator{ code[:], 0 } + for opcode, pc in virtual_machine.iterate_opcodes(&iter) { + #partial switch opcode { + case .Jump: + jmp := cast(^i16)&code[pc+size_of(Opcode)] + if code[cast(i16)pc+jmp^] == .Jump { + next_jmp := (cast(^i16)&code[cast(i16)pc+jmp^+size_of(Opcode)])^ + jmp^ = jmp^ + next_jmp + do_another_pass = true + } + case .Split: + jmp_x := cast(^i16)&code[pc+size_of(Opcode)] + if code[cast(i16)pc+jmp_x^] == .Jump { + next_jmp := (cast(^i16)&code[cast(i16)pc+jmp_x^+size_of(Opcode)])^ + jmp_x^ = jmp_x^ + next_jmp + do_another_pass = true + } + jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)] + if code[cast(i16)pc+jmp_y^] == .Jump { + next_jmp := (cast(^i16)&code[cast(i16)pc+jmp_y^+size_of(Opcode)])^ + jmp_y^ = jmp_y^ + next_jmp + do_another_pass = true + } + } + } + + if do_another_pass { + passes_left += 1 + } + } + } + + // * Relative Jump to Absolute Jump + // + // RelJmp{PC +/- N} => AbsJmp{M} + iter := virtual_machine.Opcode_Iterator{ code[:], 0 } + for opcode, pc in virtual_machine.iterate_opcodes(&iter) { + // NOTE: The virtual machine implementation depends on this. 
+ #partial switch opcode { + case .Jump: + jmp := cast(^u16)&code[pc+size_of(Opcode)] + jmp^ = jmp^ + cast(u16)pc + case .Split: + jmp_x := cast(^u16)&code[pc+size_of(Opcode)] + jmp_x^ = jmp_x^ + cast(u16)pc + jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)] + jmp_y^ = jmp_y^ + cast(u16)pc + } + } + + return +} diff --git a/core/text/regex/compiler/debugging.odin b/core/text/regex/compiler/debugging.odin new file mode 100644 index 00000000000..1ef3e6d78ef --- /dev/null +++ b/core/text/regex/compiler/debugging.odin @@ -0,0 +1,84 @@ +package regex_compiler + +import "core:io" +import "core:text/regex/common" +import "core:text/regex/virtual_machine" + +get_jump_targets :: proc(code: []Opcode) -> (jump_targets: map[int]int) { + iter := virtual_machine.Opcode_Iterator{ code, 0 } + for opcode, pc in virtual_machine.iterate_opcodes(&iter) { + #partial switch opcode { + case .Jump: + jmp := cast(int)(cast(^u16)&code[pc+1])^ + jump_targets[jmp] = pc + case .Split: + jmp_x := cast(int)(cast(^u16)&code[pc+1])^ + jmp_y := cast(int)(cast(^u16)&code[pc+3])^ + jump_targets[jmp_x] = pc + jump_targets[jmp_y] = pc + } + } + return +} + +trace :: proc(w: io.Writer, code: []Opcode) { + jump_targets := get_jump_targets(code) + defer delete(jump_targets) + + iter := virtual_machine.Opcode_Iterator{ code, 0 } + for opcode, pc in virtual_machine.iterate_opcodes(&iter) { + if src, ok := jump_targets[pc]; ok { + io.write_string(w, "--") + common.write_padded_hex(w, src, 4) + io.write_string(w, "--> ") + } else { + io.write_string(w, " ") + } + + io.write_string(w, "[PC: ") + common.write_padded_hex(w, pc, 4) + io.write_string(w, "] ") + io.write_string(w, virtual_machine.opcode_to_name(opcode)) + io.write_byte(w, ' ') + + #partial switch opcode { + case .Byte: + operand := cast(rune)code[pc+1] + io.write_encoded_rune(w, operand) + case .Rune: + operand := (cast(^rune)&code[pc+1])^ + io.write_encoded_rune(w, operand) + case .Rune_Class, .Rune_Class_Negated: + operand := 
cast(u8)code[pc+1] + common.write_padded_hex(w, operand, 2) + case .Jump: + jmp := (cast(^u16)&code[pc+1])^ + io.write_string(w, "-> $") + common.write_padded_hex(w, jmp, 4) + case .Split: + jmp_x := (cast(^u16)&code[pc+1])^ + jmp_y := (cast(^u16)&code[pc+3])^ + io.write_string(w, "=> $") + common.write_padded_hex(w, jmp_x, 4) + io.write_string(w, ", $") + common.write_padded_hex(w, jmp_y, 4) + case .Save: + operand := cast(u8)code[pc+1] + common.write_padded_hex(w, operand, 2) + case .Wait_For_Byte: + operand := cast(rune)code[pc+1] + io.write_encoded_rune(w, operand) + case .Wait_For_Rune: + operand := (cast(^rune)&code[pc+1])^ + io.write_encoded_rune(w, operand) + case .Wait_For_Rune_Class: + operand := cast(u8)code[pc+1] + common.write_padded_hex(w, operand, 2) + case .Wait_For_Rune_Class_Negated: + operand := cast(u8)code[pc+1] + common.write_padded_hex(w, operand, 2) + } + + io.write_byte(w, '\n') + } +} diff --git a/core/text/regex/compiler/doc.odin b/core/text/regex/compiler/doc.odin new file mode 100644 index 00000000000..8c876d837b0 --- /dev/null +++ b/core/text/regex/compiler/doc.odin @@ -0,0 +1,9 @@ +/* +package regex_compiler implements a bytecode compiler for the virtual machine +included alongside it. + +Operands larger than u8 are written in system endian order. + +More details can be found in the documentation for the virtual machine. +*/ +package regex_compiler diff --git a/core/text/regex/doc.odin b/core/text/regex/doc.odin new file mode 100644 index 00000000000..8899e1af691 --- /dev/null +++ b/core/text/regex/doc.odin @@ -0,0 +1,75 @@ +/* +package regex implements a complete suite for using Regular Expressions to +match and capture text. + +Regular expressions are used to describe how a piece of text can match to +another, using a pattern language. 
+ +Odin's regex library implements the following features: + + Alternation: `apple|cherry` + Classes: `[0-9_]` + Wildcards: `.` + Repeat, optional: `a*` + Repeat, at least once: `a+` + Optional: `a?` + Group Capture: `([0-9])` + Group Non-Capture: `(?:[0-9])` + Start & End Anchors: `^hello$` + Word Boundaries: `\bhello\b` + Non-Word Boundaries: `hello\B` + +These specifiers can be composed together, such as an optional group: +`(?:hello)?` + +This package also supports the non-greedy variants of the repeating and +optional specifiers by appending a `?` to them. + + + + ``Some people, when confronted with a problem, think + "I know, I'll use regular expressions." Now they have two problems.'' + + - Jamie Zawinski + + +Regular expressions have gathered a reputation over the decades for often being +chosen as the wrong tool for the job. Here, we will clarify a few cases in +which RegEx might be good or bad. + + +**When is it a good time to use RegEx?** + +- You don't know at compile-time what patterns of text the program will need to + match when it's running. +- As an example, you are making a client which can be configured by the user to + trigger on certain text patterns received from a server. +- For another example, you need a way for users of a text editor to compose + matching strings that are more intricate than a simple substring lookup. +- The text you're matching against is small (< 64 KiB) and your patterns aren't + overly complicated with branches (alternations, repeats, and optionals). +- If none of the above general impressions apply but your project doesn't + warrant long-term maintenance. + +**When is it a bad time to use RegEx?** + +- You know at compile-time the grammar you're parsing; a hand-made parser has + the potential to be more maintainable and readable. 
+- The grammar you're parsing has certain validation steps that lend themselves to + forming complicated expressions, such as e-mail addresses, URIs, dates, + postal codes, credit cards, et cetera. Using RegEx to validate these + structures is almost always a bad sign. +- The text you're matching against is big (> 1 MiB); you would be better served + by first dividing the text into manageable chunks and using some heuristic to + locate the most likely location of a match before applying RegEx against it. +- You value high performance and low memory usage; RegEx will always have a + certain overhead which increases with the complexity of the pattern. + + +The implementation of this package has been optimized, but it will never be as +thoroughly performant as a hand-made parser. In comparison, there are just too +many intermediate steps, assumptions, and generalizations in what it takes to +handle a regular expression. + +*/ +package regex diff --git a/core/text/regex/optimizer/doc.odin b/core/text/regex/optimizer/doc.odin new file mode 100644 index 00000000000..7f2c84c8d43 --- /dev/null +++ b/core/text/regex/optimizer/doc.odin @@ -0,0 +1,58 @@ +/* +package regex_optimizer implements an optimizer which acts upon the AST of a +parsed regular expression pattern, transforming it in-place without moving to a +compilation step. + +Where possible, it aims to reduce branching as much as possible in the +expression by reducing usage of `|`. 
+ + +Here is a summary of the optimizations that it will do: + +* Class Simplification : `[aab]` => `[ab]` + `[aa]` => `[a]` + +* Class Reduction : `[a]` => `a` +* Range Construction : `[abc]` => `[a-c]` +* Rune Merging into Range : `[aa-c]` => `[a-c]` + +* Range Merging : `[a-cc-e]` => `[a-e]` + `[a-cd-e]` => `[a-e]` + `[a-cb-e]` => `[a-e]` + +* Alternation to Optional : `a|` => `a?` +* Alternation to Optional Non-Greedy : `|a` => `a??` +* Alternation Reduction : `a|a` => `a` +* Alternation to Class : `a|b` => `[ab]` +* Class Union : `[a0]|[b1]` => `[a0b1]` + `[a-b]|c` => `[a-bc]` + `a|[b-c]` => `[b-ca]` + +* Wildcard Reduction : `a|.` => `.` + `.|a` => `.` + `[ab]|.` => `.` + `.|[ab]` => `.` + +* Common Suffix Elimination : `blueberry|strawberry` => `(?:blue|straw)berry` +* Common Prefix Elimination : `abi|abe` => `ab(?:i|e)` + +* Composition: Consume All to Anchored End + `.*$` => + `.+$` => `.` + + +Possible future improvements: + +- Change the AST of alternations to be a list instead of a tree, so that + constructions such as `(ab|bb|cb)` can be considered in whole by the affix + elimination optimizations. + +- Introduce specialized opcodes for certain classes of repetition. + +- Add Common Infix Elimination. + +- Measure the precise finite minimum and maximum of a pattern, if available, + and check against that on any strings before running the virtual machine. 
+ +*/ +package regex_optimizer diff --git a/core/text/regex/optimizer/optimizer.odin b/core/text/regex/optimizer/optimizer.odin new file mode 100644 index 00000000000..fbb65cf79b4 --- /dev/null +++ b/core/text/regex/optimizer/optimizer.odin @@ -0,0 +1,522 @@ +package regex_optimizer + +import "base:intrinsics" +@require import "core:io" +import "core:slice" +import "core:text/regex/common" +import "core:text/regex/parser" + +Rune_Class_Range :: parser.Rune_Class_Range + +Node :: parser.Node +Node_Rune :: parser.Node_Rune +Node_Rune_Class :: parser.Node_Rune_Class +Node_Wildcard :: parser.Node_Wildcard +Node_Concatenation :: parser.Node_Concatenation +Node_Alternation :: parser.Node_Alternation +Node_Repeat_Zero :: parser.Node_Repeat_Zero +Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy +Node_Repeat_One :: parser.Node_Repeat_One +Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy +Node_Repeat_N :: parser.Node_Repeat_N +Node_Optional :: parser.Node_Optional +Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy +Node_Group :: parser.Node_Group +Node_Anchor :: parser.Node_Anchor +Node_Word_Boundary :: parser.Node_Word_Boundary +Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape + + +class_range_sorter :: proc(i, j: Rune_Class_Range) -> bool { + return i.lower < j.lower +} + +optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) { + if tree == nil { + return nil, 0 + } + + result = tree + + switch specific in tree { + // No direct optimization possible on these nodes: + case ^Node_Rune: break + case ^Node_Wildcard: break + case ^Node_Anchor: break + case ^Node_Word_Boundary: break + case ^Node_Match_All_And_Escape: break + + case ^Node_Concatenation: + // * Composition: Consume All to Anchored End + // + // DO: `.*$` => + // DO: `.+$` => `.` + if .Multiline not_in flags && len(specific.nodes) >= 2 { + i := len(specific.nodes) - 2 + wrza: { + subnode := 
specific.nodes[i].(^Node_Repeat_Zero) or_break wrza + _ = subnode.inner.(^Node_Wildcard) or_break wrza + next_node := specific.nodes[i+1].(^Node_Anchor) or_break wrza + if next_node.start == false { + specific.nodes[i] = new(Node_Match_All_And_Escape) + ordered_remove(&specific.nodes, i + 1) + changes += 1 + break + } + } + wroa: { + subnode := specific.nodes[i].(^Node_Repeat_One) or_break wroa + subsubnode := subnode.inner.(^Node_Wildcard) or_break wroa + next_node := specific.nodes[i+1].(^Node_Anchor) or_break wroa + if next_node.start == false { + specific.nodes[i] = subsubnode + specific.nodes[i+1] = new(Node_Match_All_And_Escape) + changes += 1 + break + } + } + } + + // Only recursive optimizations: + for i := 0; i < len(specific.nodes); i += 1 { + subnode, subnode_changes := optimize_subtree(specific.nodes[i], flags) + changes += subnode_changes + if subnode == nil { + ordered_remove(&specific.nodes, i) + i -= 1 + changes += 1 + } else { + specific.nodes[i] = subnode + } + } + + if len(specific.nodes) == 1 { + result = specific.nodes[0] + changes += 1 + } else if len(specific.nodes) == 0 { + return nil, changes + 1 + } + + case ^Node_Repeat_Zero: + specific.inner, changes = optimize_subtree(specific.inner, flags) + if specific.inner == nil { + return nil, changes + 1 + } + case ^Node_Repeat_Zero_Non_Greedy: + specific.inner, changes = optimize_subtree(specific.inner, flags) + if specific.inner == nil { + return nil, changes + 1 + } + case ^Node_Repeat_One: + specific.inner, changes = optimize_subtree(specific.inner, flags) + if specific.inner == nil { + return nil, changes + 1 + } + case ^Node_Repeat_One_Non_Greedy: + specific.inner, changes = optimize_subtree(specific.inner, flags) + if specific.inner == nil { + return nil, changes + 1 + } + case ^Node_Repeat_N: + specific.inner, changes = optimize_subtree(specific.inner, flags) + if specific.inner == nil { + return nil, changes + 1 + } + case ^Node_Optional: + specific.inner, changes = 
optimize_subtree(specific.inner, flags) + if specific.inner == nil { + return nil, changes + 1 + } + case ^Node_Optional_Non_Greedy: + specific.inner, changes = optimize_subtree(specific.inner, flags) + if specific.inner == nil { + return nil, changes + 1 + } + + case ^Node_Group: + specific.inner, changes = optimize_subtree(specific.inner, flags) + + if specific.inner == nil { + return nil, changes + 1 + } + + if !specific.capture { + result = specific.inner + changes += 1 + } + + // Full optimization: + case ^Node_Rune_Class: + // * Class Simplification + // + // DO: `[aab]` => `[ab]` + // DO: `[aa]` => `[a]` + runes_seen: map[rune]bool + + for r in specific.runes { + runes_seen[r] = true + } + + if len(runes_seen) != len(specific.runes) { + clear(&specific.runes) + for key in runes_seen { + append(&specific.runes, key) + } + changes += 1 + } + + // * Class Reduction + // + // DO: `[a]` => `a` + if !specific.negating && len(specific.runes) == 1 && len(specific.ranges) == 0 { + only_rune := specific.runes[0] + + node := new(Node_Rune) + node.data = only_rune + + return node, changes + 1 + } + + // * Range Construction + // + // DO: `[abc]` => `[a-c]` + slice.sort(specific.runes[:]) + if len(specific.runes) > 1 { + new_range: Rune_Class_Range + new_range.lower = specific.runes[0] + new_range.upper = specific.runes[0] + + for i := 1; i < len(specific.runes); i += 1 { + r := specific.runes[i] + if new_range.lower == -1 { + new_range = { r, r } + continue + } + + if r == new_range.lower - 1 { + new_range.lower -= 1 + ordered_remove(&specific.runes, i) + i -= 1 + changes += 1 + } else if r == new_range.upper + 1 { + new_range.upper += 1 + ordered_remove(&specific.runes, i) + i -= 1 + changes += 1 + } else if new_range.lower != new_range.upper { + append(&specific.ranges, new_range) + new_range = { -1, -1 } + changes += 1 + } + } + + if new_range.lower != new_range.upper { + append(&specific.ranges, new_range) + changes += 1 + } + } + + // * Rune Merging into Range + // 
+ // DO: `[aa-c]` => `[a-c]` + for range in specific.ranges { + for i := 0; i < len(specific.runes); i += 1 { + r := specific.runes[i] + if range.lower <= r && r <= range.upper { + ordered_remove(&specific.runes, i) + i -= 1 + changes += 1 + } + } + } + + // * Range Merging + // + // DO: `[a-cc-e]` => `[a-e]` + // DO: `[a-cd-e]` => `[a-e]` + // DO: `[a-cb-e]` => `[a-e]` + slice.sort_by(specific.ranges[:], class_range_sorter) + for i := 0; i < len(specific.ranges) - 1; i += 1 { + for j := i + 1; j < len(specific.ranges); j += 1 { + left_range := &specific.ranges[i] + right_range := specific.ranges[j] + + if left_range.upper == right_range.lower || + left_range.upper == right_range.lower - 1 || + left_range.lower <= right_range.lower && right_range.lower <= left_range.upper { + left_range.upper = max(left_range.upper, right_range.upper) + ordered_remove(&specific.ranges, j) + j -= 1 + changes += 1 + } else { + break + } + } + } + + if len(specific.ranges) == 0 { + specific.ranges = {} + } + if len(specific.runes) == 0 { + specific.runes = {} + } + + // * NOP + // + // DO: `[]` => + if len(specific.ranges) + len(specific.runes) == 0 { + return nil, 1 + } + + slice.sort(specific.runes[:]) + slice.sort_by(specific.ranges[:], class_range_sorter) + + case ^Node_Alternation: + // Perform recursive optimization first. 
+ left_changes, right_changes: int + specific.left, left_changes = optimize_subtree(specific.left, flags) + specific.right, right_changes = optimize_subtree(specific.right, flags) + changes += left_changes + right_changes + + // * Alternation to Optional + // + // DO: `a|` => `a?` + if specific.left != nil && specific.right == nil { + node := new(Node_Optional) + node.inner = specific.left + return node, 1 + } + + // * Alternation to Optional Non-Greedy + // + // DO: `|a` => `a??` + if specific.right != nil && specific.left == nil { + node := new(Node_Optional_Non_Greedy) + node.inner = specific.right + return node, 1 + } + + // * NOP + // + // DO: `|` => + if specific.left == nil && specific.right == nil { + return nil, 1 + } + + left_rune, left_is_rune := specific.left.(^Node_Rune) + right_rune, right_is_rune := specific.right.(^Node_Rune) + + if left_is_rune && right_is_rune { + if left_rune.data == right_rune.data { + // * Alternation Reduction + // + // DO: `a|a` => `a` + return left_rune, 1 + } else { + // * Alternation to Class + // + // DO: `a|b` => `[ab]` + node := new(Node_Rune_Class) + append(&node.runes, left_rune.data) + append(&node.runes, right_rune.data) + return node, 1 + } + } + + left_wildcard, left_is_wildcard := specific.left.(^Node_Wildcard) + right_wildcard, right_is_wildcard := specific.right.(^Node_Wildcard) + + // * Class Union + // + // DO: `[a0]|[b1]` => `[a0b1]` + left_class, left_is_class := specific.left.(^Node_Rune_Class) + right_class, right_is_class := specific.right.(^Node_Rune_Class) + if left_is_class && right_is_class { + for r in right_class.runes { + append(&left_class.runes, r) + } + for range in right_class.ranges { + append(&left_class.ranges, range) + } + return left_class, 1 + } + + // * Class Union + // + // DO: `[a-b]|c` => `[a-bc]` + if left_is_class && right_is_rune { + append(&left_class.runes, right_rune.data) + return left_class, 1 + } + + // * Class Union + // + // DO: `a|[b-c]` => `[b-ca]` + if left_is_rune && 
right_is_class { + append(&right_class.runes, left_rune.data) + return right_class, 1 + } + + // * Wildcard Reduction + // + // DO: `a|.` => `.` + if left_is_rune && right_is_wildcard { + return right_wildcard, 1 + } + + // * Wildcard Reduction + // + // DO: `.|a` => `.` + if left_is_wildcard && right_is_rune { + return left_wildcard, 1 + } + + // * Wildcard Reduction + // + // DO: `[ab]|.` => `.` + if left_is_class && right_is_wildcard { + return right_wildcard, 1 + } + + // * Wildcard Reduction + // + // DO: `.|[ab]` => `.` + if left_is_wildcard && right_is_class { + return left_wildcard, 1 + } + + left_concatenation, left_is_concatenation := specific.left.(^Node_Concatenation) + right_concatenation, right_is_concatenation := specific.right.(^Node_Concatenation) + + // * Common Suffix Elimination + // + // DO: `blueberry|strawberry` => `(?:blue|straw)berry` + if left_is_concatenation && right_is_concatenation { + // Remember that a concatenation could contain any node, not just runes. + left_len := len(left_concatenation.nodes) + right_len := len(right_concatenation.nodes) + least_len := min(left_len, right_len) + same_len := 0 + for i := 1; i <= least_len; i += 1 { + left_subrune, left_is_subrune := left_concatenation.nodes[left_len - i].(^Node_Rune) + right_subrune, right_is_subrune := right_concatenation.nodes[right_len - i].(^Node_Rune) + + if !left_is_subrune || !right_is_subrune { + // One of the nodes isn't a rune; there's nothing more we can do. + break + } + + if left_subrune.data == right_subrune.data { + same_len += 1 + } else { + // No more similarities. + break + } + } + + if same_len > 0 { + // Dissolve this alternation into a concatenation. + cat_node := new(Node_Concatenation) + group_node := new(Node_Group) + append(&cat_node.nodes, group_node) + + // Turn the concatenation into the common suffix. 
+ for i := left_len - same_len; i < left_len; i += 1 { + append(&cat_node.nodes, left_concatenation.nodes[i]) + } + + // Construct the group of alternating prefixes. + for i := same_len; i > 0; i -= 1 { + pop(&left_concatenation.nodes) + pop(&right_concatenation.nodes) + } + + // (Re-using this alternation node.) + alter_node := specific + alter_node.left = left_concatenation + alter_node.right = right_concatenation + group_node.inner = alter_node + + return cat_node, 1 + } + } + + // * Common Prefix Elimination + // + // DO: `abi|abe` => `ab(?:i|e)` + if left_is_concatenation && right_is_concatenation { + // Try to identify a common prefix. + // Remember that a concatenation could contain any node, not just runes. + least_len := min(len(left_concatenation.nodes), len(right_concatenation.nodes)) + same_len := 0 + for i := 0; i < least_len; i += 1 { + left_subrune, left_is_subrune := left_concatenation.nodes[i].(^Node_Rune) + right_subrune, right_is_subrune := right_concatenation.nodes[i].(^Node_Rune) + + if !left_is_subrune || !right_is_subrune { + // One of the nodes isn't a rune; there's nothing more we can do. + break + } + + if left_subrune.data == right_subrune.data { + same_len = i + 1 + } else { + // No more similarities. + break + } + } + + if same_len > 0 { + cat_node := new(Node_Concatenation) + for i := 0; i < same_len; i += 1 { + append(&cat_node.nodes, left_concatenation.nodes[i]) + } + for i := same_len; i > 0; i -= 1 { + ordered_remove(&left_concatenation.nodes, 0) + ordered_remove(&right_concatenation.nodes, 0) + } + + group_node := new(Node_Group) + // (Re-using this alternation node.) 
+ alter_node := specific + alter_node.left = left_concatenation + alter_node.right = right_concatenation + group_node.inner = alter_node + + append(&cat_node.nodes, group_node) + return cat_node, 1 + } + } + } + + return +} + +optimize :: proc(tree: Node, flags: common.Flags) -> (result: Node, changes: int) { + result = tree + new_changes := 0 + + when common.ODIN_DEBUG_REGEX { + io.write_string(common.debug_stream, "AST before Optimizer: ") + parser.write_node(common.debug_stream, tree) + io.write_byte(common.debug_stream, '\n') + } + + // Keep optimizing until no more changes are seen. + for { + result, new_changes = optimize_subtree(result, flags) + changes += new_changes + if new_changes == 0 { + break + } + } + + when common.ODIN_DEBUG_REGEX { + io.write_string(common.debug_stream, "AST after Optimizer: ") + parser.write_node(common.debug_stream, result) + io.write_byte(common.debug_stream, '\n') + } + + + return +} diff --git a/core/text/regex/parser/debugging.odin b/core/text/regex/parser/debugging.odin new file mode 100644 index 00000000000..4d531965c66 --- /dev/null +++ b/core/text/regex/parser/debugging.odin @@ -0,0 +1,103 @@ +package regex_parser + +import "core:io" + +write_node :: proc(w: io.Writer, node: Node) { + switch specific in node { + case ^Node_Rune: + io.write_rune(w, specific.data) + + case ^Node_Rune_Class: + io.write_byte(w, '[') + if specific.negating { + io.write_byte(w, '^') + } + for r in specific.data.runes { + io.write_rune(w, r) + } + for range in specific.data.ranges { + io.write_rune(w, range.lower) + io.write_byte(w, '-') + io.write_rune(w, range.upper) + } + io.write_byte(w, ']') + + case ^Node_Wildcard: + io.write_byte(w, '.') + + case ^Node_Concatenation: + io.write_rune(w, '「') + for subnode, i in specific.nodes { + if i != 0 { + io.write_rune(w, '⋅') + } + write_node(w, subnode) + } + io.write_rune(w, '」') + + case ^Node_Repeat_Zero: + write_node(w, specific.inner) + io.write_byte(w, '*') + case ^Node_Repeat_Zero_Non_Greedy: 
+ write_node(w, specific.inner) + io.write_string(w, "*?") + case ^Node_Repeat_One: + write_node(w, specific.inner) + io.write_byte(w, '+') + case ^Node_Repeat_One_Non_Greedy: + write_node(w, specific.inner) + io.write_string(w, "+?") + + case ^Node_Repeat_N: + write_node(w, specific.inner) + if specific.lower == 0 && specific.upper == -1 { + io.write_byte(w, '*') + } else if specific.lower == 1 && specific.upper == -1 { + io.write_byte(w, '+') + } else { + io.write_byte(w, '{') + io.write_int(w, specific.lower) + io.write_byte(w, ',') + io.write_int(w, specific.upper) + io.write_byte(w, '}') + } + + case ^Node_Alternation: + io.write_rune(w, '《') + write_node(w, specific.left) + io.write_byte(w, '|') + write_node(w, specific.right) + io.write_rune(w, '》') + + case ^Node_Optional: + io.write_rune(w, '〈') + write_node(w, specific.inner) + io.write_byte(w, '?') + io.write_rune(w, '〉') + case ^Node_Optional_Non_Greedy: + io.write_rune(w, '〈') + write_node(w, specific.inner) + io.write_string(w, "??") + io.write_rune(w, '〉') + + case ^Node_Group: + io.write_byte(w, '(') + if !specific.capture { + io.write_string(w, "?:") + } + write_node(w, specific.inner) + io.write_byte(w, ')') + + case ^Node_Anchor: + io.write_byte(w, '^' if specific.start else '$') + + case ^Node_Word_Boundary: + io.write_string(w, `\B` if specific.non_word else `\b`) + + case ^Node_Match_All_And_Escape: + io.write_string(w, "《.*$》") + + case nil: + io.write_string(w, "") + } +} diff --git a/core/text/regex/parser/doc.odin b/core/text/regex/parser/doc.odin new file mode 100644 index 00000000000..f518e518de6 --- /dev/null +++ b/core/text/regex/parser/doc.odin @@ -0,0 +1,10 @@ +/* +package regex_parser implements a Pratt parser, also known as a Top-Down +Operator Precedence parser, for parsing tokenized regular expression patterns. 
+ +References: +- https://dl.acm.org/doi/10.1145/512927.512931 +- https://tdop.github.io/ +- http://crockford.com/javascript/tdop/tdop.html +*/ +package regex_parser diff --git a/core/text/regex/parser/parser.odin b/core/text/regex/parser/parser.odin new file mode 100644 index 00000000000..1958ee39918 --- /dev/null +++ b/core/text/regex/parser/parser.odin @@ -0,0 +1,580 @@ +package regex_parser + +import "base:intrinsics" +import "core:strconv" +import "core:strings" +import "core:text/regex/common" +import "core:text/regex/tokenizer" +import "core:unicode" +import "core:unicode/utf8" + +Token :: tokenizer.Token +Token_Kind :: tokenizer.Token_Kind +Tokenizer :: tokenizer.Tokenizer + +Rune_Class_Range :: struct { + lower, upper: rune, +} +Rune_Class_Data :: struct { + runes: [dynamic]rune, + ranges: [dynamic]Rune_Class_Range, +} + + +Node_Rune :: struct { + data: rune, +} + +Node_Rune_Class :: struct { + negating: bool, + using data: Rune_Class_Data, +} + +Node_Wildcard :: struct {} + +Node_Alternation :: struct { + left, right: Node, +} + +Node_Concatenation :: struct { + nodes: [dynamic]Node, +} + +Node_Repeat_Zero :: struct { + inner: Node, +} +Node_Repeat_Zero_Non_Greedy :: struct { + inner: Node, +} +Node_Repeat_One :: struct { + inner: Node, +} +Node_Repeat_One_Non_Greedy :: struct { + inner: Node, +} + +Node_Repeat_N :: struct { + inner: Node, + lower, upper: int, +} + +Node_Optional :: struct { + inner: Node, +} +Node_Optional_Non_Greedy :: struct { + inner: Node, +} + +Node_Group :: struct { + inner: Node, + capture_id: int, + capture: bool, +} + +Node_Anchor :: struct { + start: bool, +} +Node_Word_Boundary :: struct { + non_word: bool, +} + +Node_Match_All_And_Escape :: struct {} + +Node :: union { + ^Node_Rune, + ^Node_Rune_Class, + ^Node_Wildcard, + ^Node_Concatenation, + ^Node_Alternation, + ^Node_Repeat_Zero, + ^Node_Repeat_Zero_Non_Greedy, + ^Node_Repeat_One, + ^Node_Repeat_One_Non_Greedy, + ^Node_Repeat_N, + ^Node_Optional, + 
^Node_Optional_Non_Greedy, + ^Node_Group, + ^Node_Anchor, + ^Node_Word_Boundary, + + // Optimized nodes (not created by the Parser): + ^Node_Match_All_And_Escape, +} + + +left_binding_power :: proc(kind: Token_Kind) -> int { + #partial switch kind { + case .Alternate: return 1 + case .Concatenate: return 2 + case .Repeat_Zero, .Repeat_One, + .Repeat_Zero_Non_Greedy, .Repeat_One_Non_Greedy, + .Repeat_N: return 3 + case .Optional, + .Optional_Non_Greedy: return 4 + case .Open_Paren, + .Open_Paren_Non_Capture: return 9 + } + return 0 +} + + +Expected_Token :: struct { + pos: int, + kind: Token_Kind, +} + +Invalid_Repetition :: struct { + pos: int, +} + +Invalid_Token :: struct { + pos: int, + kind: Token_Kind, +} + +Invalid_Unicode :: struct { + pos: int, +} + +Too_Many_Capture_Groups :: struct { + pos: int, +} + +Unexpected_EOF :: struct { + pos: int, +} + +Error :: union { + Expected_Token, + Invalid_Repetition, + Invalid_Token, + Invalid_Unicode, + Too_Many_Capture_Groups, + Unexpected_EOF, +} + + +Parser :: struct { + flags: common.Flags, + t: Tokenizer, + + cur_token: Token, + + groups: int, +} + + +@require_results +advance :: proc(p: ^Parser) -> Error { + p.cur_token = tokenizer.scan(&p.t) + if p.cur_token.kind == .Invalid { + return Invalid_Unicode { pos = 0 } + } + return nil +} + +expect :: proc(p: ^Parser, kind: Token_Kind) -> (err: Error) { + if p.cur_token.kind == kind { + advance(p) or_return + return + } + + return Expected_Token{ + pos = p.t.offset, + kind = kind, + } +} + +null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) { + #partial switch token.kind { + case .Rune: + r: rune + for ru in token.text { + r = ru + break + } + assert(r != 0, "Parsed an empty Rune token.") + + if .Case_Insensitive in p.flags { + lower := unicode.to_lower(r) + upper := unicode.to_upper(r) + if lower != upper { + node := new(Node_Rune_Class) + append(&node.runes, lower) + append(&node.runes, upper) + return node, nil + } + } + + node := 
new(Node_Rune) + node ^= { r } + return node, nil + + case .Rune_Class: + if len(token.text) == 0 { + return nil, nil + } + + node := new(Node_Rune_Class) + + for i := 0; i < len(token.text); /**/ { + r, size := utf8.decode_rune(token.text[i:]) + if i == 0 && r == '^' { + node.negating = true + i += size + continue + } + i += size + + assert(size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.") + + if r == '\\' { + next_r, next_size := utf8.decode_rune(token.text[i:]) + i += next_size + assert(next_size > 0, "RegEx tokenizer passed an incomplete Rune_Class to the parser.") + + // @MetaCharacter + // NOTE: These must be kept in sync with the tokenizer. + switch next_r { + case 'f': append(&node.runes, '\f') + case 'n': append(&node.runes, '\n') + case 'r': append(&node.runes, '\r') + case 't': append(&node.runes, '\t') + + case 'd': + append(&node.ranges, Rune_Class_Range{ '0', '9' }) + case 's': + append(&node.runes, '\t') + append(&node.runes, '\n') + append(&node.runes, '\f') + append(&node.runes, '\r') + append(&node.runes, ' ') + case 'w': + append(&node.ranges, Rune_Class_Range{ '0', '9' }) + append(&node.ranges, Rune_Class_Range{ 'A', 'Z' }) + append(&node.runes, '_') + append(&node.ranges, Rune_Class_Range{ 'a', 'z' }) + case 'D': + append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 }) + append(&node.ranges, Rune_Class_Range{ '9' + 1, max(rune) }) + case 'S': + append(&node.ranges, Rune_Class_Range{ 0, '\t' - 1 }) + // \t and \n are adjacent. 
+ append(&node.runes, '\x0b') // Vertical Tab + append(&node.ranges, Rune_Class_Range{ '\r' + 1, ' ' - 1 }) + append(&node.ranges, Rune_Class_Range{ ' ' + 1, max(rune) }) + case 'W': + append(&node.ranges, Rune_Class_Range{ 0, '0' - 1 }) + append(&node.ranges, Rune_Class_Range{ '9' + 1, 'A' - 1 }) + append(&node.ranges, Rune_Class_Range{ 'Z' + 1, '_' - 1 }) + append(&node.ranges, Rune_Class_Range{ '_' + 1, 'a' - 1 }) + append(&node.ranges, Rune_Class_Range{ 'z' + 1, max(rune) }) + case: + append(&node.runes, next_r) + } + continue + } + + if r == '-' && len(node.runes) > 0 { + next_r, next_size := utf8.decode_rune(token.text[i:]) + if next_size > 0 { + last := pop(&node.runes) + i += next_size + + append(&node.ranges, Rune_Class_Range{ last, next_r }) + continue + } + } + + append(&node.runes, r) + } + + if .Case_Insensitive in p.flags { + length := len(node.runes) + #no_bounds_check for i := 0; i < length; i += 1 { + r := node.runes[i] + lower := unicode.to_lower(r) + upper := unicode.to_upper(r) + + if lower != upper { + if lower != r { + append(&node.runes, lower) + } else { + append(&node.runes, upper) + } + } + } + + length = len(node.ranges) + #no_bounds_check for i := 0; i < length; i += 1 { + range := &node.ranges[i] + + min_lower := unicode.to_lower(range.lower) + max_lower := unicode.to_lower(range.upper) + + min_upper := unicode.to_upper(range.lower) + max_upper := unicode.to_upper(range.upper) + + if min_lower != min_upper && max_lower != max_upper { + range.lower = min_lower + range.upper = max_lower + append(&node.ranges, Rune_Class_Range{ min_upper, max_upper }) + } + } + } + + result = node + + case .Wildcard: + node := new(Node_Wildcard) + result = node + + case .Open_Paren: + // Because of the recursive nature of the token parser, we take the + // group number first instead of afterwards, in order to construct + // group matches from the outside in. 
+ p.groups += 1 + if p.groups == common.MAX_CAPTURE_GROUPS { + return nil, Too_Many_Capture_Groups{ pos = token.pos } + } + this_group := p.groups + + node := new(Node_Group) + node.capture = true + node.capture_id = this_group + + node.inner = parse_expression(p, 0) or_return + expect(p, .Close_Paren) or_return + result = node + case .Open_Paren_Non_Capture: + node := new(Node_Group) + node.inner = parse_expression(p, 0) or_return + expect(p, .Close_Paren) or_return + result = node + case .Close_Paren: + node := new(Node_Rune) + node ^= { ')' } + return node, nil + + case .Anchor_Start: + node := new(Node_Anchor) + node.start = true + result = node + case .Anchor_End: + node := new(Node_Anchor) + result = node + case .Word_Boundary: + node := new(Node_Word_Boundary) + result = node + case .Non_Word_Boundary: + node := new(Node_Word_Boundary) + node.non_word = true + result = node + + case .Alternate: + // A unary alternation with a left-side empty path, i.e. `|a`. + right, right_err := parse_expression(p, left_binding_power(.Alternate)) + #partial switch specific in right_err { + case Unexpected_EOF: + // This token is a NOP, i.e. `|`. + break + case nil: + break + case: + return nil, right_err + } + + node := new(Node_Alternation) + node.right = right + result = node + + case .EOF: + return nil, Unexpected_EOF{ pos = token.pos } + + case: + return nil, Invalid_Token{ pos = token.pos, kind = token.kind } + } + + return +} + +left_denotation :: proc(p: ^Parser, token: Token, left: Node) -> (result: Node, err: Error) { + #partial switch token.kind { + case .Alternate: + if p.cur_token.kind == .Close_Paren { + // `(a|)` + // parse_expression will fail, so intervene here. 
+ node := new(Node_Alternation) + node.left = left + return node, nil + } + + right, right_err := parse_expression(p, left_binding_power(.Alternate)) + + #partial switch specific in right_err { + case nil: + break + case Unexpected_EOF: + // EOF is okay in an alternation; it's an edge case in the way of + // expressing an optional such as `a|`. + break + case: + return nil, right_err + } + + node := new(Node_Alternation) + node.left = left + node.right = right + result = node + + case .Concatenate: + right := parse_expression(p, left_binding_power(.Concatenate)) or_return + + // There should be no need to check if right is Node_Concatenation, due + // to how the parsing direction works. + #partial switch specific in left { + case ^Node_Concatenation: + append(&specific.nodes, right) + result = specific + case: + node := new(Node_Concatenation) + append(&node.nodes, left) + append(&node.nodes, right) + result = node + } + + case .Repeat_Zero: + node := new(Node_Repeat_Zero) + node.inner = left + result = node + case .Repeat_Zero_Non_Greedy: + node := new(Node_Repeat_Zero_Non_Greedy) + node.inner = left + result = node + case .Repeat_One: + node := new(Node_Repeat_One) + node.inner = left + result = node + case .Repeat_One_Non_Greedy: + node := new(Node_Repeat_One_Non_Greedy) + node.inner = left + result = node + + case .Repeat_N: + node := new(Node_Repeat_N) + node.inner = left + + comma := strings.index_byte(token.text, ',') + + switch comma { + case -1: // {N} + exact, ok := strconv.parse_u64_of_base(token.text, base = 10) + if !ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + if exact == 0 { + return nil, Invalid_Repetition{ pos = token.pos } + } + + node.lower = cast(int)exact + node.upper = cast(int)exact + + case 0: // {,M} + upper, ok := strconv.parse_u64_of_base(token.text[1:], base = 10) + if !ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + if upper == 0 { + return nil, Invalid_Repetition{ pos = token.pos } + } + + node.lower = 
-1 + node.upper = cast(int)upper + + case len(token.text) - 1: // {N,} + lower, ok := strconv.parse_u64_of_base(token.text[:comma], base = 10) + if !ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + + node.lower = cast(int)lower + node.upper = -1 + + case: // {N,M} + lower, lower_ok := strconv.parse_u64_of_base(token.text[:comma], base = 10) + if !lower_ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + upper, upper_ok := strconv.parse_u64_of_base(token.text[comma+1:], base = 10) + if !upper_ok { + return nil, Invalid_Repetition{ pos = token.pos } + } + if lower > upper { + return nil, Invalid_Repetition{ pos = token.pos } + } + if upper == 0 { + return nil, Invalid_Repetition{ pos = token.pos } + } + + node.lower = cast(int)lower + node.upper = cast(int)upper + } + + result = node + + case .Optional: + node := new(Node_Optional) + node.inner = left + result = node + case .Optional_Non_Greedy: + node := new(Node_Optional_Non_Greedy) + node.inner = left + result = node + + case .EOF: + return nil, Unexpected_EOF{ pos = token.pos } + + case: + return nil, Invalid_Token{ pos = token.pos, kind = token.kind } + } + + return +} + +parse_expression :: proc(p: ^Parser, rbp: int) -> (result: Node, err: Error) { + token := p.cur_token + + advance(p) or_return + left := null_denotation(p, token) or_return + + token = p.cur_token + for rbp < left_binding_power(token.kind) { + advance(p) or_return + left = left_denotation(p, token, left) or_return + token = p.cur_token + } + + return left, nil +} + +parse :: proc(str: string, flags: common.Flags) -> (result: Node, err: Error) { + if len(str) == 0 { + node := new(Node_Group) + return node, nil + } + + p: Parser + p.flags = flags + + tokenizer.init(&p.t, str, flags) + + p.cur_token = tokenizer.scan(&p.t) + if p.cur_token.kind == .Invalid { + return nil, Invalid_Unicode { pos = 0 } + } + + node := parse_expression(&p, 0) or_return + result = node + + return +} diff --git a/core/text/regex/regex.odin 
b/core/text/regex/regex.odin
new file mode 100644
index 00000000000..1736f2305c7
--- /dev/null
+++ b/core/text/regex/regex.odin
@@ -0,0 +1,434 @@
+package regex
+
+import "core:text/regex/common"
+import "core:text/regex/compiler"
+import "core:text/regex/optimizer"
+import "core:text/regex/parser"
+import "core:text/regex/virtual_machine"
+
+Flag :: common.Flag
+Flags :: common.Flags
+Parser_Error :: parser.Error
+Compiler_Error :: compiler.Error
+
+Creation_Error :: enum {
+	None,
+	Bad_Delimiter,
+	Expected_Delimiter,
+	Unknown_Flag,
+}
+
+Error :: union #shared_nil {
+	Parser_Error,
+	Compiler_Error,
+	Creation_Error,
+}
+// Match results: `pos[i]` is the {start, end} byte range of `groups[i]` in the matched string.
+Capture :: struct {
+	pos: [][2]int,
+	groups: []string,
+}
+// A compiled pattern: the VM program, its rune-class data, and the flags it was built with.
+Regular_Expression :: struct {
+	original_pattern: string,
+	flags: Flags,
+	class_data: []virtual_machine.Rune_Class_Data,
+	program: []virtual_machine.Opcode `fmt:"-"`,
+}
+
+// One letter per flag; these are the same letters `create_by_user` accepts.
+@(rodata)
+Flag_To_Letter := #sparse[Flag]u8 {
+	.Global = 'g',
+	.Multiline = 'm',
+	.Case_Insensitive = 'i',
+	.Ignore_Whitespace = 'x',
+	.Unicode = 'u',
+	.No_Capture = 'n',
+	.No_Optimization = '-',
+}
+
+/*
+Create a regular expression from a string pattern and a set of flags.
+
+*Allocates Using Provided Allocators*
+
+Inputs:
+- pattern: The pattern to compile.
+- flags: A `bit_set` of RegEx flags.
+- permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
+- temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)
+
+Returns:
+- result: The regular expression.
+- err: An error, if one occurred.
+*/
+@require_results
+create :: proc(
+	pattern: string,
+	flags: Flags = {},
+	permanent_allocator := context.allocator,
+	temporary_allocator := context.temp_allocator,
+) -> (result: Regular_Expression, err: Error) {
+
+	// For the sake of speed and simplicity, we first run all the intermediate
+	// processes such as parsing and compilation through the temporary
+	// allocator.
+	program: [dynamic]virtual_machine.Opcode = ---
+	class_data: [dynamic]parser.Rune_Class_Data = ---
+	{
+		context.allocator = temporary_allocator
+		// Pipeline: parse, optionally optimize, then compile. Parse and compile errors return early.
+		ast := parser.parse(pattern, flags) or_return
+
+		if .No_Optimization not_in flags {
+			ast, _ = optimizer.optimize(ast, flags)
+		}
+
+		program, class_data = compiler.compile(ast, flags) or_return
+	}
+
+	// When that's successful, re-allocate all at once with the permanent
+	// allocator so everything can be tightly packed.
+	context.allocator = permanent_allocator
+	// NOTE: `original_pattern` aliases the caller's string; no copy is made here.
+	result.original_pattern = pattern
+	result.flags = flags
+	// Copy every rune class; empty rune or range lists are left as zero-value slices.
+	if len(class_data) > 0 {
+		result.class_data = make([]virtual_machine.Rune_Class_Data, len(class_data))
+	}
+	for data, i in class_data {
+		if len(data.runes) > 0 {
+			result.class_data[i].runes = make([]rune, len(data.runes))
+			copy(result.class_data[i].runes, data.runes[:])
+		}
+		if len(data.ranges) > 0 {
+			result.class_data[i].ranges = make([]virtual_machine.Rune_Class_Range, len(data.ranges))
+			copy(result.class_data[i].ranges, data.ranges[:])
+		}
+	}
+	// Copy the compiled program into a slice of exactly the needed length.
+	result.program = make([]virtual_machine.Opcode, len(program))
+	copy(result.program, program[:])
+
+	return
+}
+
+/*
+Create a regular expression from a delimited string pattern, such as one
+provided by users of a program or those found in a configuration file.
+
+They are in the form of:
+
+	[DELIMITER] [regular expression] [DELIMITER] [flags]
+
+For example, the following strings are valid:
+
+	/hellope/i
+	#hellope#i
+	•hellope•i
+	つhellopeつi
+
+The delimiter is determined by the very first rune in the string.
+The only restriction is that the delimiter cannot be `\`, as that rune is used
+to escape the delimiter if found in the middle of the string.
+
+All runes after the closing delimiter will be parsed as flags:
+
+- 'g': Global
+- 'm': Multiline
+- 'i': Case_Insensitive
+- 'x': Ignore_Whitespace
+- 'u': Unicode
+- 'n': No_Capture
+- '-': No_Optimization
+
+
+*Allocates Using Provided Allocators*
+
+Inputs:
+- pattern: The delimited pattern with optional flags to compile.
+- permanent_allocator: The allocator to use for the final regular expression. (default: context.allocator)
+- temporary_allocator: The allocator to use for the intermediate compilation stages. (default: context.temp_allocator)
+
+Returns:
+- result: The regular expression.
+- err: An error, if one occurred: `.Expected_Delimiter`, `.Bad_Delimiter`,
+  `.Unknown_Flag`, or whatever `create` returns for the inner pattern.
+*/
+@require_results
+create_by_user :: proc(
+	pattern: string,
+	permanent_allocator := context.allocator,
+	temporary_allocator := context.temp_allocator,
+) -> (result: Regular_Expression, err: Error) {
+
+	if len(pattern) == 0 {
+		err = .Expected_Delimiter
+		return
+	}
+	// `start` and `end` are byte offsets bounding the inner pattern; -1 means "not found yet".
+	delimiter: rune
+	start := -1
+	end := -1
+
+	flags: Flags
+
+	escaping: bool
+	parse_loop: for r, i in pattern {
+		if delimiter == 0 {
+			if r == '\\' {
+				err = .Bad_Delimiter
+				return
+			}
+			delimiter = r
+			continue parse_loop
+		}
+		// The first rune after the opening delimiter marks where the inner pattern begins.
+		if start == -1 {
+			start = i
+		}
+		// The rune right after a backslash is never treated as the closing delimiter.
+		if escaping {
+			escaping = false
+			continue parse_loop
+		}
+
+		switch r {
+		case '\\':
+			escaping = true
+		case delimiter:
+			end = i
+			break parse_loop
+		}
+	}
+	// The loop ended without meeting an unescaped closing delimiter.
+	if end == -1 {
+		err = .Expected_Delimiter
+		return
+	}
+
+	// `start` is also the size of the delimiter, which is why it's being added
+	// to `end` here.
+	for r in pattern[start + end:] {
+		switch r {
+		case 'g': flags += { .Global }
+		case 'm': flags += { .Multiline }
+		case 'i': flags += { .Case_Insensitive }
+		case 'x': flags += { .Ignore_Whitespace }
+		case 'u': flags += { .Unicode }
+		case 'n': flags += { .No_Capture }
+		case '-': flags += { .No_Optimization }
+		case:
+			err = .Unknown_Flag
+			return
+		}
+	}
+	// The inner pattern is handed to `create` verbatim, escape sequences included.
+	return create(pattern[start:end], flags, permanent_allocator, temporary_allocator)
+}
+
+/*
+Match a regular expression against a string and allocate the results into the
+returned `capture` structure.
+
+The resulting capture strings will be slices to the string `str`, not wholly
+copied strings, so they won't need to be individually deleted.
+
+*Allocates Using Provided Allocators*
+
+Inputs:
+- regex: The regular expression.
+- str: The string to match against.
+- permanent_allocator: The allocator to use for the capture results. (default: context.allocator)
+- temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)
+
+Returns:
+- capture: The capture groups found in the string.
+- success: True if the regex matched the string.
+*/
+@require_results
+match_and_allocate_capture :: proc(
+	regex: Regular_Expression,
+	str: string,
+	permanent_allocator := context.allocator,
+	temporary_allocator := context.temp_allocator,
+) -> (capture: Capture, success: bool) {
+
+	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
+	// Build and run the VM with `temporary_allocator` installed as `context.allocator`.
+	{
+		context.allocator = temporary_allocator
+
+		vm := virtual_machine.create(regex.program, str)
+		vm.class_data = regex.class_data
+		// NOTE(review): `run` gets a literal in each branch; presumably the flag must be a constant - confirm.
+		if .Unicode in regex.flags {
+			saved, success = virtual_machine.run(&vm, true)
+		} else {
+			saved, success = virtual_machine.run(&vm, false)
+		}
+	}
+	// When `saved` is nil, nothing is allocated and `capture` stays zero-valued.
+	if saved != nil {
+		context.allocator = permanent_allocator
+		// Pass 1: count the offset pairs in which neither side is -1.
+		num_groups := 0
+		for i := 0; i < len(saved); i += 2 {
+			a, b := saved[i], saved[i + 1]
+			if a == -1 || b == -1 {
+				continue
+			}
+			num_groups += 1
+		}
+		// Pass 2: allocate exactly that many entries and fill them in order of appearance.
+		if num_groups > 0 {
+			capture.groups = make([]string, num_groups)
+			capture.pos = make([][2]int, num_groups)
+			n := 0
+
+			#no_bounds_check for i := 0; i < len(saved); i += 2 {
+				a, b := saved[i], saved[i + 1]
+				if a == -1 || b == -1 {
+					continue
+				}
+
+				capture.groups[n] = str[a:b]
+				capture.pos[n] = {a, b}
+				n += 1
+			}
+		}
+	}
+
+	return
+}
+
+/*
+Match a regular expression against a string and save the capture results into
+the provided `capture` structure.
+
+The resulting capture strings will be slices to the string `str`, not wholly
+copied strings, so they won't need to be individually deleted.
+
+*Allocates Using Provided Allocator*
+
+Inputs:
+- regex: The regular expression.
+- str: The string to match against.
+- capture: A pointer to a Capture structure with `groups` and `pos` already allocated.
+- temporary_allocator: The allocator to use for the virtual machine. (default: context.temp_allocator)
+
+Returns:
+- num_groups: The number of capture groups set into `capture`.
+- success: True if the regex matched the string.
+*/
+@require_results
+match_with_preallocated_capture :: proc(
+	regex: Regular_Expression,
+	str: string,
+	capture: ^Capture,
+	temporary_allocator := context.temp_allocator,
+) -> (num_groups: int, success: bool) {
+
+	assert(capture != nil, "Pre-allocated RegEx capture must not be nil.")
+	assert(len(capture.groups) >= common.MAX_CAPTURE_GROUPS,
+		"Pre-allocated RegEx capture `groups` must be at least 10 elements long.")
+	assert(len(capture.pos) >= common.MAX_CAPTURE_GROUPS,
+		"Pre-allocated RegEx capture `pos` must be at least 10 elements long.")
+
+	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int
+	// Build and run the VM with `temporary_allocator` installed as `context.allocator`.
+	{
+		context.allocator = temporary_allocator
+
+		vm := virtual_machine.create(regex.program, str)
+		vm.class_data = regex.class_data
+
+		if .Unicode in regex.flags {
+			saved, success = virtual_machine.run(&vm, true)
+		} else {
+			saved, success = virtual_machine.run(&vm, false)
+		}
+	}
+
+	if saved != nil {
+		// Count straight into the named result `num_groups` (it starts at zero) so the
+		// value returned to the caller equals the number of entries written below.
+		#no_bounds_check for i := 0; i < len(saved); i += 2 {
+			a, b := saved[i], saved[i + 1]
+			if a == -1 || b == -1 {
+				continue
+			}
+
+			capture.groups[num_groups] = str[a:b]
+			capture.pos[num_groups] = {a, b}
+			num_groups += 1
+		}
+	}
+
+	return
+}
+
+match :: proc {
+	match_and_allocate_capture,
+	match_with_preallocated_capture,
+}
+
+/*
+Allocate a `Capture` in advance for use with `match`. This can save some time
+if you plan on performing several matches at once and only need the results
+between matches.
+
+Inputs:
+- allocator: (default: context.allocator)
+
+Returns:
+- result: The `Capture` with the maximum number of groups allocated.
+*/
+@require_results
+preallocate_capture :: proc(allocator := context.allocator) -> (result: Capture) {
+	context.allocator = allocator
+	result.pos = make([][2]int, common.MAX_CAPTURE_GROUPS)
+	result.groups = make([]string, common.MAX_CAPTURE_GROUPS)
+	return
+}
+
+/*
+Free all data allocated by the `create*` procedures.
+
+*Frees Using Provided Allocator*
+
+Inputs:
+- regex: A regular expression.
+- allocator: (default: context.allocator)
+*/
+destroy_regex :: proc(regex: Regular_Expression, allocator := context.allocator) {
+	context.allocator = allocator
+	delete(regex.program)
+	for data in regex.class_data {
+		delete(data.runes)
+		delete(data.ranges)
+	}
+	delete(regex.class_data)
+}
+
+/*
+Free all data allocated by the `match_and_allocate_capture` procedure.
+
+*Frees Using Provided Allocator*
+
+Inputs:
+- capture: A Capture.
+- allocator: (default: context.allocator)
+*/
+destroy_capture :: proc(capture: Capture, allocator := context.allocator) {
+	context.allocator = allocator
+	delete(capture.groups)
+	delete(capture.pos)
+}
+
+destroy :: proc {
+	destroy_regex,
+	destroy_capture,
+}
diff --git a/core/text/regex/tokenizer/tokenizer.odin b/core/text/regex/tokenizer/tokenizer.odin
new file mode 100644
index 00000000000..2702c543409
--- /dev/null
+++ b/core/text/regex/tokenizer/tokenizer.odin
@@ -0,0 +1,349 @@
+package regex_tokenizer
+
+import "core:text/regex/common"
+import "core:unicode/utf8"
+
+// Every kind of token `scan` can produce.
+Token_Kind :: enum {
+	Invalid,
+	EOF,
+
+	Rune,
+	Wildcard,
+
+	Alternate,
+
+	// Never present in the pattern text; synthesized by `scan` between
+	// adjacent operands.
+	Concatenate,
+
+	Repeat_Zero,
+	Repeat_Zero_Non_Greedy,
+	Repeat_One,
+	Repeat_One_Non_Greedy,
+
+	Repeat_N,
+
+	Optional,
+	Optional_Non_Greedy,
+
+	Rune_Class,
+
+	Open_Paren,
+	Open_Paren_Non_Capture,
+	Close_Paren,
+
+	Anchor_Start,
+	Anchor_End,
+
+	Word_Boundary,
+	Non_Word_Boundary,
+}
+
+// A single token: its kind, its literal text (where it has any), and the byte
+// offset in the pattern at which it was scanned.
+Token :: struct {
+	kind: Token_Kind,
+	text: string,
+	pos: int,
+}
+
+Tokenizer :: struct {
+	flags: common.Flags,
+	src: string,
+
+	// Current rune (-1 at end of input), its byte offset, and the byte offset
+	// of the rune that follows it.
+	ch: rune,
+	offset: int,
+	read_offset: int,
+
+	last_token_kind: Token_Kind,
+	// Token deferred by `scan` while it returns a synthesized Concatenate.
+	held_token: Token,
+	// Sticky: once set, `advance_rune` stops advancing until `scan` clears it.
+	error_state: Error,
+	paren_depth: int,
+}
+
+Error :: enum {
+	None,
+	Illegal_Null_Character,
+	Illegal_Codepoint,
+	Illegal_Byte_Order_Mark,
+}
+
+// Prepare `t` to tokenize `str` and load the first rune.
+init :: proc(t: ^Tokenizer, str: string, flags: common.Flags) {
+	t.src = str
+	t.flags = flags
+	t.error_state = advance_rune(t)
+}
+
+// Return the byte `offset` bytes past the read position without consuming it,
+// or 0 when that position lies beyond the end of the source.
+peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
+	if t.read_offset+offset < len(t.src) {
+		return t.src[t.read_offset+offset]
+	}
+	return 0
+}
+
+// Decode the next rune into `t.ch`, setting `t.ch` to -1 at end of input.
+//
+// A decoding error is stored in `t.error_state`. While that state is set this
+// procedure does nothing but return it, so any loop built on `advance_rune`
+// must check `t.error_state` in order to terminate.
+advance_rune :: proc(t: ^Tokenizer) -> (err: Error) {
+	if t.error_state != nil {
+		return t.error_state
+	}
+
+	if t.read_offset < len(t.src) {
+		t.offset = t.read_offset
+		r, w := rune(t.src[t.read_offset]), 1
+		switch {
+		case r == 0:
+			err = .Illegal_Null_Character
+		case r >= utf8.RUNE_SELF:
+			r, w = utf8.decode_rune(t.src[t.read_offset:])
+			if r == utf8.RUNE_ERROR && w == 1 {
+				err = .Illegal_Codepoint
+			} else if r == utf8.RUNE_BOM && t.offset > 0 {
+				err = .Illegal_Byte_Order_Mark
+			}
+		}
+		t.read_offset += w
+		t.ch = r
+	} else {
+		t.offset = len(t.src)
+		t.ch = -1
+	}
+
+	t.error_state = err
+
+	return
+}
+
+// Scan the body of a `[...]` class, returning the text between the brackets.
+// Fails on end of input or on a decoding error. A backslash skips the rune
+// after it, so an escaped `]` does not close the class.
+@require_results
+scan_class :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
+	start := t.read_offset
+
+	for {
+		advance_rune(t)
+		if t.ch == -1 || t.error_state != nil {
+			return "", false
+		}
+
+		if t.ch == '\\' {
+			advance_rune(t)
+			continue
+		}
+
+		if t.ch == ']' {
+			return t.src[start:t.offset], true
+		}
+	}
+
+	unreachable()
+}
+
+// Scan the body of a `{...}` repetition, returning the text between the
+// braces. Fails on end of input or on a decoding error.
+@require_results
+scan_repeat :: proc(t: ^Tokenizer) -> (str: string, ok: bool) {
+	start := t.read_offset
+
+	for {
+		advance_rune(t)
+		// The error check is required for termination: after an error,
+		// `advance_rune` no longer moves, so `t.ch` would never become -1
+		// or '}' and this loop would spin forever.
+		if t.ch == -1 || t.error_state != nil {
+			return "", false
+		}
+		if t.ch == '}' {
+			return t.src[start:t.offset], true
+		}
+	}
+
+	unreachable()
+}
+
+// Consume a trailing `?` if present, reporting whether the preceding
+// quantifier is non-greedy.
+@require_results
+scan_non_greedy :: proc(t: ^Tokenizer) -> bool {
+	if peek_byte(t) == '?' {
+		advance_rune(t)
+		return true
+	}
+
+	return false
+}
+
+// Skip a `#` comment up to and including its terminating newline, or up to
+// the end of input. On a decoding error it returns immediately and leaves
+// `t.error_state` set for `scan` to report.
+scan_comment :: proc(t: ^Tokenizer) {
+	for {
+		advance_rune(t)
+		if t.error_state != nil {
+			// `advance_rune` is stuck until the error is cleared; without
+			// this return the loop below would never terminate.
+			return
+		}
+		switch t.ch {
+		case -1:
+			return
+		case '\n':
+			// UNIX newline.
+			advance_rune(t)
+			return
+		case '\r':
+			// Mac newline.
+			advance_rune(t)
+			if t.ch == '\n' {
+				// Windows newline.
+				advance_rune(t)
+			}
+			return
+		}
+	}
+}
+
+// Consume `?:` directly after `(` if present, reporting whether the group is
+// non-capturing.
+@require_results
+scan_non_capture_group :: proc(t: ^Tokenizer) -> bool {
+	if peek_byte(t) == '?' && peek_byte(t, 1) == ':' {
+		advance_rune(t)
+		advance_rune(t)
+		return true
+	}
+
+	return false
+}
+
+// Return the next token of the pattern.
+//
+// A pending tokenizer error is reported as one `.Invalid` token and then
+// cleared. Whenever concatenation is implied between the previous token and
+// the one just scanned, a `.Concatenate` token is returned first and the real
+// token is held back for the following call.
+@require_results
+scan :: proc(t: ^Tokenizer) -> (token: Token) {
+	kind: Token_Kind
+	lit: string
+	pos := t.offset
+
+	// Record what is actually returned, including synthesized tokens.
+	defer {
+		t.last_token_kind = token.kind
+	}
+
+	if t.error_state != nil {
+		t.error_state = nil
+		return { .Invalid, "", pos }
+	}
+
+	if t.held_token != {} {
+		popped := t.held_token
+		t.held_token = {}
+
+		return popped
+	}
+
+	ch_loop: for {
+		switch t.ch {
+		case -1:
+			return { .EOF, "", pos }
+
+		case '\\':
+			advance_rune(t)
+
+			if t.ch == -1 {
+				return { .EOF, "", pos }
+			}
+
+			pos = t.offset
+
+			// @MetaCharacter
+			// NOTE: These must be kept in sync with the compiler.
+			DIGIT_CLASS :: "0-9"
+			SPACE_CLASS :: "\t\n\f\r "
+			WORD_CLASS :: "0-9A-Z_a-z"
+
+			switch t.ch {
+			case 'b': kind = .Word_Boundary
+			case 'B': kind = .Non_Word_Boundary
+
+			case 'f': kind = .Rune; lit = "\f"
+			case 'n': kind = .Rune; lit = "\n"
+			case 'r': kind = .Rune; lit = "\r"
+			case 't': kind = .Rune; lit = "\t"
+
+			case 'd': kind = .Rune_Class; lit = DIGIT_CLASS
+			case 's': kind = .Rune_Class; lit = SPACE_CLASS
+			case 'w': kind = .Rune_Class; lit = WORD_CLASS
+			case 'D': kind = .Rune_Class; lit = "^" + DIGIT_CLASS
+			case 'S': kind = .Rune_Class; lit = "^" + SPACE_CLASS
+			case 'W': kind = .Rune_Class; lit = "^" + WORD_CLASS
+			case:
+				// Any other escaped rune stands for itself.
+				kind = .Rune
+				lit = t.src[t.offset:t.read_offset]
+			}
+
+		case '.':
+			kind = .Wildcard
+
+		case '|': kind = .Alternate
+
+		case '*': kind = .Repeat_Zero_Non_Greedy if scan_non_greedy(t) else .Repeat_Zero
+		case '+': kind = .Repeat_One_Non_Greedy if scan_non_greedy(t) else .Repeat_One
+		case '?': kind = .Optional_Non_Greedy if scan_non_greedy(t) else .Optional
+
+		case '[':
+			if text, ok := scan_class(t); ok {
+				kind = .Rune_Class
+				lit = text
+			} else {
+				return { .EOF, "", pos }
+			}
+
+		case '{':
+			if text, ok := scan_repeat(t); ok {
+				kind = .Repeat_N
+				lit = text
+			} else {
+				return { .EOF, "", pos }
+			}
+
+		case '(':
+			kind = .Open_Paren_Non_Capture if scan_non_capture_group(t) else .Open_Paren
+			t.paren_depth += 1
+		case ')':
+			kind = .Close_Paren
+			t.paren_depth -= 1
+
+		case '^': kind = .Anchor_Start
+		case '$':
+			kind = .Anchor_End
+
+		case:
+			if .Ignore_Whitespace in t.flags {
+				switch t.ch {
+				case ' ', '\r', '\n', '\t', '\f':
+					advance_rune(t)
+					continue ch_loop
+				case:
+					break
+				}
+			}
+			// `#` only starts a comment outside of any parenthesized group.
+			if t.ch == '#' && t.paren_depth == 0 {
+				scan_comment(t)
+				continue ch_loop
+			}
+
+			kind = .Rune
+			lit = t.src[t.offset:t.read_offset]
+		}
+
+		break ch_loop
+	}
+
+	if t.error_state != nil {
+		t.error_state = nil
+		return { .Invalid, "", pos }
+	}
+
+	advance_rune(t)
+
+	// The following set of rules dictate where Concatenate tokens are
+	// automatically inserted.
+	#partial switch kind {
+	case
+		.Close_Paren,
+		.Alternate,
+		.Optional, .Optional_Non_Greedy,
+		.Repeat_Zero, .Repeat_Zero_Non_Greedy,
+		.Repeat_One, .Repeat_One_Non_Greedy,
+		.Repeat_N:
+		// Never prepend a Concatenate before these tokens.
+		break
+	case:
+		#partial switch t.last_token_kind {
+		case
+			.Invalid,
+			.Open_Paren, .Open_Paren_Non_Capture,
+			.Alternate:
+			// Never prepend a Concatenate token when the _last token_ was one
+			// of these.
+			break
+		case:
+			t.held_token = { kind, lit, pos }
+			return { .Concatenate, "", pos }
+		}
+	}
+
+	return { kind, lit, pos }
+}
diff --git a/core/text/regex/virtual_machine/doc.odin b/core/text/regex/virtual_machine/doc.odin
new file mode 100644
index 00000000000..1b06945654a
--- /dev/null
+++ b/core/text/regex/virtual_machine/doc.odin
@@ -0,0 +1,175 @@
+/*
+package regex_vm implements a threaded virtual machine for interpreting
+regular expressions, based on the designs described by Russ Cox and attributed
+to both Ken Thompson and Rob Pike.
+
+The virtual machine executes all threads in lock step, i.e. the string pointer
+does not advance until all threads have finished processing the current rune.
+The algorithm does not look backwards.
+
+Threads merge when splitting or jumping to positions already visited by another
+thread, based on the observation that any thread arriving at a PC (Program
+Counter) already visited by another thread will execute identically to that
+earlier thread.
+
+Each thread keeps a save state of its capture groups, and thread priority is
+used to allow higher precedence operations to complete first with correct save
+states, such as greedy versus non-greedy repetition.
+
+For more information, see: https://swtch.com/~rsc/regexp/regexp2.html
+
+
+**Implementation Details:**
+
+- Each opcode is 8 bits in size, and most instructions have no operands.
+
+- All operands larger than `u8` are read in system endian order.
+
+- Jump and Split instructions operate on absolute positions in `u16` operands.
+
+- Classes such as `[0-9]` are stored in a RegEx-specific slice of structs which
+  are then dereferenced by a `u8` index from the `Rune_Class` instructions.
+
+- Byte and Rune opcodes have their operands stored inline after the opcode,
+  sized `u8` and `i32` respectively.
+
+- A bitmap is used to determine which PC positions are occupied by a thread to
+  perform merging. The bitmap is cleared with every new frame.
+
+- The VM supports two modes: ASCII and Unicode, decided by a compile-time
+  boolean constant argument provided to `run`. The procedure differs only in
+  string decoding. This was done for the sake of performance.
+
+- No allocations are ever freed; the VM expects an arena or temporary allocator
+  to be used in the context preceding it.
+
+
+**Opcode Reference:**
+
+	(0x00) Match
+
+	The terminal opcode which ends a thread. This always comes at the end of
+	the program.
+
+	(0x01) Match_And_Exit
+
+	A modified version of Match which stops the virtual machine entirely. It is
+	only compiled for `No_Capture` expressions, as those expressions do not
+	need to determine which thread may have saved the most appropriate capture
+	groups.
+ + (0x02) Byte + + Consumes one byte from the text using its operand, which is also a byte. + + (0x03) Rune + + Consumes one Unicode codepoint from the text using its operand, which is + four bytes long in a system-dependent endian order. + + (0x04) Rune_Class + + Consumes one character (which may be an ASCII byte or Unicode codepoint, + wholly dependent on which mode the virtual machine is running in) from the + text. + + The actual data storing what runes and ranges of runes apply to the class + are stored alongside the program in the Regular_Expression structure and + the operand for this opcode is a single byte which indexes into a + collection of these data structures. + + (0x05) Rune_Class_Negated + + A modified version of Rune_Class that functions the same, save for how it + returns the opposite of what Rune_Class matches. + + (0x06) Wildcard + + Consumes one byte or one Unicode codepoint, depending on the VM mode. + + (0x07) Jump + + Sets the Program Counter of a VM thread to the operand, which is a u16. + This opcode is used to implement Alternation (coming at the end of the left + choice) and Repeat_Zero (to cause the thread to loop backwards). + + (0x08) Split + + Spawns a new thread for the X operand and causes the current thread to jump + to the Y operand. This opcode is used to implement Alternation, all the + Repeat variations, and the Optional nodes. + + Splitting threads is how the virtual machine is able to execute optional + control flow paths, letting it evaluate different possible ways to match + text. + + (0x09) Save + + Saves the current string index to a slot on the thread dictated by the + operand. These values will be used later to reconstruct capture groups. + + (0x0A) Assert_Start + + Asserts that the thread is at the beginning of a string. + + (0x0B) Assert_End + + Asserts that the thread is at the end of a string. 
+
+	(0x0C) Assert_Word_Boundary
+
+	Asserts that the thread is on a word boundary, which can be the start or
+	end of the text. This examines both the current rune and the next rune.
+
+	(0x0D) Assert_Non_Word_Boundary
+
+	A modified version of Assert_Word_Boundary that returns the opposite value.
+
+	(0x0E) Multiline_Open
+
+	This opcode is compiled in only when the `Multiline` flag is present, and
+	it replaces both `^` and `$` text anchors.
+
+	It asserts that either the current thread is on one of the string
+	boundaries, or it consumes a `\n` or `\r` character.
+
+	If a `\r` character is consumed, the PC will be advanced to the sibling
+	`Multiline_Close` opcode to optionally consume a `\n` character on the next
+	frame.
+
+	(0x0F) Multiline_Close
+
+	This opcode is always present after `Multiline_Open`.
+
+	It handles consuming the second half of a complete newline, if necessary.
+	For example, Windows newlines are represented by the characters `\r\n`,
+	whereas UNIX newlines are `\n` and Macintosh newlines are `\r`.
+
+	(0x10) Wait_For_Byte
+	(0x11) Wait_For_Rune
+	(0x12) Wait_For_Rune_Class
+	(0x13) Wait_For_Rune_Class_Negated
+
+	These opcodes are an optimization around restarting threads on failed
+	matches when the beginning of a pattern is predictable and the Global flag
+	is set.
+
+	They will cause the VM to wait for the next rune to match before splitting,
+	as would happen in the un-optimized version.
+
+	(0x14) Match_All_And_Escape
+
+	This opcode is an optimized version of `.*$` or `.+$` that causes the
+	active thread to immediately work on escaping the program by following all
+	Jumps out to the end.
+
+	While running through the rest of the program, the thread will trigger on
+	every Save instruction it passes to store the length of the string.
+
+	This way, any time a program hits one of these `.*$` constructs, the
+	virtual machine can exit early, vastly improving processing times.
+
+	Be aware, this opcode is not compiled in if the `Multiline` flag is on, as
+	the meaning of `$` changes with that flag.
+
+*/
+package regex_vm
diff --git a/core/text/regex/virtual_machine/util.odin b/core/text/regex/virtual_machine/util.odin
new file mode 100644
index 00000000000..edf055bc709
--- /dev/null
+++ b/core/text/regex/virtual_machine/util.odin
@@ -0,0 +1,73 @@
+package regex_vm
+
+// Cursor over the instructions of a compiled program.
+Opcode_Iterator :: struct {
+	code: Program,
+	pc: int,
+}
+
+// Yield the opcode at the iterator's position together with that position,
+// then step past the opcode and its inline operands. `ok` is false once the
+// end of the program has been reached. Panics on a byte that is not a known
+// opcode.
+iterate_opcodes :: proc(iter: ^Opcode_Iterator) -> (opcode: Opcode, pc: int, ok: bool) {
+	if iter.pc >= len(iter.code) {
+		return
+	}
+
+	pc = iter.pc
+	opcode = iter.code[pc]
+	ok = true
+
+	// Instructions are grouped by the width of their inline operands.
+	switch opcode {
+	case
+		.Match, .Match_And_Exit,
+		.Wildcard,
+		.Assert_Start, .Assert_End,
+		.Assert_Word_Boundary, .Assert_Non_Word_Boundary,
+		.Multiline_Open, .Multiline_Close,
+		.Match_All_And_Escape:
+		iter.pc += size_of(Opcode)
+	case
+		.Byte,
+		.Rune_Class, .Rune_Class_Negated,
+		.Save,
+		.Wait_For_Byte,
+		.Wait_For_Rune_Class, .Wait_For_Rune_Class_Negated:
+		iter.pc += size_of(Opcode) + size_of(u8)
+	case .Rune, .Wait_For_Rune:
+		iter.pc += size_of(Opcode) + size_of(rune)
+	case .Jump:
+		iter.pc += size_of(Opcode) + size_of(u16)
+	case .Split:
+		iter.pc += size_of(Opcode) + 2 * size_of(u16)
+	case:
+		panic("Invalid opcode found in RegEx program.")
+	}
+
+	return
+}
+
+// Return the name of `opcode` for debug output, or the empty string when the
+// value is not a known opcode.
+opcode_to_name :: proc(opcode: Opcode) -> (str: string) {
+	switch opcode {
+	case .Match:                       return "Match"
+	case .Match_And_Exit:              return "Match_And_Exit"
+	case .Byte:                        return "Byte"
+	case .Rune:                        return "Rune"
+	case .Rune_Class:                  return "Rune_Class"
+	case .Rune_Class_Negated:          return "Rune_Class_Negated"
+	case .Wildcard:                    return "Wildcard"
+	case .Jump:                        return "Jump"
+	case .Split:                       return "Split"
+	case .Save:                        return "Save"
+	case .Assert_Start:                return "Assert_Start"
+	case .Assert_End:                  return "Assert_End"
+	case .Assert_Word_Boundary:        return "Assert_Word_Boundary"
+	case .Assert_Non_Word_Boundary:    return "Assert_Non_Word_Boundary"
+	case .Multiline_Open:              return "Multiline_Open"
+	case .Multiline_Close:             return "Multiline_Close"
+	case .Wait_For_Byte:               return "Wait_For_Byte"
+	case .Wait_For_Rune:               return "Wait_For_Rune"
+	case .Wait_For_Rune_Class:         return "Wait_For_Rune_Class"
+	case .Wait_For_Rune_Class_Negated: return "Wait_For_Rune_Class_Negated"
+	case .Match_All_And_Escape:        return "Match_All_And_Escape"
+	}
+
+	return ""
+}
diff --git a/core/text/regex/virtual_machine/virtual_machine.odin b/core/text/regex/virtual_machine/virtual_machine.odin
new file mode 100644
index 00000000000..f92b84ace3e
--- /dev/null
+++ b/core/text/regex/virtual_machine/virtual_machine.odin
@@ -0,0 +1,638 @@
+package regex_vm
+
+@require import "core:io"
+import "core:text/regex/common"
+import "core:text/regex/parser"
+import "core:unicode/utf8"
+
+Rune_Class_Range :: parser.Rune_Class_Range
+
+// NOTE: This structure differs intentionally from the one in `regex/parser`,
+// as this data doesn't need to be a dynamic array once it hits the VM.
+Rune_Class_Data :: struct {
+	// Individual runes that belong to the class.
+	runes: []rune,
+	// Inclusive `lower..upper` ranges that belong to the class.
+	ranges: []Rune_Class_Range,
+}
+
+// Instruction set of the VM. The column on the right lists the operands
+// stored inline after each opcode.
+Opcode :: enum u8 {
+	                                    // | [ operands ]
+	Match                       = 0x00, // |
+	Match_And_Exit              = 0x01, // |
+	Byte                        = 0x02, // | u8
+	Rune                        = 0x03, // | i32
+	Rune_Class                  = 0x04, // | u8
+	Rune_Class_Negated          = 0x05, // | u8
+	Wildcard                    = 0x06, // |
+	Jump                        = 0x07, // | u16
+	Split                       = 0x08, // | u16, u16
+	Save                        = 0x09, // | u8
+	Assert_Start                = 0x0A, // |
+	Assert_End                  = 0x0B, // |
+	Assert_Word_Boundary        = 0x0C, // |
+	Assert_Non_Word_Boundary    = 0x0D, // |
+	Multiline_Open              = 0x0E, // |
+	Multiline_Close             = 0x0F, // |
+	Wait_For_Byte               = 0x10, // | u8
+	Wait_For_Rune               = 0x11, // | i32
+	Wait_For_Rune_Class         = 0x12, // | u8
+	Wait_For_Rune_Class_Negated = 0x13, // | u8
+	Match_All_And_Escape        = 0x14, // |
+}
+
+// One execution thread of the VM.
+Thread :: struct {
+	// Index into `Machine.code` of the instruction this thread is parked on.
+	pc: int,
+	// String indices recorded by `Save` instructions; -1 marks an unset slot.
+	saved: ^[2 * common.MAX_CAPTURE_GROUPS]int,
+}
+
+// A compiled program: opcodes with their operands stored inline.
+Program :: []Opcode
+
+Machine :: struct {
+	// Program state
+	memory: string, // The text being matched.
+	class_data: []Rune_Class_Data,
+	code: Program,
+
+	// Thread state
+	top_thread: int, // Number of threads queued in `next_threads`.
+	threads: [^]Thread,
+	next_threads: [^]Thread,
+
+	// The busy map is used to merge threads based on their program counters.
+	busy_map: []u64,
+
+	// Global state
+	string_pointer: int, // Byte index of `current_rune` within `memory`.
+
+	current_rune: rune,
+	current_rune_size: int,
+	next_rune: rune,
+	next_rune_size: int,
+}
+
+
+// @MetaCharacter
+// NOTE: This must be kept in sync with the compiler & tokenizer.
+// Reports whether `r` is an ASCII digit, an ASCII letter, or an underscore.
+is_word_class :: #force_inline proc "contextless" (r: rune) -> bool {
+	switch r {
+	case '0'..='9', 'A'..='Z', '_', 'a'..='z':
+		return true
+	case:
+		return false
+	}
+}
+
+// Mark `pc` as occupied in the busy map. Returns false, leaving the map
+// untouched, if `pc` was already marked; returns true otherwise.
+set_busy_map :: #force_inline proc "contextless" (vm: ^Machine, pc: int) -> bool #no_bounds_check {
+	// One bit per program position, 64 positions per `u64` slot.
+	slot := cast(u64)pc >> 6
+	bit: u64 = 1 << (cast(u64)pc & 0x3F)
+	if vm.busy_map[slot] & bit > 0 {
+		return false
+	}
+	vm.busy_map[slot] |= bit
+	return true
+}
+
+// Reports whether `pc` is already marked in the busy map.
+check_busy_map :: #force_inline proc "contextless" (vm: ^Machine, pc: int) -> bool #no_bounds_check {
+	slot := cast(u64)pc >> 6
+	bit: u64 = 1 << (cast(u64)pc & 0x3F)
+	return vm.busy_map[slot] & bit > 0
+}
+
+// Queue a thread starting at `pc` for the next frame.
+//
+// Instructions that do not consume a rune (Jump, Split, Save and the
+// assertions) are resolved immediately by following them; the thread is only
+// appended to `vm.next_threads` once it reaches an instruction that is handled
+// by the thread loop in `run`. Any position already marked in the busy map is
+// dropped, which is how threads are merged. A failed assertion drops the
+// thread by falling out of the loop without queuing anything.
+add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: int) #no_bounds_check {
+	if check_busy_map(vm, pc) {
+		return
+	}
+
+	// Shadow the parameters so they can be reassigned while resolving.
+	saved := saved
+	pc := pc
+
+	resolution_loop: for {
+		if !set_busy_map(vm, pc) {
+			return
+		}
+
+		when common.ODIN_DEBUG_REGEX {
+			io.write_string(common.debug_stream, "Thread [PC:")
+			common.write_padded_hex(common.debug_stream, pc, 4)
+			io.write_string(common.debug_stream, "] thinking about ")
+			io.write_string(common.debug_stream, opcode_to_name(vm.code[pc]))
+			io.write_rune(common.debug_stream, '\n')
+		}
+
+		#partial switch vm.code[pc] {
+		case .Jump:
+			pc = cast(int)(cast(^u16)&vm.code[pc + size_of(Opcode)])^
+			continue
+
+		case .Split:
+			jmp_x := cast(int)(cast(^u16)&vm.code[pc + size_of(Opcode)])^
+			jmp_y := cast(int)(cast(^u16)&vm.code[pc + size_of(Opcode) + size_of(u16)])^
+
+			// The X branch is fully resolved (and queued) before Y, so it
+			// comes first in the thread list.
+			add_thread(vm, saved, jmp_x)
+			pc = jmp_y
+			continue
+
+		case .Save:
+			// NOTE(review): this looks like it gives the thread a private copy
+			// of the capture slots before writing, so that threads sharing
+			// `saved` are not affected — confirm.
+			new_saved := new([2 * common.MAX_CAPTURE_GROUPS]int)
+			new_saved ^= saved^
+			saved = new_saved
+
+			index := vm.code[pc + size_of(Opcode)]
+			// Index just past the rune being processed in this frame.
+			sp := vm.string_pointer+vm.current_rune_size
+			saved[index] = sp
+
+			when common.ODIN_DEBUG_REGEX {
+				io.write_string(common.debug_stream, "Thread [PC:")
+				common.write_padded_hex(common.debug_stream, pc, 4)
+				io.write_string(common.debug_stream, "] saving state: (slot ")
+				io.write_int(common.debug_stream, cast(int)index)
+				io.write_string(common.debug_stream, " = ")
+				io.write_int(common.debug_stream, sp)
+				io.write_string(common.debug_stream, ")\n")
+			}
+
+			pc += size_of(Opcode) + size_of(u8)
+			continue
+
+		case .Assert_Start:
+			sp := vm.string_pointer+vm.current_rune_size
+			if sp == 0 {
+				pc += size_of(Opcode)
+				continue
+			}
+		case .Assert_End:
+			sp := vm.string_pointer+vm.current_rune_size
+			if sp == len(vm.memory) {
+				pc += size_of(Opcode)
+				continue
+			}
+		case .Multiline_Open:
+			sp := vm.string_pointer+vm.current_rune_size
+			if sp == 0 || sp == len(vm.memory) {
+				if vm.next_rune == '\r' || vm.next_rune == '\n' {
+					// The VM is currently on a newline at the string boundary,
+					// so consume the newline next frame.
+					when common.ODIN_DEBUG_REGEX {
+						io.write_string(common.debug_stream, "*** New thread added [PC:")
+						common.write_padded_hex(common.debug_stream, pc, 4)
+						io.write_string(common.debug_stream, "]\n")
+					}
+					vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
+					vm.top_thread += 1
+				} else {
+					// Skip the `Multiline_Close` opcode.
+					pc += 2 * size_of(Opcode)
+					continue
+				}
+			} else {
+				// Not on a string boundary.
+				// Try to consume a newline next frame in the other opcode loop.
+				when common.ODIN_DEBUG_REGEX {
+					io.write_string(common.debug_stream, "*** New thread added [PC:")
+					common.write_padded_hex(common.debug_stream, pc, 4)
+					io.write_string(common.debug_stream, "]\n")
+				}
+				vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
+				vm.top_thread += 1
+			}
+		case .Assert_Word_Boundary:
+			sp := vm.string_pointer+vm.current_rune_size
+			if sp == 0 || sp == len(vm.memory) {
+				pc += size_of(Opcode)
+				continue
+			} else {
+				last_rune_is_wc := is_word_class(vm.current_rune)
+				this_rune_is_wc := is_word_class(vm.next_rune)
+
+				// A boundary exists when exactly one side is a word rune.
+				if last_rune_is_wc && !this_rune_is_wc || !last_rune_is_wc && this_rune_is_wc {
+					pc += size_of(Opcode)
+					continue
+				}
+			}
+		case .Assert_Non_Word_Boundary:
+			sp := vm.string_pointer+vm.current_rune_size
+			if sp != 0 && sp != len(vm.memory) {
+				last_rune_is_wc := is_word_class(vm.current_rune)
+				this_rune_is_wc := is_word_class(vm.next_rune)
+
+				if last_rune_is_wc && this_rune_is_wc || !last_rune_is_wc && !this_rune_is_wc {
+					pc += size_of(Opcode)
+					continue
+				}
+			}
+
+		// Each Wait_For_* case starts a thread past the opcode when the next
+		// rune matches its operand, and always re-queues itself at `pc`.
+		case .Wait_For_Byte:
+			operand := cast(rune)vm.code[pc + size_of(Opcode)]
+			if vm.next_rune == operand {
+				add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
+			}
+
+			when common.ODIN_DEBUG_REGEX {
+				io.write_string(common.debug_stream, "*** New thread added [PC:")
+				common.write_padded_hex(common.debug_stream, pc, 4)
+				io.write_string(common.debug_stream, "]\n")
+			}
+			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
+			vm.top_thread += 1
+
+		case .Wait_For_Rune:
+			operand := (cast(^rune)&vm.code[pc + size_of(Opcode)])^
+			if vm.next_rune == operand {
+				add_thread(vm, saved, pc + size_of(Opcode) + size_of(rune))
+			}
+
+			when common.ODIN_DEBUG_REGEX {
+				io.write_string(common.debug_stream, "*** New thread added [PC:")
+				common.write_padded_hex(common.debug_stream, pc, 4)
+				io.write_string(common.debug_stream, "]\n")
+			}
+			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
+			vm.top_thread += 1
+
+		case .Wait_For_Rune_Class:
+			operand := cast(u8)vm.code[pc + size_of(Opcode)]
+			class_data := vm.class_data[operand]
+			next_rune := vm.next_rune
+
+			check: {
+				for r in class_data.runes {
+					if next_rune == r {
+						add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
+						break check
+					}
+				}
+				for range in class_data.ranges {
+					if range.lower <= next_rune && next_rune <= range.upper {
+						add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
+						break check
+					}
+				}
+			}
+			when common.ODIN_DEBUG_REGEX {
+				io.write_string(common.debug_stream, "*** New thread added [PC:")
+				common.write_padded_hex(common.debug_stream, pc, 4)
+				io.write_string(common.debug_stream, "]\n")
+			}
+			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
+			vm.top_thread += 1
+
+		case .Wait_For_Rune_Class_Negated:
+			operand := cast(u8)vm.code[pc + size_of(Opcode)]
+			class_data := vm.class_data[operand]
+			next_rune := vm.next_rune
+
+			check_negated: {
+				for r in class_data.runes {
+					if next_rune == r {
+						break check_negated
+					}
+				}
+				for range in class_data.ranges {
+					if range.lower <= next_rune && next_rune <= range.upper {
+						break check_negated
+					}
+				}
+				add_thread(vm, saved, pc + size_of(Opcode) + size_of(u8))
+			}
+			when common.ODIN_DEBUG_REGEX {
+				io.write_string(common.debug_stream, "*** New thread added [PC:")
+				common.write_padded_hex(common.debug_stream, pc, 4)
+				io.write_string(common.debug_stream, "]\n")
+			}
+			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
+			vm.top_thread += 1
+
+		case:
+			// Every other opcode is executed by the thread loop in `run`.
+			when common.ODIN_DEBUG_REGEX {
+				io.write_string(common.debug_stream, "*** New thread added [PC:")
+				common.write_padded_hex(common.debug_stream, pc, 4)
+				io.write_string(common.debug_stream, "]\n")
+			}
+			vm.next_threads[vm.top_thread] = Thread{ pc = pc, saved = saved }
+			vm.top_thread += 1
+		}
+
+		break resolution_loop
+	}
+
+	return
+}
+
+// Execute the program in `vm` against `vm.memory`.
+//
+// Inputs:
+// - vm: A machine built by `create`.
+// - UNICODE_MODE: Compile-time constant; when true the text is decoded as
+//   UTF-8, otherwise it is read one byte at a time.
+//
+// Returns:
+// - saved: The capture slots of the matching thread. This is nil when no
+//   thread matched, and also when the match came from `Match_And_Exit`.
+// - ok: Whether any thread matched.
+run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, ok: bool) #no_bounds_check {
+	// Prime `next_rune` with the first rune of the text.
+	when UNICODE_MODE {
+		vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory)
+	} else {
+		if len(vm.memory) > 0 {
+			vm.next_rune = cast(rune)vm.memory[0]
+			vm.next_rune_size = 1
+		}
+	}
+
+	when common.ODIN_DEBUG_REGEX {
+		io.write_string(common.debug_stream, "### Adding initial thread.\n")
+	}
+
+	{
+		// NOTE(review): presumably this sets every capture slot to -1, the
+		// value that marks an unset slot — confirm.
+		starter_saved := new([2 * common.MAX_CAPTURE_GROUPS]int)
+		starter_saved ^= -1
+
+		add_thread(vm, starter_saved, 0)
+	}
+
+	// `add_thread` adds to `next_threads` by default, but we need to put this
+	// thread in the current thread buffer.
+	vm.threads, vm.next_threads = vm.next_threads, vm.threads
+
+	when common.ODIN_DEBUG_REGEX {
+		io.write_string(common.debug_stream, "### VM starting.\n")
+		defer io.write_string(common.debug_stream, "### VM finished.\n")
+	}
+
+	// One iteration of this loop is one frame: every live thread processes
+	// the same rune before the string pointer moves on.
+	for {
+		for i := 0; i < len(vm.busy_map); i += 1 {
+			vm.busy_map[i] = 0
+		}
+
+		assert(vm.string_pointer <= len(vm.memory), "VM string pointer went out of bounds.")
+
+		// Shift the lookahead rune into the current slot and read a new one.
+		current_rune := vm.next_rune
+		vm.current_rune = current_rune
+		vm.current_rune_size = vm.next_rune_size
+		when UNICODE_MODE {
+			vm.next_rune, vm.next_rune_size = utf8.decode_rune_in_string(vm.memory[vm.string_pointer+vm.current_rune_size:])
+		} else {
+			if vm.string_pointer+size_of(u8) < len(vm.memory) {
+				vm.next_rune = cast(rune)vm.memory[vm.string_pointer+size_of(u8)]
+				vm.next_rune_size = size_of(u8)
+			} else {
+				vm.next_rune = 0
+				vm.next_rune_size = 0
+			}
+		}
+
+		when common.ODIN_DEBUG_REGEX {
+			io.write_string(common.debug_stream, ">>> Dispatching rune: ")
+			io.write_encoded_rune(common.debug_stream, current_rune)
+			io.write_byte(common.debug_stream, '\n')
+		}
+
+		// `top_thread` is reset so `add_thread` can fill `next_threads`.
+		thread_count := vm.top_thread
+		vm.top_thread = 0
+		thread_loop: for i := 0; i < thread_count; i += 1 {
+			t := vm.threads[i]
+
+			when common.ODIN_DEBUG_REGEX {
+				io.write_string(common.debug_stream, "Thread [PC:")
+				common.write_padded_hex(common.debug_stream, t.pc, 4)
+				io.write_string(common.debug_stream, "] stepping on ")
+				io.write_string(common.debug_stream, opcode_to_name(vm.code[t.pc]))
+				io.write_byte(common.debug_stream, '\n')
+			}
+
+			#partial opcode: switch vm.code[t.pc] {
+			case .Match:
+				when common.ODIN_DEBUG_REGEX {
+					io.write_string(common.debug_stream, "Thread matched!\n")
+				}
+				// Record the match and abandon the remaining threads of this
+				// frame; threads already queued for the next frame keep going.
+				saved = t.saved
+				ok = true
+				break thread_loop
+
+			case .Match_And_Exit:
+				when common.ODIN_DEBUG_REGEX {
+					io.write_string(common.debug_stream, "Thread matched! (Exiting)\n")
+				}
+				return nil, true
+
+			case .Byte:
+				operand := cast(rune)vm.code[t.pc + size_of(Opcode)]
+				if current_rune == operand {
+					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
+				}
+
+			case .Rune:
+				operand := (cast(^rune)&vm.code[t.pc + size_of(Opcode)])^
+				if current_rune == operand {
+					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(rune))
+				}
+
+			case .Rune_Class:
+				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
+				class_data := vm.class_data[operand]
+
+				for r in class_data.runes {
+					if current_rune == r {
+						add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
+						break opcode
+					}
+				}
+				for range in class_data.ranges {
+					if range.lower <= current_rune && current_rune <= range.upper {
+						add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
+						break opcode
+					}
+				}
+
+			case .Rune_Class_Negated:
+				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
+				class_data := vm.class_data[operand]
+				for r in class_data.runes {
+					if current_rune == r {
+						break opcode
+					}
+				}
+				for range in class_data.ranges {
+					if range.lower <= current_rune && current_rune <= range.upper {
+						break opcode
+					}
+				}
+				add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
+
+			case .Wildcard:
+				add_thread(vm, t.saved, t.pc + size_of(Opcode))
+
+			case .Multiline_Open:
+				if current_rune == '\n' {
+					// UNIX newline.
+					add_thread(vm, t.saved, t.pc + 2 * size_of(Opcode))
+				} else if current_rune == '\r' {
+					if vm.next_rune == '\n' {
+						// Windows newline. (1/2)
+						add_thread(vm, t.saved, t.pc + size_of(Opcode))
+					} else {
+						// Mac newline.
+						add_thread(vm, t.saved, t.pc + 2 * size_of(Opcode))
+					}
+				}
+			case .Multiline_Close:
+				if current_rune == '\n' {
+					// Windows newline. (2/2)
+					add_thread(vm, t.saved, t.pc + size_of(Opcode))
+				}
+
+			// The Wait_For_* cases mirror those in `add_thread`: start a thread
+			// if the next rune matches, then re-queue this thread at `t.pc`.
+			case .Wait_For_Byte:
+				operand := cast(rune)vm.code[t.pc + size_of(Opcode)]
+				if vm.next_rune == operand {
+					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
+				}
+				when common.ODIN_DEBUG_REGEX {
+					io.write_string(common.debug_stream, "*** New thread added [PC:")
+					common.write_padded_hex(common.debug_stream, t.pc, 4)
+					io.write_string(common.debug_stream, "]\n")
+				}
+				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
+				vm.top_thread += 1
+
+			case .Wait_For_Rune:
+				operand := (cast(^rune)&vm.code[t.pc + size_of(Opcode)])^
+				if vm.next_rune == operand {
+					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(rune))
+				}
+				when common.ODIN_DEBUG_REGEX {
+					io.write_string(common.debug_stream, "*** New thread added [PC:")
+					common.write_padded_hex(common.debug_stream, t.pc, 4)
+					io.write_string(common.debug_stream, "]\n")
+				}
+				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
+				vm.top_thread += 1
+
+			case .Wait_For_Rune_Class:
+				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
+				class_data := vm.class_data[operand]
+				next_rune := vm.next_rune
+
+				check: {
+					for r in class_data.runes {
+						if next_rune == r {
+							add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
+							break check
+						}
+					}
+					for range in class_data.ranges {
+						if range.lower <= next_rune && next_rune <= range.upper {
+							add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
+							break check
+						}
+					}
+				}
+				when common.ODIN_DEBUG_REGEX {
+					io.write_string(common.debug_stream, "*** New thread added [PC:")
+					common.write_padded_hex(common.debug_stream, t.pc, 4)
+					io.write_string(common.debug_stream, "]\n")
+				}
+				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
+				vm.top_thread += 1
+
+			case .Wait_For_Rune_Class_Negated:
+				operand := cast(u8)vm.code[t.pc + size_of(Opcode)]
+				class_data := vm.class_data[operand]
+				next_rune := vm.next_rune
+
+				check_negated: {
+					for r in class_data.runes {
+						if next_rune == r {
+							break check_negated
+						}
+					}
+					for range in class_data.ranges {
+						if range.lower <= next_rune && next_rune <= range.upper {
+							break check_negated
+						}
+					}
+					add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(u8))
+				}
+				when common.ODIN_DEBUG_REGEX {
+					io.write_string(common.debug_stream, "*** New thread added [PC:")
+					common.write_padded_hex(common.debug_stream, t.pc, 4)
+					io.write_string(common.debug_stream, "]\n")
+				}
+				vm.next_threads[vm.top_thread] = Thread{ pc = t.pc, saved = t.saved }
+				vm.top_thread += 1
+
+			case .Match_All_And_Escape:
+				t.pc += size_of(Opcode)
+				// The point of this loop is to walk out of wherever this
+				// opcode lives to the end of the program, while saving the
+				// index to the length of the string at each pass on the way.
+				escape_loop: for {
+					#partial switch vm.code[t.pc] {
+					case .Match, .Match_And_Exit:
+						break escape_loop
+
+					case .Jump:
+						t.pc = cast(int)(cast(^u16)&vm.code[t.pc + size_of(Opcode)])^
+
+					case .Save:
+						// Written through the thread's existing `saved` pointer;
+						// unlike `add_thread`, no copy is made first.
+						index := vm.code[t.pc + size_of(Opcode)]
+						t.saved[index] = len(vm.memory)
+						t.pc += size_of(Opcode) + size_of(u8)
+
+					case .Match_All_And_Escape:
+						// Layering these is fine.
+						t.pc += size_of(Opcode)
+
+					// If the loop has to process any opcode not listed above,
+					// it means someone did something odd like `a(.*$)b`, in
+					// which case, just fail. Technically, the expression makes
+					// no sense.
+					case:
+						break opcode
+					}
+				}
+
+				saved = t.saved
+				ok = true
+				return
+
+			case:
+				when common.ODIN_DEBUG_REGEX {
+					io.write_string(common.debug_stream, "Opcode: ")
+					io.write_int(common.debug_stream, cast(int)vm.code[t.pc])
+					io.write_string(common.debug_stream, "\n")
+				}
+				panic("Invalid opcode in RegEx thread loop.")
+			}
+		}
+
+		// The threads queued during this frame become the current threads.
+		vm.threads, vm.next_threads = vm.next_threads, vm.threads
+
+		when common.ODIN_DEBUG_REGEX {
+			io.write_string(common.debug_stream, "<<< Frame ended. (Threads: ")
+			io.write_int(common.debug_stream, vm.top_thread)
+			io.write_string(common.debug_stream, ")\n")
+		}
+
+		// Stop once the text is exhausted or no thread survived the frame.
+		if vm.string_pointer == len(vm.memory) || vm.top_thread == 0 {
+			break
+		}
+
+		vm.string_pointer += vm.current_rune_size
+	}
+
+	return
+}
+
+// Count the instructions (not bytes) in `code`.
+opcode_count :: proc(code: Program) -> (opcodes: int) {
+	iter := Opcode_Iterator{ code, 0 }
+	for _ in iterate_opcodes(&iter) {
+		opcodes += 1
+	}
+	return
+}
+
+// Build a machine that will run `code` against `str`.
+//
+// The busy map and both thread buffers are allocated with `context.allocator`
+// and are never freed here. `class_data` is left unset for the caller to fill.
+create :: proc(code: Program, str: string) -> (vm: Machine) {
+	assert(len(code) > 0, "RegEx VM has no instructions.")
+
+	vm.memory = str
+	vm.code = code
+
+	// One bit per byte of program, rounded up to whole `u64` slots.
+	sizing := len(code) >> 6 + (1 if len(code) & 0x3F > 0 else 0)
+	assert(sizing > 0)
+	vm.busy_map = make([]u64, sizing)
+
+	max_possible_threads := max(1, opcode_count(vm.code) - 1)
+
+	vm.threads = make([^]Thread, max_possible_threads)
+	vm.next_threads = make([^]Thread, max_possible_threads)
+
+	return
+}
From 730e10bd6f3bc04050a0d3c161c08a6145f61099 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 21 Jul 2024 23:17:18 -0400
Subject: [PATCH 02/24] Support printing `Regular_Expression` in `fmt`

---
 core/fmt/fmt.odin | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/core/fmt/fmt.odin b/core/fmt/fmt.odin
index 9aa9c99dc1c..22ac1cc369b 100644
--- a/core/fmt/fmt.odin
+++ b/core/fmt/fmt.odin
@@ -9,6 +9,7 @@ import "core:io"
 import "core:reflect"
 import "core:strconv"
 import "core:strings"
+import "core:text/regex"
 import "core:time"
 import
"core:unicode/utf8" @@ -2405,6 +2406,21 @@ fmt_named :: proc(fi: ^Info, v: any, verb: rune, info: runtime.Type_Info_Named) write_padded_number(fi, (ns), 9) io.write_string(fi.writer, " +0000 UTC", &fi.n) return + + case regex.Regular_Expression: + io.write_byte(fi.writer, '/') + for r in a.original_pattern { + if r == '/' { + io.write_string(fi.writer, `\/`) + } else { + io.write_rune(fi.writer, r) + } + } + io.write_byte(fi.writer, '/') + for flag in a.flags { + io.write_byte(fi.writer, regex.Flag_To_Letter[flag]) + } + return } } From 3e49ceb82ab907a3549451358f78bea2ae683e7e Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 21 Jul 2024 23:18:26 -0400 Subject: [PATCH 03/24] Add tests for `core:text/regex` --- tests/core/normal.odin | 1 + .../core/text/regex/test_core_text_regex.odin | 1012 +++++++++++++++++ 2 files changed, 1013 insertions(+) create mode 100644 tests/core/text/regex/test_core_text_regex.odin diff --git a/tests/core/normal.odin b/tests/core/normal.odin index 065090be387..1f34e329205 100644 --- a/tests/core/normal.odin +++ b/tests/core/normal.odin @@ -38,6 +38,7 @@ download_assets :: proc() { @(require) import "strings" @(require) import "text/i18n" @(require) import "text/match" +@(require) import "text/regex" @(require) import "thread" @(require) import "time" @(require) import "unicode" diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin new file mode 100644 index 00000000000..da44e6b2d6c --- /dev/null +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -0,0 +1,1012 @@ +package test_core_text_regex + +import "core:fmt" +import "core:io" +import "core:log" +import "core:strings" +import "core:testing" +import "core:text/regex" +import "core:text/regex/common" +import "core:text/regex/parser" +import "core:text/regex/tokenizer" + + +check_expression_with_flags :: proc(t: ^testing.T, pattern: string, flags: regex.Flags, haystack: string, 
needles: ..string, loc := #caller_location) { + rex, parse_err := regex.create(pattern, flags) + if !testing.expect_value(t, parse_err, nil, loc = loc) { + log.infof("Failed test's flags were: %v", flags, location = loc) + return + } + defer regex.destroy(rex) + + capture, success := regex.match(rex, haystack) + defer { + delete(capture.groups) + delete(capture.pos) + } + + if len(needles) > 0 { + testing.expect(t, success, "match failed", loc = loc) + } + + matches_aligned := testing.expectf(t, len(needles) == len(capture.groups), + "expected %i match groups, got %i (flags: %w)", + len(needles), len(capture.groups), flags, loc = loc) + + if matches_aligned { + for needle, i in needles { + if !testing.expectf(t, capture.groups[i] == needle, + "match group %i was %q, expected %q (flags: %w)", + i, capture.groups[i], needle, flags, loc = loc) { + } + } + } else { + log.infof("match groups were: %v", capture.groups, location = loc) + } +} + +check_expression :: proc(t: ^testing.T, pattern, haystack: string, needles: ..string, extra_flags: regex.Flags = {}, loc := #caller_location) { + check_expression_with_flags(t, pattern, { .Global } + extra_flags, + haystack, ..needles, loc = loc) + check_expression_with_flags(t, pattern, { .Global, .No_Optimization } + extra_flags, + haystack, ..needles, loc = loc) + check_expression_with_flags(t, pattern, { .Global, .Unicode } + extra_flags, + haystack, ..needles, loc = loc) + check_expression_with_flags(t, pattern, { .Global, .Unicode, .No_Optimization } + extra_flags, + haystack, ..needles, loc = loc) +} + + +@test +test_concatenation :: proc(t: ^testing.T) { + check_expression(t, "abc", "abc", "abc") +} + +@test +test_rune_class :: proc(t: ^testing.T) { + EXPR :: "[abc]" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "b", "b") + check_expression(t, EXPR, "c", "c") +} + +@test +test_rune_ranges :: proc(t: ^testing.T) { + EXPR :: "0x[0-9A-Fa-f]+" + check_expression(t, EXPR, "0x0065c816", "0x0065c816") +} + 
+@test +test_rune_range_terminal_dash :: proc(t: ^testing.T) { + { + EXPR :: "[a-]" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[-a]" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[-a-]" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[-]" + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[--]" + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[---]" + check_expression(t, EXPR, "-", "-") + } +} + +@test +test_rune_range_escaping_class :: proc(t: ^testing.T) { + EXPR :: `[\]a\[\.]` + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "[", "[") + check_expression(t, EXPR, "]", "]") + check_expression(t, EXPR, ".", ".") + check_expression(t, EXPR, "b") +} + +@test +test_negated_rune_class :: proc(t: ^testing.T) { + EXPR :: "[^ac-d]" + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "b", "b") + check_expression(t, EXPR, "e", "e") + check_expression(t, EXPR, "c") + check_expression(t, EXPR, "d") +} + +@test +test_shorthand_classes :: proc(t: ^testing.T) { + EXPR_P :: `\d\w\s` + check_expression(t, EXPR_P, "1a ", "1a ") + check_expression(t, EXPR_P, "a!1") + EXPR_N :: `\D\W\S` + check_expression(t, EXPR_N, "a!1", "a!1") + check_expression(t, EXPR_N, "1a ") +} + +@test +test_shorthand_classes_in_classes :: proc(t: ^testing.T) { + EXPR_P :: `[\d][\w][\s]` + check_expression(t, EXPR_P, "1a ", "1a ") + check_expression(t, EXPR_P, "a!1") + EXPR_NP :: `[^\d][^\w][^\s]` + check_expression(t, EXPR_NP, "a!1", "a!1") + check_expression(t, EXPR_NP, "1a ") + EXPR_N :: `[\D][\W][\S]` + check_expression(t, EXPR_N, "a!1", "a!1") + check_expression(t, EXPR_N, "1a ") + EXPR_NN :: `[^\D][^\W][^\S]` + check_expression(t, EXPR_NN, "1a ", "1a ") + check_expression(t, EXPR_NN, "a!1") +} + +@test +test_mixed_shorthand_class :: proc(t: ^testing.T) { + EXPR_P :: `[\d\s]+` + check_expression(t, EXPR_P, "0123456789 
98", "0123456789 98") + check_expression(t, EXPR_P, "!@#$%^&*()_()") + EXPR_NP :: `[^\d\s]+` + check_expression(t, EXPR_NP, "!@#$%^&*()_()", "!@#$%^&*()_()") + check_expression(t, EXPR_NP, "0123456789 98") +} + +@test +test_wildcard :: proc(t: ^testing.T) { + EXPR :: "." + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, ".", ".") +} + +@test +test_alternation :: proc(t: ^testing.T) { + EXPR :: "aa|bb|cc" + check_expression(t, EXPR, "aa", "aa") + check_expression(t, EXPR, "bb", "bb") + check_expression(t, EXPR, "cc", "cc") +} + +@test +test_optional :: proc(t: ^testing.T) { + EXPR :: "a?a?a?aaa" + check_expression(t, EXPR, "aaa", "aaa") +} + +@test +test_repeat_zero :: proc(t: ^testing.T) { + EXPR :: "a*b" + check_expression(t, EXPR, "aaab", "aaab") +} + +@test +test_repeat_one :: proc(t: ^testing.T) { + EXPR :: "a+b" + check_expression(t, EXPR, "aaab", "aaab") +} + +@test +test_greedy :: proc(t: ^testing.T) { + HTML :: "" + + check_expression(t, "<.+>", HTML, HTML) + check_expression(t, "<.*>", HTML, HTML) + + check_expression(t, "aaa?", "aaa", "aaa") +} + +@test +test_non_greedy :: proc(t: ^testing.T) { + HTML :: "" + + check_expression(t, "<.+?>", HTML, "") + check_expression(t, "<.*?>", HTML, "") + + // NOTE: make a comment about optional non-greedy capture groups + check_expression(t, "aaa??", "aaa", "aa") +} + +@test +test_groups :: proc(t: ^testing.T) { + check_expression(t, "a(b)", "ab", /*|*/ "ab", "b") + check_expression(t, "(a)b", "ab", /*|*/ "ab", "a") + check_expression(t, "(a)(b)", "ab", /*|*/ "ab", "a", "b") + + check_expression(t, "(a(b))", "ab", /*|*/ "ab", "ab", "b") + check_expression(t, "((ab))", "ab", /*|*/ "ab", "ab", "ab") + check_expression(t, "((a)b)", "ab", /*|*/ "ab", "ab", "a") + + check_expression(t, "(ab)+", "ababababab", /*|*/ "ababababab", "ab") + check_expression(t, "((ab)+)", "ababababab", /*|*/ "ababababab", "ababababab", "ab") +} + +@test +test_class_group_repeat :: proc(t: ^testing.T) { + EXPR_1 :: "([0-9]:?)+" + 
EXPR_2 :: "([0-9]+:?)+" + check_expression(t, EXPR_1, "123:456:789", "123:456:789", "9") + check_expression(t, EXPR_2, "123:456:789", "123:456:789", "789") +} + +@test +test_non_capture_group :: proc(t: ^testing.T) { + EXPR :: "(?:a|b)c" + check_expression(t, EXPR, "ac", "ac") + check_expression(t, EXPR, "bc", "bc") + check_expression(t, EXPR, "cc") +} + +@test +test_optional_capture_group :: proc(t: ^testing.T) { + EXPR :: "^(blue|straw)?berry" + check_expression(t, EXPR, "berry", "berry") + check_expression(t, EXPR, "blueberry", "blueberry", "blue") + check_expression(t, EXPR, "strawberry", "strawberry", "straw") + check_expression(t, EXPR, "cranberry") +} + +@test +test_max_capture_groups :: proc(t: ^testing.T) { + EXPR :: "(1)(2)(3)(4)(5)(6)(7)(8)(9)" + check_expression(t, EXPR, "123456789", "123456789", + "1", "2", "3", "4", "5", "6", "7", "8", "9") +} + +@test +test_repetition :: proc(t: ^testing.T) { + { + EXPR :: "^a{3}$" + check_expression(t, EXPR, "aaa", "aaa") + check_expression(t, EXPR, "aaaa") + } + { + EXPR :: "^a{3,5}$" + check_expression(t, EXPR, "aaa", "aaa") + check_expression(t, EXPR, "aaaa", "aaaa") + check_expression(t, EXPR, "aaaaa", "aaaaa") + check_expression(t, EXPR, "aaaaaa") + } + { + EXPR :: "^(?:meow){2}$" + check_expression(t, EXPR, "meow") + check_expression(t, EXPR, "meowmeow", "meowmeow") + check_expression(t, EXPR, "meowmeowmeow") + } + { + EXPR :: "a{2,}" + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "aa", "aa") + check_expression(t, EXPR, "aaa", "aaa") + } + { + EXPR :: "a{,2}" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "aa", "aa") + check_expression(t, EXPR, "aaa", "aa") + } + { + EXPR :: "^a{3,3}$" + check_expression(t, EXPR, "aa") + check_expression(t, EXPR, "aaa", "aaa") + check_expression(t, EXPR, "aaaa") + } + { + EXPR :: "a{0,}" + check_expression(t, EXPR, "aaa", "aaa") + } +} + +@test +test_repeated_groups :: proc(t: ^testing.T) { + { + EXPR :: "(ab){3}" + check_expression(t, EXPR, 
"ababab", "ababab", "ab") + } + { + EXPR :: "((?:ab){3})" + check_expression(t, EXPR, "ababab", "ababab", "ababab") + } +} + +@test +test_escaped_newline :: proc(t: ^testing.T) { + EXPR :: `\n[\n]` + check_expression(t, EXPR, "\n\n", "\n\n") +} + +@test +test_anchors :: proc(t: ^testing.T) { + { + EXPR :: "^ab" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "aab") + } + { + EXPR :: "ab$" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "aab", "ab") + } + { + EXPR :: "^ab$" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "aab") + } +} + +@test +test_grouped_anchors :: proc(t: ^testing.T) { + { + EXPR :: "^a|b" + check_expression(t, EXPR, "ab", "a") + check_expression(t, EXPR, "ba", "b") + } + { + EXPR :: "b|c$" + check_expression(t, EXPR, "ac", "c") + check_expression(t, EXPR, "cb", "b") + } + { + EXPR :: "^hellope$|world" + check_expression(t, EXPR, "hellope", "hellope") + check_expression(t, EXPR, "hellope world", "world") + } +} + +@test +test_empty_alternation :: proc(t: ^testing.T) { + { + EXPR :: "(?:a|)b" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "b", "b") + } + { + EXPR :: "(?:|a)b" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "b", "b") + } + { + EXPR :: "|b" + check_expression(t, EXPR, "b", "") + check_expression(t, EXPR, "", "") + } + { + EXPR :: "a|" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "", "") + } + { + EXPR :: "|" + check_expression(t, EXPR, "a", "") + check_expression(t, EXPR, "", "") + } +} + +@test +test_empty_class :: proc(t: ^testing.T) { + EXPR :: "a[]b" + check_expression(t, EXPR, "ab", "ab") +} + +@test +test_dot_in_class :: proc(t: ^testing.T) { + EXPR :: `[a\..]` + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, ".", ".") + check_expression(t, EXPR, "b") +} + + +@test +test_word_boundaries :: proc(t: ^testing.T) { + STR :: "This is an island." 
+ { + EXPR :: `\bis\b` + check_expression(t, EXPR, STR, "is") + } + { + EXPR :: `\bis\w+` + check_expression(t, EXPR, STR, "island") + } + { + EXPR :: `\w+is\b` + check_expression(t, EXPR, STR, "This") + } + { + EXPR :: `\b\w\w\b` + check_expression(t, EXPR, STR, "is") + } +} + +@test +test_non_word_boundaries :: proc(t: ^testing.T) { + { + EXPR :: `.\B.` + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, " ", " ") + check_expression(t, EXPR, "a ") + check_expression(t, EXPR, " b") + } + { + EXPR :: `\B.\B` + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "abc", "b") + } + { + EXPR :: `\B.+` + check_expression(t, EXPR, "abc", "bc") + } + { + EXPR :: `.+\B` + check_expression(t, EXPR, "abc", "ab") + } +} + +@test +test_empty_patterns :: proc(t: ^testing.T) { + { + EXPR :: "" + check_expression(t, EXPR, "abc", "") + } + { + EXPR :: "^$" + check_expression(t, EXPR, "", "") + check_expression(t, EXPR, "a") + } +} + +@test +test_unanchored :: proc(t: ^testing.T) { + EXPR :: "ab" + check_expression(t, EXPR, "cab", "ab") +} + +@test +test_affixes :: proc(t: ^testing.T) { + // This test is for the optimizer. + EXPR :: "^(?:samples|ample|sample)$" + check_expression(t, EXPR, "sample", "sample") + check_expression(t, EXPR, "samples", "samples") + check_expression(t, EXPR, "ample", "ample") + check_expression(t, EXPR, "amples") +} + +@test +test_anchored_capture_until_end :: proc(t: ^testing.T) { + // This test is for the optimizer. 
+ { + EXPR :: `^hellope.*$` + check_expression(t, EXPR, "hellope world", "hellope world") + check_expression(t, EXPR, "hellope", "hellope") + check_expression(t, EXPR, "hellope !", "hellope !") + } + { + EXPR :: `^hellope.+$` + check_expression(t, EXPR, "hellope world", "hellope world") + check_expression(t, EXPR, "hellope") + check_expression(t, EXPR, "hellope !", "hellope !") + } + { + EXPR :: `^(aa|bb|cc.+$).*$` + check_expression(t, EXPR, "aa", "aa", "aa") + check_expression(t, EXPR, "bb", "bb", "bb") + check_expression(t, EXPR, "bbaa", "bbaa", "bb") + check_expression(t, EXPR, "cc") + check_expression(t, EXPR, "ccc", "ccc", "ccc") + check_expression(t, EXPR, "cccc", "cccc", "cccc") + } + // This makes sure that the `.*$` / `.*$` optimization doesn't cause + // any issues if someone does something strange like putting it in the + // middle of an expression. + { + EXPR :: `^(a(b.*$)c).*$` + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "ab") + check_expression(t, EXPR, "abc") + } + { + EXPR :: `^(a(b.*$)?c).*$` + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "ab") + check_expression(t, EXPR, "abc") + check_expression(t, EXPR, "ac", "ac", "ac") + check_expression(t, EXPR, "acc", "acc", "ac") + } +} + +@test +test_unicode_explicitly :: proc(t: ^testing.T) { + { + EXPR :: "^....!$" + check_expression_with_flags(t, EXPR, { .Unicode }, + "こにちは!", "こにちは!") + check_expression_with_flags(t, EXPR, { .Unicode, .No_Optimization }, + "こにちは!", "こにちは!") + } + { + EXPR :: "こにちは!" 
+ check_expression_with_flags(t, EXPR, { .Global, .Unicode }, + "Hello こにちは!", "こにちは!") + check_expression_with_flags(t, EXPR, { .Global, .Unicode, .No_Optimization }, + "Hello こにちは!", "こにちは!") + } +} + +@test +test_no_capture_match :: proc(t: ^testing.T) { + EXPR :: "^abc$" + + rex, parse_err := regex.create(EXPR, { .No_Capture }) + if !testing.expect_value(t, parse_err, nil) { + return + } + defer regex.destroy(rex) + + _, matched := regex.match(rex, "abc") + testing.expect(t, matched) +} + +@test +test_comments :: proc(t: ^testing.T) { + EXPR :: `^[abc]# This is a comment. +[def]# This is another comment. +\#$# This is a comment following an escaped '#'.` + check_expression(t, EXPR, "ad#", "ad#") +} + +@test +test_ignore_whitespace :: proc(t: ^testing.T) { + EXPR :: "\f" + ` +\ H e l # Note that the first space on this line is escaped, thus it is not ignored. + l +o p e [ ] w o rld (?: [ ]) ! # Spaces in classes are fine, too. +` + "\r" + + check_expression(t, EXPR, " Hellope world !", " Hellope world !", extra_flags = { .Ignore_Whitespace }) +} + +@test +test_case_insensitive :: proc(t: ^testing.T) { + EXPR :: `hElLoPe [w!][o-P]+rLd!` + check_expression(t, EXPR, "HeLlOpE WoRlD!", "HeLlOpE WoRlD!", extra_flags = { .Case_Insensitive }) +} + +@test +test_multiline :: proc(t: ^testing.T) { + { + EXPR :: `^hellope$world$` + check_expression(t, EXPR, "\nhellope\nworld\n", "\nhellope\nworld\n", extra_flags = { .Multiline }) + check_expression(t, EXPR, "hellope\nworld", "hellope\nworld", extra_flags = { .Multiline }) + check_expression(t, EXPR, "hellope\rworld", "hellope\rworld", extra_flags = { .Multiline }) + check_expression(t, EXPR, "hellope\r\nworld", "hellope\r\nworld", extra_flags = { .Multiline }) + } + { + EXPR :: `^?.$` + check_expression(t, EXPR, "\nh", "\nh", extra_flags = { .Multiline }) + check_expression(t, EXPR, "h", "h", extra_flags = { .Multiline }) + } + { + EXPR :: `^$` + check_expression(t, EXPR, "\n", "\n", extra_flags = { .Multiline }) + 
check_expression(t, EXPR, "", "", extra_flags = { .Multiline }) + } + { + EXPR :: `$` + check_expression(t, EXPR, "\n", "\n", extra_flags = { .Multiline }) + check_expression(t, EXPR, "", "", extra_flags = { .Multiline }) + } +} + +@test +test_optional_inside_optional :: proc(t: ^testing.T) { + EXPR :: `(?:a?)?` + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "", "") +} + +@test +test_printing :: proc(t: ^testing.T) { + rex, parse_err := regex.create(`^/a$`, { + .Global, + .Multiline, + .Case_Insensitive, + .Unicode, + .Ignore_Whitespace, + .No_Optimization, + .No_Capture, + }) + if !testing.expect_value(t, parse_err, nil) { + return + } + defer regex.destroy(rex) + + str := fmt.tprint(rex) + str_hash := fmt.tprintf("%#v", rex) + testing.expect_value(t, str, `/^\/a$/gmixun-`) + testing.expect_value(t, str_hash, `/^\/a$/gmixun-`) +} + + + +@test +test_error_bad_repetitions :: proc(t: ^testing.T) { + check_repetition_error :: proc(t: ^testing.T, pattern: string, loc := #caller_location) { + rex, err := regex.create(pattern) + regex.destroy(rex) + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Invalid_Repetition) + if !ok { + log.errorf("expected error Invalid_Repetition, got %v", parse_err, location = loc) + } + } + + check_repetition_error(t, "a{-1,2}") + check_repetition_error(t, "a{2,1}") + check_repetition_error(t, "a{bc}") + check_repetition_error(t, "a{,-3}") + check_repetition_error(t, "a{d,}") + check_repetition_error(t, "a{}") + check_repetition_error(t, "a{0,0}") + check_repetition_error(t, "a{,0}") + check_repetition_error(t, "a{,}") +} + +@test +test_error_invalid_unicode_in_pattern :: proc(t: ^testing.T) { + rex, err := regex.create("\xC0", { .Unicode }) + regex.destroy(rex) + parse_err := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Invalid_Unicode) + if !ok { + log.errorf("expected error Invalid_Unicode, got %v", parse_err) + } +} + +@test +test_error_invalid_unicode_in_string :: proc(t: ^testing.T) { + 
EXPR :: "^...$" + // NOTE: Matching on invalid Unicode is currently safe. + // If `utf8.decode_rune` ever changes, this test may fail. + check_expression(t, EXPR, "\xC0\xFF\xFE", "\xC0\xFF\xFE") +} + +@test +test_error_too_many_capture_groups :: proc(t: ^testing.T) { + // NOTE: There are 1 + 9 + 1 capture groups in this pattern. + // Remember the implicit capture group 0. + rex, err := regex.create("(1)(2)(3)(4)(5)(6)(7)(8)(9) (A)") + regex.destroy(rex) + + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Too_Many_Capture_Groups) + if !ok { + log.errorf("expected error Too_Many_Capture_Groups, got %v", parse_err) + } +} + +@test +test_error_unclosed_paren :: proc(t: ^testing.T) { + rex, err := regex.create("(Hellope") + regex.destroy(rex) + + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Expected_Token) + if !ok { + log.errorf("expected error Expected_Token, got %v", parse_err) + } +} + +@test +test_error_unclosed_class :: proc(t: ^testing.T) { + rex, err := regex.create("[helope") + regex.destroy(rex) + + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Unexpected_EOF) + if !ok { + log.errorf("expected error Unexpected_EOF, got %v", parse_err) + } +} + +@test +test_error_invalid_unicode_in_unclosed_class :: proc(t: ^testing.T) { + rex, err := regex.create("[\xC0", { .Unicode }) + regex.destroy(rex) + + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Invalid_Unicode) + if !ok { + log.errorf("expected error Invalid_Unicode, got %v", parse_err) + } +} + +@test +test_program_too_big :: proc(t: ^testing.T) { + sb := strings.builder_make() + w := strings.to_writer(&sb) + defer strings.builder_destroy(&sb) + + // Each byte will turn into two bytes for the whole opcode and operand, + // then the compiler will insert 5 more bytes for the Save instructions + // and the Match. + N :: common.MAX_PROGRAM_SIZE/2 - 2 + for _ in 0.. 
Date: Sun, 21 Jul 2024 23:20:32 -0400 Subject: [PATCH 04/24] Add benchmarks for `core:text/regex` --- tests/benchmark/all.odin | 1 + .../benchmark/text/regex/benchmark_regex.odin | 258 ++++++++++++++++++ 2 files changed, 259 insertions(+) create mode 100644 tests/benchmark/text/regex/benchmark_regex.odin diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin index d1b7662e2a6..36de14278a5 100644 --- a/tests/benchmark/all.odin +++ b/tests/benchmark/all.odin @@ -2,3 +2,4 @@ package benchmarks @(require) import "crypto" @(require) import "hash" +@(require) import "text/regex" diff --git a/tests/benchmark/text/regex/benchmark_regex.odin b/tests/benchmark/text/regex/benchmark_regex.odin new file mode 100644 index 00000000000..cd9812b08b4 --- /dev/null +++ b/tests/benchmark/text/regex/benchmark_regex.odin @@ -0,0 +1,258 @@ +package benchmark_core_text_regex + +import "core:fmt" +import "core:log" +import "core:math/rand" +import "core:mem" +import "core:testing" +import "core:text/regex" +import "core:time" +import "core:unicode/utf8" + +randomize_ascii :: proc(data: []u8) { + for i in 0.. len(data) - i { + continue + } + + r_data, size := utf8.encode_rune(r) + for j in 0.. 
Date: Mon, 22 Jul 2024 13:27:06 -0400 Subject: [PATCH 05/24] Add `core:text/regex` to `examples/all` --- examples/all/all_main.odin | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin index d92a6b8c452..71a1ad73383 100644 --- a/examples/all/all_main.odin +++ b/examples/all/all_main.odin @@ -127,6 +127,7 @@ import testing "core:testing" import edit "core:text/edit" import i18n "core:text/i18n" import match "core:text/match" +import regex "core:text/regex" import scanner "core:text/scanner" import table "core:text/table" @@ -248,6 +249,7 @@ _ :: testing _ :: scanner _ :: i18n _ :: match +_ :: regex _ :: table _ :: edit _ :: thread From e642be8550a7adba2bfcfc47fb5589ba60d837d6 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:17:37 -0400 Subject: [PATCH 06/24] Fix handling of unclosed `regex` classes and repetitions --- core/text/regex/tokenizer/tokenizer.odin | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/text/regex/tokenizer/tokenizer.odin b/core/text/regex/tokenizer/tokenizer.odin index 2702c543409..5804439a861 100644 --- a/core/text/regex/tokenizer/tokenizer.odin +++ b/core/text/regex/tokenizer/tokenizer.odin @@ -267,7 +267,7 @@ scan :: proc(t: ^Tokenizer) -> (token: Token) { kind = .Rune_Class lit = text } else { - return { .EOF, "", pos } + kind = .EOF } case '{': @@ -275,7 +275,7 @@ scan :: proc(t: ^Tokenizer) -> (token: Token) { kind = .Repeat_N lit = text } else { - return { .EOF, "", pos } + kind = .EOF } case '(': From e8537a3134b0539c16907ed42b8832696d65112c Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:21:41 -0400 Subject: [PATCH 07/24] Add test cases for unclosed classes and repetition Simplified error checking while I was at it, too. 
--- .../core/text/regex/test_core_text_regex.odin | 163 +++++++----------- 1 file changed, 66 insertions(+), 97 deletions(-) diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index da44e6b2d6c..0bd1ff2886d 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -3,6 +3,7 @@ package test_core_text_regex import "core:fmt" import "core:io" import "core:log" +import "core:reflect" import "core:strings" import "core:testing" import "core:text/regex" @@ -56,6 +57,17 @@ check_expression :: proc(t: ^testing.T, pattern, haystack: string, needles: ..st haystack, ..needles, loc = loc) } +expect_error :: proc(t: ^testing.T, pattern: string, expected_error: typeid, flags: regex.Flags = {}, loc := #caller_location) { + rex, err := regex.create(pattern, flags) + regex.destroy(rex) + + variant := reflect.get_union_variant(err) + variant_ti := reflect.union_variant_type_info(variant) + expected_ti := type_info_of(expected_error) + + testing.expect_value(t, variant_ti, expected_ti, loc = loc) +} + @test test_concatenation :: proc(t: ^testing.T) { @@ -109,12 +121,18 @@ test_rune_range_terminal_dash :: proc(t: ^testing.T) { @test test_rune_range_escaping_class :: proc(t: ^testing.T) { - EXPR :: `[\]a\[\.]` - check_expression(t, EXPR, "a", "a") - check_expression(t, EXPR, "[", "[") - check_expression(t, EXPR, "]", "]") - check_expression(t, EXPR, ".", ".") - check_expression(t, EXPR, "b") + { + EXPR :: `[\]a\[\.]` + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "[", "[") + check_expression(t, EXPR, "]", "]") + check_expression(t, EXPR, ".", ".") + check_expression(t, EXPR, "b") + } + { + EXPR :: `a[\\]b` + check_expression(t, EXPR, `a\b`, `a\b`) + } } @test @@ -546,8 +564,8 @@ test_unicode_explicitly :: proc(t: ^testing.T) { test_no_capture_match :: proc(t: ^testing.T) { EXPR :: "^abc$" - rex, parse_err := regex.create(EXPR, { .No_Capture }) - if 
!testing.expect_value(t, parse_err, nil) { + rex, err := regex.create(EXPR, { .No_Capture }) + if !testing.expect_value(t, err, nil) { return } defer regex.destroy(rex) @@ -616,7 +634,7 @@ test_optional_inside_optional :: proc(t: ^testing.T) { @test test_printing :: proc(t: ^testing.T) { - rex, parse_err := regex.create(`^/a$`, { + rex, err := regex.create(`^/a$`, { .Global, .Multiline, .Case_Insensitive, @@ -625,7 +643,7 @@ test_printing :: proc(t: ^testing.T) { .No_Optimization, .No_Capture, }) - if !testing.expect_value(t, parse_err, nil) { + if !testing.expect_value(t, err, nil) { return } defer regex.destroy(rex) @@ -640,36 +658,28 @@ test_printing :: proc(t: ^testing.T) { @test test_error_bad_repetitions :: proc(t: ^testing.T) { - check_repetition_error :: proc(t: ^testing.T, pattern: string, loc := #caller_location) { - rex, err := regex.create(pattern) - regex.destroy(rex) - parse_err, _ := err.(regex.Parser_Error) - _, ok := parse_err.(parser.Invalid_Repetition) - if !ok { - log.errorf("expected error Invalid_Repetition, got %v", parse_err, location = loc) - } - } - - check_repetition_error(t, "a{-1,2}") - check_repetition_error(t, "a{2,1}") - check_repetition_error(t, "a{bc}") - check_repetition_error(t, "a{,-3}") - check_repetition_error(t, "a{d,}") - check_repetition_error(t, "a{}") - check_repetition_error(t, "a{0,0}") - check_repetition_error(t, "a{,0}") - check_repetition_error(t, "a{,}") + expect_error(t, "a{-1,2}", parser.Invalid_Repetition) + expect_error(t, "a{2,1}", parser.Invalid_Repetition) + expect_error(t, "a{bc}", parser.Invalid_Repetition) + expect_error(t, "a{,-3}", parser.Invalid_Repetition) + expect_error(t, "a{d,}", parser.Invalid_Repetition) + expect_error(t, "a{}", parser.Invalid_Repetition) + expect_error(t, "a{0,0}", parser.Invalid_Repetition) + expect_error(t, "a{,0}", parser.Invalid_Repetition) + expect_error(t, "a{,}", parser.Invalid_Repetition) + + // Unclosed braces + expect_error(t, "a{", parser.Unexpected_EOF) + 
expect_error(t, "a{", parser.Unexpected_EOF) + expect_error(t, "a{1,2", parser.Unexpected_EOF) + expect_error(t, "a{0,", parser.Unexpected_EOF) + expect_error(t, "a{,3", parser.Unexpected_EOF) + expect_error(t, "a{,", parser.Unexpected_EOF) } @test test_error_invalid_unicode_in_pattern :: proc(t: ^testing.T) { - rex, err := regex.create("\xC0", { .Unicode }) - regex.destroy(rex) - parse_err := err.(regex.Parser_Error) - _, ok := parse_err.(parser.Invalid_Unicode) - if !ok { - log.errorf("expected error Invalid_Unicode, got %v", parse_err) - } + expect_error(t, "\xC0", parser.Invalid_Unicode) } @test @@ -684,50 +694,26 @@ test_error_invalid_unicode_in_string :: proc(t: ^testing.T) { test_error_too_many_capture_groups :: proc(t: ^testing.T) { // NOTE: There are 1 + 9 + 1 capture groups in this pattern. // Remember the implicit capture group 0. - rex, err := regex.create("(1)(2)(3)(4)(5)(6)(7)(8)(9) (A)") - regex.destroy(rex) - - parse_err, _ := err.(regex.Parser_Error) - _, ok := parse_err.(parser.Too_Many_Capture_Groups) - if !ok { - log.errorf("expected error Too_Many_Capture_Groups, got %v", parse_err) - } + expect_error(t, "(1)(2)(3)(4)(5)(6)(7)(8)(9) (A)", parser.Too_Many_Capture_Groups) } @test test_error_unclosed_paren :: proc(t: ^testing.T) { - rex, err := regex.create("(Hellope") - regex.destroy(rex) - - parse_err, _ := err.(regex.Parser_Error) - _, ok := parse_err.(parser.Expected_Token) - if !ok { - log.errorf("expected error Expected_Token, got %v", parse_err) - } + expect_error(t, "(Hellope", parser.Expected_Token) } @test test_error_unclosed_class :: proc(t: ^testing.T) { - rex, err := regex.create("[helope") - regex.destroy(rex) - - parse_err, _ := err.(regex.Parser_Error) - _, ok := parse_err.(parser.Unexpected_EOF) - if !ok { - log.errorf("expected error Unexpected_EOF, got %v", parse_err) - } + expect_error(t, "[helope", parser.Unexpected_EOF) + expect_error(t, `a[\]b`, parser.Unexpected_EOF) + expect_error(t, `a[\b`, parser.Unexpected_EOF) + 
expect_error(t, `a[\`, parser.Unexpected_EOF) + expect_error(t, `a[`, parser.Unexpected_EOF) } @test test_error_invalid_unicode_in_unclosed_class :: proc(t: ^testing.T) { - rex, err := regex.create("[\xC0", { .Unicode }) - regex.destroy(rex) - - parse_err, _ := err.(regex.Parser_Error) - _, ok := parse_err.(parser.Invalid_Unicode) - if !ok { - log.errorf("expected error Invalid_Unicode, got %v", parse_err) - } + expect_error(t, "[\xC0", parser.Invalid_Unicode, { .Unicode }) } @test @@ -794,35 +780,18 @@ test_lone_enders :: proc(t: ^testing.T) { @test test_invalid_unary_tokens :: proc(t: ^testing.T) { - check_token_error :: proc(t: ^testing.T, pattern: string, loc := #caller_location) { - rex, err := regex.create(pattern) - regex.destroy(rex) - parse_err, _ := err.(regex.Parser_Error) - _, ok := parse_err.(parser.Invalid_Token) - if !ok { - log.errorf("expected error Invalid_Token, got %v", parse_err, location = loc) - } - } - check_token_error(t, `*`) - check_token_error(t, `*?`) - check_token_error(t, `+`) - check_token_error(t, `+?`) - check_token_error(t, `?`) - check_token_error(t, `??`) - check_token_error(t, `{}`) - check_token_error(t, `{1,}`) - check_token_error(t, `{1,2}`) - check_token_error(t, `{,2}`) - - { - rex, err := regex.create(`\`) - regex.destroy(rex) - parse_err, _ := err.(regex.Parser_Error) - _, ok := parse_err.(parser.Unexpected_EOF) - if !ok { - log.errorf("expected error Unexpected_EOF, got %v", parse_err) - } - } + expect_error(t, `*`, parser.Invalid_Token) + expect_error(t, `*?`, parser.Invalid_Token) + expect_error(t, `+`, parser.Invalid_Token) + expect_error(t, `+?`, parser.Invalid_Token) + expect_error(t, `?`, parser.Invalid_Token) + expect_error(t, `??`, parser.Invalid_Token) + expect_error(t, `{}`, parser.Invalid_Token) + expect_error(t, `{1,}`, parser.Invalid_Token) + expect_error(t, `{1,2}`, parser.Invalid_Token) + expect_error(t, `{,2}`, parser.Invalid_Token) + + expect_error(t, `\`, parser.Unexpected_EOF) } @test From 
16b644ad79ca80227c67c4a7a2234dcd47800161 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:23:20 -0400 Subject: [PATCH 08/24] Use `slice.zero` instead --- core/text/regex/virtual_machine/virtual_machine.odin | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/text/regex/virtual_machine/virtual_machine.odin b/core/text/regex/virtual_machine/virtual_machine.odin index f92b84ace3e..f102fb78ca2 100644 --- a/core/text/regex/virtual_machine/virtual_machine.odin +++ b/core/text/regex/virtual_machine/virtual_machine.odin @@ -1,6 +1,7 @@ package regex_vm @require import "core:io" +import "core:slice" import "core:text/regex/common" import "core:text/regex/parser" import "core:unicode/utf8" @@ -348,9 +349,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU } for { - for i := 0; i < len(vm.busy_map); i += 1 { - vm.busy_map[i] = 0 - } + slice.zero(vm.busy_map[:]) assert(vm.string_pointer <= len(vm.memory), "VM string pointer went out of bounds.") From c52a8a5f86707eeb71bcb44e2f691c67c9383500 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:59:59 -0400 Subject: [PATCH 09/24] Allow configuring of `MAX_CAPTURE_GROUPS` for `n` > 10 --- core/text/regex/common/common.odin | 2 +- .../core/text/regex/test_core_text_regex.odin | 70 +++++++++++++++++-- 2 files changed, 65 insertions(+), 7 deletions(-) diff --git a/core/text/regex/common/common.odin b/core/text/regex/common/common.odin index f53f043a17c..f401658cb9b 100644 --- a/core/text/regex/common/common.odin +++ b/core/text/regex/common/common.odin @@ -2,7 +2,7 @@ package regex_common // VM limitations -MAX_CAPTURE_GROUPS :: 10 +MAX_CAPTURE_GROUPS :: max(#config(ODIN_REGEX_MAX_CAPTURE_GROUPS, 10), 10) MAX_PROGRAM_SIZE :: int(max(i16)) MAX_CLASSES :: int(max(u8)) diff --git a/tests/core/text/regex/test_core_text_regex.odin 
b/tests/core/text/regex/test_core_text_regex.odin index 0bd1ff2886d..74a0b8cf79c 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -276,9 +276,58 @@ test_optional_capture_group :: proc(t: ^testing.T) { @test test_max_capture_groups :: proc(t: ^testing.T) { - EXPR :: "(1)(2)(3)(4)(5)(6)(7)(8)(9)" - check_expression(t, EXPR, "123456789", "123456789", - "1", "2", "3", "4", "5", "6", "7", "8", "9") + sb_pattern := strings.builder_make() + sb_haystack := strings.builder_make() + expected_captures: [dynamic]string + defer { + strings.builder_destroy(&sb_pattern) + strings.builder_destroy(&sb_haystack) + delete(expected_captures) + } + + w_pattern := strings.to_writer(&sb_pattern) + w_haystack := strings.to_writer(&sb_haystack) + + // The full expression capture, capture 0: + for i in 1.. Date: Wed, 24 Jul 2024 16:05:48 -0400 Subject: [PATCH 10/24] Remove printing facilities for `Regular_Expression` The `original_pattern` introduced a tenuous dependency to the expression value as a whole, and after some consideration, I decided that it would be better for the developer to manage their own pattern strings. In the event you need to print the text representation of a pattern, it's usually better that you manage the memory of it as well. 
--- core/fmt/fmt.odin | 16 -------------- core/text/regex/regex.odin | 2 -- .../benchmark/text/regex/benchmark_regex.odin | 8 +++---- .../core/text/regex/test_core_text_regex.odin | 22 ------------------- 4 files changed, 4 insertions(+), 44 deletions(-) diff --git a/core/fmt/fmt.odin b/core/fmt/fmt.odin index 22ac1cc369b..9aa9c99dc1c 100644 --- a/core/fmt/fmt.odin +++ b/core/fmt/fmt.odin @@ -9,7 +9,6 @@ import "core:io" import "core:reflect" import "core:strconv" import "core:strings" -import "core:text/regex" import "core:time" import "core:unicode/utf8" @@ -2406,21 +2405,6 @@ fmt_named :: proc(fi: ^Info, v: any, verb: rune, info: runtime.Type_Info_Named) write_padded_number(fi, (ns), 9) io.write_string(fi.writer, " +0000 UTC", &fi.n) return - - case regex.Regular_Expression: - io.write_byte(fi.writer, '/') - for r in a.original_pattern { - if r == '/' { - io.write_string(fi.writer, `\/`) - } else { - io.write_rune(fi.writer, r) - } - } - io.write_byte(fi.writer, '/') - for flag in a.flags { - io.write_byte(fi.writer, regex.Flag_To_Letter[flag]) - } - return } } diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index 1736f2305c7..0bb0b78247c 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -30,7 +30,6 @@ Capture :: struct { } Regular_Expression :: struct { - original_pattern: string, flags: Flags, class_data: []virtual_machine.Rune_Class_Data, program: []virtual_machine.Opcode `fmt:"-"`, @@ -92,7 +91,6 @@ create :: proc( // allocator so everything can be tightly packed. 
context.allocator = permanent_allocator - result.original_pattern = pattern result.flags = flags if len(class_data) > 0 { diff --git a/tests/benchmark/text/regex/benchmark_regex.odin b/tests/benchmark/text/regex/benchmark_regex.odin index cd9812b08b4..8d29888a350 100644 --- a/tests/benchmark/text/regex/benchmark_regex.odin +++ b/tests/benchmark/text/regex/benchmark_regex.odin @@ -111,7 +111,7 @@ global_capture_end_word :: proc(t: ^testing.T) { } defer regex.destroy(rex) - report := fmt.tprintf("Matching %v over a block of random ASCII text.", rex) + report := fmt.tprintf("Matching %q over a block of random ASCII text.", EXPR) for size in sizes { data := make([]u8, size) @@ -151,7 +151,7 @@ global_capture_end_word_unicode :: proc(t: ^testing.T) { } defer regex.destroy(rex) - report := fmt.tprintf("Matching %v over a block of random Unicode text.", rex) + report := fmt.tprintf("Matching %q over a block of random Unicode text.", EXPR) for size in sizes { data := make([]u8, size) @@ -191,7 +191,7 @@ alternations :: proc(t: ^testing.T) { } defer regex.destroy(rex) - report := fmt.tprintf("Matching %v over a text block of only `a`s.", rex) + report := fmt.tprintf("Matching %q over a text block of only `a`s.", EXPR) for size in sizes { data := make([]u8, size) @@ -225,7 +225,7 @@ classes :: proc(t: ^testing.T) { } defer regex.destroy(rex) - report := fmt.tprintf("Matching %v over a string of spaces with %q at the end.", rex, NEEDLE) + report := fmt.tprintf("Matching %q over a string of spaces with %q at the end.", EXPR, NEEDLE) for size in sizes { data := make([]u8, size) diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index 74a0b8cf79c..8ecf6cef2c8 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -681,28 +681,6 @@ test_optional_inside_optional :: proc(t: ^testing.T) { check_expression(t, EXPR, "", "") } -@test -test_printing :: proc(t: 
^testing.T) { - rex, err := regex.create(`^/a$`, { - .Global, - .Multiline, - .Case_Insensitive, - .Unicode, - .Ignore_Whitespace, - .No_Optimization, - .No_Capture, - }) - if !testing.expect_value(t, err, nil) { - return - } - defer regex.destroy(rex) - - str := fmt.tprint(rex) - str_hash := fmt.tprintf("%#v", rex) - testing.expect_value(t, str, `/^\/a$/gmixun-`) - testing.expect_value(t, str_hash, `/^\/a$/gmixun-`) -} - @test From ff492e615cc5523903b9b4d38214eefc531b4d0c Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:22:04 -0400 Subject: [PATCH 11/24] Use `unaligned_load` for `regex` virtual machine This should hopefully avoid any issues with loading operands greater than 8 bits on alignment-sensitive platforms. --- core/text/regex/compiler/compiler.odin | 9 +++++---- core/text/regex/compiler/debugging.odin | 15 ++++++++------- .../regex/virtual_machine/virtual_machine.odin | 15 ++++++++------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin index 7617a7bcd68..4404947f148 100644 --- a/core/text/regex/compiler/compiler.odin +++ b/core/text/regex/compiler/compiler.odin @@ -1,5 +1,6 @@ package regex_compiler +import "base:intrinsics" import "core:text/regex/common" import "core:text/regex/parser" import "core:text/regex/tokenizer" @@ -408,7 +409,7 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: break add_global case .Rune: - operand := (cast(^rune)&code[pc+1])^ + operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1]) inject_at(&code, pc_open, Opcode.Wait_For_Rune) pc_open += size_of(Opcode) inject_raw(&code, pc_open, operand) @@ -490,20 +491,20 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: case .Jump: jmp := cast(^i16)&code[pc+size_of(Opcode)] if code[cast(i16)pc+jmp^] == .Jump { - next_jmp := 
(cast(^i16)&code[cast(i16)pc+jmp^+size_of(Opcode)])^ + next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp^+size_of(Opcode)]) jmp^ = jmp^ + next_jmp do_another_pass = true } case .Split: jmp_x := cast(^i16)&code[pc+size_of(Opcode)] if code[cast(i16)pc+jmp_x^] == .Jump { - next_jmp := (cast(^i16)&code[cast(i16)pc+jmp_x^+size_of(Opcode)])^ + next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x^+size_of(Opcode)]) jmp_x^ = jmp_x^ + next_jmp do_another_pass = true } jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)] if code[cast(i16)pc+jmp_y^] == .Jump { - next_jmp := (cast(^i16)&code[cast(i16)pc+jmp_y^+size_of(Opcode)])^ + next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y^+size_of(Opcode)]) jmp_y^ = jmp_y^ + next_jmp do_another_pass = true } diff --git a/core/text/regex/compiler/debugging.odin b/core/text/regex/compiler/debugging.odin index 1ef3e6d78ef..114b88fa28d 100644 --- a/core/text/regex/compiler/debugging.odin +++ b/core/text/regex/compiler/debugging.odin @@ -1,5 +1,6 @@ package regex_compiler +import "base:intrinsics" import "core:io" import "core:text/regex/common" import "core:text/regex/virtual_machine" @@ -9,11 +10,11 @@ get_jump_targets :: proc(code: []Opcode) -> (jump_targets: map[int]int) { for opcode, pc in virtual_machine.iterate_opcodes(&iter) { #partial switch opcode { case .Jump: - jmp := cast(int)(cast(^u16)&code[pc+1])^ + jmp := cast(int)intrinsics.unaligned_load(cast(^u16)&code[pc+1]) jump_targets[jmp] = pc case .Split: - jmp_x := cast(int)(cast(^u16)&code[pc+1])^ - jmp_y := cast(int)(cast(^u16)&code[pc+3])^ + jmp_x := cast(int)intrinsics.unaligned_load(cast(^u16)&code[pc+1]) + jmp_y := cast(int)intrinsics.unaligned_load(cast(^u16)&code[pc+3]) jump_targets[jmp_x] = pc jump_targets[jmp_y] = pc } @@ -46,18 +47,18 @@ trace :: proc(w: io.Writer, code: []Opcode) { operand := cast(rune)code[pc+1] io.write_encoded_rune(w, operand) case .Rune: - operand := (cast(^rune)&code[pc+1])^ + 
operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1]) io.write_encoded_rune(w, operand) case .Rune_Class, .Rune_Class_Negated: operand := cast(u8)code[pc+1] common.write_padded_hex(w, operand, 2) case .Jump: - jmp := (cast(^u16)&code[pc+1])^ + jmp := intrinsics.unaligned_load(cast(^u16)&code[pc+1]) io.write_string(w, "-> $") common.write_padded_hex(w, jmp, 4) case .Split: - jmp_x := (cast(^u16)&code[pc+1])^ - jmp_y := (cast(^u16)&code[pc+3])^ + jmp_x := intrinsics.unaligned_load(cast(^u16)&code[pc+1]) + jmp_y := intrinsics.unaligned_load(cast(^u16)&code[pc+3]) io.write_string(w, "=> $") common.write_padded_hex(w, jmp_x, 4) io.write_string(w, ", $") diff --git a/core/text/regex/virtual_machine/virtual_machine.odin b/core/text/regex/virtual_machine/virtual_machine.odin index f102fb78ca2..7eb6b1f9be2 100644 --- a/core/text/regex/virtual_machine/virtual_machine.odin +++ b/core/text/regex/virtual_machine/virtual_machine.odin @@ -1,5 +1,6 @@ package regex_vm +import "base:intrinsics" @require import "core:io" import "core:slice" import "core:text/regex/common" @@ -121,12 +122,12 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: #partial switch vm.code[pc] { case .Jump: - pc = cast(int)(cast(^u16)&vm.code[pc + size_of(Opcode)])^ + pc = cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode)]) continue case .Split: - jmp_x := cast(int)(cast(^u16)&vm.code[pc + size_of(Opcode)])^ - jmp_y := cast(int)(cast(^u16)&vm.code[pc + size_of(Opcode) + size_of(u16)])^ + jmp_x := cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode)]) + jmp_y := cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[pc + size_of(Opcode) + size_of(u16)]) add_thread(vm, saved, jmp_x) pc = jmp_y @@ -236,7 +237,7 @@ add_thread :: proc(vm: ^Machine, saved: ^[2 * common.MAX_CAPTURE_GROUPS]int, pc: vm.top_thread += 1 case .Wait_For_Rune: - operand := (cast(^rune)&vm.code[pc + size_of(Opcode)])^ + operand := 
intrinsics.unaligned_load(cast(^rune)&vm.code[pc + size_of(Opcode)]) if vm.next_rune == operand { add_thread(vm, saved, pc + size_of(Opcode) + size_of(rune)) } @@ -409,7 +410,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU } case .Rune: - operand := (cast(^rune)&vm.code[t.pc + size_of(Opcode)])^ + operand := intrinsics.unaligned_load(cast(^rune)&vm.code[t.pc + size_of(Opcode)]) if current_rune == operand { add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(rune)) } @@ -482,7 +483,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU vm.top_thread += 1 case .Wait_For_Rune: - operand := (cast(^rune)&vm.code[t.pc + size_of(Opcode)])^ + operand := intrinsics.unaligned_load(cast(^rune)&vm.code[t.pc + size_of(Opcode)]) if vm.next_rune == operand { add_thread(vm, t.saved, t.pc + size_of(Opcode) + size_of(rune)) } @@ -558,7 +559,7 @@ run :: proc(vm: ^Machine, $UNICODE_MODE: bool) -> (saved: ^[2 * common.MAX_CAPTU break escape_loop case .Jump: - t.pc = cast(int)(cast(^u16)&vm.code[t.pc + size_of(Opcode)])^ + t.pc = cast(int)intrinsics.unaligned_load(cast(^u16)&vm.code[t.pc + size_of(Opcode)]) case .Save: index := vm.code[t.pc + size_of(Opcode)] From 90f1f7fbdfc283b03216e15e2700331395539161 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:48:49 -0400 Subject: [PATCH 12/24] Use `unaligned_store` in `regex` too --- core/text/regex/compiler/compiler.odin | 27 ++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin index 4404947f148..f5d6d2f6a7f 100644 --- a/core/text/regex/compiler/compiler.odin +++ b/core/text/regex/compiler/compiler.odin @@ -490,22 +490,25 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: #partial switch opcode { case .Jump: jmp := cast(^i16)&code[pc+size_of(Opcode)] - if 
code[cast(i16)pc+jmp^] == .Jump { - next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp^+size_of(Opcode)]) - jmp^ = jmp^ + next_jmp + jmp_value := intrinsics.unaligned_load(jmp) + if code[cast(i16)pc+jmp_value] == .Jump { + next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_value+size_of(Opcode)]) + intrinsics.unaligned_store(jmp, jmp_value + next_jmp) do_another_pass = true } case .Split: jmp_x := cast(^i16)&code[pc+size_of(Opcode)] - if code[cast(i16)pc+jmp_x^] == .Jump { - next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x^+size_of(Opcode)]) - jmp_x^ = jmp_x^ + next_jmp + jmp_x_value := intrinsics.unaligned_load(jmp_x) + if code[cast(i16)pc+jmp_x_value] == .Jump { + next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x_value+size_of(Opcode)]) + intrinsics.unaligned_store(jmp_x, jmp_x_value + next_jmp) do_another_pass = true } jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)] - if code[cast(i16)pc+jmp_y^] == .Jump { - next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y^+size_of(Opcode)]) - jmp_y^ = jmp_y^ + next_jmp + jmp_y_value := intrinsics.unaligned_load(jmp_y) + if code[cast(i16)pc+jmp_y_value] == .Jump { + next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y_value+size_of(Opcode)]) + intrinsics.unaligned_store(jmp_y, jmp_y_value + next_jmp) do_another_pass = true } } @@ -526,12 +529,12 @@ compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: #partial switch opcode { case .Jump: jmp := cast(^u16)&code[pc+size_of(Opcode)] - jmp^ = jmp^ + cast(u16)pc + intrinsics.unaligned_store(jmp, intrinsics.unaligned_load(jmp) + cast(u16)pc) case .Split: jmp_x := cast(^u16)&code[pc+size_of(Opcode)] - jmp_x^ = jmp_x^ + cast(u16)pc + intrinsics.unaligned_store(jmp_x, intrinsics.unaligned_load(jmp_x) + cast(u16)pc) jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)] - jmp_y^ = jmp_y^ + cast(u16)pc + 
intrinsics.unaligned_store(jmp_y, intrinsics.unaligned_load(jmp_y) + cast(u16)pc) } } From 62527123638ba5ac9e707e0fbcdeb9e9e56b9c31 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 13:21:13 -0400 Subject: [PATCH 13/24] Add missing features to `regex` package documentation --- core/text/regex/doc.odin | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/text/regex/doc.odin b/core/text/regex/doc.odin index 8899e1af691..7b28bbc3d9e 100644 --- a/core/text/regex/doc.odin +++ b/core/text/regex/doc.odin @@ -9,12 +9,16 @@ Odin's regex library implements the following features: Alternation: `apple|cherry` Classes: `[0-9_]` + Classes, negated: `[^0-9_]` + Shorthands: `\d\s\w` + Shorthands, negated: `\D\S\W` Wildcards: `.` Repeat, optional: `a*` Repeat, at least once: `a+` + Repetition: `a{1,2}` Optional: `a?` - Group Capture: `([0-9])` - Group Non-Capture: `(?:[0-9])` + Group, capture: `([0-9])` + Group, non-capture: `(?:[0-9])` Start & End Anchors: `^hello$` Word Boundaries: `\bhello\b` Non-Word Boundaries: `hello\B` From cd8272557feae951543c0d661b3c8d82e1a67c44 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 18:33:36 -0400 Subject: [PATCH 14/24] Test that a RegEx Capture `pos` corresponds to its `groups` --- tests/core/text/regex/test_core_text_regex.odin | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index 8ecf6cef2c8..d9d4f8cbc46 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -44,6 +44,13 @@ check_expression_with_flags :: proc(t: ^testing.T, pattern: string, flags: regex } else { log.infof("match groups were: %v", capture.groups, location = loc) } + + for pos, g in capture.pos { + pos_str := haystack[pos[0]:pos[1]] + if !testing.expectf(t, pos_str 
== capture.groups[g], "position string %v %q does not correspond to group string %q", pos, pos_str, capture.groups[g]) { + break + } + } } check_expression :: proc(t: ^testing.T, pattern, haystack: string, needles: ..string, extra_flags: regex.Flags = {}, loc := #caller_location) { From d3a51e208d01d2ab59067018a219340d42a228fa Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 18:40:27 -0400 Subject: [PATCH 15/24] Hide `Regular_Expression` values We don't directly support printing these. To prevent future issues being raised about the pattern being missing if someone tries to print one, hide everything. --- core/text/regex/regex.odin | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index 0bb0b78247c..9da7e9246b3 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -30,8 +30,8 @@ Capture :: struct { } Regular_Expression :: struct { - flags: Flags, - class_data: []virtual_machine.Rune_Class_Data, + flags: Flags `fmt:"-"`, + class_data: []virtual_machine.Rune_Class_Data `fmt:"-"`, program: []virtual_machine.Opcode `fmt:"-"`, } From babdc432e963f2c6456b0d85102419c526600718 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 18:42:46 -0400 Subject: [PATCH 16/24] Move `Flag_To_Letter` to `core:text/regex/common` --- core/text/regex/common/common.odin | 11 +++++++++++ core/text/regex/regex.odin | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/core/text/regex/common/common.odin b/core/text/regex/common/common.odin index f401658cb9b..1169bb3d4f5 100644 --- a/core/text/regex/common/common.odin +++ b/core/text/regex/common/common.odin @@ -25,3 +25,14 @@ Flag :: enum u8 { } Flags :: bit_set[Flag; u8] + +@(rodata) +Flag_To_Letter := #sparse[Flag]u8 { + .Global = 'g', + .Multiline = 'm', + .Case_Insensitive = 'i', + .Ignore_Whitespace 
= 'x', + .Unicode = 'u', + .No_Capture = 'n', + .No_Optimization = '-', +} diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index 9da7e9246b3..a4bd4292e61 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -36,17 +36,6 @@ Regular_Expression :: struct { } -@(rodata) -Flag_To_Letter := #sparse[Flag]u8 { - .Global = 'g', - .Multiline = 'm', - .Case_Insensitive = 'i', - .Ignore_Whitespace = 'x', - .Unicode = 'u', - .No_Capture = 'n', - .No_Optimization = '-', -} - /* Create a regular expression from a string pattern and a set of flags. From 1ccb0b25583cc2e5b1b58ba6d60571d637a28370 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 18:45:20 -0400 Subject: [PATCH 17/24] Remove unused code --- core/text/regex/compiler/compiler.odin | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin index f5d6d2f6a7f..1ce881894be 100644 --- a/core/text/regex/compiler/compiler.odin +++ b/core/text/regex/compiler/compiler.odin @@ -41,7 +41,6 @@ SPLIT_SIZE :: size_of(Opcode) + 2 * size_of(u16) Compiler :: struct { flags: common.Flags, - anchor_start_seen: bool, class_data: [dynamic]Rune_Class_Data, } @@ -192,7 +191,6 @@ generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) { append(&code, Opcode.Multiline_Close) } else { if specific.start { - c.anchor_start_seen = true append(&code, Opcode.Assert_Start) } else { append(&code, Opcode.Assert_End) From 743480b1a489ecd889a0ea9d29842555b136606d Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 18:51:40 -0400 Subject: [PATCH 18/24] Use `regex.destroy` for test captures --- tests/core/text/regex/test_core_text_regex.odin | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index 
d9d4f8cbc46..96a3058561d 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -21,10 +21,7 @@ check_expression_with_flags :: proc(t: ^testing.T, pattern: string, flags: regex defer regex.destroy(rex) capture, success := regex.match(rex, haystack) - defer { - delete(capture.groups) - delete(capture.pos) - } + defer regex.destroy(capture) if len(needles) > 0 { testing.expect(t, success, "match failed", loc = loc) From ca7e46d56f54b33bb3ac630ead662c327733774d Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 18:51:51 -0400 Subject: [PATCH 19/24] Add explicit test case for Capture `pos` --- .../core/text/regex/test_core_text_regex.odin | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index 96a3058561d..d272dcc0bd0 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -499,6 +499,31 @@ test_word_boundaries :: proc(t: ^testing.T) { } } +@test +test_pos_index_explicitly :: proc(t: ^testing.T) { + STR :: "This is an island." 
+ EXPR :: `\bis\b` + + rex, err := regex.create(EXPR, { .Global }) + if !testing.expect_value(t, err, nil) { + return + } + defer regex.destroy(rex) + + capture, success := regex.match(rex, STR) + log.info(capture, success) + if !testing.expect(t, success) { + return + } + defer regex.destroy(capture) + + if !testing.expect_value(t, len(capture.pos), 1) { + return + } + testing.expect_value(t, capture.pos[0][0], 5) + testing.expect_value(t, capture.pos[0][1], 7) +} + @test test_non_word_boundaries :: proc(t: ^testing.T) { { From dde42f0ebcef9dd7741761e6a7cc5ba738b63320 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 18:55:54 -0400 Subject: [PATCH 20/24] Add more documentation for `core:text/regex` API --- core/text/regex/regex.odin | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index a4bd4292e61..0d8a1d9c063 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -13,22 +13,43 @@ Compiler_Error :: compiler.Error Creation_Error :: enum { None, + // A `\` was supplied as the delimiter to `create_by_user`. Bad_Delimiter, + // A pair of delimiters for `create_by_user` was not found. Expected_Delimiter, + // An unknown letter was supplied to `create_by_user` after the last delimiter. Unknown_Flag, } Error :: union #shared_nil { + // An error that can occur in the pattern parsing phase. + // + // Most of these are regular expression syntax errors and are either + // context-dependent as to what they mean or have self-explanatory names. Parser_Error, + // An error that can occur in the pattern compiling phase. + // + // Of the two that can be returned, they have to do with exceeding the + // limitations of the Virtual Machine. Compiler_Error, + // An error that occurs only for `create_by_user`. Creation_Error, } +/* +This struct corresponds to a set of string captures from a RegEx match. 
+ +`pos` will contain the start and end positions for each string in `groups`, +such that `str[pos[0][0]:pos[0][1]] == groups[0]`. +*/ Capture :: struct { pos: [][2]int, groups: []string, } +/* +A compiled Regular Expression value, to be used with the `match_*` procedures. +*/ Regular_Expression :: struct { flags: Flags `fmt:"-"`, class_data: []virtual_machine.Rune_Class_Data `fmt:"-"`, From e17fc8272b08d1e2f59c13ff23df9a3d84a0c8a0 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 19:12:46 -0400 Subject: [PATCH 21/24] Document rationale behind RegEx shorthand classes --- core/text/regex/doc.odin | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/core/text/regex/doc.odin b/core/text/regex/doc.odin index 7b28bbc3d9e..61ab8b80e10 100644 --- a/core/text/regex/doc.odin +++ b/core/text/regex/doc.odin @@ -29,6 +29,24 @@ These specifiers can be composed together, such as an optional group: This package also supports the non-greedy variants of the repeating and optional specifiers by appending a `?` to them. +Of the shorthand classes that are supported, they are all ASCII-based, even +when compiling in Unicode mode. This is for the sake of general performance and +simplicity, as there are thousands of Unicode codepoints which would qualify as +either a digit, space, or word character which could be irrelevant depending on +what is being matched. + +Here are the shorthand class equivalencies: + \d: [0-9] + \s: [\t\n\f\r ] + \w: [0-9A-Z_a-z] + +If you need your own shorthands, you can compose strings together like so: + MY_HEX :: "[0-9A-Fa-f]" + PATTERN :: MY_HEX + "-" + MY_HEX + +The compiler will handle turning multiple identical classes into references to +the same set of matching runes, so there's no penalty for doing it like this. 
+ ``Some people, when confronted with a problem, think From 14858309f082e7fccdeb9859422f7ba1f0ee98c8 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 19:18:16 -0400 Subject: [PATCH 22/24] Add explicit license info to `core:text/regex` --- core/text/regex/common/common.odin | 8 ++++++++ core/text/regex/common/debugging.odin | 8 ++++++++ core/text/regex/compiler/compiler.odin | 8 ++++++++ core/text/regex/compiler/debugging.odin | 8 ++++++++ core/text/regex/optimizer/optimizer.odin | 8 ++++++++ core/text/regex/parser/debugging.odin | 8 ++++++++ core/text/regex/parser/parser.odin | 8 ++++++++ core/text/regex/regex.odin | 8 ++++++++ core/text/regex/tokenizer/tokenizer.odin | 8 ++++++++ core/text/regex/virtual_machine/util.odin | 8 ++++++++ core/text/regex/virtual_machine/virtual_machine.odin | 8 ++++++++ 11 files changed, 88 insertions(+) diff --git a/core/text/regex/common/common.odin b/core/text/regex/common/common.odin index 1169bb3d4f5..4a303e0a371 100644 --- a/core/text/regex/common/common.odin +++ b/core/text/regex/common/common.odin @@ -1,6 +1,14 @@ // This package helps break dependency cycles. package regex_common +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. +*/ + // VM limitations MAX_CAPTURE_GROUPS :: max(#config(ODIN_REGEX_MAX_CAPTURE_GROUPS, 10), 10) MAX_PROGRAM_SIZE :: int(max(i16)) diff --git a/core/text/regex/common/debugging.odin b/core/text/regex/common/debugging.odin index 062c314ccf2..0e4161a9268 100644 --- a/core/text/regex/common/debugging.odin +++ b/core/text/regex/common/debugging.odin @@ -1,5 +1,13 @@ package regex_common +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. 
+*/ + @require import "core:os" import "core:io" import "core:strings" diff --git a/core/text/regex/compiler/compiler.odin b/core/text/regex/compiler/compiler.odin index 1ce881894be..b3ded01041c 100644 --- a/core/text/regex/compiler/compiler.odin +++ b/core/text/regex/compiler/compiler.odin @@ -1,5 +1,13 @@ package regex_compiler +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. +*/ + import "base:intrinsics" import "core:text/regex/common" import "core:text/regex/parser" diff --git a/core/text/regex/compiler/debugging.odin b/core/text/regex/compiler/debugging.odin index 114b88fa28d..142cb8839fb 100644 --- a/core/text/regex/compiler/debugging.odin +++ b/core/text/regex/compiler/debugging.odin @@ -1,5 +1,13 @@ package regex_compiler +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. +*/ + import "base:intrinsics" import "core:io" import "core:text/regex/common" diff --git a/core/text/regex/optimizer/optimizer.odin b/core/text/regex/optimizer/optimizer.odin index fbb65cf79b4..835e5022c3c 100644 --- a/core/text/regex/optimizer/optimizer.odin +++ b/core/text/regex/optimizer/optimizer.odin @@ -1,5 +1,13 @@ package regex_optimizer +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. +*/ + import "base:intrinsics" @require import "core:io" import "core:slice" diff --git a/core/text/regex/parser/debugging.odin b/core/text/regex/parser/debugging.odin index 4d531965c66..e060f58c29f 100644 --- a/core/text/regex/parser/debugging.odin +++ b/core/text/regex/parser/debugging.odin @@ -1,5 +1,13 @@ package regex_parser +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. 
+*/ + import "core:io" write_node :: proc(w: io.Writer, node: Node) { diff --git a/core/text/regex/parser/parser.odin b/core/text/regex/parser/parser.odin index 1958ee39918..720992cb9cd 100644 --- a/core/text/regex/parser/parser.odin +++ b/core/text/regex/parser/parser.odin @@ -1,5 +1,13 @@ package regex_parser +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. +*/ + import "base:intrinsics" import "core:strconv" import "core:strings" diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index 0d8a1d9c063..9ff9241926a 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -1,5 +1,13 @@ package regex +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. +*/ + import "core:text/regex/common" import "core:text/regex/compiler" import "core:text/regex/optimizer" diff --git a/core/text/regex/tokenizer/tokenizer.odin b/core/text/regex/tokenizer/tokenizer.odin index 5804439a861..447fe4329dc 100644 --- a/core/text/regex/tokenizer/tokenizer.odin +++ b/core/text/regex/tokenizer/tokenizer.odin @@ -1,5 +1,13 @@ package regex_tokenizer +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. +*/ + import "core:text/regex/common" import "core:unicode/utf8" diff --git a/core/text/regex/virtual_machine/util.odin b/core/text/regex/virtual_machine/util.odin index edf055bc709..fa94a139ff7 100644 --- a/core/text/regex/virtual_machine/util.odin +++ b/core/text/regex/virtual_machine/util.odin @@ -1,5 +1,13 @@ package regex_vm +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. 
+*/ + Opcode_Iterator :: struct { code: Program, pc: int, diff --git a/core/text/regex/virtual_machine/virtual_machine.odin b/core/text/regex/virtual_machine/virtual_machine.odin index 7eb6b1f9be2..a4fca6c4d12 100644 --- a/core/text/regex/virtual_machine/virtual_machine.odin +++ b/core/text/regex/virtual_machine/virtual_machine.odin @@ -1,5 +1,13 @@ package regex_vm +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: Initial implementation. +*/ + import "base:intrinsics" @require import "core:io" import "core:slice" From 8f5b838a071d010959a3f38a2c73cf49f31a16cf Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 5 Aug 2024 03:49:29 -0400 Subject: [PATCH 23/24] Review manual `for` loops in `core:text/regex` --- core/text/regex/optimizer/optimizer.odin | 8 ++++---- core/text/regex/parser/parser.odin | 4 +++- core/text/regex/regex.odin | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/core/text/regex/optimizer/optimizer.odin b/core/text/regex/optimizer/optimizer.odin index 835e5022c3c..e23cc1bc50b 100644 --- a/core/text/regex/optimizer/optimizer.odin +++ b/core/text/regex/optimizer/optimizer.odin @@ -86,7 +86,7 @@ optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, chan } // Only recursive optimizations: - for i := 0; i < len(specific.nodes); i += 1 { + #no_bounds_check for i := 0; i < len(specific.nodes); i += 1 { subnode, subnode_changes := optimize_subtree(specific.nodes[i], flags) changes += subnode_changes if subnode == nil { @@ -194,7 +194,7 @@ optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, chan new_range.lower = specific.runes[0] new_range.upper = specific.runes[0] - for i := 1; i < len(specific.runes); i += 1 { + #no_bounds_check for i := 1; i < len(specific.runes); i += 1 { r := specific.runes[i] if new_range.lower == -1 { new_range = { r, r } @@ -228,7 +228,7 @@ 
optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, chan // // DO: `[aa-c]` => `[a-c]` for range in specific.ranges { - for i := 0; i < len(specific.runes); i += 1 { + #no_bounds_check for i := 0; i < len(specific.runes); i += 1 { r := specific.runes[i] if range.lower <= r && r <= range.upper { ordered_remove(&specific.runes, i) @@ -244,7 +244,7 @@ optimize_subtree :: proc(tree: Node, flags: common.Flags) -> (result: Node, chan // DO: `[a-cd-e]` => `[a-e]` // DO: `[a-cb-e]` => `[a-e]` slice.sort_by(specific.ranges[:], class_range_sorter) - for i := 0; i < len(specific.ranges) - 1; i += 1 { + #no_bounds_check for i := 0; i < len(specific.ranges) - 1; i += 1 { for j := i + 1; j < len(specific.ranges); j += 1 { left_range := &specific.ranges[i] right_range := specific.ranges[j] diff --git a/core/text/regex/parser/parser.odin b/core/text/regex/parser/parser.odin index 720992cb9cd..038d4cb85f6 100644 --- a/core/text/regex/parser/parser.odin +++ b/core/text/regex/parser/parser.odin @@ -225,7 +225,7 @@ null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) node := new(Node_Rune_Class) - for i := 0; i < len(token.text); /**/ { + #no_bounds_check for i := 0; i < len(token.text); /**/ { r, size := utf8.decode_rune(token.text[i:]) if i == 0 && r == '^' { node.negating = true @@ -298,6 +298,8 @@ null_denotation :: proc(p: ^Parser, token: Token) -> (result: Node, err: Error) } if .Case_Insensitive in p.flags { + // These two loops cannot be in the form of `for x in y` because + // they append to the data that they iterate over. 
length := len(node.runes) #no_bounds_check for i := 0; i < length; i += 1 { r := node.runes[i] diff --git a/core/text/regex/regex.odin b/core/text/regex/regex.odin index 9ff9241926a..3dc26b5c63a 100644 --- a/core/text/regex/regex.odin +++ b/core/text/regex/regex.odin @@ -291,7 +291,7 @@ match_and_allocate_capture :: proc( context.allocator = permanent_allocator num_groups := 0 - for i := 0; i < len(saved); i += 2 { + #no_bounds_check for i := 0; i < len(saved); i += 2 { a, b := saved[i], saved[i + 1] if a == -1 || b == -1 { continue From d0d4f19097a8df19ebe5ef471047c656bd2318b3 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Mon, 5 Aug 2024 03:50:41 -0400 Subject: [PATCH 24/24] Remove debug line from test --- tests/core/text/regex/test_core_text_regex.odin | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin index d272dcc0bd0..d9913058415 100644 --- a/tests/core/text/regex/test_core_text_regex.odin +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -511,7 +511,6 @@ test_pos_index_explicitly :: proc(t: ^testing.T) { defer regex.destroy(rex) capture, success := regex.match(rex, STR) - log.info(capture, success) if !testing.expect(t, success) { return }