diff --git a/tests/core/normal.odin b/tests/core/normal.odin index 065090be3..1f34e3292 100644 --- a/tests/core/normal.odin +++ b/tests/core/normal.odin @@ -38,6 +38,7 @@ download_assets :: proc() { @(require) import "strings" @(require) import "text/i18n" @(require) import "text/match" +@(require) import "text/regex" @(require) import "thread" @(require) import "time" @(require) import "unicode" diff --git a/tests/core/text/regex/test_core_text_regex.odin b/tests/core/text/regex/test_core_text_regex.odin new file mode 100644 index 000000000..da44e6b2d --- /dev/null +++ b/tests/core/text/regex/test_core_text_regex.odin @@ -0,0 +1,1012 @@ +package test_core_text_regex + +import "core:fmt" +import "core:io" +import "core:log" +import "core:strings" +import "core:testing" +import "core:text/regex" +import "core:text/regex/common" +import "core:text/regex/parser" +import "core:text/regex/tokenizer" + + +check_expression_with_flags :: proc(t: ^testing.T, pattern: string, flags: regex.Flags, haystack: string, needles: ..string, loc := #caller_location) { + rex, parse_err := regex.create(pattern, flags) + if !testing.expect_value(t, parse_err, nil, loc = loc) { + log.infof("Failed test's flags were: %v", flags, location = loc) + return + } + defer regex.destroy(rex) + + capture, success := regex.match(rex, haystack) + defer { + delete(capture.groups) + delete(capture.pos) + } + + if len(needles) > 0 { + testing.expect(t, success, "match failed", loc = loc) + } + + matches_aligned := testing.expectf(t, len(needles) == len(capture.groups), + "expected %i match groups, got %i (flags: %w)", + len(needles), len(capture.groups), flags, loc = loc) + + if matches_aligned { + for needle, i in needles { + if !testing.expectf(t, capture.groups[i] == needle, + "match group %i was %q, expected %q (flags: %w)", + i, capture.groups[i], needle, flags, loc = loc) { + } + } + } else { + log.infof("match groups were: %v", capture.groups, location = loc) + } +} + +check_expression :: proc(t: 
^testing.T, pattern, haystack: string, needles: ..string, extra_flags: regex.Flags = {}, loc := #caller_location) { + check_expression_with_flags(t, pattern, { .Global } + extra_flags, + haystack, ..needles, loc = loc) + check_expression_with_flags(t, pattern, { .Global, .No_Optimization } + extra_flags, + haystack, ..needles, loc = loc) + check_expression_with_flags(t, pattern, { .Global, .Unicode } + extra_flags, + haystack, ..needles, loc = loc) + check_expression_with_flags(t, pattern, { .Global, .Unicode, .No_Optimization } + extra_flags, + haystack, ..needles, loc = loc) +} + + +@test +test_concatenation :: proc(t: ^testing.T) { + check_expression(t, "abc", "abc", "abc") +} + +@test +test_rune_class :: proc(t: ^testing.T) { + EXPR :: "[abc]" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "b", "b") + check_expression(t, EXPR, "c", "c") +} + +@test +test_rune_ranges :: proc(t: ^testing.T) { + EXPR :: "0x[0-9A-Fa-f]+" + check_expression(t, EXPR, "0x0065c816", "0x0065c816") +} + +@test +test_rune_range_terminal_dash :: proc(t: ^testing.T) { + { + EXPR :: "[a-]" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[-a]" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[-a-]" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[-]" + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[--]" + check_expression(t, EXPR, "-", "-") + } + { + EXPR :: "[---]" + check_expression(t, EXPR, "-", "-") + } +} + +@test +test_rune_range_escaping_class :: proc(t: ^testing.T) { + EXPR :: `[\]a\[\.]` + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "[", "[") + check_expression(t, EXPR, "]", "]") + check_expression(t, EXPR, ".", ".") + check_expression(t, EXPR, "b") +} + +@test +test_negated_rune_class :: proc(t: ^testing.T) { + EXPR :: "[^ac-d]" + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "b", "b") 
+ check_expression(t, EXPR, "e", "e") + check_expression(t, EXPR, "c") + check_expression(t, EXPR, "d") +} + +@test +test_shorthand_classes :: proc(t: ^testing.T) { + EXPR_P :: `\d\w\s` + check_expression(t, EXPR_P, "1a ", "1a ") + check_expression(t, EXPR_P, "a!1") + EXPR_N :: `\D\W\S` + check_expression(t, EXPR_N, "a!1", "a!1") + check_expression(t, EXPR_N, "1a ") +} + +@test +test_shorthand_classes_in_classes :: proc(t: ^testing.T) { + EXPR_P :: `[\d][\w][\s]` + check_expression(t, EXPR_P, "1a ", "1a ") + check_expression(t, EXPR_P, "a!1") + EXPR_NP :: `[^\d][^\w][^\s]` + check_expression(t, EXPR_NP, "a!1", "a!1") + check_expression(t, EXPR_NP, "1a ") + EXPR_N :: `[\D][\W][\S]` + check_expression(t, EXPR_N, "a!1", "a!1") + check_expression(t, EXPR_N, "1a ") + EXPR_NN :: `[^\D][^\W][^\S]` + check_expression(t, EXPR_NN, "1a ", "1a ") + check_expression(t, EXPR_NN, "a!1") +} + +@test +test_mixed_shorthand_class :: proc(t: ^testing.T) { + EXPR_P :: `[\d\s]+` + check_expression(t, EXPR_P, "0123456789 98", "0123456789 98") + check_expression(t, EXPR_P, "!@#$%^&*()_()") + EXPR_NP :: `[^\d\s]+` + check_expression(t, EXPR_NP, "!@#$%^&*()_()", "!@#$%^&*()_()") + check_expression(t, EXPR_NP, "0123456789 98") +} + +@test +test_wildcard :: proc(t: ^testing.T) { + EXPR :: "." 
+ check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, ".", ".") +} + +@test +test_alternation :: proc(t: ^testing.T) { + EXPR :: "aa|bb|cc" + check_expression(t, EXPR, "aa", "aa") + check_expression(t, EXPR, "bb", "bb") + check_expression(t, EXPR, "cc", "cc") +} + +@test +test_optional :: proc(t: ^testing.T) { + EXPR :: "a?a?a?aaa" + check_expression(t, EXPR, "aaa", "aaa") +} + +@test +test_repeat_zero :: proc(t: ^testing.T) { + EXPR :: "a*b" + check_expression(t, EXPR, "aaab", "aaab") +} + +@test +test_repeat_one :: proc(t: ^testing.T) { + EXPR :: "a+b" + check_expression(t, EXPR, "aaab", "aaab") +} + +@test +test_greedy :: proc(t: ^testing.T) { + HTML :: "" + + check_expression(t, "<.+>", HTML, HTML) + check_expression(t, "<.*>", HTML, HTML) + + check_expression(t, "aaa?", "aaa", "aaa") +} + +@test +test_non_greedy :: proc(t: ^testing.T) { + HTML :: "" + + check_expression(t, "<.+?>", HTML, "") + check_expression(t, "<.*?>", HTML, "") + + // NOTE: make a comment about optional non-greedy capture groups + check_expression(t, "aaa??", "aaa", "aa") +} + +@test +test_groups :: proc(t: ^testing.T) { + check_expression(t, "a(b)", "ab", /*|*/ "ab", "b") + check_expression(t, "(a)b", "ab", /*|*/ "ab", "a") + check_expression(t, "(a)(b)", "ab", /*|*/ "ab", "a", "b") + + check_expression(t, "(a(b))", "ab", /*|*/ "ab", "ab", "b") + check_expression(t, "((ab))", "ab", /*|*/ "ab", "ab", "ab") + check_expression(t, "((a)b)", "ab", /*|*/ "ab", "ab", "a") + + check_expression(t, "(ab)+", "ababababab", /*|*/ "ababababab", "ab") + check_expression(t, "((ab)+)", "ababababab", /*|*/ "ababababab", "ababababab", "ab") +} + +@test +test_class_group_repeat :: proc(t: ^testing.T) { + EXPR_1 :: "([0-9]:?)+" + EXPR_2 :: "([0-9]+:?)+" + check_expression(t, EXPR_1, "123:456:789", "123:456:789", "9") + check_expression(t, EXPR_2, "123:456:789", "123:456:789", "789") +} + +@test +test_non_capture_group :: proc(t: ^testing.T) { + EXPR :: "(?:a|b)c" + check_expression(t, EXPR, "ac", 
"ac") + check_expression(t, EXPR, "bc", "bc") + check_expression(t, EXPR, "cc") +} + +@test +test_optional_capture_group :: proc(t: ^testing.T) { + EXPR :: "^(blue|straw)?berry" + check_expression(t, EXPR, "berry", "berry") + check_expression(t, EXPR, "blueberry", "blueberry", "blue") + check_expression(t, EXPR, "strawberry", "strawberry", "straw") + check_expression(t, EXPR, "cranberry") +} + +@test +test_max_capture_groups :: proc(t: ^testing.T) { + EXPR :: "(1)(2)(3)(4)(5)(6)(7)(8)(9)" + check_expression(t, EXPR, "123456789", "123456789", + "1", "2", "3", "4", "5", "6", "7", "8", "9") +} + +@test +test_repetition :: proc(t: ^testing.T) { + { + EXPR :: "^a{3}$" + check_expression(t, EXPR, "aaa", "aaa") + check_expression(t, EXPR, "aaaa") + } + { + EXPR :: "^a{3,5}$" + check_expression(t, EXPR, "aaa", "aaa") + check_expression(t, EXPR, "aaaa", "aaaa") + check_expression(t, EXPR, "aaaaa", "aaaaa") + check_expression(t, EXPR, "aaaaaa") + } + { + EXPR :: "^(?:meow){2}$" + check_expression(t, EXPR, "meow") + check_expression(t, EXPR, "meowmeow", "meowmeow") + check_expression(t, EXPR, "meowmeowmeow") + } + { + EXPR :: "a{2,}" + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "aa", "aa") + check_expression(t, EXPR, "aaa", "aaa") + } + { + EXPR :: "a{,2}" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "aa", "aa") + check_expression(t, EXPR, "aaa", "aa") + } + { + EXPR :: "^a{3,3}$" + check_expression(t, EXPR, "aa") + check_expression(t, EXPR, "aaa", "aaa") + check_expression(t, EXPR, "aaaa") + } + { + EXPR :: "a{0,}" + check_expression(t, EXPR, "aaa", "aaa") + } +} + +@test +test_repeated_groups :: proc(t: ^testing.T) { + { + EXPR :: "(ab){3}" + check_expression(t, EXPR, "ababab", "ababab", "ab") + } + { + EXPR :: "((?:ab){3})" + check_expression(t, EXPR, "ababab", "ababab", "ababab") + } +} + +@test +test_escaped_newline :: proc(t: ^testing.T) { + EXPR :: `\n[\n]` + check_expression(t, EXPR, "\n\n", "\n\n") +} + +@test +test_anchors :: 
proc(t: ^testing.T) { + { + EXPR :: "^ab" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "aab") + } + { + EXPR :: "ab$" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "aab", "ab") + } + { + EXPR :: "^ab$" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "aab") + } +} + +@test +test_grouped_anchors :: proc(t: ^testing.T) { + { + EXPR :: "^a|b" + check_expression(t, EXPR, "ab", "a") + check_expression(t, EXPR, "ba", "b") + } + { + EXPR :: "b|c$" + check_expression(t, EXPR, "ac", "c") + check_expression(t, EXPR, "cb", "b") + } + { + EXPR :: "^hellope$|world" + check_expression(t, EXPR, "hellope", "hellope") + check_expression(t, EXPR, "hellope world", "world") + } +} + +@test +test_empty_alternation :: proc(t: ^testing.T) { + { + EXPR :: "(?:a|)b" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "b", "b") + } + { + EXPR :: "(?:|a)b" + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, "b", "b") + } + { + EXPR :: "|b" + check_expression(t, EXPR, "b", "") + check_expression(t, EXPR, "", "") + } + { + EXPR :: "a|" + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "", "") + } + { + EXPR :: "|" + check_expression(t, EXPR, "a", "") + check_expression(t, EXPR, "", "") + } +} + +@test +test_empty_class :: proc(t: ^testing.T) { + EXPR :: "a[]b" + check_expression(t, EXPR, "ab", "ab") +} + +@test +test_dot_in_class :: proc(t: ^testing.T) { + EXPR :: `[a\..]` + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, ".", ".") + check_expression(t, EXPR, "b") +} + + +@test +test_word_boundaries :: proc(t: ^testing.T) { + STR :: "This is an island." 
+ { + EXPR :: `\bis\b` + check_expression(t, EXPR, STR, "is") + } + { + EXPR :: `\bis\w+` + check_expression(t, EXPR, STR, "island") + } + { + EXPR :: `\w+is\b` + check_expression(t, EXPR, STR, "This") + } + { + EXPR :: `\b\w\w\b` + check_expression(t, EXPR, STR, "is") + } +} + +@test +test_non_word_boundaries :: proc(t: ^testing.T) { + { + EXPR :: `.\B.` + check_expression(t, EXPR, "ab", "ab") + check_expression(t, EXPR, " ", " ") + check_expression(t, EXPR, "a ") + check_expression(t, EXPR, " b") + } + { + EXPR :: `\B.\B` + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "abc", "b") + } + { + EXPR :: `\B.+` + check_expression(t, EXPR, "abc", "bc") + } + { + EXPR :: `.+\B` + check_expression(t, EXPR, "abc", "ab") + } +} + +@test +test_empty_patterns :: proc(t: ^testing.T) { + { + EXPR :: "" + check_expression(t, EXPR, "abc", "") + } + { + EXPR :: "^$" + check_expression(t, EXPR, "", "") + check_expression(t, EXPR, "a") + } +} + +@test +test_unanchored :: proc(t: ^testing.T) { + EXPR :: "ab" + check_expression(t, EXPR, "cab", "ab") +} + +@test +test_affixes :: proc(t: ^testing.T) { + // This test is for the optimizer. + EXPR :: "^(?:samples|ample|sample)$" + check_expression(t, EXPR, "sample", "sample") + check_expression(t, EXPR, "samples", "samples") + check_expression(t, EXPR, "ample", "ample") + check_expression(t, EXPR, "amples") +} + +@test +test_anchored_capture_until_end :: proc(t: ^testing.T) { + // This test is for the optimizer. 
+ { + EXPR :: `^hellope.*$` + check_expression(t, EXPR, "hellope world", "hellope world") + check_expression(t, EXPR, "hellope", "hellope") + check_expression(t, EXPR, "hellope !", "hellope !") + } + { + EXPR :: `^hellope.+$` + check_expression(t, EXPR, "hellope world", "hellope world") + check_expression(t, EXPR, "hellope") + check_expression(t, EXPR, "hellope !", "hellope !") + } + { + EXPR :: `^(aa|bb|cc.+$).*$` + check_expression(t, EXPR, "aa", "aa", "aa") + check_expression(t, EXPR, "bb", "bb", "bb") + check_expression(t, EXPR, "bbaa", "bbaa", "bb") + check_expression(t, EXPR, "cc") + check_expression(t, EXPR, "ccc", "ccc", "ccc") + check_expression(t, EXPR, "cccc", "cccc", "cccc") + } + // This makes sure that the `.*$` / `.+$` optimization doesn't cause + // any issues if someone does something strange like putting it in the + // middle of an expression. + { + EXPR :: `^(a(b.*$)c).*$` + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "ab") + check_expression(t, EXPR, "abc") + } + { + EXPR :: `^(a(b.*$)?c).*$` + check_expression(t, EXPR, "a") + check_expression(t, EXPR, "ab") + check_expression(t, EXPR, "abc") + check_expression(t, EXPR, "ac", "ac", "ac") + check_expression(t, EXPR, "acc", "acc", "ac") + } +} + +@test +test_unicode_explicitly :: proc(t: ^testing.T) { + { + EXPR :: "^....!$" + check_expression_with_flags(t, EXPR, { .Unicode }, + "こにちは!", "こにちは!") + check_expression_with_flags(t, EXPR, { .Unicode, .No_Optimization }, + "こにちは!", "こにちは!") + } + { + EXPR :: "こにちは!" 
+ check_expression_with_flags(t, EXPR, { .Global, .Unicode }, + "Hello こにちは!", "こにちは!") + check_expression_with_flags(t, EXPR, { .Global, .Unicode, .No_Optimization }, + "Hello こにちは!", "こにちは!") + } +} + +@test +test_no_capture_match :: proc(t: ^testing.T) { + EXPR :: "^abc$" + + rex, parse_err := regex.create(EXPR, { .No_Capture }) + if !testing.expect_value(t, parse_err, nil) { + return + } + defer regex.destroy(rex) + + _, matched := regex.match(rex, "abc") + testing.expect(t, matched) +} + +@test +test_comments :: proc(t: ^testing.T) { + EXPR :: `^[abc]# This is a comment. +[def]# This is another comment. +\#$# This is a comment following an escaped '#'.` + check_expression(t, EXPR, "ad#", "ad#") +} + +@test +test_ignore_whitespace :: proc(t: ^testing.T) { + EXPR :: "\f" + ` +\ H e l # Note that the first space on this line is escaped, thus it is not ignored. + l +o p e [ ] w o rld (?: [ ]) ! # Spaces in classes are fine, too. +` + "\r" + + check_expression(t, EXPR, " Hellope world !", " Hellope world !", extra_flags = { .Ignore_Whitespace }) +} + +@test +test_case_insensitive :: proc(t: ^testing.T) { + EXPR :: `hElLoPe [w!][o-P]+rLd!` + check_expression(t, EXPR, "HeLlOpE WoRlD!", "HeLlOpE WoRlD!", extra_flags = { .Case_Insensitive }) +} + +@test +test_multiline :: proc(t: ^testing.T) { + { + EXPR :: `^hellope$world$` + check_expression(t, EXPR, "\nhellope\nworld\n", "\nhellope\nworld\n", extra_flags = { .Multiline }) + check_expression(t, EXPR, "hellope\nworld", "hellope\nworld", extra_flags = { .Multiline }) + check_expression(t, EXPR, "hellope\rworld", "hellope\rworld", extra_flags = { .Multiline }) + check_expression(t, EXPR, "hellope\r\nworld", "hellope\r\nworld", extra_flags = { .Multiline }) + } + { + EXPR :: `^?.$` + check_expression(t, EXPR, "\nh", "\nh", extra_flags = { .Multiline }) + check_expression(t, EXPR, "h", "h", extra_flags = { .Multiline }) + } + { + EXPR :: `^$` + check_expression(t, EXPR, "\n", "\n", extra_flags = { .Multiline }) + 
check_expression(t, EXPR, "", "", extra_flags = { .Multiline }) + } + { + EXPR :: `$` + check_expression(t, EXPR, "\n", "\n", extra_flags = { .Multiline }) + check_expression(t, EXPR, "", "", extra_flags = { .Multiline }) + } +} + +@test +test_optional_inside_optional :: proc(t: ^testing.T) { + EXPR :: `(?:a?)?` + check_expression(t, EXPR, "a", "a") + check_expression(t, EXPR, "", "") +} + +@test +test_printing :: proc(t: ^testing.T) { + rex, parse_err := regex.create(`^/a$`, { + .Global, + .Multiline, + .Case_Insensitive, + .Unicode, + .Ignore_Whitespace, + .No_Optimization, + .No_Capture, + }) + if !testing.expect_value(t, parse_err, nil) { + return + } + defer regex.destroy(rex) + + str := fmt.tprint(rex) + str_hash := fmt.tprintf("%#v", rex) + testing.expect_value(t, str, `/^\/a$/gmixun-`) + testing.expect_value(t, str_hash, `/^\/a$/gmixun-`) +} + + + +@test +test_error_bad_repetitions :: proc(t: ^testing.T) { + check_repetition_error :: proc(t: ^testing.T, pattern: string, loc := #caller_location) { + rex, err := regex.create(pattern) + regex.destroy(rex) + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Invalid_Repetition) + if !ok { + log.errorf("expected error Invalid_Repetition, got %v", parse_err, location = loc) + } + } + + check_repetition_error(t, "a{-1,2}") + check_repetition_error(t, "a{2,1}") + check_repetition_error(t, "a{bc}") + check_repetition_error(t, "a{,-3}") + check_repetition_error(t, "a{d,}") + check_repetition_error(t, "a{}") + check_repetition_error(t, "a{0,0}") + check_repetition_error(t, "a{,0}") + check_repetition_error(t, "a{,}") +} + +@test +test_error_invalid_unicode_in_pattern :: proc(t: ^testing.T) { + rex, err := regex.create("\xC0", { .Unicode }) + regex.destroy(rex) + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Invalid_Unicode) + if !ok { + log.errorf("expected error Invalid_Unicode, got %v", parse_err) + } +} + +@test +test_error_invalid_unicode_in_string :: proc(t: ^testing.T) { + 
EXPR :: "^...$" + // NOTE: Matching on invalid Unicode is currently safe. + // If `utf8.decode_rune` ever changes, this test may fail. + check_expression(t, EXPR, "\xC0\xFF\xFE", "\xC0\xFF\xFE") +} + +@test +test_error_too_many_capture_groups :: proc(t: ^testing.T) { + // NOTE: There are 1 + 9 + 1 capture groups in this pattern. + // Remember the implicit capture group 0. + rex, err := regex.create("(1)(2)(3)(4)(5)(6)(7)(8)(9) (A)") + regex.destroy(rex) + + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Too_Many_Capture_Groups) + if !ok { + log.errorf("expected error Too_Many_Capture_Groups, got %v", parse_err) + } +} + +@test +test_error_unclosed_paren :: proc(t: ^testing.T) { + rex, err := regex.create("(Hellope") + regex.destroy(rex) + + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Expected_Token) + if !ok { + log.errorf("expected error Expected_Token, got %v", parse_err) + } +} + +@test +test_error_unclosed_class :: proc(t: ^testing.T) { + rex, err := regex.create("[helope") + regex.destroy(rex) + + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Unexpected_EOF) + if !ok { + log.errorf("expected error Unexpected_EOF, got %v", parse_err) + } +} + +@test +test_error_invalid_unicode_in_unclosed_class :: proc(t: ^testing.T) { + rex, err := regex.create("[\xC0", { .Unicode }) + regex.destroy(rex) + + parse_err, _ := err.(regex.Parser_Error) + _, ok := parse_err.(parser.Invalid_Unicode) + if !ok { + log.errorf("expected error Invalid_Unicode, got %v", parse_err) + } +} + +@test +test_program_too_big :: proc(t: ^testing.T) { + sb := strings.builder_make() + w := strings.to_writer(&sb) + defer strings.builder_destroy(&sb) + + // Each byte will turn into two bytes for the whole opcode and operand, + // then the compiler will insert 5 more bytes for the Save instructions + // and the Match. + N :: common.MAX_PROGRAM_SIZE/2 - 2 + for _ in 0..