ghostty/src/config/url.zig

const std = @import("std");
const oni = @import("oniguruma");

/// Default URL/path regex. This is used to detect URLs and file paths in
/// terminal output.
///
/// This is here in the config package because one day the matchers will be
/// configurable and this will be a default.
///
/// For scheme URLs, this regex is liberal in what it accepts after the scheme,
/// with exceptions for URLs ending with . or ). Although such URLs are
/// perfectly valid, it is common for text to contain URLs surrounded by
/// parentheses (such as in Markdown links) or at the end of sentences.
/// Therefore, this regex excludes them as follows:
///
/// 1. Do not match regexes ending with .
/// 2. Do not match regexes ending with ), except for ones which contain a (
///    without a subsequent )
///
/// Rule 2 means that we handle the following two cases:
///
///   "https://en.wikipedia.org/wiki/Rust_(video_game)" (include parens)
///   "(https://example.com)" (do not include the parens)
///
/// There are many complicated cases where these heuristics break down, but
/// handling them well requires a non-regex approach.
const url_schemes =
    \\https?://|mailto:|ftp://|file:|ssh:|git://|ssh://|tel:|magnet:|ipfs://|ipns://|gemini://|gopher://|news:
;

const ipv6_url_pattern =
    \\(?:\[[:0-9a-fA-F]+(?:[:0-9a-fA-F]*)+\](?::[0-9]+)?)
;

const scheme_url_chars =
    \\[\w\-.~:/?#@!$&*+,;=%]
;

const path_chars =
    \\[\w\-.~:\/?#@!$&*+;=%]
;

const optional_bracketed_word_suffix =
    \\(?:[\(\[]\w*[\)\]])?
;

const no_trailing_punctuation =
    \\(?<![,.])
;

const no_trailing_colon =
    \\(?<!:)
;

const trailing_spaces_at_eol =
    \\(?: +(?= *$))?
;

const dotted_path_lookahead =
    \\(?=[\w\-.~:\/?#@!$&*+;=%]*\.)
;

const non_dotted_path_lookahead =
    \\(?![\w\-.~:\/?#@!$&*+;=%]*\.)
;

const dotted_path_space_segments =
    \\(?:(?<!:) (?!\w+:\/\/)(?!\.{0,2}\/)(?!~\/)[\w\-.~:\/?#@!$&*+;=%]*[\/.])*
;

const any_path_space_segments =
    \\(?:(?<!:) (?!\w+:\/\/)(?!\.{0,2}\/)(?!~\/)[\w\-.~:\/?#@!$&*+;=%]+)*
;

// Branch 1: URLs with explicit schemes (http, mailto, ftp, etc.).
const scheme_url_branch =
    "(?:" ++ url_schemes ++ ")" ++
    "(?:" ++ ipv6_url_pattern ++ "|" ++ scheme_url_chars ++ "+" ++ optional_bracketed_word_suffix ++ ")+" ++
    no_trailing_punctuation;

const rooted_or_relative_path_prefix =
    \\(?:\.\.\/|\.\/|(?<!\w)~\/|(?:[\w][\w\-.]*\/)*(?<!\w)\$[A-Za-z_]\w*\/|\.[\w][\w\-.]*\/|(?<![\w~\/])\/(?!\/))
;

// Branch 2: Absolute paths and dot-relative paths (/, ./, ../).
// A dotted segment is treated as file-like, while the undotted case stays
// broad to capture directory-like paths with spaces.
const rooted_or_relative_path_branch =
    rooted_or_relative_path_prefix ++
    "(?:" ++
    dotted_path_lookahead ++
    path_chars ++ "+" ++
    dotted_path_space_segments ++
    no_trailing_colon ++
    trailing_spaces_at_eol ++
    "|" ++
    non_dotted_path_lookahead ++
    path_chars ++ "+" ++
    any_path_space_segments ++
    no_trailing_colon ++
    trailing_spaces_at_eol ++
    ")";

// Branch 3: Bare relative paths such as src/config/url.zig.
const bare_relative_path_prefix =
    \\(?<!\$\d*)(?<!\w)[\w][\w\-.]*\/
;

const bare_relative_path_branch =
    dotted_path_lookahead ++
    bare_relative_path_prefix ++
    path_chars ++ "+" ++
    no_trailing_colon ++
    trailing_spaces_at_eol;

pub const regex =
    scheme_url_branch ++
    "|" ++
    rooted_or_relative_path_branch ++
    "|" ++
    bare_relative_path_branch;

test "url regex" {
    const testing = std.testing;

    try oni.testing.ensureInit();
    var re = try oni.Regex.init(
        regex,
        .{},
        oni.Encoding.utf8,
        oni.Syntax.default,
        null,
    );
    defer re.deinit();

    // The URL cases to test what our regex matches. Feel free to add to this
    // as we find bugs or just want more coverage.
    const cases = [_]struct {
        input: []const u8,
        expect: []const u8,
        num_matches: usize = 1,
    }{
        .{
            .input = "hello https://example.com world",
            .expect = "https://example.com",
        },
        .{
            .input = "https://example.com/foo(bar) more",
            .expect = "https://example.com/foo(bar)",
        },
        .{
            .input = "https://example.com/foo(bar)baz more",
            .expect = "https://example.com/foo(bar)baz",
        },
        .{
            .input = "Link inside (https://example.com) parens",
            .expect = "https://example.com",
        },
        .{
            .input = "Link period https://example.com. More text.",
            .expect = "https://example.com",
        },
        .{
            .input = "Link trailing comma https://example.com, more text.",
            .expect = "https://example.com",
        },
        .{
            .input = "Link in double quotes \"https://example.com\" and more",
            .expect = "https://example.com",
        },
        .{
            .input = "Link in single quotes 'https://example.com' and more",
            .expect = "https://example.com",
        },
        .{
            .input = "some file with https://google.com https://duckduckgo.com links.",
            .expect = "https://google.com",
        },
        .{
            .input = "and links in it. links https://yahoo.com mailto:test@example.com ssh://1.2.3.4",
            .expect = "https://yahoo.com",
        },
        .{
            .input = "also match http://example.com non-secure links",
            .expect = "http://example.com",
        },
        .{
            .input = "match tel://+12123456789 phone numbers",
            .expect = "tel://+12123456789",
        },
        .{
            .input = "match with query url https://example.com?query=1&other=2 and more text.",
            .expect = "https://example.com?query=1&other=2",
        },
        .{
            .input = "url with dashes [mode 2027](https://github.com/contour-terminal/terminal-unicode-core) for better unicode support",
            .expect = "https://github.com/contour-terminal/terminal-unicode-core",
        },
        .{
            .input = "dot.http://example.com",
            .expect = "http://example.com",
        },
        // weird characters in URL
        .{
            .input = "weird characters https://example.com/~user/?query=1&other=2#hash and more",
            .expect = "https://example.com/~user/?query=1&other=2#hash",
        },
        // square brackets in URL
        .{
            .input = "square brackets https://example.com/[foo] and more",
            .expect = "https://example.com/[foo]",
        },
        // square bracket following url
        .{
            .input = "[13]:TooManyStatements: TempFile#assign_temp_file_to_entity has approx 7 statements [https://example.com/docs/Too-Many-Statements.md]",
            .expect = "https://example.com/docs/Too-Many-Statements.md",
        },
        // remaining URL schemes tests
        .{
            .input = "match ftp://example.com ftp links",
            .expect = "ftp://example.com",
        },
        .{
            .input = "match file://example.com file links",
            .expect = "file://example.com",
        },
        .{
            .input = "match ssh://example.com ssh links",
            .expect = "ssh://example.com",
        },
        .{
            .input = "match git://example.com git links",
            .expect = "git://example.com",
        },
        .{
            .input = "/tmp/test.txt http://www.google.com",
            .expect = "/tmp/test.txt",
        },
        .{
            .input = "match tel:+18005551234 tel links",
            .expect = "tel:+18005551234",
        },
        .{
            .input = "match magnet:?xt=urn:btih:1234567890 magnet links",
            .expect = "magnet:?xt=urn:btih:1234567890",
        },
        .{
            .input = "match ipfs://QmSomeHashValue ipfs links",
            .expect = "ipfs://QmSomeHashValue",
        },
        .{
            .input = "match ipns://QmSomeHashValue ipns links",
            .expect = "ipns://QmSomeHashValue",
        },
        .{
            .input = "match gemini://example.com gemini links",
            .expect = "gemini://example.com",
        },
        .{
            .input = "match gopher://example.com gopher links",
            .expect = "gopher://example.com",
        },
        .{
            .input = "match news:comp.infosystems.www.servers.unix news links",
            .expect = "news:comp.infosystems.www.servers.unix",
        },
        .{
            .input = "/Users/ghostty.user/code/example.py",
            .expect = "/Users/ghostty.user/code/example.py",
        },
        .{
            .input = "/Users/ghostty.user/code/../example.py",
            .expect = "/Users/ghostty.user/code/../example.py",
        },
        .{
            .input = "/Users/ghostty.user/code/../example.py hello world",
            .expect = "/Users/ghostty.user/code/../example.py",
        },
        .{
            .input = "../example.py",
            .expect = "../example.py",
        },
        .{
            .input = "../example.py ",
            .expect = "../example.py ",
        },
        .{
            .input = "first time ../example.py contributor ",
            .expect = "../example.py",
        },
        .{
            .input = "[link](/home/user/ghostty.user/example)",
            .expect = "/home/user/ghostty.user/example",
        },
        // IPv6 URL tests - Basic tests
        .{
            .input = "Serving HTTP on :: port 8000 (http://[::]:8000/)",
            .expect = "http://[::]:8000/",
        },
        .{
            .input = "IPv6 address https://[2001:db8::1]:8080/path",
            .expect = "https://[2001:db8::1]:8080/path",
        },
        .{
            .input = "IPv6 localhost http://[::1]:3000",
            .expect = "http://[::1]:3000",
        },
        .{
            .input = "Complex IPv6 https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443/",
            .expect = "https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443/",
        },
        // IPv6 URL tests - URLs with paths and query parameters
        .{
            .input = "IPv6 with path https://[2001:db8::1]/path/to/resource",
            .expect = "https://[2001:db8::1]/path/to/resource",
        },
        .{
            .input = "IPv6 with query https://[2001:db8::1]:8080/api?param=value&other=123",
            .expect = "https://[2001:db8::1]:8080/api?param=value&other=123",
        },
        // IPv6 URL tests - Compressed forms
        .{
            .input = "IPv6 compressed http://[2001:db8::]:80/",
            .expect = "http://[2001:db8::]:80/",
        },
        .{
            .input = "IPv6 multiple zeros http://[2001:0:0:0:0:0:0:1]",
            .expect = "http://[2001:0:0:0:0:0:0:1]",
        },
        // IPv6 URL tests - Special cases
        .{
            .input = "IPv6 link-local https://[fe80::1234:5678:9abc]",
            .expect = "https://[fe80::1234:5678:9abc]",
        },
        .{
            .input = "IPv6 multicast http://[ff02::1]/stream",
            .expect = "http://[ff02::1]/stream",
        },
        // IPv6 URL tests - Mixed scenarios
        .{
            .input = "IPv6 in markdown [link](http://[2001:db8::1]/docs)",
            .expect = "http://[2001:db8::1]/docs",
        },
        // File paths with spaces
        .{
            .input = "./spaces-end.   ",
            .expect = "./spaces-end.   ",
        },
        .{
            .input = "./space middle",
            .expect = "./space middle",
        },
        .{
            .input = "../test folder/file.txt",
            .expect = "../test folder/file.txt",
        },
        .{
            .input = "/tmp/test folder/file.txt",
            .expect = "/tmp/test folder/file.txt",
        },
        .{
            .input = "/tmp/test  folder/file.txt",
            .expect = "/tmp/test",
        },
        // unified diff lines
        .{
            .input = "diff --git a/src/font/shaper/harfbuzz.zig b/src/font/shaper/harfbuzz.zig",
            .expect = "a/src/font/shaper/harfbuzz.zig",
        },
        // Two space-separated absolute paths should match only the first
        .{
            .input = "/tmp/foo /tmp/bar",
            .expect = "/tmp/foo",
        },
        .{
            .input = "/tmp/foo.txt /tmp/bar.txt",
            .expect = "/tmp/foo.txt",
        },
        // Bare relative file paths (no ./ or ../ prefix)
        .{
            .input = "src/config/url.zig",
            .expect = "src/config/url.zig",
        },
        .{
            .input = "app/folder/file.rb:1",
            .expect = "app/folder/file.rb:1",
        },
        .{
            .input = "modified:   src/config/url.zig",
            .expect = "src/config/url.zig",
        },
        .{
            .input = "lib/ghostty/terminal.zig:42:10",
            .expect = "lib/ghostty/terminal.zig:42:10",
        },
        .{
            .input = "some-pkg/src/file.txt more text",
            .expect = "some-pkg/src/file.txt",
        },
        // comma should match substrings
        .{
            .input = "src/foo.c,baz.txt",
            .expect = "src/foo.c",
        },
        .{
            .input = "~/foo/bar.txt",
            .expect = "~/foo/bar.txt",
        },
        .{
            .input = "open ~/Documents/notes.md please",
            .expect = "~/Documents/notes.md",
        },
        .{
            .input = "~/.config/ghostty/config",
            .expect = "~/.config/ghostty/config",
        },
        .{
            .input = "directory: ~/src/ghostty-org/ghostty",
            .expect = "~/src/ghostty-org/ghostty",
        },
        .{
            .input = "$HOME/src/config/url.zig",
            .expect = "$HOME/src/config/url.zig",
        },
        .{
            .input = "project dir: $PWD/src/ghostty/main.zig",
            .expect = "$PWD/src/ghostty/main.zig",
        },
        // $VAR mid-path should match fully, not partially from the $
        .{
            .input = "foo/$BAR/baz",
            .expect = "foo/$BAR/baz",
        },
        .{
            .input = ".foo/bar/$VAR",
            .expect = ".foo/bar/$VAR",
        },
        .{
            .input = ".config/ghostty/config",
            .expect = ".config/ghostty/config",
        },
        .{
            .input = "loaded from .local/share/ghostty/state.db now",
            .expect = ".local/share/ghostty/state.db",
        },
        .{
            .input = "../some/where",
            .expect = "../some/where",
        },
        // comma-separated file paths
        .{
            .input = "  - shared/src/foo/SomeItem.m:12, shared/src/",
            .expect = "shared/src/foo/SomeItem.m:12",
        },
        // mid-string dot should not partially match but fully
        .{
            .input = "foo.local/share",
            .expect = "foo.local/share",
        },
        // numeric directory should match fully
        .{
            .input = "2024/report.txt",
            .expect = "2024/report.txt",
        },
        // comma should stop matching in spaced path segments
        .{
            .input = "./foo bar,baz",
            .expect = "./foo bar",
        },
        .{
            .input = "/tmp/foo bar,baz",
            .expect = "/tmp/foo bar",
        },
        // trailing colon should not be part of the path
        .{
            .input = "./.config/ghostty: Needs upstream (main)",
            .expect = "./.config/ghostty",
        },
        .{
            .input = "./Downloads: Operation not permitted",
            .expect = "./Downloads",
        },
    };

    for (cases) |case| {
        //std.debug.print("input: {s}\n", .{case.input});
        //std.debug.print("match: {s}\n", .{case.expect});
        var reg = try re.search(case.input, .{});
        //std.debug.print("count: {d}\n", .{@as(usize, reg.count())});
        //std.debug.print("starts: {d}\n", .{reg.starts()});
        //std.debug.print("ends: {d}\n", .{reg.ends()});
        defer reg.deinit();
        try testing.expectEqual(@as(usize, case.num_matches), reg.count());
        const match = case.input[@intCast(reg.starts()[0])..@intCast(reg.ends()[0])];
        try testing.expectEqualStrings(case.expect, match);
    }

    const no_match_cases = [_][]const u8{
        // bare relative paths without any dot should not match as file paths
        "input/output",
        "foo/bar",
        // $-numeric character should not match
        "$10/bar",
        "$10/$20",
        "$10/bar.txt",
        // comma should not let dot detection look past it
        "foo/bar,baz.txt",
        // $VAR should not match mid-word
        "foo$BAR/baz.txt",
        // ~ should not match mid-word
        "foo~/bar.txt",
        // double-slash comments are not paths
        "// foo bar",
        "//foo",
    };
    for (no_match_cases) |input| {
        var result = re.search(input, .{});
        if (result) |*reg| {
            reg.deinit();
            return error.TestUnexpectedResult;
        } else |_| {}
    }
}