more clickable file path fixes (#10619)

This pull request addresses some of the remaining issues when matching
`~`, `$VAR`, `.directory/`, and embedded commas. It does not address
issues with embedded line breaks.

The PR is split in multiple commits carefully applying a set of changes
to

1. make the big regex more composable / readable
2. update some doc strings
3. add more test cases for the issues mentioned
4. two simple commits, each fixing the issues

Changes:

- **url: refactor regex into documented branches**
  Break up the big monolithic URL and path regex into named sub-pattern
  constants and compose the final expression from three commented
  branches:
  
  - URLs with a scheme
  - absolute or dot-relative paths
  - bare relative paths
  
  This commit only breaks up the regex. It keeps the existing matching
  behavior unchanged.
  

- **url: update top-level comment**
  

- **url: carefully extend test cases**
  Extend existing test cases with `~`, `$VAR`, and bare .-prefixed paths
  and embedded `,` comma handling.
  
  See following issue comments:
  
-
https://github.com/ghostty-org/ghostty/pull/10570#issuecomment-3853842036
-
https://github.com/ghostty-org/ghostty/issues/1972#issuecomment-3859329233
-
https://github.com/ghostty-org/ghostty/issues/1972#issuecomment-3857881196
  

- **url: remove `,` from path_chars**
  Related to #1972
  
  Fixes an issue when paths have embedded comma, e.g.:
  
      shared/src/foo/SomeItem.m:12, shared/src/
  
  with path_chars greedily consuming the rest of the string.
  
  Now file path matching stops at comma. Scheme URLs are unchanged and
  still using the comma.
  

- **url: fix matching `~`, `$VAR`, `.directory/`**
  Related to #1972
  
  This commit adds three new alternatives for
  `rooted_or_relative_path_prefix`:
  
  - `~/`
  - `$VAR` and
  - `.local/`, `.config/` etc. for dot-prefixed directory names
  
Remaining commits fix edge cases one by one:

  - **url: fix mid-string dot partial matches**
    `"foo.local/share"` (was partial match) → now matches fully
    
  - **url: fix $-numeric character matches**
    `"$10/$20"` → no match
    
  - **url: fix partial match of mid string $-variable**
    `"foo/$BAR/baz"` (was partial match) → matches fully now
    
  - **url: fix incomplete $-numeric behavior**
    `"$10/bar.txt"` (was partial match) → but should not match at all
This commit is contained in:
Mitchell Hashimoto
2026-02-15 15:15:58 -08:00
committed by GitHub

View File

@@ -1,15 +1,17 @@
const std = @import("std");
const oni = @import("oniguruma");
/// Default URL regex. This is used to detect URLs in terminal output.
/// Default URL/path regex. This is used to detect URLs and file paths in
/// terminal output.
///
/// This is here in the config package because one day the matchers will be
/// configurable and this will be a default.
///
/// This regex is liberal in what it accepts after the scheme, with exceptions
/// for URLs ending with . or ). Although such URLs are perfectly valid, it is
/// common for text to contain URLs surrounded by parentheses (such as in
/// Markdown links) or at the end of sentences. Therefore, this regex excludes
/// them as follows:
/// For scheme URLs, this regex is liberal in what it accepts after the scheme,
/// with exceptions for URLs ending with . or ). Although such URLs are
/// perfectly valid, it is common for text to contain URLs surrounded by
/// parentheses (such as in Markdown links) or at the end of sentences.
/// Therefore, this regex excludes them as follows:
///
/// 1. Do not match regexes ending with .
/// 2. Do not match regexes ending with ), except for ones which contain a (
@@ -22,12 +24,6 @@ const oni = @import("oniguruma");
///
/// There are many complicated cases where these heuristics break down, but
/// handling them well requires a non-regex approach.
pub const regex =
"(?:" ++ url_schemes ++
\\)(?:
++ ipv6_url_pattern ++
\\|[\w\-.~:/?#@!$&*+,;=%]+(?:[\(\[]\w*[\)\]])?)+(?<![,.])|(?:\.\.\/|\.\/|(?<!\w)\/)(?:(?=[\w\-.~:\/?#@!$&*+,;=%]*\.)[\w\-.~:\/?#@!$&*+,;=%]+(?: [\w\-.~:\/?#@!$&*+,;=%]*[\/.])*(?: +(?= *$))?|(?![\w\-.~:\/?#@!$&*+,;=%]*\.)[\w\-.~:\/?#@!$&*+,;=%]+(?: [\w\-.~:\/?#@!$&*+,;=%]+)*(?: +(?= *$))?)|[\w][\w\-.]*\/(?=[\w\-.~:\/?#@!$&*+,;=%]*\.)[\w\-.~:\/?#@!$&*+,;=%]+(?: [\w\-.~:\/?#@!$&*+,;=%]*[\/.])*(?: +(?= *$))?
;
const url_schemes =
\\https?://|mailto:|ftp://|file:|ssh:|git://|ssh://|tel:|magnet:|ipfs://|ipns://|gemini://|gopher://|news:
;
@@ -36,6 +32,95 @@ const ipv6_url_pattern =
\\(?:\[[:0-9a-fA-F]+(?:[:0-9a-fA-F]*)+\](?::[0-9]+)?)
;
const scheme_url_chars =
\\[\w\-.~:/?#@!$&*+,;=%]
;
const path_chars =
\\[\w\-.~:\/?#@!$&*+;=%]
;
const optional_bracketed_word_suffix =
\\(?:[\(\[]\w*[\)\]])?
;
const no_trailing_punctuation =
\\(?<![,.])
;
const no_trailing_colon =
\\(?<!:)
;
const trailing_spaces_at_eol =
\\(?: +(?= *$))?
;
const dotted_path_lookahead =
\\(?=[\w\-.~:\/?#@!$&*+;=%]*\.)
;
const non_dotted_path_lookahead =
\\(?![\w\-.~:\/?#@!$&*+;=%]*\.)
;
const dotted_path_space_segments =
\\(?:(?<!:) (?!\w+:\/\/)[\w\-.~:\/?#@!$&*+;=%]*[\/.])*
;
const any_path_space_segments =
\\(?:(?<!:) (?!\w+:\/\/)[\w\-.~:\/?#@!$&*+;=%]+)*
;
// Branch 1: URLs with explicit schemes (http, mailto, ftp, etc.).
const scheme_url_branch =
"(?:" ++ url_schemes ++ ")" ++
"(?:" ++ ipv6_url_pattern ++ "|" ++ scheme_url_chars ++ "+" ++ optional_bracketed_word_suffix ++ ")+" ++
no_trailing_punctuation;
const rooted_or_relative_path_prefix =
\\(?:\.\.\/|\.\/|(?<!\w)~\/|(?:[\w][\w\-.]*\/)*(?<!\w)\$[A-Za-z_]\w*\/|\.[\w][\w\-.]*\/|(?<![\w~\/])\/(?!\/))
;
// Branch 2: Absolute paths and dot-relative paths (/, ./, ../).
// A dotted segment is treated as file-like, while the undotted case stays
// broad to capture directory-like paths with spaces.
const rooted_or_relative_path_branch =
rooted_or_relative_path_prefix ++
"(?:" ++
dotted_path_lookahead ++
path_chars ++ "+" ++
dotted_path_space_segments ++
no_trailing_colon ++
trailing_spaces_at_eol ++
"|" ++
non_dotted_path_lookahead ++
path_chars ++ "+" ++
any_path_space_segments ++
no_trailing_colon ++
trailing_spaces_at_eol ++
")";
// Branch 3: Bare relative paths such as src/config/url.zig.
const bare_relative_path_prefix =
\\(?<!\$\d*)(?<!\w)[\w][\w\-.]*\/
;
const bare_relative_path_branch =
dotted_path_lookahead ++
bare_relative_path_prefix ++
path_chars ++ "+" ++
dotted_path_space_segments ++
no_trailing_colon ++
trailing_spaces_at_eol;
pub const regex =
scheme_url_branch ++
"|" ++
rooted_or_relative_path_branch ++
"|" ++
bare_relative_path_branch;
test "url regex" {
const testing = std.testing;
@@ -77,7 +162,7 @@ test "url regex" {
.expect = "https://example.com",
},
.{
.input = "Link trailing colon https://example.com, more text.",
.input = "Link trailing comma https://example.com, more text.",
.expect = "https://example.com",
},
.{
@@ -148,6 +233,10 @@ test "url regex" {
.input = "match git://example.com git links",
.expect = "git://example.com",
},
.{
.input = "/tmp/test.txt http://www.google.com",
.expect = "/tmp/test.txt",
},
.{
.input = "match tel:+18005551234 tel links",
.expect = "tel:+18005551234",
@@ -291,6 +380,89 @@ test "url regex" {
.input = "some-pkg/src/file.txt more text",
.expect = "some-pkg/src/file.txt",
},
// comma should match substrings
.{
.input = "src/foo.c,baz.txt",
.expect = "src/foo.c",
},
.{
.input = "~/foo/bar.txt",
.expect = "~/foo/bar.txt",
},
.{
.input = "open ~/Documents/notes.md please",
.expect = "~/Documents/notes.md",
},
.{
.input = "~/.config/ghostty/config",
.expect = "~/.config/ghostty/config",
},
.{
.input = "directory: ~/src/ghostty-org/ghostty",
.expect = "~/src/ghostty-org/ghostty",
},
.{
.input = "$HOME/src/config/url.zig",
.expect = "$HOME/src/config/url.zig",
},
.{
.input = "project dir: $PWD/src/ghostty/main.zig",
.expect = "$PWD/src/ghostty/main.zig",
},
// $VAR mid-path should match fully, not partially from the $
.{
.input = "foo/$BAR/baz",
.expect = "foo/$BAR/baz",
},
.{
.input = ".foo/bar/$VAR",
.expect = ".foo/bar/$VAR",
},
.{
.input = ".config/ghostty/config",
.expect = ".config/ghostty/config",
},
.{
.input = "loaded from .local/share/ghostty/state.db now",
.expect = ".local/share/ghostty/state.db",
},
.{
.input = "../some/where",
.expect = "../some/where",
},
// comma-separated file paths
.{
.input = " - shared/src/foo/SomeItem.m:12, shared/src/",
.expect = "shared/src/foo/SomeItem.m:12",
},
// mid-string dot should not partially match but fully
.{
.input = "foo.local/share",
.expect = "foo.local/share",
},
// numeric directory should match fully
.{
.input = "2024/report.txt",
.expect = "2024/report.txt",
},
// comma should stop matching in spaced path segments
.{
.input = "./foo bar,baz",
.expect = "./foo bar",
},
.{
.input = "/tmp/foo bar,baz",
.expect = "/tmp/foo bar",
},
// trailing colon should not be part of the path
.{
.input = "./.config/ghostty: Needs upstream (main)",
.expect = "./.config/ghostty",
},
.{
.input = "./Downloads: Operation not permitted",
.expect = "./Downloads",
},
};
for (cases) |case| {
@@ -306,10 +478,23 @@ test "url regex" {
try testing.expectEqualStrings(case.expect, match);
}
// Bare relative paths without any dot should not match as file paths
const no_match_cases = [_][]const u8{
// bare relative paths without any dot should not match as file paths
"input/output",
"foo/bar",
// $-numeric character should not match
"$10/bar",
"$10/$20",
"$10/bar.txt",
// comma should not let dot detection look past it
"foo/bar,baz.txt",
// $VAR should not match mid-word
"foo$BAR/baz.txt",
// ~ should not match mid-word
"foo~/bar.txt",
// double-slash comments are not paths
"// foo bar",
"//foo",
};
for (no_match_cases) |input| {
var result = re.search(input, .{});