From 05d2f881b6cdd2edf0e6d0b1c63dfaa01c6f30fb Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Fri, 31 Oct 2025 08:15:20 -0700 Subject: [PATCH] terminal: emit non-ASCII characters as Unicode codepoints for HTML Fixes #9426 Because we emit partial HTML, we can't set the meta charset tag, so we use numeric codepoint entities like `&#123;` for non-ASCII characters to ensure proper rendering. --- src/terminal/formatter.zig | 111 ++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/src/terminal/formatter.zig b/src/terminal/formatter.zig index baa6b61c1..ddb6d5334 100644 --- a/src/terminal/formatter.zig +++ b/src/terminal/formatter.zig @@ -1263,7 +1263,18 @@ pub const PageFormatter = struct { '&' => try writer.writeAll("&amp;"), '"' => try writer.writeAll("&quot;"), '\'' => try writer.writeAll("&#39;"), - else => try writer.print("{u}", .{codepoint}), + else => { + // For HTML, emit ASCII (< 0x80) directly, but encode + // all non-ASCII as numeric entities to avoid encoding + // detection issues (fixes #9426). We can't set the + // meta tag because we emit partial HTML so this ensures + // proper unicode handling.
+ if (codepoint < 0x80) { + try writer.print("{u}", .{codepoint}); + } else { + try writer.print("&#{d};", .{codepoint}); + } + }, } }, } @@ -5065,6 +5076,104 @@ test "Page html with escaping" { try testing.expectEqual(Coordinate{ .x = 11, .y = 0 }, point_map.items[offset + 30]); } +test "Page html with unicode as numeric entities" { + const testing = std.testing; + const alloc = testing.allocator; + + var builder: std.Io.Writer.Allocating = .init(alloc); + defer builder.deinit(); + + var t = try Terminal.init(alloc, .{ + .cols = 80, + .rows = 24, + }); + defer t.deinit(alloc); + + var s = t.vtStream(); + defer s.deinit(); + + // Box drawing characters that caused issue #9426 + try s.nextSlice("╰─ ❯"); + + const pages = &t.screen.pages; + const page = &pages.pages.last.?.data; + var formatter: PageFormatter = .init(page, .{ .emit = .html }); + + try formatter.format(&builder.writer); + const output = builder.writer.buffered(); + + // Expected: box drawing chars as numeric entities + // ╰ = U+2570 = 9584, ─ = U+2500 = 9472, ❯ = U+276F = 10095 + try testing.expectEqualStrings( + "<div style=\"font-family: monospace; white-space: pre;\">&#9584;&#9472; &#10095;</div>", + output, + ); +} + +test "Page html ascii characters unchanged" { + const testing = std.testing; + const alloc = testing.allocator; + + var builder: std.Io.Writer.Allocating = .init(alloc); + defer builder.deinit(); + + var t = try Terminal.init(alloc, .{ + .cols = 80, + .rows = 24, + }); + defer t.deinit(alloc); + + var s = t.vtStream(); + defer s.deinit(); + + try s.nextSlice("hello world"); + + const pages = &t.screen.pages; + const page = &pages.pages.last.?.data; + var formatter: PageFormatter = .init(page, .{ .emit = .html }); + + try formatter.format(&builder.writer); + const output = builder.writer.buffered(); + + // ASCII should be emitted directly + try testing.expectEqualStrings( + "<div style=\"font-family: monospace; white-space: pre;\">hello world</div>", + output, + ); +} + +test "Page html mixed ascii and unicode" { + const testing = std.testing; + const alloc = testing.allocator; + + var builder: std.Io.Writer.Allocating = .init(alloc); + defer builder.deinit(); + + var t = try Terminal.init(alloc, .{ + .cols = 80, + .rows = 24, + }); + defer t.deinit(alloc); + + var s = t.vtStream(); + defer s.deinit(); + + try s.nextSlice("test ╰─❯ ok"); + + const pages = &t.screen.pages; + const page = &pages.pages.last.?.data; + var formatter: PageFormatter = .init(page, .{ .emit = .html }); + + try formatter.format(&builder.writer); + const output = builder.writer.buffered(); + + // Mix of ASCII and Unicode entities + try testing.expectEqualStrings( + "<div style=\"font-family: monospace; white-space: pre;\">test &#9584;&#9472;&#10095; ok</div>", + output, + ); +} + test "Page VT with palette option emits RGB" { const testing = std.testing; const alloc = testing.allocator;