is_symbol uses generated Unicode tables

Tables for the Sc, Sk, Sm, and So general categories are now generated,
and is_symbol now searches these tables.
This commit is contained in:
StudebakerGuy
2026-03-08 16:20:35 -04:00
committed by Jeroen van Rijn
parent 8f5f1ee1cc
commit 65d0e5c82f
3 changed files with 312 additions and 4 deletions

View File

@@ -1442,6 +1442,307 @@ ps_ranges := Range{
single_16 = ps_singles16[:],
}
// Generated table: individual code points of category Sc (Currency_Symbol)
// whose values fit in 16 bits, listed in ascending order.
// NOTE(review): ascending order is presumably required by in_range's search — confirm.
@(rodata)
sc_singles16 := [?]u16{
0x0024, 0x058F, 0x060B, 0x09FB, 0x0AF1, 0x0BF9, 0x0E3F, 0x17DB, 0xA838, 0xFDFC,
0xFE69, 0xFF04,
}
// Generated table: runs of Sc (Currency_Symbol) code points that fit in 16 bits.
// Entries are laid out as consecutive (first, last) pairs, one pair per line.
// NOTE(review): both ends are assumed to be inclusive — confirm against in_range.
@(rodata)
sc_ranges16 := [?]u16{
0x00A2, 0x00A5,
0x07FE, 0x07FF,
0x09F2, 0x09F3,
0x20A0, 0x20C1,
0xFFE0, 0xFFE1,
0xFFE5, 0xFFE6,
}
// Generated table: individual Sc (Currency_Symbol) code points above 0xFFFF,
// stored as i32 and listed in ascending order.
@(rodata)
sc_singles32 := [?]i32{
0x1E2FF, 0x1ECB0,
}
// Generated table: runs of Sc (Currency_Symbol) code points above 0xFFFF,
// stored as i32 in consecutive (first, last) pairs.
// NOTE(review): both ends are assumed to be inclusive — confirm against in_range.
@(rodata)
sc_ranges32 := [?]i32{
0x11FDD, 0x11FE0,
}
// Bundles the four Sc (Currency_Symbol) tables into one Range value;
// is_symbol passes this to in_range to test a rune against the category.
sc_ranges := Range{
single_16 = sc_singles16[:],
ranges_16 = sc_ranges16[:],
single_32 = sc_singles32[:],
ranges_32 = sc_ranges32[:],
}
// Generated table: individual code points of category Sk (Modifier_Symbol)
// whose values fit in 16 bits, listed in ascending order.
// NOTE(review): ascending order is presumably required by in_range's search — confirm.
@(rodata)
sk_singles16 := [?]u16{
0x005E, 0x0060, 0x00A8, 0x00AF, 0x00B4, 0x00B8, 0x02ED, 0x0375, 0x0888, 0x1FBD,
0xAB5B, 0xFF3E, 0xFF40, 0xFFE3,
}
// Generated table: runs of Sk (Modifier_Symbol) code points that fit in 16 bits.
// Entries are laid out as consecutive (first, last) pairs, one pair per line.
// NOTE(review): both ends are assumed to be inclusive — confirm against in_range.
@(rodata)
sk_ranges16 := [?]u16{
0x02C2, 0x02C5,
0x02D2, 0x02DF,
0x02E5, 0x02EB,
0x02EF, 0x02FF,
0x0384, 0x0385,
0x1FBF, 0x1FC1,
0x1FCD, 0x1FCF,
0x1FDD, 0x1FDF,
0x1FED, 0x1FEF,
0x1FFD, 0x1FFE,
0x309B, 0x309C,
0xA700, 0xA716,
0xA720, 0xA721,
0xA789, 0xA78A,
0xAB6A, 0xAB6B,
0xFBB2, 0xFBC2,
}
// Generated table: runs of Sk (Modifier_Symbol) code points above 0xFFFF,
// stored as i32 in consecutive (first, last) pairs.
// NOTE(review): both ends are assumed to be inclusive — confirm against in_range.
@(rodata)
sk_ranges32 := [?]i32{
0x1F3FB, 0x1F3FF,
}
// Bundles the Sk (Modifier_Symbol) tables into one Range value;
// is_symbol passes this to in_range to test a rune against the category.
// single_32 is not set: no 32-bit singles table is defined for Sk alongside these.
sk_ranges := Range{
single_16 = sk_singles16[:],
ranges_16 = sk_ranges16[:],
ranges_32 = sk_ranges32[:],
}
// Generated table: individual code points of category Sm (Math_Symbol)
// whose values fit in 16 bits, listed in ascending order.
// NOTE(review): ascending order is presumably required by in_range's search — confirm.
@(rodata)
sm_singles16 := [?]u16{
0x002B, 0x007C, 0x007E, 0x00AC, 0x00B1, 0x00D7, 0x00F7, 0x03F6, 0x2044, 0x2052,
0x2118, 0x214B, 0x21A0, 0x21A3, 0x21A6, 0x21AE, 0x21D2, 0x21D4, 0x237C, 0x25B7,
0x25C1, 0x266F, 0xFB29, 0xFE62, 0xFF0B, 0xFF5C, 0xFF5E, 0xFFE2,
}
// Generated table: runs of Sm (Math_Symbol) code points that fit in 16 bits.
// Entries are laid out as consecutive (first, last) pairs, one pair per line.
// NOTE(review): both ends are assumed to be inclusive — confirm against in_range.
@(rodata)
sm_ranges16 := [?]u16{
0x003C, 0x003E,
0x0606, 0x0608,
0x207A, 0x207C,
0x208A, 0x208C,
0x2140, 0x2144,
0x2190, 0x2194,
0x219A, 0x219B,
0x21CE, 0x21CF,
0x21F4, 0x22FF,
0x2320, 0x2321,
0x239B, 0x23B3,
0x23DC, 0x23E1,
0x25F8, 0x25FF,
0x27C0, 0x27C4,
0x27C7, 0x27E5,
0x27F0, 0x27FF,
0x2900, 0x2982,
0x2999, 0x29D7,
0x29DC, 0x29FB,
0x29FE, 0x2AFF,
0x2B30, 0x2B44,
0x2B47, 0x2B4C,
0xFE64, 0xFE66,
0xFF1C, 0xFF1E,
0xFFE9, 0xFFEC,
}
// Generated table: individual Sm (Math_Symbol) code points above 0xFFFF,
// stored as i32 and listed in ascending order.
@(rodata)
sm_singles32 := [?]i32{
0x1CEF0, 0x1D6C1, 0x1D6DB, 0x1D6FB, 0x1D715, 0x1D735, 0x1D74F, 0x1D76F,
0x1D789, 0x1D7A9, 0x1D7C3,
}
// Generated table: runs of Sm (Math_Symbol) code points above 0xFFFF,
// stored as i32 in consecutive (first, last) pairs.
// NOTE(review): both ends are assumed to be inclusive — confirm against in_range.
@(rodata)
sm_ranges32 := [?]i32{
0x10D8E, 0x10D8F,
0x1EEF0, 0x1EEF1,
0x1F8D0, 0x1F8D8,
}
// Bundles the four Sm (Math_Symbol) tables into one Range value;
// is_symbol passes this to in_range to test a rune against the category.
sm_ranges := Range{
single_16 = sm_singles16[:],
ranges_16 = sm_ranges16[:],
single_32 = sm_singles32[:],
ranges_32 = sm_ranges32[:],
}
// Generated table: individual code points of category So (Other_Symbol)
// whose values fit in 16 bits, listed in ascending order.
// NOTE(review): ascending order is presumably required by in_range's search — confirm.
@(rodata)
so_singles16 := [?]u16{
0x00A6, 0x00A9, 0x00AE, 0x00B0, 0x0482, 0x06DE, 0x06E9, 0x07F6, 0x09FA, 0x0B70,
0x0BFA, 0x0C7F, 0x0D4F, 0x0D79, 0x0F13, 0x0F34, 0x0F36, 0x0F38, 0x166D, 0x1940,
0x2114, 0x2125, 0x2127, 0x2129, 0x212E, 0x214A, 0x214F, 0x21D3, 0x3004, 0x3020,
0x31EF, 0x3250, 0xA839, 0xFFE4, 0xFFE8,
}
// Generated table: runs of So (Other_Symbol) code points that fit in 16 bits.
// Entries are laid out as consecutive (first, last) pairs, one pair per line.
// NOTE(review): both ends are assumed to be inclusive — confirm against in_range.
@(rodata)
so_ranges16 := [?]u16{
0x058D, 0x058E,
0x060E, 0x060F,
0x06FD, 0x06FE,
0x0BF3, 0x0BF8,
0x0F01, 0x0F03,
0x0F15, 0x0F17,
0x0F1A, 0x0F1F,
0x0FBE, 0x0FC5,
0x0FC7, 0x0FCC,
0x0FCE, 0x0FCF,
0x0FD5, 0x0FD8,
0x109E, 0x109F,
0x1390, 0x1399,
0x19DE, 0x19FF,
0x1B61, 0x1B6A,
0x1B74, 0x1B7C,
0x2100, 0x2101,
0x2103, 0x2106,
0x2108, 0x2109,
0x2116, 0x2117,
0x211E, 0x2123,
0x213A, 0x213B,
0x214C, 0x214D,
0x218A, 0x218B,
0x2195, 0x2199,
0x219C, 0x219F,
0x21A1, 0x21A2,
0x21A4, 0x21A5,
0x21A7, 0x21AD,
0x21AF, 0x21CD,
0x21D0, 0x21D1,
0x21D5, 0x21F3,
0x2300, 0x2307,
0x230C, 0x231F,
0x2322, 0x2328,
0x232B, 0x237B,
0x237D, 0x239A,
0x23B4, 0x23DB,
0x23E2, 0x2429,
0x2440, 0x244A,
0x249C, 0x24E9,
0x2500, 0x25B6,
0x25B8, 0x25C0,
0x25C2, 0x25F7,
0x2600, 0x266E,
0x2670, 0x2767,
0x2794, 0x27BF,
0x2800, 0x28FF,
0x2B00, 0x2B2F,
0x2B45, 0x2B46,
0x2B4D, 0x2B73,
0x2B76, 0x2BFF,
0x2CE5, 0x2CEA,
0x2E50, 0x2E51,
0x2E80, 0x2E99,
0x2E9B, 0x2EF3,
0x2F00, 0x2FD5,
0x2FF0, 0x2FFF,
0x3012, 0x3013,
0x3036, 0x3037,
0x303E, 0x303F,
0x3190, 0x3191,
0x3196, 0x319F,
0x31C0, 0x31E5,
0x3200, 0x321E,
0x322A, 0x3247,
0x3260, 0x327F,
0x328A, 0x32B0,
0x32C0, 0x33FF,
0x4DC0, 0x4DFF,
0xA490, 0xA4C6,
0xA828, 0xA82B,
0xA836, 0xA837,
0xAA77, 0xAA79,
0xFBC3, 0xFBD2,
0xFD40, 0xFD4F,
0xFD90, 0xFD91,
0xFDC8, 0xFDCF,
0xFDFD, 0xFDFF,
0xFFED, 0xFFEE,
0xFFFC, 0xFFFD,
}
// Generated table: individual So (Other_Symbol) code points above 0xFFFF,
// stored as i32 and listed in ascending order.
@(rodata)
so_singles32 := [?]i32{
0x101A0, 0x10AC8, 0x1173F, 0x16B45, 0x1BC9C, 0x1D245, 0x1E14F, 0x1ECAC,
0x1ED2E, 0x1F7F0, 0x1FAC8, 0x1FBFA,
}
// Generated table: runs of So (Other_Symbol) code points above 0xFFFF,
// stored as i32 in consecutive (first, last) pairs, one pair per line.
// NOTE(review): both ends are assumed to be inclusive — confirm against in_range.
@(rodata)
so_ranges32 := [?]i32{
0x10137, 0x1013F,
0x10179, 0x10189,
0x1018C, 0x1018E,
0x10190, 0x1019C,
0x101D0, 0x101FC,
0x10877, 0x10878,
0x10ED1, 0x10ED8,
0x11FD5, 0x11FDC,
0x11FE1, 0x11FF1,
0x16B3C, 0x16B3F,
0x1CC00, 0x1CCEF,
0x1CCFA, 0x1CCFC,
0x1CD00, 0x1CEB3,
0x1CEBA, 0x1CED0,
0x1CEE0, 0x1CEEF,
0x1CF50, 0x1CFC3,
0x1D000, 0x1D0F5,
0x1D100, 0x1D126,
0x1D129, 0x1D164,
0x1D16A, 0x1D16C,
0x1D183, 0x1D184,
0x1D18C, 0x1D1A9,
0x1D1AE, 0x1D1EA,
0x1D200, 0x1D241,
0x1D300, 0x1D356,
0x1D800, 0x1D9FF,
0x1DA37, 0x1DA3A,
0x1DA6D, 0x1DA74,
0x1DA76, 0x1DA83,
0x1DA85, 0x1DA86,
0x1F000, 0x1F02B,
0x1F030, 0x1F093,
0x1F0A0, 0x1F0AE,
0x1F0B1, 0x1F0BF,
0x1F0C1, 0x1F0CF,
0x1F0D1, 0x1F0F5,
0x1F10D, 0x1F1AD,
0x1F1E6, 0x1F202,
0x1F210, 0x1F23B,
0x1F240, 0x1F248,
0x1F250, 0x1F251,
0x1F260, 0x1F265,
0x1F300, 0x1F3FA,
0x1F400, 0x1F6D8,
0x1F6DC, 0x1F6EC,
0x1F6F0, 0x1F6FC,
0x1F700, 0x1F7D9,
0x1F7E0, 0x1F7EB,
0x1F800, 0x1F80B,
0x1F810, 0x1F847,
0x1F850, 0x1F859,
0x1F860, 0x1F887,
0x1F890, 0x1F8AD,
0x1F8B0, 0x1F8BB,
0x1F8C0, 0x1F8C1,
0x1F900, 0x1FA57,
0x1FA60, 0x1FA6D,
0x1FA70, 0x1FA7C,
0x1FA80, 0x1FA8A,
0x1FA8E, 0x1FAC6,
0x1FACD, 0x1FADC,
0x1FADF, 0x1FAEA,
0x1FAEF, 0x1FAF8,
0x1FB00, 0x1FB92,
0x1FB94, 0x1FBEF,
}
// Bundles the four So (Other_Symbol) tables into one Range value;
// is_symbol passes this to in_range to test a rune against the category.
so_ranges := Range{
single_16 = so_singles16[:],
ranges_16 = so_ranges16[:],
single_32 = so_singles32[:],
ranges_32 = so_ranges32[:],
}
@(rodata)
extra_digits_singles16 := [?]u16{
0x00B9, 0x19DA, 0x2070, 0x24EA, 0x24FF,

View File

@@ -259,6 +259,13 @@ is_symbol :: proc(r: rune) -> bool #no_bounds_check {
	// Runes above Latin-1 are classified by searching the generated symbol
	// tables in the order Sc, Sm, So, Sk; evaluation stops at the first hit.
	if u32(r) > MAX_LATIN1 {
		return in_range(r, sc_ranges) || in_range(r, sm_ranges) || in_range(r, so_ranges) || in_range(r, sk_ranges)
	}
	// Latin-1 runes are answered from the char_properties table by testing the pS bit.
	// NOTE(review): pS is presumably the "symbol" property flag — confirm at its definition.
	return char_properties[u8(r)]&pS != 0
}

View File

@@ -291,10 +291,10 @@ main :: proc() {
// .Pi, // Initial_Punctuation, an initial quotation mark
// .Po, // Other_Punctuation, a punctuation mark of other type
// .Ps, // Open_Punctuation, an opening punctuation mark (of a pair)
.Sc, // Currency_Symbol, a currency sign
.Sk, // Modifier_Symbol, a non-letterlike modifier symbol
.Sm, // Math_Symbol, a symbol of mathematical use
.So, // Other_Symbol, a symbol of other type
// .Sc, // Currency_Symbol, a currency sign
// .Sk, // Modifier_Symbol, a non-letterlike modifier symbol
// .Sm, // Math_Symbol, a symbol of mathematical use
// .So, // Other_Symbol, a symbol of other type
.Zl, // Line_Separator, U+2028 LINE SEPARATOR only
.Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
.Zs, // Space_Separator, a space character (of various non-zero widths)