Merge branch 'master' into xmlcomment

This commit is contained in:
Jeroen van Rijn
2026-01-22 11:47:23 +01:00
committed by GitHub
127 changed files with 15539 additions and 3176 deletions

View File

@@ -24,6 +24,18 @@ ENC_TABLE := [64]byte {
'4', '5', '6', '7', '8', '9', '+', '/',
}
// Encoding alphabet for the Base64url variant.
// Maps a 6-bit value (0..=63) to its output character; the last two
// entries are '-' and '_' where `ENC_TABLE` ends in '+' and '/'.
ENC_URL_TABLE := [64]byte {
	// 0..=25: upper-case letters
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
	'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
	// 26..=51: lower-case letters
	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
	'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
	// 52..=61: digits
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
	// 62, 63: URL-safe symbols
	'-', '_',
}
PADDING :: '='
DEC_TABLE := [256]u8 {
@@ -61,6 +73,43 @@ DEC_TABLE := [256]u8 {
0, 0, 0, 0, 0, 0, 0, 0,
}
// Decoding table for Base64url variant
//
// Indexed by the input byte (0..=255); each entry is the 6-bit value of that
// character in the Base64url alphabet. Rows below hold 8 entries each.
// Every byte outside the alphabet is stored as 0, which is the same value
// as 'A', so this table by itself cannot tell invalid input apart from 'A'.
DEC_URL_TABLE := [256]u8 {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 62, 0, 0, // 40..=47: '-' (45) -> 62
52, 53, 54, 55, 56, 57, 58, 59, // 48..=55: '0'..'7'
60, 61, 0, 0, 0, 0, 0, 0, // 56..=63: '8', '9'
0, 0, 1, 2, 3, 4, 5, 6, // 64..=71: 'A' (65) -> 0, 'B'..'G'
7, 8, 9, 10, 11, 12, 13, 14, // 72..=79: 'H'..'O'
15, 16, 17, 18, 19, 20, 21, 22, // 80..=87: 'P'..'W'
23, 24, 25, 0, 0, 0, 0, 63, // 88..=95: 'X'..'Z', '_' (95) -> 63
0, 26, 27, 28, 29, 30, 31, 32, // 96..=103: 'a'..'g'
33, 34, 35, 36, 37, 38, 39, 40, // 104..=111: 'h'..'o'
41, 42, 43, 44, 45, 46, 47, 48, // 112..=119: 'p'..'w'
49, 50, 51, 0, 0, 0, 0, 0, // 120..=127: 'x'..'z'
// 128..=255: no alphabet characters
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
encode :: proc(data: []byte, ENC_TBL := ENC_TABLE, allocator := context.allocator) -> (encoded: string, err: mem.Allocator_Error) #optional_allocator_error {
out_length := encoded_len(data)
if out_length == 0 {

View File

@@ -21,6 +21,7 @@ package encoding_unicode_entity
Jeroen van Rijn: Initial implementation.
*/
import "base:runtime"
import "core:unicode/utf8"
import "core:unicode"
import "core:strings"
@@ -141,8 +142,10 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
write_string(&builder, entity)
} else {
if .No_Entity_Decode not_in options {
if decoded, ok := xml_decode_entity(entity); ok {
write_rune(&builder, decoded)
if decoded, count, ok := xml_decode_entity(entity); ok {
for i in 0..<count {
write_rune(&builder, decoded[i])
}
continue
}
}
@@ -212,17 +215,16 @@ advance :: proc(t: ^Tokenizer) -> (err: Error) {
}
}
xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
xml_decode_entity :: proc(entity: string) -> (decoded: [2]rune, rune_count: int, ok: bool) {
entity := entity
if len(entity) == 0 { return -1, false }
if len(entity) == 0 { return }
switch entity[0] {
case '#':
if entity[0] == '#' {
base := 10
val := 0
entity = entity[1:]
if len(entity) == 0 { return -1, false }
if len(entity) == 0 { return }
if entity[0] == 'x' || entity[0] == 'X' {
base = 16
@@ -237,30 +239,275 @@ xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
val += int(r - '0')
case 'a'..='f':
if base == 10 { return -1, false }
if base == 10 { return }
val *= base
val += int(r - 'a' + 10)
case 'A'..='F':
if base == 10 { return -1, false }
if base == 10 { return }
val *= base
val += int(r - 'A' + 10)
case:
return -1, false
return
}
if val > MAX_RUNE_CODEPOINT { return -1, false }
if val > MAX_RUNE_CODEPOINT { return }
entity = entity[1:]
}
return rune(val), true
case:
// Named entity.
return named_xml_entity_to_rune(entity)
return rune(val), 1, true
}
// Named entity.
return named_xml_entity_to_rune(entity)
}
// escape_html replaces the five HTML-significant bytes in `s` with entities:
//
//	&  ->  &amp;
//	'  ->  &#39;   (shorter than &apos;, which did not exist before HTML 5)
//	<  ->  &lt;
//	>  ->  &gt;
//	"  ->  &#34;   (shorter than &quot;)
//
// When `s` contains none of these bytes it is returned as-is and
// `was_allocation` is false. Otherwise a new buffer is made with `allocator`
// and `was_allocation` is true. If that allocation fails, the result is an
// empty string with `was_allocation` false.
@(require_results)
escape_html :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> (output: string, was_allocation: bool) {
	// Entity for a byte that needs escaping, or "" for every other byte.
	replacement_for :: proc(ch: byte) -> string {
		switch ch {
		case '&':  return "&amp;"
		case '\'': return "&#39;"
		case '<':  return "&lt;"
		case '>':  return "&gt;"
		case '"':  return "&#34;"
		}
		return ""
	}

	src := transmute([]byte)s

	// First pass: how many bytes the output grows by. Each replacement
	// stands in for exactly one input byte.
	growth := 0
	for ch in src {
		if rep := replacement_for(ch); len(rep) > 0 {
			growth += len(rep) - 1
		}
	}
	if growth == 0 {
		return s, false
	}

	dst, alloc_err := make([]byte, len(src) + growth, allocator, loc)
	if alloc_err != nil {
		return
	}
	was_allocation = true

	// Second pass: copy bytes across, expanding the special ones.
	pos := 0
	for ch in src {
		rep := replacement_for(ch)
		if len(rep) == 0 {
			dst[pos] = ch
			pos += 1
			continue
		}
		pos += copy(dst[pos:], rep)
	}

	output = string(dst[:pos])
	return
}
// unescape_html decodes the entities in `s` (e.g. `&lt;`, `&#39;`, `&#x27;`)
// using `unescape_entity`.
//
// When `s` contains no '&' it is returned as-is with `was_allocation` false.
// Otherwise the result is built in a buffer made with `allocator` and
// `was_allocation` is true. An '&' that does not start a decodable entity is
// copied through unchanged. `err` reports a failed allocation.
@(require_results)
unescape_html :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> (output: string, was_allocation: bool, err: runtime.Allocator_Error) {
	// Walks `s` starting from the first '&' at `amp_idx`. With `buf == nil`
	// it only counts the bytes the output needs; with a buffer it also
	// appends them. Both modes take exactly the same path, so the count
	// from the first pass matches what the second pass writes.
	@(require_results)
	do_append :: proc(s: string, amp_idx: int, buf: ^[dynamic]byte) -> (n: int) {
		s, amp_idx := s, amp_idx

		// Plain text before the first '&'.
		n += amp_idx
		if buf != nil { append(buf, s[:amp_idx]) }
		s = s[amp_idx:]

		// Invariant: `s` starts with '&' at the top of every iteration.
		for len(s) > 0 {
			b, w, j := unescape_entity(s)
			if w > 0 && j > 0 {
				n += w
				if buf != nil { append(buf, ..b[:w]) }
				s = s[j:]
			} else {
				// Nothing was decoded. Keep the '&' as ordinary text and step
				// over just that byte; this guarantees forward progress (a
				// trailing lone '&' would otherwise loop forever) and keeps
				// the characters that follow it in the output.
				n += 1
				if buf != nil { append(buf, s[:1]) }
				s = s[1:]
			}

			amp_idx = strings.index_byte(s, '&')
			if amp_idx < 0 {
				// No more entities: the remainder is plain text.
				n += len(s)
				if buf != nil { append(buf, s) }
				break
			}
			n += amp_idx
			if buf != nil { append(buf, s[:amp_idx]) }
			s = s[amp_idx:]
		}

		return
	}

	amp_idx := strings.index_byte(s, '&')
	if amp_idx < 0 {
		return s, false, nil
	}

	// NOTE(bill): this does a two pass in order to minimize the allocations required
	bytes_required := do_append(s, amp_idx, nil)

	buf := make([dynamic]byte, 0, bytes_required, allocator, loc) or_return
	was_allocation = true

	_ = do_append(s, amp_idx, &buf)

	assert(len(buf) == cap(buf))
	output = string(buf[:])
	return
}
// unescape_entity decodes the single XML/HTML entity at the start of `s`.
//
// Returns:
// - b: the UTF-8 bytes to emit; only `b[:w]` is meaningful
// - w: the number of bytes written to `b`
// - j: the number of bytes of `s` that were consumed
//
// If `s` is empty or does not start with '&', nothing is consumed and
// `w == j == 0`. If `s` starts with '&' but no entity can be decoded, the '&'
// itself is returned (`w == j == 1`) so the caller always advances and the
// text following the '&' is left for it to copy verbatim.
@(require_results)
unescape_entity :: proc(s: string) -> (b: [8]byte, w: int, j: int) {
	// UTF-8 encodes one or two runes; the second is skipped when it is 0.
	encode_runes :: proc(runes: [2]rune) -> (out: [8]byte, n: int) {
		first, first_len := utf8.encode_rune(runes[0])
		n += copy(out[n:], first[:first_len])
		if runes[1] != 0 {
			second, second_len := utf8.encode_rune(runes[1])
			n += copy(out[n:], second[:second_len])
		}
		return
	}

	if len(s) == 0 || s[0] != '&' {
		return
	}

	// Fallback for every early `return` below: a literal '&', one byte consumed.
	b[0] = '&'
	w = 1
	j = 1

	if len(s) < 2 {
		return
	}

	i := 1 // scan position in `s`; only copied into `j` on success

	if s[i] == '#' { // scan numbers
		if len(s) <= 3 { // inputs of three bytes or fewer (`&#` plus at most one more) are never decoded
			return
		}
		i += 1

		base := rune(10)
		if s[i] == 'x' || s[i] == 'X' {
			base = 16
			i += 1
		}

		// Any value past the last code point decodes the same way, so the
		// accumulator is clamped here. Without this a long digit string
		// wraps the rune and can alias a valid code point.
		OUT_OF_RANGE :: rune(0x10ffff + 1)

		x := rune(0)
		scan_number: for i < len(s) {
			c := s[i]
			i += 1

			digit := rune(-1)
			switch c {
			case '0'..='9':
				digit = rune(c - '0')
			case 'a'..='f':
				if base == 16 { digit = rune(c - 'a') + 10 }
			case 'A'..='F':
				if base == 16 { digit = rune(c - 'A') + 10 }
			}
			if digit >= 0 {
				x = min(base*x + digit, OUT_OF_RANGE)
				continue scan_number
			}

			// Keep the ';' to check for cases which require it and cases which might not
			if c != ';' {
				i -= 1
			}
			break scan_number
		}

		if i <= 3 { // no replacement characters found
			return
		}

		@(static, rodata)
		windows_1252_replacement_table := [0xa0 - 0x80]rune{ // Windows-1252 -> UTF-8
			'\u20ac', '\u0081', '\u201a', '\u0192',
			'\u201e', '\u2026', '\u2020', '\u2021',
			'\u02c6', '\u2030', '\u0160', '\u2039',
			'\u0152', '\u008d', '\u017d', '\u008f',
			'\u0090', '\u2018', '\u2019', '\u201c',
			'\u201d', '\u2022', '\u2013', '\u2014',
			'\u02dc', '\u2122', '\u0161', '\u203a',
			'\u0153', '\u009d', '\u017e', '\u0178',
		}

		switch x {
		case 0x80..<0xa0:
			x = windows_1252_replacement_table[x-0x80]
		case 0, 0xd800..=0xdfff:
			x = utf8.RUNE_ERROR
		case:
			if x > 0x10ffff {
				x = utf8.RUNE_ERROR
			}
		}

		encoded, encoded_len := utf8.encode_rune(x)
		w = copy(b[:], encoded[:encoded_len])
		j = i
		return
	}

	// Lookup by entity names
	scan_ident: for i < len(s) { // scan over letters and digits
		c := s[i]
		i += 1
		switch c {
		case 'a'..='z', 'A'..='Z', '0'..='9':
			continue scan_ident
		}
		// Keep the ';' to check for cases which require it and cases which might not
		if c != ';' {
			i -= 1
		}
		break scan_ident
	}

	entity_name := s[1:i]
	if len(entity_name) > 0 && entity_name[len(entity_name)-1] == ';' {
		entity_name = entity_name[:len(entity_name)-1]
	}
	if len(entity_name) == 0 { // `&` followed by a non-name byte, or `&;`
		return
	}

	if r2, _, ok := named_xml_entity_to_rune(entity_name); ok {
		b, w = encode_runes(r2)
		j = i
		return
	}

	// The longest entities that do not end with a semicolon are <=6 bytes long
	LONGEST_ENTITY_WITHOUT_SEMICOLON :: 6

	// Try progressively shorter prefixes of the name.
	n := min(len(entity_name)-1, LONGEST_ENTITY_WITHOUT_SEMICOLON)
	for k := n; k > 1; k -= 1 {
		if r2, _, ok := named_xml_entity_to_rune(entity_name[:k]); ok {
			b, w = encode_runes(r2)
			// Only '&' plus the k matched bytes are consumed; whatever
			// followed the prefix is ordinary text and must be kept.
			j = k + 1
			return
		}
	}
	return
}
// Private XML helper to extract `&<stuff>;` entity.
@(private="file")
_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {

File diff suppressed because it is too large Load Diff