Merge branch 'master' into xmlcomment

2026-07-16 20:51:04 +00:00 · 2026-01-22 11:47:23 +01:00
parent fb479b3aae 5c09550d38
commit 3f8a32aeb9
127 changed files with 15539 additions and 3176 deletions
--- a/core/encoding/base64/base64.odin
+++ b/core/encoding/base64/base64.odin
@@ -24,6 +24,18 @@ ENC_TABLE := [64]byte {
    '4', '5', '6', '7', '8', '9', '+', '/',
 }

+// Encoding table for the Base64url variant: 'A'-'Z', 'a'-'z', '0'-'9', then '-' (index 62) and '_' (index 63).
+ENC_URL_TABLE := [64]byte {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
+    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '-', '_',
+}
+
 PADDING :: '='

 DEC_TABLE := [256]u8 {
@@ -61,6 +73,43 @@ DEC_TABLE := [256]u8 {
     0,  0,  0,  0,  0,  0,  0,  0,
 }

+// Decoding table for the Base64url variant, indexed by input byte (8 entries per row): '-' (45) maps to 62 and '_' (95) to 63; bytes outside the alphabet hold 0, the same value as 'A'.
+DEC_URL_TABLE := [256]u8 {
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0, 62,  0,  0,
+    52, 53, 54, 55, 56, 57, 58, 59,
+    60, 61,  0,  0,  0,  0,  0,  0,
+     0,  0,  1,  2,  3,  4,  5,  6,
+     7,  8,  9, 10, 11, 12, 13, 14,
+    15, 16, 17, 18, 19, 20, 21, 22,
+    23, 24, 25,  0,  0,  0,  0, 63,
+     0, 26, 27, 28, 29, 30, 31, 32,
+    33, 34, 35, 36, 37, 38, 39, 40,
+    41, 42, 43, 44, 45, 46, 47, 48,
+    49, 50, 51,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,
+}
+
+
 encode :: proc(data: []byte, ENC_TBL := ENC_TABLE, allocator := context.allocator) -> (encoded: string, err: mem.Allocator_Error) #optional_allocator_error {
 	out_length := encoded_len(data)
 	if out_length == 0 {
--- a/core/encoding/entity/entity.odin
+++ b/core/encoding/entity/entity.odin
@@ -21,6 +21,7 @@ package encoding_unicode_entity
 		Jeroen van Rijn: Initial implementation.
 */

+import "base:runtime"
 import "core:unicode/utf8"
 import "core:unicode"
 import "core:strings"
@@ -141,8 +142,10 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 					write_string(&builder, entity)
 				} else {
 					if .No_Entity_Decode not_in options {
-						if decoded, ok := xml_decode_entity(entity); ok {
-							write_rune(&builder, decoded)
+						if decoded, count, ok := xml_decode_entity(entity); ok {
+							for i in 0..<count {
+								write_rune(&builder, decoded[i])
+							}
 							continue
 						}
 					}
@@ -212,17 +215,16 @@ advance :: proc(t: ^Tokenizer) -> (err: Error) {
 	}
 }

-xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
+xml_decode_entity :: proc(entity: string) -> (decoded: [2]rune, rune_count: int, ok: bool) {
 	entity := entity
-	if len(entity) == 0 { return -1, false }
+	if len(entity) == 0 { return }

-	switch entity[0] {
-	case '#':
+	if entity[0] == '#' {
 		base  := 10
 		val   := 0
 		entity = entity[1:]

-		if len(entity) == 0 { return -1, false }
+		if len(entity) == 0 { return }

 		if entity[0] == 'x' || entity[0] == 'X' {
 			base = 16
@@ -237,30 +239,275 @@ xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
 				val += int(r - '0')

 			case 'a'..='f':
-				if base == 10 { return -1, false }
+				if base == 10 { return }
 				val *= base
 				val += int(r - 'a' + 10)

 			case 'A'..='F':
-				if base == 10 { return -1, false }
+				if base == 10 { return }
 				val *= base
 				val += int(r - 'A' + 10)

 			case:
-				return -1, false
+				return
 			}

-			if val > MAX_RUNE_CODEPOINT { return -1, false }
+			if val > MAX_RUNE_CODEPOINT { return  }
 			entity = entity[1:]
 		}
-		return rune(val), true
-
-	case:
-		// Named entity.
-		return named_xml_entity_to_rune(entity)
+		return rune(val), 1, true
 	}
+	// Named entity.
+	return named_xml_entity_to_rune(entity)
 }

+
+// escape_html returns `s` with the HTML-significant bytes replaced by entity references.
+// Exactly five bytes are rewritten: & ' < > and "
+// A string containing none of them is returned as-is, with `was_allocation` set to false.
+@(require_results)
+escape_html :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> (output: string, was_allocation: bool) {
+	// Replacements (the numeric form is used where it is the shorter one):
+	//   & -> &amp;
+	//   ' -> &#39;  rather than &apos; (NOTE: &apos; was not available until HTML 5)
+	//   < -> &lt;
+	//   > -> &gt;
+	//   " -> &#34;  rather than &quot;
+
+	// Pass 1: count how many bytes the replacements add on top of len(s).
+	growth := 0
+	for i in 0..<len(s) {
+		switch s[i] {
+		case '&', '\'', '"': growth += 4 // 1 byte becomes 5
+		case '<', '>':       growth += 3 // 1 byte becomes 4
+		}
+	}
+
+	// Nothing needs escaping, so hand the input back untouched.
+	if growth == 0 {
+		return s, false
+	}
+
+	dst, alloc_err := make([]byte, len(s) + growth, allocator, loc)
+	if alloc_err != nil {
+		// The named results are returned as they currently are: "" and false.
+		return
+	}
+	was_allocation = true
+
+	// Pass 2: copy the bytes across, expanding the special ones.
+	pos := 0
+	for i in 0..<len(s) {
+		ch := s[i]
+		replacement: string
+		switch ch {
+		case '&':  replacement = "&amp;"
+		case '\'': replacement = "&#39;"
+		case '<':  replacement = "&lt;"
+		case '>':  replacement = "&gt;"
+		case '"':  replacement = "&#34;"
+		}
+
+		if len(replacement) == 0 {
+			dst[pos] = ch
+			pos += 1
+			continue
+		}
+		pos += copy(dst[pos:], replacement)
+	}
+
+	output = string(dst[:pos])
+	return
+}
+
+
+// unescape_html decodes `&name;`, `&#NNN;` and `&#xHHH;` references in `s`. Anything that cannot be decoded (a lone '&', an unknown name) is copied through verbatim.
+// `s` itself is returned, with `was_allocation` set to false, when it contains no '&' at all.
+@(require_results)
+unescape_html :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> (output: string, was_allocation: bool, err: runtime.Allocator_Error) {
+	// Walks `s` starting at its first '&' (`amp_idx`) and returns the size of the unescaped
+	// text. With `buf == nil` it only measures; otherwise it also appends the text to `buf`.
+	@(require_results)
+	do_append :: proc(s: string, amp_idx: int, buf: ^[dynamic]byte) -> (n: int) {
+		s, amp_idx := s, amp_idx
+		for {
+			// Literal run in front of the next '&' (everything that is left when there is none).
+			if amp_idx < 0 { amp_idx = len(s) }
+			n += amp_idx
+			if buf != nil { append(buf, s[:amp_idx]) }
+			s = s[amp_idx:]
+			if len(s) == 0 { break }
+
+			b, w, j := unescape_entity(s)
+			if w == 0 {
+				// Nothing was decoded: keep the scanned bytes as they are and always consume
+				// at least the '&', otherwise `s` never shrinks and the loop cannot end.
+				j = max(j, 1)
+				n += j
+				if buf != nil { append(buf, s[:j]) }
+			} else {
+				n += w
+				if buf != nil { append(buf, ..b[:w]) }
+			}
+			s = s[j:]
+			amp_idx = strings.index_byte(s, '&')
+		}
+		return
+	}
+
+	amp_idx := strings.index_byte(s, '&')
+	if amp_idx < 0 {
+		return s, false, nil
+	}
+
+	// NOTE(bill): this does a two pass in order to minimize the allocations required
+	bytes_required := do_append(s, amp_idx, nil)
+	buf := make([dynamic]byte, 0, bytes_required, allocator, loc) or_return
+	was_allocation = true
+
+	_ = do_append(s, amp_idx, &buf)
+	assert(len(buf) == cap(buf))
+	output = string(buf[:])
+	return
+}
+
+// unescape_entity tries to decode the single XML/HTML character reference at the start of `s`.
+// Returns:
+// - b: UTF-8 encoding of the decoded rune(s); only b[:w] is meaningful
+// - w: number of bytes written to `b`; 0 means that nothing was decoded
+// - j: number of bytes of `s` taken by the reference (w > 0), or scanned before giving up (w == 0; may be 0)
+// Both numeric (`&#NNN`, `&#xHHH`) and named references are handled; a ';' directly after the reference is consumed.
+@(require_results)
+unescape_entity :: proc(s: string) -> (b: [8]byte, w: int, j: int) {
+	if len(s) < 2 || s[0] != '&' {
+		return
+	}
+	j = 1
+
+	if s[j] == '#' { // scan numbers
+		j += 1
+		if len(s) <= 3 { // anything this short is not scanned as a numeric reference
+			return
+		}
+		c := s[j]
+		hex := false
+		if c == 'x' || c == 'X' {
+			hex = true
+			j += 1
+		}
+
+		x := rune(0)
+		scan_number: for j < len(s) {
+			// Saturate just above the Unicode range so that more digits cannot overflow `x`.
+			if x > 0x10ffff { x = 0x110000 }
+			c = s[j]
+			j += 1
+			if hex {
+				switch c {
+				case '0'..='9': x = 16*x + rune(c) - '0';      continue scan_number
+				case 'a'..='f': x = 16*x + rune(c) - 'a' + 10; continue scan_number
+				case 'A'..='F': x = 16*x + rune(c) - 'A' + 10; continue scan_number
+				}
+			} else {
+				switch c {
+				case '0'..='9': x = 10*x + rune(c) - '0'; continue scan_number
+				}
+			}
+
+			// Keep the ';' to check for cases which require it and cases which might not
+			if c != ';' {
+				j -= 1
+			}
+			break scan_number
+		}
+
+		if j <= 3 { // no replacement characters found
+			return
+		}
+
+		@(static, rodata)
+		windows_1252_replacement_table := [0xa0 - 0x80]rune{ // Windows-1252 -> UTF-8
+			'\u20ac', '\u0081', '\u201a', '\u0192',
+			'\u201e', '\u2026', '\u2020', '\u2021',
+			'\u02c6', '\u2030', '\u0160', '\u2039',
+			'\u0152', '\u008d', '\u017d', '\u008f',
+			'\u0090', '\u2018', '\u2019', '\u201c',
+			'\u201d', '\u2022', '\u2013', '\u2014',
+			'\u02dc', '\u2122', '\u0161', '\u203a',
+			'\u0153', '\u009d', '\u017e', '\u0178',
+		}
+
+		switch x {
+		case 0x80..<0xa0:
+			x = windows_1252_replacement_table[x-0x80]
+		case 0, 0xd800..=0xdfff:
+			x = utf8.RUNE_ERROR
+		case:
+			if x > 0x10ffff {
+				x = utf8.RUNE_ERROR
+			}
+		}
+
+		b1, w1 := utf8.encode_rune(x)
+		w += copy(b[:], b1[:w1])
+		return
+	}
+
+	// Lookup by entity names
+	scan_ident: for j < len(s) { // scan over letters and digits
+		c := s[j]
+		j += 1
+		switch c {
+		case 'a'..='z', 'A'..='Z', '0'..='9':
+			continue scan_ident
+		}
+		// Keep the ';' to check for cases which require it and cases which might not
+		if c != ';' {
+			j -= 1
+		}
+		break scan_ident
+	}
+
+	entity_name := s[1:j]
+	if len(entity_name) == 0 {
+		return
+	}
+
+	if entity_name[len(entity_name)-1] == ';' {
+		entity_name = entity_name[:len(entity_name)-1]
+	}
+
+	if r2, _, ok := named_xml_entity_to_rune(entity_name); ok {
+		b1, w1 := utf8.encode_rune(r2[0])
+		w += copy(b[w:], b1[:w1])
+		if r2[1] != 0 {
+			b2, w2 := utf8.encode_rune(r2[1])
+			w += copy(b[w:], b2[:w2])
+		}
+		return
+	}
+
+	// The longest entities that do not end with a semicolon are <=6 bytes long
+	LONGEST_ENTITY_WITHOUT_SEMICOLON :: 6
+
+	n := min(len(entity_name)-1, LONGEST_ENTITY_WITHOUT_SEMICOLON)
+	for i := n; i > 1; i -= 1 {
+		if r2, _, ok := named_xml_entity_to_rune(entity_name[:i]); ok {
+			b1, w1 := utf8.encode_rune(r2[0])
+			w += copy(b[w:], b1[:w1])
+			if r2[1] != 0 {
+				b2, w2 := utf8.encode_rune(r2[1])
+				w += copy(b[w:], b2[:w2])
+			}
+			j = 1 + i // only '&' plus the matched prefix was consumed; the rest is ordinary text
+			return
+		}
+	}
+
+	return
+}
+
+
 // Private XML helper to extract `&<stuff>;` entity.
@(private="file")
 _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
--- a/core/encoding/entity/generated.odin
+++ b/core/encoding/entity/generated.odin