Update strings case converters to be Unicode compliant

2026-02-16 08:04:07 +00:00 · 2020-05-24 17:50:27 +01:00
parent e42f7008fc
commit f06efffe22
3 changed files with 201 additions and 173 deletions
--- a/core/strings/builder.odin
+++ b/core/strings/builder.odin
@@ -8,10 +8,27 @@ Builder :: struct {
 	buf: [dynamic]byte,
 }

-make_builder :: proc(allocator := context.allocator) -> Builder {
+make_builder_none :: proc(allocator := context.allocator) -> Builder { // Creates a Builder whose byte buffer is made with `allocator` and no explicit length or capacity.
 	return Builder{make([dynamic]byte, allocator)};
 }

+// make_builder_len creates a Builder whose byte buffer is made with an
+// initial length of `len`, using `allocator` for the backing storage.
+make_builder_len :: proc(len: int, allocator := context.allocator) -> Builder {
+	storage := make([dynamic]byte, len, allocator);
+	return Builder{buf = storage};
+}
+
+// make_builder_len_cap creates a Builder whose byte buffer is made with an
+// initial length of `len` and a capacity of `cap`, using `allocator` for the
+// backing storage.
+make_builder_len_cap :: proc(len, cap: int, allocator := context.allocator) -> Builder {
+	storage := make([dynamic]byte, len, cap, allocator);
+	return Builder{buf = storage};
+}
+
+make_builder :: proc{ // Procedure group: one name for the three constructors listed below.
+	make_builder_none, // make_builder(allocator)
+	make_builder_len, // make_builder(len, allocator)
+	make_builder_len_cap, // make_builder(len, cap, allocator)
+};
+
+
+
+
 destroy_builder :: proc(b: ^Builder) {
 	delete(b.buf);
 	clear(&b.buf);
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -678,8 +678,7 @@ trim_null :: proc(s: string) -> string {
 // Adjacent invalid bytes are only replaced once
 scrub :: proc(s: string, replacement: string, allocator := context.allocator) -> string {
 	str := s;
-	b := make_builder(allocator);;
-	grow_builder(&b, len(str));
+	b := make_builder(0, len(str), allocator);

 	has_error := false;
 	cursor := 0;
@@ -708,193 +707,204 @@ scrub :: proc(s: string, replacement: string, allocator := context.allocator) ->
 	return to_string(b);
 }

-to_snake_case :: proc(str: string, allocator := context.allocator) -> string {
-	buf := make_builder(allocator);

-	last_chars: [2]rune;
-	for char, _ in str {
-		switch char {
-		case 'A'..'Z':
-			switch last_chars[1] {
-			case 'a'..'z', '0'..'9':
-				write_rune(&buf, '_');
-			case 'A'..'Z':
-				write_rune(&buf, last_chars[1] + ('a'-'A'));
-			}
-		case 'a'..'z':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				switch last_chars[0] {
-				case 'A'..'Z':
-					write_rune(&buf, '_');
-				}
-				write_rune(&buf, last_chars[1] + ('a'-'A'));
-			case '0'..'9':
-				write_rune(&buf, '_');
-			}
-			write_rune(&buf, char);
-		case '0'..'9':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				write_rune(&buf, last_chars[1] + ('a'-'A'));
-				write_rune(&buf, '_');
-			case 'a'..'z':
-				write_rune(&buf, '_');
-			}
-			write_rune(&buf, char);
-		case '_':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				write_rune(&buf, last_chars[1] + ('a'-'A'));
-			}
-			write_rune(&buf, char);
-		case:
-			unimplemented();
-		}
-
-		last_chars[0] = last_chars[1];
-		last_chars[1] = char;
+to_lower :: proc(s: string, allocator := context.allocator) -> string { // Builds a new string from `s` with every rune passed through unicode.to_lower.
+	b := make_builder(0, len(s), allocator); // empty builder, capacity len(s) bytes; presumably grows if the mapped runes need more - TODO confirm
+	for r in s {
+		write_rune(&b, unicode.to_lower(r));
 	}
-
-	switch last_chars[1] {
-	case 'A'..'Z':
-		write_rune(&buf, last_chars[1] + ('a'-'A'));
+	return to_string(b);
+}
+to_upper :: proc(s: string, allocator := context.allocator) -> string { // Builds a new string from `s` with every rune passed through unicode.to_upper.
+	b := make_builder(0, len(s), allocator); // empty builder, capacity len(s) bytes; presumably grows if the mapped runes need more - TODO confirm
+	for r in s {
+		write_rune(&b, unicode.to_upper(r));
 	}
-
-	return to_string(buf);
+	return to_string(b);
 }

-to_ada_case :: proc(str: string, allocator := context.allocator) -> string {
-	buf := make_builder(allocator);

-	last_chars: [2]rune;
-	for char, _ in str {
-		switch char {
-		case 'A'..'Z':
-			switch last_chars[1] {
-			case 'a'..'z', '0'..'9':
-				write_rune(&buf, '_');
-			case 'A'..'Z':
-				switch last_chars[0] {
-				case '_', '\x00':
-					write_rune(&buf, last_chars[1]);
-				case:
-					write_rune(&buf, last_chars[1] + ('a'-'A'));
-				}
-			}
-		case 'a'..'z':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				switch last_chars[0] {
-				case 'A'..'Z':
-					write_rune(&buf, '_');
-					write_rune(&buf, last_chars[1]);
-				case:
-					write_rune(&buf, last_chars[1]);
-				}
-				write_rune(&buf, char);
-			case '0'..'9':
-				write_rune(&buf, '_');
-				write_rune(&buf, char);
-			case 'a'..'z':
-				write_rune(&buf, char);
-			case '_', '\x00':
-				write_rune(&buf, char - ('a'-'A'));
-			}
-		case '0'..'9':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				write_rune(&buf, last_chars[1] + ('a'-'A'));
-				write_rune(&buf, '_');
-			case 'a'..'z':
-				write_rune(&buf, '_');
-			}
-			write_rune(&buf, char);
-		case '_':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				write_rune(&buf, last_chars[1] + ('a'-'A'));
-			}
-			write_rune(&buf, char);
-		case:
-			write_rune(&buf, char);
-		}

-		last_chars[0] = last_chars[1];
-		last_chars[1] = char;
-	}

-	switch last_chars[1] {
-	case 'A'..'Z':
-		write_rune(&buf, last_chars[1] + ('a'-'A'));
-	}
-
-	return to_string(buf);
+is_delimiter :: proc(c: rune) -> bool { // Reports whether `c` separates words for the case converters: '-', '_' or anything is_space accepts.
+	return c == '-' || c == '_' || is_space(c);
 }

-to_screaming_snake_case :: proc(str: string, allocator := context.allocator) -> string {
-	buf := make_builder(allocator);
+is_separator :: proc(r: rune) -> bool { // Reports whether `r` is a separator: an ASCII rune that is not a letter, digit or '_', or a non-ASCII rune accepted by unicode.is_space.
+	if r <= 0x7f { // ASCII range is classified by hand
+		switch r {
+		case '0'..'9': return false;
+		case 'a'..'z': return false;
+		case 'A'..'Z': return false;
+		case '_': return false; // note: unlike is_delimiter, '_' is NOT a separator here
+		}
+		return true; // every other ASCII rune, including '-', space and punctuation
+	}

-	last_chars: [2]rune;
-	for char, _ in str {
-		switch char {
-		case 'A'..'Z':
-			switch last_chars[1] {
-			case 'a'..'z', '0'..'9':
-				write_rune(&buf, '_');
-			case 'A'..'Z':
-				write_rune(&buf, last_chars[1]);
-			}
-		case 'a'..'z':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				switch last_chars[0] {
-				case 'A'..'Z':
-					write_rune(&buf, '_');
-					write_rune(&buf, last_chars[1]);
-				case:
-					write_rune(&buf, last_chars[1]);
-				}
-				write_rune(&buf, char - ('a'-'A'));
-			case '0'..'9':
-				write_rune(&buf, '_');
-				write_rune(&buf, char - ('a'-'A'));
-			case 'a'..'z':
-				write_rune(&buf, char - ('a'-'A'));
-			case '_', '\x00':
-				write_rune(&buf, char - ('a'-'A'));
-			}
-		case '0'..'9':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				write_rune(&buf, last_chars[1]);
-				write_rune(&buf, '_');
-			case 'a'..'z':
-				write_rune(&buf, '_');
-			}
-			write_rune(&buf, char);
-		case '_':
-			switch last_chars[1] {
-			case 'A'..'Z':
-				write_rune(&buf, last_chars[1]);
-			}
-			write_rune(&buf, char);
-		case:
-			unimplemented();
+	// TODO(bill): unicode categories
+	// if unicode.is_letter(r) || unicode.is_digit(r) {
+	// 	return false;
+	// }
+
+	return unicode.is_space(r); // non-ASCII: only white space counts as a separator until the TODO above is resolved
+}
+
+
+string_case_iterator :: proc(b: ^Builder, s: string, callback: proc(b: ^Builder, prev, curr, next: rune)) { // Walks `s` and calls `callback` once per rune with its neighbours; a missing neighbour is passed as 0.
+	prev, curr: rune; // both start at 0, meaning "no rune seen yet"
+	for next in s { // the loop runs one rune behind the iterator so that `next` is known when `curr` is reported
+		if curr == 0 { // nothing to report yet; NOTE(review): an embedded NUL rune also takes this path and is never passed as `curr`
+			prev = curr;
+			curr = next;
+			continue;
 		}

-		last_chars[0] = last_chars[1];
-		last_chars[1] = char;
+		callback(b, prev, curr, next);
+
+		prev = curr;
+		curr = next;
 	}

-	switch last_chars[1] {
-	case 'A'..'Z':
-		write_rune(&buf, last_chars[1] + ('a'-'A'));
+	if len(s) > 0 { // report the last rune, which has no successor
+		callback(b, prev, curr, 0);
 	}
-
-	return to_string(buf);
 }

+
+// to_lower_camel_case is another name for to_camel_case.
+to_lower_camel_case :: to_camel_case;
+// to_camel_case converts `s` to camelCase, e.g. "hello_world" -> "helloWorld".
+// Surrounding white space is trimmed first and delimiters ('-', '_', white
+// space) are removed from the output. The result is built with `allocator`.
+to_camel_case :: proc(s: string, allocator := context.allocator) -> string {
+	trimmed := trim_space(s);
+	out := make_builder(0, len(trimmed), allocator);
+
+	string_case_iterator(&out, trimmed, proc(b: ^Builder, prev, curr, next: rune) {
+		// Delimiters are dropped; they only affect the rune that follows them.
+		if is_delimiter(curr) {
+			return;
+		}
+
+		switch {
+		case is_delimiter(prev):
+			// First rune of a word that follows a delimiter.
+			write_rune(b, unicode.to_upper(curr));
+		case unicode.is_lower(prev):
+			// After a lower-case rune the original case is kept ("fooBar" stays as is).
+			write_rune(b, curr);
+		case:
+			// Start of the string, or after a rune that is neither lower-case nor a delimiter.
+			write_rune(b, unicode.to_lower(curr));
+		}
+	});
+
+	return to_string(out);
+}
+
+// to_upper_camel_case is another name for to_pascal_case.
+to_upper_camel_case :: to_pascal_case;
+// to_pascal_case converts `s` to PascalCase, e.g. "hello_world" -> "HelloWorld".
+// Surrounding white space is trimmed first and delimiters ('-', '_', white
+// space) are removed from the output. The result is built with `allocator`.
+to_pascal_case :: proc(s: string, allocator := context.allocator) -> string {
+	trimmed := trim_space(s);
+	out := make_builder(0, len(trimmed), allocator);
+
+	string_case_iterator(&out, trimmed, proc(b: ^Builder, prev, curr, next: rune) {
+		// Delimiters are dropped; they only affect the rune that follows them.
+		if is_delimiter(curr) {
+			return;
+		}
+
+		switch {
+		case is_delimiter(prev) || prev == 0:
+			// First rune of the string or of a word that follows a delimiter.
+			write_rune(b, unicode.to_upper(curr));
+		case unicode.is_lower(prev):
+			// After a lower-case rune the original case is kept ("fooBar" -> "FooBar").
+			write_rune(b, curr);
+		case:
+			write_rune(b, unicode.to_lower(curr));
+		}
+	});
+
+	return to_string(out);
+}
+
+// to_delimiter_case rewrites `s` as words joined by `delimiter`, with every
+// letter upper-cased when `all_upper_case` is true and lower-cased otherwise.
+//
+// Surrounding white space is trimmed first. A word boundary is:
+//   - a run of input delimiters ('-', '_', white space), emitted as ONE `delimiter`;
+//   - a lower-case rune followed by an upper-case rune ("fooBar" -> foo|Bar);
+//   - the last rune of an upper-case run that is followed by a lower-case
+//     rune ("HTTPServer" -> HTTP|Server).
+//
+// Examples with delimiter '_' and all_upper_case = false:
+//   "fooBar" -> "foo_bar", "HTTPServer" -> "http_server", "foo-" -> "foo_"
+//
+// The result is built with `allocator`.
+to_delimiter_case :: proc(s: string, delimiter: rune, all_upper_case: bool, allocator := context.allocator) -> string {
+	s := trim_space(s);
+	b := make_builder(0, len(s), allocator);
+
+	adjust_case := unicode.to_upper if all_upper_case else unicode.to_lower;
+
+	// The loop runs one rune behind the iterator so that the successor of
+	// `curr` is known when `curr` is written. 0 means "no rune".
+	prev, curr: rune;
+
+	for next in s {
+		if is_delimiter(curr) {
+			// Only the first delimiter of a run produces output.
+			if !is_delimiter(prev) {
+				write_rune(&b, delimiter);
+			}
+		} else if unicode.is_upper(curr) {
+			if unicode.is_lower(prev) || (unicode.is_upper(prev) && unicode.is_lower(next)) {
+				write_rune(&b, delimiter);
+			}
+			write_rune(&b, adjust_case(curr));
+		} else if curr != 0 {
+			write_rune(&b, adjust_case(curr));
+		}
+
+		prev = curr;
+		curr = next;
+	}
+
+	// Flush the last rune, which has no successor. It gets the same delimiter
+	// treatment as the loop: previously a trailing '-' or '_' was copied
+	// through unchanged instead of being normalised to `delimiter`.
+	if len(s) > 0 {
+		if is_delimiter(curr) {
+			if !is_delimiter(prev) {
+				write_rune(&b, delimiter);
+			}
+		} else {
+			if unicode.is_upper(curr) && unicode.is_lower(prev) {
+				write_rune(&b, delimiter);
+			}
+			write_rune(&b, adjust_case(curr));
+		}
+	}
+
+	return to_string(b);
+}
+
+
+// to_snake_case converts `s` to snake_case ("fooBar" -> "foo_bar") by
+// delegating to to_delimiter_case with '_' and lower-case output.
+to_snake_case :: proc(s: string, allocator := context.allocator) -> string {
+	DELIMITER  :: '_';
+	UPPER_CASE :: false;
+	return to_delimiter_case(s, DELIMITER, UPPER_CASE, allocator);
+}
+
+// to_screaming_snake_case is another name for to_upper_snake_case.
+to_screaming_snake_case :: to_upper_snake_case;
+// to_upper_snake_case converts `s` to UPPER_SNAKE_CASE ("fooBar" -> "FOO_BAR")
+// by delegating to to_delimiter_case with '_' and upper-case output.
+to_upper_snake_case :: proc(s: string, allocator := context.allocator) -> string {
+	DELIMITER  :: '_';
+	UPPER_CASE :: true;
+	return to_delimiter_case(s, DELIMITER, UPPER_CASE, allocator);
+}
+
+// to_kebab_case converts `s` to kebab-case ("fooBar" -> "foo-bar") by
+// delegating to to_delimiter_case with '-' and lower-case output.
+to_kebab_case :: proc(s: string, allocator := context.allocator) -> string {
+	DELIMITER  :: '-';
+	UPPER_CASE :: false;
+	return to_delimiter_case(s, DELIMITER, UPPER_CASE, allocator);
+}
+
+// to_upper_case is kept as an alias of to_upper_kebab_case for existing
+// callers. Despite its name it does NOT simply upper-case the string (use
+// to_upper for that); it produces UPPER-KEBAB-CASE.
+to_upper_case :: to_upper_kebab_case;
+// to_upper_kebab_case converts `s` to UPPER-KEBAB-CASE ("fooBar" -> "FOO-BAR")
+// by delegating to to_delimiter_case with '-' and upper-case output.
+to_upper_kebab_case :: proc(s: string, allocator := context.allocator) -> string {
+	return to_delimiter_case(s, '-', true, allocator);
+}
+
+// to_ada_case converts `s` to Ada_Case: words joined by '_', each word
+// starting with an upper-case rune followed by lower-case runes.
+//
+// Surrounding white space is trimmed first. A new word starts:
+//   - at the beginning of the string;
+//   - after a run of input delimiters ('-', '_', white space), which is
+//     emitted as a single '_';
+//   - at an upper-case rune that follows a lower-case rune ("fooBar");
+//   - at the last rune of an upper-case run that is followed by a lower-case
+//     rune ("HTTPServer" -> HTTP|Server).
+//
+// Examples:
+//   "hello_world" -> "Hello_World"
+//   "fooBar"      -> "Foo_Bar"
+//   "HTTPServer"  -> "Http_Server"
+//   "ABC"         -> "Abc"
+//
+// The result is built with `allocator`.
+to_ada_case :: proc(s: string, allocator := context.allocator) -> string {
+	s := trim_space(s);
+	b := make_builder(0, len(s), allocator);
+
+	// string_case_iterator also reports the final rune (with next == 0), so
+	// the first, middle and last runes all go through the same rules.
+	string_case_iterator(&b, s, proc(b: ^Builder, prev, curr, next: rune) {
+		delimiter :: '_';
+
+		if is_delimiter(curr) {
+			// Only the first delimiter of a run produces output.
+			if !is_delimiter(prev) {
+				write_rune(b, delimiter);
+			}
+			return;
+		}
+
+		// A case transition inside a run of letters starts a new word.
+		case_boundary := unicode.is_upper(curr) &&
+			(unicode.is_lower(prev) || (unicode.is_upper(prev) && unicode.is_lower(next)));
+
+		if case_boundary {
+			write_rune(b, delimiter);
+			write_rune(b, unicode.to_upper(curr));
+		} else if prev == 0 || is_delimiter(prev) {
+			// Word start whose separator (if any) has already been written.
+			write_rune(b, unicode.to_upper(curr));
+		} else {
+			write_rune(b, unicode.to_lower(curr));
+		}
+	});
+
+	return to_string(b);
+}
+
+
+
 reverse :: proc(s: string, allocator := context.allocator) -> string {
 	str := s;
 	n := len(str);
--- a/core/unicode/letter.odin
+++ b/core/unicode/letter.odin
@@ -105,7 +105,8 @@ is_title :: proc(r: rune) -> bool {
 	return is_upper(r) && is_lower(r);
 }

-is_white_space :: proc(r: rune) -> bool {
+is_white_space :: is_space;
+is_space :: proc(r: rune) -> bool {
 	c := i32(r);
 	p := binary_search(c, space_ranges[:], len(space_ranges)/2, 2);
 	if p >= 0 && space_ranges[p] <= c && c <= space_ranges[p+1] {