From eee97f7f62bbd65dd03ea3ec8668fef3fcfc685c Mon Sep 17 00:00:00 2001 From: hikari Date: Thu, 21 Apr 2022 20:49:32 +0300 Subject: [PATCH 1/5] strings: add levenshtein_distance procedure --- core/strings/strings.odin | 59 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/core/strings/strings.odin b/core/strings/strings.odin index 8e774b367..87bbb42cf 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -1809,3 +1809,62 @@ fields_iterator :: proc(s: ^string) -> (field: string, ok: bool) { s^ = s[len(s):] return } + +// `levenshtein_distance` returns the Levenshtein edit distance between 2 strings. +// This is a single-row-version of the Wagner–Fischer algorithm, based on C code by Martin Ettl. +// Note: allocator isn't used if the length of string b in runes is smaller than 70. +levenshtein_distance :: proc(a, b: string, allocator := context.allocator) -> int { + LEVENSHTEIN_DEFAULT_COSTS: []int : { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + } + + m, n := utf8.rune_count_in_string(a), utf8.rune_count_in_string(b) + + if m == 0 do return n + if n == 0 do return m + + costs: []int + + if n + 1 > len(LEVENSHTEIN_DEFAULT_COSTS) { + costs = make([]int, n + 1, allocator) + } else { + costs = LEVENSHTEIN_DEFAULT_COSTS + } + + defer if n + 1 > len(LEVENSHTEIN_DEFAULT_COSTS) { + delete(costs, allocator) + } + + for k in 0..=n { + costs[k] = k + } + + i: int + for c1 in a { + costs[0] = i + 1 + corner := i + j: int + for c2 in b { + upper := costs[j + 1] + if c1 == c2 { + costs[j + 1] = corner + } else { + t := upper if upper < corner else corner + costs[j + 1] = (costs[j] if costs[j] < t else t) + 1 + } + + corner = upper + j += 1 + } + + i += 1 + } + + return costs[n] +} From 591732f347c094e706471994996fcffa44687e89 Mon Sep 17 00:00:00 2001 From: hikari Date: Thu, 21 Apr 2022 20:58:50 +0300 Subject: [PATCH 2/5] strings: levenshtein_distance: remove costs calculation for default array --- core/strings/strings.odin | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/core/strings/strings.odin b/core/strings/strings.odin index 87bbb42cf..3b33a5dc8 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -1833,6 +1833,9 @@ levenshtein_distance :: proc(a, b: string, allocator := context.allocator) -> in if n + 1 > len(LEVENSHTEIN_DEFAULT_COSTS) { costs = make([]int, n + 1, allocator) + for k in 0..=n { + costs[k] = k + } } else { costs = LEVENSHTEIN_DEFAULT_COSTS } @@ -1841,10 +1844,6 @@ levenshtein_distance :: proc(a, b: string, allocator := context.allocator) -> in delete(costs, allocator) } - for k in 0..=n { - costs[k] = k - } - i: int for c1 in a { costs[0] = i + 1 From d8f0da164b92f52eda045d77e5f6abb5c5146f2a Mon Sep 17 00:00:00 2001 From: hikari Date: Thu, 21 Apr 2022 21:15:11 +0300 Subject: [PATCH 3/5] strings: levenshtein_distance: improve potential caching --- core/strings/strings.odin | 43 ++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/core/strings/strings.odin b/core/strings/strings.odin index 3b33a5dc8..f876aab3d 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -15,7 +15,7 @@ clone :: proc(s: string, allocator := context.allocator, loc := #caller_location } // returns a clone of the string `s` allocated using the `allocator` as a cstring -// a nul byte is appended to the clone, to make the cstring safe +// a nul byte is appended to the clone, to make the cstring safe clone_to_cstring :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> cstring { c := make([]byte, len(s)+1, allocator, loc) copy(c, s) @@ -37,7 +37,7 @@ string_from_nul_terminated_ptr :: proc(ptr: ^byte, len: int) -> string { return s } -// returns the raw ^byte start of the string `str` +// returns the raw ^byte start of the string `str` ptr_from_string :: proc(str: string) -> ^byte { d := transmute(mem.Raw_String)str return d.data @@ -969,7 +969,7 @@ count :: proc(s, substr: string) -> int { repeats the string `s` multiple `count` times and returns the allocated string panics when `count` is below 0 - strings.repeat("abc", 2) -> "abcabc" + strings.repeat("abc", 2) -> "abcabc" */ repeat :: proc(s: string, count: int, allocator := context.allocator) -> string { if count < 0 { @@ -1378,7 +1378,7 @@ split_multi :: proc(s: string, substrs: []string, allocator := context.allocator // skip when no results if substrings_found < 1 { - return + return } buf = make([]string, substrings_found + 1, allocator) @@ -1812,16 +1812,35 @@ fields_iterator :: proc(s: ^string) -> (field: string, ok: bool) { // `levenshtein_distance` returns the Levenshtein edit distance between 2 strings. // This is a single-row-version of the Wagner–Fischer algorithm, based on C code by Martin Ettl. -// Note: allocator isn't used if the length of string b in runes is smaller than 70. +// Note: allocator isn't used if the length of string b in runes is smaller than 256. levenshtein_distance :: proc(a, b: string, allocator := context.allocator) -> int { LEVENSHTEIN_DEFAULT_COSTS: []int : { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, + 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, + 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, + 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, + 250, 251, 252, 253, 254, 255, } m, n := utf8.rune_count_in_string(a), utf8.rune_count_in_string(b) From 71b1cce517e421fd7eeff4d5ac15e5165dd318c0 Mon Sep 17 00:00:00 2001 From: hikari Date: Thu, 21 Apr 2022 21:19:11 +0300 Subject: [PATCH 4/5] strings: levenshtein_distance: 64 is actually faster than 256 --- core/strings/strings.odin | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/core/strings/strings.odin b/core/strings/strings.odin index f876aab3d..6e01f5c8a 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -1812,7 +1812,7 @@ fields_iterator :: proc(s: ^string) -> (field: string, ok: bool) { // `levenshtein_distance` returns the Levenshtein edit distance between 2 strings. // This is a single-row-version of the Wagner–Fischer algorithm, based on C code by Martin Ettl. -// Note: allocator isn't used if the length of string b in runes is smaller than 256. +// Note: allocator isn't used if the length of string b in runes is smaller than 64. levenshtein_distance :: proc(a, b: string, allocator := context.allocator) -> int { LEVENSHTEIN_DEFAULT_COSTS: []int : { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, @@ -1821,26 +1821,7 @@ levenshtein_distance :: proc(a, b: string, allocator := context.allocator) -> in 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, - 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, - 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, - 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, - 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, - 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, - 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, - 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, - 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, - 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, - 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, - 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, - 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, - 250, 251, 252, 253, 254, 255, + 60, 61, 62, 63, } m, n := utf8.rune_count_in_string(a), utf8.rune_count_in_string(b) From f0267536929c5909a8098baf95cb5eb1a4fa6522 Mon Sep 17 00:00:00 2001 From: hikari Date: Thu, 21 Apr 2022 21:19:43 +0300 Subject: [PATCH 5/5] strings: levenshtein_distance: remove `do` --- core/strings/strings.odin | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/strings/strings.odin b/core/strings/strings.odin index 6e01f5c8a..a3d9fa93e 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -1826,8 +1826,12 @@ levenshtein_distance :: proc(a, b: string, allocator := context.allocator) -> in m, n := utf8.rune_count_in_string(a), utf8.rune_count_in_string(b) - if m == 0 do return n - if n == 0 do return m + if m == 0 { + return n + } + if n == 0 { + return m + } costs: []int