[compress/shoco] Add short string compressor.

This commit is contained in:
Jeroen van Rijn
2022-04-22 16:55:47 +02:00
parent b4f8efcbe6
commit e799476f90
8 changed files with 645 additions and 1 deletions

View File

@@ -0,0 +1,148 @@
/*
This file was generated, so don't edit this by hand.
Transliterated from https://github.com/Ed-von-Schleck/shoco/blob/master/shoco_model.h,
which is an English word model.
*/
// package shoco is an implementation of the shoco short string compressor
package shoco
DEFAULT_MODEL :: Shoco_Model {
min_char = 39,
max_char = 122,
characters_by_id = {
'e', 'a', 'i', 'o', 't', 'h', 'n', 'r', 's', 'l', 'u', 'c', 'w', 'm', 'd', 'b', 'p', 'f', 'g', 'v', 'y', 'k', '-', 'H', 'M', 'T', '\'', 'B', 'x', 'I', 'W', 'L',
},
ids_by_character = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 27, -1, -1, -1, -1, -1, 23, 29, -1, -1, 31, 24, -1, -1, -1, -1, -1, -1, 25, -1, -1, 30, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 15, 11, 14, 0, 17, 18, 5, 2, -1, 21, 9, 13, 6, 3, 16, -1, 7, 8, 4, 10, 19, 12, 28, 20, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
},
successors_by_bigram = {
{7, 4, 12, -1, 6, -1, 1, 0, 3, 5, -1, 9, -1, 8, 2, -1, 15, 14, -1, 10, 11, -1, -1, -1, -1, -1, -1, -1, 13, -1, -1, -1},
{-1, -1, 6, -1, 1, -1, 0, 3, 2, 4, 15, 11, -1, 9, 5, 10, 13, -1, 12, 8, 7, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{9, 11, -1, 4, 2, -1, 0, 8, 1, 5, -1, 6, -1, 3, 7, 15, -1, 12, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{-1, -1, 14, 7, 5, -1, 1, 2, 8, 9, 0, 15, 6, 4, 11, -1, 12, 3, -1, 10, -1, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{2, 4, 3, 1, 5, 0, -1, 6, 10, 9, 7, 12, 11, -1, -1, -1, -1, 13, -1, -1, 8, -1, 15, -1, -1, -1, 14, -1, -1, -1, -1, -1},
{0, 1, 2, 3, 4, -1, -1, 5, 9, 10, 6, -1, -1, 8, 15, 11, -1, 14, -1, -1, 7, -1, 13, -1, -1, -1, 12, -1, -1, -1, -1, -1},
{2, 8, 7, 4, 3, -1, 9, -1, 6, 11, -1, 5, -1, -1, 0, -1, -1, 14, 1, 15, 10, 12, -1, -1, -1, -1, 13, -1, -1, -1, -1, -1},
{0, 3, 1, 2, 6, -1, 9, 8, 4, 12, 13, 10, -1, 11, 7, -1, -1, 15, 14, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{0, 6, 3, 4, 1, 2, -1, -1, 5, 10, 7, 9, 11, 12, -1, -1, 8, 14, -1, -1, 15, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{0, 6, 2, 5, 9, -1, -1, -1, 10, 1, 8, -1, 12, 14, 4, -1, 15, 7, -1, 13, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{8, 10, 9, 15, 1, -1, 4, 0, 3, 2, -1, 6, -1, 12, 11, 13, 7, 14, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{1, 3, 6, 0, 4, 2, -1, 7, 13, 8, 9, 11, -1, -1, 15, -1, -1, -1, -1, -1, 10, 5, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{3, 0, 1, 4, -1, 2, 5, 6, 7, 8, -1, 14, -1, -1, 9, 15, -1, 12, -1, -1, -1, 10, 11, -1, -1, -1, 13, -1, -1, -1, -1, -1},
{0, 1, 3, 2, 15, -1, 12, -1, 7, 14, 4, -1, -1, 9, -1, 8, 5, 10, -1, -1, 6, -1, 13, -1, -1, -1, 11, -1, -1, -1, -1, -1},
{0, 3, 1, 2, -1, -1, 12, 6, 4, 9, 7, -1, -1, 14, 8, -1, -1, 15, 11, 13, 5, -1, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{0, 5, 7, 2, 10, 13, -1, 6, 8, 1, 3, -1, -1, 14, 15, 11, -1, -1, -1, 12, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{0, 2, 6, 3, 7, 10, -1, 1, 9, 4, 8, -1, -1, 15, -1, 12, 5, -1, -1, -1, 11, -1, 13, -1, -1, -1, 14, -1, -1, -1, -1, -1},
{1, 3, 4, 0, 7, -1, 12, 2, 11, 8, 6, 13, -1, -1, -1, -1, -1, 5, -1, -1, 10, 15, 9, -1, -1, -1, 14, -1, -1, -1, -1, -1},
{1, 3, 5, 2, 13, 0, 9, 4, 7, 6, 8, -1, -1, 15, -1, 11, -1, -1, 10, -1, 14, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{0, 2, 1, 3, -1, -1, -1, 6, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{1, 11, 4, 0, 3, -1, 13, 12, 2, 7, -1, -1, 15, 10, 5, 8, 14, -1, -1, -1, -1, -1, 9, -1, -1, -1, 6, -1, -1, -1, -1, -1},
{0, 9, 2, 14, 15, 4, 1, 13, 3, 5, -1, -1, 10, -1, -1, -1, -1, 6, 12, -1, 7, -1, 8, -1, -1, -1, 11, -1, -1, -1, -1, -1},
{-1, 2, 14, -1, 1, 5, 8, 7, 4, 12, -1, 6, 9, 11, 13, 3, 10, 15, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{0, 1, 3, 2, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{4, 3, 1, 5, -1, -1, -1, 0, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{2, 8, 4, 1, -1, 0, -1, 6, -1, -1, 5, -1, 7, -1, -1, -1, -1, -1, -1, -1, 10, -1, -1, 9, -1, -1, -1, -1, -1, -1, -1, -1},
{12, 5, -1, -1, 1, -1, -1, 7, 0, 3, -1, 2, -1, 4, 6, -1, -1, -1, -1, 8, -1, -1, 15, -1, 13, 9, -1, -1, -1, -1, -1, 11},
{1, 3, 2, 4, -1, -1, -1, 5, -1, 7, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1, 8, -1, -1},
{5, 3, 4, 12, 1, 6, -1, -1, -1, -1, 8, 2, -1, -1, -1, -1, 0, 9, -1, -1, 11, -1, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{-1, -1, -1, -1, 0, -1, 1, 12, 3, -1, -1, -1, -1, 5, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, 6, -1, 10},
{2, 3, 1, 4, -1, 0, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1, 6, -1, -1},
{5, 1, 3, 0, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, 9, -1, -1, 6, -1, 7},
},
successors_reversed = {
{'s', 't', 'c', 'l', 'm', 'a', 'd', 'r', 'v', 'T', 'A', 'L', 'e', 'M', 'Y', '-'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'-', 't', 'a', 'b', 's', 'h', 'c', 'r', 'n', 'w', 'p', 'm', 'l', 'd', 'i', 'f'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'u', 'e', 'i', 'a', 'o', 'r', 'y', 'l', 'I', 'E', 'R', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'e', 'a', 'o', 'i', 'u', 'A', 'y', 'E', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'t', 'n', 'f', 's', '\'', 'm', 'I', 'N', 'A', 'E', 'L', 'Z', 'r', 'V', 'R', 'C'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'o', 'a', 'y', 'i', 'u', 'e', 'I', 'L', 'D', '\'', 'E', 'Y', '\x00', '\x00', '\x00', '\x00'},
{'r', 'i', 'y', 'a', 'e', 'o', 'u', 'Y', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'h', 'o', 'e', 'E', 'i', 'u', 'r', 'w', 'a', 'H', 'y', 'R', 'Z', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'h', 'i', 'e', 'a', 'o', 'r', 'I', 'y', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'n', 't', 's', 'r', 'l', 'd', 'i', 'y', 'v', 'm', 'b', 'c', 'g', 'p', 'k', 'u'},
{'e', 'l', 'o', 'u', 'y', 'a', 'r', 'i', 's', 'j', 't', 'b', 'v', 'h', 'm', 'd'},
{'o', 'e', 'h', 'a', 't', 'k', 'i', 'r', 'l', 'u', 'y', 'c', 'q', 's', '-', 'd'},
{'e', 'i', 'o', 'a', 's', 'y', 'r', 'u', 'd', 'l', '-', 'g', 'n', 'v', 'm', 'f'},
{'r', 'n', 'd', 's', 'a', 'l', 't', 'e', 'm', 'c', 'v', 'y', 'i', 'x', 'f', 'p'},
{'o', 'e', 'r', 'a', 'i', 'f', 'u', 't', 'l', '-', 'y', 's', 'n', 'c', '\'', 'k'},
{'h', 'e', 'o', 'a', 'r', 'i', 'l', 's', 'u', 'n', 'g', 'b', '-', 't', 'y', 'm'},
{'e', 'a', 'i', 'o', 't', 'r', 'u', 'y', 'm', 's', 'l', 'b', '\'', '-', 'f', 'd'},
{'n', 's', 't', 'm', 'o', 'l', 'c', 'd', 'r', 'e', 'g', 'a', 'f', 'v', 'z', 'b'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'e', 'n', 'i', 's', 'h', 'l', 'f', 'y', '-', 'a', 'w', '\'', 'g', 'r', 'o', 't'},
{'e', 'l', 'i', 'y', 'd', 'o', 'a', 'f', 'u', 't', 's', 'k', 'w', 'v', 'm', 'p'},
{'e', 'a', 'o', 'i', 'u', 'p', 'y', 's', 'b', 'm', 'f', '\'', 'n', '-', 'l', 't'},
{'d', 'g', 'e', 't', 'o', 'c', 's', 'i', 'a', 'n', 'y', 'l', 'k', '\'', 'f', 'v'},
{'u', 'n', 'r', 'f', 'm', 't', 'w', 'o', 's', 'l', 'v', 'd', 'p', 'k', 'i', 'c'},
{'e', 'r', 'a', 'o', 'l', 'p', 'i', 't', 'u', 's', 'h', 'y', 'b', '-', '\'', 'm'},
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'e', 'i', 'o', 'a', 's', 'y', 't', 'd', 'r', 'n', 'c', 'm', 'l', 'u', 'g', 'f'},
{'e', 't', 'h', 'i', 'o', 's', 'a', 'u', 'p', 'c', 'l', 'w', 'm', 'k', 'f', 'y'},
{'h', 'o', 'e', 'i', 'a', 't', 'r', 'u', 'y', 'l', 's', 'w', 'c', 'f', '\'', '-'},
{'r', 't', 'l', 's', 'n', 'g', 'c', 'p', 'e', 'i', 'a', 'd', 'm', 'b', 'f', 'o'},
{'e', 'i', 'a', 'o', 'y', 'u', 'r', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
{'a', 'i', 'h', 'e', 'o', 'n', 'r', 's', 'l', 'd', 'k', '-', 'f', '\'', 'c', 'b'},
{'p', 't', 'c', 'a', 'i', 'e', 'h', 'q', 'u', 'f', '-', 'y', 'o', '\x00', '\x00', '\x00'},
{'o', 'e', 's', 't', 'i', 'd', '\'', 'l', 'b', '-', 'm', 'a', 'r', 'n', 'p', 'w'},
},
character_count = 32,
successor_count = 16,
max_successor_n = 7,
packs = {
{ 0x80000000, 1, 2, { 26, 24, 24, 24, 24, 24, 24, 24 }, { 15, 3, 0, 0, 0, 0, 0, 0 }, 0xc0, 0x80 },
{ 0xc0000000, 2, 4, { 25, 22, 19, 16, 16, 16, 16, 16 }, { 15, 7, 7, 7, 0, 0, 0, 0 }, 0xe0, 0xc0 },
{ 0xe0000000, 4, 8, { 23, 19, 15, 11, 8, 5, 2, 0 }, { 31, 15, 15, 15, 7, 7, 7, 3 }, 0xf0, 0xe0 },
},
}

View File

@@ -0,0 +1,318 @@
/*
Copyright 2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
List of contributors:
Jeroen van Rijn: Initial implementation.
An implementation of [shoco](https://github.com/Ed-von-Schleck/shoco) by Christian Schramm.
*/
// package shoco is an implementation of the shoco short string compressor
package shoco
import "core:intrinsics"
import "core:compress"
Shoco_Pack :: struct {
word: u32,
bytes_packed: i8,
bytes_unpacked: i8,
offsets: [8]u16,
masks: [8]i16,
header_mask: u8,
header: u8,
}
Shoco_Model :: struct {
min_char: u8,
max_char: u8,
characters_by_id: []u8,
ids_by_character: [256]i16,
successors_by_bigram: [][]i8,
successors_reversed: [][]u8,
character_count: u8,
successor_count: u8,
max_successor_n: i8,
packs: []Shoco_Pack,
}
compress_bound :: proc(uncompressed_size: int) -> (worst_case_compressed_size: int) {
// Worst case compression happens when input is non-ASCII (128-255)
// Encoded as 0x00 + the byte in question.
return uncompressed_size * 2
}
decompress_bound :: proc(compressed_size: int, model := DEFAULT_MODEL) -> (maximum_decompressed_size: int) {
// Best case compression is 2:1
most: f64
for pack in model.packs {
val := f64(compressed_size) / f64(pack.bytes_packed) * f64(pack.bytes_unpacked)
most = max(most, val)
}
return int(most)
}
find_best_encoding :: proc(indices: []i16, n_consecutive: i8, model := DEFAULT_MODEL) -> (res: int) {
for p := len(model.packs); p > 0; p -= 1 {
pack := model.packs[p - 1]
if n_consecutive >= pack.bytes_unpacked {
have_index := true
for i := 0; i < int(pack.bytes_unpacked); i += 1 {
if indices[i] > pack.masks[i] {
have_index = false
break
}
}
if have_index {
return p - 1
}
}
}
return -1
}
validate_model :: proc(model: Shoco_Model) -> (int, compress.Error) {
if len(model.successors_reversed) != int(model.max_char - model.min_char) {
return 0, .Unknown_Compression_Method
}
if len(model.characters_by_id) != int(model.character_count) {
return 0, .Unknown_Compression_Method
}
if len(model.successors_by_bigram) != int(model.character_count) || len(model.successors_by_bigram[0]) != int(model.character_count) {
return 0, .Unknown_Compression_Method
}
if len(model.successors_reversed[0]) != int(model.successor_count) {
return 0, .Unknown_Compression_Method
}
// Model seems legit.
return 0, nil
}
// Decompresses into provided buffer.
decompress_slice_to_output_buffer :: proc(input: []u8, output: []u8, model := DEFAULT_MODEL) -> (size: int, err: compress.Error) {
inp, inp_end := 0, len(input)
out, out_end := 0, len(output)
validate_model(model) or_return
for inp < inp_end {
val := transmute(i8)input[inp]
mark := int(-1)
for val < 0 {
val <<= 1
mark += 1
}
if mark > len(model.packs) {
return out, .Unknown_Compression_Method
}
if mark < 0 {
if out >= out_end {
return out, .Output_Too_Short
}
// Ignore the sentinel value for non-ASCII chars
if input[inp] == 0x00 {
inp += 1
if inp >= inp_end {
return out, .Stream_Too_Short
}
}
output[out] = input[inp]
inp, out = inp + 1, out + 1
} else {
pack := model.packs[mark]
if out + int(pack.bytes_unpacked) > out_end {
return out, .Output_Too_Short
} else if inp + int(pack.bytes_packed) > inp_end {
return out, .Stream_Too_Short
}
code := intrinsics.unaligned_load((^u32)(&input[inp]))
when ODIN_ENDIAN == .Little {
code = intrinsics.byte_swap(code)
}
// Unpack the leading char
offset := pack.offsets[0]
mask := pack.masks[0]
last_chr := model.characters_by_id[(code >> offset) & u32(mask)]
output[out] = last_chr
// Unpack the successor chars
for i := 1; i < int(pack.bytes_unpacked); i += 1 {
offset = pack.offsets[i]
mask = pack.masks[i]
last_chr = model.successors_reversed[last_chr - model.min_char][(code >> offset) & u32(mask)]
output[out + i] = last_chr
}
out += int(pack.bytes_unpacked)
inp += int(pack.bytes_packed)
}
}
return out, nil
}
decompress_slice_to_string :: proc(input: []u8, model := DEFAULT_MODEL, allocator := context.allocator) -> (res: string, err: compress.Error) {
context.allocator = allocator
if len(input) == 0 {
return "", .Stream_Too_Short
}
max_output_size := decompress_bound(len(input), model)
buf: [dynamic]u8
if !resize(&buf, max_output_size) {
return "", .Out_Of_Memory
}
length, result := decompress_slice_to_output_buffer(input, buf[:])
resize(&buf, length)
return string(buf[:]), result
}
decompress :: proc{decompress_slice_to_output_buffer, decompress_slice_to_string}
compress_string_to_buffer :: proc(input: string, output: []u8, model := DEFAULT_MODEL, allocator := context.allocator) -> (size: int, err: compress.Error) {
inp, inp_end := 0, len(input)
out, out_end := 0, len(output)
output := output
validate_model(model) or_return
indices := make([]i16, model.max_successor_n + 1)
defer delete(indices)
last_resort := false
encode: for inp < inp_end {
if last_resort {
last_resort = false
if input[inp] & 0x80 == 0x80 {
// Non-ASCII case
if out + 2 > out_end {
return out, .Output_Too_Short
}
// Put in a sentinel byte
output[out] = 0x00
out += 1
} else {
// An ASCII byte
if out + 1 > out_end {
return out, .Output_Too_Short
}
}
output[out] = input[inp]
out, inp = out + 1, inp + 1
} else {
// Find the longest string of known successors
indices[0] = model.ids_by_character[input[inp]]
last_chr_index := indices[0]
if last_chr_index < 0 {
last_resort = true
continue encode
}
rest := inp_end - inp
n_consecutive: i8 = 1
for ; n_consecutive <= model.max_successor_n; n_consecutive += 1 {
if inp_end > 0 && int(n_consecutive) == rest {
break
}
current_index := model.ids_by_character[input[inp + int(n_consecutive)]]
if current_index < 0 { // '\0' is always -1
break
}
successor_index := model.successors_by_bigram[last_chr_index][current_index]
if successor_index < 0 {
break
}
indices[n_consecutive] = i16(successor_index)
last_chr_index = current_index
}
if n_consecutive < 2 {
last_resort = true
continue encode
}
pack_n := find_best_encoding(indices, n_consecutive)
if pack_n >= 0 {
if out + int(model.packs[pack_n].bytes_packed) > out_end {
return out, .Output_Too_Short
}
pack := model.packs[pack_n]
code := pack.word
for i := 0; i < int(pack.bytes_unpacked); i += 1 {
code |= u32(indices[i]) << pack.offsets[i]
}
// In the little-endian world, we need to swap what's in the register to match the memory representation.
when ODIN_ENDIAN == .Little {
code = intrinsics.byte_swap(code)
}
out_ptr := raw_data(output[out:])
switch pack.bytes_packed {
case 4:
intrinsics.unaligned_store(transmute(^u32)out_ptr, code)
case 2:
intrinsics.unaligned_store(transmute(^u16)out_ptr, u16(code))
case 1:
intrinsics.unaligned_store(transmute(^u8)out_ptr, u8(code))
case:
return out, .Unknown_Compression_Method
}
out += int(pack.bytes_packed)
inp += int(pack.bytes_unpacked)
} else {
last_resort = true
continue encode
}
}
}
return out, nil
}
compress_string :: proc(input: string, model := DEFAULT_MODEL, allocator := context.allocator) -> (output: []u8, err: compress.Error) {
context.allocator = allocator
if len(input) == 0 {
return {}, .Stream_Too_Short
}
max_output_size := compress_bound(len(input))
buf: [dynamic]u8
if !resize(&buf, max_output_size) {
return {}, .Out_Of_Memory
}
length, result := compress_string_to_buffer(input, buf[:])
resize(&buf, length)
return buf[:length], result
}
compress :: proc{compress_string_to_buffer, compress_string}

View File

@@ -10,6 +10,7 @@ import c "core:c"
import libc "core:c/libc"
import compress "core:compress"
import shoco "core:compress/shoco"
import gzip "core:compress/gzip"
import zlib "core:compress/zlib"
@@ -115,6 +116,7 @@ _ :: bytes
_ :: c
_ :: libc
_ :: compress
_ :: shoco
_ :: gzip
_ :: zlib
_ :: bit_array

View File

@@ -0,0 +1,26 @@
Copyright (c) 2016-2021 Ginger Bill. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Binary file not shown.

View File

@@ -0,0 +1,95 @@
<p align="center">
<img src="misc/logo-slim.png" alt="Odin logo" style="width:65%">
<br/>
The Data-Oriented Language for Sane Software Development.
<br/>
<br/>
<a href="https://github.com/odin-lang/odin/releases/latest">
<img src="https://img.shields.io/github/release/odin-lang/odin.svg">
</a>
<a href="https://github.com/odin-lang/odin/releases/latest">
<img src="https://img.shields.io/badge/platforms-Windows%20|%20Linux%20|%20macOS-green.svg">
</a>
<br>
<a href="https://discord.gg/odinlang">
<img src="https://img.shields.io/discord/568138951836172421?logo=discord">
</a>
<a href="https://github.com/odin-lang/odin/actions">
<img src="https://github.com/odin-lang/odin/workflows/CI/badge.svg?branch=master&event=push">
</a>
</p>
# The Odin Programming Language
Odin is a general-purpose programming language with distinct typing, built for high performance, modern systems, and built-in data-oriented data types. The Odin Programming Language, the C alternative for the joy of programming.
Website: [https://odin-lang.org/](https://odin-lang.org/)
```odin
package main
import "core:fmt"
main :: proc() {
program := "+ + * 😃 - /"
accumulator := 0
for token in program {
switch token {
case '+': accumulator += 1
case '-': accumulator -= 1
case '*': accumulator *= 2
case '/': accumulator /= 2
case '😃': accumulator *= accumulator
case: // Ignore everything else
}
}
fmt.printf("The program \"%s\" calculates the value %d\n",
program, accumulator)
}
```
## Documentation
#### [Getting Started](https://odin-lang.org/docs/install)
Instructions for downloading and installing the Odin compiler and libraries.
#### [Nightly Builds](https://odin-lang.org/docs/nightly/)
Get the latest nightly builds of Odin.
### Learning Odin
#### [Overview of Odin](https://odin-lang.org/docs/overview)
An overview of the Odin programming language.
#### [Frequently Asked Questions (FAQ)](https://odin-lang.org/docs/faq)
Answers to common questions about Odin.
#### [Packages](https://pkg.odin-lang.org/)
Documentation for all the official packages part of the [core](https://pkg.odin-lang.org/core/) and [vendor](https://pkg.odin-lang.org/vendor/) library collections.
#### [The Odin Wiki](https://github.com/odin-lang/Odin/wiki)
A wiki maintained by the Odin community.
#### [Odin Discord](https://discord.gg/sVBPHEv)
Get live support and talk with other odiners on the Odin Discord.
### Articles
#### [The Odin Blog](https://odin-lang.org/news/)
The official blog of the Odin programming language, featuring announcements, news, and in-depth articles by the Odin team and guests.
## Warnings
* The Odin compiler is still in development.

Binary file not shown.

View File

@@ -7,13 +7,14 @@ package test_core_compress
List of contributors:
Jeroen van Rijn: Initial implementation.
A test suite for ZLIB, GZIP.
A test suite for ZLIB, GZIP and Shoco.
*/
import "core:testing"
import "core:compress/zlib"
import "core:compress/gzip"
import "core:compress/shoco"
import "core:bytes"
import "core:fmt"
@@ -48,6 +49,7 @@ main :: proc() {
t := testing.T{w=w}
zlib_test(&t)
gzip_test(&t)
shoco_test(&t)
fmt.printf("%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count)
if TEST_fail > 0 {
@@ -134,3 +136,56 @@ gzip_test :: proc(t: ^testing.T) {
expect(t, false, error)
}
}
@test
shoco_test :: proc(t: ^testing.T) {
Shoco_Tests :: []struct{
compressed: []u8,
raw: []u8,
short_pack: int,
short_sentinel: int,
}{
{ #load("../assets/Shoco/README.md.shoco"), #load("../assets/Shoco/README.md"), 10, 1006 },
{ #load("../assets/Shoco/LICENSE.shoco"), #load("../assets/Shoco/LICENSE"), 25, 68 },
}
for v in Shoco_Tests {
expected_raw := len(v.raw)
expected_compressed := len(v.compressed)
biggest_unpacked := shoco.decompress_bound(expected_compressed)
biggest_packed := shoco.compress_bound(expected_raw)
buffer := make([]u8, max(biggest_packed, biggest_unpacked))
defer delete(buffer)
size, err := shoco.decompress(v.compressed, buffer[:])
msg := fmt.tprintf("Expected `decompress` to return `nil`, got %v", err)
expect(t, err == nil, msg)
msg = fmt.tprintf("Decompressed %v bytes into %v. Expected to decompress into %v bytes.", len(v.compressed), size, expected_raw)
expect(t, size == expected_raw, msg)
expect(t, string(buffer[:size]) == string(v.raw), "Decompressed contents don't match.")
size, err = shoco.compress(string(v.raw), buffer[:])
expect(t, err == nil, "Expected `compress` to return `nil`.")
msg = fmt.tprintf("Compressed %v bytes into %v. Expected to compress into %v bytes.", expected_raw, size, expected_compressed)
expect(t, size == expected_compressed, msg)
size, err = shoco.decompress(v.compressed, buffer[:expected_raw - 10])
msg = fmt.tprintf("Decompressing into too small a buffer returned %v, expected `.Output_Too_Short`", err)
expect(t, err == .Output_Too_Short, msg)
size, err = shoco.compress(string(v.raw), buffer[:expected_compressed - 10])
msg = fmt.tprintf("Compressing into too small a buffer returned %v, expected `.Output_Too_Short`", err)
expect(t, err == .Output_Too_Short, msg)
size, err = shoco.decompress(v.compressed[:v.short_pack], buffer[:])
expect(t, err == .Stream_Too_Short, "Expected `decompress` to return `Stream_Too_Short` because there was no more data after selecting a pack.")
size, err = shoco.decompress(v.compressed[:v.short_sentinel], buffer[:])
expect(t, err == .Stream_Too_Short, "Expected `decompress` to return `Stream_Too_Short` because there was no more data after non-ASCII sentinel.")
}
}