stb/tools/unicode.c

750 lines
21 KiB
C
Raw Normal View History

#define STB_DEFINE
#include "../stb.h"
// create unicode mappings
//
// Two kinds of mappings:
// map to a number
// map to a bit
//
// For mapping to a number, we use the following strategy:
//
// User supplies:
// 1. a table of numbers (for now we use uint16, so full Unicode table is 4MB)
// 2. a "don't care" value
// 3. define a 'fallback' value (typically 0)
// 4. define a fast-path range (typically 0..255 or 0..1023) [@TODO: automate detecting this]
//
// Code:
// 1. Determine range of *end* of unicode codepoints (U+10FFFF and down) which
// all have the same value (or don't care). If large enough, emit this as a
// special case in the code.
// 2. Repeat above, limited to at most U+FFFF.
// 3. Cluster the data into intervals of 8,16,32,64,128,256 numeric values.
// 3a. If all the values in an interval are fallback/dont-care, no further processing
// 3b. Find the "trimmed range" outside which all the values are the fallback or don't care
// 3c. Find the "special trimmed range" outside which all the values are some constant or don't care
// 4. Pack the clusters into continuous memory, and find previous instances of
// the cluster. Repeat for trimmed & special-trimmed. In the first case, find
// previous instances of the cluster (allow don't-care to match in either
// direction), both aligned and mis-aligned; in the latter, starting where
// things start or mis-aligned. Build an index table specifying the
// location of each cluster (and its length). Allow an extra indirection here;
// the full-sized index can index a smaller table which has the actual offset
// (and lengths).
// 5. Associate with each packed continuous memory above the amount of memory
// required to store the data w/ smallest datatype (of uint8, uint16, uint32).
// Discard the continuous memory. Recurse on each index table, but avoid the
// smaller packing.
//
// For mapping to a bit, we pack the results for 8 characters into a byte, and then apply
// the above strategy. Note that there may be more optimal approaches with e.g. packing
// 8 different bits into a single structure, though, which we should explore eventually.
// currently we limit *indices* to being 2^16, and we pack them as
// index + end_trim*2^16 + start_trim*2^24; specials have to go in a separate table
typedef uint32 uval;
#define UVAL_DONT_CARE_DEFAULT 0xffffffff
typedef struct
{
uval *input;
uint32 dont_care;
uint32 fallback;
int fastpath;
int length;
int depth;
int has_sign;
int splittable;
int replace_fallback_with_codepoint;
size_t input_size;
size_t inherited_storage;
} table;
typedef struct
{
int split_log2;
table result; // index into not-returned table
int storage;
} output;
typedef struct
{
table t;
char **output_name;
} info;
typedef struct
{
size_t path;
size_t size;
} result;
typedef struct
{
uint8 trim_end;
uint8 trim_start;
uint8 special;
uint8 aligned;
uint8 indirect;
uint16 overhead; // add some forced overhead for each mode to avoid getting complex encoding when it doesn't save much
} mode_info;
mode_info modes[] =
{
{ 0,0,0,0,0, 32, },
{ 0,0,0,0,1, 100, },
{ 0,0,0,1,0, 32, },
{ 0,0,0,1,1, 100, },
{ 0,0,1,0,1, 100, },
{ 0,0,1,1,0, 32, },
{ 0,0,1,1,1, 200, },
{ 1,0,0,0,0, 100, },
{ 1,0,0,0,1, 120, },
{ 1,1,0,0,0, 100, },
{ 1,1,0,0,1, 130, },
{ 1,0,1,0,0, 130, },
{ 1,0,1,0,1, 180, },
{ 1,1,1,0,0, 180, },
{ 1,1,1,0,1, 200, },
};
#define MODECOUNT (sizeof(modes)/sizeof(modes[0]))
#define CLUSTERSIZECOUNT 6 // 8,16, 32,64, 128,256
size_t size_for_max_number(uint32 number)
{
if (number == 0) return 0;
if (number < 256) return 1;
if (number < 256*256) return 2;
if (number < 256*256*256) return 3;
return 4;
}
size_t size_for_max_number_aligned(uint32 number)
{
size_t n = size_for_max_number(number);
return n == 3 ? 4 : n;
}
uval get_data(uval *data, int offset, uval *end)
{
if (data + offset >= end)
return 0;
else
return data[offset];
}
int safe_len(uval *data, int len, uval *end)
{
if (len > end - data)
return end - data;
return len;
}
uval tempdata[256];
int dirty=0;
size_t find_packed(uval **packed, uval *data, int len, int aligned, int fastpath, uval *end, int offset, int replace)
{
int packlen = stb_arr_len(*packed);
int i,p;
if (data+len > end || replace) {
int safelen = safe_len(data, len, end);
memset(tempdata, 0, dirty*sizeof(tempdata[0]));
memcpy(tempdata, data, safelen * sizeof(data[0]));
data = tempdata;
dirty = len;
}
if (replace) {
int i;
int safelen = safe_len(data, len, end);
for (i=0; i < safelen; ++i)
if (data[i] == 0)
data[i] = offset+i;
}
if (len <= 0)
return 0;
if (!fastpath) {
if (aligned) {
for (i=0; i < packlen; i += len)
if ((*packed)[i] == data[0] && 0==memcmp(&(*packed)[i], data, len * sizeof(uval)))
return i / len;
} else {
for (i=0; i < packlen-len+1; i += 1 )
if ((*packed)[i] == data[0] && 0==memcmp(&(*packed)[i], data, len * sizeof(uval)))
return i;
}
}
p = stb_arr_len(*packed);
for (i=0; i < len; ++i)
stb_arr_push(*packed, data[i]);
return p;
}
void output_table(char *name1, char *name2, uval *data, int length, int sign, char **names)
{
char temp[20];
uval maxv = 0;
int bytes, numlen, at_newline;
int linelen = 79; // @TODO: make table more readable by choosing a length that's a multiple?
int i,pos, do_split=0;
for (i=0; i < length; ++i)
if (sign)
maxv = stb_max(maxv, (uval)abs((int)data[i]));
else
maxv = stb_max(maxv, data[i]);
bytes = size_for_max_number_aligned(maxv);
sprintf(temp, "%d", maxv);
numlen=strlen(temp);
if (sign)
++numlen;
if (bytes == 0)
return;
printf("uint%d %s%s[%d] = {\n", bytes*8, name1, name2, length);
at_newline = 1;
for (i=0; i < length; ++i) {
if (pos + numlen + 2 > linelen) {
printf("\n");
at_newline = 1;
pos = 0;
}
if (at_newline) {
printf(" ");
pos = 2;
at_newline = 0;
} else {
printf(" ");
++pos;
}
printf("%*d,", numlen, data[i]);
pos += numlen+1;
}
if (!at_newline) printf("\n");
printf("};\n");
}
void output_table_with_trims(char *name1, char *name2, uval *data, int length)
{
uval maxt=0, maxp=0;
int i,d,s,e, count;
// split the table into two pieces
uval *trims = NULL;
if (length == 0)
return;
for (i=0; i < stb_arr_len(data); ++i) {
stb_arr_push(trims, data[i] >> 16);
data[i] &= 0xffff;
maxt = stb_max(maxt, trims[i]);
maxp = stb_max(maxp, data[i]);
}
d=s=e=1;
if (maxt >= 256) {
// need to output start & end values
if (maxp >= 256) {
// can pack into a single table
printf("struct { uint16 val; uint8 start, end; } %s%s[%d] = {\n", name1, name2, length);
} else {
output_table(name1, name2, data, length, 0, 0);
d=0;
printf("struct { uint8 start, end; } %s%s_trim[%d] = {\n", name1, name2, length);
}
} else if (maxt > 0) {
if (maxp >= 256) {
output_table(name1, name2, data, length, 0, 0);
output_table(name1, stb_sprintf("%s_end", name2), trims, length, 0, 0);
return;
} else {
printf("struct { uint8 val, end; } %s%s[%d] = {\n", name1, name2, length);
s=0;
}
} else {
output_table(name1, name2, data, length, 0, 0);
return;
}
// d or s can be zero (but not both), e is always present and last
count = d + s + e;
assert(count >= 2 && count <= 3);
{
char temp[60];
uval maxv = 0;
int numlen, at_newline, len;
int linelen = 79; // @TODO: make table more readable by choosing a length that's a multiple?
int i,pos, do_split=0;
numlen = 0;
for (i=0; i < length; ++i) {
if (count == 2)
sprintf(temp, "{%d,%d}", d ? data[i] : (trims[i]>>8), trims[i]&255);
else
sprintf(temp, "{%d,%d,%d}", data[i], trims[i]>>8, trims[i]&255);
len = strlen(temp);
numlen = stb_max(len, numlen);
}
at_newline = 1;
for (i=0; i < length; ++i) {
if (pos + numlen + 2 > linelen) {
printf("\n");
at_newline = 1;
pos = 0;
}
if (at_newline) {
printf(" ");
pos = 2;
at_newline = 0;
} else {
printf(" ");
++pos;
}
if (count == 2)
sprintf(temp, "{%d,%d}", d ? data[i] : (trims[i]>>8), trims[i]&255);
else
sprintf(temp, "{%d,%d,%d}", data[i], trims[i]>>8, trims[i]&255);
printf("%*s,", numlen, temp);
pos += numlen+1;
}
if (!at_newline) printf("\n");
printf("};\n");
}
}
int weight=1;
table pack_for_mode(table *t, int mode, char *table_name)
{
size_t extra_size;
int i;
uval maxv;
mode_info mi = modes[mode % MODECOUNT];
int size = 8 << (mode / MODECOUNT);
table newtab;
uval *packed = NULL;
uval *index = NULL;
uval *indirect = NULL;
uval *specials = NULL;
newtab.dont_care = UVAL_DONT_CARE_DEFAULT;
if (table_name)
printf("// clusters of %d\n", size);
for (i=0; i < t->length; i += size) {
uval newval;
int fastpath = (i < t->fastpath);
if (mi.special) {
int end_trim = size-1;
int start_trim = 0;
uval special;
// @TODO: pick special from start or end instead of only end depending on which is longer
for(;;) {
special = t->input[i + end_trim];
if (special != t->dont_care || end_trim == 0)
break;
--end_trim;
}
// at this point, special==inp[end_trim], and end_trim >= 0
if (special == t->dont_care && !fastpath) {
// entire block is don't care, so OUTPUT don't care
stb_arr_push(index, newtab.dont_care);
continue;
} else {
uval pos, trim;
if (mi.trim_end && !fastpath) {
while (end_trim >= 0) {
if (t->input[i + end_trim] == special || t->input[i + end_trim] == t->dont_care)
--end_trim;
else
break;
}
}
if (mi.trim_start && !fastpath) {
while (start_trim < end_trim) {
if (t->input[i + start_trim] == special || t->input[i + start_trim] == t->dont_care)
++start_trim;
else
break;
}
}
// end_trim points to the last character we have to output
// find the first match, or add it
pos = find_packed(&packed, &t->input[i+start_trim], end_trim-start_trim+1, mi.aligned, fastpath, &t->input[t->length], i+start_trim, t->replace_fallback_with_codepoint);
// encode as a uval
if (!mi.trim_end) {
if (end_trim == 0)
pos = special;
else
pos = pos | 0x80000000;
} else {
assert(end_trim < size && end_trim >= -1);
if (!fastpath) assert(end_trim < size-1); // special always matches last one
assert(end_trim < size && end_trim+1 >= 0);
if (!fastpath) assert(end_trim+1 < size);
if (mi.trim_start)
trim = start_trim*256 + (end_trim+1);
else
trim = end_trim+1;
assert(pos < 65536); // @TODO: if this triggers, just bail on this search path
pos = pos + (trim << 16);
}
newval = pos;
stb_arr_push(specials, special);
}
} else if (mi.trim_end) {
int end_trim = size-1;
int start_trim = 0;
uval pos, trim;
while (end_trim >= 0 && !fastpath)
if (t->input[i + end_trim] == t->fallback || t->input[i + end_trim] == t->dont_care)
--end_trim;
else
break;
if (mi.trim_start && !fastpath) {
while (start_trim < end_trim) {
if (t->input[i + start_trim] == t->fallback || t->input[i + start_trim] == t->dont_care)
++start_trim;
else
break;
}
}
// end_trim points to the last character we have to output, and can be -1
++end_trim; // make exclusive at end
if (end_trim == 0 && size == 256)
start_trim = end_trim = 1; // we can't make encode a length from 0..256 in 8 bits, so restrict end_trim to 1..256
// find the first match, or add it
pos = find_packed(&packed, &t->input[i+start_trim], end_trim - start_trim, mi.aligned, fastpath, &t->input[t->length], i+start_trim, t->replace_fallback_with_codepoint);
assert(end_trim <= size && end_trim >= 0);
if (size == 256)
assert(end_trim-1 < 256 && end_trim-1 >= 0);
else
assert(end_trim < 256 && end_trim >= 0);
if (size == 256)
--end_trim;
if (mi.trim_start)
trim = start_trim*256 + end_trim;
else
trim = end_trim;
assert(pos < 65536); // @TODO: if this triggers, just bail on this search path
pos = pos + (trim << 16);
newval = pos;
} else {
newval = find_packed(&packed, &t->input[i], size, mi.aligned, fastpath, &t->input[t->length], i, t->replace_fallback_with_codepoint);
}
if (mi.indirect) {
int j;
for (j=0; j < stb_arr_len(indirect); ++j)
if (indirect[j] == newval)
break;
if (j == stb_arr_len(indirect))
stb_arr_push(indirect, newval);
stb_arr_push(index, j);
} else {
stb_arr_push(index, newval);
}
}
// total up the new size for everything but the index table
extra_size = mi.overhead * weight; // not the actual overhead cost; a penalty to avoid excessive complexity
extra_size += 150; // per indirection
if (table_name)
extra_size = 0;
if (t->has_sign) {
// 'packed' contains two values, which should be packed positive & negative for size
uval maxv2;
for (i=0; i < stb_arr_len(packed); ++i)
if (packed[i] & 0x80000000)
maxv2 = stb_max(maxv2, packed[i]);
else
maxv = stb_max(maxv, packed[i]);
maxv = stb_max(maxv, maxv2) << 1;
} else {
maxv = 0;
for (i=0; i < stb_arr_len(packed); ++i)
if (packed[i] > maxv && packed[i] != t->dont_care)
maxv = packed[i];
}
extra_size += stb_arr_len(packed) * (t->splittable ? size_for_max_number(maxv) : size_for_max_number_aligned(maxv));
if (table_name) {
if (t->splittable)
output_table_with_trims(table_name, "", packed, stb_arr_len(packed));
else
output_table(table_name, "", packed, stb_arr_len(packed), t->has_sign, NULL);
}
maxv = 0;
for (i=0; i < stb_arr_len(specials); ++i)
if (specials[i] > maxv)
maxv = specials[i];
extra_size += stb_arr_len(specials) * size_for_max_number_aligned(maxv);
if (table_name)
output_table(table_name, "_default", specials, stb_arr_len(specials), 0, NULL);
maxv = 0;
for (i=0; i < stb_arr_len(indirect); ++i)
if (indirect[i] > maxv)
maxv = indirect[i];
extra_size += stb_arr_len(indirect) * size_for_max_number(maxv);
if (table_name && stb_arr_len(indirect)) {
if (mi.trim_end)
output_table_with_trims(table_name, "_index", indirect, stb_arr_len(indirect));
else {
assert(0); // this case should only trigger in very extreme circumstances
output_table(table_name, "_index", indirect, stb_arr_len(indirect), 0, NULL);
}
mi.trim_end = mi.special = 0;
}
if (table_name)
printf("// above tables should be %d bytes\n", extra_size);
maxv = 0;
for (i=0; i < stb_arr_len(index); ++i)
if (index[i] > maxv && index[i] != t->dont_care)
maxv = index[i];
newtab.splittable = mi.trim_end;
newtab.input_size = newtab.splittable ? size_for_max_number(maxv) : size_for_max_number_aligned(maxv);
newtab.input = index;
newtab.length = stb_arr_len(index);
newtab.inherited_storage = t->inherited_storage + extra_size;
newtab.fastpath = 0;
newtab.depth = t->depth+1;
stb_arr_free(indirect);
stb_arr_free(packed);
stb_arr_free(specials);
return newtab;
}
result pack_table(table *t, size_t path, int min_storage)
{
int i;
result best;
best.size = t->inherited_storage + t->input_size * t->length;
best.path = path;
if ((int) t->inherited_storage > min_storage) {
best.size = stb_max(best.size, t->inherited_storage);
return best;
}
if (t->length <= 256 || t->depth >= 4) {
//printf("%08x: %7d\n", best.path, best.size);
return best;
}
path <<= 7;
for (i=0; i < MODECOUNT * CLUSTERSIZECOUNT; ++i) {
table newtab;
result r;
newtab = pack_for_mode(t, i, 0);
r = pack_table(&newtab, path+i+1, min_storage);
if (r.size < best.size)
best = r;
stb_arr_free(newtab.input);
//printf("Size: %6d + %6d\n", newtab.inherited_storage, newtab.input_size * newtab.length);
}
return best;
}
int pack_table_by_modes(table *t, int *modes)
{
table s = *t;
while (*modes > -1) {
table newtab;
newtab = pack_for_mode(&s, *modes, 0);
if (s.input != t->input)
stb_arr_free(s.input);
s = newtab;
++modes;
}
return s.inherited_storage + s.input_size * s.length;
}
int strip_table(table *t, int exceptions)
{
uval terminal_value;
int p = t->length-1;
while (t->input[p] == t->dont_care)
--p;
terminal_value = t->input[p];
while (p >= 0x10000) {
if (t->input[p] != terminal_value && t->input[p] != t->dont_care) {
if (exceptions)
--exceptions;
else
break;
}
--p;
}
return p+1; // p is a character we must output
}
void optimize_table(table *t, char *table_name)
{
int modelist[3] = { 85, -1 };
int modes[8];
int num_modes = 0;
int decent_size;
result r;
size_t path;
table s;
// strip tail end of table
int orig_length = t->length;
int threshhold = 0xffff;
int p = strip_table(t, 2);
int len_saved = t->length - p;
if (len_saved >= threshhold) {
t->length = p;
while (p > 0x10000) {
p = strip_table(t, 0);
len_saved = t->length - p;
if (len_saved < 0x10000)
break;
len_saved = orig_length - p;
if (len_saved < threshhold)
break;
threshhold *= 2;
}
}
t->depth = 1;
// find size of table if we use path 86
decent_size = pack_table_by_modes(t, modelist);
#if 1
// find best packing of remainder of table by exploring tree of packings
r = pack_table(t, 0, decent_size);
// use the computed 'path' to evaluate and output tree
path = r.path;
#else
path = 86;//90;//132097;
#endif
while (path) {
modes[num_modes++] = (path & 127) - 1;
path >>= 7;
}
printf("// modes: %d\n", r.path);
s = *t;
while (num_modes > 0) {
char name[256];
sprintf(name, "%s_%d", table_name, num_modes+1);
--num_modes;
s = pack_for_mode(&s, modes[num_modes], name);
}
// output the final table as-is
if (s.splittable)
output_table_with_trims(table_name, "_1", s.input, s.length);
else
output_table(table_name, "_1", s.input, s.length, 0, NULL);
}
uval unicode_table[0x110000];
typedef struct
{
uval lo,hi;
} char_range;
char_range get_range(char *str)
{
char_range cr;
char *p;
cr.lo = strtol(str, &p, 16);
p = stb_skipwhite(p);
if (*p == '.')
cr.hi = strtol(p+2, NULL, 16);
else
cr.hi = cr.lo;
return cr;
}
char *skip_semi(char *s, int count)
{
while (count) {
s = strchr(s, ';');
assert(s != NULL);
++s;
--count;
}
return s;
}
int main(int argc, char **argv)
{
table t;
uval maxv=0;
int i,n=0;
char **s = stb_stringfile("../../data/UnicodeData.txt", &n);
assert(s);
for (i=0; i < n; ++i) {
if (s[i][0] == '#' || s[i][0] == '\n' || s[i][0] == 0)
;
else {
char_range cr = get_range(s[i]);
char *t = skip_semi(s[i], 13);
uval j, v;
if (*t == ';' || *t == '\n' || *t == 0)
v = 0;
else {
v = strtol(t, NULL, 16);
if (v < 65536) {
maxv = stb_max(v, maxv);
for (j=cr.lo; j <= cr.hi; ++j) {
unicode_table[j] = v;
//printf("%06x => %06x\n", j, v);
}
}
}
}
}
t.depth = 0;
t.dont_care = UVAL_DONT_CARE_DEFAULT;
t.fallback = 0;
t.fastpath = 256;
t.inherited_storage = 0;
t.has_sign = 0;
t.splittable = 0;
t.input = unicode_table;
t.input_size = size_for_max_number(maxv);
t.length = 0x110000;
t.replace_fallback_with_codepoint = 1;
optimize_table(&t, "stbu_upppercase");
return 0;
}