/**
 * @brief Decode UTF8 to codepoints.
 *
 * This is a simple implementation of a UTF-8 decoder with an
 * equivalent API to the older third-party (and much cooler...)
 * version that ToaruOS used to use. Keep feeding it bytes and it
 * will eventually set *codep to a codepoint. Should also be able
 * to detect bad UTF-8.
 *
 * @copyright
 * This file is part of ToaruOS and is released under the terms
 * of the NCSA / University of Illinois License - see LICENSE.md
 * Copyright (C) 2018 K. Lange
 */
|
|
#include <stdint.h>
|
|
|
|
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/**
 * Conceptually similar to its predecessor, this implementation is much
 * less cool, as it uses three separate lookup tables and more shifts.
 *
 * Feed the decoder one input byte per call. Whenever the return value is
 * UTF8_ACCEPT, *codep holds a completed codepoint.
 *
 * @param state Decoder state, updated on every call. Start it at
 *              UTF8_ACCEPT. Values 2, 3 and 4 mean that one, two or
 *              three continuation bytes are still expected. UTF8_REJECT
 *              is sticky: the caller must reset *state to UTF8_ACCEPT
 *              to resume decoding after an error.
 * @param codep Accumulator for the codepoint being assembled. Only
 *              meaningful when the function returns UTF8_ACCEPT.
 * @param byte  Next input byte. Only the low 8 bits are used, so a
 *              sign-extended `char` is decoded like its unsigned value.
 * @return The new value of *state.
 *
 * Rejected input: a stray continuation byte (10xxxxxx) or a 11111xxx byte
 * where a sequence should start, a non-continuation byte in the middle of
 * a sequence, and any *state value outside the range this decoder produces.
 * Overlong encodings, surrogates and values above U+10FFFF are NOT detected.
 */
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
	/* State entered after a leading byte, indexed by its top five bits. */
	static const uint8_t state_table[32] = {
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xxxxxxx */
		1,1,1,1,1,1,1,1,                 /* 10xxxxxx */
		2,2,2,2,                         /* 110xxxxx */
		3,3,                             /* 1110xxxx */
		4,                               /* 11110xxx */
		1                                /* 11111xxx */
	};

	/* Payload bits carried by a leading byte, indexed the same way. */
	static const uint8_t mask_bytes[32] = {
		0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,
		0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,
		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
		0x1F,0x1F,0x1F,0x1F,
		0x0F,0x0F,
		0x07,
		0x00
	};

	/* State that follows each state once a continuation byte is consumed. */
	static const uint8_t next[5] = {
		0,
		1,
		0,
		2,
		3
	};

	/* Keep the table index below within 0..31 even for values above 0xFF. */
	byte &= 0xFF;

	if (*state == UTF8_ACCEPT) {
		*codep = byte & mask_bytes[byte >> 3];
		*state = state_table[byte >> 3];
	} else if (*state < 2 || *state > 4 || (byte & 0xC0) != 0x80) {
		/*
		 * Already rejected, a state value that would index past next[],
		 * or a byte that is not a 10xxxxxx continuation.
		 */
		*state = UTF8_REJECT;
	} else {
		*codep = (byte & 0x3F) | (*codep << 6);
		*state = next[*state];
	}

	return *state;
}
|