/** * @brief Decode UTF8 to codepoints. * * This is a simple implementation of a UTF-8 decoder with an * equivalent API to the older third-party (and much cooler...) * version that ToaruOS used to use. Keep feeding it bytes and * will eventually set *codep to a codepoint. Should also be able * to detect bad UTF-8. * * @copyright * This file is part of ToaruOS and is released under the terms * of the NCSA / University of Illinois License - see LICENSE.md * Copyright (C) 2018 K. Lange */ #include #define UTF8_ACCEPT 0 #define UTF8_REJECT 1 /** * Conceptually similar to its predecessor, this implementation is much * less cool, as it uses three separate state tables and more shifts. */ static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) { static int state_table[32] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xxxxxxx */ 1,1,1,1,1,1,1,1, /* 10xxxxxx */ 2,2,2,2, /* 110xxxxx */ 3,3, /* 1110xxxx */ 4, /* 11110xxx */ 1 /* 11111xxx */ }; static int mask_bytes[32] = { 0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F, 0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x1F,0x1F,0x1F,0x1F, 0x0F,0x0F, 0x07, 0x00 }; static int next[5] = { 0, 1, 0, 2, 3 }; if (*state == UTF8_ACCEPT) { *codep = byte & mask_bytes[byte >> 3]; *state = state_table[byte >> 3]; } else if (*state > 0) { *codep = (byte & 0x3F) | (*codep << 6); *state = next[*state]; } return *state; }