2006-04-26 14:16:19 +04:00
|
|
|
/*
 * Copyright 2004-2010, Haiku, Inc.
 * Distributed under the terms of the MIT License.
 */
|
|
|
|
#ifndef _UTF8_FUNCTIONS_H
|
|
|
|
#define _UTF8_FUNCTIONS_H
|
2004-01-15 10:15:37 +03:00
|
|
|
|
2006-04-26 00:12:06 +04:00
|
|
|
|
|
|
|
#include <SupportDefs.h>
|
|
|
|
|
2004-01-15 10:15:37 +03:00
|
|
|
|
|
|
|
static inline bool
|
|
|
|
IsInsideGlyph(uchar ch)
|
|
|
|
{
|
2006-08-30 02:20:18 +04:00
|
|
|
return (ch & 0xc0) == 0x80;
|
2004-01-15 10:15:37 +03:00
|
|
|
}
|
|
|
|
|
2006-12-26 00:58:00 +03:00
|
|
|
|
2004-01-15 10:15:37 +03:00
|
|
|
/*!	Returns the distance in bytes from \a text to the next byte that is not
	a UTF-8 continuation byte.

	"Unsafe" because no check is made on \a text: it must not be NULL, and
	the byte at \a text is always stepped over, even if it is the
	terminating 0.
*/
static inline uint32
UTF8NextCharLenUnsafe(const char *text)
{
	// The first byte is skipped unconditionally; afterwards keep going for
	// as long as continuation bytes follow.
	const char *end = text + 1;
	while (IsInsideGlyph(*end))
		end++;

	return end - text;
}
|
|
|
|
|
2006-12-26 00:58:00 +03:00
|
|
|
|
2005-06-15 21:09:00 +04:00
|
|
|
/*!	Checked variant of UTF8NextCharLenUnsafe(): returns 0 for a NULL or
	empty string, otherwise the value computed by UTF8NextCharLenUnsafe().
*/
static inline uint32
UTF8NextCharLen(const char *text)
{
	if (text != NULL && *text != 0)
		return UTF8NextCharLenUnsafe(text);

	return 0;
}
|
|
|
|
|
2006-12-26 00:58:00 +03:00
|
|
|
|
2011-12-06 22:09:56 +04:00
|
|
|
/*!	Returns the length in bytes of the UTF-8 sequence starting at \a bytes,
	looking at no more than \a length bytes.

	0 is returned when \a bytes is NULL, \a length is 0, the first byte is 0
	or a continuation byte, the lead byte announces seven or more bytes, the
	announced sequence does not fit into \a length, or one of the trailing
	bytes is not a continuation byte.
*/
static inline uint32
UTF8NextCharLen(const char *bytes, size_t length)
{
	if (bytes == NULL || length == 0)
		return 0;

	const uint8 lead = (uint8)bytes[0];
	if (lead == 0)
		return 0;

	// High bit clear: a single ASCII char - or so...
	if (lead < 0x80)
		return 1;

	// A continuation byte is not a proper multibyte start.
	if (IsInsideGlyph(lead))
		return 0;

	// Both upper bits are known to be set at this point. Every further
	// consecutive set bit below them announces one more trailing byte.
	size_t bytesExpected = 2;
	for (uint8 mask = 0x20; (lead & mask) != 0; mask >>= 1) {
		// Reaching this bit would mean a seven byte char - invalid.
		if (mask == 0x02)
			return 0;

		bytesExpected++;
	}

	// There would need to be more bytes to satisfy the char.
	if (length < bytesExpected)
		return 0;

	// The lead byte is fine; every remaining byte has to be a continuation
	// byte, otherwise the sequence is incomplete.
	size_t index = 1;
	while (index < bytesExpected) {
		if (!IsInsideGlyph(bytes[index]))
			return 0;
		index++;
	}

	return bytesExpected;
}
|
|
|
|
|
|
|
|
|
2005-05-21 03:51:33 +04:00
|
|
|
/*!	Returns the number of bytes between \a text and the start of the
	character preceding it. The backwards scan never moves past \a limit.
	Returns 0 if either pointer is NULL or if \a text already equals
	\a limit.
*/
static inline uint32
UTF8PreviousCharLen(const char *text, const char *limit)
{
	if (text == NULL || limit == NULL)
		return 0;

	const char *start = text;
	while (start != limit) {
		start--;

		// Stop on the first byte that can begin a character.
		if (!IsInsideGlyph(*start))
			break;
	}

	return text - start;
}
|
|
|
|
|
2005-05-13 21:30:59 +04:00
|
|
|
|
2006-12-26 00:58:00 +03:00
|
|
|
/*! UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
|
|
|
|
numChars characters are read. If numChars is a negative value it is ignored
|
2006-12-26 01:02:01 +03:00
|
|
|
and the string is read up to the terminating 0.
|
2006-12-26 00:58:00 +03:00
|
|
|
*/
|
2005-06-15 21:09:00 +04:00
|
|
|
static inline uint32
|
2006-12-26 00:58:00 +03:00
|
|
|
UTF8CountBytes(const char *bytes, int32 numChars)
|
2005-06-15 21:09:00 +04:00
|
|
|
{
|
2008-02-28 22:17:33 +03:00
|
|
|
if (bytes == NULL)
|
2006-12-26 00:58:00 +03:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (numChars < 0)
|
|
|
|
numChars = INT_MAX;
|
|
|
|
|
|
|
|
const char *base = bytes;
|
2008-02-28 22:17:33 +03:00
|
|
|
while (bytes[0] != '\0') {
|
|
|
|
if ((bytes[0] & 0xc0) != 0x80) {
|
|
|
|
if (--numChars < 0)
|
|
|
|
break;
|
2005-06-15 21:09:00 +04:00
|
|
|
}
|
2008-02-28 22:17:33 +03:00
|
|
|
bytes++;
|
2005-06-15 21:09:00 +04:00
|
|
|
}
|
2006-12-26 00:58:00 +03:00
|
|
|
|
2008-02-28 22:17:33 +03:00
|
|
|
return bytes - base;
|
2005-05-13 21:30:59 +04:00
|
|
|
}
|
|
|
|
|
2006-12-26 00:58:00 +03:00
|
|
|
|
|
|
|
/*! UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
|
|
|
|
numBytes bytes are read. If numBytes is a negative value it is ignored
|
2006-12-26 01:02:01 +03:00
|
|
|
and the string is read up to the terminating 0.
|
2006-12-26 00:58:00 +03:00
|
|
|
*/
|
2004-01-15 10:15:37 +03:00
|
|
|
static inline uint32
|
2006-12-26 00:58:00 +03:00
|
|
|
UTF8CountChars(const char *bytes, int32 numBytes)
|
2004-01-15 10:15:37 +03:00
|
|
|
{
|
2008-02-28 22:17:33 +03:00
|
|
|
if (bytes == NULL)
|
2006-12-26 00:58:00 +03:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
uint32 length = 0;
|
2008-02-28 22:17:33 +03:00
|
|
|
const char *last;
|
2006-12-26 00:58:00 +03:00
|
|
|
if (numBytes < 0)
|
2010-05-28 22:37:36 +04:00
|
|
|
last = (const char *)SIZE_MAX;
|
2008-02-28 22:17:33 +03:00
|
|
|
else
|
|
|
|
last = bytes + numBytes - 1;
|
2006-12-26 00:58:00 +03:00
|
|
|
|
2008-02-28 22:17:33 +03:00
|
|
|
while (bytes[0] && bytes <= last) {
|
|
|
|
if ((bytes++[0] & 0xc0) != 0x80)
|
|
|
|
length++;
|
2005-05-21 03:51:33 +04:00
|
|
|
}
|
|
|
|
|
2006-12-26 00:58:00 +03:00
|
|
|
return length;
|
2004-01-15 10:15:37 +03:00
|
|
|
}
|
|
|
|
|
2005-05-21 03:51:33 +04:00
|
|
|
|
2006-08-30 02:20:18 +04:00
|
|
|
/*! UTF8ToCharCode converts the input that includes potential multibyte chars
|
2006-02-06 02:36:59 +03:00
|
|
|
to UTF-32 char codes that can be used by FreeType. The string pointer is
|
|
|
|
then advanced to the next character in the string. In case the terminating
|
2008-02-29 02:04:47 +03:00
|
|
|
0 is reached, the string pointer is not advanced anymore and nulls are
|
2006-02-06 02:36:59 +03:00
|
|
|
returned. This makes it safe to overruns and enables streamed processing
|
2006-08-30 02:20:18 +04:00
|
|
|
of UTF8 strings.
|
|
|
|
*/
|
2006-02-06 02:36:59 +03:00
|
|
|
static inline uint32
|
|
|
|
UTF8ToCharCode(const char **bytes)
|
|
|
|
{
|
2008-02-29 02:04:47 +03:00
|
|
|
#define UTF8_SUBSTITUTE_CHARACTER 0xfffd
|
|
|
|
|
|
|
|
uint32 result;
|
|
|
|
if (((*bytes)[0] & 0x80) == 0) {
|
|
|
|
// a single byte character
|
|
|
|
result = (*bytes)[0];
|
|
|
|
if (result != '\0') {
|
|
|
|
// do not advance beyond the terminating '\0'
|
|
|
|
(*bytes)++;
|
2006-02-06 02:36:59 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2008-02-29 02:04:47 +03:00
|
|
|
if (((*bytes)[0] & 0xc0) == 0x80) {
|
|
|
|
// not a proper multibyte start
|
|
|
|
(*bytes)++;
|
|
|
|
return UTF8_SUBSTITUTE_CHARACTER;
|
2006-02-06 02:36:59 +03:00
|
|
|
}
|
|
|
|
|
2008-02-29 02:04:47 +03:00
|
|
|
// start of a multibyte character
|
|
|
|
uint8 mask = 0x80;
|
|
|
|
result = (uint32)((*bytes)[0] & 0xff);
|
2006-02-06 02:36:59 +03:00
|
|
|
(*bytes)++;
|
2008-02-29 02:04:47 +03:00
|
|
|
|
|
|
|
while (result & mask) {
|
|
|
|
if (mask == 0x02) {
|
|
|
|
// seven byte char - invalid
|
|
|
|
return UTF8_SUBSTITUTE_CHARACTER;
|
|
|
|
}
|
|
|
|
|
|
|
|
result &= ~mask;
|
|
|
|
mask >>= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (((*bytes)[0] & 0xc0) == 0x80) {
|
|
|
|
result <<= 6;
|
|
|
|
result += (*bytes)[0] & 0x3f;
|
|
|
|
(*bytes)++;
|
|
|
|
|
|
|
|
mask <<= 1;
|
|
|
|
if (mask == 0x40)
|
2008-10-14 17:16:44 +04:00
|
|
|
return result;
|
2008-02-29 02:04:47 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (mask == 0x40)
|
|
|
|
return result;
|
|
|
|
|
|
|
|
if ((*bytes)[0] == '\0') {
|
|
|
|
// string terminated within multibyte char
|
|
|
|
return 0x00;
|
|
|
|
}
|
|
|
|
|
|
|
|
// not enough bytes in multibyte char
|
|
|
|
return UTF8_SUBSTITUTE_CHARACTER;
|
|
|
|
|
|
|
|
#undef UTF8_SUBSTITUTE_CHARACTER
|
2006-02-06 02:36:59 +03:00
|
|
|
}
|
|
|
|
|
2006-04-26 14:16:19 +04:00
|
|
|
#endif // _UTF8_FUNCTIONS_H
|