haiku/headers/build/private/interface/utf8_functions.h

/*
 * Copyright 2004-2010, Haiku, Inc.
 * Distributed under the terms of the MIT License.
 */
#ifndef _UTF8_FUNCTIONS_H
#define _UTF8_FUNCTIONS_H


#include <SupportDefs.h>


/*!	Tells whether \a ch has the bit pattern 10xxxxxx, i.e. whether it is a
	byte that continues a multibyte UTF-8 sequence rather than starting one.
*/
static inline bool
IsInsideGlyph(uchar ch)
{
	// only the two most significant bits decide
	const uchar topBits = ch & 0xc0;
	return topBits == 0x80;
}


/*!	Returns the number of bytes from \a text up to (not including) the next
	byte that is not a continuation byte. The first byte is always counted
	without being inspected, and \a text is not checked against NULL; use
	UTF8NextCharLen() when the pointer or the string may be empty.
*/
static inline uint32
UTF8NextCharLenUnsafe(const char *text)
{
	// step over the first byte unconditionally, then over its continuations
	const char *cursor = text + 1;
	while (IsInsideGlyph(*cursor))
		cursor++;

	return cursor - text;
}


/*!	Checked variant of UTF8NextCharLenUnsafe(): yields 0 when \a text is NULL
	or points at the terminating 0, otherwise the result of
	UTF8NextCharLenUnsafe().
*/
static inline uint32
UTF8NextCharLen(const char *text)
{
	if (text != NULL && *text != 0)
		return UTF8NextCharLenUnsafe(text);

	// nothing to measure
	return 0;
}


/*!	Walks backwards from \a text until a byte that is not a continuation byte
	has been stepped onto, or until \a limit is reached, and returns the
	number of bytes walked. Returns 0 if either pointer is NULL or if \a text
	already equals \a limit.
*/
static inline uint32
UTF8PreviousCharLen(const char *text, const char *limit)
{
	if (text == NULL || limit == NULL)
		return 0;

	const char *cursor = text;
	while (cursor != limit) {
		cursor--;
		if (!IsInsideGlyph(*cursor)) {
			// landed on the byte that starts the previous character
			break;
		}
	}

	return text - cursor;
}


/*!	UTF8CountBytes returns how many bytes the first \a numChars characters of
	the UTF8 string \a bytes occupy. Counting always stops at the terminating
	0. A negative \a numChars lifts the character limit (it is replaced by
	INT_MAX). A NULL string yields 0.
*/
static inline uint32
UTF8CountBytes(const char *bytes, int32 numChars)
{
	if (bytes == NULL)
		return 0;

	int32 remaining = numChars < 0 ? INT_MAX : numChars;

	const char *cursor;
	for (cursor = bytes; *cursor != '\0'; cursor++) {
		// Only bytes that are not continuation bytes use up the character
		// budget; stop in front of the first one that exceeds it.
		if ((*cursor & 0xc0) != 0x80 && --remaining < 0)
			break;
	}

	return cursor - bytes;
}


/*!	UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
	\a numBytes bytes are read. If \a numBytes is a negative value it is
	ignored and the string is read up to the terminating 0.

	Every byte that is not a continuation byte (10xxxxxx) counts as one
	character. Counting stops at the terminating 0 or after \a numBytes
	bytes, whichever comes first; no byte at or beyond offset \a numBytes is
	ever read, so a buffer of exactly \a numBytes bytes does not need to be
	0 terminated. A NULL string yields 0.
*/
static inline uint32
UTF8CountChars(const char *bytes, int32 numBytes)
{
	if (bytes == NULL)
		return 0;

	// One past the last byte that may be read; NULL means "no byte limit".
	// (bytes is not NULL here, so a bounded end can never be NULL.)
	const char *end = numBytes < 0 ? NULL : bytes + numBytes;

	uint32 length = 0;
	// The limit has to be tested before the byte is dereferenced, otherwise
	// the byte right behind the permitted range would be read.
	while ((end == NULL || bytes < end) && bytes[0] != '\0') {
		if ((bytes[0] & 0xc0) != 0x80)
			length++;
		bytes++;
	}

	return length;
}


/*!	UTF8ToCharCode converts the input that includes potential multibyte chars
	to UTF-32 char codes that can be used by FreeType. The string pointer is
	then advanced to the next character in the string. In case the terminating
	0 is reached, the string pointer is not advanced anymore and nulls are
	returned. This makes it safe to overruns and enables streamed processing
	of UTF8 strings.
*/
static inline uint32
UTF8ToCharCode(const char **bytes)
{
	#define UTF8_SUBSTITUTE_CHARACTER	0xfffd

	uint32 result;
	if (((*bytes)[0] & 0x80) == 0) {
		// a single byte character
		result = (*bytes)[0];
		if (result != '\0') {
			// do not advance beyond the terminating '\0'
			(*bytes)++;
		}

		return result;
	}

	if (((*bytes)[0] & 0xc0) == 0x80) {
		// not a proper multibyte start
		(*bytes)++;
		return UTF8_SUBSTITUTE_CHARACTER;
	}

	// start of a multibyte character
	uint8 mask = 0x80;
	result = (uint32)((*bytes)[0] & 0xff);
	(*bytes)++;

	while (result & mask) {
		if (mask == 0x02) {
			// seven byte char - invalid
			return UTF8_SUBSTITUTE_CHARACTER;
		}

		result &= ~mask;
		mask >>= 1;
	}

	while (((*bytes)[0] & 0xc0) == 0x80) {
		result <<= 6;
		result += (*bytes)[0] & 0x3f;
		(*bytes)++;

		mask <<= 1;
		if (mask == 0x40)
			return result;
	}

	if (mask == 0x40)
		return result;

	if ((*bytes)[0] == '\0') {
		// string terminated within multibyte char
		return 0x00;
	}

	// not enough bytes in multibyte char
	return UTF8_SUBSTITUTE_CHARACTER;

	#undef UTF8_SUBSTITUTE_CHARACTER
}

#endif	// _UTF8_FUNCTIONS_H
Part of fixing #7226: * the libbe_build version of BString was broken, at least with respect to LockBuffer() on an empty string - replaced the implementation and header with our current version (keeping the type-related changes required by the build-version) git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@40526 a95241bf-73f2-0310-859d-f6bbb57e9c96 2011-02-16 13:13:12 +03:00			`/*`
			`* Copyright 2004-2010, Haiku, Inc.`
			`* Distributed under the terms of the MIT License.`
			`*/`
			`#ifndef _UTF8_FUNCTIONS_H`
			`#define _UTF8_FUNCTIONS_H`


			`#include <SupportDefs.h>`


			`static inline bool`
			`IsInsideGlyph(uchar ch)`
			`{`
			`return (ch & 0xc0) == 0x80;`
			`}`


			`static inline uint32`
			`UTF8NextCharLenUnsafe(const char *text)`
			`{`
			`const char *ptr = text;`

			`do {`
			`ptr++;`
			`} while (IsInsideGlyph(*ptr));`

			`return ptr - text;`
			`}`


			`static inline uint32`
			`UTF8NextCharLen(const char *text)`
			`{`
			`if (text == NULL \|\| *text == 0)`
			`return 0;`

			`return UTF8NextCharLenUnsafe(text);`
			`}`


			`static inline uint32`
			`UTF8PreviousCharLen(const char *text, const char *limit)`
			`{`
			`const char *ptr = text;`

			`if (ptr == NULL \|\| limit == NULL)`
			`return 0;`

			`do {`
			`if (ptr == limit)`
			`break;`
			`ptr--;`
			`} while (IsInsideGlyph(*ptr));`

			`return text - ptr;`
			`}`


			`/*! UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to`
			`numChars characters are read. If numChars is a negative value it is ignored`
			`and the string is read up to the terminating 0.`
			`*/`
			`static inline uint32`
			`UTF8CountBytes(const char *bytes, int32 numChars)`
			`{`
			`if (bytes == NULL)`
			`return 0;`

			`if (numChars < 0)`
			`numChars = INT_MAX;`

			`const char *base = bytes;`
			`while (bytes[0] != '\0') {`
			`if ((bytes[0] & 0xc0) != 0x80) {`
			`if (--numChars < 0)`
			`break;`
			`}`
			`bytes++;`
			`}`

			`return bytes - base;`
			`}`


			`/*! UTF8CountChars gets the length (in characters) of a UTF8 string. Up to`
			`numBytes bytes are read. If numBytes is a negative value it is ignored`
			`and the string is read up to the terminating 0.`
			`*/`
			`static inline uint32`
			`UTF8CountChars(const char *bytes, int32 numBytes)`
			`{`
			`if (bytes == NULL)`
			`return 0;`

			`uint32 length = 0;`
			`const char *last;`
			`if (numBytes < 0)`
			`last = (const char *)SIZE_MAX;`
			`else`
			`last = bytes + numBytes - 1;`

			`while (bytes[0] && bytes <= last) {`
			`if ((bytes++[0] & 0xc0) != 0x80)`
			`length++;`
			`}`

			`return length;`
			`}`


			`/*! UTF8ToCharCode converts the input that includes potential multibyte chars`
			`to UTF-32 char codes that can be used by FreeType. The string pointer is`
			`then advanced to the next character in the string. In case the terminating`
			`0 is reached, the string pointer is not advanced anymore and nulls are`
			`returned. This makes it safe to overruns and enables streamed processing`
			`of UTF8 strings.`
			`*/`
			`static inline uint32`
			`UTF8ToCharCode(const char **bytes)`
			`{`
			`#define UTF8_SUBSTITUTE_CHARACTER 0xfffd`

			`uint32 result;`
			`if (((*bytes)[0] & 0x80) == 0) {`
			`// a single byte character`
			`result = (*bytes)[0];`
			`if (result != '\0') {`
			`// do not advance beyond the terminating '\0'`
			`(*bytes)++;`
			`}`

			`return result;`
			`}`

			`if (((*bytes)[0] & 0xc0) == 0x80) {`
			`// not a proper multibyte start`
			`(*bytes)++;`
			`return UTF8_SUBSTITUTE_CHARACTER;`
			`}`

			`// start of a multibyte character`
			`uint8 mask = 0x80;`
			`result = (uint32)((*bytes)[0] & 0xff);`
			`(*bytes)++;`

			`while (result & mask) {`
			`if (mask == 0x02) {`
			`// seven byte char - invalid`
			`return UTF8_SUBSTITUTE_CHARACTER;`
			`}`

			`result &= ~mask;`
			`mask >>= 1;`
			`}`

			`while (((*bytes)[0] & 0xc0) == 0x80) {`
			`result <<= 6;`
			`result += (*bytes)[0] & 0x3f;`
			`(*bytes)++;`

			`mask <<= 1;`
			`if (mask == 0x40)`
			`return result;`
			`}`

			`if (mask == 0x40)`
			`return result;`

			`if ((*bytes)[0] == '\0') {`
			`// string terminated within multibyte char`
			`return 0x00;`
			`}`

			`// not enough bytes in multibyte char`
			`return UTF8_SUBSTITUTE_CHARACTER;`

			`#undef UTF8_SUBSTITUTE_CHARACTER`
			`}`

			`#endif // _UTF8_FUNCTIONS_H`