haiku/headers/private/interface/utf8_functions.h

/*
 * Copyright 2004-2006, Haiku, Inc.
 * Distributed under the terms of the MIT License.
 */
#ifndef _UTF8_FUNCTIONS_H
#define _UTF8_FUNCTIONS_H


#include <SupportDefs.h>


/*	Tells whether the given byte is a UTF-8 continuation byte, i.e. a byte
	with the bit pattern 10xxxxxx that sits inside a multibyte glyph instead
	of starting one. */
static inline bool
IsInsideGlyph(uchar ch)
{
	// only the two topmost bits decide; they must read "1, 0"
	return (ch >> 6) == 0x02;
}

/*	Returns the number of bytes from "text" up to, but not including, the
	next byte that is not a continuation byte. The first byte is always
	counted without being inspected, so the result is at least 1.
	"Unsafe" because "text" is neither checked for NULL nor for pointing at
	the terminating 0; use UTF8NextCharLen() when that matters. */
static inline uint32
UTF8NextCharLenUnsafe(const char *text)
{
	uint32 length = 1;

	// swallow every continuation byte that follows the first byte
	while (IsInsideGlyph(text[length]))
		length++;

	return length;
}

/*	Checked front end for UTF8NextCharLenUnsafe(): returns 0 for a NULL
	pointer or when "text" points at the terminating 0, otherwise the byte
	length of the character starting at "text". */
static inline uint32
UTF8NextCharLen(const char *text)
{
	if (text != NULL && *text != 0)
		return UTF8NextCharLenUnsafe(text);

	return 0;
}

/*	Returns how many bytes lie between "text" and the start of the character
	that precedes it. The search walks backwards over continuation bytes and
	never steps back beyond "limit". Returns 0 if either pointer is NULL or
	if "text" already equals "limit". */
static inline uint32
UTF8PreviousCharLen(const char *text, const char *limit)
{
	const char *cursor = text;

	if (text == NULL || limit == NULL)
		return 0;

	while (cursor != limit) {
		cursor--;
		// the first byte that is not a continuation byte starts the glyph
		if (!IsInsideGlyph(*cursor))
			break;
	}

	return text - cursor;
}

// TODO: use this function in other places of this file...
/*	Returns the byte count announced by "ch" when it is read as the first
	byte of a glyph: a byte with the top bit clear yields 1, otherwise the
	result is the number of consecutive set bits counted from the top bit
	downwards. A lone continuation byte (10xxxxxx) therefore yields 1, and
	0xFF yields 8; no validity check is done. */
static inline uint32
count_utf8_bytes(uchar ch)
{
	uint32 length = 1;
	uchar mask;

	// plain 7 bit character
	if ((ch & 0x80) == 0)
		return length;

	// every further leading 1 bit below the top bit adds one byte
	for (mask = 0x40; (ch & mask) != 0; mask >>= 1)
		length++;

	return length;
}

/*	Returns the number of bytes occupied by the first "numChars" characters
	of the 0-terminated UTF-8 string "text".
	A character is one byte plus all continuation bytes (10xxxxxx) that
	directly follow it, which is the same rule UTF8NextCharLen() and
	UTF8CountChars() use. Counting stops at the terminating 0, so passing a
	"numChars" larger than the string, or a string that ends in the middle
	of a multibyte sequence, never reads beyond the terminator.
	Returns 0 if "text" is NULL. */
static inline uint32
UTF8CountBytes(const char *text, uint32 numChars)
{
	const char *ptr = text;

	if (text == NULL)
		return 0;

	while (numChars > 0 && *ptr != '\0') {
		// the first byte of the glyph...
		ptr++;

		// ...and its continuation bytes (same test as IsInsideGlyph());
		// the terminating 0 is not a continuation byte and ends this loop
		while (((uchar)*ptr & 0xC0) == 0x80)
			ptr++;

		numChars--;
	}

	return ptr - text;
}

/*	Returns the number of UTF-8 characters found in the first "numBytes"
	bytes of "text". A character is one byte plus all continuation bytes
	(10xxxxxx) that directly follow it; a character that starts within the
	range is counted even if its continuation bytes are cut off by the
	range end.
	Counting also stops at a 0 byte, so a string that is shorter than
	"numBytes" is handled. No byte at or beyond text + numBytes is read.
	Returns 0 if "text" is NULL or "numBytes" is not positive. */
static inline uint32
UTF8CountChars(const char *text, int32 numBytes)
{
	const char *ptr = text;
	const char *end;
	uint32 count = 0;

	if (text == NULL || numBytes <= 0)
		return 0;

	end = text + numBytes;

	while (ptr < end && *ptr != '\0') {
		// the first byte of the glyph...
		ptr++;

		// ...and its continuation bytes (same test as IsInsideGlyph())
		while (ptr < end && ((uchar)*ptr & 0xC0) == 0x80)
			ptr++;

		count++;
	}

	return count;
}


/*	UTF8ToCharCode converts the input that includes potential multibyte chars
	to UTF-32 char codes that can be used by FreeType. The string pointer is
	then advanced to the next character in the string.

	- When the terminating 0 is reached, 0 is returned and the string pointer
	  is not advanced anymore. This makes it safe to call repeatedly and
	  enables streamed processing of UTF8 strings.
	- A byte that cannot start a character (a stray continuation byte or a
	  lead byte announcing five or more bytes) is skipped and a space (0x20)
	  is returned in its place.
	- A multibyte sequence that is cut short (the next byte is not a
	  continuation byte, e.g. because the string ends) also yields a space.
	  The pointer is then left on the offending byte, so the terminating 0
	  is never stepped over.

	No check for overlong encodings or surrogate code points is done. */
static inline uint32
UTF8ToCharCode(const char **bytes)
{
	const uchar *cursor = (const uchar *)*bytes;
	uchar lead = cursor[0];
	uint32 result;
	uint32 trailing;
	uint32 i;

	if (lead == 0) {
		/*	We do not advance beyond the terminating 0. */
		return 0x00;
	}

	if ((lead & 0x80) == 0) {
		/* A plain single byte char */
		(*bytes)++;
		return lead;
	}

	if ((lead & 0xE0) == 0xC0) {
		/* A two byte char */
		trailing = 1;
		result = lead & 0x1f;
	} else if ((lead & 0xF0) == 0xE0) {
		/* A three byte char */
		trailing = 2;
		result = lead & 0x0f;
	} else if ((lead & 0xF8) == 0xF0) {
		/* A four byte char */
		trailing = 3;
		result = lead & 0x07;
	} else {
		/*	Either not a startbyte (10xxxxxx) or a five (or more) byte
			char. Substitute with a space. */
		(*bytes)++;
		return 0x20;
	}

	for (i = 1; i <= trailing; i++) {
		if ((cursor[i] & 0xC0) != 0x80) {
			/*	The sequence is cut short. Drop what was consumed so far
				and stay on the byte that ended it (may be the 0). */
			(*bytes) += i;
			return 0x20;
		}

		result = (result << 6) | (cursor[i] & 0x3f);
	}

	/* skip the lead byte and all of its continuation bytes */
	(*bytes) += trailing + 1;
	return result;
}


/*	UTF8ToLength works like strlen() but takes UTF8 encoded multibyte chars
	into account: it returns the number of characters in the 0-terminated
	string "bytes".
	The lead byte of a character announces how many bytes it spans (two,
	three or four). Only bytes that really are continuation bytes (10xxxxxx)
	are skipped, so a sequence that is cut short never makes the scan jump
	over the terminating 0. A stray continuation byte counts as one
	character of its own. */
static inline int32
UTF8ToLength(const char *bytes)
{
	int32 length = 0;

	while (*bytes != '\0') {
		uchar lead = (uchar)*bytes;
		int32 expected = 1;

		if ((lead & 0xC0) == 0xC0) {
			/* A startbyte of a multibyte char */
			if (lead & 0x20)
				expected = (lead & 0x10) ? 4 : 3;
			else
				expected = 2;
		}

		length++;
		bytes++;

		/*	Skip the announced continuation bytes, but stop early at
			anything else - in particular at the terminating 0. */
		while (--expected > 0 && ((uchar)*bytes & 0xC0) == 0x80)
			bytes++;
	}

	return length;
}

#endif	// _UTF8_FUNCTIONS_H
Renamed moreUTF8.h to utf8_functions.h. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@17239 a95241bf-73f2-0310-859d-f6bbb57e9c96 2006-04-26 14:16:19 +04:00			`/*`
			`* Copyright 2004-2006, Haiku, Inc.`
			`* Distributed under the terms of the MIT License.`
			`*/`
			`#ifndef _UTF8_FUNCTIONS_H`
			`#define _UTF8_FUNCTIONS_H`
Moved some headers here so they can be included by Globals.cpp in the interface kit folder git-svn-id: file:///srv/svn/repos/haiku/trunk/current@6092 a95241bf-73f2-0310-859d-f6bbb57e9c96 2004-01-15 10:15:37 +03:00
* There is now a server_read_only_memory structure that is placed in a (surprise!) read-only area shared between the Desktop and all applications. * Right now, this area only contains the desktop colors, ie. B_PANEL_BACKGROUND_COLOR etc.; ui_color() no longer needs to ask the server for these colors. * The ui_colors are now maintained by DesktopSettings, though ColorSet is still there. * The default colors are now hardcoded once and for everyone in InterfaceDefs.h, ie. the app_server uses them as well. * Desktop::Init() can now also return an error (but that is not yet accounted for). * Cleaned up InterfaceDefs.h. * Fixed wrong include in moreUTF8.h. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@17232 a95241bf-73f2-0310-859d-f6bbb57e9c96 2006-04-26 00:12:06 +04:00
			`#include <SupportDefs.h>`

Moved some headers here so they can be included by Globals.cpp in the interface kit folder git-svn-id: file:///srv/svn/repos/haiku/trunk/current@6092 a95241bf-73f2-0310-859d-f6bbb57e9c96 2004-01-15 10:15:37 +03:00
			`static inline bool`
			`IsInsideGlyph(uchar ch)`
			`{`
			`return (ch & 0xC0) == 0x80;`
			`}`

			`static inline uint32`
fixed memory corruption in the stuff I added to moreUTF8. This fixes GetEscapement crashes. The rest of the file needs reviewing as well, but I wanted to commit this ASAP git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13155 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-06-15 21:09:00 +04:00			`UTF8NextCharLenUnsafe(const char *text)`
Moved some headers here so they can be included by Globals.cpp in the interface kit folder git-svn-id: file:///srv/svn/repos/haiku/trunk/current@6092 a95241bf-73f2-0310-859d-f6bbb57e9c96 2004-01-15 10:15:37 +03:00			`{`
			`const char *ptr = text;`
fixed memory corruption in the stuff I added to moreUTF8. This fixes GetEscapement crashes. The rest of the file needs reviewing as well, but I wanted to commit this ASAP git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13155 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-06-15 21:09:00 +04:00
Moved some headers here so they can be included by Globals.cpp in the interface kit folder git-svn-id: file:///srv/svn/repos/haiku/trunk/current@6092 a95241bf-73f2-0310-859d-f6bbb57e9c96 2004-01-15 10:15:37 +03:00			`do {`
			`ptr++;`
			`} while (IsInsideGlyph(*ptr));`

			`return ptr - text;`
			`}`

fixed memory corruption in the stuff I added to moreUTF8. This fixes GetEscapement crashes. The rest of the file needs reviewing as well, but I wanted to commit this ASAP git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13155 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-06-15 21:09:00 +04:00			`static inline uint32`
			`UTF8NextCharLen(const char *text)`
			`{`
			`if (text == NULL \|\| *text == 0)`
			`return 0;`

			`return UTF8NextCharLenUnsafe(text);`
			`}`

fixed confusion of byteCount and charCount in ServerFont::StringWidth(), just in case anyone really uses it later. Added UTF8CountChars() to moreUTF8.h, but then I didn't need it... git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12750 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-21 03:51:33 +04:00			`static inline uint32`
			`UTF8PreviousCharLen(const char text, const char limit)`
			`{`
			`const char *ptr = text;`

			`if (ptr == NULL \|\| limit == NULL)`
			`return 0;`

			`do {`
			`if (ptr == limit)`
			`break;`
			`ptr--;`
			`} while (IsInsideGlyph(*ptr));`

			`return text - ptr;`
			`}`

fixed memory corruption in the stuff I added to moreUTF8. This fixes GetEscapement crashes. The rest of the file needs reviewing as well, but I wanted to commit this ASAP git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13155 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-06-15 21:09:00 +04:00			`// TODO: use this function in other places of this file...`
added a function to count the bytes in a string if the number of UTF8 chars is already known git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12661 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-13 21:30:59 +04:00			`static inline uint32`
fixed memory corruption in the stuff I added to moreUTF8. This fixes GetEscapement crashes. The rest of the file needs reviewing as well, but I wanted to commit this ASAP git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13155 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-06-15 21:09:00 +04:00			`count_utf8_bytes(uchar ch)`
added a function to count the bytes in a string if the number of UTF8 chars is already known git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12661 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-13 21:30:59 +04:00			`{`
fixed memory corruption in the stuff I added to moreUTF8. This fixes GetEscapement crashes. The rest of the file needs reviewing as well, but I wanted to commit this ASAP git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13155 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-06-15 21:09:00 +04:00			`// the number of high bits set until the first`
			`// unset bit determine the count of bytes used for`
			`// this glyph from this byte on`
			`uchar bit = 1 << 7;`
			`uint32 count = 1;`
			`if (ch & bit) {`
			`bit = bit >> 1;`
			`while (ch & bit) {`
			`count++;`
			`bit = bit >> 1;`
			`}`
added a function to count the bytes in a string if the number of UTF8 chars is already known git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12661 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-13 21:30:59 +04:00			`}`
fixed memory corruption in the stuff I added to moreUTF8. This fixes GetEscapement crashes. The rest of the file needs reviewing as well, but I wanted to commit this ASAP git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13155 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-06-15 21:09:00 +04:00			`return count;`
			`}`
added a function to count the bytes in a string if the number of UTF8 chars is already known git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12661 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-13 21:30:59 +04:00
fixed memory corruption in the stuff I added to moreUTF8. This fixes GetEscapement crashes. The rest of the file needs reviewing as well, but I wanted to commit this ASAP git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13155 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-06-15 21:09:00 +04:00			`static inline uint32`
			`UTF8CountBytes(const char *text, uint32 numChars)`
			`{`
			`if (text) {`
			`// iterate over numChars glyphs incrementing ptr by the`
			`// number of bytes for each glyph, which is encoded in`
			`// the first byte of any glyph.`
			`const char *ptr = text;`
			`while (numChars--) {`
			`ptr += count_utf8_bytes(*ptr);`
			`}`
			`return ptr - text;`
			`}`
			`return 0;`
added a function to count the bytes in a string if the number of UTF8 chars is already known git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12661 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-13 21:30:59 +04:00			`}`

Moved some headers here so they can be included by Globals.cpp in the interface kit folder git-svn-id: file:///srv/svn/repos/haiku/trunk/current@6092 a95241bf-73f2-0310-859d-f6bbb57e9c96 2004-01-15 10:15:37 +03:00			`static inline uint32`
fixed confusion of byteCount and charCount in ServerFont::StringWidth(), just in case anyone really uses it later. Added UTF8CountChars() to moreUTF8.h, but then I didn't need it... git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12750 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-21 03:51:33 +04:00			`UTF8CountChars(const char *text, int32 numBytes)`
Moved some headers here so they can be included by Globals.cpp in the interface kit folder git-svn-id: file:///srv/svn/repos/haiku/trunk/current@6092 a95241bf-73f2-0310-859d-f6bbb57e9c96 2004-01-15 10:15:37 +03:00			`{`
fixed confusion of byteCount and charCount in ServerFont::StringWidth(), just in case anyone really uses it later. Added UTF8CountChars() to moreUTF8.h, but then I didn't need it... git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12750 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-21 03:51:33 +04:00			`const char* ptr = text;`
			`const char* last = ptr + numBytes - 1;`
Moved some headers here so they can be included by Globals.cpp in the interface kit folder git-svn-id: file:///srv/svn/repos/haiku/trunk/current@6092 a95241bf-73f2-0310-859d-f6bbb57e9c96 2004-01-15 10:15:37 +03:00
fixed confusion of byteCount and charCount in ServerFont::StringWidth(), just in case anyone really uses it later. Added UTF8CountChars() to moreUTF8.h, but then I didn't need it... git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12750 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-21 03:51:33 +04:00			`uint32 count = 0;`
			`while (ptr <= last) {`
			`ptr += UTF8NextCharLen(ptr);`
			`count++;`
			`}`

			`return count;`
Moved some headers here so they can be included by Globals.cpp in the interface kit folder git-svn-id: file:///srv/svn/repos/haiku/trunk/current@6092 a95241bf-73f2-0310-859d-f6bbb57e9c96 2004-01-15 10:15:37 +03:00			`}`

fixed confusion of byteCount and charCount in ServerFont::StringWidth(), just in case anyone really uses it later. Added UTF8CountChars() to moreUTF8.h, but then I didn't need it... git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@12750 a95241bf-73f2-0310-859d-f6bbb57e9c96 2005-05-21 03:51:33 +04:00
More work on the BFont special functions. * Reworked functions like GetEscapements(), GetBoundingBoxesAsString() and GetGlyphShapes() completely * Made the ServerFont functions uniform in their prototypes and cleaned out unnecessary arguments * Added new UTF8 handling functions to moreUTF8.h that are now used by ServerFont * Put the common transformations of the FT_Face into an own GetTransformedFace() to lessen code duplication In other words, ServerFont is now cleaned and handles UTF8 pretty efficiently. Some ToDo's are still left though. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@16241 a95241bf-73f2-0310-859d-f6bbb57e9c96 2006-02-06 02:36:59 +03:00			`/* UTF8ToCharCode converts the input that includes potential multibyte chars`
			`to UTF-32 char codes that can be used by FreeType. The string pointer is`
			`then advanced to the next character in the string. In case the terminating`
			`0 is reached, the string pointer is not advanced anymore and spaces are`
			`returned. This makes it safe to overruns and enables streamed processing`
			`of UTF8 strings. */`
			`static inline uint32`
			`UTF8ToCharCode(const char **bytes)`
			`{`
			`register uint32 result = 0;`

			`if ((*bytes)[0] & 0x80) {`
			`if ((*bytes)[0] & 0x40) {`
			`if ((*bytes)[0] & 0x20) {`
			`if ((*bytes)[0] & 0x10) {`
			`if ((*bytes)[0] & 0x08) {`
			`/* A five byte char?!`
* Fixed locking of the FT_Face by moving it into Get/PutTransformedFace() * Removed the FaceGetter as it was only needed for locking * Cleaned up TruncateString() * Fixed a typo in moreUTF8.h git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@16252 a95241bf-73f2-0310-859d-f6bbb57e9c96 2006-02-06 20:42:18 +03:00			`Something's wrong, substitute. */`
More work on the BFont special functions. * Reworked functions like GetEscapements(), GetBoundingBoxesAsString() and GetGlyphShapes() completely * Made the ServerFont functions uniform in their prototypes and cleaned out unnecessary arguments * Added new UTF8 handling functions to moreUTF8.h that are now used by ServerFont * Put the common transformations of the FT_Face into an own GetTransformedFace() to lessen code duplication In other words, ServerFont is now cleaned and handles UTF8 pretty efficiently. Some ToDo's are still left though. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@16241 a95241bf-73f2-0310-859d-f6bbb57e9c96 2006-02-06 02:36:59 +03:00			`result += 0x20;`
			`(*bytes)++;`
			`return result;`
			`}`

			`/* A four byte char */`
			`result += (*bytes)[0] & 0x07;`
			`result <<= 6;`
			`result += (*bytes)[1] & 0x3f;`
			`result <<= 6;`
			`result += (*bytes)[2] & 0x3f;`
			`result <<= 6;`
			`result += (*bytes)[3] & 0x3f;`
			`(*bytes) += 3;`
			`return result;`
			`}`

			`/* A three byte char */`
			`result += (*bytes)[0] & 0x0f;`
			`result <<= 6;`
			`result += (*bytes)[1] & 0x3f;`
			`result <<= 6;`
			`result += (*bytes)[2] & 0x3f;`
			`(*bytes) += 3;`
			`return result;`
			`}`

			`/* A two byte char */`
			`result += (*bytes)[0] & 0x1f;`
			`result <<= 6;`
			`result += (*bytes)[1] & 0x3f;`
			`(*bytes) += 2;`
			`return result;`
			`}`

			`/* This (10) is not a startbyte.`
			`Substitute with a space. */`
			`result += 0x20;`
			`(*bytes)++;`
			`return result;`
			`}`

			`if ((*bytes)[0] == 0) {`
* Changed the AGGTextRenderer to use the new UTF8 handling * Added more char codes to is_white_space(), should be all I think Sorry if I stepped on your toes Stephan, but I wanted these changes flushed before I leave for holidays :-). git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@16501 a95241bf-73f2-0310-859d-f6bbb57e9c96 2006-02-23 12:04:26 +03:00			`/* We do not advance beyond the terminating 0. */`
			`return 0x00;`
More work on the BFont special functions. * Reworked functions like GetEscapements(), GetBoundingBoxesAsString() and GetGlyphShapes() completely * Made the ServerFont functions uniform in their prototypes and cleaned out unnecessary arguments * Added new UTF8 handling functions to moreUTF8.h that are now used by ServerFont * Put the common transformations of the FT_Face into an own GetTransformedFace() to lessen code duplication In other words, ServerFont is now cleaned and handles UTF8 pretty efficiently. Some ToDo's are still left though. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@16241 a95241bf-73f2-0310-859d-f6bbb57e9c96 2006-02-06 02:36:59 +03:00			`}`

			`result += (*bytes)[0];`
			`(*bytes)++;`
			`return result;`
			`}`


			`/* UTF8ToLength works like strlen() but takes UTF8 encoded multibyte chars`
			`into account. It's a quicker version of UTF8CountChars above. */`
			`static inline int32`
			`UTF8ToLength(const char *bytes)`
			`{`
			`int32 length = 0;`
			`while (*bytes) {`
			`length++;`

			`if (bytes[0] & 0x80) {`
			`if (bytes[0] & 0x40) {`
			`if (bytes[0] & 0x20) {`
			`if (bytes[0] & 0x10) {`
			`bytes += 4;`
			`continue;`
			`}`

			`bytes += 3;`
			`continue;`
			`}`

			`bytes += 2;`
			`continue;`
			`}`

			`/* Not a startbyte - skip */`
			`}`

			`bytes += 1;`
			`}`

			`return length;`
			`}`

Renamed moreUTF8.h to utf8_functions.h. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@17239 a95241bf-73f2-0310-859d-f6bbb57e9c96 2006-04-26 14:16:19 +04:00			`#endif // _UTF8_FUNCTIONS_H`