* Made UTF8 functions overflow safe

* Unified UTF8CountChars and UTF8ToLength and removed the latter
* Rewrote UTF8CountBytes to use the safer algorithm from UTF8CountChars
* Removed the unsafe count_utf8_bytes() function

This should fix bug #839. Marcus, can you please review?

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@19624 a95241bf-73f2-0310-859d-f6bbb57e9c96
2006-12-25 21:58:00 +00:00 · 2006-12-25 21:58:00 +00:00 · 3379428150
commit 3379428150
parent 5fd605e278
2 changed files with 108 additions and 73 deletions
--- a/headers/private/interface/utf8_functions.h
+++ b/headers/private/interface/utf8_functions.h
@ -15,6 +15,7 @@ IsInsideGlyph(uchar ch)
 	return (ch & 0xc0) == 0x80;
 }

+
 static inline uint32
 UTF8NextCharLenUnsafe(const char *text)
 {
@ -27,6 +28,7 @@ UTF8NextCharLenUnsafe(const char *text)
 	return ptr - text;
 }

+
 static inline uint32
 UTF8NextCharLen(const char *text)
 {
@ -36,6 +38,7 @@ UTF8NextCharLen(const char *text)
 	return UTF8NextCharLenUnsafe(text);
 }

+
 static inline uint32
 UTF8PreviousCharLen(const char *text, const char *limit)
 {
@ -53,54 +56,113 @@ UTF8PreviousCharLen(const char *text, const char *limit)
 	return text - ptr;
 }

-// TODO: use this function in other places of this file...
+
+/*!	UTF8CountBytes gets the length (in bytes) of a UTF-8 string. Up to
+	numChars characters are read. If numChars is negative, it is ignored
+	and the string is read up to the terminating NUL byte.
+*/
 static inline uint32
-count_utf8_bytes(uchar ch)
+UTF8CountBytes(const char *bytes, int32 numChars)
 {
-	// the number of high bits set until the first
-	// unset bit determine the count of bytes used for
-	// this glyph from this byte on
-	uchar bit = 1 << 7;
-	uint32 count = 1;
-	if (ch & bit) {
-		bit = bit >> 1;
-		while (ch & bit) {
-			count++;
-			bit = bit >> 1;
+	if (!bytes)
+		return 0;
+
+	if (numChars < 0)
+		numChars = INT_MAX;
+
+	const char *base = bytes;
+	while (*bytes && numChars-- > 0) {
+		if (bytes[0] & 0x80) {
+			if (bytes[0] & 0x40) {
+				if (bytes[0] & 0x20) {
+					if (bytes[0] & 0x10) {
+						if (bytes[1] == 0 || bytes[2] == 0 || bytes[3] == 0)
+							return (bytes - base);
+
+						bytes += 4;
+						continue;
+					}
+
+					if (bytes[1] == 0 || bytes[2] == 0)
+						return (bytes - base);
+
+					bytes += 3;
+					continue;
+				}
+
+				if (bytes[1] == 0)
+					return (bytes - base);
+
+				bytes += 2;
+				continue;
+			}
+
+			/* Not a startbyte - skip */
+			bytes += 1;
+			continue;
 		}
+
+		bytes += 1;
 	}
-	return count;
+
+	return (bytes - base);
 }

+
+/*!	UTF8CountChars gets the length (in characters) of a UTF-8 string. Up to
+	numBytes bytes are read. If numBytes is negative, it is ignored
+	and the string is read up to the terminating NUL byte.
+*/
 static inline uint32
-UTF8CountBytes(const char *text, uint32 numChars)
+UTF8CountChars(const char *bytes, int32 numBytes)
 {
-	if (text) {
-		// iterate over numChars glyphs incrementing ptr by the
-		// number of bytes for each glyph, which is encoded in
-		// the first byte of any glyph.
-		const char *ptr = text;
-		while (numChars--) {
-			ptr += count_utf8_bytes(*ptr);
+	if (!bytes)
+		return 0;
+
+	uint32 length = 0;
+	const char *last = bytes + numBytes - 1;
+	if (numBytes < 0)
+		last = (const char *)UINT_MAX;
+
+	while (*bytes && bytes <= last) {
+		if (bytes[0] & 0x80) {
+			if (bytes[0] & 0x40) {
+				if (bytes[0] & 0x20) {
+					if (bytes[0] & 0x10) {
+						if (bytes[1] == 0 || bytes[2] == 0 || bytes[3] == 0)
+							return length;
+
+						bytes += 4;
+						length++;
+						continue;
+					}
+
+					if (bytes[1] == 0 || bytes[2] == 0)
+						return length;
+
+					bytes += 3;
+					length++;
+					continue;
+				}
+
+				if (bytes[1] == 0)
+					return length;
+
+				bytes += 2;
+				length++;
+				continue;
+			}
+
+			/* Not a startbyte - skip */
+			bytes += 1;
+			continue;
 		}
-		return ptr - text;
-	}
-	return 0;
-}

-static inline uint32
-UTF8CountChars(const char *text, int32 numBytes)
-{
-	const char* ptr = text;
-	const char* last = ptr + numBytes - 1;
-
-	uint32 count = 0;
-	while (ptr <= last) {
-		ptr += UTF8NextCharLen(ptr);
-		count++;
+		bytes += 1;
+		length++;
 	}

-	return count;
+	return length;
 }


@ -128,6 +190,9 @@ UTF8ToCharCode(const char **bytes)
 						return result;
 					}

+					if ((*bytes)[1] == 0 || (*bytes)[2] == 0 || (*bytes)[3] == 0)
+						return 0x00;
+
 					/* A four byte char */
 					result += (*bytes)[0] & 0x07;
 					result <<= 6;
@ -140,6 +205,9 @@ UTF8ToCharCode(const char **bytes)
 					return result;
 				}

+				if ((*bytes)[1] == 0 || (*bytes)[2] == 0)
+					return 0x00;
+
 				/* A three byte char */
 				result += (*bytes)[0] & 0x0f;
 				result <<= 6;
@ -150,6 +218,9 @@ UTF8ToCharCode(const char **bytes)
 				return result;
 			}

+			if ((*bytes)[1] == 0)
+				return 0x00;
+
 			/* A two byte char */
 			result += (*bytes)[0] & 0x1f;
 			result <<= 6;
@ -175,40 +246,4 @@ UTF8ToCharCode(const char **bytes)
 	return result;
 }

-
-/*!	UTF8ToLength works like strlen() but takes UTF8 encoded multibyte chars
-	into account. It's a quicker version of UTF8CountChars above.
-*/
-static inline int32
-UTF8ToLength(const char *bytes)
-{
-	int32 length = 0;
-	while (*bytes) {
-		length++;
-
-		if (bytes[0] & 0x80) {
-			if (bytes[0] & 0x40) {
-				if (bytes[0] & 0x20) {
-					if (bytes[0] & 0x10) {
-						bytes += 4;
-						continue;
-					}
-
-					bytes += 3;
-					continue;
-				}
-
-				bytes += 2;
-				continue;
-			}
-
-			/* Not a startbyte - skip */
-		}
-
-		bytes += 1;
-	}
-
-	return length;
-}
-
 #endif	// _UTF8_FUNCTIONS_H
--- a/src/servers/app/ServerFont.cpp
+++ b/src/servers/app/ServerFont.cpp
@ -612,7 +612,7 @@ ServerFont::TruncateString(BString* inOut, uint32 mode, float width) const
 	char *result = new char[length + 3];

 	// count the individual glyphs
-	int32 numChars = UTF8ToLength(string);
+	int32 numChars = UTF8CountChars(string, -1);

 	// get the escapement of each glyph in font units
 	float *escapementArray = new float[numChars];