* Extended the text recognition to UTF-8 characters - if a text contains too many

bad characters, it will be rejected, at least after bug #675 has been fixed. * In the future, we should use the same mechanism as the TextSnifferAddon of the registrar; it would also be nice to use additional charset information (at least differentiating between latin-1 and UTF-8). * Minor cleanup. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@18703 a95241bf-73f2-0310-859d-f6bbb57e9c96
2006-08-29 23:24:58 +00:00 · 2006-08-29 23:24:58 +00:00 · d3669ef900
commit d3669ef900
parent e04ec8626e
1 changed files with 103 additions and 43 deletions
--- a/src/add-ons/translators/stxt/STXTTranslator.cpp
+++ b/src/add-ons/translators/stxt/STXTTranslator.cpp
@ -35,7 +35,7 @@
 #include "STXTView.h"

 #define READ_BUFFER_SIZE 2048
-#define DATA_BUFFER_SIZE 64
+#define DATA_BUFFER_SIZE 256

 // The input formats that this translator supports.
 translation_format gInputFormats[] = {
@ -206,17 +206,15 @@ identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
 		B_SWAP_BENDIAN_TO_HOST) != B_OK)
 		return B_ERROR;
 		
-	if (txtheader.header.magic != 'TEXT' ||
-		txtheader.header.header_size !=
-			sizeof(TranslatorStyledTextTextHeader) ||
-		txtheader.charset != B_UNICODE_UTF8)
+	if (txtheader.header.magic != 'TEXT'
+		|| txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader)
+		|| txtheader.charset != B_UNICODE_UTF8)
 		return B_NO_TRANSLATOR;

 	// skip the text data
 	off_t seekresult, pos;
-	pos = header.header.header_size +
-		txtheader.header.header_size +
-		txtheader.header.data_size;
+	pos = header.header.header_size + txtheader.header.header_size
+		+ txtheader.header.data_size;
 	seekresult = inSource->Seek(txtheader.header.data_size,
 		SEEK_CUR);
 	if (seekresult < pos)
@ -232,20 +230,20 @@ identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
 		return read;
 	if (read != kstylsize && read != 0)
 		return B_NO_TRANSLATOR;
-	
+
 	// If there is a STYL header
 	if (read == kstylsize) {
 		memcpy(&stylheader, buffer, kstylsize);
 		if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize,
 			B_SWAP_BENDIAN_TO_HOST) != B_OK)
 			return B_ERROR;
-		
-		if (stylheader.header.magic != 'STYL' ||
-			stylheader.header.header_size !=
+
+		if (stylheader.header.magic != 'STYL'
+			|| stylheader.header.header_size != 
 				sizeof(TranslatorStyledTextStyleHeader))
 			return B_NO_TRANSLATOR;
 	}
-	
+
 	// if output TEXT header is supplied, fill it with data
 	if (ptxtheader) {
 		ptxtheader->header.magic = txtheader.header.magic;
@ -253,7 +251,7 @@ identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
 		ptxtheader->header.data_size = txtheader.header.data_size;
 		ptxtheader->charset = txtheader.charset;
 	}
-	
+
 	// return information about the data in the stream
 	outInfo->type = B_STYLED_TEXT_FORMAT;
 	outInfo->group = B_TRANSLATOR_TEXT;
@ -261,10 +259,50 @@ identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
 	outInfo->capability = STXT_IN_CAPABILITY;
 	strcpy(outInfo->name, "Be styled text file");
 	strcpy(outInfo->MIME, "text/x-vnd.Be-stxt");
-	
+
 	return B_OK;
 }

+
+static bool
+valid_utf8_char(const uint8* bytes, size_t& length)
+{
+	length = 1;
+
+	if (bytes[0] & 0x80) {
+		if (bytes[0] & 0x40) {
+			if (bytes[0] & 0x20) {
+				if (bytes[0] & 0x10) {
+					if (bytes[0] & 0x08) {
+						// A five byte char?!
+						return false;
+					}
+
+					// A four byte char
+					length = 4;
+					return true;
+				}
+
+				/* A three byte char */
+				length = 3;
+				return true;
+			}
+
+			/* A two byte char */
+			length = 2;
+			return true;
+		}
+
+		return false;
+	}
+
+	if (bytes[0] == 0)
+		return false;
+
+	return true;
+}
+
+
 // ---------------------------------------------------------------
 // identify_txt_header
 //
@ -300,32 +338,55 @@ identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
 // determine the format
 // ---------------------------------------------------------------
 status_t
-identify_txt_header(uint8 *data, int32 nread,
+identify_txt_header(uint8 *data, int32 bytesRead,
 	BPositionIO *inSource, translator_info *outInfo, uint32 outType)
 {
-	float capability = TEXT_IN_CAPABILITY;
-	uint8 ch;
-	ssize_t readlater = 0;
-	readlater = inSource->Read(data + nread, DATA_BUFFER_SIZE - nread);
-	if (readlater < 0)
+	ssize_t readLater = inSource->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead);
+	if (readLater < 0)
 		return B_NO_TRANSLATOR;
-	
-	nread += readlater;
-	for (int32 i = 0; i < nread; i++) {
-		ch = data[i];
+
+	bytesRead += readLater;
+	float capability = TEXT_IN_CAPABILITY;
+	int32 bad = 0;
+
+	// TODO: use the same mechanism as the TextSnifferAddon class in the registrar
+	//	and add support for indicating other character sets as well
+	for (int32 i = 0; i < bytesRead; i++) {
+		uint8 c = data[i];
+		size_t length;
+
 		// if any null characters or control characters
 		// are found, reduce our ability to handle the data
-		if (ch < 0x20 && 
-			ch != 0x08 && // backspace
-			ch != 0x09 && // tab
-			ch != 0x0A && // line feed
-			ch != 0x0C && // form feed
-			ch != 0x0D) { // carriage return
-			capability *= 0.6;
-			break;
-		}
+		if (c < 0x20
+			&& c != 0x08	// backspace
+			&& c != 0x09	// tab
+			&& c != 0x0A	// line feed
+			&& c != 0x0C	// form feed
+			&& c != 0x0D) { // carriage return
+			bad++;
+		} else if ((c & 0x80) != 0 && valid_utf8_char(data + i, length)) {
+			int32 j = 1;
+			for (; j < (int32)length && i + j < bytesRead; j++) {
+				if ((data[j] & 0xc0) != 0x80) {
+					bad++;
+					break;
+				}
+			}
+
+			i += j - 1;
+		} else if (c & 0x80)
+			bad++;
 	}

+// TODO: enable me once bug #675 is fixed
+#if 0
+	// if more than 1/3 of the characters are bad, we don't accept the text
+	if (bad > bytesRead / 3)
+		return B_NO_TRANSLATOR;
+#endif
+
+	capability *= (float)bad / bytesRead;
+
 	// return information about the data in the stream
 	outInfo->type = B_TRANSLATOR_TEXT;
 	outInfo->group = B_TRANSLATOR_TEXT;
@ -333,7 +394,7 @@ identify_txt_header(uint8 *data, int32 nread,
 	outInfo->capability = capability;
 	strcpy(outInfo->name, "Plain text file");
 	strcpy(outInfo->MIME, "text/plain");
-	
+
 	return B_OK;
 }

@ -401,17 +462,16 @@ STXTTranslator::Identify(BPositionIO *inSource,
 		TranslatorStyledTextStreamHeader header;
 		memcpy(&header, buffer, kstxtsize);
 		if (swap_data(B_UINT32_TYPE, &header, kstxtsize,
-			B_SWAP_BENDIAN_TO_HOST) != B_OK)
+				B_SWAP_BENDIAN_TO_HOST) != B_OK)
 			return B_ERROR;
-		
-		if (header.header.magic == B_STYLED_TEXT_FORMAT && 
-			header.header.header_size == 
-			sizeof(TranslatorStyledTextStreamHeader) &&
-			header.header.data_size == 0 &&
-			header.version == 100)
+
+		if (header.header.magic == B_STYLED_TEXT_FORMAT
+			&& header.header.header_size == (int32)kstxtsize
+			&& header.header.data_size == 0
+			&& header.version == 100)
 			return identify_stxt_header(header, inSource, outInfo, outType);
 	}
-	
+
 	// if the data is not styled text, check if it is plain text
 	return identify_txt_header(buffer, nread, inSource, outInfo, outType);
 }