From 45bfb95c2a7843e2b56b74684e8acdf7532ecc5a Mon Sep 17 00:00:00 2001 From: "Alexander G.M. Smith" Date: Mon, 28 Nov 2005 01:37:13 +0000 Subject: [PATCH] Truncate overly long URLs to the maximum word length. Convert Japanese periods to spaces so that more "words" are found. Fix UTF-8 comparison problems with tolower() incorrectly converting characters with the high bit set. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@15194 a95241bf-73f2-0310-859d-f6bbb57e9c96 --- src/bin/bemail_utils/spamdbm.cpp | 128 ++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 44 deletions(-) diff --git a/src/bin/bemail_utils/spamdbm.cpp b/src/bin/bemail_utils/spamdbm.cpp index 538cbd162d..c2fd903d33 100644 --- a/src/bin/bemail_utils/spamdbm.cpp +++ b/src/bin/bemail_utils/spamdbm.cpp @@ -65,6 +65,12 @@ * * Revision History (now manually updated due to SVN's philosophy) * $Log: spamdbm.cpp,v $ + * r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines + * Added better tokenization so that HTML is parsed and things like tags + * between letters of a word no longer hide that word. After testing, the + * result seems to be a tighter spread of ratings when done in full text plus + * header mode. + * * Revision 1.10 2005/11/24 02:08:39 agmsmith * Fixed up prefix codes, Z for things that are inside other things. * @@ -1946,20 +1952,44 @@ static size_t TokenizerPassLowerCase ( while (BufferPntr < EndOfStringPntr) { - if ((unsigned char) *BufferPntr < 128) - /* If it is not a UTF-8 character code, convert to lower case ASCII. */ - *BufferPntr = tolower (*BufferPntr); + /* Do our own lower case conversion; tolower () has problems with UTF-8 + characters that have the high bit set. */ + + if (*BufferPntr >= 'A' && *BufferPntr <= 'Z') + *BufferPntr = *BufferPntr + ('a' - 'A'); BufferPntr++; } return NumberOfBytes; } +/* A utility function for some commonly repeated code. If this was Modula-2, +we could use a nested procedure. But it's not. 
Adds the given word to the set +of words, checking for maximum word length and prepending the prefix to the +word, which gets modified by this function to reflect the word actually added +to the set. */ + +static void AddWordAndPrefixToSet ( + string &Word, + const char *PrefixString, + set &WordSet) +{ + if (Word.empty ()) + return; + + if (Word.size () > g_MaxWordLength) + Word.resize (g_MaxWordLength); + Word.insert (0, PrefixString); + WordSet.insert (Word); +} + + /* Hunt through the text for various URLs and extract the components as separate words. Doesn't affect the text in the buffer. Looks for protocol://user:password@computer:port/path?query=key#anchor strings. Also www.blah strings are detected and broken down. Doesn't do HREF="" strings -where the string has a relative path (no host computer name). */ +where the string has a relative path (no host computer name). Assumes the +input buffer is already in lower case. */ static size_t TokenizerPassExtractURLs ( char *BufferPntr, @@ -1994,12 +2024,9 @@ static size_t TokenizerPassExtractURLs ( while (ProtocolStringPntr > BufferPntr && isalpha (ProtocolStringPntr[-1])) ProtocolStringPntr--; - Word = PrefixString; - Word.append (ProtocolStringPntr, + Word.assign (ProtocolStringPntr, (InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */); - if (!Word.empty ()) - WordSet.insert (Word); - + AddWordAndPrefixToSet (Word, PrefixString, WordSet); HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */ } if (HostStringPntr == NULL) @@ -2023,20 +2050,16 @@ static size_t TokenizerPassExtractURLs ( if (AtSignStringPntr != NULL) { /* Add a word with the user and password, unseparated. 
*/ - Word = PrefixString; - Word.append (HostStringPntr, + Word.assign (HostStringPntr, AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */); - if (!Word.empty ()) - WordSet.insert (Word); + AddWordAndPrefixToSet (Word, PrefixString, WordSet); HostStringPntr = AtSignStringPntr + 1; } /* Add a word with the computer and port, unseparated. */ - Word = PrefixString; - Word.append (HostStringPntr, InputStringPntr - HostStringPntr); - if (!Word.empty ()) - WordSet.insert (Word); + Word.assign (HostStringPntr, InputStringPntr - HostStringPntr); + AddWordAndPrefixToSet (Word, PrefixString, WordSet); /* Now get the path name, not including the extra junk after ? and # separators (they're stored as separate options). Stops at white space or a @@ -2056,24 +2079,18 @@ static size_t TokenizerPassExtractURLs ( if (OptionsStringPntr == NULL) { /* No options, all path. */ - Word = PrefixString; - Word.append (PathStringPntr, InputStringPntr - PathStringPntr); - if (!Word.empty ()) - WordSet.insert (Word); + Word.assign (PathStringPntr, InputStringPntr - PathStringPntr); + AddWordAndPrefixToSet (Word, PrefixString, WordSet); } else { /* Insert the path before the options. */ - Word = PrefixString; - Word.append (PathStringPntr, OptionsStringPntr - PathStringPntr); - if (!Word.empty ()) - WordSet.insert (Word); + Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr); + AddWordAndPrefixToSet (Word, PrefixString, WordSet); /* Insert all the options as a word. */ - Word = PrefixString; - Word.append (OptionsStringPntr, InputStringPntr - OptionsStringPntr); - if (!Word.empty ()) - WordSet.insert (Word); + Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr); + AddWordAndPrefixToSet (Word, PrefixString, WordSet); } } return NumberOfBytes; @@ -2171,6 +2188,9 @@ static size_t TokenizerPassGetPlainWords ( size_t Length; int Letter; + if (NumberOfBytes <= 0) + return 0; /* Nothing to process. 
*/ + if (PrefixCharacter != 0) AccumulatedWord = PrefixCharacter; EndOfStringPntr = BufferPntr + NumberOfBytes; @@ -2227,8 +2247,9 @@ static size_t TokenizerPassGetPlainWords ( /* Delete Things from the text. The Thing is marked by a start string and an end string, such as " for HTML comment things. All the text between the markers will be added to the word list before it gets deleted from -the buffer. The markers must be prepared in lower case. You can specify an -empty string for the end marker if you're just matching a string constant like +the buffer. The markers must be prepared in lower case and the buffer is +assumed to have already been converted to lower case. You can specify an empty +string for the end marker if you're just matching a string constant like " ", which you would put in the starting marker. This is a utility function used by other tokenizer functions. */ @@ -2265,8 +2286,8 @@ static size_t TokenizerUtilRemoveStartEndThing ( FoundAndDeletedThing = false; if (EndOfStringPntr - InputStringPntr >= ThingStartLength + ThingEndLength /* space remains for start + end */ && - tolower (*InputStringPntr) == *ThingStartCode && - strncasecmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0) + *InputStringPntr == *ThingStartCode && + memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0) { /* Found the start marker. Look for the terminating string. If it is an empty string, then we've found it right now! */ @@ -2275,8 +2296,8 @@ static size_t TokenizerUtilRemoveStartEndThing ( while (EndOfStringPntr - ThingEndPntr >= ThingEndLength) { if (ThingEndLength == 0 || - (tolower (*ThingEndPntr) == *ThingEndCode && - strncasecmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0)) + (*ThingEndPntr == *ThingEndCode && + memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0)) { /* Got the end of the Thing. First dump the text inbetween the start and end markers into the words list. 
*/ @@ -2328,6 +2349,21 @@ static size_t TokenizerPassRemoveHTMLStyle ( } +/* Convert Japanese periods (a round hollow dot symbol) to spaces so that the +start of the next sentence is recognised at least as the start of a very long +word. */ + +static size_t TokenizerPassJapanesePeriodsToSpaces ( + char *BufferPntr, + size_t NumberOfBytes, + char PrefixCharacter, + set &WordSet) +{ + return TokenizerUtilRemoveStartEndThing (BufferPntr, + NumberOfBytes, PrefixCharacter, WordSet, "。", "", true); +} + + /* Delete HTML tags from the text. The contents of the tag are added as words before being deleted.

<P>, <BR>
and   are replaced by spaces at this stage while other HTML things get replaced by nothing. */ @@ -2752,23 +2788,27 @@ void ABSApp::AddWordsToSet ( and may add words to the word set. */ CurrentSize = NumberOfBytes; - for (PassNumber = 1; PassNumber <= 7 && CurrentSize > 0 ; PassNumber++) + for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++) { switch (PassNumber) { - case 1: CurrentSize = TokenizerPassLowerCase ( + case 1: /* Lowercase first, rest of them assume lower case inputs. */ + CurrentSize = TokenizerPassLowerCase ( + BufferPntr, CurrentSize, PrefixCharacter, WordSet); + break; + case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces ( BufferPntr, CurrentSize, PrefixCharacter, WordSet); break; - case 2: CurrentSize = TokenizerPassTruncateLongAsianWords ( + case 3: CurrentSize = TokenizerPassTruncateLongAsianWords ( BufferPntr, CurrentSize, PrefixCharacter, WordSet); break; - case 3: CurrentSize = TokenizerPassRemoveHTMLComments ( + case 4: CurrentSize = TokenizerPassRemoveHTMLComments ( BufferPntr, CurrentSize, 'Z', WordSet); break; - case 4: CurrentSize = TokenizerPassRemoveHTMLStyle ( + case 5: CurrentSize = TokenizerPassRemoveHTMLStyle ( BufferPntr, CurrentSize, 'Z', WordSet); break; - case 5: CurrentSize = TokenizerPassExtractURLs ( + case 6: CurrentSize = TokenizerPassExtractURLs ( BufferPntr, CurrentSize, 'Z', WordSet); break; - case 6: CurrentSize = TokenizerPassRemoveHTMLTags ( + case 7: CurrentSize = TokenizerPassRemoveHTMLTags ( BufferPntr, CurrentSize, 'Z', WordSet); break; - case 7: CurrentSize = TokenizerPassGetPlainWords ( + case 8: CurrentSize = TokenizerPassGetPlainWords ( BufferPntr, CurrentSize, PrefixCharacter, WordSet); break; default: break; } @@ -5221,7 +5261,7 @@ status_t ABSApp::TokenizeWhole ( apparently that isn't all that useful a distinction, so do it. 
*/ if (Letter >= 'A' && Letter < 'Z') - Letter = tolower (Letter); + Letter = Letter + ('a' - 'A'); /* See if it is a letter we treat as white space - all control characters and all punctuation except for: apostrophe (so "it's" and possessive