Truncate overly long URLs to the maximum word length. Convert Japanese

periods to spaces so that more "words" are found. Fix UTF-8 comparison problems with tolower() incorrectly converting characters with the high bit set. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@15194 a95241bf-73f2-0310-859d-f6bbb57e9c96
2005-11-28 01:37:13 +00:00 · 2005-11-28 01:37:13 +00:00 · 45bfb95c2a
commit 45bfb95c2a
parent 12aa597dc6
1 changed files with 84 additions and 44 deletions
--- a/src/bin/bemail_utils/spamdbm.cpp
+++ b/src/bin/bemail_utils/spamdbm.cpp
@ -65,6 +65,12 @@
 *
 * Revision History (now manually updated due to SVN's philosophy)
 * $Log: spamdbm.cpp,v $
 * r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines
 * Added better tokenization so that HTML is parsed and things like tags
 * between letters of a word no longer hide that word.  After testing, the
 * result seems to be a tighter spread of ratings when done in full text plus
 * header mode.
 *
 * Revision 1.10  2005/11/24 02:08:39  agmsmith
 * Fixed up prefix codes, Z for things that are inside other things.
 *
@ -1946,20 +1952,44 @@ static size_t TokenizerPassLowerCase (
  while (BufferPntr < EndOfStringPntr)
  {
-    if ((unsigned char) *BufferPntr < 128)
+    /* Do our own lower case conversion; tolower () has problems with UTF-8
-      /* If it is not a UTF-8 character code, convert to lower case ASCII. */
+    characters that have the high bit set. */
-      *BufferPntr = tolower (*BufferPntr);
+
    if (*BufferPntr >= 'A' && *BufferPntr <= 'Z')
      *BufferPntr = *BufferPntr + ('a' - 'A');
    BufferPntr++;
  }
  return NumberOfBytes;
 }
 /* A utility function for some commonly repeated code.  If this was Modula-2,
 we could use a nested procedure.  But it's not.  Adds the given word to the set
 of words, checking for maximum word length and prepending the prefix to the
 word, which gets modified by this function to reflect the word actually added
 to the set. */
 static void AddWordAndPrefixToSet (
  string &Word,
  const char *PrefixString,
  set<string> &WordSet)
 {
  if (Word.empty ())
    return;
  if (Word.size () > g_MaxWordLength)
    Word.resize (g_MaxWordLength);
  Word.insert (0, PrefixString);
  WordSet.insert (Word);
 }
 /* Hunt through the text for various URLs and extract the components as
 separate words.  Doesn't affect the text in the buffer.  Looks for
 protocol://user:password@computer:port/path?query=key#anchor strings.  Also
 www.blah strings are detected and broken down.  Doesn't do HREF="" strings
-where the string has a relative path (no host computer name). */
+where the string has a relative path (no host computer name).  Assumes the
 input buffer is already in lower case. */
 static size_t TokenizerPassExtractURLs (
  char *BufferPntr,
@ -1994,12 +2024,9 @@ static size_t TokenizerPassExtractURLs (
      while (ProtocolStringPntr > BufferPntr &&
      isalpha (ProtocolStringPntr[-1]))
        ProtocolStringPntr--;
-      Word = PrefixString;
+      Word.assign (ProtocolStringPntr,
      Word.append (ProtocolStringPntr,
        (InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */);
-      if (!Word.empty ())
+      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
        WordSet.insert (Word);
      HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */
    }
    if (HostStringPntr == NULL)
@ -2023,20 +2050,16 @@ static size_t TokenizerPassExtractURLs (
    if (AtSignStringPntr != NULL)
    {
      /* Add a word with the user and password, unseparated. */
-      Word = PrefixString;
+      Word.assign (HostStringPntr,
      Word.append (HostStringPntr,
        AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */);
-      if (!Word.empty ())
+      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
        WordSet.insert (Word);
      HostStringPntr = AtSignStringPntr + 1;
    }
    /* Add a word with the computer and port, unseparated. */
-    Word = PrefixString;
+    Word.assign (HostStringPntr, InputStringPntr - HostStringPntr);
-    Word.append (HostStringPntr, InputStringPntr - HostStringPntr);
+    AddWordAndPrefixToSet (Word, PrefixString, WordSet);
    if (!Word.empty ())
      WordSet.insert (Word);
    /* Now get the path name, not including the extra junk after ?  and #
    separators (they're stored as separate options).  Stops at white space or a
@ -2056,24 +2079,18 @@ static size_t TokenizerPassExtractURLs (
    if (OptionsStringPntr == NULL)
    {
      /* No options, all path. */
-      Word = PrefixString;
+      Word.assign (PathStringPntr, InputStringPntr - PathStringPntr);
-      Word.append (PathStringPntr, InputStringPntr - PathStringPntr);
+      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
      if (!Word.empty ())
        WordSet.insert (Word);
    }
    else
    {
      /* Insert the path before the options. */
-      Word = PrefixString;
+      Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr);
-      Word.append (PathStringPntr, OptionsStringPntr - PathStringPntr);
+      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
      if (!Word.empty ())
        WordSet.insert (Word);
      /* Insert all the options as a word. */
-      Word = PrefixString;
+      Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
-      Word.append (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
+      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
      if (!Word.empty ())
        WordSet.insert (Word);
    }
  }
  return NumberOfBytes;
@ -2171,6 +2188,9 @@ static size_t TokenizerPassGetPlainWords (
  size_t  Length;
  int     Letter;
  if (NumberOfBytes <= 0)
    return 0; /* Nothing to process. */
  if (PrefixCharacter != 0)
    AccumulatedWord = PrefixCharacter;
  EndOfStringPntr = BufferPntr + NumberOfBytes;
@ -2227,8 +2247,9 @@ static size_t TokenizerPassGetPlainWords (
 /* Delete Things from the text.  The Thing is marked by a start string and an
 end string, such as "<!--" and "--> for HTML comment things.  All the text
 between the markers will be added to the word list before it gets deleted from
-the buffer.  The markers must be prepared in lower case.  You can specify an
+the buffer.  The markers must be prepared in lower case and the buffer is
-empty string for the end marker if you're just matching a string constant like
+assumed to have already been converted to lower case.  You can specify an empty
 string for the end marker if you're just matching a string constant like
 "&nbsp;", which you would put in the starting marker.  This is a utility
 function used by other tokenizer functions. */
@ -2265,8 +2286,8 @@ static size_t TokenizerUtilRemoveStartEndThing (
    FoundAndDeletedThing = false;
    if (EndOfStringPntr - InputStringPntr >=
    ThingStartLength + ThingEndLength /* space remains for start + end */ &&
-    tolower (*InputStringPntr) == *ThingStartCode &&
+    *InputStringPntr == *ThingStartCode &&
-    strncasecmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
+    memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
    {
      /* Found the start marker.  Look for the terminating string.  If it is an
      empty string, then we've found it right now! */
@ -2275,8 +2296,8 @@ static size_t TokenizerUtilRemoveStartEndThing (
      while (EndOfStringPntr - ThingEndPntr >= ThingEndLength)
      {
        if (ThingEndLength == 0 ||
-        (tolower (*ThingEndPntr) == *ThingEndCode &&
+        (*ThingEndPntr == *ThingEndCode &&
-        strncasecmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
+        memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
        {
          /* Got the end of the Thing.  First dump the text inbetween the start
          and end markers into the words list. */
@ -2328,6 +2349,21 @@ static size_t TokenizerPassRemoveHTMLStyle (
 }
 /* Convert Japanese periods (a round hollow dot symbol) to spaces so that the
 start of the next sentence is recognised at least as the start of a very long
 word. */
 static size_t TokenizerPassJapanesePeriodsToSpaces (
  char *BufferPntr,
  size_t NumberOfBytes,
  char PrefixCharacter,
  set<string> &WordSet)
 {
  return TokenizerUtilRemoveStartEndThing (BufferPntr,
    NumberOfBytes, PrefixCharacter, WordSet, "。", "", true);
 }
 /* Delete HTML tags from the text.  The contents of the tag are added as words
 before being deleted.  <P>, <BR> and &nbsp; are replaced by spaces at this
 stage while other HTML things get replaced by nothing. */
@ -2752,23 +2788,27 @@ void ABSApp::AddWordsToSet (
  and may add words to the word set. */
  CurrentSize = NumberOfBytes;
-  for (PassNumber = 1; PassNumber <= 7 && CurrentSize > 0 ; PassNumber++)
+  for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++)
  {
    switch (PassNumber)
    {
-      case 1: CurrentSize = TokenizerPassLowerCase (
+      case 1: /* Lowercase first, rest of them assume lower case inputs. */
        CurrentSize = TokenizerPassLowerCase (
          BufferPntr, CurrentSize, PrefixCharacter, WordSet);
        break;
      case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces (
        BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
-      case 2: CurrentSize = TokenizerPassTruncateLongAsianWords (
+      case 3: CurrentSize = TokenizerPassTruncateLongAsianWords (
        BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
-      case 3: CurrentSize = TokenizerPassRemoveHTMLComments (
+      case 4: CurrentSize = TokenizerPassRemoveHTMLComments (
        BufferPntr, CurrentSize, 'Z', WordSet); break;
-      case 4: CurrentSize = TokenizerPassRemoveHTMLStyle (
+      case 5: CurrentSize = TokenizerPassRemoveHTMLStyle (
        BufferPntr, CurrentSize, 'Z', WordSet); break;
-      case 5: CurrentSize = TokenizerPassExtractURLs (
+      case 6: CurrentSize = TokenizerPassExtractURLs (
        BufferPntr, CurrentSize, 'Z', WordSet); break;
-      case 6: CurrentSize = TokenizerPassRemoveHTMLTags (
+      case 7: CurrentSize = TokenizerPassRemoveHTMLTags (
        BufferPntr, CurrentSize, 'Z', WordSet); break;
-      case 7: CurrentSize = TokenizerPassGetPlainWords (
+      case 8: CurrentSize = TokenizerPassGetPlainWords (
        BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
      default: break;
    }
@ -5221,7 +5261,7 @@ status_t ABSApp::TokenizeWhole (
    apparently that isn't all that useful a distinction, so do it. */
    if (Letter >= 'A' && Letter < 'Z')
-      Letter = tolower (Letter);
+      Letter = Letter + ('a' - 'A');
    /* See if it is a letter we treat as white space - all control characters
    and all punctuation except for: apostrophe (so "it's" and possessive