Truncate overly long URLs to the maximum word length. Convert Japanese
periods to spaces so that more "words" are found. Fix UTF-8 comparison problems with tolower() incorrectly converting characters with the high bit set. git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@15194 a95241bf-73f2-0310-859d-f6bbb57e9c96
This commit is contained in:
parent
12aa597dc6
commit
45bfb95c2a
@ -65,6 +65,12 @@
|
||||
*
|
||||
* Revision History (now manually updated due to SVN's philosophy)
|
||||
* $Log: spamdbm.cpp,v $
|
||||
* r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines
|
||||
* Added better tokenization so that HTML is parsed and things like tags
|
||||
* between letters of a word no longer hide that word. After testing, the
|
||||
* result seems to be a tighter spread of ratings when done in full text plus
|
||||
* header mode.
|
||||
*
|
||||
* Revision 1.10 2005/11/24 02:08:39 agmsmith
|
||||
* Fixed up prefix codes, Z for things that are inside other things.
|
||||
*
|
||||
@ -1946,20 +1952,44 @@ static size_t TokenizerPassLowerCase (
|
||||
|
||||
while (BufferPntr < EndOfStringPntr)
|
||||
{
|
||||
if ((unsigned char) *BufferPntr < 128)
|
||||
/* If it is not a UTF-8 character code, convert to lower case ASCII. */
|
||||
*BufferPntr = tolower (*BufferPntr);
|
||||
/* Do our own lower case conversion; tolower () has problems with UTF-8
|
||||
characters that have the high bit set. */
|
||||
|
||||
if (*BufferPntr >= 'A' && *BufferPntr <= 'Z')
|
||||
*BufferPntr = *BufferPntr + ('a' - 'A');
|
||||
BufferPntr++;
|
||||
}
|
||||
return NumberOfBytes;
|
||||
}
|
||||
|
||||
|
||||
/* A utility function for some commonly repeated code. If this was Modula-2,
|
||||
we could use a nested procedure. But it's not. Adds the given word to the set
|
||||
of words, checking for maximum word length and prepending the prefix to the
|
||||
word, which gets modified by this function to reflect the word actually added
|
||||
to the set. */
|
||||
|
||||
static void AddWordAndPrefixToSet (
|
||||
string &Word,
|
||||
const char *PrefixString,
|
||||
set<string> &WordSet)
|
||||
{
|
||||
if (Word.empty ())
|
||||
return;
|
||||
|
||||
if (Word.size () > g_MaxWordLength)
|
||||
Word.resize (g_MaxWordLength);
|
||||
Word.insert (0, PrefixString);
|
||||
WordSet.insert (Word);
|
||||
}
|
||||
|
||||
|
||||
/* Hunt through the text for various URLs and extract the components as
|
||||
separate words. Doesn't affect the text in the buffer. Looks for
|
||||
protocol://user:password@computer:port/path?query=key#anchor strings. Also
|
||||
www.blah strings are detected and broken down. Doesn't do HREF="" strings
|
||||
where the string has a relative path (no host computer name). */
|
||||
where the string has a relative path (no host computer name). Assumes the
|
||||
input buffer is already in lower case. */
|
||||
|
||||
static size_t TokenizerPassExtractURLs (
|
||||
char *BufferPntr,
|
||||
@ -1994,12 +2024,9 @@ static size_t TokenizerPassExtractURLs (
|
||||
while (ProtocolStringPntr > BufferPntr &&
|
||||
isalpha (ProtocolStringPntr[-1]))
|
||||
ProtocolStringPntr--;
|
||||
Word = PrefixString;
|
||||
Word.append (ProtocolStringPntr,
|
||||
Word.assign (ProtocolStringPntr,
|
||||
(InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */);
|
||||
if (!Word.empty ())
|
||||
WordSet.insert (Word);
|
||||
|
||||
AddWordAndPrefixToSet (Word, PrefixString, WordSet);
|
||||
HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */
|
||||
}
|
||||
if (HostStringPntr == NULL)
|
||||
@ -2023,20 +2050,16 @@ static size_t TokenizerPassExtractURLs (
|
||||
if (AtSignStringPntr != NULL)
|
||||
{
|
||||
/* Add a word with the user and password, unseparated. */
|
||||
Word = PrefixString;
|
||||
Word.append (HostStringPntr,
|
||||
Word.assign (HostStringPntr,
|
||||
AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */);
|
||||
if (!Word.empty ())
|
||||
WordSet.insert (Word);
|
||||
AddWordAndPrefixToSet (Word, PrefixString, WordSet);
|
||||
HostStringPntr = AtSignStringPntr + 1;
|
||||
}
|
||||
|
||||
/* Add a word with the computer and port, unseparated. */
|
||||
|
||||
Word = PrefixString;
|
||||
Word.append (HostStringPntr, InputStringPntr - HostStringPntr);
|
||||
if (!Word.empty ())
|
||||
WordSet.insert (Word);
|
||||
Word.assign (HostStringPntr, InputStringPntr - HostStringPntr);
|
||||
AddWordAndPrefixToSet (Word, PrefixString, WordSet);
|
||||
|
||||
/* Now get the path name, not including the extra junk after ? and #
|
||||
separators (they're stored as separate options). Stops at white space or a
|
||||
@ -2056,24 +2079,18 @@ static size_t TokenizerPassExtractURLs (
|
||||
if (OptionsStringPntr == NULL)
|
||||
{
|
||||
/* No options, all path. */
|
||||
Word = PrefixString;
|
||||
Word.append (PathStringPntr, InputStringPntr - PathStringPntr);
|
||||
if (!Word.empty ())
|
||||
WordSet.insert (Word);
|
||||
Word.assign (PathStringPntr, InputStringPntr - PathStringPntr);
|
||||
AddWordAndPrefixToSet (Word, PrefixString, WordSet);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Insert the path before the options. */
|
||||
Word = PrefixString;
|
||||
Word.append (PathStringPntr, OptionsStringPntr - PathStringPntr);
|
||||
if (!Word.empty ())
|
||||
WordSet.insert (Word);
|
||||
Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr);
|
||||
AddWordAndPrefixToSet (Word, PrefixString, WordSet);
|
||||
|
||||
/* Insert all the options as a word. */
|
||||
Word = PrefixString;
|
||||
Word.append (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
|
||||
if (!Word.empty ())
|
||||
WordSet.insert (Word);
|
||||
Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
|
||||
AddWordAndPrefixToSet (Word, PrefixString, WordSet);
|
||||
}
|
||||
}
|
||||
return NumberOfBytes;
|
||||
@ -2171,6 +2188,9 @@ static size_t TokenizerPassGetPlainWords (
|
||||
size_t Length;
|
||||
int Letter;
|
||||
|
||||
if (NumberOfBytes <= 0)
|
||||
return 0; /* Nothing to process. */
|
||||
|
||||
if (PrefixCharacter != 0)
|
||||
AccumulatedWord = PrefixCharacter;
|
||||
EndOfStringPntr = BufferPntr + NumberOfBytes;
|
||||
@ -2227,8 +2247,9 @@ static size_t TokenizerPassGetPlainWords (
|
||||
/* Delete Things from the text. The Thing is marked by a start string and an
|
||||
end string, such as "<!--" and "--> for HTML comment things. All the text
|
||||
between the markers will be added to the word list before it gets deleted from
|
||||
the buffer. The markers must be prepared in lower case. You can specify an
|
||||
empty string for the end marker if you're just matching a string constant like
|
||||
the buffer. The markers must be prepared in lower case and the buffer is
|
||||
assumed to have already been converted to lower case. You can specify an empty
|
||||
string for the end marker if you're just matching a string constant like
|
||||
" ", which you would put in the starting marker. This is a utility
|
||||
function used by other tokenizer functions. */
|
||||
|
||||
@ -2265,8 +2286,8 @@ static size_t TokenizerUtilRemoveStartEndThing (
|
||||
FoundAndDeletedThing = false;
|
||||
if (EndOfStringPntr - InputStringPntr >=
|
||||
ThingStartLength + ThingEndLength /* space remains for start + end */ &&
|
||||
tolower (*InputStringPntr) == *ThingStartCode &&
|
||||
strncasecmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
|
||||
*InputStringPntr == *ThingStartCode &&
|
||||
memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
|
||||
{
|
||||
/* Found the start marker. Look for the terminating string. If it is an
|
||||
empty string, then we've found it right now! */
|
||||
@ -2275,8 +2296,8 @@ static size_t TokenizerUtilRemoveStartEndThing (
|
||||
while (EndOfStringPntr - ThingEndPntr >= ThingEndLength)
|
||||
{
|
||||
if (ThingEndLength == 0 ||
|
||||
(tolower (*ThingEndPntr) == *ThingEndCode &&
|
||||
strncasecmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
|
||||
(*ThingEndPntr == *ThingEndCode &&
|
||||
memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
|
||||
{
|
||||
/* Got the end of the Thing. First dump the text inbetween the start
|
||||
and end markers into the words list. */
|
||||
@ -2328,6 +2349,21 @@ static size_t TokenizerPassRemoveHTMLStyle (
|
||||
}
|
||||
|
||||
|
||||
/* Convert Japanese periods (a round hollow dot symbol) to spaces so that the
|
||||
start of the next sentence is recognised at least as the start of a very long
|
||||
word. */
|
||||
|
||||
static size_t TokenizerPassJapanesePeriodsToSpaces (
|
||||
char *BufferPntr,
|
||||
size_t NumberOfBytes,
|
||||
char PrefixCharacter,
|
||||
set<string> &WordSet)
|
||||
{
|
||||
return TokenizerUtilRemoveStartEndThing (BufferPntr,
|
||||
NumberOfBytes, PrefixCharacter, WordSet, "。", "", true);
|
||||
}
|
||||
|
||||
|
||||
/* Delete HTML tags from the text. The contents of the tag are added as words
|
||||
before being deleted. <P>, <BR> and are replaced by spaces at this
|
||||
stage while other HTML things get replaced by nothing. */
|
||||
@ -2752,23 +2788,27 @@ void ABSApp::AddWordsToSet (
|
||||
and may add words to the word set. */
|
||||
|
||||
CurrentSize = NumberOfBytes;
|
||||
for (PassNumber = 1; PassNumber <= 7 && CurrentSize > 0 ; PassNumber++)
|
||||
for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++)
|
||||
{
|
||||
switch (PassNumber)
|
||||
{
|
||||
case 1: CurrentSize = TokenizerPassLowerCase (
|
||||
case 1: /* Lowercase first, rest of them assume lower case inputs. */
|
||||
CurrentSize = TokenizerPassLowerCase (
|
||||
BufferPntr, CurrentSize, PrefixCharacter, WordSet);
|
||||
break;
|
||||
case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces (
|
||||
BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
|
||||
case 2: CurrentSize = TokenizerPassTruncateLongAsianWords (
|
||||
case 3: CurrentSize = TokenizerPassTruncateLongAsianWords (
|
||||
BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
|
||||
case 3: CurrentSize = TokenizerPassRemoveHTMLComments (
|
||||
case 4: CurrentSize = TokenizerPassRemoveHTMLComments (
|
||||
BufferPntr, CurrentSize, 'Z', WordSet); break;
|
||||
case 4: CurrentSize = TokenizerPassRemoveHTMLStyle (
|
||||
case 5: CurrentSize = TokenizerPassRemoveHTMLStyle (
|
||||
BufferPntr, CurrentSize, 'Z', WordSet); break;
|
||||
case 5: CurrentSize = TokenizerPassExtractURLs (
|
||||
case 6: CurrentSize = TokenizerPassExtractURLs (
|
||||
BufferPntr, CurrentSize, 'Z', WordSet); break;
|
||||
case 6: CurrentSize = TokenizerPassRemoveHTMLTags (
|
||||
case 7: CurrentSize = TokenizerPassRemoveHTMLTags (
|
||||
BufferPntr, CurrentSize, 'Z', WordSet); break;
|
||||
case 7: CurrentSize = TokenizerPassGetPlainWords (
|
||||
case 8: CurrentSize = TokenizerPassGetPlainWords (
|
||||
BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
|
||||
default: break;
|
||||
}
|
||||
@ -5221,7 +5261,7 @@ status_t ABSApp::TokenizeWhole (
|
||||
apparently that isn't all that useful a distinction, so do it. */
|
||||
|
||||
if (Letter >= 'A' && Letter < 'Z')
|
||||
Letter = tolower (Letter);
|
||||
Letter = Letter + ('a' - 'A');
|
||||
|
||||
/* See if it is a letter we treat as white space - all control characters
|
||||
and all punctuation except for: apostrophe (so "it's" and possessive
|
||||
|
Loading…
Reference in New Issue
Block a user