postgres/src/backend/tsearch/wparser_def.c
Tom Lane dbaec70c15 Rename and slightly redefine the default text search parser's "word"
categories, as per discussion.  asciiword (formerly lword) is still
ASCII-letters-only, and numword (formerly word) is still the most general
mixed-alpha-and-digits case.  But word (formerly nlword) is now
any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as
before.  This is no worse than before for parsing mixed Russian/English text,
which seems to have been the design center for the original coding; and it
should simplify matters for parsing most European languages.  In particular
it will not be necessary for any language to accept strings containing digits
as being regular "words".  The hyphenated-word categories are adjusted
similarly.
2007-10-23 20:46:12 +00:00

1867 lines
52 KiB
C

/*-------------------------------------------------------------------------
*
* wparser_def.c
* Default text search parser
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.4 2007/10/23 20:46:12 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "commands/defrem.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
/*
* Output token categories.
*
* The numbering lines up with the entries of tok_alias[] and lex_descr[]
* below (presumably the value is used as an index into both arrays), so
* the three lists must be kept in the same order. Value 0 is not a
* category; both arrays hold an empty string in slot 0. LASTNUM is the
* highest category number.
*/
#define ASCIIWORD 1
#define WORD_T 2
#define NUMWORD 3
#define EMAIL 4
#define URL_T 5
#define HOST 6
#define SCIENTIFIC 7
#define VERSIONNUMBER 8
#define NUMPARTHWORD 9
#define PARTHWORD 10
#define ASCIIPARTHWORD 11
#define SPACE 12
#define TAG_T 13
#define PROTOCOL 14
#define NUMHWORD 15
#define ASCIIHWORD 16
#define HWORD 17
#define URI 18
#define FILEPATH 19
#define DECIMAL 20
#define SIGNEDINT 21
#define UNSIGNEDINT 22
#define HTMLENTITY 23
#define LASTNUM 23
/*
* Short alias of each token category, listed in category-number order
* (slot 0 is an unused placeholder, slot 1 is ASCIIWORD, ... slot 23 is
* HTMLENTITY). Must stay in step with the category #defines and with
* lex_descr[].
*/
static const char * const tok_alias[] = {
"",
"asciiword",
"word",
"numword",
"email",
"url",
"host",
"sfloat",
"version",
"hword_numpart",
"hword_part",
"hword_asciipart",
"blank",
"tag",
"protocol",
"numhword",
"asciihword",
"hword",
"uri",
"file",
"float",
"int",
"uint",
"entity"
};
/*
* Human-readable description of each token category, listed in
* category-number order (slot 0 is an unused placeholder). Must stay in
* step with the category #defines and with tok_alias[].
*/
static const char * const lex_descr[] = {
"",
"Word, all ASCII",
"Word, all letters",
"Word, letters and digits",
"Email address",
"URL",
"Host",
"Scientific notation",
"Version number",
"Hyphenated word part, letters and digits",
"Hyphenated word part, all letters",
"Hyphenated word part, all ASCII",
"Space symbols",
"HTML tag",
"Protocol head",
"Hyphenated word, letters and digits",
"Hyphenated word, all ASCII",
"Hyphenated word, all letters",
"URI",
"File or path name",
"Decimal notation",
"Signed integer",
"Unsigned integer",
"HTML entity"
};
/*
* Parser states.
*
* Every state except TPS_Null has an action table named actionTPS_<state>.
* TPS_Base is explicitly 0 and the remaining values follow in declaration
* order, so do not reorder entries without checking every user of the
* numeric values. TPS_Null is a sentinel, not a real state: in the action
* tables it appears as the target of entries that name no new state.
*/
typedef enum
{
/* start state and plain words */
TPS_Base = 0,
TPS_InNumWord,
TPS_InAsciiWord,
TPS_InWord,
/* numbers */
TPS_InUnsignedInt,
TPS_InSignedIntFirst,
TPS_InSignedInt,
TPS_InSpace,
TPS_InUDecimalFirst,
TPS_InUDecimal,
TPS_InDecimalFirst,
TPS_InDecimal,
TPS_InVerVersion,
TPS_InSVerVersion,
TPS_InVersionFirst,
TPS_InVersion,
TPS_InMantissaFirst,
TPS_InMantissaSign,
TPS_InMantissa,
/* HTML entities */
TPS_InHTMLEntityFirst,
TPS_InHTMLEntity,
TPS_InHTMLEntityNumFirst,
TPS_InHTMLEntityNum,
TPS_InHTMLEntityEnd,
/* tags */
TPS_InTagFirst,
TPS_InXMLBegin,
TPS_InTagCloseFirst,
TPS_InTagName,
TPS_InTagBeginEnd,
TPS_InTag,
TPS_InTagEscapeK,
TPS_InTagEscapeKK,
TPS_InTagBackSleshed,
TPS_InTagEnd,
/* comments */
TPS_InCommentFirst,
TPS_InCommentLast,
TPS_InComment,
TPS_InCloseCommentFirst,
TPS_InCloseCommentLast,
TPS_InCommentEnd,
/* hosts and email */
TPS_InHostFirstDomain,
TPS_InHostDomainSecond,
TPS_InHostDomain,
TPS_InPortFirst,
TPS_InPort,
TPS_InHostFirstAN,
TPS_InHost,
TPS_InEmail,
/* files, paths, URIs, URLs, protocols */
TPS_InFileFirst,
TPS_InFileTwiddle,
TPS_InPathFirst,
TPS_InPathFirstFirst,
TPS_InPathSecond,
TPS_InFile,
TPS_InFileNext,
TPS_InURIFirst,
TPS_InURIStart,
TPS_InURI,
TPS_InFURL,
TPS_InProtocolFirst,
TPS_InProtocolSecond,
TPS_InProtocolEnd,
/* hyphenated words and their parts */
TPS_InHyphenAsciiWordFirst,
TPS_InHyphenAsciiWord,
TPS_InHyphenWordFirst,
TPS_InHyphenWord,
TPS_InHyphenNumWordFirst,
TPS_InHyphenNumWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
TPS_InHyphenWordPart,
TPS_InHyphenAsciiWordPart,
TPS_InHyphenNumWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
TPS_InHVersionPartFirst,
TPS_InHVersionPart,
TPS_Null /* last state (fake value) */
} TParserState;
/* forward declaration, needed by the callback typedefs below */
struct TParser;
/*
* Character test callback: any of the p_is* functions except p_iseq,
* which takes an extra char argument and so has a different signature.
*/
typedef int (*TParserCharTest) (struct TParser *);
/*
* Hook for special cases; stored in the "special" field of an action
* item (e.g. SpecialTags, SpecialHyphen).
*/
typedef void (*TParserSpecial) (struct TParser *);
/*
* One entry of a state's action table.
*/
typedef struct
{
TParserCharTest isclass; /* test applied to the current character; NULL
* in the catch-all entry that ends each table */
char c; /* character that goes with p_iseqC/p_isneC
* tests (those compare against TParser.c —
* presumably this value is copied there
* first; confirm in TParserGet) */
uint16 flags; /* A_* flag bits */
TParserState tostate; /* state named by this entry; TPS_Null where
* the entry names no new state */
int type; /* output token category; nonzero only on
* entries flagged A_BINGO in the tables here */
TParserSpecial special; /* optional special-case hook, or NULL */
} TParserStateActionItem;
/*
* Flag bits in TParserStateActionItem.flags
*
* NOTE(review): the code that interprets these flags (TParserGet) is not
* shown here, so the notes below are inferred from the flag names and from
* how the tables use them — confirm against TParserGet.
*
* A_NEXT is 0, i.e. "no flag set"; it cannot be tested with a bit mask.
*/
#define A_NEXT 0x0000 /* presumably: consume the character and continue */
#define A_BINGO 0x0001 /* entry carries a token type: a token is complete */
#define A_POP 0x0002 /* presumably: fall back to a saved position */
#define A_PUSH 0x0004 /* presumably: save the position before trying tostate */
#define A_RERUN 0x0008 /* presumably: re-examine the current character */
#define A_CLEAR 0x0010 /* presumably: discard the most recent saved position */
#define A_MERGE 0x0020 /* presumably: fold the current position into the saved one */
#define A_CLRALL 0x0040 /* presumably: discard all saved positions */
/*
* Associates a parser state with its table of action items.
*/
typedef struct
{
TParserState state; /* the state this entry describes */
TParserStateActionItem *action; /* that state's action table */
} TParserStateAction;
/*
* A parser position. Positions are chained through "prev"; a new position
* created by newTParserPosition() starts as a copy of its predecessor.
*/
typedef struct TParserPosition
{
int posbyte; /* position of parser in bytes */
int poschar; /* position of parser in characters */
int charlen; /* length of current char */
int lenbytelexeme; /* length in bytes of the lexeme collected so far */
int lencharlexeme; /* same length, counted in characters */
TParserState state; /* parser state at this position */
struct TParserPosition *prev; /* previous position in the chain, or NULL */
int flags; /* presumably the A_* flags of the last action taken */
TParserStateActionItem *pushedAtAction; /* reset to NULL on creation;
* presumably the action item
* that caused a push */
} TParserPosition;
/*
* Complete state of one parser instance; created by TParserInit() and
* released by TParserClose().
*/
typedef struct TParser
{
/* string and position information */
char *str; /* multibyte string */
int lenstr; /* length of mbstring */
#ifdef TS_USE_WIDE
wchar_t *wstr; /* wide character string */
int lenwstr; /* length of wstring */
#endif
/* State of parse */
int charmaxlen; /* max bytes per character in the database encoding */
bool usewide; /* true if wstr is used for character tests */
TParserPosition *state; /* current position (head of the position chain) */
bool ignore; /* set/cleared by SpecialTags for script/style tags */
bool wanthost; /* set by SpecialFURL, consumed by p_isstophost */
/* character that p_iseqC/p_isneC compare the current character with */
char c;
/* out */
char *lexeme;
int lenbytelexeme;
int lencharlexeme;
int type;
} TParser;
/* forward declaration: TParserGet is called by p_ishost() and p_isURI() below */
static bool TParserGet(TParser * prs);
/*
 * Allocate a parser position and link it in front of "prev".
 *
 * With a non-NULL prev the new position starts as a copy of it; otherwise
 * it starts zero-filled. Either way the prev link is set to the argument
 * and pushedAtAction is reset to NULL.
 */
static TParserPosition *
newTParserPosition(TParserPosition * prev)
{
	TParserPosition *pos;

	pos = (TParserPosition *) palloc(sizeof(TParserPosition));

	if (prev == NULL)
		memset(pos, 0, sizeof(TParserPosition));
	else
		memcpy(pos, prev, sizeof(TParserPosition));

	pos->pushedAtAction = NULL;
	pos->prev = prev;

	return pos;
}
static TParser *
TParserInit(char *str, int len)
{
TParser *prs = (TParser *) palloc0(sizeof(TParser));
prs->charmaxlen = pg_database_encoding_max_length();
prs->str = str;
prs->lenstr = len;
#ifdef TS_USE_WIDE
/*
* Use wide char code only when max encoding length > 1.
*/
if (prs->charmaxlen > 1)
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
#endif
prs->usewide = false;
prs->state = newTParserPosition(NULL);
prs->state->state = TPS_Base;
return prs;
}
/*
 * Release a parser: every position in its chain, the wide character copy
 * of the string (if one was made), and the parser itself. The input string
 * is not freed.
 */
static void
TParserClose(TParser * prs)
{
	TParserPosition *pos = prs->state;

	while (pos != NULL)
	{
		TParserPosition *older = pos->prev;

		pfree(pos);
		pos = older;
	}
	prs->state = NULL;

#ifdef TS_USE_WIDE
	if (prs->wstr != NULL)
		pfree(prs->wstr);
#endif

	pfree(prs);
}
/*
* Character-type support functions, equivalent to the is* macros, but
* working with any possible encoding and locale. Note that with a
* multibyte encoding and the C locale, the isw* functions may fail
* or give wrong results. Note 2: a multibyte encoding combined with
* the C locale is often used for Asian languages.
*/
#ifdef TS_USE_WIDE
/*
 * p_iswhat(type) defines p_is<type>() and p_isnot<type>(), which classify
 * the character at the current parser position with is<type>() or
 * isw<type>().
 *
 * Wide mode, C locale: the isw* functions cannot be trusted, so only ASCII
 * code points are handed to is<type>(); any code point above 0x7f is
 * reported as not belonging to the class. (Masking the code point with 0xff
 * instead would fold unrelated characters onto ASCII, e.g. U+0431 onto '1'
 * or U+0420 onto ' '.)
 *
 * Wide mode, other locales: the wide character is converted to wint_t by
 * value. Reading it through a wint_t pointer is wrong wherever wint_t and
 * wchar_t differ in size or type.
 *
 * Non-wide mode: the current byte is classified as an unsigned char.
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	if ( prs->usewide ) \
	{ \
		wchar_t wc = *( prs->wstr + prs->state->poschar ); \
		if ( lc_ctype_is_c() ) \
		{ \
			unsigned int c = (unsigned int) wc; \
			if ( c > 0x7f ) \
				return 0; \
			return is##type( c ); \
		} \
 \
		return isw##type( (wint_t) wc ); \
	} \
 \
	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}
/*
 * Is the current character alphanumeric?
 *
 * Non-wide mode classifies the current byte with isalnum(). Wide mode uses
 * iswalnum(), except under the C locale, where every non-ASCII code point
 * counts as a letter and ASCII ones go through isalnum().
 */
static int
p_isalnum(TParser * prs)
{
	unsigned int wc;

	Assert(prs->state);

	if (!prs->usewide)
		return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));

	if (!lc_ctype_is_c())
		return iswalnum((wint_t) prs->wstr[prs->state->poschar]);

	wc = prs->wstr[prs->state->poschar];

	/*
	 * any non-ascii symbol with multibyte encoding with C-locale is an
	 * alpha character
	 */
	return (wc > 0x7f) ? 1 : isalnum(0xff & wc);
}
/* Negation of p_isalnum(): 1 if the current character is not alphanumeric. */
static int
p_isnotalnum(TParser * prs)
{
	return p_isalnum(prs) ? 0 : 1;
}
/*
 * Is the current character a letter?
 *
 * Non-wide mode classifies the current byte with isalpha(). Wide mode uses
 * iswalpha(), except under the C locale, where every non-ASCII code point
 * counts as a letter and ASCII ones go through isalpha().
 */
static int
p_isalpha(TParser * prs)
{
	unsigned int wc;

	Assert(prs->state);

	if (!prs->usewide)
		return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));

	if (!lc_ctype_is_c())
		return iswalpha((wint_t) prs->wstr[prs->state->poschar]);

	wc = prs->wstr[prs->state->poschar];

	/*
	 * any non-ascii symbol with multibyte encoding with C-locale is an
	 * alpha character
	 */
	return (wc > 0x7f) ? 1 : isalpha(0xff & wc);
}
/* Negation of p_isalpha(): 1 if the current character is not a letter. */
static int
p_isnotalpha(TParser * prs)
{
	return p_isalpha(prs) ? 0 : 1;
}
/*
 * Does the current character equal c?  Only meant for ASCII c: a character
 * whose recorded length is not exactly one byte never matches.
 */
static int
p_iseq(TParser * prs, char c)
{
	Assert(prs->state);

	if (prs->state->charlen != 1)
		return 0;

	return (prs->str[prs->state->posbyte] == c) ? 1 : 0;
}
#else /* TS_USE_WIDE */
/*
 * Build without wide character support: p_iswhat(type) defines
 * p_is<type>(), which classifies the byte at the current position with
 * is<type>(), and p_isnot<type>(), its negation.
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	unsigned char ch; \
 \
	Assert( prs->state ); \
	ch = (unsigned char) prs->str[prs->state->posbyte]; \
	return is##type( ch ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return p_is##type(prs) ? 0 : 1; \
}
/* Does the byte at the current position equal c?  Returns 1 or 0. */
static int
p_iseq(TParser * prs, char c)
{
	Assert(prs->state);

	if (prs->str[prs->state->posbyte] == c)
		return 1;

	return 0;
}
/*
* Without TS_USE_WIDE, the alnum/alpha tests come from the generic macro;
* the wide build defines p_isalnum/p_isalpha by hand instead.
*/
p_iswhat(alnum)
p_iswhat(alpha)
#endif /* TS_USE_WIDE */
/*
* Tests generated in both build variants; each line defines p_is<type>()
* and p_isnot<type>() using whichever p_iswhat definition is active.
*/
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
p_iswhat(punct)
p_iswhat(space)
p_iswhat(upper)
p_iswhat(xdigit)
/*
 * End-of-input test: true when the byte position has reached the end of
 * the string, or when the current character has length zero.
 */
static int
p_isEOF(TParser * prs)
{
	Assert(prs->state);

	if (prs->state->posbyte == prs->lenstr)
		return 1;

	return (prs->state->charlen == 0) ? 1 : 0;
}
/* Does the current character equal the comparison character prs->c? */
static int
p_iseqC(TParser * prs)
{
	char		wanted = prs->c;

	return p_iseq(prs, wanted);
}
/* Does the current character differ from the comparison character prs->c? */
static int
p_isneC(TParser * prs)
{
	char		wanted = prs->c;

	return p_iseq(prs, wanted) ? 0 : 1;
}
/*
 * Is the current character ASCII?  It must have a recorded length of one
 * byte and that byte must satisfy isascii().
 */
static int
p_isascii(TParser * prs)
{
	if (prs->state->charlen != 1)
		return 0;

	return isascii((unsigned char) prs->str[prs->state->posbyte]) ? 1 : 0;
}
/* Is the current character an ASCII letter (both ASCII and alphabetic)? */
static int
p_isasclet(TParser * prs)
{
	if (!p_isascii(prs))
		return 0;

	return p_isalpha(prs) ? 1 : 0;
}
/*
 * Reference every generated character test once so that compilers do not
 * complain about unused static functions. This function exists only for
 * that purpose: every call passes a NULL parser, so it must never actually
 * be executed.
 */
void		_make_compiler_happy(void);

void
_make_compiler_happy(void)
{
	(void) p_isalnum(NULL);
	(void) p_isnotalnum(NULL);
	(void) p_isalpha(NULL);
	(void) p_isnotalpha(NULL);
	(void) p_isdigit(NULL);
	(void) p_isnotdigit(NULL);
	(void) p_islower(NULL);
	(void) p_isnotlower(NULL);
	(void) p_isprint(NULL);
	(void) p_isnotprint(NULL);
	(void) p_ispunct(NULL);
	(void) p_isnotpunct(NULL);
	(void) p_isspace(NULL);
	(void) p_isnotspace(NULL);
	(void) p_isupper(NULL);
	(void) p_isnotupper(NULL);
	(void) p_isxdigit(NULL);
	(void) p_isnotxdigit(NULL);
	(void) p_isEOF(NULL);
	(void) p_iseqC(NULL);
	(void) p_isneC(NULL);
}
/*
 * Special handler used by the tag states: switch the parser's "ignore"
 * flag on at an opening <script or <style tag and off at the matching
 * closing tag. The text collected so far is compared case-insensitively,
 * and only when its character length equals that of the candidate name.
 */
static void
SpecialTags(TParser * prs)
{
	const int	taglen = prs->state->lencharlexeme;
	const char *tag = prs->lexeme;

	if (taglen == 8)
	{
		/* </script */
		if (pg_strncasecmp(tag, "</script", 8) == 0)
			prs->ignore = false;
	}
	else if (taglen == 7)
	{
		/* <script || </style */
		if (pg_strncasecmp(tag, "</style", 7) == 0)
			prs->ignore = false;
		else if (pg_strncasecmp(tag, "<script", 7) == 0)
			prs->ignore = true;
	}
	else if (taglen == 6)
	{
		/* <style */
		if (pg_strncasecmp(tag, "<style", 6) == 0)
			prs->ignore = true;
	}
}
/*
 * Special handler for URL recognition: raise the wanthost flag (consumed
 * later by p_isstophost) and move the current position back by the length
 * of the lexeme collected so far, in both bytes and characters.
 */
static void
SpecialFURL(TParser * prs)
{
	TParserPosition *pos = prs->state;

	prs->wanthost = true;
	pos->posbyte = pos->posbyte - pos->lenbytelexeme;
	pos->poschar = pos->poschar - pos->lencharlexeme;
}
/*
 * Special handler for hyphenated words: move the current position back by
 * the length of the lexeme collected so far, in both bytes and characters.
 * The recorded lexeme lengths themselves are left untouched.
 */
static void
SpecialHyphen(TParser * prs)
{
	TParserPosition *pos = prs->state;

	pos->posbyte = pos->posbyte - pos->lenbytelexeme;
	pos->poschar = pos->poschar - pos->lencharlexeme;
}
/*
 * Special handler for TPS_InVerVersion: move the current position back by
 * the length of the lexeme collected so far and then reset both lexeme
 * lengths to zero.
 */
static void
SpecialVerVersion(TParser * prs)
{
	TParserPosition *pos = prs->state;

	pos->posbyte = pos->posbyte - pos->lenbytelexeme;
	pos->poschar = pos->poschar - pos->lencharlexeme;

	pos->lencharlexeme = 0;
	pos->lenbytelexeme = 0;
}
/*
 * One-shot test: returns 1 if the wanthost flag is raised, and lowers the
 * flag in the process; returns 0 otherwise.
 */
static int
p_isstophost(TParser * prs)
{
	int			stop = prs->wanthost ? 1 : 0;

	prs->wanthost = false;

	return stop;
}
/* Returns 1 while the parser's ignore flag is set (see SpecialTags), else 0. */
static int
p_isignore(TParser * prs)
{
	if (prs->ignore)
		return 1;

	return 0;
}
/*
 * Does a host name start at the current position?
 *
 * A temporary parser is run over the rest of the input. If its first token
 * is of type HOST, the current position and lexeme lengths are advanced
 * over that token, the current character length is taken from the
 * temporary parser, and 1 is returned. Otherwise nothing is changed and
 * 0 is returned. The temporary parser is always released.
 */
static int
p_ishost(TParser * prs)
{
	TParserPosition *pos = prs->state;
	TParser    *sub = TParserInit(prs->str + pos->posbyte,
								  prs->lenstr - pos->posbyte);

	if (!TParserGet(sub) || sub->type != HOST)
	{
		TParserClose(sub);
		return 0;
	}

	pos->posbyte += sub->lenbytelexeme;
	pos->poschar += sub->lencharlexeme;
	pos->lenbytelexeme += sub->lenbytelexeme;
	pos->lencharlexeme += sub->lencharlexeme;
	pos->charlen = sub->state->charlen;

	TParserClose(sub);
	return 1;
}
/*
 * Does a URI or file path start at the current position?
 *
 * A temporary parser is run over the rest of the input, with an extra
 * position in state TPS_InFileFirst placed on top of its initial one. If
 * its first token is of type URI or FILEPATH, the current position and
 * lexeme lengths are advanced over that token, the current character
 * length is taken from the temporary parser, and 1 is returned. Otherwise
 * nothing is changed and 0 is returned. The temporary parser is always
 * released.
 */
static int
p_isURI(TParser * prs)
{
	TParserPosition *pos = prs->state;
	TParser    *sub = TParserInit(prs->str + pos->posbyte,
								  prs->lenstr - pos->posbyte);
	int			matched = 0;

	sub->state = newTParserPosition(sub->state);
	sub->state->state = TPS_InFileFirst;

	if (TParserGet(sub))
	{
		if (sub->type == URI || sub->type == FILEPATH)
		{
			pos->posbyte += sub->lenbytelexeme;
			pos->poschar += sub->lencharlexeme;
			pos->lenbytelexeme += sub->lenbytelexeme;
			pos->lencharlexeme += sub->lencharlexeme;
			pos->charlen = sub->state->charlen;
			matched = 1;
		}
	}

	TParserClose(sub);
	return matched;
}
/*
* Table of state/action of parser
*
* Each state has one array of TParserStateActionItem. An item reads
* {test, char for p_iseqC, A_* flags, target state, token type, special hook}.
* Every array ends with an item whose test is NULL, which acts as the
* catch-all. Item order is significant (presumably the first item whose
* test succeeds is taken — confirm in TParserGet), so do not reorder.
*/
/*
* TPS_Base: start of a new token. Dispatches on the first character:
* '<' tries a tag, ASCII letters / other letters / digits start the word
* and integer states, '-' and '+' try a signed integer, '&' an HTML entity,
* '~', '/' and '.' the file/path states; anything else (and everything
* while the ignore flag is set) goes to TPS_InSpace.
*/
static TParserStateActionItem actionTPS_Base[] = {
{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL},
{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
{NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
};
/*
* TPS_InNumWord: inside a word of letters and digits. Alphanumerics keep
* the state; '@', '/', '.' and '-' are tried (A_PUSH) as email, file path,
* file name continuation and hyphenated word; anything else, or end of
* input, completes a NUMWORD token.
*/
static TParserStateActionItem actionTPS_InNumWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
};
/*
* TPS_InAsciiWord: inside a word of ASCII letters only. '.' and '-' each
* have two A_PUSH alternatives (host first, then file name / hyphenated
* word); '@', ':' and '/' try email, protocol and file path. A digit first
* tries a host (A_PUSH) and otherwise turns the word into a NUMWORD
* candidate; a non-ASCII letter turns it into a WORD_T candidate. Anything
* else, or end of input, completes an ASCIIWORD token.
*/
static TParserStateActionItem actionTPS_InAsciiWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
};
/*
* TPS_InWord: inside a word of letters, not all of them ASCII. Letters
* keep the state, a digit switches to TPS_InNumWord, '-' tries a
* hyphenated word; anything else, or end of input, completes a WORD_T
* token.
*/
static TParserStateActionItem actionTPS_InWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
{p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
};
/*
* TPS_InUnsignedInt: inside a run of digits without sign. '.' is tried
* first as a host name and then as a decimal fraction; 'e'/'E' try an
* exponent; an ASCII letter tries a host and otherwise (any letter) the
* token becomes a NUMWORD candidate; '/' tries a file path. Anything else,
* or end of input, completes an UNSIGNEDINT token.
*/
static TParserStateActionItem actionTPS_InUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
};
/*
* TPS_InSignedIntFirst: just after a leading '+' or '-'. Only a digit is
* accepted (A_NEXT | A_CLEAR into TPS_InSignedInt); otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InSignedIntFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InSignedInt: inside the digits of a signed integer. '.' tries a
* decimal fraction and 'e'/'E' an exponent; anything else, or end of
* input, completes a SIGNEDINT token.
*/
static TParserStateActionItem actionTPS_InSignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
};
/*
* TPS_InSpace: inside a run of non-token characters. '<' always completes
* the SPACE token. While the ignore flag is set every other character is
* consumed. Otherwise '-', '+', '&' and '/' complete the SPACE token (they
* may start a token in TPS_Base), other non-alphanumerics are consumed,
* and an alphanumeric or end of input completes the SPACE token.
*/
static TParserStateActionItem actionTPS_InSpace[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
{p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
{p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
{p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
{p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
{p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
{p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
};
/*
* TPS_InUDecimalFirst: just after the '.' that follows an unsigned
* integer. A digit is required (A_CLEAR into TPS_InUDecimal); otherwise
* A_POP.
*/
static TParserStateActionItem actionTPS_InUDecimalFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InUDecimal: inside the fraction digits of an unsigned decimal. A
* further '.' tries a version number, 'e'/'E' an exponent; anything else,
* or end of input, completes a DECIMAL token.
*/
static TParserStateActionItem actionTPS_InUDecimal[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
{p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
};
/*
* TPS_InDecimalFirst: just after the '.' that follows a signed integer.
* A digit is required (A_CLEAR into TPS_InDecimal); otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InDecimalFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InDecimal: inside the fraction digits of a signed decimal. A further
* '.' goes to TPS_InVerVersion (signed values are handled differently from
* the unsigned case, which goes to TPS_InVersionFirst); 'e'/'E' try an
* exponent; anything else, or end of input, completes a DECIMAL token.
*/
static TParserStateActionItem actionTPS_InDecimal[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
{p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
};
/*
* TPS_InVerVersion: after "sign digits . digits ." when a digit follows.
* SpecialVerVersion moves the position back by the collected lexeme and
* zeroes its length, then the input is re-run (A_RERUN) in
* TPS_InSVerVersion. Without a digit: A_POP.
*/
static TParserStateActionItem actionTPS_InVerVersion[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InSVerVersion: consumes characters until a digit is seen, then
* completes a SPACE token and continues in TPS_InUnsignedInt. Presumably
* this emits the leading sign as blank so that the rest is parsed as an
* unsigned version number — confirm against TParserGet.
*/
static TParserStateActionItem actionTPS_InSVerVersion[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
};
/*
* TPS_InVersionFirst: just after a '.' inside a version number. A digit
* is required (A_CLEAR into TPS_InVersion); otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InVersionFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InVersion: inside a digit group of a version number. '.' tries one
* more group; anything else, or end of input, completes a VERSIONNUMBER
* token.
*/
static TParserStateActionItem actionTPS_InVersion[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
{p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
};
/*
* TPS_InMantissaFirst: just after the 'e'/'E' of scientific notation.
* Accepts a digit (A_CLEAR into TPS_InMantissa) or a '+'/'-' sign;
* otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InMantissaFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
{p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InMantissaSign: just after the exponent sign. A digit is required
* (A_CLEAR into TPS_InMantissa); otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InMantissaSign[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InMantissa: inside the exponent digits. Anything but a digit, or
* end of input, completes a SCIENTIFIC token.
*/
static TParserStateActionItem actionTPS_InMantissa[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
{p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
};
/*
* TPS_InHTMLEntityFirst: just after '&'. '#' starts a numeric entity, an
* ASCII letter starts a named entity; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InHTMLEntity: inside the name of a named entity (ASCII letters
* only). ';' ends it; anything else, or end of input, is A_POP.
*/
static TParserStateActionItem actionTPS_InHTMLEntity[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InHTMLEntityNumFirst: just after "&#". A decimal digit is required;
* otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InHTMLEntityNum: inside the digits of a numeric entity. ';' ends it;
* anything else, or end of input, is A_POP.
*/
static TParserStateActionItem actionTPS_InHTMLEntityNum[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InHTMLEntityEnd: reached after the closing ';'. Unconditionally
* completes an HTMLENTITY token.
*/
static TParserStateActionItem actionTPS_InHTMLEntityEnd[] = {
{NULL, 0, A_BINGO | A_CLEAR, TPS_Base, HTMLENTITY, NULL}
};
/*
* TPS_InTagFirst: just after '<'. '/' tries a closing tag, '!' a comment
* or declaration, '?' an XML prolog, an ASCII letter a tag name; otherwise
* A_POP.
*/
static TParserStateActionItem actionTPS_InTagFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
{p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InXMLBegin: just after "<?". Only 'x' or 'X' is accepted, after
* which the rest is handled as ordinary tag content.
*/
static TParserStateActionItem actionTPS_InXMLBegin[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
/* <?xml ... */
{p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
{p_iseqC, 'X', A_NEXT, TPS_InTag, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InTagCloseFirst: just after "</". An ASCII letter is required;
* otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InTagName: inside a tag name (ASCII letters). '>' and whitespace end
* the name and run SpecialTags, which inspects the name for script/style;
* '/' expects an immediately following '>'. Anything else is A_POP.
*/
static TParserStateActionItem actionTPS_InTagName[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
/* <br/> case */
{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InTagBeginEnd: just after the '/' of a self-closing tag. Only '>'
* is accepted.
*/
static TParserStateActionItem actionTPS_InTagBeginEnd[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InTag: inside a tag, after its name. '>' ends the tag, single and
* double quotes open a quoted attribute value, and a fixed set of
* characters (ASCII letters, digits, whitespace and = - # / : . & ? % ~)
* is allowed unquoted. Any other character, or end of input, is A_POP,
* i.e. the text is not accepted as a tag. SpecialTags runs on '>' and on
* whitespace.
*/
static TParserStateActionItem actionTPS_InTag[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
{p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InTagEscapeK: inside a single-quoted attribute value. A backslash
* goes (A_PUSH) to TPS_InTagBackSleshed, the closing quote returns to
* TPS_InTag, every other character is consumed.
*/
static TParserStateActionItem actionTPS_InTagEscapeK[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
{p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
{NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
};
/*
* TPS_InTagEscapeKK: same as TPS_InTagEscapeK, for a double-quoted
* attribute value.
*/
static TParserStateActionItem actionTPS_InTagEscapeKK[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
{p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
{NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
};
/*
* TPS_InTagBackSleshed: just after a backslash inside a quoted value. Any
* character is accepted with A_MERGE (presumably the escaped character is
* swallowed and parsing resumes in the quoted-value state that pushed —
* confirm in TParserGet).
*/
static TParserStateActionItem actionTPS_InTagBackSleshed[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{NULL, 0, A_MERGE, TPS_Null, 0, NULL}
};
/*
* TPS_InTagEnd: reached after the closing '>'. Unconditionally completes
* a TAG_T token.
*/
static TParserStateActionItem actionTPS_InTagEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
};
/*
* TPS_InCommentFirst: just after "<!". '-' starts a comment opener; 'D' or
* 'd' is handled as ordinary tag content; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InCommentFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
/* <!DOCTYPE ...> */
{p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
{p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InCommentLast: after "<!-". A second '-' is required to enter the
* comment body; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InCommentLast[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InComment: inside the comment body. '-' may start the closing
* sequence; every other character is consumed. End of input is A_POP.
*/
static TParserStateActionItem actionTPS_InComment[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
};
/*
* TPS_InCloseCommentFirst: one '-' seen inside the body. A second '-'
* moves on; anything else returns to the comment body.
*/
static TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
};
/*
* TPS_InCloseCommentLast: "--" seen inside the body. Further '-' keep the
* state, '>' ends the comment, anything else returns to the comment body.
*/
static TParserStateActionItem actionTPS_InCloseCommentLast[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
};
/*
* TPS_InCommentEnd: reached after the closing "-->". Unconditionally
* completes a TAG_T token (comments are reported with the tag category).
*/
static TParserStateActionItem actionTPS_InCommentEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
};
/*
* TPS_InHostFirstDomain: just after a '.' in a host name candidate. An
* ASCII letter starts a domain label, a digit continues as a generic host
* part; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InHostFirstDomain[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InHostDomainSecond: one ASCII letter of a domain label seen. A
* second letter makes it a possible final label (TPS_InHostDomain); a
* digit, '-', '.' or '@' try (A_PUSH) the corresponding continuation.
* Unlike TPS_InHostDomain, no token can be completed from here: anything
* else, or end of input, is A_POP.
*/
static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InHostDomain: inside a domain label of at least two ASCII letters.
* Letters keep the state; a digit, ':', '-', '.' and '@' try (A_PUSH) the
* generic host, port, label continuation, next label and email states.
* The second p_isdigit entry (A_POP) is the fallback for a digit whose
* pushed attempt did not succeed. If p_isstophost reports a pending
* wanthost request, a HOST token is completed and parsing continues in
* TPS_InURIStart; otherwise '/' tries a full URL. Anything else, or end of
* input, completes a HOST token.
*/
static TParserStateActionItem actionTPS_InHostDomain[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURIStart, HOST, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
};
/*
* TPS_InPortFirst: just after the ':' following a host name. A digit is
* required; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InPortFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InPort: inside the digits of a port number. If p_isstophost reports
* a pending wanthost request, a HOST token is completed and parsing
* continues in TPS_InURIStart; otherwise '/' tries a full URL. Anything
* else, or end of input, completes a HOST token (the port is part of it).
*/
static TParserStateActionItem actionTPS_InPort[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURIStart, HOST, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
};
/*
* TPS_InHostFirstAN: just after a '-' in a host name candidate. A digit
* or an ASCII letter is required; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InHostFirstAN[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InHost: inside a host name part made of digits and ASCII letters.
* '@', '.' and '-' try (A_PUSH) the email, domain and continuation states.
* No token is completed from this state: anything else, or end of input,
* is A_POP.
*/
static TParserStateActionItem actionTPS_InHost[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InEmail: just after '@'. p_ishost runs a temporary parser on the
* rest of the input; if a host name follows, p_ishost advances over it and
* an EMAIL token is completed. Otherwise A_POP. Note that there is no
* p_isEOF entry in this table.
*/
static TParserStateActionItem actionTPS_InEmail[] = {
{p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InFileFirst: at the first character of a path component (entered
* after '/' from several states, and used as the start state by p_isURI).
* An ASCII letter, digit or '_' starts a file name, '.' a dot component,
* '?' tries a URI query and '~' a twiddle component; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InFileFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InURIFirst, 0, NULL},
{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InFileTwiddle: just after '~'. An ASCII letter, digit or '_' starts
* a file name and '/' starts the next path component; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InFileTwiddle[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InPathFirst: just after a '.' that begins a path component. An ASCII
* letter, digit or '_' makes it a dot-file name, a second '.' goes to
* TPS_InPathSecond, '/' starts the next component; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InPathFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InPathFirstFirst: just after a '.' that begins a token in TPS_Base.
* Only a second '.' or a '/' is accepted here (a dot-file name is not);
* otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InPathFirstFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InPathSecond: just after "..". '/' first tries (A_NEXT | A_PUSH) to
* continue with another path component and otherwise completes a FILEPATH
* token, as do whitespace and end of input. Anything else is A_POP.
*/
static TParserStateActionItem actionTPS_InPathSecond[] = {
{p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
{p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
{p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InFile: inside a file name. ASCII letters, digits, '_' and '-' keep
* the state; '.', '/' and '?' try (A_PUSH) an extension, a further path
* component and a URI query. Anything else, or end of input, completes a
* FILEPATH token.
*/
static TParserStateActionItem actionTPS_InFile[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InURIFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
};
/*
* TPS_InFileNext: just after a '.' inside a file name (also entered from
* the word states). An ASCII letter, digit or '_' is required (A_CLEAR
* into TPS_InFile); otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InFileNext[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InURIFirst: just after the '?' that follows a path. A double quote,
* a single quote, whitespace or end of input is A_POP; any other
* character enters the URI body (A_CLEAR into TPS_InURI).
*/
static TParserStateActionItem actionTPS_InURIFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
{p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
{p_isnotspace, 0, A_CLEAR, TPS_InURI, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL},
};
/*
* TPS_InURIStart: entered after a HOST token was completed on behalf of a
* URL (see the p_isstophost entries). Unconditionally moves on to the URI
* body.
*/
static TParserStateActionItem actionTPS_InURIStart[] = {
{NULL, 0, A_NEXT, TPS_InURI, 0, NULL}
};
/*
* TPS_InURI: inside the URI body. A double quote, a single quote,
* whitespace or end of input completes a URI token; every other character
* is consumed.
*/
static TParserStateActionItem actionTPS_InURI[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, URI, NULL},
{p_iseqC, '"', A_BINGO, TPS_Base, URI, NULL},
{p_iseqC, '\'', A_BINGO, TPS_Base, URI, NULL},
{p_isnotspace, 0, A_NEXT, TPS_InURI, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, URI, NULL}
};
/*
* TPS_InFURL: at the '/' that follows a host name. p_isURI runs a
* temporary parser to see whether a URI or file path follows; if so it
* advances over it and a URL_T token is completed. SpecialFURL then raises
* wanthost and moves the position back by the collected lexeme, presumably
* so that the host and URI parts are reported again as separate tokens —
* confirm in TParserGet. Otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InFURL[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InProtocolFirst: just after the ':' that follows an ASCII word. A
* '/' is required; otherwise A_POP.
*/
static TParserStateActionItem actionTPS_InProtocolFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InProtocolSecond: after ":/". A second '/' is required; otherwise
* A_POP.
*/
static TParserStateActionItem actionTPS_InProtocolSecond[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* TPS_InProtocolEnd: reached after "word://". Unconditionally completes a
* PROTOCOL token.
*/
static TParserStateActionItem actionTPS_InProtocolEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
};
static TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
 * Transition table for TPS_InHyphenWord.
 *
 * p_isalpha characters extend the token; p_isdigit switches to
 * TPS_InHyphenNumWord.  A '-' pushes TPS_InHyphenWordFirst.  On EOF or any
 * other character, SpecialHyphen is run, an HWORD token is emitted, all
 * pushed states are dropped and the parser enters TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHyphenWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
};
/*
 * Transition table for TPS_InHyphenNumWordFirst.
 *
 * This state is pushed by the '-' rows of actionTPS_InHyphenNumWord,
 * actionTPS_InHyphenValue and actionTPS_InHyphenValueExact.  A p_isdigit
 * character goes to TPS_InHyphenValue, a p_isalpha character to
 * TPS_InHyphenNumWord; EOF or anything else pops the pushed state.
 */
static TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
 * Transition table for TPS_InHyphenNumWord.
 *
 * p_isalnum characters extend the token; a '-' pushes
 * TPS_InHyphenNumWordFirst.  On EOF or any other character, SpecialHyphen
 * is run, a NUMHWORD token is emitted, all pushed states are dropped and the
 * parser enters TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHyphenNumWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
/*
 * Transition table for TPS_InHyphenValueFirst.
 *
 * This state is pushed by the '.' rows of actionTPS_InHyphenValue and
 * actionTPS_InHyphenValueExact.  A p_isdigit character continues in
 * TPS_InHyphenValueExact; EOF or anything else pops the pushed state.
 */
static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
 * Transition table for TPS_InHyphenValue.
 *
 * p_isdigit characters extend the token.  A '.' pushes
 * TPS_InHyphenValueFirst and a '-' pushes TPS_InHyphenNumWordFirst; a
 * p_isalpha character switches to TPS_InHyphenNumWord.  On EOF or any other
 * character, SpecialHyphen is run, a NUMHWORD token is emitted, all pushed
 * states are dropped and the parser enters TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHyphenValue[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
/*
 * Transition table for TPS_InHyphenValueExact.
 *
 * Same rows as actionTPS_InHyphenValue except that there is no p_isalpha
 * row: a letter here falls through to the default row and ends the token.
 * On EOF or the default row, SpecialHyphen is run, a NUMHWORD token is
 * emitted, all pushed states are dropped and the parser enters
 * TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
/*
 * Transition table for TPS_InParseHyphen.
 *
 * This is the state entered after a whole hyphenated token (ASCIIHWORD,
 * HWORD or NUMHWORD) has been emitted, and re-entered after each of its
 * parts.  It dispatches on the first character of the next part:
 * p_isasclet -> TPS_InHyphenAsciiWordPart, p_isalpha -> TPS_InHyphenWordPart,
 * p_isdigit -> TPS_InHyphenUnsignedInt, '-' pushes TPS_InParseHyphenHyphen.
 * EOF or any other character hands the same position back to TPS_Base
 * (A_RERUN: TParserGet() loops again without advancing).
 *
 * NOTE(review): presumably SpecialHyphen repositions the parser so that the
 * parts can be rescanned from here; it is defined outside this view.
 */
static TParserStateActionItem actionTPS_InParseHyphen[] = {
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
};
/*
 * Transition table for TPS_InParseHyphenHyphen.
 *
 * This state is pushed by the '-' row of actionTPS_InParseHyphen.  If the
 * character after the hyphen satisfies p_isalnum, what has been consumed so
 * far is emitted as a SPACE token, the saved state is discarded (A_CLEAR)
 * and the parser returns to TPS_InParseHyphen; the alphanumeric character
 * itself is not consumed, because TParserGet() leaves its loop on A_BINGO
 * before advancing.  EOF or any other character pops the pushed state.
 */
static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
 * Transition table for TPS_InHyphenWordPart.
 *
 * p_isalpha characters extend the part; a p_isdigit character switches to
 * TPS_InHyphenNumWordPart.  Otherwise a PARTHWORD token is emitted: at EOF
 * the parser goes to TPS_Base, on any other character it goes back to
 * TPS_InParseHyphen for the next part.
 */
static TParserStateActionItem actionTPS_InHyphenWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
};
/*
 * Transition table for TPS_InHyphenAsciiWordPart.
 *
 * p_isasclet characters extend the part; p_isalpha switches to
 * TPS_InHyphenWordPart and p_isdigit to TPS_InHyphenNumWordPart.  Otherwise
 * an ASCIIPARTHWORD token is emitted: at EOF the parser goes to TPS_Base, on
 * any other character it goes back to TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
};
/*
 * Transition table for TPS_InHyphenNumWordPart.
 *
 * p_isalnum characters extend the part.  Otherwise a NUMPARTHWORD token is
 * emitted: at EOF the parser goes to TPS_Base, on any other character it
 * goes back to TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
};
/*
 * Transition table for TPS_InHyphenUnsignedInt.
 *
 * p_isdigit characters extend the part; a p_isalpha character switches to
 * TPS_InHyphenNumWordPart; a '.' pushes TPS_InHDecimalPartFirst to try a
 * decimal.  Otherwise an UNSIGNEDINT token is emitted: at EOF the parser
 * goes to TPS_Base, on any other character back to TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
};
/*
 * Transition table for TPS_InHDecimalPartFirst.
 *
 * This state is pushed by the '.' row of actionTPS_InHyphenUnsignedInt.  A
 * p_isdigit character commits to the decimal: the saved state is discarded
 * (A_CLEAR) and parsing continues in TPS_InHDecimalPart.  EOF or anything
 * else pops back to the saved state.
 */
static TParserStateActionItem actionTPS_InHDecimalPartFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InHDecimalPart, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
 * Transition table for TPS_InHDecimalPart.
 *
 * p_isdigit characters extend the part; a further '.' pushes
 * TPS_InHVersionPartFirst.  Otherwise a DECIMAL token is emitted: at EOF the
 * parser goes to TPS_Base, on any other character back to TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHDecimalPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHDecimalPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, DECIMAL, NULL}
};
/*
 * Transition table for TPS_InHVersionPartFirst.
 *
 * This state is pushed by the '.' rows of actionTPS_InHDecimalPart and
 * actionTPS_InHVersionPart.  A p_isdigit character discards the saved state
 * (A_CLEAR) and continues in TPS_InHVersionPart; EOF or anything else pops
 * back to the saved state.
 */
static TParserStateActionItem actionTPS_InHVersionPartFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InHVersionPart, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
 * Transition table for TPS_InHVersionPart.
 *
 * p_isdigit characters extend the part; each further '.' pushes
 * TPS_InHVersionPartFirst again.  Otherwise a VERSIONNUMBER token is
 * emitted: at EOF the parser goes to TPS_Base, on any other character back
 * to TPS_InParseHyphen.
 */
static TParserStateActionItem actionTPS_InHVersionPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHVersionPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, VERSIONNUMBER, NULL}
};
/*
 * Master dispatch table: maps each parser state to its transition table.
 *
 * The order of the entries must be the same as in the TParserState enum.
 * TParserGet() indexes this array directly with the numeric state value and
 * asserts that Actions[state].state == state, so a new state has to be
 * inserted at the same position in both places.  The final TPS_Null entry
 * has no table; TParserGet() asserts that the current state is below
 * TPS_Null.
 */
static const TParserStateAction Actions[] = {
{TPS_Base, actionTPS_Base},
{TPS_InNumWord, actionTPS_InNumWord},
{TPS_InAsciiWord, actionTPS_InAsciiWord},
{TPS_InWord, actionTPS_InWord},
{TPS_InUnsignedInt, actionTPS_InUnsignedInt},
{TPS_InSignedIntFirst, actionTPS_InSignedIntFirst},
{TPS_InSignedInt, actionTPS_InSignedInt},
{TPS_InSpace, actionTPS_InSpace},
{TPS_InUDecimalFirst, actionTPS_InUDecimalFirst},
{TPS_InUDecimal, actionTPS_InUDecimal},
{TPS_InDecimalFirst, actionTPS_InDecimalFirst},
{TPS_InDecimal, actionTPS_InDecimal},
{TPS_InVerVersion, actionTPS_InVerVersion},
{TPS_InSVerVersion, actionTPS_InSVerVersion},
{TPS_InVersionFirst, actionTPS_InVersionFirst},
{TPS_InVersion, actionTPS_InVersion},
{TPS_InMantissaFirst, actionTPS_InMantissaFirst},
{TPS_InMantissaSign, actionTPS_InMantissaSign},
{TPS_InMantissa, actionTPS_InMantissa},
{TPS_InHTMLEntityFirst, actionTPS_InHTMLEntityFirst},
{TPS_InHTMLEntity, actionTPS_InHTMLEntity},
{TPS_InHTMLEntityNumFirst, actionTPS_InHTMLEntityNumFirst},
{TPS_InHTMLEntityNum, actionTPS_InHTMLEntityNum},
{TPS_InHTMLEntityEnd, actionTPS_InHTMLEntityEnd},
{TPS_InTagFirst, actionTPS_InTagFirst},
{TPS_InXMLBegin, actionTPS_InXMLBegin},
{TPS_InTagCloseFirst, actionTPS_InTagCloseFirst},
{TPS_InTagName, actionTPS_InTagName},
{TPS_InTagBeginEnd, actionTPS_InTagBeginEnd},
{TPS_InTag, actionTPS_InTag},
{TPS_InTagEscapeK, actionTPS_InTagEscapeK},
{TPS_InTagEscapeKK, actionTPS_InTagEscapeKK},
{TPS_InTagBackSleshed, actionTPS_InTagBackSleshed},
{TPS_InTagEnd, actionTPS_InTagEnd},
{TPS_InCommentFirst, actionTPS_InCommentFirst},
{TPS_InCommentLast, actionTPS_InCommentLast},
{TPS_InComment, actionTPS_InComment},
{TPS_InCloseCommentFirst, actionTPS_InCloseCommentFirst},
{TPS_InCloseCommentLast, actionTPS_InCloseCommentLast},
{TPS_InCommentEnd, actionTPS_InCommentEnd},
{TPS_InHostFirstDomain, actionTPS_InHostFirstDomain},
{TPS_InHostDomainSecond, actionTPS_InHostDomainSecond},
{TPS_InHostDomain, actionTPS_InHostDomain},
{TPS_InPortFirst, actionTPS_InPortFirst},
{TPS_InPort, actionTPS_InPort},
{TPS_InHostFirstAN, actionTPS_InHostFirstAN},
{TPS_InHost, actionTPS_InHost},
{TPS_InEmail, actionTPS_InEmail},
{TPS_InFileFirst, actionTPS_InFileFirst},
{TPS_InFileTwiddle, actionTPS_InFileTwiddle},
{TPS_InPathFirst, actionTPS_InPathFirst},
{TPS_InPathFirstFirst, actionTPS_InPathFirstFirst},
{TPS_InPathSecond, actionTPS_InPathSecond},
{TPS_InFile, actionTPS_InFile},
{TPS_InFileNext, actionTPS_InFileNext},
{TPS_InURIFirst, actionTPS_InURIFirst},
{TPS_InURIStart, actionTPS_InURIStart},
{TPS_InURI, actionTPS_InURI},
{TPS_InFURL, actionTPS_InFURL},
{TPS_InProtocolFirst, actionTPS_InProtocolFirst},
{TPS_InProtocolSecond, actionTPS_InProtocolSecond},
{TPS_InProtocolEnd, actionTPS_InProtocolEnd},
{TPS_InHyphenAsciiWordFirst, actionTPS_InHyphenAsciiWordFirst},
{TPS_InHyphenAsciiWord, actionTPS_InHyphenAsciiWord},
{TPS_InHyphenWordFirst, actionTPS_InHyphenWordFirst},
{TPS_InHyphenWord, actionTPS_InHyphenWord},
{TPS_InHyphenNumWordFirst, actionTPS_InHyphenNumWordFirst},
{TPS_InHyphenNumWord, actionTPS_InHyphenNumWord},
{TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst},
{TPS_InHyphenValue, actionTPS_InHyphenValue},
{TPS_InHyphenValueExact, actionTPS_InHyphenValueExact},
{TPS_InParseHyphen, actionTPS_InParseHyphen},
{TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen},
{TPS_InHyphenWordPart, actionTPS_InHyphenWordPart},
{TPS_InHyphenAsciiWordPart, actionTPS_InHyphenAsciiWordPart},
{TPS_InHyphenNumWordPart, actionTPS_InHyphenNumWordPart},
{TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt},
{TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst},
{TPS_InHDecimalPart, actionTPS_InHDecimalPart},
{TPS_InHVersionPartFirst, actionTPS_InHVersionPartFirst},
{TPS_InHVersionPart, actionTPS_InHVersionPart},
{TPS_Null, NULL}
};
/*
 * TParserGet
 * Run the state machine until the next lexeme has been recognized.
 *
 * Starting at prs->state->posbyte, the function repeatedly looks up the
 * transition table of the current state in Actions[], selects the first row
 * whose predicate accepts the current character (the row with a NULL
 * predicate ends the table and serves as the default), applies that row's
 * special handler, flags and target state, and then advances by one
 * character unless the row asked for a rerun or a pop.
 *
 * Returns true when a row flagged A_BINGO fired; prs->lexeme,
 * prs->lenbytelexeme, prs->lencharlexeme and prs->type then describe the
 * token.  Returns false if the input was already exhausted on entry, or if
 * the loop ended on a row without A_BINGO.
 */
static bool
TParserGet(TParser * prs)
{
TParserStateActionItem *item = NULL;
Assert(prs->state);
/* nothing left to scan */
if (prs->state->posbyte >= prs->lenstr)
return false;
/* the token, if any, starts at the current position */
prs->lexeme = prs->str + prs->state->posbyte;
prs->state->pushedAtAction = NULL;
/* look at string */
while (prs->state->posbyte <= prs->lenstr)
{
/*
 * Byte length of the current character: 0 at end of input, otherwise 1
 * when charmaxlen is 1 (pg_mblen is skipped) or whatever pg_mblen
 * reports.
 */
if (prs->state->posbyte == prs->lenstr)
prs->state->charlen = 0;
else
prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
pg_mblen(prs->str + prs->state->posbyte);
Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
Assert(Actions[prs->state->state].state == prs->state->state);
item = Actions[prs->state->state].action;
Assert(item != NULL);
/*
 * If this position was restored by a pop, pushedAtAction still points
 * at the A_PUSH row that was tried; resume the scan from that row
 * instead of from the top of the table.
 */
if (item < prs->state->pushedAtAction)
item = prs->state->pushedAtAction;
/* find action by character class */
while (item->isclass)
{
/* predicates read their comparison character from prs->c */
prs->c = item->c;
if (item->isclass(prs) != 0)
{
/*
 * Accept the row only if it lies past the remembered A_PUSH row:
 * that row has already been tried and led nowhere.
 */
if (item > prs->state->pushedAtAction)
break;
}
item++;
}
prs->state->pushedAtAction = NULL;
/* call special handler if exists */
if (item->special)
item->special(prs);
/* BINGO, lexeme is found */
if (item->flags & A_BINGO)
{
Assert(item->type > 0);
/* publish the accumulated lengths and reset them for the next token */
prs->lenbytelexeme = prs->state->lenbytelexeme;
prs->lencharlexeme = prs->state->lencharlexeme;
prs->state->lenbytelexeme = prs->state->lencharlexeme = 0;
prs->type = item->type;
}
/*
 * do various actions by flags; the stack operations below are mutually
 * exclusive and are checked in this fixed priority order
 */
if (item->flags & A_POP)
{ /* pop stored state in stack */
TParserPosition *ptr = prs->state->prev;
pfree(prs->state);
prs->state = ptr;
Assert(prs->state);
}
else if (item->flags & A_PUSH)
{ /* push (store) state in stack */
prs->state->pushedAtAction = item; /* remember where we push */
prs->state = newTParserPosition(prs->state);
}
else if (item->flags & A_CLEAR)
{ /* clear previous pushed state */
TParserPosition *ptr;
Assert(prs->state->prev);
ptr = prs->state->prev->prev;
pfree(prs->state->prev);
prs->state->prev = ptr;
}
else if (item->flags & A_CLRALL)
{ /* clear all previous pushed state */
TParserPosition *ptr;
while (prs->state->prev)
{
ptr = prs->state->prev->prev;
pfree(prs->state->prev);
prs->state->prev = ptr;
}
}
else if (item->flags & A_MERGE)
{ /* merge posinfo with current and pushed state */
TParserPosition *ptr = prs->state;
Assert(prs->state->prev);
/* keep the older stack entry but give it the newer position data */
prs->state = prs->state->prev;
prs->state->posbyte = ptr->posbyte;
prs->state->poschar = ptr->poschar;
prs->state->charlen = ptr->charlen;
prs->state->lenbytelexeme = ptr->lenbytelexeme;
prs->state->lencharlexeme = ptr->lencharlexeme;
pfree(ptr);
}
/* set new state if pointed */
if (item->tostate != TPS_Null)
prs->state->state = item->tostate;
/*
 * check for go away: stop on a finished token, or at end of input unless
 * the row explicitly asked to be rerun
 */
if ((item->flags & A_BINGO) ||
(prs->state->posbyte >= prs->lenstr &&
(item->flags & A_RERUN) == 0))
break;
/* go to beginning of loop if we should rerun or we just restore state */
if (item->flags & (A_RERUN | A_POP))
continue;
/* move forward by one character; charlen is 0 only at end of input */
if (prs->state->charlen)
{
prs->state->posbyte += prs->state->charlen;
prs->state->lenbytelexeme += prs->state->charlen;
prs->state->poschar++;
prs->state->lencharlexeme++;
}
}
return (item && (item->flags & A_BINGO)) ? true : false;
}
/*
 * prsd_lextype
 *		Describe the token types produced by the default parser.
 *
 * Returns a palloc'd array of LASTNUM + 1 LexDescr entries.  Entries
 * 0 .. LASTNUM - 1 describe token ids 1 .. LASTNUM, each with its own copy
 * of the alias and description strings.  The last entry only has lexid set,
 * to 0 (presumably the end-of-list marker for the caller).
 */
Datum
prsd_lextype(PG_FUNCTION_ARGS)
{
	LexDescr   *descr;
	LexDescr   *slot;
	int			tokid;

	descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));

	/* token ids start at 1; index 0 of the name tables is skipped */
	for (tokid = 1, slot = descr; tokid <= LASTNUM; tokid++, slot++)
	{
		slot->lexid = tokid;
		slot->alias = pstrdup(tok_alias[tokid]);
		slot->descr = pstrdup(lex_descr[tokid]);
	}

	/* slot now points at descr[LASTNUM] */
	slot->lexid = 0;

	PG_RETURN_POINTER(descr);
}
/*
 * prsd_start
 *		Create a parser for the given text.
 *
 * Argument 0 is a pointer to the text and argument 1 its int32 length; both
 * are handed to TParserInit(), whose result is returned as a pointer Datum.
 */
Datum
prsd_start(PG_FUNCTION_ARGS)
{
	char	   *text = (char *) PG_GETARG_POINTER(0);
	int			textlen = PG_GETARG_INT32(1);

	PG_RETURN_POINTER(TParserInit(text, textlen));
}
Datum
prsd_nexttoken(PG_FUNCTION_ARGS)
{
TParser *p = (TParser *) PG_GETARG_POINTER(0);
char **t = (char **) PG_GETARG_POINTER(1);
int *tlen = (int *) PG_GETARG_POINTER(2);
if (!TParserGet(p))
PG_RETURN_INT32(0);
*t = p->lexeme;
*tlen = p->lenbytelexeme;
PG_RETURN_INT32(p->type);
}
/*
 * prsd_end
 *		Shut down a parser: passes argument 0 (the TParser) to TParserClose().
 */
Datum
prsd_end(PG_FUNCTION_ARGS)
{
	TParserClose((TParser *) PG_GETARG_POINTER(0));

	PG_RETURN_VOID();
}
/*
 * Token-type classification macros; x is one of the output token category
 * codes defined at the top of the file.  Several of them expand x more than
 * once, so the argument must not have side effects.
 */
/* true only for SPACE tokens */
#define LEAVETOKEN(x) ( (x)==SPACE )
/* tokens for which the parser also emits the component parts: URLs and hyphenated words */
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* true only for SPACE tokens */
#define ENDPUNCTOKEN(x) ( (x)==SPACE )
/* tags, protocols, whitespace and HTML entities */
#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY )
/* tokens that prsd_headline marks with replace = 1 when HighlightAll is off */
#define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* same set without TAG_T; used by prsd_headline when HighlightAll is on */
#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* tokens that do not count as a word when prsd_headline measures headline length */
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
/* tokens that prsd_headline will not accept as the last token of a headline */
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
/*
 * Window of headline words handed to checkcondition_HL through its opaque
 * checkval argument: len consecutive entries starting at words.
 */
typedef struct
{
	HeadlineWordEntry *words;
	int			len;
} hlCheck;

/*
 * checkcondition_HL
 *		Operand-check callback: is the query operand present in the window?
 *
 * checkval points to an hlCheck.  Returns true if any word of that window
 * has its item pointer equal to val, false otherwise.
 */
static bool
checkcondition_HL(void *checkval, QueryOperand * val)
{
	hlCheck    *window = (hlCheck *) checkval;
	HeadlineWordEntry *word = window->words;
	int			remaining = window->len;

	while (remaining-- > 0)
	{
		if (word->item == val)
			return true;
		word++;
	}

	return false;
}
/*
 * hlCover
 *		Find the next "cover": a span of headline words satisfying the query.
 *
 * prs		parsed text; words[0 .. curwords - 1] are examined
 * query	the query whose value operands are matched against words[].item
 * p		in: word index at which the search starts;
 *			out: index of the first word of the cover
 * q		out: index of the last word of the cover
 *
 * For the current start position, the right edge is the largest among the
 * first occurrences of the query operands, and the left edge is the smallest
 * among their last occurrences at or before that right edge.  The candidate
 * span is accepted if TS_execute() finds the query satisfied by the words
 * inside it; otherwise the search restarts one word after the candidate's
 * left edge.
 *
 * Returns true with *p <= *q when a cover was found.  Returns false when no
 * (further) cover exists; *p and *q are then not meaningful.
 *
 * "Nothing found" is tracked with -1 rather than 0, so that a cover whose
 * right edge is the very first word (index 0) is still reported.  The retry
 * is a loop, not a recursive call, so stack use does not grow with the
 * number of rejected candidates.
 */
static bool
hlCover(HeadlineParsedText * prs, TSQuery query, int *p, int *q)
{
	int			pos = *p;

	for (;;)
	{
		QueryItem  *item;
		hlCheck		ch;
		int			i,
					j;

		*q = -1;
		*p = 0x7fffffff;

		/* right edge: furthest first occurrence of any operand from pos on */
		item = GETQUERY(query);
		for (j = 0; j < query->size; j++, item++)
		{
			if (item->type != QI_VAL)
				continue;

			for (i = pos; i < prs->curwords; i++)
			{
				if (prs->words[i].item == &item->operand)
				{
					if (i > *q)
						*q = i;
					break;
				}
			}
		}

		/* no operand occurs at or after pos */
		if (*q < 0)
			return false;

		/* left edge: nearest last occurrence of any operand within pos..q */
		item = GETQUERY(query);
		for (j = 0; j < query->size; j++, item++)
		{
			if (item->type != QI_VAL)
				continue;

			for (i = *q; i >= pos; i--)
			{
				if (prs->words[i].item == &item->operand)
				{
					if (i < *p)
						*p = i;
					break;
				}
			}
		}

		if (*p > *q)
			return false;

		/* does the span p..q really satisfy the whole query? */
		ch.words = &(prs->words[*p]);
		ch.len = *q - *p + 1;
		if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
			return true;

		/* rejected: try again starting just after the left edge */
		pos = *p + 1;
	}
}
/*
 * prsd_headline
 * Decide which words of a parsed document go into the headline.
 *
 * Arguments: 0 = the HeadlineParsedText to mark up, 1 = a List of DefElem
 * options, 2 = the TSQuery.
 *
 * Recognized options (names compared case-insensitively): MaxWords (default
 * 35), MinWords (default 15), ShortWord (default 3), StartSel (default
 * "<b>"), StopSel (default "</b>") and HighlightAll (off by default).  Any
 * other option name raises an ERRCODE_INVALID_PARAMETER_VALUE error.
 *
 * Without HighlightAll, the covers reported by hlCover() are examined, each
 * is stretched or trimmed towards the MinWords..MaxWords range, and the best
 * one is kept; if there is no cover at all, the first MinWords words are
 * used.  With HighlightAll the whole document is used.  The words of the
 * chosen range get their in, selected and replace flags set.
 *
 * Returns the same HeadlineParsedText pointer it was given.
 */
Datum
prsd_headline(PG_FUNCTION_ARGS)
{
HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
List *prsoptions = (List *) PG_GETARG_POINTER(1);
TSQuery query = PG_GETARG_TSQUERY(2);
/* option defaults; the StartSel/StopSel defaults are applied at the end */
int min_words = 15;
int max_words = 35;
int shortword = 3;
/* current cover: first and last word index, as set by hlCover */
int p = 0,
q = 0;
/* best headline found so far: first word, last word, distinct query words */
int bestb = -1,
beste = -1;
int bestlen = -1;
/* candidate headline: last word, first word, distinct query words, words */
int pose = 0,
posb,
poslen,
curlen;
int i;
int highlight = 0;
ListCell *l;
/* config */
prs->startsel = NULL;
prs->stopsel = NULL;
foreach(l, prsoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
char *val = defGetString(defel);
if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
max_words = pg_atoi(val, sizeof(int32), 0);
else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
min_words = pg_atoi(val, sizeof(int32), 0);
else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
shortword = pg_atoi(val, sizeof(int32), 0);
else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
prs->startsel = pstrdup(val);
else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
prs->stopsel = pstrdup(val);
else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
highlight = (pg_strcasecmp(val, "1") == 0 ||
pg_strcasecmp(val, "on") == 0 ||
pg_strcasecmp(val, "true") == 0 ||
pg_strcasecmp(val, "t") == 0 ||
pg_strcasecmp(val, "y") == 0 ||
pg_strcasecmp(val, "yes") == 0);
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized headline parameter: \"%s\"",
defel->defname)));
}
if (highlight == 0)
{
/* the length limits are only validated when they are actually used */
if (min_words >= max_words)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be less than MaxWords")));
if (min_words <= 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be positive")));
if (shortword < 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ShortWord should be >= 0")));
/* examine every cover; p is advanced past each one at the loop bottom */
while (hlCover(prs, query, &p, &q))
{
/* find cover len in words */
curlen = 0;
poslen = 0;
for (i = p; i <= q && curlen < max_words; i++)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
pose = i;
}
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
{
/* a better headline with an acceptable end was already found, so try the next cover */
p++;
continue;
}
posb = p;
if (curlen < max_words)
{ /* find good end */
/*
 * Extend to the right, starting again at the last word already
 * counted (i - 1), until an acceptable end word is reached with at
 * least min_words words, or max_words / the end of text is hit.
 */
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
{
/* word q was already counted by the loop above */
if (i != q)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
}
pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
if (curlen >= min_words)
break;
}
if (curlen < min_words && i >= prs->curwords)
{ /* reached end of text and the cover is still shorter than min_words */
/*
 * Extend to the left instead.
 *
 * NOTE(review): this starts at i = p, although word p was already
 * counted by the first loop, so it looks like it is counted twice
 * in curlen/poslen -- confirm whether that is intended.
 */
for (i = p; i >= 0; i--)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
if (curlen >= min_words)
break;
}
posb = (i >= 0) ? i : 0;
}
}
else
{ /* the cover already has max_words words: walk back to a good end */
for (; curlen > min_words; i--)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen--;
if (prs->words[i].item && !prs->words[i].repeated)
poslen--;
pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
break;
}
}
/*
 * Keep this candidate if it is the first one, or if it has more distinct
 * query words than the best and ends on an acceptable word, or if it
 * ends on an acceptable word while the current best does not.
 */
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
{
bestb = posb;
beste = pose;
bestlen = poslen;
}
p++;
}
if (bestlen < 0)
{
/* no cover at all: fall back to the first min_words words of the text */
curlen = 0;
for (i = 0; i < prs->curwords && curlen < min_words; i++)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
pose = i;
}
/*
 * NOTE(review): if prs->curwords is 0, pose is still 0 here and the
 * marking loop below touches words[0] -- confirm that callers never
 * pass an empty word list.
 */
bestb = 0;
beste = pose;
}
}
else
{
/* HighlightAll: the headline is the whole document */
bestb = 0;
beste = prs->curwords - 1;
}
/* mark the chosen range */
for (i = bestb; i <= beste; i++)
{
/* words that carry a query item are the ones to be highlighted */
if (prs->words[i].item)
prs->words[i].selected = 1;
if (highlight == 0)
{
if (HLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1;
}
else
{
/* with HighlightAll, TAG_T tokens are not marked for replacement */
if (HTMLHLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1;
}
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}
/* default highlight markers when the options did not supply them */
if (!prs->startsel)
prs->startsel = pstrdup("<b>");
if (!prs->stopsel)
prs->stopsel = pstrdup("</b>");
prs->startsellen = strlen(prs->startsel);
prs->stopsellen = strlen(prs->stopsel);
PG_RETURN_POINTER(prs);
}