/*-------------------------------------------------------------------------
 *
 * wparser_def.c
 *		Default text search parser
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.4 2007/10/23 20:46:12 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "commands/defrem.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"


/*
 * Output token categories.
 *
 * The numeric value of each category is also its index into tok_alias[]
 * and lex_descr[] below, so the three lists must be kept in the same order.
 */
#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URI				18
#define FILEPATH		19
#define DECIMAL			20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define HTMLENTITY		23

#define LASTNUM			23

/* Short alias of each token category; entry 0 is an unused placeholder */
static const char *const tok_alias[] = {
	"",
	"asciiword",
	"word",
	"numword",
	"email",
	"url",
	"host",
	"sfloat",
	"version",
	"hword_numpart",
	"hword_part",
	"hword_asciipart",
	"blank",
	"tag",
	"protocol",
	"numhword",
	"asciihword",
	"hword",
	"uri",
	"file",
	"float",
	"int",
	"uint",
	"entity"
};

/* Human-readable description of each token category; entry 0 is unused */
static const char *const lex_descr[] = {
	"",
	"Word, all ASCII",
	"Word, all letters",
	"Word, letters and digits",
	"Email address",
	"URL",
	"Host",
	"Scientific notation",
	"Version number",
	"Hyphenated word part, letters and digits",
	"Hyphenated word part, all letters",
	"Hyphenated word part, all ASCII",
	"Space symbols",
	"HTML tag",
	"Protocol head",
	"Hyphenated word, letters and digits",
	"Hyphenated word, all ASCII",
	"Hyphenated word, all letters",
	"URI",
	"File or path name",
	"Decimal notation",
	"Signed integer",
	"Unsigned integer",
	"HTML entity"
};


/* Parser states */

typedef enum
{
	TPS_Base = 0,
	TPS_InNumWord,
	TPS_InAsciiWord,
	TPS_InWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,
	TPS_InHTMLEntityFirst,
	TPS_InHTMLEntity,
	TPS_InHTMLEntityNumFirst,
	TPS_InHTMLEntityNum,
	TPS_InHTMLEntityEnd,
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,
	TPS_InURIFirst,
	TPS_InURIStart,
	TPS_InURI,
	TPS_InFURL,
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,
	TPS_InHyphenAsciiWordFirst,
	TPS_InHyphenAsciiWord,
	TPS_InHyphenWordFirst,
	TPS_InHyphenWord,
	TPS_InHyphenNumWordFirst,
	TPS_InHyphenNumWord,
	TPS_InHyphenValueFirst,
	TPS_InHyphenValue,
	TPS_InHyphenValueExact,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenWordPart,
	TPS_InHyphenAsciiWordPart,
	TPS_InHyphenNumWordPart,
	TPS_InHyphenUnsignedInt,
	TPS_InHDecimalPartFirst,
	TPS_InHDecimalPart,
	TPS_InHVersionPartFirst,
	TPS_InHVersionPart,
	TPS_Null					/* last state (fake value) */
} TParserState;

/* forward declaration */
struct TParser;

/* any p_is* function except p_iseq */
typedef int (*TParserCharTest) (struct TParser *);

/* special handler for special cases... */
typedef void (*TParserSpecial) (struct TParser *);

typedef struct
{
	TParserCharTest isclass;
	char		c;
	uint16		flags;
	TParserState tostate;
	int			type;
	TParserSpecial special;
} TParserStateActionItem;

/* Flag bits in TParserStateActionItem.flags */
#define A_NEXT		0x0000
#define A_BINGO		0x0001
#define A_POP		0x0002
#define A_PUSH		0x0004
#define A_RERUN		0x0008
#define A_CLEAR		0x0010
#define A_MERGE		0x0020
#define A_CLRALL	0x0040

typedef struct
{
	TParserState state;
	TParserStateActionItem *action;
} TParserStateAction;

/*
 * One saved parser position.  Positions are chained through "prev", and
 * TParserClose() releases the whole chain.
 */
typedef struct TParserPosition
{
	int			posbyte;		/* position of parser in bytes */
	int			poschar;		/* position of parser in characters */
	int			charlen;		/* length of current char */
	int			lenbytelexeme;
	int			lencharlexeme;
	TParserState state;
	struct TParserPosition *prev;
	int			flags;
	TParserStateActionItem *pushedAtAction;
} TParserPosition;

typedef struct TParser
{
	/* string and position information */
	char	   *str;			/* multibyte string */
	int			lenstr;			/* length of mbstring */
#ifdef TS_USE_WIDE
	wchar_t    *wstr;			/* wide character string */
	int			lenwstr;		/* length of wstring */
#endif

	/* State of parse */
	int			charmaxlen;
	bool		usewide;
	TParserPosition *state;
	bool		ignore;
	bool		wanthost;

	/* silly char */
	char		c;

	/* out */
	char	   *lexeme;
	int			lenbytelexeme;
	int			lencharlexeme;
	int			type;
} TParser;


/* forward decls here */
static bool TParserGet(TParser * prs);


/*
 * Allocate a new position record.
 *
 * When "prev" is given the new record starts out as a copy of it, otherwise
 * it is zero-filled.  In both cases "prev" becomes the predecessor link of
 * the new record and pushedAtAction is reset to NULL.
 */
static TParserPosition *
newTParserPosition(TParserPosition * prev)
{
	TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));

	if (prev)
		memcpy(res, prev, sizeof(TParserPosition));
	else
		memset(res, 0, sizeof(TParserPosition));

	res->prev = prev;

	res->pushedAtAction = NULL;

	return res;
}

/*
 * Create a parser over "str" ("len" bytes).
 *
 * The input string is referenced, not copied.  When built with TS_USE_WIDE
 * and the database encoding can use more than one byte per character, a
 * wide-character copy of the input is built as well and usewide is set.
 * The parser starts with a single position record in state TPS_Base.
 */
static TParser *
TParserInit(char *str, int len)
{
	TParser    *prs = (TParser *) palloc0(sizeof(TParser));

	prs->charmaxlen = pg_database_encoding_max_length();
	prs->str = str;
	prs->lenstr = len;

#ifdef TS_USE_WIDE

	/*
	 * Use wide char code only when max encoding length > 1.
	 */
	if (prs->charmaxlen > 1)
	{
		prs->usewide = true;
		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
	}
	else
#endif
		prs->usewide = false;

	prs->state = newTParserPosition(NULL);
	prs->state->state = TPS_Base;

	return prs;
}

/*
 * Release a parser: every position record in the chain, the wide-character
 * copy of the input (if one was made) and the parser itself.  The input
 * string passed to TParserInit() is not freed here.
 */
static void
TParserClose(TParser * prs)
{
	while (prs->state)
	{
		TParserPosition *ptr = prs->state->prev;

		pfree(prs->state);
		prs->state = ptr;
	}

#ifdef TS_USE_WIDE
	if (prs->wstr)
		pfree(prs->wstr);
#endif

	pfree(prs);
}

/*
 * Character-type support functions, equivalent to the is* macros, but
 * working with any possible encoding and locale.  Note that with a
 * multibyte encoding and the C locale the isw* functions may fail or give
 * a wrong result.  Note 2: multibyte encoding and C locale are often used
 * for Asian languages.
 */

#ifdef TS_USE_WIDE

/*
 * p_iswhat(type) defines p_is<type>() and p_isnot<type>() for the current
 * character.
 *
 * In the wide-character case under the C locale only 7-bit characters are
 * handed to the narrow is<type>() function.  A non-ASCII character is
 * reported as not belonging to the class; classifying its low byte instead
 * would, for example, make U+0430 look like the digit '0'.
 *
 * Outside the C locale the wide character is passed to isw<type>() by
 * value (converted to wint_t), never read through a wint_t pointer.
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	if ( prs->usewide ) \
	{ \
		if ( lc_ctype_is_c() ) \
		{ \
			unsigned int c = *( prs->wstr + prs->state->poschar ); \
			\
			if ( c > 0x7f ) \
				return 0; \
			return is##type( c ); \
		} \
		\
		return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) ); \
	} \
	\
	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
} \
\
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}

/* Is the current character alphanumeric? */
static int
p_isalnum(TParser * prs)
{
	Assert(prs->state);

	if (prs->usewide)
	{
		if (lc_ctype_is_c())
		{
			unsigned int c = *(prs->wstr + prs->state->poschar);

			/*
			 * any non-ascii symbol with multibyte encoding with C-locale is
			 * an alpha character
			 */
			if (c > 0x7f)
				return 1;

			return isalnum(0xff & c);
		}

		return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
	}

	return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
}

static int
p_isnotalnum(TParser * prs)
{
	return !p_isalnum(prs);
}

/* Is the current character alphabetic? */
static int
p_isalpha(TParser * prs)
{
	Assert(prs->state);

	if (prs->usewide)
	{
		if (lc_ctype_is_c())
		{
			unsigned int c = *(prs->wstr + prs->state->poschar);

			/*
			 * any non-ascii symbol with multibyte encoding with C-locale is
			 * an alpha character
			 */
			if (c > 0x7f)
				return 1;

			return isalpha(0xff & c);
		}

		return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
	}

	return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
}

static int
p_isnotalpha(TParser * prs)
{
	return !p_isalpha(prs);
}

/* p_iseq should be used only for ascii symbols */

static int
p_iseq(TParser * prs, char c)
{
	Assert(prs->state);
	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
}

#else							/* TS_USE_WIDE */

/* Single-byte build: classify the byte at the current position */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
} \
\
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}


static int
p_iseq(TParser * prs, char c)
{
	Assert(prs->state);
	return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}

p_iswhat(alnum)
p_iswhat(alpha)

#endif   /* TS_USE_WIDE */

p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
p_iswhat(punct)
p_iswhat(space)
p_iswhat(upper)
p_iswhat(xdigit)

/* True at end of input: all bytes consumed, or current char has zero length */
static int
p_isEOF(TParser * prs)
{
	Assert(prs->state);
	return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
}

/* Does the current character equal the parser's "silly char" prs->c? */
static int
p_iseqC(TParser * prs)
{
	return p_iseq(prs, prs->c);
}

static int
p_isneC(TParser * prs)
{
	return !p_iseq(prs, prs->c);
}

/* Is the current character a single-byte ASCII character? */
static int
p_isascii(TParser * prs)
{
	Assert(prs->state);
	return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
}

/* Is the current character an ASCII letter? */
static int
p_isasclet(TParser * prs)
{
	return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
}


/*
 * deliberately suppress unused-function complaints for the above
 *
 * This function exists only to reference the helpers; it must never be
 * called, since every helper dereferences its argument.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	p_isalnum(NULL);
	p_isnotalnum(NULL);
	p_isalpha(NULL);
	p_isnotalpha(NULL);
	p_isdigit(NULL);
	p_isnotdigit(NULL);
	p_islower(NULL);
	p_isnotlower(NULL);
	p_isprint(NULL);
	p_isnotprint(NULL);
	p_ispunct(NULL);
	p_isnotpunct(NULL);
	p_isspace(NULL);
	p_isnotspace(NULL);
	p_isupper(NULL);
	p_isnotupper(NULL);
	p_isxdigit(NULL);
	p_isnotxdigit(NULL);
	p_isEOF(NULL);
	p_iseqC(NULL);
	p_isneC(NULL);
}


static void
SpecialTags(TParser * prs)
{
	switch (prs->state->lencharlexeme)
	{
		case 8: /* lexeme, "ignore = false; break; case 7: /*