Change text search parsing rules for hyphenated words so that digit strings
containing decimal points aren't considered part of a hyphenated word.
For example, "readline-4.2" is now parsed as the word "readline" followed by
the number "-4.2", rather than as a single hyphenated word.
Sync the hyphenated-word lookahead states with the subsequent part-by-part
reparsing states, so that the two passes cannot disagree about how much text
is part of the hyphenated word. Per my gripe of a few days ago.
This commit is contained in:
parent
1aaf39bd20
commit
73e6f9d3b6
@ -7,7 +7,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.6 2007/10/27 17:53:15 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -181,19 +181,13 @@ typedef enum
|
||||
TPS_InHyphenWord,
|
||||
TPS_InHyphenNumWordFirst,
|
||||
TPS_InHyphenNumWord,
|
||||
TPS_InHyphenValueFirst,
|
||||
TPS_InHyphenValue,
|
||||
TPS_InHyphenValueExact,
|
||||
TPS_InHyphenDigitLookahead,
|
||||
TPS_InParseHyphen,
|
||||
TPS_InParseHyphenHyphen,
|
||||
TPS_InHyphenWordPart,
|
||||
TPS_InHyphenAsciiWordPart,
|
||||
TPS_InHyphenNumWordPart,
|
||||
TPS_InHyphenUnsignedInt,
|
||||
TPS_InHDecimalPartFirst,
|
||||
TPS_InHDecimalPart,
|
||||
TPS_InHVersionPartFirst,
|
||||
TPS_InHVersionPart,
|
||||
TPS_Null /* last state (fake value) */
|
||||
} TParserState;
|
||||
|
||||
@ -1147,8 +1141,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
@ -1164,8 +1157,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
|
||||
static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
@ -1179,8 +1171,8 @@ static const TParserStateActionItem actionTPS_InHyphenWord[] = {
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
@ -1191,34 +1183,18 @@ static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
|
||||
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHyphenValue[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHyphenValueExact[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InParseHyphen[] = {
|
||||
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
|
||||
{p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
|
||||
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
|
||||
};
|
||||
@ -1251,39 +1227,12 @@ static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHDecimalPartFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_CLEAR, TPS_InHDecimalPart, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHDecimalPart[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHDecimalPart, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_InParseHyphen, DECIMAL, NULL}
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHVersionPartFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_CLEAR, TPS_InHVersionPart, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static const TParserStateActionItem actionTPS_InHVersionPart[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHVersionPart, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_InParseHyphen, VERSIONNUMBER, NULL}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* main table of per-state parser actions
|
||||
@ -1378,19 +1327,13 @@ static const TParserStateAction Actions[] = {
|
||||
TPARSERSTATEACTION(TPS_InHyphenWord),
|
||||
TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
|
||||
TPARSERSTATEACTION(TPS_InHyphenNumWord),
|
||||
TPARSERSTATEACTION(TPS_InHyphenValueFirst),
|
||||
TPARSERSTATEACTION(TPS_InHyphenValue),
|
||||
TPARSERSTATEACTION(TPS_InHyphenValueExact),
|
||||
TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
|
||||
TPARSERSTATEACTION(TPS_InParseHyphen),
|
||||
TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
|
||||
TPARSERSTATEACTION(TPS_InHyphenWordPart),
|
||||
TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
|
||||
TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
|
||||
TPARSERSTATEACTION(TPS_InHyphenUnsignedInt),
|
||||
TPARSERSTATEACTION(TPS_InHDecimalPartFirst),
|
||||
TPARSERSTATEACTION(TPS_InHDecimalPart),
|
||||
TPARSERSTATEACTION(TPS_InHVersionPartFirst),
|
||||
TPARSERSTATEACTION(TPS_InHVersionPart)
|
||||
TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
|
||||
};
|
||||
|
||||
|
||||
|
@ -352,15 +352,11 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
|
||||
12 | .
|
||||
20 | 4.2
|
||||
12 | ,
|
||||
15 | readline-4.2
|
||||
11 | readline
|
||||
12 | -
|
||||
20 | 4.2
|
||||
1 | readline
|
||||
20 | -4.2
|
||||
12 |
|
||||
15 | readline-4.2
|
||||
11 | readline
|
||||
12 | -
|
||||
20 | 4.2
|
||||
1 | readline
|
||||
20 | -4.2
|
||||
12 | .
|
||||
22 | 234
|
||||
12 |
|
||||
@ -377,14 +373,14 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
|
||||
12 |
|
||||
12 | <>
|
||||
1 | qwerty
|
||||
(135 rows)
|
||||
(131 rows)
|
||||
|
||||
SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
|
||||
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
|
||||
<i <b> wow < jqw <> qwerty');
|
||||
to_tsvector
|
||||
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
|
||||
to_tsvector
|
||||
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
'ad':17 'dw':19 'jf':39 '234':61 '345':1 '4.2':54,55,56 '455':31 'jqw':64 'qwe':2,18,27,28,35 'wer':36 'wow':63 '-4.2':58,60 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':65 '234.435':30 'qwe-wer':34 'readlin':53,57,59 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
|
||||
(1 row)
|
||||
|
||||
SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
|
||||
|
Loading…
x
Reference in New Issue
Block a user