diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 0b60c61d48..11e246fa35 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -3923,11 +3923,18 @@ SELECT to_tsvector('english', 'The Fat Rats'); A tsquery value stores lexemes that are to be - searched for, and combines them honoring the Boolean operators - & (AND), | (OR), - ! (NOT) and <-> (FOLLOWED BY) phrase search - operator. Parentheses can be used to enforce grouping - of the operators: + searched for, and can combine them using the Boolean operators + & (AND), | (OR), and + ! (NOT), as well as the phrase search operator + <-> (FOLLOWED BY). There is also a variant + <N> of the FOLLOWED BY + operator, where N is an integer constant that + specifies a maximum distance between the two lexemes being searched + for. <-> is equivalent to <1>. + + + + Parentheses can be used to enforce grouping of the operators: SELECT 'fat & rat'::tsquery; diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index ff7545de15..54eb8e56f5 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -9081,10 +9081,11 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple Text Search Operators - + Operator + Return Type Description Example Result @@ -9093,54 +9094,63 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple @@ + boolean tsvector matches tsquery ? to_tsvector('fat cats ate rats') @@ to_tsquery('cat & rat') t @@@ + boolean deprecated synonym for @@ to_tsvector('fat cats ate rats') @@@ to_tsquery('cat & rat') t || + tsvector concatenate tsvectors 'a:1 b:2'::tsvector || 'c:1 d:2 b:3'::tsvector 'a':1 'b':2,5 'c':3 'd':4 && + tsquery AND tsquerys together 'fat | rat'::tsquery && 'cat'::tsquery ( 'fat' | 'rat' ) & 'cat' || + tsquery OR tsquerys together 'fat | rat'::tsquery || 'cat'::tsquery ( 'fat' | 'rat' ) | 'cat' !! + tsquery negate a tsquery !! 
'cat'::tsquery !'cat' <-> + tsquery tsquery followed by tsquery to_tsquery('fat') <-> to_tsquery('rat') 'fat' <-> 'rat' @> + boolean tsquery contains another ? 'cat'::tsquery @> 'cat & rat'::tsquery f <@ + boolean tsquery is contained in ? 'cat'::tsquery <@ 'cat & rat'::tsquery t @@ -9245,7 +9255,8 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple phraseto_tsquery( config regconfig , query text) tsquery - produce tsquery ignoring punctuation + produce tsquery that searches for a phrase, + ignoring punctuation phraseto_tsquery('english', 'The Fat Rats') 'fat' <-> 'rat' @@ -9400,7 +9411,8 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple ts_rewrite(query tsquery, target tsquery, substitute tsquery)tsquery - replace target with substitute within query + replace target with substitute + within queryts_rewrite('a & b'::tsquery, 'a'::tsquery, 'foo|bar'::tsquery)'b' & ( 'foo' | 'bar' ) @@ -9419,7 +9431,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple tsquery_phrase(query1 tsquery, query2 tsquery)tsquery - implementation of <-> (FOLLOWED BY) operator + make query that searches for query1 followed + by query2 (same as <-> + operator)tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'))'fat' <-> 'cat' @@ -9428,7 +9442,8 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple tsquery_phrase(query1 tsquery, query2 tsquery, distance integer)tsquery - phrase-concatenate with distance + make query that searches for query1 followed by + query2 at maximum distance distancetsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10)'fat' <10> 'cat' diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index bee1fbf174..9028bedd1b 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -263,12 +263,12 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t As the above example suggests, a 
tsquery is not just raw text, any more than a tsvector is. A tsquery contains search terms, which must be already-normalized lexemes, and - may combine multiple terms using AND, OR, NOT and FOLLOWED BY operators. - (For details see .) There are - functions to_tsquery, plainto_tsquery + may combine multiple terms using AND, OR, NOT, and FOLLOWED BY operators. + (For details see .) There are + functions to_tsquery, plainto_tsquery, and phraseto_tsquery that are helpful in converting user-written text into a proper - tsquery, for example by normalizing words appearing in + tsquery, primarily by normalizing words appearing in the text. Similarly, to_tsvector is used to parse and normalize a document string. So in practice a text search match would look more like this: @@ -294,35 +294,6 @@ SELECT 'fat cats ate fat rats'::tsvector @@ to_tsquery('fat & rat'); already normalized, so rats does not match rat. - - Phrase search is made possible with the help of the <-> - (FOLLOWED BY) operator, which enforces lexeme order. This allows you - to discard strings not containing the desired phrase, for example: - - -SELECT q @@ to_tsquery('fatal <-> error') -FROM unnest(array[to_tsvector('fatal error'), - to_tsvector('error is not fatal')]) AS q; - ?column? ----------- - t - f - - - A more generic version of the FOLLOWED BY operator takes form of - <N>, where N stands for the greatest allowed distance - between the specified lexemes. The phraseto_tsquery - function makes use of this behavior in order to construct a - tsquery capable of matching the provided phrase: - - -SELECT phraseto_tsquery('cat ate some rats'); - phraseto_tsquery -------------------------------- - ( 'cat' <-> 'ate' ) <2> 'rat' - - - The @@ operator also supports text input, allowing explicit conversion of a text @@ -344,6 +315,57 @@ text @@ text The form text @@ text is equivalent to to_tsvector(x) @@ plainto_tsquery(y). 
+ + + Within a tsquery, the & (AND) operator + specifies that both its arguments must appear in the document to have a + match. Similarly, the | (OR) operator specifies that + at least one of its arguments must appear, while the ! (NOT) + operator specifies that its argument must not appear in + order to have a match. Parentheses can be used to control nesting of + these operators. + + + + Searching for phrases is possible with the help of + the <-> (FOLLOWED BY) tsquery operator, which + matches only if its arguments have matches that are adjacent and in the + given order. For example: + + +SELECT to_tsvector('fatal error') @@ to_tsquery('fatal <-> error'); + ?column? +---------- + t + +SELECT to_tsvector('error is not fatal') @@ to_tsquery('fatal <-> error'); + ?column? +---------- + f + + + There is a more general version of the FOLLOWED BY operator having the + form <N>, + where N is an integer standing for the greatest distance + allowed between the matching lexemes. <1> is + the same as <->, while <2> + allows one other lexeme to optionally appear between the matches, and so + on. The phraseto_tsquery function makes use of this + operator to construct a tsquery that can match a multi-word + phrase when some of the words are stop words. For example: + + +SELECT phraseto_tsquery('cats ate rats'); + phraseto_tsquery +------------------------------- + ( 'cat' <-> 'ate' ) <-> 'rat' + +SELECT phraseto_tsquery('the cats ate the rats'); + phraseto_tsquery +------------------------------- + ( 'cat' <-> 'ate' ) <2> 'rat' + + @@ -740,12 +762,12 @@ UPDATE tt SET ti = PostgreSQL provides the functions to_tsquery, - plainto_tsquery and + plainto_tsquery, and phraseto_tsquery for converting a query to the tsquery data type. to_tsquery offers access to more features - than both plainto_tsquery and - phraseto_tsquery, but is less forgiving + than either plainto_tsquery or + phraseto_tsquery, but it is less forgiving about its input. 
@@ -760,15 +782,15 @@ to_tsquery( config to_tsquery creates a tsquery value from querytext, which must consist of single tokens - separated by the Boolean operators & (AND), - | (OR), ! (NOT), and also the - <-> (FOLLOWED BY) phrase search operator. These operators - can be grouped using parentheses. In other words, the input to + separated by the tsquery operators & (AND), + | (OR), ! (NOT), and + <-> (FOLLOWED BY), possibly grouped + using parentheses. In other words, the input to to_tsquery must already follow the general rules for tsquery input, as described in . The difference is that while basic + linkend="datatype-tsquery">. The difference is that while basic tsquery input takes the tokens at face value, - to_tsquery normalizes each token to a lexeme using + to_tsquery normalizes each token into a lexeme using the specified or default configuration, and discards any tokens that are stop words according to the configuration. For example: @@ -818,7 +840,8 @@ SELECT to_tsquery('''supernovae stars'' & !crab'); Without quotes, to_tsquery will generate a syntax - error for tokens that are not separated by an AND or OR operator. + error for tokens that are not separated by an AND, OR, or FOLLOWED BY + operator. @@ -830,11 +853,11 @@ plainto_tsquery( config < - plainto_tsquery transforms unformatted text - querytext to tsquery. + plainto_tsquery transforms the unformatted text + querytext to a tsquery value. The text is parsed and normalized much as for to_tsvector, - then the & (AND) Boolean operator is inserted - between surviving words. + then the & (AND) tsquery operator is + inserted between surviving words. 
@@ -847,8 +870,8 @@ SELECT plainto_tsquery('english', 'The Fat Rats'); 'fat' & 'rat' - Note that plainto_tsquery cannot - recognize Boolean and phrase search operators, weight labels, + Note that plainto_tsquery will not + recognize tsquery operators, weight labels, or prefix-match labels in its input: @@ -871,11 +894,14 @@ phraseto_tsquery( config phraseto_tsquery behaves much like - plainto_tsquery, with the exception - that it utilizes the <-> (FOLLOWED BY) phrase search - operator instead of the & (AND) Boolean operator. - This is particularly useful when searching for exact lexeme sequences, - since the phrase search operator helps to maintain lexeme order. + plainto_tsquery, except that it inserts + the <-> (FOLLOWED BY) operator between + surviving words instead of the & (AND) operator. + Also, stop words are not simply discarded, but are accounted for by + inserting <N> operators rather + than <-> operators. This function is useful + when searching for exact lexeme sequences, since the FOLLOWED BY + operators check lexeme order, not just the presence of all the lexemes. 
@@ -888,9 +914,9 @@ SELECT phraseto_tsquery('english', 'The Fat Rats'); 'fat' <-> 'rat' - Just like the plainto_tsquery, the - phraseto_tsquery function cannot - recognize Boolean and phrase search operators, weight labels, + Like plainto_tsquery, the + phraseto_tsquery function will not + recognize tsquery operators, weight labels, or prefix-match labels in its input: @@ -899,17 +925,6 @@ SELECT phraseto_tsquery('english', 'The Fat & Rats:C'); ----------------------------- ( 'fat' <-> 'rat' ) <-> 'c' - - It is possible to specify the configuration to be used to parse the document, - for example, we could create a new one using the hunspell dictionary - (namely 'eng_hunspell') in order to match phrases with different word forms: - - -SELECT phraseto_tsquery('eng_hunspell', 'developer of the building which collapsed'); - phraseto_tsquery --------------------------------------------------------------------------------------------- - ( 'developer' <3> 'building' ) <2> 'collapse' | ( 'developer' <3> 'build' ) <2> 'collapse' - @@ -1400,10 +1415,13 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank - Returns a vector which lists the same lexemes as the given vector, but - which lacks any position or weight information. While the returned - vector is much less useful than an unstripped vector for relevance - ranking, it will usually be much smaller. + Returns a vector that lists the same lexemes as the given vector, but + lacks any position or weight information. The result is usually much + smaller than an unstripped vector, but it is also less useful. + Relevance ranking does not work as well on stripped vectors as + on unstripped ones. Also, when given stripped input, + the <-> (FOLLOWED BY) tsquery operator + effectively degenerates to a simple & (AND) test. @@ -1481,7 +1499,10 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank - Returns the phrase-concatenation of the two given queries. 
+ Returns a query that searches for a match to the first given query + immediately followed by a match to the second given query, using + the <-> (FOLLOWED BY) + tsquery operator. For example: SELECT to_tsquery('fat') <-> to_tsquery('cat | rat'); @@ -1506,8 +1527,11 @@ SELECT to_tsquery('fat') <-> to_tsquery('cat | rat'); - Returns the distanced phrase-concatenation of the two given queries. - This function lies in the implementation of the <-> operator. + Returns a query that searches for a match to the first given query + followed by a match to the second given query at a distance of at + most distance lexemes, using + the <N> + tsquery operator. For example: SELECT tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10); @@ -3785,6 +3809,11 @@ Parser: "pg_catalog.default" Position values in tsvector must be greater than 0 and no more than 16,383 + + The match distance in a <N> + (FOLLOWED BY) tsquery operator cannot be more than + 16,384 + No more than 256 positions per lexeme