Move full text search operators, functions, and data type sections into
the main documentation, out of its own text search chapter.
This commit is contained in:
parent
8bc225e799
commit
bb8f629c7a
@ -1,4 +1,4 @@
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/datatype.sgml,v 1.207 2007/08/21 01:11:11 tgl Exp $ -->
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/datatype.sgml,v 1.208 2007/08/29 20:37:14 momjian Exp $ -->
|
||||
|
||||
<chapter id="datatype">
|
||||
<title id="datatype-title">Data Types</title>
|
||||
@ -234,6 +234,18 @@
|
||||
<entry>date and time, including time zone</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><type>tsquery</type></entry>
|
||||
<entry></entry>
|
||||
<entry>full text search query</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><type>tsvector</type></entry>
|
||||
<entry></entry>
|
||||
<entry>full text search document</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><type>uuid</type></entry>
|
||||
<entry></entry>
|
||||
@ -3264,6 +3276,137 @@ a0eebc999c0b4ef8bb6d6bb9bd380a11
|
||||
</para>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="datatype-textsearch">
|
||||
<title>Full Text Search</title>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<indexterm zone="datatype-textsearch">
|
||||
<primary>tsvector</primary>
|
||||
</indexterm>
|
||||
|
||||
<varlistentry>
|
||||
<term><firstterm>tsvector</firstterm></term>
|
||||
<listitem>
|
||||
|
||||
<para>
|
||||
<type>tsvector</type> is a data type that represents a document and is
|
||||
optimized for full text searching. In the simplest case,
|
||||
<type>tsvector</type> is a sorted list of lexemes, so even without indexes
|
||||
full text searches perform better than standard <literal>~</literal> and
|
||||
<literal>LIKE</literal> operations:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector;
|
||||
tsvector
|
||||
----------------------------------------------------
|
||||
'a' 'on' 'and' 'ate' 'cat' 'fat' 'mat' 'rat' 'sat'
|
||||
</programlisting>
|
||||
|
||||
Notice that <literal>space</literal> is also a lexeme:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'space '' '' is a lexeme'::tsvector;
|
||||
tsvector
|
||||
----------------------------------
|
||||
'a' 'is' ' ' 'space' 'lexeme'
|
||||
</programlisting>
|
||||
|
||||
Each lexeme, optionally, can have positional information which is used for
|
||||
<firstterm>proximity ranking</firstterm>:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector;
|
||||
tsvector
|
||||
-------------------------------------------------------------------------------
|
||||
'a':1,6,10 'on':5 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4
|
||||
</programlisting>
|
||||
|
||||
Each lexeme position also can be labeled as <literal>A</literal>,
|
||||
<literal>B</literal>, <literal>C</literal>, <literal>D</literal>,
|
||||
where <literal>D</literal> is the default. These labels can be used to group
|
||||
lexemes into different <emphasis>importance</emphasis> or
|
||||
<emphasis>rankings</emphasis>, for example to reflect document structure.
|
||||
Actual values can be assigned at search time and used during the calculation
|
||||
of the document rank. This is very useful for controlling search results.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The concatenation operator, e.g. <literal>tsvector || tsvector</literal>,
|
||||
can "construct" a document from several parts. The order is important if
|
||||
<type>tsvector</type> contains positional information. Of course,
|
||||
it is also possible to build a document using different tables:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'fat:1 cat:2'::tsvector || 'fat:1 rat:2'::tsvector;
|
||||
?column?
|
||||
---------------------------
|
||||
'cat':2 'fat':1,3 'rat':4
|
||||
|
||||
SELECT 'fat:1 rat:2'::tsvector || 'fat:1 cat:2'::tsvector;
|
||||
?column?
|
||||
---------------------------
|
||||
'cat':4 'fat':1,3 'rat':2
|
||||
</programlisting>
|
||||
|
||||
</para>
|
||||
|
||||
</listitem>
|
||||
|
||||
</varlistentry>
|
||||
|
||||
<indexterm zone="datatype-textsearch">
|
||||
<primary>tsquery</primary>
|
||||
</indexterm>
|
||||
|
||||
<varlistentry>
|
||||
<term><firstterm>tsquery</firstterm></term>
|
||||
<listitem>
|
||||
|
||||
<para>
|
||||
<type>tsquery</type> is a data type for textual queries which supports
|
||||
the boolean operators <literal>&</literal> (AND), <literal>|</literal> (OR),
|
||||
and parentheses. A <type>tsquery</type> consists of lexemes
|
||||
(optionally labeled by letters) with boolean operators in between:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'fat & cat'::tsquery;
|
||||
tsquery
|
||||
---------------
|
||||
'fat' & 'cat'
|
||||
SELECT 'fat:ab & cat'::tsquery;
|
||||
tsquery
|
||||
------------------
|
||||
'fat':AB & 'cat'
|
||||
</programlisting>
|
||||
|
||||
Labels can be used to restrict the search region, which allows the
|
||||
development of different search engines using the same full text index.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<type>tsquery</type> values can be concatenated using <literal>&&</literal> (AND)
|
||||
and <literal>||</literal> (OR) operators:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'a & b'::tsquery && 'c | d'::tsquery;
|
||||
?column?
|
||||
---------------------------
|
||||
'a' & 'b' & ( 'c' | 'd' )
|
||||
|
||||
SELECT 'a & b'::tsquery || 'c|d'::tsquery;
|
||||
?column?
|
||||
---------------------------
|
||||
'a' & 'b' | ( 'c' | 'd' )
|
||||
</programlisting>
|
||||
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="datatype-xml">
|
||||
<title><acronym>XML</> Type</title>
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.388 2007/08/21 01:11:11 tgl Exp $ -->
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.389 2007/08/29 20:37:14 momjian Exp $ -->
|
||||
|
||||
<chapter id="functions">
|
||||
<title>Functions and Operators</title>
|
||||
@ -7551,6 +7551,920 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="functions-textsearch">
|
||||
<title>Full Text Search Operators and Functions</title>
|
||||
|
||||
<para>
|
||||
This section outlines all the functions and operators that are available
|
||||
for full text searching.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Full text search vectors and queries both use lexemes, but for different
|
||||
purposes. A <type>tsvector</type> represents the lexemes (tokens) parsed
|
||||
out of a document, with an optional position. A <type>tsquery</type>
|
||||
specifies a boolean condition using lexemes.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
All of the following functions that accept a configuration argument can
|
||||
use a textual configuration name to select a configuration. If the argument
|
||||
is omitted, the configuration specified by
|
||||
<varname>default_text_search_config</> is used. For more information on
|
||||
configuration, see <xref linkend="textsearch-tables-configuration">.
|
||||
</para>
|
||||
|
||||
<sect2 id="functions-textsearch-search-operator">
|
||||
<title>Search</title>
|
||||
|
||||
<para>The operator <literal>@@</> is used to perform full text
|
||||
searches:
|
||||
</para>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-search-operator">
|
||||
<primary>TSVECTOR @@ TSQUERY</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
<!-- why allow such combinations? -->
|
||||
TSVECTOR @@ TSQUERY
|
||||
TSQUERY @@ TSVECTOR
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns <literal>true</literal> if <literal>TSQUERY</literal> is contained
|
||||
in <literal>TSVECTOR</literal>, and <literal>false</literal> if not:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'cat & rat'::tsquery;
|
||||
?column?
|
||||
----------
|
||||
t
|
||||
|
||||
SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'fat & cow'::tsquery;
|
||||
?column?
|
||||
----------
|
||||
f
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-search-operator">
|
||||
<primary>TEXT @@ TSQUERY</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
text @@ tsquery
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns <literal>true</literal> if <literal>TSQUERY</literal> is contained
|
||||
in <literal>TEXT</literal>, and <literal>false</literal> if not:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & rat'::tsquery;
|
||||
?column?
|
||||
----------
|
||||
t
|
||||
|
||||
SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & cow'::tsquery;
|
||||
?column?
|
||||
----------
|
||||
f
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-search-operator">
|
||||
<primary>TEXT @@ TEXT</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
<!-- this is very confusing because there is no rule suggesting which is
|
||||
first. -->
|
||||
text @@ text
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns <literal>true</literal> if the right
|
||||
argument (the query) is contained in the left argument, and
|
||||
<literal>false</literal> otherwise:
|
||||
|
||||
<programlisting>
|
||||
SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat rat';
|
||||
?column?
|
||||
----------
|
||||
t
|
||||
|
||||
SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat cow';
|
||||
?column?
|
||||
----------
|
||||
f
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<para>
|
||||
For index support of full text operators consult <xref linkend="textsearch-indexes">.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="functions-textsearch-tsvector">
|
||||
<title>tsvector</title>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>to_tsvector</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
to_tsvector(<optional><replaceable class="PARAMETER">config_name</replaceable></optional>, <replaceable class="PARAMETER">document</replaceable> TEXT) returns TSVECTOR
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Parses a document into tokens, reduces the tokens to lexemes, and returns a
|
||||
<type>tsvector</type> which lists the lexemes together with their positions in the document
|
||||
in lexicographic order.
|
||||
</para>
|
||||
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>strip</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
strip(<replaceable class="PARAMETER">vector</replaceable> TSVECTOR) returns TSVECTOR
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns a vector which lists the same lexemes as the given vector, but
|
||||
which lacks any information about where in the document each lexeme
|
||||
appeared. While the returned vector is useless for relevance ranking, it
|
||||
will usually be much smaller.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>setweight</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
setweight(<replaceable class="PARAMETER">vector</replaceable> TSVECTOR, <replaceable class="PARAMETER">letter</replaceable>) returns TSVECTOR
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
This function returns a copy of the input vector in which every location
|
||||
has been labeled with either the letter <literal>A</literal>,
|
||||
<literal>B</literal>, or <literal>C</literal>, or the default label
|
||||
<literal>D</literal> (which is the default for new vectors
|
||||
and as such is usually not displayed). These labels are retained
|
||||
when vectors are concatenated, allowing words from different parts of a
|
||||
document to be weighted differently by ranking functions.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>tsvector concatenation</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
<replaceable class="PARAMETER">vector1</replaceable> || <replaceable class="PARAMETER">vector2</replaceable>
|
||||
tsvector_concat(<replaceable class="PARAMETER">vector1</replaceable> TSVECTOR, <replaceable class="PARAMETER">vector2</replaceable> TSVECTOR) returns TSVECTOR
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns a vector which combines the lexemes and positional information of
|
||||
the two vectors given as arguments. Positional weight labels (described
|
||||
in the previous paragraph) are retained during the concatenation. This
|
||||
has at least two uses. First, if some sections of your document need to be
|
||||
parsed with different configurations than others, you can parse them
|
||||
separately and then concatenate the resulting vectors. Second, you can
|
||||
weigh words from one section of your document differently than the others
|
||||
by parsing the sections into separate vectors and assigning each vector
|
||||
a different position label with the <function>setweight()</function>
|
||||
function. You can then concatenate them into a single vector and provide
|
||||
a weights argument to the <function>ts_rank()</function> function that assigns
|
||||
different weights to positions with different labels.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>length(tsvector)</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
length(<replaceable class="PARAMETER">vector</replaceable> TSVECTOR) returns INT4
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns the number of lexemes stored in the vector.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>text::tsvector</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
<replaceable>text</replaceable>::TSVECTOR returns TSVECTOR
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Directly casting <type>text</type> to a <type>tsvector</type> allows you
|
||||
to directly inject lexemes into a vector with whatever positions and
|
||||
positional weights you choose to specify. The text should be formatted to
|
||||
match the way a vector is displayed by <literal>SELECT</literal>.
|
||||
<!-- TODO what a strange definition, I think something like
|
||||
"input format" or so should be used (and defined somewhere, didn't see
|
||||
it yet) -->
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>trigger</primary>
|
||||
<secondary>for updating a derived tsvector column</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
tsvector_update_trigger(<replaceable class="PARAMETER">tsvector_column_name</replaceable>, <replaceable class="PARAMETER">config_name</replaceable>, <replaceable class="PARAMETER">text_column_name</replaceable> <optional>, ... </optional>)
|
||||
tsvector_update_trigger_column(<replaceable class="PARAMETER">tsvector_column_name</replaceable>, <replaceable class="PARAMETER">config_column_name</replaceable>, <replaceable class="PARAMETER">text_column_name</replaceable> <optional>, ... </optional>)
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Two built-in trigger functions are available to automatically update a
|
||||
<type>tsvector</> column from one or more textual columns. An example
|
||||
of their use is:
|
||||
|
||||
<programlisting>
|
||||
CREATE TABLE tblMessages (
|
||||
strMessage text,
|
||||
tsv tsvector
|
||||
);
|
||||
|
||||
CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE
|
||||
ON tblMessages FOR EACH ROW EXECUTE PROCEDURE
|
||||
tsvector_update_trigger(tsv, 'pg_catalog.english', strMessage);
|
||||
</programlisting>
|
||||
|
||||
Having created this trigger, any change in <structfield>strMessage</>
|
||||
will be automatically reflected into <structfield>tsv</>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Both triggers require you to specify the text search configuration to
|
||||
be used to perform the conversion. For
|
||||
<function>tsvector_update_trigger</>, the configuration name is simply
|
||||
given as the second trigger argument. It must be schema-qualified as
|
||||
shown above, so that the trigger behavior will not change with changes
|
||||
in <varname>search_path</>. For
|
||||
<function>tsvector_update_trigger_column</>, the second trigger argument
|
||||
is the name of another table column, which must be of type
|
||||
<type>regconfig</>. This allows a per-row selection of configuration
|
||||
to be made.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>ts_stat</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
ts_stat(<replaceable class="PARAMETER">sqlquery</replaceable> text <optional>, <replaceable class="PARAMETER">weights</replaceable> text </optional>) returns SETOF statinfo
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Here <type>statinfo</type> is a type, defined as:
|
||||
|
||||
<programlisting>
|
||||
CREATE TYPE statinfo AS (word text, ndoc integer, nentry integer);
|
||||
</programlisting>
|
||||
|
||||
and <replaceable>sqlquery</replaceable> is a text value containing a SQL query
|
||||
which returns a single <type>tsvector</type> column. <function>ts_stat</>
|
||||
executes the query and returns statistics about the resulting
|
||||
<type>tsvector</type> data, i.e., the number of documents, <literal>ndoc</>,
|
||||
and the total number of words in the collection, <literal>nentry</>. It is
|
||||
useful for checking your configuration and to find stop word candidates. For
|
||||
example, to find the ten most frequent words:
|
||||
|
||||
<programlisting>
|
||||
SELECT * FROM ts_stat('SELECT vector from apod')
|
||||
ORDER BY ndoc DESC, nentry DESC, word
|
||||
LIMIT 10;
|
||||
</programlisting>
|
||||
|
||||
Optionally, one can specify <replaceable>weights</replaceable> to obtain
|
||||
statistics about words with a specific <replaceable>weight</replaceable>:
|
||||
|
||||
<programlisting>
|
||||
SELECT * FROM ts_stat('SELECT vector FROM apod','a')
|
||||
ORDER BY ndoc DESC, nentry DESC, word
|
||||
LIMIT 10;
|
||||
</programlisting>
|
||||
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsvector">
|
||||
<primary>Btree operations for tsvector</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
TSVECTOR < TSVECTOR
|
||||
TSVECTOR <= TSVECTOR
|
||||
TSVECTOR = TSVECTOR
|
||||
TSVECTOR >= TSVECTOR
|
||||
TSVECTOR > TSVECTOR
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
All btree operations are defined for the <type>tsvector</type> type.
|
||||
<type>tsvector</>s are compared with each other using
|
||||
<emphasis>lexicographical</emphasis> ordering.
|
||||
<!-- TODO of the output representation or something else? -->
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="functions-textsearch-tsquery">
|
||||
<title>tsquery</title>
|
||||
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>to_tsquery</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
to_tsquery(<optional><replaceable class="PARAMETER">config_name</replaceable></optional>, <replaceable class="PARAMETER">querytext</replaceable> text) returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Accepts <replaceable>querytext</replaceable>, which should consist of single tokens
|
||||
separated by the boolean operators <literal>&</literal> (and), <literal>|</literal>
|
||||
(or) and <literal>!</literal> (not), which can be grouped using parentheses.
|
||||
In other words, <function>to_tsquery</function> expects already parsed text.
|
||||
Each token is reduced to a lexeme using the specified or current configuration.
|
||||
A weight class can be assigned to each lexeme entry to restrict the search region
|
||||
(see <function>setweight</function> for an explanation). For example:
|
||||
|
||||
<programlisting>
|
||||
'fat:a & rats'
|
||||
</programlisting>
|
||||
|
||||
The <function>to_tsquery</function> function can also accept a <literal>text
|
||||
string</literal>. In this case <replaceable>querytext</replaceable> should
|
||||
be quoted. This may be useful, for example, to use with a thesaurus
|
||||
dictionary. In the example below, a thesaurus contains rule <literal>supernovae
|
||||
stars : sn</literal>:
|
||||
|
||||
<programlisting>
|
||||
SELECT to_tsquery('''supernovae stars'' & !crab');
|
||||
to_tsquery
|
||||
---------------
|
||||
'sn' & !'crab'
|
||||
</programlisting>
|
||||
|
||||
Without quotes <function>to_tsquery</function> will generate a syntax error.
|
||||
</para>
|
||||
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>plainto_tsquery</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
plainto_tsquery(<optional><replaceable class="PARAMETER">config_name</replaceable></optional>, <replaceable class="PARAMETER">querytext</replaceable> text) returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Transforms unformatted text <replaceable>querytext</replaceable> to <type>tsquery</type>.
|
||||
It is the same as <function>to_tsquery</function> but accepts <literal>text</literal>
|
||||
without quotes and will call the parser to break it into tokens.
|
||||
<function>plainto_tsquery</function> assumes the <literal>&</literal> boolean
|
||||
operator between words and does not recognize weight classes.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>querytree</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
querytree(<replaceable class="PARAMETER">query</replaceable> TSQUERY) returns TEXT
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
This returns the query used for searching an index. It can be used to test
|
||||
for an empty query. The <command>SELECT</> below returns <literal>NULL</>,
|
||||
which corresponds to an empty query since GIN indexes do not support queries with negation
|
||||
<!-- TODO or "negated queries" (depending on what the correct rule is) -->
|
||||
(a full index scan is inefficient):
|
||||
|
||||
<programlisting>
|
||||
SELECT querytree(to_tsquery('!defined'));
|
||||
querytree
|
||||
-----------
|
||||
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>text::tsquery casting</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
<replaceable class="PARAMETER">text</replaceable>::TSQUERY returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Directly casting <replaceable>text</replaceable> to a <type>tsquery</type>
|
||||
allows you to directly inject lexemes into a query using whatever positions
|
||||
and positional weight flags you choose to specify. The text should be
|
||||
formatted to match the way a query is displayed by
|
||||
<literal>SELECT</literal>.
|
||||
<!-- TODO what a strange definition, I think something like
|
||||
"input format" or so should be used (and defined somewhere, didn't see
|
||||
it yet) -->
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>numnode</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
numnode(<replaceable class="PARAMETER">query</replaceable> TSQUERY) returns INTEGER
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
This returns the number of nodes in a query tree. This function can be
|
||||
used to determine if <replaceable>query</replaceable> is meaningful
|
||||
(returns > 0), or contains only stop words (returns 0):
|
||||
|
||||
<programlisting>
|
||||
SELECT numnode(plainto_tsquery('the any'));
|
||||
NOTICE: query contains only stopword(s) or does not contain lexeme(s), ignored
|
||||
numnode
|
||||
---------
|
||||
0
|
||||
|
||||
SELECT numnode(plainto_tsquery('the table'));
|
||||
numnode
|
||||
---------
|
||||
1
|
||||
|
||||
SELECT numnode(plainto_tsquery('long table'));
|
||||
numnode
|
||||
---------
|
||||
3
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>TSQUERY && TSQUERY</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
TSQUERY && TSQUERY returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns <literal>AND</literal>-ed TSQUERY
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>TSQUERY || TSQUERY</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
TSQUERY || TSQUERY returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns <literal>OR</literal>-ed TSQUERY
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>!! TSQUERY</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
!! TSQUERY returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns the negation of TSQUERY
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>Btree operations for tsquery</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
TSQUERY < TSQUERY
|
||||
TSQUERY <= TSQUERY
|
||||
TSQUERY = TSQUERY
|
||||
TSQUERY >= TSQUERY
|
||||
TSQUERY > TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
All btree operations are defined for the <type>tsquery</type> type.
|
||||
tsqueries are compared to each other using <emphasis>lexicographical</emphasis>
|
||||
ordering.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<sect3 id="functions-textsearch-queryrewriting">
|
||||
<title>Query Rewriting</title>
|
||||
|
||||
<para>
|
||||
Query rewriting is a set of functions and operators for the
|
||||
<type>tsquery</type> data type. It allows control at search
|
||||
<emphasis>query time</emphasis> without reindexing (the opposite of the
|
||||
thesaurus). For example, you can expand the search using synonyms
|
||||
(<literal>new york</>, <literal>big apple</>, <literal>nyc</>,
|
||||
<literal>gotham</>) or narrow the search to direct the user to some hot
|
||||
topic.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The <function>ts_rewrite()</function> function changes the original query by
|
||||
replacing part of the query with some other string of type <type>tsquery</type>,
|
||||
as defined by the rewrite rule. Arguments to <function>ts_rewrite()</function>
|
||||
can be names of columns of type <type>tsquery</type>.
|
||||
</para>
|
||||
|
||||
<programlisting>
|
||||
CREATE TABLE aliases (t TSQUERY PRIMARY KEY, s TSQUERY);
|
||||
INSERT INTO aliases VALUES('a', 'c');
|
||||
</programlisting>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>ts_rewrite</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
ts_rewrite (<replaceable class="PARAMETER">query</replaceable> TSQUERY, <replaceable class="PARAMETER">target</replaceable> TSQUERY, <replaceable class="PARAMETER">sample</replaceable> TSQUERY) returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<programlisting>
|
||||
SELECT ts_rewrite('a & b'::tsquery, 'a'::tsquery, 'c'::tsquery);
|
||||
ts_rewrite
|
||||
------------
|
||||
'b' & 'c'
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
ts_rewrite(ARRAY[<replaceable class="PARAMETER">query</replaceable> TSQUERY, <replaceable class="PARAMETER">target</replaceable> TSQUERY, <replaceable class="PARAMETER">sample</replaceable> TSQUERY]) returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<programlisting>
|
||||
SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s]) FROM aliases;
|
||||
ts_rewrite
|
||||
------------
|
||||
'b' & 'c'
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
ts_rewrite (<replaceable class="PARAMETER">query</> TSQUERY, <literal>'SELECT target, sample FROM test'</literal>::text) returns TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<programlisting>
|
||||
SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases');
|
||||
ts_rewrite
|
||||
------------
|
||||
'b' & 'c'
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<para>
|
||||
What if there are several instances of rewriting? For example, query
|
||||
<literal>'a & b'</literal> can be rewritten as
|
||||
<literal>'b & c'</literal> or <literal>'cc'</literal>.
|
||||
|
||||
<programlisting>
|
||||
SELECT * FROM aliases;
|
||||
t | s
|
||||
-----------+------
|
||||
'a' | 'c'
|
||||
'x' | 'z'
|
||||
'a' & 'b' | 'cc'
|
||||
</programlisting>
|
||||
|
||||
This ambiguity can be resolved by specifying a sort order:
|
||||
|
||||
<programlisting>
|
||||
SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t DESC');
|
||||
ts_rewrite
|
||||
---------
|
||||
'cc'
|
||||
|
||||
SELECT ts_rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t ASC');
|
||||
ts_rewrite
|
||||
--------------
|
||||
'b' & 'c'
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Let's consider a real-life astronomical example. We'll expand query
|
||||
<literal>supernovae</literal> using table-driven rewriting rules:
|
||||
|
||||
<programlisting>
|
||||
CREATE TABLE aliases (t tsquery primary key, s tsquery);
|
||||
INSERT INTO aliases VALUES(to_tsquery('supernovae'), to_tsquery('supernovae|sn'));
|
||||
|
||||
SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab');
|
||||
?column?
|
||||
-------------------------------
|
||||
( 'supernova' | 'sn' ) & 'crab'
|
||||
</programlisting>
|
||||
|
||||
Notice that we can change the rewriting rule online<!-- TODO maybe use another word for "online"? -->:
|
||||
|
||||
<programlisting>
|
||||
UPDATE aliases SET s=to_tsquery('supernovae|sn & !nebulae') WHERE t=to_tsquery('supernovae');
|
||||
SELECT ts_rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab');
|
||||
?column?
|
||||
-----------------------------------------------
|
||||
( 'supernova' | 'sn' & !'nebula' ) & 'crab'
|
||||
</programlisting>
|
||||
</para>
|
||||
</sect3>
|
||||
|
||||
<sect3 id="functions-textsearch-tsquery-ops">
|
||||
<title>Operators For tsquery</title>
|
||||
|
||||
<para>
|
||||
Rewriting can be slow for many rewriting rules since it checks every rule
|
||||
for a possible hit. To filter out obvious non-candidate rules there are containment
|
||||
operators for the <type>tsquery</type> type. In the example below, we select only those
|
||||
rules which might match the original query:
|
||||
|
||||
<programlisting>
|
||||
SELECT ts_rewrite(ARRAY['a & b'::tsquery, t,s])
|
||||
FROM aliases
|
||||
WHERE 'a & b' @> t;
|
||||
ts_rewrite
|
||||
------------
|
||||
'b' & 'c'
|
||||
</programlisting>
|
||||
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Two operators are defined for <type>tsquery</type>:
|
||||
</para>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>TSQUERY @> TSQUERY</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
TSQUERY @> TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns <literal>true</literal> if the right argument might be contained in the left argument.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm zone="functions-textsearch-tsquery">
|
||||
<primary>TSQUERY <@ TSQUERY</primary>
|
||||
</indexterm>
|
||||
|
||||
<term>
|
||||
<synopsis>
|
||||
TSQUERY <@ TSQUERY
|
||||
</synopsis>
|
||||
</term>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Returns <literal>true</literal> if the left argument might be contained in the right argument.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
|
||||
</sect3>
|
||||
|
||||
<sect3 id="functions-textsearch-tsqueryindex">
|
||||
<title>Index For tsquery</title>
|
||||
|
||||
<para>
|
||||
To speed up operators <literal><@</> and <literal>@></literal> for
|
||||
<type>tsquery</type> one can use a <acronym>GiST</acronym> index with
|
||||
a <literal>tsquery_ops</literal> opclass:
|
||||
|
||||
<programlisting>
|
||||
CREATE INDEX t_idx ON aliases USING gist (t tsquery_ops);
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
</sect3>
|
||||
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="functions-xml">
|
||||
<title>XML Functions</title>
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user