diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
index a337df61af..5a31f85a13 100644
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -104,11 +104,21 @@ initTrie(char *filename)
while ((line = tsearch_readline(&trst)) != NULL)
{
- /*
- * The format of each line must be "src trg" where src and trg
- * are sequences of one or more non-whitespace characters,
- * separated by whitespace. Whitespace at start or end of
- * line is ignored.
+ /*----------
+ * The format of each line must be "src" or "src trg", where
+ * src and trg are sequences of one or more non-whitespace
+ * characters, separated by whitespace. Whitespace at start
+ * or end of line is ignored. If trg is omitted, an empty
+ * string is used as the replacement.
+ *
+ * We use a simple state machine, with states
+ * 0 initial (before src)
+ * 1 in src
+ * 2 in whitespace after src
+ * 3 in trg
+ * 4 in whitespace after trg
+ * -1 syntax error detected (line will be ignored)
+ *----------
*/
int state;
char *ptr;
@@ -160,7 +170,14 @@ initTrie(char *filename)
}
}
- if (state >= 3)
+ if (state == 1 || state == 2)
+ {
+ /* trg was omitted, so use "" */
+ trg = "";
+ trglen = 0;
+ }
+
+ if (state > 0)
rootTrie = placeChar(rootTrie,
(unsigned char *) src, srclen,
trg, trglen);
diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml
index af9cad5d8c..aef0031dcb 100644
--- a/doc/src/sgml/unaccent.sgml
+++ b/doc/src/sgml/unaccent.sgml
@@ -45,9 +45,9 @@
- Each line represents a pair, consisting of a character with accent
- followed by a character without accent. The first is translated into
- the second. For example,
+ Each line represents one translation rule, consisting of a character with
+ accent followed by a character without accent. The first is translated
+ into the second. For example,
À A
Á A
@@ -57,6 +57,27 @@
Å A
Æ A
+ The two characters must be separated by whitespace, and any leading or
+ trailing whitespace on a line is ignored.
+
+
+
+
+
+ Alternatively, if only one character is given on a line, instances of
+ that character are deleted; this is useful in languages where accents
+ are represented by separate characters.
+
+
+
+
+
+ As with other PostgreSQL text search configuration files,
+ the rules file must be stored in UTF-8 encoding. The data is
+ automatically translated into the current database's encoding when
+ loaded. Any lines containing untranslatable characters are silently
+ ignored, so that rules files can contain rules that are not applicable in
+ the current encoding.
@@ -132,8 +153,8 @@ mydb=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels')
The unaccent() function removes accents (diacritic signs) from
- a given string. Basically, it's a wrapper around the
- unaccent dictionary, but it can be used outside normal
+ a given string. Basically, it's a wrapper around
+ unaccent-type dictionaries, but it can be used outside normal
text search contexts.
@@ -145,6 +166,11 @@ mydb=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels')
unaccent(dictionary, string) returns text
+
+ If the dictionary argument is
+ omitted, unaccent is assumed.
+
+
For example: