diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index a337df61af..5a31f85a13 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -104,11 +104,21 @@ initTrie(char *filename) while ((line = tsearch_readline(&trst)) != NULL) { - /* - * The format of each line must be "src trg" where src and trg - * are sequences of one or more non-whitespace characters, - * separated by whitespace. Whitespace at start or end of - * line is ignored. + /*---------- + * The format of each line must be "src" or "src trg", where + * src and trg are sequences of one or more non-whitespace + * characters, separated by whitespace. Whitespace at start + * or end of line is ignored. If trg is omitted, an empty + * string is used as the replacement. + * + * We use a simple state machine, with states + * 0 initial (before src) + * 1 in src + * 2 in whitespace after src + * 3 in trg + * 4 in whitespace after trg + * -1 syntax error detected (line will be ignored) + *---------- */ int state; char *ptr; @@ -160,7 +170,14 @@ initTrie(char *filename) } } - if (state >= 3) + if (state == 1 || state == 2) + { + /* trg was omitted, so use "" */ + trg = ""; + trglen = 0; + } + + if (state > 0) rootTrie = placeChar(rootTrie, (unsigned char *) src, srclen, trg, trglen); diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml index af9cad5d8c..aef0031dcb 100644 --- a/doc/src/sgml/unaccent.sgml +++ b/doc/src/sgml/unaccent.sgml @@ -45,9 +45,9 @@ - Each line represents a pair, consisting of a character with accent - followed by a character without accent. The first is translated into - the second. For example, + Each line represents one translation rule, consisting of a character with + accent followed by a character without accent. The first is translated + into the second. For example, À A Á A @@ -57,6 +57,27 @@ Å A Æ A + The two characters must be separated by whitespace, and any leading or + trailing whitespace on a line is ignored. + + + + + + Alternatively, if only one character is given on a line, instances of + that character are deleted; this is useful in languages where accents + are represented by separate characters. + + + + + + As with other PostgreSQL text search configuration files, + the rules file must be stored in UTF-8 encoding. The data is + automatically translated into the current database's encoding when + loaded. Any lines containing untranslatable characters are silently + ignored, so that rules files can contain rules that are not applicable in + the current encoding. @@ -132,8 +153,8 @@ mydb=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels') The unaccent() function removes accents (diacritic signs) from - a given string. Basically, it's a wrapper around the - unaccent dictionary, but it can be used outside normal + a given string. Basically, it's a wrapper around + unaccent-type dictionaries, but it can be used outside normal text search contexts. @@ -145,6 +166,11 @@ mydb=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels') unaccent(dictionary, string) returns text + + If the dictionary argument is + omitted, unaccent is assumed. + + For example: