Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly

Add Python script for building unaccent.rules from Unicode data. Don't backpatch because unaccent changes may require tsvector/index rebuild. Thomas Munro <thomas.munro@enterprisedb.com>
2015-09-04 12:51:53 +03:00 · 2015-09-04 12:51:53 +03:00 · 1bbd52cb9a
commit 1bbd52cb9a
parent 4aec49899e
2 changed files with 415 additions and 66 deletions
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@ -0,0 +1,123 @@
 #!/usr/bin/python
 #
 # This script builds unaccent.rules on standard output when given the
 # contents of UnicodeData.txt[1] on standard input.  Optionally includes
 # ligature expansion, if --expand-ligatures is given on the command line.
 #
 # The approach is to use the Unicode decomposition data to identify
 # precomposed codepoints that are equivalent to a ligature of several
 # letters, or a base letter with any number of diacritical marks.
 # There is also a small set of special cases for codepoints that we
 # traditionally support even though Unicode doesn't consider them to
 # be ligatures or letters with marks.
 #
 # [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
 import re
 import sys
 def print_record(codepoint, letter):
    """Write one unaccent rule to standard output, encoded as UTF-8.
    The rule consists of the character for the integer codepoint, a tab,
    the replacement text given in letter, and a newline.  Works on both
    Python 2 and Python 3.
    """
    try:
        to_char = unichr    # Python 2: chr() only covers 0..255
    except NameError:
        to_char = chr       # Python 3: chr() covers all of Unicode
    record = (to_char(codepoint) + "\t" + letter + "\n").encode("UTF-8")
    # Emit bytes so the output is UTF-8 whatever the locale says: Python 3
    # exposes the underlying byte stream as sys.stdout.buffer, while
    # Python 2's sys.stdout accepts byte strings directly.
    getattr(sys.stdout, "buffer", sys.stdout).write(record)
 class Codepoint:
    """One UnicodeData.txt entry, reduced to the fields this script uses."""
    def __init__(self, id, general_category, combining_ids):
        # Integer ids of the codepoints in this entry's decomposition
        # (main() passes an empty list when there is no decomposition).
        self.combining_ids = combining_ids
        # Unicode General_Category value, such as "Lu" or "Mn".
        self.general_category = general_category
        # Integer value of the codepoint itself.
        self.id = id
 def is_plain_letter(codepoint):
    """Tell whether codepoint is an unadorned ASCII letter (a-z or A-Z)."""
    value = codepoint.id
    is_lower = ord('a') <= value <= ord('z')
    is_upper = ord('A') <= value <= ord('Z')
    return is_lower or is_upper
 def is_mark(codepoint):
    """Tell whether codepoint is a diacritical mark (combining codepoint)."""
    # The Unicode general categories for marks: nonspacing, enclosing and
    # spacing combining.
    mark_categories = ("Mn", "Me", "Mc")
    category = codepoint.general_category
    return category in mark_categories
 def is_letter_with_marks(codepoint, table):
    """Tell whether codepoint decomposes to a plain letter plus marks.
    The decomposition must have at least two elements: the first has to be
    a plain ASCII letter and every following one a mark.  table maps
    codepoint ids to Codepoint objects and is used to look the elements up.
    """
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
    parts = codepoint.combining_ids
    if len(parts) <= 1:
        return False
    if not is_plain_letter(table[parts[0]]):
        return False
    for mark_id in parts[1:]:
        if not is_mark(table[mark_id]):
            return False
    return True
 def is_letter(codepoint, table):
    """Tell whether codepoint is a letter, with or without diacritical marks."""
    if is_plain_letter(codepoint):
        return True
    # Not a bare ASCII letter; it may still be one carrying marks.
    return is_letter_with_marks(codepoint, table)
 def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.
    For a letter with marks this is the Codepoint of the first element of
    its decomposition, looked up in table; a plain letter is returned
    unchanged.
    Raises ValueError if codepoint is neither a plain letter nor a letter
    with marks.
    """
    if is_letter_with_marks(codepoint, table):
        return table[codepoint.combining_ids[0]]
    if is_plain_letter(codepoint):
        return codepoint
    # Raising a bare string is not valid Python (it fails with an unrelated
    # TypeError), so report the problem with a real exception.
    raise ValueError("U+%04X is not a letter, with or without marks"
                     % codepoint.id)
 def is_ligature(codepoint, table):
    """Tell whether every element of the decomposition is a letter.
    Each element, looked up in table, may be a plain letter or a letter
    with marks.  A codepoint with an empty decomposition passes trivially.
    """
    for part_id in codepoint.combining_ids:
        if not is_letter(table[part_id], table):
            return False
    return True
 def get_plain_letters(codepoint, table):
    """Return the plain letters a ligature is made of, as a list.
    Every element of the decomposition is looked up in table and reduced
    to its base letter.  codepoint must satisfy is_ligature().
    """
    assert is_ligature(codepoint, table)
    letters = []
    for part_id in codepoint.combining_ids:
        letters.append(get_plain_letter(table[part_id], table))
    return letters
 def main(expand_ligatures):
    """Read UnicodeData.txt on stdin and print unaccent rules on stdout.
    A rule is printed for every letter (general category starting with
    'L') whose decomposition is a plain ASCII letter followed by marks.
    If expand_ligatures is true, letters that decompose into several
    letters are printed as well, mapped to the sequence of their base
    letters.  A fixed list of special cases is printed last.
    """
    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
    decomposition_type_pattern = re.compile(" *<[^>]*> *")
    table = {}
    codepoints = []
    # read everything we need into memory
    for line in sys.stdin:
        fields = line.split(";")
        if len(fields) <= 5:
            # not a complete record, nothing to extract
            continue
        # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
        general_category = fields[2]
        # drop the optional "<type>" tag, leaving only hex codepoints
        decomposition = decomposition_type_pattern.sub(' ', fields[5])
        codepoint_id = int(fields[0], 16)
        combining_ids = [int(s, 16) for s in decomposition.split()]
        codepoint = Codepoint(codepoint_id, general_category, combining_ids)
        table[codepoint_id] = codepoint
        codepoints.append(codepoint)
    # walk through all the codepoints, in input order, looking for
    # interesting mappings
    for codepoint in codepoints:
        if not codepoint.general_category.startswith('L') or \
           len(codepoint.combining_ids) <= 1:
            continue
        if is_letter_with_marks(codepoint, table):
            print_record(codepoint.id,
                         chr(get_plain_letter(codepoint, table).id))
        elif expand_ligatures and is_ligature(codepoint, table):
            # Plain letters are always ASCII, so chr() is sufficient on
            # both Python 2 and Python 3.
            print_record(codepoint.id,
                         "".join(chr(letter.id)
                                 for letter
                                 in get_plain_letters(codepoint, table)))
    # some special cases
    special_cases = [
        (0x00d8, "O"),  # LATIN CAPITAL LETTER O WITH STROKE
        (0x00f8, "o"),  # LATIN SMALL LETTER O WITH STROKE
        (0x0110, "D"),  # LATIN CAPITAL LETTER D WITH STROKE
        (0x0111, "d"),  # LATIN SMALL LETTER D WITH STROKE
        (0x0131, "i"),  # LATIN SMALL LETTER DOTLESS I
        (0x0126, "H"),  # LATIN CAPITAL LETTER H WITH STROKE
        (0x0127, "h"),  # LATIN SMALL LETTER H WITH STROKE
        (0x0141, "L"),  # LATIN CAPITAL LETTER L WITH STROKE
        (0x0142, "l"),  # LATIN SMALL LETTER L WITH STROKE
        (0x0149, "'n"),  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
        (0x0166, "T"),  # LATIN CAPITAL LETTER T WITH STROKE
        (0x0167, "t"),  # LATIN SMALL LETTER T WITH STROKE
        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO
    ]
    if expand_ligatures:
        special_cases += [
            (0x00c6, "AE"),  # LATIN CAPITAL LETTER AE
            (0x00df, "ss"),  # LATIN SMALL LETTER SHARP S
            (0x00e6, "ae"),  # LATIN SMALL LETTER AE
            (0x0152, "OE"),  # LATIN CAPITAL LIGATURE OE
            (0x0153, "oe"),  # LATIN SMALL LIGATURE OE
        ]
    for special_id, replacement in special_cases:
        print_record(special_id, replacement)
 if __name__ == "__main__":
    # Ligature expansion is enabled only when the command line consists of
    # exactly one argument, --expand-ligatures.
    main(sys.argv[1:] == ["--expand-ligatures"])
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@ -4,22 +4,59 @@
 Ã	A
 Ä	A
 Å	A
-Æ	A
+Ç	C
 È	E
 É	E
 Ê	E
 Ë	E
 Ì	I
 Í	I
 Î	I
 Ï	I
 Ñ	N
 Ò	O
 Ó	O
 Ô	O
 Õ	O
 Ö	O
 Ù	U
 Ú	U
 Û	U
 Ü	U
 Ý	Y
 à	a
 á	a
 â	a
 ã	a
 ä	a
 å	a
-æ	a
+ç	c
 è	e
 é	e
 ê	e
 ë	e
 ì	i
 í	i
 î	i
 ï	i
 ñ	n
 ò	o
 ó	o
 ô	o
 õ	o
 ö	o
 ù	u
 ú	u
 û	u
 ü	u
 ý	y
 ÿ	y
 Ā	A
 ā	a
 Ă	A
 ă	a
 Ą	A
 ą	a
 Ç	C
 ç	c
 Ć	C
 ć	c
 Ĉ	C
@ -30,16 +67,6 @@
 č	c
 Ď	D
 ď	d
 Đ	D
 đ	d
 È	E
 É	E
 Ê	E
 Ë	E
 è	e
 é	e
 ê	e
 ë	e
 Ē	E
 ē	e
 Ĕ	E
@ -60,17 +87,7 @@
 ģ	g
 Ĥ	H
 ĥ	h
 Ħ	H
 ħ	h
 Ĩ	I
 Ì	I
 Í	I
 Î	I
 Ï	I
 ì	i
 í	i
 î	i
 ï	i
 ĩ	i
 Ī	I
 ī	i
@ -79,62 +96,36 @@
 Į	I
 į	i
 İ	I
-ı	i
+Ĳ	IJ
-Ĳ	I
+ĳ	ij
 ĳ	i
 Ĵ	J
 ĵ	j
 Ķ	K
 ķ	k
 ĸ	k
 Ĺ	L
 ĺ	l
 Ļ	L
 ļ	l
 Ľ	L
 ľ	l
 Ŀ	L
 ŀ	l
 Ł	L
 ł	l
 Ñ	N
 ñ	n
 Ń	N
 ń	n
 Ņ	N
 ņ	n
 Ň	N
 ň	n
 ŉ	n
 Ŋ	N
 ŋ	n
 Ò	O
 Ó	O
 Ô	O
 Õ	O
 Ö	O
 ò	o
 ó	o
 ô	o
 õ	o
 ö	o
 Ō	O
 ō	o
 Ŏ	O
 ŏ	o
 Ő	O
 ő	o
 Œ	E
 œ	e
 Ø	O
 ø	o
 Ŕ	R
 ŕ	r
 Ŗ	R
 ŗ	r
 Ř	R
 ř	r
 ß	S
 Ś	S
 ś	s
 Ŝ	S
@ -147,16 +138,6 @@
 ţ	t
 Ť	T
 ť	t
 Ŧ	T
 ŧ	t
 Ù	U
 Ú	U
 Û	U
 Ü	U
 ù	u
 ú	u
 û	u
 ü	u
 Ũ	U
 ũ	u
 Ū	U
@ -171,9 +152,6 @@
 ų	u
 Ŵ	W
 ŵ	w
 Ý	Y
 ý	y
 ÿ	y
 Ŷ	Y
 ŷ	y
 Ÿ	Y
@ -183,5 +161,253 @@
 ż	z
 Ž	Z
 ž	z
-ё	е
+Ơ	O
 ơ	o
 Ư	U
 ư	u
 Ǆ	DZ
 ǅ	Dz
 ǆ	dz
 Ǉ	LJ
 ǈ	Lj
 ǉ	lj
 Ǌ	NJ
 ǋ	Nj
 ǌ	nj
 Ǎ	A
 ǎ	a
 Ǐ	I
 ǐ	i
 Ǒ	O
 ǒ	o
 Ǔ	U
 ǔ	u
 Ǧ	G
 ǧ	g
 Ǩ	K
 ǩ	k
 Ǫ	O
 ǫ	o
 ǰ	j
 Ǳ	DZ
 ǲ	Dz
 ǳ	dz
 Ǵ	G
 ǵ	g
 Ǹ	N
 ǹ	n
 Ȁ	A
 ȁ	a
 Ȃ	A
 ȃ	a
 Ȅ	E
 ȅ	e
 Ȇ	E
 ȇ	e
 Ȉ	I
 ȉ	i
 Ȋ	I
 ȋ	i
 Ȍ	O
 ȍ	o
 Ȏ	O
 ȏ	o
 Ȑ	R
 ȑ	r
 Ȓ	R
 ȓ	r
 Ȕ	U
 ȕ	u
 Ȗ	U
 ȗ	u
 Ș	S
 ș	s
 Ț	T
 ț	t
 Ȟ	H
 ȟ	h
 Ȧ	A
 ȧ	a
 Ȩ	E
 ȩ	e
 Ȯ	O
 ȯ	o
 Ȳ	Y
 ȳ	y
 Ḁ	A
 ḁ	a
 Ḃ	B
 ḃ	b
 Ḅ	B
 ḅ	b
 Ḇ	B
 ḇ	b
 Ḋ	D
 ḋ	d
 Ḍ	D
 ḍ	d
 Ḏ	D
 ḏ	d
 Ḑ	D
 ḑ	d
 Ḓ	D
 ḓ	d
 Ḙ	E
 ḙ	e
 Ḛ	E
 ḛ	e
 Ḟ	F
 ḟ	f
 Ḡ	G
 ḡ	g
 Ḣ	H
 ḣ	h
 Ḥ	H
 ḥ	h
 Ḧ	H
 ḧ	h
 Ḩ	H
 ḩ	h
 Ḫ	H
 ḫ	h
 Ḭ	I
 ḭ	i
 Ḱ	K
 ḱ	k
 Ḳ	K
 ḳ	k
 Ḵ	K
 ḵ	k
 Ḷ	L
 ḷ	l
 Ḻ	L
 ḻ	l
 Ḽ	L
 ḽ	l
 Ḿ	M
 ḿ	m
 Ṁ	M
 ṁ	m
 Ṃ	M
 ṃ	m
 Ṅ	N
 ṅ	n
 Ṇ	N
 ṇ	n
 Ṉ	N
 ṉ	n
 Ṋ	N
 ṋ	n
 Ṕ	P
 ṕ	p
 Ṗ	P
 ṗ	p
 Ṙ	R
 ṙ	r
 Ṛ	R
 ṛ	r
 Ṟ	R
 ṟ	r
 Ṡ	S
 ṡ	s
 Ṣ	S
 ṣ	s
 Ṫ	T
 ṫ	t
 Ṭ	T
 ṭ	t
 Ṯ	T
 ṯ	t
 Ṱ	T
 ṱ	t
 Ṳ	U
 ṳ	u
 Ṵ	U
 ṵ	u
 Ṷ	U
 ṷ	u
 Ṽ	V
 ṽ	v
 Ṿ	V
 ṿ	v
 Ẁ	W
 ẁ	w
 Ẃ	W
 ẃ	w
 Ẅ	W
 ẅ	w
 Ẇ	W
 ẇ	w
 Ẉ	W
 ẉ	w
 Ẋ	X
 ẋ	x
 Ẍ	X
 ẍ	x
 Ẏ	Y
 ẏ	y
 Ẑ	Z
 ẑ	z
 Ẓ	Z
 ẓ	z
 Ẕ	Z
 ẕ	z
 ẖ	h
 ẗ	t
 ẘ	w
 ẙ	y
 Ạ	A
 ạ	a
 Ả	A
 ả	a
 Ẹ	E
 ẹ	e
 Ẻ	E
 ẻ	e
 Ẽ	E
 ẽ	e
 Ỉ	I
 ỉ	i
 Ị	I
 ị	i
 Ọ	O
 ọ	o
 Ỏ	O
 ỏ	o
 Ụ	U
 ụ	u
 Ủ	U
 ủ	u
 Ỳ	Y
 ỳ	y
 Ỵ	Y
 ỵ	y
 Ỷ	Y
 ỷ	y
 Ỹ	Y
 ỹ	y
 ﬀ	ff
 ﬁ	fi
 ﬂ	fl
 ﬃ	ffi
 ﬄ	ffl
 ﬆ	st
 Ø	O
 ø	o
 Đ	D
 đ	d
 ı	i
 Ħ	H
 ħ	h
 Ł	L
 ł	l
 ŉ	'n
 Ŧ	T
 ŧ	t
 Ё	Е
 ё	е
 Æ	AE
 ß	ss
 æ	ae
 Œ	OE
 œ	oe
 Ã	A
 Ä	A
 Å	A
-Æ	A
+Ç	C
+È	E
+É	E
+Ê	E
+Ë	E
+Ì	I
+Í	I
+Î	I
+Ï	I
+Ñ	N
+Ò	O
+Ó	O
+Ô	O
+Õ	O
+Ö	O
+Ù	U
+Ú	U
+Û	U
+Ü	U
+Ý	Y
 à	a
 á	a
 â	a
 ã	a
 ä	a
 å	a
-æ	a
+ç	c
+è	e
+é	e
+ê	e
+ë	e
+ì	i
+í	i
+î	i
+ï	i
+ñ	n
+ò	o
+ó	o
+ô	o
+õ	o
+ö	o
+ù	u
+ú	u
+û	u
+ü	u
+ý	y
+ÿ	y
 Ā	A
 ā	a
 Ă	A
 ă	a
 Ą	A
 ą	a
-Ç	C
-ç	c
 Ć	C
 ć	c
 Ĉ	C
 č	c
 Ď	D
 ď	d
-Đ	D
-đ	d
-È	E
-É	E
-Ê	E
-Ë	E
-è	e
-é	e
-ê	e
-ë	e
 Ē	E
 ē	e
 Ĕ	E
 ģ	g
 Ĥ	H
 ĥ	h
-Ħ	H
-ħ	h
 Ĩ	I
-Ì	I
-Í	I
-Î	I
-Ï	I
-ì	i
-í	i
-î	i
-ï	i
 ĩ	i
 Ī	I
 ī	i
 Į	I
 į	i
 İ	I
-ı	i
+Ĳ	IJ
-Ĳ	I
+ĳ	ij
-ĳ	i
 Ĵ	J
 ĵ	j
 Ķ	K
 ķ	k
-ĸ	k
 Ĺ	L
 ĺ	l
 Ļ	L
 ļ	l
 Ľ	L
 ľ	l
-Ŀ	L
-ŀ	l
-Ł	L
-ł	l
-Ñ	N
-ñ	n
 Ń	N
 ń	n
 Ņ	N
 ņ	n
 Ň	N
 ň	n
-ŉ	n
-Ŋ	N
-ŋ	n
-Ò	O
-Ó	O
-Ô	O
-Õ	O
-Ö	O
-ò	o
-ó	o
-ô	o
-õ	o
-ö	o
 Ō	O
 ō	o
 Ŏ	O
 ŏ	o
 Ő	O
 ő	o
-Œ	E
-œ	e
-Ø	O
-ø	o
 Ŕ	R
 ŕ	r
 Ŗ	R
 ŗ	r
 Ř	R
 ř	r
-ß	S
 Ś	S
 ś	s
 Ŝ	S
 ţ	t
 Ť	T
 ť	t
-Ŧ	T
-ŧ	t
-Ù	U
-Ú	U
-Û	U
-Ü	U
-ù	u
-ú	u
-û	u
-ü	u
 Ũ	U
 ũ	u
 Ū	U
 ų	u
 Ŵ	W
 ŵ	w
-Ý	Y
-ý	y
-ÿ	y
 Ŷ	Y
 ŷ	y
 Ÿ	Y
 ż	z
 Ž	Z
 ž	z
-ё	е
+Ơ	O
+ơ	o
+Ư	U
+ư	u
+Ǆ	DZ
+ǅ	Dz
+ǆ	dz
+Ǉ	LJ
+ǈ	Lj
+ǉ	lj
+Ǌ	NJ
+ǋ	Nj
+ǌ	nj
+Ǎ	A
+ǎ	a
+Ǐ	I
+ǐ	i
+Ǒ	O
+ǒ	o
+Ǔ	U
+ǔ	u
+Ǧ	G
+ǧ	g
+Ǩ	K
+ǩ	k
+Ǫ	O
+ǫ	o
+ǰ	j
+Ǳ	DZ
+ǲ	Dz
+ǳ	dz
+Ǵ	G
+ǵ	g
+Ǹ	N
+ǹ	n
+Ȁ	A
+ȁ	a
+Ȃ	A
+ȃ	a
+Ȅ	E
+ȅ	e
+Ȇ	E
+ȇ	e
+Ȉ	I
+ȉ	i
+Ȋ	I
+ȋ	i
+Ȍ	O
+ȍ	o
+Ȏ	O
+ȏ	o
+Ȑ	R
+ȑ	r
+Ȓ	R
+ȓ	r
+Ȕ	U
+ȕ	u
+Ȗ	U
+ȗ	u
+Ș	S
+ș	s
+Ț	T
+ț	t
+Ȟ	H
+ȟ	h
+Ȧ	A
+ȧ	a
+Ȩ	E
+ȩ	e
+Ȯ	O
+ȯ	o
+Ȳ	Y
+ȳ	y
+Ḁ	A
+ḁ	a
+Ḃ	B
+ḃ	b
+Ḅ	B
+ḅ	b
+Ḇ	B
+ḇ	b
+Ḋ	D
+ḋ	d
+Ḍ	D
+ḍ	d
+Ḏ	D
+ḏ	d
+Ḑ	D
+ḑ	d
+Ḓ	D
+ḓ	d
+Ḙ	E
+ḙ	e
+Ḛ	E
+ḛ	e
+Ḟ	F
+ḟ	f
+Ḡ	G
+ḡ	g
+Ḣ	H
+ḣ	h
+Ḥ	H
+ḥ	h
+Ḧ	H
+ḧ	h
+Ḩ	H
+ḩ	h
+Ḫ	H
+ḫ	h
+Ḭ	I
+ḭ	i
+Ḱ	K
+ḱ	k
+Ḳ	K
+ḳ	k
+Ḵ	K
+ḵ	k
+Ḷ	L
+ḷ	l
+Ḻ	L
+ḻ	l
+Ḽ	L
+ḽ	l
+Ḿ	M
+ḿ	m
+Ṁ	M
+ṁ	m
+Ṃ	M
+ṃ	m
+Ṅ	N
+ṅ	n
+Ṇ	N
+ṇ	n
+Ṉ	N
+ṉ	n
+Ṋ	N
+ṋ	n
+Ṕ	P
+ṕ	p
+Ṗ	P
+ṗ	p
+Ṙ	R
+ṙ	r
+Ṛ	R
+ṛ	r
+Ṟ	R
+ṟ	r
+Ṡ	S
+ṡ	s
+Ṣ	S
+ṣ	s
+Ṫ	T
+ṫ	t
+Ṭ	T
+ṭ	t
+Ṯ	T
+ṯ	t
+Ṱ	T
+ṱ	t
+Ṳ	U
+ṳ	u
+Ṵ	U
+ṵ	u
+Ṷ	U
+ṷ	u
+Ṽ	V
+ṽ	v
+Ṿ	V
+ṿ	v
+Ẁ	W
+ẁ	w
+Ẃ	W
+ẃ	w
+Ẅ	W
+ẅ	w
+Ẇ	W
+ẇ	w
+Ẉ	W
+ẉ	w
+Ẋ	X
+ẋ	x
+Ẍ	X
+ẍ	x
+Ẏ	Y
+ẏ	y
+Ẑ	Z
+ẑ	z
+Ẓ	Z
+ẓ	z
+Ẕ	Z
+ẕ	z
+ẖ	h
+ẗ	t
+ẘ	w
+ẙ	y
+Ạ	A
+ạ	a
+Ả	A
+ả	a
+Ẹ	E
+ẹ	e
+Ẻ	E
+ẻ	e
+Ẽ	E
+ẽ	e
+Ỉ	I
+ỉ	i
+Ị	I
+ị	i
+Ọ	O
+ọ	o
+Ỏ	O
+ỏ	o
+Ụ	U
+ụ	u
+Ủ	U
+ủ	u
+Ỳ	Y
+ỳ	y
+Ỵ	Y
+ỵ	y
+Ỷ	Y
+ỷ	y
+Ỹ	Y
+ỹ	y
+ﬀ	ff
+ﬁ	fi
+ﬂ	fl
+ﬃ	ffi
+ﬄ	ffl
+ﬆ	st
+Ø	O
+ø	o
+Đ	D
+đ	d
+ı	i
+Ħ	H
+ħ	h
+Ł	L
+ł	l
+ŉ	'n
+Ŧ	T
+ŧ	t
 Ё	Е
+ё	е
+Æ	AE
+ß	ss
+æ	ae
+Œ	OE
+œ	oe