Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly

Add Python script for building unaccent.rules from Unicode data. Don't
backpatch because unaccent changes may require tsvector/index
rebuild.

Thomas Munro <thomas.munro@enterprisedb.com>
This commit is contained in:
Teodor Sigaev 2015-09-04 12:51:53 +03:00
parent 4aec49899e
commit 1bbd52cb9a
2 changed files with 415 additions and 66 deletions

View File

@ -0,0 +1,123 @@
#!/usr/bin/python
#
# This script builds unaccent.rules on standard output when given the
# contents of UnicodeData.txt[1] on standard input. Optionally includes
# ligature expansion, if --expand-ligatures is given on the command line.
#
# The approach is to use the Unicode decomposition data to identify
# precomposed codepoints that are equivalent to a ligature of several
# letters, or a base letter with any number of diacritical marks.
# There is also a small set of special cases for codepoints that we
# traditionally support even though Unicode doesn't consider them to
# be ligatures or letters with marks.
#
# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
import re
import sys
def print_record(codepoint, letter):
    """Print one rule line: the character for codepoint, a tab, then letter.

    codepoint is an integer codepoint number and letter is the replacement
    text.  The line is encoded as UTF-8 before it is printed.
    """
    record = unichr(codepoint) + "\t" + letter
    # Python 2 print statement; the parentheses only group the expression.
    print(record.encode("UTF-8"))
class Codepoint:
    """One codepoint record, reduced to the fields this script looks at."""

    def __init__(self, id, general_category, combining_ids):
        # id:               the codepoint number
        # general_category: the general category value (e.g. "Mn")
        # combining_ids:    codepoint numbers listed in the decomposition
        (self.id,
         self.general_category,
         self.combining_ids) = (id, general_category, combining_ids)
def is_plain_letter(codepoint):
    """Return true if codepoint represents a plain ASCII letter."""
    number = codepoint.id
    if ord('a') <= number <= ord('z'):
        return True
    return ord('A') <= number <= ord('Z')
def is_mark(codepoint):
    """Returns true for diacritical marks (combining codepoints)."""
    # The general category values that are treated as marks here.
    mark_categories = ("Mn", "Me", "Mc")
    category = codepoint.general_category
    return category in mark_categories
def is_letter_with_marks(codepoint, table):
    """Returns true for plain letters combined with one or more marks."""
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
    parts = codepoint.combining_ids
    # A base letter plus at least one mark needs two or more components.
    if len(parts) <= 1:
        return False
    # The first component must be a plain ASCII letter ...
    if not is_plain_letter(table[parts[0]]):
        return False
    # ... and every remaining component must be a mark.
    for mark_id in parts[1:]:
        if not is_mark(table[mark_id]):
            return False
    return True
def is_letter(codepoint, table):
    """Return true for letter with or without diacritical marks."""
    if is_plain_letter(codepoint):
        return True
    return is_letter_with_marks(codepoint, table)
def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.

    For a letter with marks this is the table entry of the first element
    of its decomposition; a plain letter is returned unchanged.

    Raises ValueError if codepoint is neither a plain letter nor a plain
    letter combined with marks.
    """
    if is_letter_with_marks(codepoint, table):
        return table[codepoint.combining_ids[0]]
    elif is_plain_letter(codepoint):
        return codepoint
    else:
        # String exceptions (raise "mu") are not valid: raising one only
        # produces an unrelated TypeError.  Raise a real exception that
        # names the offending codepoint instead.
        raise ValueError("codepoint U+%04X is not a letter" % codepoint.id)
def is_ligature(codepoint, table):
    """Return true for letters combined with letters."""
    # Every component of the decomposition has to be a letter (with or
    # without marks); an empty decomposition therefore also yields True.
    for part_id in codepoint.combining_ids:
        if not is_letter(table[part_id], table):
            return False
    return True
def get_plain_letters(codepoint, table):
    """Return a list of plain letters from a ligature."""
    assert is_ligature(codepoint, table)
    letters = []
    # Strip the marks from each component, keeping decomposition order.
    for part_id in codepoint.combining_ids:
        letters.append(get_plain_letter(table[part_id], table))
    return letters
def main(expand_ligatures):
    """Read UnicodeData.txt on standard input and print unaccent rules.

    Rules for letters with marks are always printed.  Ligature expansions,
    both the ones found in the data and the hard-coded ones, are printed
    only when expand_ligatures is true.
    """
    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
    decomposition_type_pattern = re.compile(" *<[^>]*> *")

    table = {}          # codepoint number -> Codepoint
    codepoints = []     # the same Codepoint objects, in input order

    # read everything we need into memory
    for line in sys.stdin.readlines():
        fields = line.split(";")
        if len(fields) <= 5:
            continue
        # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
        category = fields[2]
        # Replace any "<type>" tag with a space so that only the hex
        # codepoint numbers of the decomposition remain.
        decomposition = decomposition_type_pattern.sub(' ', fields[5])
        number = int(fields[0], 16)
        parts = [int(token, 16)
                 for token in decomposition.split(" ")
                 if token != ""]
        entry = Codepoint(number, category, parts)
        table[number] = entry
        codepoints.append(entry)

    # walk through all the codepoints looking for interesting mappings
    for entry in codepoints:
        if not entry.general_category.startswith('L'):
            continue
        if len(entry.combining_ids) <= 1:
            continue
        if is_letter_with_marks(entry, table):
            base = get_plain_letter(entry, table)
            print_record(entry.id, chr(base.id))
        elif expand_ligatures and is_ligature(entry, table):
            expansion = "".join(unichr(letter.id)
                                for letter in get_plain_letters(entry, table))
            print_record(entry.id, expansion)

    # some special cases
    special_cases = [
        (0x00d8, "O"),        # LATIN CAPITAL LETTER O WITH STROKE
        (0x00f8, "o"),        # LATIN SMALL LETTER O WITH STROKE
        (0x0110, "D"),        # LATIN CAPITAL LETTER D WITH STROKE
        (0x0111, "d"),        # LATIN SMALL LETTER D WITH STROKE
        (0x0131, "i"),        # LATIN SMALL LETTER DOTLESS I
        (0x0126, "H"),        # LATIN CAPITAL LETTER H WITH STROKE
        (0x0127, "h"),        # LATIN SMALL LETTER H WITH STROKE
        (0x0141, "L"),        # LATIN CAPITAL LETTER L WITH STROKE
        (0x0142, "l"),        # LATIN SMALL LETTER L WITH STROKE
        (0x0149, "'n"),       # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
        (0x0166, "T"),        # LATIN CAPITAL LETTER T WITH STROKE
        (0x0167, "t"),        # LATIN SMALL LETTER T WITH STROKE
        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO
    ]
    for number, replacement in special_cases:
        print_record(number, replacement)

    if expand_ligatures:
        special_ligatures = [
            (0x00c6, "AE"),   # LATIN CAPITAL LETTER AE
            (0x00df, "ss"),   # LATIN SMALL LETTER SHARP S
            (0x00e6, "ae"),   # LATIN SMALL LETTER AE
            (0x0152, "OE"),   # LATIN CAPITAL LIGATURE OE
            (0x0153, "oe"),   # LATIN SMALL LIGATURE OE
        ]
        for number, replacement in special_ligatures:
            print_record(number, replacement)
if __name__ == "__main__":
    # Ligature expansion is enabled only when the single command-line
    # argument is exactly --expand-ligatures.
    ligatures_requested = sys.argv[1:] == ["--expand-ligatures"]
    main(ligatures_requested)

View File

@ -4,22 +4,59 @@
Ã A
Ä A
Å A
Æ A
Ç C
È E
É E
Ê E
Ë E
Ì I
Í I
Î I
Ï I
Ñ N
Ò O
Ó O
Ô O
Õ O
Ö O
Ù U
Ú U
Û U
Ü U
Ý Y
à a
á a
â a
ã a
ä a
å a
æ a
ç c
è e
é e
ê e
ë e
ì i
í i
î i
ï i
ñ n
ò o
ó o
ô o
õ o
ö o
ù u
ú u
û u
ü u
ý y
ÿ y
Ā A
ā a
Ă A
ă a
Ą A
ą a
Ç C
ç c
Ć C
ć c
Ĉ C
@ -30,16 +67,6 @@
č c
Ď D
ď d
Đ D
đ d
È E
É E
Ê E
Ë E
è e
é e
ê e
ë e
Ē E
ē e
Ĕ E
@ -60,17 +87,7 @@
ģ g
Ĥ H
ĥ h
Ħ H
ħ h
Ĩ I
Ì I
Í I
Î I
Ï I
ì i
í i
î i
ï i
ĩ i
Ī I
ī i
@ -79,62 +96,36 @@
Į I
į i
İ I
ı i
IJ I
ij i
IJ IJ
ij ij
Ĵ J
ĵ j
Ķ K
ķ k
ĸ k
Ĺ L
ĺ l
Ļ L
ļ l
Ľ L
ľ l
Ŀ L
ŀ l
Ł L
ł l
Ñ N
ñ n
Ń N
ń n
Ņ N
ņ n
Ň N
ň n
ʼn n
Ŋ N
ŋ n
Ò O
Ó O
Ô O
Õ O
Ö O
ò o
ó o
ô o
õ o
ö o
Ō O
ō o
Ŏ O
ŏ o
Ő O
ő o
Œ E
œ e
Ø O
ø o
Ŕ R
ŕ r
Ŗ R
ŗ r
Ř R
ř r
ß S
Ś S
ś s
Ŝ S
@ -147,16 +138,6 @@
ţ t
Ť T
ť t
Ŧ T
ŧ t
Ù U
Ú U
Û U
Ü U
ù u
ú u
û u
ü u
Ũ U
ũ u
Ū U
@ -171,9 +152,6 @@
ų u
Ŵ W
ŵ w
Ý Y
ý y
ÿ y
Ŷ Y
ŷ y
Ÿ Y
@ -183,5 +161,253 @@
ż z
Ž Z
ž z
ё е
Ơ O
ơ o
Ư U
ư u
DŽ DZ
Dž Dz
dž dz
LJ LJ
Lj Lj
lj lj
NJ NJ
Nj Nj
nj nj
Ǎ A
ǎ a
Ǐ I
ǐ i
Ǒ O
ǒ o
Ǔ U
ǔ u
Ǧ G
ǧ g
Ǩ K
ǩ k
Ǫ O
ǫ o
ǰ j
DZ DZ
Dz Dz
dz dz
Ǵ G
ǵ g
Ǹ N
ǹ n
Ȁ A
ȁ a
Ȃ A
ȃ a
Ȅ E
ȅ e
Ȇ E
ȇ e
Ȉ I
ȉ i
Ȋ I
ȋ i
Ȍ O
ȍ o
Ȏ O
ȏ o
Ȑ R
ȑ r
Ȓ R
ȓ r
Ȕ U
ȕ u
Ȗ U
ȗ u
Ș S
ș s
Ț T
ț t
Ȟ H
ȟ h
Ȧ A
ȧ a
Ȩ E
ȩ e
Ȯ O
ȯ o
Ȳ Y
ȳ y
Ḁ A
ḁ a
Ḃ B
ḃ b
Ḅ B
ḅ b
Ḇ B
ḇ b
Ḋ D
ḋ d
Ḍ D
ḍ d
Ḏ D
ḏ d
Ḑ D
ḑ d
Ḓ D
ḓ d
Ḙ E
ḙ e
Ḛ E
ḛ e
Ḟ F
ḟ f
Ḡ G
ḡ g
Ḣ H
ḣ h
Ḥ H
ḥ h
Ḧ H
ḧ h
Ḩ H
ḩ h
Ḫ H
ḫ h
Ḭ I
ḭ i
Ḱ K
ḱ k
Ḳ K
ḳ k
Ḵ K
ḵ k
Ḷ L
ḷ l
Ḻ L
ḻ l
Ḽ L
ḽ l
Ḿ M
ḿ m
Ṁ M
ṁ m
Ṃ M
ṃ m
Ṅ N
ṅ n
Ṇ N
ṇ n
Ṉ N
ṉ n
Ṋ N
ṋ n
Ṕ P
ṕ p
Ṗ P
ṗ p
Ṙ R
ṙ r
Ṛ R
ṛ r
Ṟ R
ṟ r
Ṡ S
ṡ s
Ṣ S
ṣ s
Ṫ T
ṫ t
Ṭ T
ṭ t
Ṯ T
ṯ t
Ṱ T
ṱ t
Ṳ U
ṳ u
Ṵ U
ṵ u
Ṷ U
ṷ u
Ṽ V
ṽ v
Ṿ V
ṿ v
Ẁ W
ẁ w
Ẃ W
ẃ w
Ẅ W
ẅ w
Ẇ W
ẇ w
Ẉ W
ẉ w
Ẋ X
ẋ x
Ẍ X
ẍ x
Ẏ Y
ẏ y
Ẑ Z
ẑ z
Ẓ Z
ẓ z
Ẕ Z
ẕ z
ẖ h
ẗ t
ẘ w
ẙ y
Ạ A
ạ a
Ả A
ả a
Ẹ E
ẹ e
Ẻ E
ẻ e
Ẽ E
ẽ e
Ỉ I
ỉ i
Ị I
ị i
Ọ O
ọ o
Ỏ O
ỏ o
Ụ U
ụ u
Ủ U
ủ u
Ỳ Y
ỳ y
Ỵ Y
ỵ y
Ỷ Y
ỷ y
Ỹ Y
ỹ y
ff ff
fi fi
fl fl
ffi ffi
ffl ffl
st st
Ø O
ø o
Đ D
đ d
ı i
Ħ H
ħ h
Ł L
ł l
ʼn 'n
Ŧ T
ŧ t
Ё Е
ё е
Æ AE
ß ss
æ ae
Œ OE
œ oe