mirror of https://github.com/postgres/postgres

Add combining characters to unaccent.rules.

Strip certain classes of combining characters, so that accents encoded this way are removed.

Author: Hugh Ranalli
Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org

This commit is contained in:
parent 80579f9bb1
commit 456e3718e7
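The effect being added can be illustrated outside the extension: a letter followed by a combining mark (for example "A" plus U+0300) should unaccent to the bare letter. The sketch below only illustrates that idea with Python's standard unicodedata module; it is not the extension's code, and the helper name strip_combining is invented here:

import unicodedata

def strip_combining(text):
    # Decompose, then drop anything whose Unicode general category is a mark (Mn/Mc/Me).
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed
                   if not unicodedata.category(ch).startswith("M"))

print(strip_combining("A\u0300"))  # "A" + COMBINING GRAVE ACCENT -> prints "A"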
contrib/unaccent/expected/unaccent.out

@@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜');
>+-~
(1 row)

SELECT unaccent('À'); -- Remove combining diacritical 0x0300
unaccent
----------
A
(1 row)

SELECT unaccent('unaccent', 'foobar');
unaccent
----------
@@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜');
>+-~
(1 row)

SELECT unaccent('unaccent', 'À');
unaccent
----------
A
(1 row)

SELECT ts_lexize('unaccent', 'foobar');
ts_lexize
-----------
@@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
{>+-~}
(1 row)

SELECT ts_lexize('unaccent', 'À');
ts_lexize
-----------
{A}
(1 row)
contrib/unaccent/generate_unaccent_rules.py

@@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
                        (0x03b1, 0x03c9),   # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
                        (0x0391, 0x03a9))   # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
 
+# Combining marks follow a "base" character, and result in a composite
+# character. Example: "U&'A\0300'" produces "À". There are three types of
+# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
+# combining (Mc). We identify the ranges of marks we feel safe removing.
+# References:
+# https://en.wikipedia.org/wiki/Combining_character
+# https://www.unicode.org/charts/PDF/U0300.pdf
+# https://www.unicode.org/charts/PDF/U20D0.pdf
+COMBINING_MARK_RANGES = ((0x0300, 0x0362),  # Mn: Accents, IPA
+                         (0x20dd, 0x20E0),  # Me: Symbols
+                         (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle
+
 def print_record(codepoint, letter):
-    print (chr(codepoint) + "\t" + letter)
+    if letter:
+        output = chr(codepoint) + "\t" + letter
+    else:
+        output = chr(codepoint)
+
+    print(output)
 
 class Codepoint:
     def __init__(self, id, general_category, combining_ids):
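The comment in the hunk above says the selected ranges are non-spacing (Mn) and enclosing (Me) marks. That can be spot-checked with a short standalone snippet (not part of the patch) that asks unicodedata for the general category of the range endpoints:

import unicodedata

COMBINING_MARK_RANGES = ((0x0300, 0x0362),  # Mn: Accents, IPA
                         (0x20dd, 0x20e0),  # Me: Symbols
                         (0x20e2, 0x20e4))  # Me: Screen, keycap, triangle

# Print the category and name of each range endpoint; all should be Mn or Me.
for begin, end in COMBINING_MARK_RANGES:
    for cp in (begin, end):
        ch = chr(cp)
        print(hex(cp), unicodedata.category(ch), unicodedata.name(ch, "<unnamed>"))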
@@ -70,6 +87,16 @@ class Codepoint:
         self.general_category = general_category
         self.combining_ids = combining_ids
 
+def is_mark_to_remove(codepoint):
+    """Return true if this is a combining mark to remove."""
+    if not is_mark(codepoint):
+        return False
+
+    for begin, end in COMBINING_MARK_RANGES:
+        if codepoint.id >= begin and codepoint.id <= end:
+            return True
+    return False
+
 def is_plain_letter(codepoint):
     """Return true if codepoint represents a "plain letter"."""
     for begin, end in PLAIN_LETTER_RANGES:
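is_mark_to_remove calls an is_mark helper that lies outside this hunk. The self-contained sketch below lets the new function be exercised in isolation; the is_mark stand-in (a check that the Unicode general category starts with "M") and the simplified Codepoint tuple are assumptions made here for illustration, not code from the script:

import unicodedata
from collections import namedtuple

# Simplified stand-in for the script's Codepoint objects (assumed shape).
Codepoint = namedtuple("Codepoint", ["id", "general_category", "combining_ids"])

COMBINING_MARK_RANGES = ((0x0300, 0x0362), (0x20dd, 0x20e0), (0x20e2, 0x20e4))

def is_mark(codepoint):
    # Assumed helper: any Unicode mark category (Mn, Mc, Me).
    return codepoint.general_category.startswith("M")

def is_mark_to_remove(codepoint):
    """Return true if this is a combining mark to remove."""
    if not is_mark(codepoint):
        return False
    for begin, end in COMBINING_MARK_RANGES:
        if begin <= codepoint.id <= end:
            return True
    return False

grave = Codepoint(0x0300, unicodedata.category("\u0300"), [])  # Mn -> True
letter = Codepoint(ord("A"), unicodedata.category("A"), [])    # Lu -> False
print(is_mark_to_remove(grave), is_mark_to_remove(letter))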
@@ -234,6 +261,8 @@ def main(args):
                                   "".join(chr(combining_codepoint.id)
                                           for combining_codepoint \
                                           in get_plain_letters(codepoint, table))))
+        elif is_mark_to_remove(codepoint):
+            charactersSet.add((codepoint.id, None))
 
     # add CLDR Latin-ASCII characters
     if not args.noLigaturesExpansion:
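The None stored next to a combining mark's codepoint is what later drives the letter-less branch of print_record, so the emitted unaccent.rules line contains just the mark with no replacement text. A small sketch of that hand-off; the sample charactersSet contents are invented for illustration:

charactersSet = {(0x00c0, "A"),    # À is translated to "A"
                 (0x0300, None)}   # COMBINING GRAVE ACCENT is translated to nothing

def print_record(codepoint, letter):
    if letter:
        output = chr(codepoint) + "\t" + letter
    else:
        output = chr(codepoint)
    print(output)

# Emitting the set produces one rules line per entry: "À<TAB>A" and a bare mark.
for codepoint, letter in sorted(charactersSet):
    print_record(codepoint, letter)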
contrib/unaccent/sql/unaccent.sql

@@ -9,13 +9,16 @@ SELECT unaccent('foobar');
SELECT unaccent('ёлка');
SELECT unaccent('ЁЖИК');
SELECT unaccent('˃˖˗˜');
SELECT unaccent('À'); -- Remove combining diacritical 0x0300

SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'ёлка');
SELECT unaccent('unaccent', 'ЁЖИК');
SELECT unaccent('unaccent', '˃˖˗˜');
SELECT unaccent('unaccent', 'À');

SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'ёлка');
SELECT ts_lexize('unaccent', 'ЁЖИК');
SELECT ts_lexize('unaccent', '˃˖˗˜');
SELECT ts_lexize('unaccent', 'À');
contrib/unaccent/unaccent.rules

@@ -414,6 +414,105 @@
˖ +
˗ -
˜ ~
̀
́
̂
̃
̄
̅
̆
̇
̈
̉
̊
̋
̌
̍
̎
̏
̐
̑
̒
̓
̔
̕
̖
̗
̘
̙
̚
̛
̜
̝
̞
̟
̠
̡
̢
̣
̤
̥
̦
̧
̨
̩
̪
̫
̬
̭
̮
̯
̰
̱
̲
̳
̴
̵
̶
̷
̸
̹
̺
̻
̼
̽
̾
̿
̀
́
͂
̓
̈́
ͅ
͆
͇
͈
͉
͊
͋
͌
͍
͎
͏
͐
͑
͒
͓
͔
͕
͖
͗
͘
͙
͚
͛
͜
͝
͞
͟
͠
͡
͢
Ά Α
Έ Ε
Ή Η
@@ -982,6 +1081,13 @@
₧ Pts
₹ Rs
₺ TL
⃝
⃞
⃟
⃠
⃢
⃣
⃤
℀ a/c
℁ a/s
ℂ C