From 5e8d670c313531c0dca245943fb84c94a477ddc4 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sun, 2 Sep 2018 07:12:24 +1200 Subject: [PATCH] Add Greek characters to unaccent.rules. Author: Tasos Maschalidis Reviewed-by: Michael Paquier, Tom Lane Discussion: https://postgr.es/m/153495048900.1368.11566580687623014380%40wrigleys.postgresql.org Discussion: https://postgr.es/m/VI1PR01MB38537EBD529FE5EE3FE9A5FEB5370%40VI1PR01MB3853.eurprd01.prod.exchangelabs.com --- contrib/unaccent/generate_unaccent_rules.py | 19 +- contrib/unaccent/unaccent.rules | 221 ++++++++++++++++++++ 2 files changed, 236 insertions(+), 4 deletions(-) diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index 4b1b011861..859cac40fa 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -29,6 +29,15 @@ import argparse import sys import xml.etree.ElementTree as ET +# The ranges of Unicode characters that we consider to be "plain letters". +# For now we are being conservative by including only Latin and Greek. This +# could be extended in future based on feedback from people with relevant +# language knowledge. +PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case + (ord('A'), ord('Z')), # Latin upper case + (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA + (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA + def print_record(codepoint, letter): print (unichr(codepoint) + "\t" + letter).encode("UTF-8") @@ -39,9 +48,11 @@ class Codepoint: self.combining_ids = combining_ids def is_plain_letter(codepoint): - """Return true if codepoint represents a plain ASCII letter.""" - return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \ - (codepoint.id >= ord('A') and codepoint.id <= ord('Z')) + """Return true if codepoint represents a "plain letter".""" + for begin, end in PLAIN_LETTER_RANGES: + if codepoint.id >= begin and codepoint.id <= end: + return True + return False def is_mark(codepoint): """Returns true for diacritical marks (combining codepoints).""" @@ -184,7 +195,7 @@ def main(args): len(codepoint.combining_ids) > 1: if is_letter_with_marks(codepoint, table): charactersSet.add((codepoint.id, - chr(get_plain_letter(codepoint, table).id))) + unichr(get_plain_letter(codepoint, table).id))) elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): charactersSet.add((codepoint.id, "".join(unichr(combining_codepoint.id) diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 97f9ed47cf..76e4e69beb 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -399,6 +399,26 @@ ʦ ts ʪ ls ʫ lz +Ά Α +Έ Ε +Ή Η +Ί Ι +Ό Ο +Ύ Υ +Ώ Ω +ΐ ι +Ϊ Ι +Ϋ Υ +ά α +έ ε +ή η +ί ι +ΰ υ +ϊ ι +ϋ υ +ό ο +ύ υ +ώ ω Ё Е ё е ᴀ A @@ -709,6 +729,207 @@ ỽ v Ỿ Y ỿ y +ἀ α +ἁ α +ἂ α +ἃ α +ἄ α +ἅ α +ἆ α +ἇ α +Ἀ Α +Ἁ Α +Ἂ Α +Ἃ Α +Ἄ Α +Ἅ Α +Ἆ Α +Ἇ Α +ἐ ε +ἑ ε +ἒ ε +ἓ ε +ἔ ε +ἕ ε +Ἐ Ε +Ἑ Ε +Ἒ Ε +Ἓ Ε +Ἔ Ε +Ἕ Ε +ἠ η +ἡ η +ἢ η +ἣ η +ἤ η +ἥ η +ἦ η +ἧ η +Ἠ Η +Ἡ Η +Ἢ Η +Ἣ Η +Ἤ Η +Ἥ Η +Ἦ Η +Ἧ Η +ἰ ι +ἱ ι +ἲ ι +ἳ ι +ἴ ι +ἵ ι +ἶ ι +ἷ ι +Ἰ Ι +Ἱ Ι +Ἲ Ι +Ἳ Ι +Ἴ Ι +Ἵ Ι +Ἶ Ι +Ἷ Ι +ὀ ο +ὁ ο +ὂ ο +ὃ ο +ὄ ο +ὅ ο +Ὀ Ο +Ὁ Ο +Ὂ Ο +Ὃ Ο +Ὄ Ο +Ὅ Ο +ὐ υ +ὑ υ +ὒ υ +ὓ υ +ὔ υ +ὕ υ +ὖ υ +ὗ υ +Ὑ Υ +Ὓ Υ +Ὕ Υ +Ὗ Υ +ὠ ω +ὡ ω +ὢ ω +ὣ ω +ὤ ω +ὥ ω +ὦ ω +ὧ ω +Ὠ Ω +Ὡ Ω +Ὢ Ω +Ὣ Ω +Ὤ Ω +Ὥ Ω +Ὦ Ω +Ὧ Ω +ὰ α +ὲ ε +ὴ η +ὶ ι +ὸ ο +ὺ υ +ὼ ω +ᾀ α +ᾁ α +ᾂ α +ᾃ α +ᾄ α +ᾅ α +ᾆ α +ᾇ α +ᾈ Α +ᾉ Α +ᾊ Α +ᾋ Α +ᾌ Α +ᾍ Α +ᾎ Α +ᾏ Α +ᾐ η +ᾑ η +ᾒ η +ᾓ η +ᾔ η +ᾕ η +ᾖ η +ᾗ η +ᾘ Η +ᾙ Η +ᾚ Η +ᾛ Η +ᾜ Η +ᾝ Η +ᾞ Η +ᾟ Η +ᾠ ω +ᾡ ω +ᾢ ω +ᾣ ω +ᾤ ω +ᾥ ω +ᾦ ω +ᾧ ω +ᾨ Ω +ᾩ Ω +ᾪ Ω +ᾫ Ω +ᾬ Ω +ᾭ Ω +ᾮ Ω +ᾯ Ω +ᾰ α +ᾱ α +ᾲ α +ᾳ α +ᾴ α +ᾶ α +ᾷ α +Ᾰ Α +Ᾱ Α +Ὰ Α +ᾼ Α +ῂ η +ῃ η +ῄ η +ῆ η +ῇ η +Ὲ Ε +Ὴ Η +ῌ Η +ῐ ι +ῑ ι +ῒ ι +ῖ ι +ῗ ι +Ῐ Ι +Ῑ Ι +Ὶ Ι +ῠ υ +ῡ υ +ῢ υ +ῤ ρ +ῥ ρ +ῦ υ +ῧ υ +Ῠ Υ +Ῡ Υ +Ὺ Υ +Ῥ Ρ +ῲ ω +ῳ ω +ῴ ω +ῶ ω +ῷ ω +Ὸ Ο +Ὼ Ω +ῼ Ω ‐ - ‑ - ‒ -