From 0afc0a7841889c6221fd47430e72f4fe570833f4 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Tue, 10 Sep 2019 17:56:11 -0300 Subject: [PATCH] Fix unaccent generation script in Windows As originally coded, the script would fail on Windows 10 and Python 3 because stdout would not be switched to UTF-8 only for Python 2. This patch makes that apply to both versions. Also add python 2 compatibility markers so that we know what to remove once we drop support for that. Also use a "with" clause to ensure file descriptor is closed promptly. Author: Hugh Ranalli, Ramanarayana Reviewed-by: Kyotaro Horiguchi Discussion: https://postgr.es/m/CAKm4Xs7_61XMyOWmHs3n0mmkS0O4S0pvfWk=7cQ5P0gs177f7A@mail.gmail.com Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f@postgresql.org --- contrib/unaccent/generate_unaccent_rules.py | 48 +++++++++++---------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index 58b6e7deb7..7a0a96e04f 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -32,9 +32,15 @@ # The approach is to be Python3 compatible with Python2 "backports". from __future__ import print_function from __future__ import unicode_literals -import codecs -import sys +# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped +import argparse +import codecs +import re +import sys +import xml.etree.ElementTree as ET + +# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped if sys.version_info[0] <= 2: # Encode stdout as UTF-8, so we can just print to it sys.stdout = codecs.getwriter('utf8')(sys.stdout) @@ -45,12 +51,9 @@ if sys.version_info[0] <= 2: # Python 2 and 3 compatible bytes call def bytes(source, encoding='ascii', errors='strict'): return source.encode(encoding=encoding, errors=errors) +else: # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped - -import re -import argparse -import sys -import xml.etree.ElementTree as ET + sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer) # The ranges of Unicode characters that we consider to be "plain letters". # For now we are being conservative by including only Latin and Greek. This @@ -233,21 +236,22 @@ def main(args): charactersSet = set() # read file UnicodeData.txt - unicodeDataFile = open(args.unicodeDataFilePath, 'r') - - # read everything we need into memory - for line in unicodeDataFile: - fields = line.split(";") - if len(fields) > 5: - # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt - general_category = fields[2] - decomposition = fields[5] - decomposition = re.sub(decomposition_type_pattern, ' ', decomposition) - id = int(fields[0], 16) - combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""] - codepoint = Codepoint(id, general_category, combining_ids) - table[id] = codepoint - all.append(codepoint) + with codecs.open( + args.unicodeDataFilePath, mode='r', encoding='UTF-8', + ) as unicodeDataFile: + # read everything we need into memory + for line in unicodeDataFile: + fields = line.split(";") + if len(fields) > 5: + # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt + general_category = fields[2] + decomposition = fields[5] + decomposition = re.sub(decomposition_type_pattern, ' ', decomposition) + id = int(fields[0], 16) + combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""] + codepoint = Codepoint(id, general_category, combining_ids) + table[id] = codepoint + all.append(codepoint) # walk through all the codepoints looking for interesting mappings for codepoint in all: