pycodestyle (PEP 8) cleanup in Python scripts
These are mainly whitespace changes. I didn't fix "E501 line too long", which would require more significant surgery.
This commit is contained in:
parent
e80a7a1f3d
commit
ddf590b811
@ -38,10 +38,10 @@ sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
|
||||
# For now we are being conservative by including only Latin and Greek. This
|
||||
# could be extended in future based on feedback from people with relevant
|
||||
# language knowledge.
|
||||
# Each entry is a (first, last) pair of codepoints; is_plain_letter() treats
# both ends as part of the range.
PLAIN_LETTER_RANGES = (
    (ord('a'), ord('z')),  # Latin lower case
    (ord('A'), ord('Z')),  # Latin upper case
    (0x03b1, 0x03c9),  # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
    (0x0391, 0x03a9),  # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
)
|
||||
|
||||
# Combining marks follow a "base" character, and result in a composite
|
||||
# character. Example: "U&'A\0300'" produces "À". There are three types of
|
||||
@ -51,9 +51,10 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
|
||||
# https://en.wikipedia.org/wiki/Combining_character
|
||||
# https://www.unicode.org/charts/PDF/U0300.pdf
|
||||
# https://www.unicode.org/charts/PDF/U20D0.pdf
|
||||
# (first, last) codepoint pairs of combining marks, grouped by general
# category as annotated on each entry.
COMBINING_MARK_RANGES = (
    (0x0300, 0x0362),  # Mn: Accents, IPA
    (0x20dd, 0x20e0),  # Me: Symbols
    (0x20e2, 0x20e4),  # Me: Screen, keycap, triangle
)
|
||||
|
||||
|
||||
def print_record(codepoint, letter):
|
||||
if letter:
|
||||
@ -63,12 +64,14 @@ def print_record(codepoint, letter):
|
||||
|
||||
print(output)
|
||||
|
||||
|
||||
class Codepoint:
    """Plain record describing one codepoint.

    Attributes:
        id: the codepoint number (compared against integer ranges and
            passed to chr() elsewhere in this file).
        general_category: category code such as "Mn", "Me" or "Mc".
        combining_ids: ids of the codepoints this one is composed of;
            indexed and iterated by the helper functions.
    """

    def __init__(self, id, general_category, combining_ids):
        # Values are stored as given; nothing is validated or copied.
        self.combining_ids = combining_ids
        self.general_category = general_category
        self.id = id
|
||||
|
||||
|
||||
def is_mark_to_remove(codepoint):
|
||||
"""Return true if this is a combining mark to remove."""
|
||||
if not is_mark(codepoint):
|
||||
@ -79,17 +82,20 @@ def is_mark_to_remove(codepoint):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_plain_letter(codepoint):
    """Tell whether *codepoint* is a "plain letter".

    A codepoint qualifies when its id lies inside one of the
    PLAIN_LETTER_RANGES pairs, both ends included.
    """
    return any(first <= codepoint.id <= last
               for first, last in PLAIN_LETTER_RANGES)
|
||||
|
||||
|
||||
def is_mark(codepoint):
    """Tell whether *codepoint* is a diacritical mark (combining codepoint).

    The decision is based solely on the general category being one of
    "Mn", "Me" or "Mc".
    """
    mark_categories = ("Mn", "Me", "Mc")
    return codepoint.general_category in mark_categories
|
||||
|
||||
|
||||
def is_letter_with_marks(codepoint, table):
|
||||
"""Returns true for letters combined with one or more marks."""
|
||||
# See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
|
||||
@ -105,16 +111,18 @@ def is_letter_with_marks(codepoint, table):
|
||||
|
||||
# Check if the base letter of this letter has marks.
|
||||
codepoint_base = codepoint.combining_ids[0]
|
||||
if (is_plain_letter(table[codepoint_base]) is False and \
|
||||
is_letter_with_marks(table[codepoint_base], table) is False):
|
||||
if is_plain_letter(table[codepoint_base]) is False and \
|
||||
is_letter_with_marks(table[codepoint_base], table) is False:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def is_letter(codepoint, table):
    """Tell whether *codepoint* is a letter, with or without diacritics.

    *table* is the lookup structure handed on to is_letter_with_marks().
    """
    # Cheap range test first; only fall back to the table-based check
    # when the codepoint is not a plain letter.
    if is_plain_letter(codepoint):
        return True
    return is_letter_with_marks(codepoint, table)
|
||||
|
||||
|
||||
def get_plain_letter(codepoint, table):
|
||||
"""Return the base codepoint without marks. If this codepoint has more
|
||||
than one combining character, do a recursive lookup on the table to
|
||||
@ -133,15 +141,18 @@ def get_plain_letter(codepoint, table):
|
||||
# Should not come here
|
||||
assert(False)
|
||||
|
||||
|
||||
def is_ligature(codepoint, table):
    """Tell whether *codepoint* is made up of letters only.

    Every id in codepoint.combining_ids is looked up in *table* and must
    satisfy is_letter(); an empty combining_ids yields True.
    """
    for component_id in codepoint.combining_ids:
        if not is_letter(table[component_id], table):
            return False
    return True
|
||||
|
||||
|
||||
def get_plain_letters(codepoint, table):
    """Return the plain letters that make up a ligature, as a list.

    The caller must pass a codepoint for which is_ligature() holds; this
    is checked with an assertion. Each component id is resolved through
    *table* and reduced with get_plain_letter().
    """
    assert is_ligature(codepoint, table)
    plain_letters = []
    for component_id in codepoint.combining_ids:
        plain_letters.append(get_plain_letter(table[component_id], table))
    return plain_letters
|
||||
|
||||
|
||||
def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
|
||||
"""Parse the XML file and return a set of tuples (src, trg), where "src"
|
||||
is the original character and "trg" the substitute."""
|
||||
@ -189,21 +200,23 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
|
||||
|
||||
return charactersSet
|
||||
|
||||
|
||||
def special_cases():
    """Return the hand-picked mappings not produced by the other methods.

    The result is a set of (codepoint number, replacement string) tuples.
    """
    cyrillic = {
        (0x0401, "\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, "\u0435"),  # CYRILLIC SMALL LETTER IO
    }

    # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
    letterlike_symbols = {
        (0x2103, "\xb0C"),  # DEGREE CELSIUS
        (0x2109, "\xb0F"),  # DEGREE FAHRENHEIT
        (0x2117, "(P)"),  # SOUND RECORDING COPYRIGHT
    }

    return cyrillic | letterlike_symbols
|
||||
|
||||
|
||||
def main(args):
|
||||
# https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
|
||||
decomposition_type_pattern = re.compile(" *<[^>]*> *")
|
||||
@ -238,12 +251,12 @@ def main(args):
|
||||
len(codepoint.combining_ids) > 1:
|
||||
if is_letter_with_marks(codepoint, table):
|
||||
charactersSet.add((codepoint.id,
|
||||
chr(get_plain_letter(codepoint, table).id)))
|
||||
chr(get_plain_letter(codepoint, table).id)))
|
||||
elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
|
||||
charactersSet.add((codepoint.id,
|
||||
"".join(chr(combining_codepoint.id)
|
||||
for combining_codepoint \
|
||||
in get_plain_letters(codepoint, table))))
|
||||
"".join(chr(combining_codepoint.id)
|
||||
for combining_codepoint
|
||||
in get_plain_letters(codepoint, table))))
|
||||
elif is_mark_to_remove(codepoint):
|
||||
charactersSet.add((codepoint.id, None))
|
||||
|
||||
@ -258,6 +271,7 @@ def main(args):
|
||||
for characterPair in charactersList:
|
||||
print_record(characterPair[0], characterPair[1])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
|
||||
parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath')
|
||||
|
@ -1,18 +1,20 @@
|
||||
#! /usr/bin/env python

"""Print the lines of a file sorted by the current locale's collation.

Usage: sort.py filename
"""

import locale
import sys


def sort_lines(lines):
    """Return *lines* without their line terminators, sorted by locale.

    Only a trailing "\\n" is removed from each line, so a final line that
    has no newline keeps its last character. Ordering uses
    locale.strxfrm, i.e. the LC_COLLATE setting in effect at call time.
    """
    chopped = [line[:-1] if line.endswith('\n') else line for line in lines]
    chopped.sort(key=locale.strxfrm)
    return chopped


def main(argv):
    """Run the script with *argv* (program name first); return exit status."""
    # Adopt the user's locale so that strxfrm collates accordingly.
    locale.setlocale(locale.LC_ALL, "")

    if len(argv) != 2:
        sys.stderr.write("Usage: sort.py filename\n")
        return 1

    # The context manager closes the file even if reading fails.
    with open(argv[1], 'r') as infile:
        lines = infile.readlines()

    print('\n'.join(sort_lines(lines)))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
||||
|
Loading…
x
Reference in New Issue
Block a user