e89feee5c3
and FTS3/4. FossilOrigin-Name: 06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec
206 lines
5.6 KiB
Tcl
206 lines
5.6 KiB
Tcl
|
|
#--------------------------------------------------------------------------
|
|
# Parameter $zName must be a path to the file UnicodeData.txt. This command
|
|
# reads the file and returns a list of mappings required to remove all
|
|
# diacritical marks from a unicode string. Each mapping is itself a list
|
|
# consisting of two elements - the unicode codepoint and the single ASCII
|
|
# character that it should be replaced with, or an empty string if the
|
|
# codepoint should simply be removed from the input. Examples:
|
|
#
|
|
# { 224 a 0 } (replace codepoint 224 to "a")
|
|
# { 769 "" 0 } (remove codepoint 769 from input)
|
|
#
|
|
# Mappings are only returned for non-upper case codepoints. It is assumed
|
|
# that the input has already been folded to lower case.
|
|
#
|
|
# The third value in the list is always either 0 or 1. 0 if the
|
|
# UnicodeData.txt file maps the codepoint to a single ASCII character and
|
|
# a diacritic, or 1 if the mapping is indirect. For example, consider the
|
|
# two entries:
|
|
#
|
|
# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
|
|
# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
|
|
#
|
|
# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
|
|
# diacritic). The second is an indirect mapping, as it maps to the
|
|
# first codepoint plus 0302 (a diacritic).
|
|
#
|
|
proc rd_load_unicodedata_text {zName} {
|
|
global tl_lookup_table
|
|
|
|
set fd [open $zName]
|
|
set lField {
|
|
code
|
|
character_name
|
|
general_category
|
|
canonical_combining_classes
|
|
bidirectional_category
|
|
character_decomposition_mapping
|
|
decimal_digit_value
|
|
digit_value
|
|
numeric_value
|
|
mirrored
|
|
unicode_1_name
|
|
iso10646_comment_field
|
|
uppercase_mapping
|
|
lowercase_mapping
|
|
titlecase_mapping
|
|
}
|
|
set lRet [list]
|
|
|
|
while { ![eof $fd] } {
|
|
set line [gets $fd]
|
|
if {$line == ""} continue
|
|
|
|
set fields [split $line ";"]
|
|
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
|
|
foreach $lField $fields {}
|
|
if { [llength $character_decomposition_mapping]!=2
|
|
|| [string is xdigit [lindex $character_decomposition_mapping 0]]==0
|
|
} {
|
|
continue
|
|
}
|
|
|
|
set iCode [expr "0x$code"]
|
|
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
|
|
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
|
|
|
|
# Filter out upper-case characters, as they will be mapped to their
|
|
# lower-case equivalents before this data is used.
|
|
if {[info exists tl_lookup_table($iCode)]} continue
|
|
|
|
# Check if this is an indirect mapping. If so, set bIndirect to true
|
|
# and change $iAscii to the indirectly mappped ASCII character.
|
|
set bIndirect 0
|
|
if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
|
|
set iAscii $mapping($iAscii)
|
|
set bIndirect 1
|
|
}
|
|
|
|
if { ($iAscii >= 97 && $iAscii <= 122)
|
|
|| ($iAscii >= 65 && $iAscii <= 90)
|
|
} {
|
|
lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
|
|
set mapping($iCode) $iAscii
|
|
set dia($iDia) 1
|
|
}
|
|
}
|
|
|
|
foreach d [array names dia] {
|
|
lappend lRet [list $d "" 0]
|
|
}
|
|
set lRet [lsort -integer -index 0 $lRet]
|
|
|
|
close $fd
|
|
set lRet
|
|
}
|
|
|
|
#-------------------------------------------------------------------------
|
|
# Parameter $zName must be a path to the file UnicodeData.txt. This command
|
|
# reads the file and returns a list of codepoints (integers). The list
|
|
# contains all codepoints in the UnicodeData.txt assigned to any "General
|
|
# Category" that is not a "Letter" or "Number".
|
|
#
|
|
proc an_load_unicodedata_text {zName} {
|
|
set fd [open $zName]
|
|
set lField {
|
|
code
|
|
character_name
|
|
general_category
|
|
canonical_combining_classes
|
|
bidirectional_category
|
|
character_decomposition_mapping
|
|
decimal_digit_value
|
|
digit_value
|
|
numeric_value
|
|
mirrored
|
|
unicode_1_name
|
|
iso10646_comment_field
|
|
uppercase_mapping
|
|
lowercase_mapping
|
|
titlecase_mapping
|
|
}
|
|
set lRet [list]
|
|
|
|
while { ![eof $fd] } {
|
|
set line [gets $fd]
|
|
if {$line == ""} continue
|
|
|
|
set fields [split $line ";"]
|
|
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
|
|
foreach $lField $fields {}
|
|
|
|
set iCode [expr "0x$code"]
|
|
set bAlnum [expr {
|
|
[lsearch {L N} [string range $general_category 0 0]] >= 0
|
|
|| $general_category=="Co"
|
|
}]
|
|
|
|
if { !$bAlnum } { lappend lRet $iCode }
|
|
}
|
|
|
|
close $fd
|
|
set lRet
|
|
}
|
|
|
|
proc tl_load_casefolding_txt {zName} {
|
|
global tl_lookup_table
|
|
|
|
set fd [open $zName]
|
|
while { ![eof $fd] } {
|
|
set line [gets $fd]
|
|
if {[string range $line 0 0] == "#"} continue
|
|
if {$line == ""} continue
|
|
|
|
foreach x {a b c d} {unset -nocomplain $x}
|
|
foreach {a b c d} [split $line ";"] {}
|
|
|
|
set a2 [list]
|
|
set c2 [list]
|
|
foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
|
|
foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
|
|
set b [string trim $b]
|
|
set d [string trim $d]
|
|
|
|
if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
|
|
}
|
|
}
|
|
|
|
proc cc_load_unicodedata_text {zName} {
|
|
set fd [open $zName]
|
|
set lField {
|
|
code
|
|
character_name
|
|
general_category
|
|
canonical_combining_classes
|
|
bidirectional_category
|
|
character_decomposition_mapping
|
|
decimal_digit_value
|
|
digit_value
|
|
numeric_value
|
|
mirrored
|
|
unicode_1_name
|
|
iso10646_comment_field
|
|
uppercase_mapping
|
|
lowercase_mapping
|
|
titlecase_mapping
|
|
}
|
|
set lRet [list]
|
|
|
|
while { ![eof $fd] } {
|
|
set line [gets $fd]
|
|
if {$line == ""} continue
|
|
|
|
set fields [split $line ";"]
|
|
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
|
|
foreach $lField $fields {}
|
|
|
|
lappend lRet [list $code $general_category]
|
|
}
|
|
|
|
close $fd
|
|
set lRet
|
|
}
|
|
|
|
|