2012-05-25 21:50:19 +04:00
|
|
|
|
2012-06-06 23:30:38 +04:00
|
|
|
#
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the
# codepoint should simply be removed from the input. Examples:
#
#    { 224 a  }     (replace codepoint 224 with "a")
#    { 769 "" }     (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case. A codepoint counts
# as upper case if it has an entry in the global array tl_lookup_table.
#
# The returned list is sorted by codepoint. An error is raised if a
# non-empty line does not consist of exactly 15 ';'-separated fields, or if
# a codepoint that must be converted is not a hexadecimal number.
#
proc rd_load_unicodedata_text {zName} {
  global tl_lookup_table

  # Read the whole file up front and close it straight away, so that the
  # channel cannot be leaked when a parse error is raised further down.
  set fd [open $zName]
  set rc [catch {read $fd} data]
  close $fd
  if {$rc} { error $data }

  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set lRet [list]

  foreach line [split $data "\n"] {
    if {$line eq ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
    foreach $lField $fields {}

    # Only decompositions of the form "<hex> <hex>" are of interest. Anything
    # with a different number of elements, or with a leading tag, is skipped.
    if {[llength $character_decomposition_mapping] != 2} continue
    foreach {zBase zMark} $character_decomposition_mapping {}
    if {![string is xdigit -strict $zBase]} continue

    # Convert with [scan] rather than an unbraced [expr], so that text from
    # the file is never evaluated as part of an expression.
    if {![string is xdigit -strict $code] || ![string is xdigit -strict $zMark]} {
      error "parse error: $line"
    }
    scan $code  %x iCode
    scan $zBase %x iAscii
    scan $zMark %x iDia

    # Upper case codepoints are folded before diacritics are removed.
    if {[info exists tl_lookup_table($iCode)]} continue

    # Keep the mapping only if the base character is an ASCII letter.
    if { ($iAscii >= 97 && $iAscii <= 122)
      || ($iAscii >= 65 && $iAscii <= 90)
    } {
      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
      set dia($iDia) 1
    }
  }

  # Each combining mark seen above is mapped to the empty string.
  foreach d [array names dia] {
    lappend lRet [list $d ""]
  }

  return [lsort -integer -index 0 $lRet]
}
|
|
|
|
|
|
|
|
|
|
|
|
# Print the C function remove_diacritic() to stdout.
#
# Parameter $map is a list of {codepoint replacement} pairs sorted by
# codepoint, in the format returned by [rd_load_unicodedata_text].
#
# Consecutive entries that share a replacement are merged into a range of
# at most 8 codepoints. Each range is written to the C array aDia[] as the
# 16-bit value ((first-codepoint << 3) + (range-size - 1)), and the
# replacement for the range is written to the parallel array aChar[].
#
# An error is raised, before anything is printed, if a value does not fit
# in the 16-bit aDia[] element type.
#
proc print_rd {map} {
  global tl_lookup_table
  set aChar [list]
  set lRange [list]

  set nRange 1
  set iFirst [lindex $map 0 0]
  set cPrev  [lindex $map 0 1]

  foreach m [lrange $map 1 end] {
    foreach {i c} $m {}

    if {$cPrev eq $c} {
      # The gap between the current range and $i can only be absorbed if
      # every codepoint in it has an entry in tl_lookup_table.
      for {set j [expr {$iFirst+$nRange}]} {$j<$i} {incr j} {
        if {![info exists tl_lookup_table($j)]} break
      }

      if {$j==$i} {
        # Only 3 bits are available to store the size of a range.
        set nNew [expr {1 + $i - $iFirst}]
        if {$nNew<=8} {
          set nRange $nNew
          continue
        }
      }
    }

    lappend lRange [list $iFirst $nRange]
    lappend aChar  $cPrev

    set iFirst $i
    set cPrev  $c
    set nRange 1
  }
  lappend lRange [list $iFirst $nRange]
  lappend aChar $cPrev

  # Encode the ranges first so that an out-of-range value is reported
  # instead of being silently truncated by the C compiler.
  set lDia [list]
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    set iVal [expr {($iCode<<3) + $nRange-1}]
    if {$iVal > 0xFFFF} {
      error "codepoint $iCode is too large for the aDia format"
    }
    lappend lDia $iVal
  }

  puts "/*"
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 101 (\"LATIN SMALL LETTER"
  puts "** E\"). The results of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int remove_diacritic(int c)\{"
  puts " unsigned short aDia\[\] = \{"
  puts -nonewline " 0, "
  set i 1
  foreach iVal $lDia {
    if {($i % 8)==0} {puts "" ; puts -nonewline " " }
    incr i

    puts -nonewline [format "%5d" $iVal]
    puts -nonewline ", "
  }
  puts ""
  puts " \};"
  puts " char aChar\[\] = \{"
  puts -nonewline " '\\0', "
  set i 1
  foreach c $aChar {
    set str "'$c', "
    if {$c eq ""} { set str "'\\0', " }

    if {($i % 12)==0} {puts "" ; puts -nonewline " " }
    incr i
    puts -nonewline "$str"
  }
  puts ""
  puts " \};"
  puts {
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
  puts "\}"
}
|
|
|
|
|
|
|
|
# Print a C function named $zFunc to stdout. The function returns true if
# its argument is one of the diacritical modifier codepoints in $map.
#
# Parameter $map is a list of {codepoint replacement} pairs. Only entries
# with a non-zero codepoint and an empty replacement are used. The set is
# encoded as two 32-bit masks relative to the smallest such codepoint, so
# the codepoints must span fewer than 64 values. An error is raised,
# before anything is printed, if they do not or if there are none.
#
proc print_isdiacritic {zFunc map} {

  set lCode [list]
  foreach m $map {
    foreach {code char} $m {}
    if {$code && $char eq ""} { lappend lCode $code }
  }
  if {[llength $lCode]==0} {
    error "no diacritic codepoints in map"
  }
  set lCode [lsort -integer $lCode]
  set iFirst [lindex $lCode 0]
  set iLast [lindex $lCode end]
  if {($iLast - $iFirst) >= 64} {
    error "diacritic codepoints do not fit in two 32-bit masks"
  }

  set i1 0
  set i2 0

  foreach c $lCode {
    set i [expr {$c - $iFirst}]
    if {$i < 32} {
      set i1 [expr {$i1 | (1<<$i)}]
    } else {
      set i2 [expr {$i2 | (1<<($i-32))}]
    }
  }

  puts "/*"
  puts "** Return true if the argument interpreted as a unicode codepoint"
  puts "** is a diacritical modifier character."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  puts " unsigned int mask0 = [format "0x%08X" $i1];"
  puts " unsigned int mask1 = [format "0x%08X" $i2];"

  puts " if( c<$iFirst || c>$iLast ) return 0;"
  # Shift an unsigned 1 so that testing bit 31 does not shift into the sign
  # bit of a signed int in the generated C code.
  puts " return (c < $iFirst+32) ?"
  puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
  puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
  puts "\}"
}
|
|
|
|
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------
|
2012-05-25 21:50:19 +04:00
|
|
|
|
|
|
|
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
# contains all codepoints in the UnicodeData.txt assigned to any "General
# Category" that is not a "Letter" (L*) or "Number" (N*). Codepoints in the
# private use category "Co" are also treated as alphanumeric, and so are
# not included in the returned list.
#
# Codepoints are returned in the order in which they appear in the file.
# An error is raised if a non-empty line does not consist of exactly 15
# ';'-separated fields, or if the codepoint is not a hexadecimal number.
#
proc an_load_unicodedata_text {zName} {
  # Read the whole file up front and close it straight away, so that the
  # channel cannot be leaked when a parse error is raised further down.
  set fd [open $zName]
  set rc [catch {read $fd} data]
  close $fd
  if {$rc} { error $data }

  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set lRet [list]

  foreach line [split $data "\n"] {
    if {$line eq ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
    foreach $lField $fields {}

    # Convert with [scan] rather than an unbraced [expr], so that text from
    # the file is never evaluated as part of an expression.
    if {![string is xdigit -strict $code]} { error "parse error: $line" }
    scan $code %x iCode

    set zMajor [string index $general_category 0]
    set bAlnum [expr {
         $zMajor eq "L"
      || $zMajor eq "N"
      || $general_category eq "Co"
    }]

    if { !$bAlnum } { lappend lRet $iCode }
  }

  return $lRet
}
|
|
|
|
|
|
|
|
# Load the separator codepoints from the file named by the global variable
# "unicodedata.txt" and collapse runs of consecutive codepoints into
# ranges. Returns a list of {first-codepoint range-size} pairs, in the
# order in which the codepoints were returned by [an_load_unicodedata_text].
#
proc an_load_separator_ranges {} {
  global unicodedata.txt

  set lRange [list]
  set bOpen 0
  foreach cp [an_load_unicodedata_text ${unicodedata.txt}] {
    if {$bOpen} {
      # Extend the open range if $cp immediately follows it.
      if {$cp == ($iFirst+$nRange)} {
        incr nRange
        continue
      }
      lappend lRange [list $iFirst $nRange]
    }

    # Start a new range at $cp.
    set iFirst $cp
    set nRange 1
    set bOpen 1
  }

  # Flush the final range.
  lappend lRange [list $iFirst $nRange]
  return $lRange
}
|
|
|
|
|
|
|
|
# Print the C declaration of the aEntry[] array used by the generated
# isalnum() function to stdout, preceded by a comment describing it.
#
# Parameter $lRange is a list of {first-codepoint range-size} pairs. Each
# pair is encoded as the 32-bit value ((first-codepoint << 10) + range-size).
# An error is raised, before anything is printed, if a first codepoint
# needs more than 22 bits or a range size needs more than 10 bits.
#
proc an_print_range_array {lRange} {
  set iFirstMax 0
  set nRangeMax 0
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
    if {$nRange > $nRangeMax} {set nRangeMax $nRange}
  }
  if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
  if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}

  # The formula quoted in the generated comment must match the encoding
  # used below: the codepoint is shifted left by 10 bits, not 22.
  puts -nonewline " "
  puts [string trim {
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  */
  }]
  puts -nonewline " static const unsigned int aEntry\[\] = \{"
  set i 0
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    set u32 [format "0x%08X" [expr {($iFirst<<10) + $nRange}]]

    if {($i % 5)==0} {puts "" ; puts -nonewline " "}
    puts -nonewline " $u32,"
    incr i
  }
  puts ""
  puts " \};"
}
|
|
|
|
|
2012-05-25 23:50:12 +04:00
|
|
|
# Print the C declaration of the aAscii[] bitmap to stdout. The bitmap has
# one bit for each codepoint from 0 to 127. A bit is set if the codepoint
# falls inside one of the {first-codepoint range-size} pairs in $lRange.
#
proc an_print_ascii_bitmap {lRange} {
  # Four 32-bit words, one per block of 32 codepoints.
  set lWord [list 0 0 0 0]

  foreach range $lRange {
    foreach {iStart nLen} $range {}

    # Codepoints above 127 have no bit in the map, so stop at 128.
    set iEnd [expr {$iStart + $nLen}]
    if {$iEnd > 128} { set iEnd 128 }

    for {set cp $iStart} {$cp < $iEnd} {incr cp} {
      set iWord [expr {$cp >> 5}]
      set iBit  [expr {1 << ($cp & 0x001F)}]
      lset lWord $iWord [expr {[lindex $lWord $iWord] | $iBit}]
    }
  }

  puts " static const unsigned int aAscii\[4\] = \{"
  set zRow " "
  foreach word $lWord {
    append zRow [format " 0x%08X," $word]
  }
  puts $zRow
  puts " \};"
}
|
|
|
|
|
2012-05-25 21:50:19 +04:00
|
|
|
# Print a C function named $zFunc to stdout. The function returns true if
# its argument is a codepoint classified as a letter or a number.
#
# Parameter $lRange is the list of {first-codepoint range-size} pairs that
# describe the codepoints for which the function returns false. The two
# lookup tables are printed by [an_print_range_array] and
# [an_print_ascii_bitmap].
#
proc print_isalnum {zFunc lRange} {
  puts "/*"
  puts "** Return true if the argument corresponds to a unicode codepoint"
  puts "** classified as either a letter or a number. Otherwise false."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  an_print_range_array $lRange
  an_print_ascii_bitmap $lRange

  # The ASCII test shifts an unsigned 1, so that testing bit 31 does not
  # shift into the sign bit of a signed int in the generated C code.
  puts {
  if( c<128 ){
    return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
  }else if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;}
  puts "\}"
}
|
|
|
|
|
|
|
|
# Print the C function isalnum_test() to stdout. The generated function
# checks the C function named $zFunc against three tables built from the
# {first-codepoint range-size} pairs in $lRange:
#
#   aAlnum[]       expected result for each codepoint from 0 to 69999
#   aLargeSep[]    separator codepoints of 70000 and above
#   aLargeOther[]  non-separator neighbours of the aLargeSep[] codepoints
#
proc print_test_isalnum {zFunc lRange} {
  # bSep(C) exists for every separator codepoint C.
  foreach range $lRange {
    foreach {iStart nLen} $range {}
    for {set cp $iStart} {$cp < ($iStart+$nLen)} {incr cp} { set bSep($cp) 1 }
  }

  # The sorted separators that are too large for the aAlnum[] table.
  set lLarge [list]
  foreach cp [lsort -integer [array names bSep]] {
    if {$cp >= 70000} { lappend lLarge $cp }
  }

  puts "static int isalnum_test(int *piCode)\{"
  puts -nonewline " unsigned char aAlnum\[\] = \{"
  for {set cp 0} {$cp < 70000} {incr cp} {
    if {($cp % 32)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline "[expr {![info exists bSep($cp)]}],"
  }
  puts ""
  puts " \};"

  puts -nonewline " int aLargeSep\[\] = \{"
  set nOut 0
  foreach cp $lLarge {
    if {($nOut % 8)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline " $cp,"
    incr nOut
  }
  puts ""
  puts " \};"

  puts -nonewline " int aLargeOther\[\] = \{"
  set nOut 0
  foreach cp $lLarge {
    # Emit the codepoint on each side of $cp, unless it is a separator.
    foreach iNear [list [expr {$cp-1}] [expr {$cp+1}]] {
      if {[info exists bSep($iNear)]} continue
      if {($nOut % 8)==0} { puts "" ; puts -nonewline " " }
      puts -nonewline " $iNear,"
      incr nOut
    }
  }
  puts ""
  puts " \};"

  puts [subst -nocommands {
  int i;
  for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
    if( ${zFunc}(i)!=aAlnum[i] ){
      *piCode = i;
      return 1;
    }
  }
  for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
    if( ${zFunc}(aLargeSep[i])!=0 ){
      *piCode = aLargeSep[i];
      return 1;
    }
  }
  for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
    if( ${zFunc}(aLargeOther[i])!=1 ){
      *piCode = aLargeOther[i];
      return 1;
    }
  }
  }]
  puts " return 0;"
  puts "\}"
}
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
# Parameter $zName must be a path to the file CaseFolding.txt. This command
# reads the file and populates the global array tl_lookup_table, which
# maps a codepoint to the codepoint that it folds to. Both are stored as
# decimal integers.
#
# Each line that is neither empty nor a comment starting with "#" has the
# form "<code>; <status>; <mapping>; # <name>". Only lines with status "C"
# or "S" are loaded. All other lines are ignored. An error is raised if
# the code or mapping of a loaded line is not hexadecimal.
#
proc tl_load_casefolding_txt {zName} {
  global tl_lookup_table

  # Read the whole file up front and close it straight away. The channel
  # must not be left open, whether or not parsing succeeds.
  set fd [open $zName]
  set rc [catch {read $fd} data]
  close $fd
  if {$rc} { error $data }

  foreach line [split $data "\n"] {
    if {$line eq "" || [string index $line 0] eq "#"} continue

    # Use [lindex] rather than a multi-variable [foreach], so that a ";"
    # inside the trailing comment cannot overwrite the first three fields.
    set fields [split $line ";"]
    set zStatus [string trim [lindex $fields 1]]
    if {$zStatus ne "C" && $zStatus ne "S"} continue

    # Convert with [scan] rather than an unbraced [expr], so that text from
    # the file is never evaluated as part of an expression.
    foreach {zVar iField} {a2 0 c2 2} {
      set $zVar [list]
      foreach elem [lindex $fields $iField] {
        if {![string is xdigit -strict $elem]} { error "parse error: $line" }
        scan $elem %x iVal
        lappend $zVar $iVal
      }
    }

    set tl_lookup_table($a2) $c2
  }
}
|
|
|
|
|
|
|
|
# Compress the global array tl_lookup_table into a list of records. Each
# record is a list of four integers:
#
#   iFirst   the first codepoint covered by the record
#   nIncr    the step between covered codepoints, either 1 or 2
#   nRange   the number of codepoints covered
#   nOff     the value added to a covered codepoint to fold it
#
# The codepoints are visited in ascending order. A codepoint joins the
# current record if it continues the record's step and has the same
# offset. Otherwise the current record is emitted and a new one is started.
#
proc tl_create_records {} {
  global tl_lookup_table

  set iFirst ""
  set nOff 0
  set nRange 0
  set nIncr 0

  set lRecord [list]
  foreach code [lsort -integer [array names tl_lookup_table]] {
    set off [expr {$tl_lookup_table($code) - $code}]

    if {$iFirst ne ""} {
      # Distance from the last codepoint of the current record.
      set step [expr {$code - ($iFirst + ($nIncr * ($nRange - 1)))}]

      # A record with a single entry has not yet committed to a step.
      if {$nRange==1 && ($step==1 || $step==2)} {
        set nIncr $step
      }

      if {$step == $nIncr && $off == $nOff} {
        incr nRange
        continue
      }

      # The codepoint does not fit. Emit the current record. A record with
      # a single entry is always emitted with a step of 1.
      if {$nRange==1} { set nIncr 1 }
      lappend lRecord [list $iFirst $nIncr $nRange $nOff]
    }

    # Start a new record at $code.
    set iFirst $code
    set nOff   $off
    set nRange 1
    set nIncr  1
  }

  lappend lRecord [list $iFirst $nIncr $nRange $nOff]
  return $lRecord
}
|
|
|
|
|
|
|
|
# Print the opening part of the C declaration of the case folding table to
# stdout: a comment describing the table, followed by the declaration of
# "struct TableEntry" and the opening brace of the aEntry[] initializer.
#
proc tl_print_table_header {} {
  set zComment [string trim {
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint C.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  }]
  puts " $zComment"

  foreach zLine [list \
    " static const struct TableEntry \{" \
    " unsigned short iCode;" \
    " unsigned char flags;" \
    " unsigned char nRange;" \
    " \} aEntry\[\] = \{" \
  ] {
    puts $zLine
  }
}
|
|
|
|
|
2012-05-26 21:57:02 +04:00
|
|
|
# Print a single initializer for the aEntry[] case folding table to stdout.
#
#   togglevar  name of a variable in the caller that tracks the output
#              column (0, 1 or 2). It is created if it does not exist.
#   entry      a record {iFirst nIncr nRange nOff} as returned by
#              [tl_create_records]
#   liOff      the list of offsets returned by [tl_generate_ioff_table]
#
# Returns 1, without printing anything, if the first codepoint of the
# record does not fit in the 16-bit iCode field. Otherwise the entry is
# printed and 0 is returned. An error is raised, before anything is
# printed, if the range does not fit in the 8-bit nRange field or if the
# offset is not present in $liOff.
#
proc tl_print_table_entry {togglevar entry liOff} {
  upvar $togglevar t
  foreach {iFirst nIncr nRange nOff} $entry {}

  # iCode is an unsigned short, so 65535 is the largest value it can hold.
  if {$iFirst >= (1<<16)} { return 1 }

  # Bit 0 of flags is set for a rule that applies to every second
  # codepoint. Such a rule spans twice as many codepoints.
  set flags 0
  if {$nIncr==2} { set flags 1 ; set nRange [expr {$nRange * 2}] }
  if {$nRange > 255} {
    error "range starting at codepoint $iFirst is too large for format"
  }

  # Offsets are stored modulo 65536.
  if {$nOff<0} { incr nOff [expr {1<<16}] }

  set idx [lsearch -exact $liOff $nOff]
  if {$idx<0} {error "malfunction generating aiOff"}
  set flags [expr {$flags + $idx*2}]

  # Three entries are printed on each line of output.
  if {[info exists t]==0} {set t 0}
  if {$t==0} { puts -nonewline " " }

  set txt "{$iFirst, $flags, $nRange},"
  if {$t==2} {
    puts $txt
  } else {
    puts -nonewline [format "% -23s" $txt]
  }
  set t [expr {($t+1)%3}]

  return 0
}
|
|
|
|
|
|
|
|
# Print the closing part of the aEntry[] initializer to stdout. Parameter
# $togglevar is the name of the caller's column tracking variable. If the
# last line of entries is incomplete, it is terminated first.
#
proc tl_print_table_footer {togglevar} {
  upvar 1 $togglevar iColumn

  set zTail " \};"
  if {$iColumn != 0} {
    set zTail "\n$zTail"
  }
  puts $zTail
}
|
|
|
|
|
|
|
|
# Print a C "else if" clause to stdout that folds the codepoints covered
# by record $entry, a list {iFirst nIncr nRange nOff}. Records with a step
# of 2 are not supported and raise an error.
#
proc tl_print_if_entry {entry} {
  foreach {iStart nStep nLen iDelta} $entry {}
  if {$nStep==2} {error "tl_print_if_entry needs improvement!"}

  set iEnd [expr {$iStart+$nLen}]
  puts [join [list \
    " else if( c>=$iStart && c<$iEnd )\{" \
    " ret = c + $iDelta;" \
    " \}" \
  ] "\n"]
}
|
|
|
|
|
2012-05-26 21:57:02 +04:00
|
|
|
# Return the sorted list of distinct offsets used by the records in
# $lRecord. Each record is a list {iFirst nIncr nRange nOff}. A negative
# offset has 65536 added to it first. An error is raised if there are
# more than 128 distinct offsets.
#
proc tl_generate_ioff_table {lRecord} {
  set lAll [list]
  foreach entry $lRecord {
    foreach {iFirst nIncr nRange iOff} $entry {}
    if {$iOff<0} { set iOff [expr {$iOff + (1<<16)}] }
    lappend lAll $iOff
  }

  # Sort numerically and drop the duplicates in a single step.
  set liOff [lsort -integer -unique $lAll]
  if {[llength $liOff]>128} { error "Too many distinct ioffs" }
  return $liOff
}
|
|
|
|
|
|
|
|
# Print the C declaration of the aiOff[] array to stdout, eight values to
# a line. Parameter $liOff is the list of offsets to store in the array.
#
proc tl_print_ioff_table {liOff} {
  puts -nonewline " static const unsigned short aiOff\[\] = \{"

  set nDone 0
  foreach iOff $liOff {
    # Begin a new line of output before every eighth value.
    if {$nDone % 8 == 0} {
      puts ""
      puts -nonewline " "
    }
    puts -nonewline [format "% -7s" "$iOff,"]
    incr nDone
  }

  puts ""
  puts " \};"
}
|
|
|
|
|
2012-06-06 23:30:38 +04:00
|
|
|
# Print a C function named $zFunc to stdout. The function takes a
# codepoint and a flag, and returns the codepoint folded to lower case. If
# the flag is true, codepoints from 128 to 65535 are also passed through
# remove_diacritic() after folding.
#
# The folding rules are taken from the global array tl_lookup_table by way
# of [tl_create_records]. Records that [tl_print_table_entry] rejects as
# too large for the table are emitted as a chain of "else if" clauses.
#
proc print_fold {zFunc} {
  set lRecord [tl_create_records]

  foreach zLine [list \
    "/*" \
    "** Interpret the argument as a unicode codepoint. If the codepoint" \
    "** is an upper case character that has a lower case equivalent," \
    "** return the codepoint corresponding to the lower case version." \
    "** Otherwise, return a copy of the argument." \
    "**" \
    "** The results are undefined if the value passed to this function" \
    "** is less than zero." \
    "*/" \
    "int ${zFunc}\(int c, int bRemoveDiacritic)\{" \
  ] {
    puts $zLine
  }

  # Emit the lookup tables, collecting the records that do not fit.
  set liOff [tl_generate_ioff_table $lRecord]
  tl_print_table_header
  set lHigh [list]
  foreach entry $lRecord {
    set bTooLarge [tl_print_table_entry toggle $entry $liOff]
    if {$bTooLarge} { lappend lHigh $entry }
  }
  tl_print_table_footer toggle
  tl_print_ioff_table $liOff

  # The text below ends inside an if/else-if chain. The clauses printed
  # for the records in lHigh continue that chain.
  puts {
  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( iRes<0 || c>=aEntry[iRes].iCode );

    if( iRes>=0 ){
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
  }
  }

  foreach entry $lHigh {
    tl_print_if_entry $entry
  }

  puts ""
  puts " return ret;"
  puts "\}"
}
|
|
|
|
|
2012-06-06 23:30:38 +04:00
|
|
|
# Print the C function fold_test() to stdout. The generated function
# checks the C function named $zFunc against a table holding two expected
# values for each codepoint from 0 to 69999: the folded codepoint, and the
# folded codepoint with its diacritic removed.
#
# Folding is looked up in the global array tl_lookup_table. Parameter
# $mappings is a list of {codepoint replacement} pairs in the format
# returned by [rd_load_unicodedata_text]. An empty replacement is
# expected to produce 0.
#
proc print_fold_test {zFunc mappings} {
  global tl_lookup_table

  # extra(C) is the codepoint expected once the diacritic is removed from C.
  foreach m $mappings {
    set zChar [lindex $m 1]
    set iPlain 0
    if {$zChar ne ""} { scan $zChar %c iPlain }
    set extra([lindex $m 0]) $iPlain
  }

  puts "static int fold_test(int *piCode)\{"
  puts -nonewline " static int aLookup\[\] = \{"
  for {set cp 0} {$cp < 70000} {incr cp} {
    if {[info exists tl_lookup_table($cp)]} {
      set iFold $tl_lookup_table($cp)
    } else {
      set iFold $cp
    }
    if {[info exists extra($iFold)]} {
      set iBare $extra($iFold)
    } else {
      set iBare $iFold
    }

    if {($cp % 4)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline "$iFold, $iBare, "
  }

  puts [join [list \
    " \};" \
    " int i;" \
    " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{" \
    " int iCode = (i/2);" \
    " int bFlag = i & 0x0001;" \
    " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{" \
    " *piCode = iCode;" \
    " return 1;" \
    " \}" \
    " \}" \
    " return 0;" \
    "\}" \
  ] "\n"]
}
|
|
|
|
|
|
|
|
|
|
|
|
# Print the header of the generated C file to stdout: the license and "do
# not edit" comments, the opening preprocessor guards and the #include of
# assert.h. The guards depend on the global variable generate_fts5_code.
#
proc print_fileheader {} {
  set zBanner [string trim {
/*
** 2012 May 25
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
*/

/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/
  }]
  puts $zBanner
  puts ""

  # These guards are closed again at the very end of the generated file.
  if {$::generate_fts5_code} {
    set lGuard [list "#if defined(SQLITE_ENABLE_FTS5)"]
  } else {
    set lGuard [list \
      "#ifndef SQLITE_DISABLE_FTS3_UNICODE" \
      "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" \
    ]
  }
  foreach zGuard $lGuard { puts $zGuard }

  puts ""
  puts "#include <assert.h>"
  puts ""
}
|
|
|
|
|
|
|
|
# Print a C main() function to stdout. The generated program runs
# isalnum_test() and fold_test(), reports the result of each and returns
# non-zero if either of them failed.
#
proc print_test_main {} {
  puts [join [list \
    "" \
    "#include <stdio.h>" \
    "" \
    "int main(int argc, char **argv)\{" \
    " int r1, r2;" \
    " int code;" \
    " r1 = isalnum_test(&code);" \
    " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" \
    " else printf(\"isalnum(): test passed\\n\");" \
    " r2 = fold_test(&code);" \
    " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" \
    " else printf(\"fold(): test passed\\n\");" \
    " return (r1 || r2);" \
    "\}" \
  ] "\n"]
}
|
|
|
|
|
|
|
|
# Process the command line arguments. Exit early if they are not to
|
|
|
|
# our liking.
|
|
|
|
#
|
|
|
|
# Print a usage message to stderr and terminate the script with exit
# status 1.
proc usage {} {
  set zFiles "<CaseFolding.txt file> <UnicodeData.txt file>"
  puts stderr "Usage: $::argv0 ?-test? ?-fts5? $zFiles"
  exit 1
}
|
2015-01-01 19:46:10 +03:00
|
|
|
if {[llength $argv]<2} usage

# The last two arguments are always the two input files.
set unicodedata.txt [lindex $argv end]
set casefolding.txt [lindex $argv end-1]

# Every argument before the two file names must be a recognized option.
set generate_test_code 0
set generate_fts5_code 0
set function_prefix "sqlite3Fts"
foreach zOpt [lrange $argv 0 end-2] {
  switch -- $zOpt {
    -test {
      set generate_test_code 1
    }
    -fts5 {
      set function_prefix sqlite3Fts5
      set generate_fts5_code 1
    }
    default {
      usage
    }
  }
}

print_fileheader

# Print the isalnum() function to stdout.
#
set lRange [an_load_separator_ranges]
print_isalnum ${function_prefix}UnicodeIsalnum $lRange

# Leave a gap between the two generated C functions.
#
puts "\n"

# Load the fold data. This is used by the [rd_XXX] commands
# as well as [print_fold].
tl_load_casefolding_txt ${casefolding.txt}

# Print the remove_diacritic() and isdiacritic() functions to stdout,
# each followed by a gap.
#
set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
print_rd $mappings
puts "\n"
print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
puts "\n"

# Print the fold() function to stdout.
#
print_fold ${function_prefix}UnicodeFold

# Print the test routines and main() function to stdout, if -test
# was specified.
#
if {$::generate_test_code} {
  print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
  print_fold_test ${function_prefix}UnicodeFold $mappings
  print_test_main
}

# Close the preprocessor guards opened by [print_fileheader].
if {$generate_fts5_code} {
  puts "#endif /* defined(SQLITE_ENABLE_FTS5) */"
} else {
  puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
  puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
}
|