Switch BUnicodeChar to wrap ICU's UChar32 routines

Improve the Unicode character processing and classification routines by
wrapping the UChar32 functions from ICU. This fixes the functional
regression introduced in hrev38017 and makes it possible to fix the East
Asian Width problems in the Terminal.
This commit is contained in:
Siarzhuk Zharski 2013-04-02 07:47:26 +02:00
parent 4e17bdd83f
commit b6fd91b409
2 changed files with 413 additions and 628 deletions

View File

@ -8,6 +8,7 @@ enum unicode_char_category
// Non-category for unassigned and non-character code points. // Non-category for unassigned and non-character code points.
B_UNICODE_UNASSIGNED = 0, B_UNICODE_UNASSIGNED = 0,
B_UNICODE_GENERAL_OTHER_TYPES = 0, // Cn
B_UNICODE_UPPERCASE_LETTER = 1, // Lu B_UNICODE_UPPERCASE_LETTER = 1, // Lu
B_UNICODE_LOWERCASE_LETTER = 2, // Ll B_UNICODE_LOWERCASE_LETTER = 2, // Ll
B_UNICODE_TITLECASE_LETTER = 3, // Lt B_UNICODE_TITLECASE_LETTER = 3, // Lt
@ -37,152 +38,289 @@ enum unicode_char_category
B_UNICODE_OTHER_SYMBOL = 27, // So B_UNICODE_OTHER_SYMBOL = 27, // So
B_UNICODE_INITIAL_PUNCTUATION = 28, // Pi B_UNICODE_INITIAL_PUNCTUATION = 28, // Pi
B_UNICODE_FINAL_PUNCTUATION = 29, // Pf B_UNICODE_FINAL_PUNCTUATION = 29, // Pf
B_UNICODE_GENERAL_OTHER_TYPES = 30, // Cn
B_UNICODE_CATEGORY_COUNT B_UNICODE_CATEGORY_COUNT
}; };
/** // This specifies the language directional property of a character set.
* This specifies the language directional property of a character set.
*/
enum unicode_char_direction { enum unicode_char_direction {
B_UNICODE_LEFT_TO_RIGHT = 0, B_UNICODE_LEFT_TO_RIGHT = 0,
B_UNICODE_RIGHT_TO_LEFT = 1, B_UNICODE_RIGHT_TO_LEFT = 1,
B_UNICODE_EUROPEAN_NUMBER = 2, B_UNICODE_EUROPEAN_NUMBER = 2,
B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3, B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3,
B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4, B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4,
B_UNICODE_ARABIC_NUMBER = 5, B_UNICODE_ARABIC_NUMBER = 5,
B_UNICODE_COMMON_NUMBER_SEPARATOR = 6, B_UNICODE_COMMON_NUMBER_SEPARATOR = 6,
B_UNICODE_BLOCK_SEPARATOR = 7, B_UNICODE_BLOCK_SEPARATOR = 7,
B_UNICODE_SEGMENT_SEPARATOR = 8, B_UNICODE_SEGMENT_SEPARATOR = 8,
B_UNICODE_WHITE_SPACE_NEUTRAL = 9, B_UNICODE_WHITE_SPACE_NEUTRAL = 9,
B_UNICODE_OTHER_NEUTRAL = 10, B_UNICODE_OTHER_NEUTRAL = 10,
B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11, B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11,
B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12, B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12,
B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13, B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13,
B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14, B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14,
B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15, B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15,
B_UNICODE_POP_DIRECTIONAL_FORMAT = 16, B_UNICODE_POP_DIRECTIONAL_FORMAT = 16,
B_UNICODE_DIR_NON_SPACING_MARK = 17, B_UNICODE_DIR_NON_SPACING_MARK = 17,
B_UNICODE_BOUNDARY_NEUTRAL = 18, B_UNICODE_BOUNDARY_NEUTRAL = 18,
B_UNICODE_DIRECTION_COUNT B_UNICODE_DIRECTION_COUNT
}; };
/** // Script range as defined in the Unicode standard.
* Script range as defined in the Unicode standard.
*/
enum unicode_char_script { enum unicode_char_script {
// Script names // New No_Block value in Unicode 4.
B_UNICODE_BASIC_LATIN, B_UNICODE_NO_BLOCK = 0, // [none] Special range
B_UNICODE_LATIN_1_SUPPLEMENT, B_UNICODE_BASIC_LATIN = 1, // [0000]
B_UNICODE_LATIN_EXTENDED_A, B_UNICODE_LATIN_1_SUPPLEMENT = 2, // [0080]
B_UNICODE_LATIN_EXTENDED_B, B_UNICODE_LATIN_EXTENDED_A = 3, // [0100]
B_UNICODE_IPA_EXTENSIONS, B_UNICODE_LATIN_EXTENDED_B = 4, // [0180]
B_UNICODE_SPACING_MODIFIER_LETTERS, B_UNICODE_IPA_EXTENSIONS = 5, // [0250]
B_UNICODE_COMBINING_DIACRITICAL_MARKS, B_UNICODE_SPACING_MODIFIER_LETTERS = 6, // [02B0]
B_UNICODE_GREEK, B_UNICODE_COMBINING_DIACRITICAL_MARKS = 7, // [0300]
B_UNICODE_CYRILLIC, B_UNICODE_GREEK = 8, // [0370]
B_UNICODE_ARMENIAN, B_UNICODE_CYRILLIC = 9, // [0400]
B_UNICODE_HEBREW, B_UNICODE_ARMENIAN = 10, // [0530]
B_UNICODE_ARABIC, B_UNICODE_HEBREW = 11, // [0590]
B_UNICODE_SYRIAC, B_UNICODE_ARABIC = 12, // [0600]
B_UNICODE_THAANA, B_UNICODE_SYRIAC = 13, // [0700]
B_UNICODE_DEVANAGARI, B_UNICODE_THAANA = 14, // [0780]
B_UNICODE_BENGALI, B_UNICODE_DEVANAGARI = 15, // [0900]
B_UNICODE_GURMUKHI, B_UNICODE_BENGALI = 16, // [0980]
B_UNICODE_GUJARATI, B_UNICODE_GURMUKHI = 17, // [0A00]
B_UNICODE_ORIYA, B_UNICODE_GUJARATI = 18, // [0A80]
B_UNICODE_TAMIL, B_UNICODE_ORIYA = 19, // [0B00]
B_UNICODE_TELUGU, B_UNICODE_TAMIL = 20, // [0B80]
B_UNICODE_KANNADA, B_UNICODE_TELUGU = 21, // [0C00]
B_UNICODE_MALAYALAM, B_UNICODE_KANNADA = 22, // [0C80]
B_UNICODE_SINHALA, B_UNICODE_MALAYALAM = 23, // [0D00]
B_UNICODE_THAI, B_UNICODE_SINHALA = 24, // [0D80]
B_UNICODE_LAO, B_UNICODE_THAI = 25, // [0E00]
B_UNICODE_TIBETAN, B_UNICODE_LAO = 26, // [0E80]
B_UNICODE_MYANMAR, B_UNICODE_TIBETAN = 27, // [0F00]
B_UNICODE_GEORGIAN, B_UNICODE_MYANMAR = 28, // [1000]
B_UNICODE_HANGUL_JAMO, B_UNICODE_GEORGIAN = 29, // [10A0]
B_UNICODE_ETHIOPIC, B_UNICODE_HANGUL_JAMO = 30, // [1100]
B_UNICODE_CHEROKEE, B_UNICODE_ETHIOPIC = 31, // [1200]
B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, B_UNICODE_CHEROKEE = 32, // [13A0]
B_UNICODE_OGHAM, B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, // [1400]
B_UNICODE_RUNIC, B_UNICODE_OGHAM = 34, // [1680]
B_UNICODE_KHMER, B_UNICODE_RUNIC = 35, // [16A0]
B_UNICODE_MONGOLIAN, B_UNICODE_KHMER = 36, // [1780]
B_UNICODE_LATIN_EXTENDED_ADDITIONAL, B_UNICODE_MONGOLIAN = 37, // [1800]
B_UNICODE_GREEK_EXTENDED, B_UNICODE_LATIN_EXTENDED_ADDITIONAL = 38, // [1E00]
B_UNICODE_GENERAL_PUNCTUATION, B_UNICODE_GREEK_EXTENDED = 39, // [1F00]
B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS, B_UNICODE_GENERAL_PUNCTUATION = 40, // [2000]
B_UNICODE_CURRENCY_SYMBOLS, B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS = 41, // [2070]
B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS, B_UNICODE_CURRENCY_SYMBOLS = 42, // [20A0]
B_UNICODE_LETTERLIKE_SYMBOLS, B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS = 43, // [20D0]
B_UNICODE_NUMBER_FORMS, B_UNICODE_LETTERLIKE_SYMBOLS = 44, // [2100]
B_UNICODE_ARROWS, B_UNICODE_NUMBER_FORMS = 45, // [2150]
B_UNICODE_MATHEMATICAL_OPERATORS, B_UNICODE_ARROWS = 46, // [2190]
B_UNICODE_MISCELLANEOUS_TECHNICAL, B_UNICODE_MATHEMATICAL_OPERATORS = 47, // [2200]
B_UNICODE_CONTROL_PICTURES, B_UNICODE_MISCELLANEOUS_TECHNICAL = 48, // [2300]
B_UNICODE_OPTICAL_CHARACTER_RECOGNITION, B_UNICODE_CONTROL_PICTURES = 49, // [2400]
B_UNICODE_ENCLOSED_ALPHANUMERICS, B_UNICODE_OPTICAL_CHARACTER_RECOGNITION = 50, // [2440]
B_UNICODE_BOX_DRAWING, B_UNICODE_ENCLOSED_ALPHANUMERICS = 51, // [2460]
B_UNICODE_BLOCK_ELEMENTS, B_UNICODE_BOX_DRAWING = 52, // [2500]
B_UNICODE_GEOMETRIC_SHAPES, B_UNICODE_BLOCK_ELEMENTS = 53, // [2580]
B_UNICODE_MISCELLANEOUS_SYMBOLS, B_UNICODE_GEOMETRIC_SHAPES = 54, // [25A0]
B_UNICODE_DINGBATS, B_UNICODE_MISCELLANEOUS_SYMBOLS = 55, // [2600]
B_UNICODE_BRAILLE_PATTERNS, B_UNICODE_DINGBATS = 56, // [2700]
B_UNICODE_CJK_RADICALS_SUPPLEMENT, B_UNICODE_BRAILLE_PATTERNS = 57, // [2800]
B_UNICODE_KANGXI_RADICALS, B_UNICODE_CJK_RADICALS_SUPPLEMENT = 58, // [2E80]
B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS, B_UNICODE_KANGXI_RADICALS = 59, // [2F00]
B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION, B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, // [2FF0]
B_UNICODE_HIRAGANA, B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION = 61, // [3000]
B_UNICODE_KATAKANA, B_UNICODE_HIRAGANA = 62, // [3040]
B_UNICODE_BOPOMOFO, B_UNICODE_KATAKANA = 63, // [30A0]
B_UNICODE_HANGUL_COMPATIBILITY_JAMO, B_UNICODE_BOPOMOFO = 64, // [3100]
B_UNICODE_KANBUN, B_UNICODE_HANGUL_COMPATIBILITY_JAMO = 65, // [3130]
B_UNICODE_BOPOMOFO_EXTENDED, B_UNICODE_KANBUN = 66, // [3190]
B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS, B_UNICODE_BOPOMOFO_EXTENDED = 67, // [31A0]
B_UNICODE_CJK_COMPATIBILITY, B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, // [3200]
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, B_UNICODE_CJK_COMPATIBILITY = 69, // [3300]
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS, B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, // [3400]
B_UNICODE_YI_SYLLABLES, B_UNICODE_CJK_UNIFIED_IDEOGRAPHS = 71, // [4E00]
B_UNICODE_YI_RADICALS, B_UNICODE_YI_SYLLABLES = 72, // [A000]
B_UNICODE_HANGUL_SYLLABLES, B_UNICODE_YI_RADICALS = 73, // [A490]
B_UNICODE_HIGH_SURROGATES, B_UNICODE_HANGUL_SYLLABLES = 74, // [AC00]
B_UNICODE_HIGH_PRIVATE_USE_SURROGATES, B_UNICODE_HIGH_SURROGATES = 75, // [D800]
B_UNICODE_LOW_SURROGATES, B_UNICODE_HIGH_PRIVATE_USE_SURROGATES = 76, // [DB80]
B_UNICODE_PRIVATE_USE_AREA, B_UNICODE_LOW_SURROGATES = 77, // [DC00]
B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS, B_UNICODE_PRIVATE_USE = 78,
B_UNICODE_ALPHABETIC_PRESENTATION_FORMS, B_UNICODE_PRIVATE_USE_AREA = B_UNICODE_PRIVATE_USE, // [E000]
B_UNICODE_ARABIC_PRESENTATION_FORMS_A, B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS = 79, // [F900]
B_UNICODE_COMBINING_HALF_MARKS, B_UNICODE_ALPHABETIC_PRESENTATION_FORMS = 80, // [FB00]
B_UNICODE_CJK_COMPATIBILITY_FORMS, B_UNICODE_ARABIC_PRESENTATION_FORMS_A = 81, // [FB50]
B_UNICODE_SMALL_FORM_VARIANTS, B_UNICODE_COMBINING_HALF_MARKS = 82, // [FE20]
B_UNICODE_ARABIC_PRESENTATION_FORMS_B, B_UNICODE_CJK_COMPATIBILITY_FORMS = 83, // [FE30]
B_UNICODE_SPECIALS, B_UNICODE_SMALL_FORM_VARIANTS = 84, // [FE50]
B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS, B_UNICODE_ARABIC_PRESENTATION_FORMS_B = 85, // [FE70]
B_UNICODE_SPECIALS = 86, // [FFF0]
B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS = 87, // [FF00]
B_UNICODE_SCRIPT_COUNT, // New blocks in Unicode 3.1
B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT B_UNICODE_OLD_ITALIC = 88, // [10300]
B_UNICODE_GOTHIC = 89, // [10330]
B_UNICODE_DESERET = 90, // [10400]
B_UNICODE_BYZANTINE_MUSICAL_SYMBOLS = 91, // [1D000]
B_UNICODE_MUSICAL_SYMBOLS = 92, // [1D100]
B_UNICODE_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, // [1D400]
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94, // [20000]
B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, // [2F800]
B_UNICODE_TAGS = 96, // [E0000]
// New blocks in Unicode 3.2
B_UNICODE_CYRILLIC_SUPPLEMENTARY = 97,
B_UNICODE_CYRILLIC_SUPPLEMENT = B_UNICODE_CYRILLIC_SUPPLEMENTARY, // [0500]
B_UNICODE_TAGALOG = 98, // [1700]
B_UNICODE_HANUNOO = 99, // [1720]
B_UNICODE_BUHID = 100, // [1740]
B_UNICODE_TAGBANWA = 101, // [1760]
B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102, // [27C0]
B_UNICODE_SUPPLEMENTAL_ARROWS_A = 103, // [27F0]
B_UNICODE_SUPPLEMENTAL_ARROWS_B = 104, // [2900]
B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105, // [2980]
B_UNICODE_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106, // [2A00]
B_UNICODE_KATAKANA_PHONETIC_EXTENSIONS = 107, // [31F0]
B_UNICODE_VARIATION_SELECTORS = 108, // [FE00]
B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109, // [F0000]
B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110, // [100000]
// New blocks in Unicode 4
B_UNICODE_LIMBU = 111, // [1900]
B_UNICODE_TAI_LE = 112, // [1950]
B_UNICODE_KHMER_SYMBOLS = 113, // [19E0]
B_UNICODE_PHONETIC_EXTENSIONS = 114, // [1D00]
B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115, // [2B00]
B_UNICODE_YIJING_HEXAGRAM_SYMBOLS = 116, // [4DC0]
B_UNICODE_LINEAR_B_SYLLABARY = 117, // [10000]
B_UNICODE_LINEAR_B_IDEOGRAMS = 118, // [10080]
B_UNICODE_AEGEAN_NUMBERS = 119, // [10100]
B_UNICODE_UGARITIC = 120, // [10380]
B_UNICODE_SHAVIAN = 121, // [10450]
B_UNICODE_OSMANYA = 122, // [10480]
B_UNICODE_CYPRIOT_SYLLABARY = 123, // [10800]
B_UNICODE_TAI_XUAN_JING_SYMBOLS = 124, // [1D300]
B_UNICODE_VARIATION_SELECTORS_SUPPLEMENT = 125, // [E0100]
// New blocks in Unicode 4.1
B_UNICODE_ANCIENT_GREEK_MUSICAL_NOTATION = 126, // [1D200]
B_UNICODE_ANCIENT_GREEK_NUMBERS = 127, // [10140]
B_UNICODE_ARABIC_SUPPLEMENT = 128, // [0750]
B_UNICODE_BUGINESE = 129, // [1A00]
B_UNICODE_CJK_STROKES = 130, // [31C0]
B_UNICODE_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131, // [1DC0]
B_UNICODE_COPTIC = 132, // [2C80]
B_UNICODE_ETHIOPIC_EXTENDED = 133, // [2D80]
B_UNICODE_ETHIOPIC_SUPPLEMENT = 134, // [1380]
B_UNICODE_GEORGIAN_SUPPLEMENT = 135, // [2D00]
B_UNICODE_GLAGOLITIC = 136, // [2C00]
B_UNICODE_KHAROSHTHI = 137, // [10A00]
B_UNICODE_MODIFIER_TONE_LETTERS = 138, // [A700]
B_UNICODE_NEW_TAI_LUE = 139, // [1980]
B_UNICODE_OLD_PERSIAN = 140, // [103A0]
B_UNICODE_PHONETIC_EXTENSIONS_SUPPLEMENT = 141, // [1D80]
B_UNICODE_SUPPLEMENTAL_PUNCTUATION = 142, // [2E00]
B_UNICODE_SYLOTI_NAGRI = 143, // [A800]
B_UNICODE_TIFINAGH = 144, // [2D30]
B_UNICODE_VERTICAL_FORMS = 145, // [FE10]
// New blocks in Unicode 5.0
B_UNICODE_NKO = 146, // [07C0]
B_UNICODE_BALINESE = 147, // [1B00]
B_UNICODE_LATIN_EXTENDED_C = 148, // [2C60]
B_UNICODE_LATIN_EXTENDED_D = 149, // [A720]
B_UNICODE_PHAGS_PA = 150, // [A840]
B_UNICODE_PHOENICIAN = 151, // [10900]
B_UNICODE_CUNEIFORM = 152, // [12000]
B_UNICODE_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153, // [12400]
B_UNICODE_COUNTING_ROD_NUMERALS = 154, // [1D360]
// New blocks in Unicode 5.1
B_UNICODE_SUNDANESE = 155, // [1B80]
B_UNICODE_LEPCHA = 156, // [1C00]
B_UNICODE_OL_CHIKI = 157, // [1C50]
B_UNICODE_CYRILLIC_EXTENDED_A = 158, // [2DE0]
B_UNICODE_VAI = 159, // [A500]
B_UNICODE_CYRILLIC_EXTENDED_B = 160, // [A640]
B_UNICODE_SAURASHTRA = 161, // [A880]
B_UNICODE_KAYAH_LI = 162, // [A900]
B_UNICODE_REJANG = 163, // [A930]
B_UNICODE_CHAM = 164, // [AA00]
B_UNICODE_ANCIENT_SYMBOLS = 165, // [10190]
B_UNICODE_PHAISTOS_DISC = 166, // [101D0]
B_UNICODE_LYCIAN = 167, // [10280]
B_UNICODE_CARIAN = 168, // [102A0]
B_UNICODE_LYDIAN = 169, // [10920]
B_UNICODE_MAHJONG_TILES = 170, // [1F000]
B_UNICODE_DOMINO_TILES = 171, // [1F030]
// New blocks in Unicode 5.2
B_UNICODE_SAMARITAN = 172, // [0800]
B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173, // [18B0]
B_UNICODE_TAI_THAM = 174, // [1A20]
B_UNICODE_VEDIC_EXTENSIONS = 175, // [1CD0]
B_UNICODE_LISU = 176, // [A4D0]
B_UNICODE_BAMUM = 177, // [A6A0]
B_UNICODE_COMMON_INDIC_NUMBER_FORMS = 178, // [A830]
B_UNICODE_DEVANAGARI_EXTENDED = 179, // [A8E0]
B_UNICODE_HANGUL_JAMO_EXTENDED_A = 180, // [A960]
B_UNICODE_JAVANESE = 181, // [A980]
B_UNICODE_MYANMAR_EXTENDED_A = 182, // [AA60]
B_UNICODE_TAI_VIET = 183, // [AA80]
B_UNICODE_MEETEI_MAYEK = 184, // [ABC0]
B_UNICODE_HANGUL_JAMO_EXTENDED_B = 185, // [D7B0]
B_UNICODE_IMPERIAL_ARAMAIC = 186, // [10840]
B_UNICODE_OLD_SOUTH_ARABIAN = 187, // [10A60]
B_UNICODE_AVESTAN = 188, // [10B00]
B_UNICODE_INSCRIPTIONAL_PARTHIAN = 189, // [10B40]
B_UNICODE_INSCRIPTIONAL_PAHLAVI = 190, // [10B60]
B_UNICODE_OLD_TURKIC = 191, // [10C00]
B_UNICODE_RUMI_NUMERAL_SYMBOLS = 192, // [10E60]
B_UNICODE_KAITHI = 193, // [11080]
B_UNICODE_EGYPTIAN_HIEROGLYPHS = 194, // [13000]
B_UNICODE_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195, // [1F100]
B_UNICODE_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196, // [1F200]
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197, // [2A700]
// New blocks in Unicode 6.0
B_UNICODE_MANDAIC = 198, // [0840]
B_UNICODE_BATAK = 199, // [1BC0]
B_UNICODE_ETHIOPIC_EXTENDED_A = 200, // [AB00]
B_UNICODE_BRAHMI = 201, // [11000]
B_UNICODE_BAMUM_SUPPLEMENT = 202, // [16800]
B_UNICODE_KANA_SUPPLEMENT = 203, // [1B000]
B_UNICODE_PLAYING_CARDS = 204, // [1F0A0]
B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205, // [1F300]
B_UNICODE_EMOTICONS = 206, // [1F600]
B_UNICODE_TRANSPORT_AND_MAP_SYMBOLS = 207, // [1F680]
B_UNICODE_ALCHEMICAL_SYMBOLS = 208, // [1F700]
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209, // [2B740]
B_UNICODE_SCRIPT_COUNT = 210,
B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT,
B_UNICODE_INVALID_CODE = -1
}; };
/** // East Asian Width constants.
* Values returned by the u_getCellWidth() function.
*/
enum unicode_cell_width enum unicode_east_asian_width
{ {
B_UNICODE_ZERO_WIDTH = 0, B_UNICODE_EA_NEUTRAL, // [N]
B_UNICODE_HALF_WIDTH = 1, B_UNICODE_EA_AMBIGUOUS, // [A]
B_UNICODE_FULL_WIDTH = 2, B_UNICODE_EA_HALFWIDTH, // [H]
B_UNICODE_NEUTRAL_WIDTH = 3, B_UNICODE_EA_FULLWIDTH, // [F]
B_UNICODE_EA_NARROW, // [Na]
B_UNICODE_CELL_WIDTH_COUNT B_UNICODE_EA_WIDE, // [W]
B_UNICODE_EA_COUNT
}; };
@ -209,6 +347,7 @@ class BUnicodeChar {
static uint32 ToUpper(uint32 c); static uint32 ToUpper(uint32 c);
static uint32 ToTitle(uint32 c); static uint32 ToTitle(uint32 c);
static int32 DigitValue(uint32 c); static int32 DigitValue(uint32 c);
static unicode_east_asian_width EastAsianWidth(uint32 c);
static void ToUTF8(uint32 c, char **out); static void ToUTF8(uint32 c, char **out);
static uint32 FromUTF8(const char **in); static uint32 FromUTF8(const char **in);
@ -230,4 +369,4 @@ BUnicodeChar::FromUTF8(const char *in)
} }
#endif /* _UNICODE_CHAR_H_ */ #endif // _UNICODE_CHAR_H_

View File

@ -1,234 +1,18 @@
/* /*
** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. * Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
** Distributed under the terms of the OpenBeOS License. * Distributed under the terms of the MIT License.
*/
/* Reads the information out of the data files created by (an edited version of)
* IBM's ICU genprops utility. The BUnicodeChar class is mostly the counterpart
* to ICU's uchar module, but is not as huge or broad as that one.
* *
* Note, it probably won't be able to handle the output of the original genprops * Authors:
* tool and vice versa - only use the tool provided with this project to create * Axel Dörfler, axeld@pinc-software.de
* the Unicode property file. * Siarzhuk Zharski, zharik@gmx.li
* However, the algorithmic idea behind the property file is still the same as
* found in ICU - nothing important has been changed, so more recent versions
* of genprops tool/data can probably be ported without too much effort.
* *
* In case no property file can be found it will still provide basic services
* for the Latin-1 part of the character tables.
*/ */
#include <OS.h>
#include <UnicodeChar.h> #include <UnicodeChar.h>
#include <stdlib.h> #include <unicode/uchar.h>
#include <stdio.h> #include <unicode/utf8.h>
#include <string.h>
#define FLAG(n) ((uint32)1 << (n))
enum {
UF_UPPERCASE = FLAG(B_UNICODE_UPPERCASE_LETTER),
UF_LOWERCASE = FLAG(B_UNICODE_LOWERCASE_LETTER),
UF_TITLECASE = FLAG(B_UNICODE_TITLECASE_LETTER),
UF_MODIFIER_LETTER = FLAG(B_UNICODE_MODIFIER_LETTER),
UF_OTHER_LETTER = FLAG(B_UNICODE_OTHER_LETTER),
UF_DECIMAL_NUMBER = FLAG(B_UNICODE_DECIMAL_DIGIT_NUMBER),
UF_OTHER_NUMBER = FLAG(B_UNICODE_OTHER_NUMBER),
UF_LETTER_NUMBER = FLAG(B_UNICODE_LETTER_NUMBER)
};
static uint32 gStaticProps32Table[] = {
/* 0x00 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x04 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x08 */ 0x48f, 0x20c, 0x1ce, 0x20c,
/* 0x0c */ 0x24d, 0x1ce, 0x48f, 0x48f,
/* 0x10 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x14 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x18 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x1c */ 0x1ce, 0x1ce, 0x1ce, 0x20c,
/* 0x20 */ 0x24c, 0x297, 0x297, 0x117,
/* 0x24 */ 0x119, 0x117, 0x297, 0x297,
/* 0x28 */ 0x100a94, 0xfff00a95, 0x297, 0x118,
/* 0x2c */ 0x197, 0x113, 0x197, 0xd7,
/* 0x30 */ 0x89, 0x100089, 0x200089, 0x300089,
/* 0x34 */ 0x400089, 0x500089, 0x600089, 0x700089,
/* 0x38 */ 0x800089, 0x900089, 0x197, 0x297,
/* 0x3c */ 0x200a98, 0x298, 0xffe00a98, 0x297,
/* 0x40 */ 0x297, 0x2000001, 0x2000001, 0x2000001,
/* 0x44 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x48 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x4c */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x50 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x54 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x58 */ 0x2000001, 0x2000001, 0x2000001, 0x200a94,
/* 0x5c */ 0x297, 0xffe00a95, 0x29a, 0x296,
/* 0x60 */ 0x29a, 0x2000002, 0x2000002, 0x2000002,
/* 0x64 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x68 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x6c */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x70 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x74 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x78 */ 0x2000002, 0x2000002, 0x2000002, 0x200a94,
/* 0x7c */ 0x298, 0xffe00a95, 0x298, 0x48f,
/* 0x80 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x84 */ 0x48f, 0x1ce, 0x48f, 0x48f,
/* 0x88 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x8c */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x90 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x94 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x98 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x9c */ 0x48f, 0x48f, 0x48f, 0x48f
};
enum {
INDEX_STAGE_2_BITS,
INDEX_STAGE_3_BITS,
INDEX_EXCEPTIONS,
INDEX_STAGE_3_INDEX,
INDEX_PROPS,
INDEX_UCHARS
};
/* constants and macros for access to the data */
enum {
EXC_UPPERCASE,
EXC_LOWERCASE,
EXC_TITLECASE,
EXC_DIGIT_VALUE,
EXC_NUMERIC_VALUE,
EXC_DENOMINATOR_VALUE,
EXC_MIRROR_MAPPING,
EXC_SPECIAL_CASING,
EXC_CASE_FOLDING
};
enum {
EXCEPTION_SHIFT = 5,
BIDI_SHIFT,
MIRROR_SHIFT = BIDI_SHIFT + 5,
VALUE_SHIFT = 20,
VALUE_BITS = 32 - VALUE_SHIFT
};
/* number of bits in an 8-bit integer value */
#define EXC_GROUP 8
static uint8 gFlagsOffset[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
#ifdef UCHAR_VARIABLE_TRIE_BITS
// access values calculated from indices
static uint16_t stage23Bits, stage2Mask, stage3Mask;
# define sStage3Bits indexes[INDEX_STAGE_3_BITS]
#else
// Use hardcoded bit distribution for the trie table access
# define sStage23Bits 10
# define sStage2Mask 0x3f
# define sStage3Mask 0xf
# define sStage3Bits 4
#endif
/** We need to change the char category for ISO 8 controls, since the
* genprops utility we got from IBM's ICU apparently changes it for
* some characters.
*/
static inline bool
isISO8Control(uint32 c)
{
return ((uint32)c < 0x20 || (uint32)(c - 0x7f) <= 0x20);
}
static inline uint32
getProperties(uint32 c)
{
if (c > 0x10ffff)
return 0;
// TODO : Data from unicode
return c > 0x9f ? 0 : gStaticProps32Table[c];
}
static inline uint8
getCategory(uint32 properties)
{
return properties & 0x1f;
}
static inline bool
propertyIsException(uint32 properties)
{
return properties & (1UL << EXCEPTION_SHIFT);
}
static inline uint32
getUnsignedValue(uint32 properties)
{
return properties >> VALUE_SHIFT;
}
static inline uint32
getSignedValue(uint32 properties)
{
return (int32)properties >> VALUE_SHIFT;
}
static inline uint32 *
getExceptions(uint32 properties)
{
// TODO : data from unicode
return 0;
}
static inline bool
haveExceptionValue(uint32 flags,int16 index)
{
return flags & (1UL << index);
}
static inline void
addExceptionOffset(uint32 &flags, int16 &index, uint32 **offset)
{
if (index >= EXC_GROUP) {
*offset += gFlagsOffset[flags & ((1 << EXC_GROUP) - 1)];
flags >>= EXC_GROUP;
index -= EXC_GROUP;
}
*offset += gFlagsOffset[flags & ((1 << index) - 1)];
}
// #pragma mark -
BUnicodeChar::BUnicodeChar() BUnicodeChar::BUnicodeChar()
@ -236,382 +20,244 @@ BUnicodeChar::BUnicodeChar()
} }
bool // Returns the general category value for the code point.
BUnicodeChar::IsAlpha(uint32 c)
{
BUnicodeChar();
return (FLAG(getCategory(getProperties(c)))
& (UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
) != 0;
}
/** Returns the type code of the specified unicode character */
int8 int8
BUnicodeChar::Type(uint32 c) BUnicodeChar::Type(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return (int8)getCategory(getProperties(c)); return u_charType(c);
} }
bool // Determines whether the specified code point is a letter character.
BUnicodeChar::IsLower(uint32 c) // True for general categories "L" (letters).
bool
BUnicodeChar::IsAlpha(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return getCategory(getProperties(c)) == B_UNICODE_LOWERCASE_LETTER; return u_isalpha(c);
} }
bool // Determines whether the specified code point is an alphanumeric character
BUnicodeChar::IsUpper(uint32 c) // (letter or digit).
{ // True for characters with general categories
BUnicodeChar(); // "L" (letters) and "Nd" (decimal digit numbers).
return getCategory(getProperties(c)) == B_UNICODE_UPPERCASE_LETTER; bool
}
bool
BUnicodeChar::IsTitle(uint32 c)
{
BUnicodeChar();
return getCategory(getProperties(c)) == B_UNICODE_TITLECASE_LETTER;
}
bool
BUnicodeChar::IsDigit(uint32 c)
{
BUnicodeChar();
return (FLAG(getCategory(getProperties(c)))
& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER)
) != 0;
}
bool
BUnicodeChar::IsAlNum(uint32 c) BUnicodeChar::IsAlNum(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return (FLAG(getCategory(getProperties(c))) return u_isalnum(c);
& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER | UF_UPPERCASE
| UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
) != 0;
} }
bool // Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE).
bool
BUnicodeChar::IsLower(uint32 c)
{
BUnicodeChar();
return u_isULowercase(c);
}
// Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE).
bool
BUnicodeChar::IsUpper(uint32 c)
{
BUnicodeChar();
return u_isUUppercase(c);
}
// Determines whether the specified code point is a titlecase letter.
// True for general category "Lt" (titlecase letter).
bool
BUnicodeChar::IsTitle(uint32 c)
{
BUnicodeChar();
return u_istitle(c);
}
// Determines whether the specified code point is a digit character.
// True for characters with general category "Nd" (decimal digit numbers).
// Beginning with Unicode 4, this is the same as
// testing for the Numeric_Type of Decimal.
bool
BUnicodeChar::IsDigit(uint32 c)
{
BUnicodeChar();
return u_isdigit(c);
}
// Determines whether the specified code point is a hexadecimal digit.
// This is equivalent to u_digit(c, 16)>=0.
// True for characters with general category "Nd" (decimal digit numbers)
// as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII.
// (That is, for letters with code points
// 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.)
bool
BUnicodeChar::IsHexDigit(uint32 c)
{
BUnicodeChar();
return u_isxdigit(c);
}
// Determines whether the specified code point is "defined",
// which usually means that it is assigned a character.
// True for general categories other than "Cn" (other, not assigned),
// i.e., true for all code points mentioned in UnicodeData.txt.
bool
BUnicodeChar::IsDefined(uint32 c) BUnicodeChar::IsDefined(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return getProperties(c) != 0; return u_isdefined(c);
} }
/** Returns true if the specified unicode character is a base // Determines whether the specified code point is a base character.
* form character that can be used with a diacritic. // True for general categories "L" (letters), "N" (numbers),
* This doesn't mean that the character has to be distinct, // "Mc" (spacing combining marks), and "Me" (enclosing marks).
* though. bool
*/
bool
BUnicodeChar::IsBase(uint32 c) BUnicodeChar::IsBase(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return (FLAG(getCategory(getProperties(c))) return u_isbase(c);
& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER
| UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE
| UF_MODIFIER_LETTER | UF_OTHER_LETTER | FLAG(B_UNICODE_NON_SPACING_MARK)
| FLAG(B_UNICODE_ENCLOSING_MARK) | FLAG(B_UNICODE_COMBINING_SPACING_MARK))
) != 0;
} }
/** Returns true if the specified unicode character is a // Determines whether the specified code point is a control character
* control character. // (as defined by this function).
*/ // A control character is one of the following:
// - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f)
bool // - U_CONTROL_CHAR (Cc)
// - U_FORMAT_CHAR (Cf)
// - U_LINE_SEPARATOR (Zl)
// - U_PARAGRAPH_SEPARATOR (Zp)
bool
BUnicodeChar::IsControl(uint32 c) BUnicodeChar::IsControl(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return isISO8Control(c) return u_iscntrl(c);
|| (FLAG(getCategory(getProperties(c)))
& (FLAG(B_UNICODE_CONTROL_CHAR) | FLAG(B_UNICODE_FORMAT_CHAR)
| FLAG(B_UNICODE_LINE_SEPARATOR) | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
) != 0;
} }
/** Returns true if the specified unicode character is a // Determines whether the specified code point is a punctuation character.
* punctuation character. // True for characters with general categories "P" (punctuation).
*/
bool bool
BUnicodeChar::IsPunctuation(uint32 c) BUnicodeChar::IsPunctuation(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return (FLAG(getCategory(getProperties(c))) return u_ispunct(c);
& (FLAG(B_UNICODE_DASH_PUNCTUATION)
| FLAG(B_UNICODE_START_PUNCTUATION)
| FLAG(B_UNICODE_END_PUNCTUATION)
| FLAG(B_UNICODE_CONNECTOR_PUNCTUATION)
| FLAG(B_UNICODE_OTHER_PUNCTUATION))
) != 0;
} }
/** Returns true if the specified unicode character is some // Determine if the specified code point is a space character according to Java.
* kind of a space character. // True for characters with general categories "Z" (separators),
*/ // which does not include control codes (e.g., TAB or Line Feed).
bool
bool
BUnicodeChar::IsSpace(uint32 c) BUnicodeChar::IsSpace(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return (FLAG(getCategory(getProperties(c))) return u_isJavaSpaceChar(c);
& (FLAG(B_UNICODE_SPACE_SEPARATOR)
| FLAG(B_UNICODE_LINE_SEPARATOR)
| FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
) != 0;
} }
/** Returns true if the specified unicode character is a white // Determines if the specified code point is a whitespace character
* space character. // A character is considered to be a whitespace character if and only
* This is essentially the same as IsSpace(), but excludes all // if it satisfies one of the following criteria:
* non-breakable spaces. // - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"),
*/ // but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space
// or U+202F Narrow NBSP).
bool // - It is U+0009 HORIZONTAL TABULATION.
// - It is U+000A LINE FEED.
// - It is U+000B VERTICAL TABULATION.
// - It is U+000C FORM FEED.
// - It is U+000D CARRIAGE RETURN.
// - It is U+001C FILE SEPARATOR.
// - It is U+001D GROUP SEPARATOR.
// - It is U+001E RECORD SEPARATOR.
// - It is U+001F UNIT SEPARATOR.
bool
BUnicodeChar::IsWhitespace(uint32 c) BUnicodeChar::IsWhitespace(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return (FLAG(getCategory(getProperties(c))) return u_isWhitespace(c);
& (FLAG(B_UNICODE_SPACE_SEPARATOR)
| FLAG(B_UNICODE_LINE_SEPARATOR)
| FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
) != 0 && c != 0xa0 && c != 0x202f && c != 0xfeff; // exclude non-breakable spaces
} }
/** Returns true if the specified unicode character is printable. // Determines whether the specified code point is a printable character.
*/ // True for general categories other than "C" (controls).
bool
bool
BUnicodeChar::IsPrintable(uint32 c) BUnicodeChar::IsPrintable(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return !isISO8Control(c) return u_isprint(c);
&& (FLAG(getCategory(getProperties(c)))
& ~(FLAG(B_UNICODE_UNASSIGNED) | FLAG(B_UNICODE_CONTROL_CHAR)
| FLAG(B_UNICODE_FORMAT_CHAR) | FLAG(B_UNICODE_PRIVATE_USE_CHAR)
| FLAG(B_UNICODE_SURROGATE) | FLAG(B_UNICODE_GENERAL_OTHER_TYPES)
| FLAG(31))
) != 0;
} }
// #pragma mark - // #pragma mark -
// Transforms the specified unicode character to lowercase.
// The mapping is delegated to ICU's u_tolower(); the former hand-rolled
// properties/exceptions table lookup is no longer used.
uint32
BUnicodeChar::ToLower(uint32 c)
{
	// NOTE(review): the discarded temporary is presumably kept for
	// constructor side effects - confirm.
	BUnicodeChar();
	return u_tolower(c);
}
// Transforms the specified unicode character to uppercase.
// The mapping is delegated to ICU's u_toupper(); the former hand-rolled
// properties/exceptions table lookup is no longer used.
uint32
BUnicodeChar::ToUpper(uint32 c)
{
	// NOTE(review): the discarded temporary is presumably kept for
	// constructor side effects - confirm.
	BUnicodeChar();
	return u_toupper(c);
}
// Transforms the specified unicode character to title case.
// The mapping is delegated to ICU's u_totitle(); the former hand-rolled
// lookup (which fell back to the uppercase mapping) is no longer used.
uint32
BUnicodeChar::ToTitle(uint32 c)
{
	// NOTE(review): the discarded temporary is presumably kept for
	// constructor side effects - confirm.
	BUnicodeChar();
	return u_totitle(c);
}
int32 int32
BUnicodeChar::DigitValue(uint32 c) BUnicodeChar::DigitValue(uint32 c)
{ {
BUnicodeChar(); BUnicodeChar();
return u_digit(c, 10);
}
uint32 props = getProperties(c);
if (!propertyIsException(props)) { unicode_east_asian_width
if (getCategory(props) == B_UNICODE_DECIMAL_DIGIT_NUMBER) BUnicodeChar::EastAsianWidth(uint32 c)
return getSignedValue(props); {
} else { return (unicode_east_asian_width)u_getIntPropertyValue(c,
uint32 *exceptions = getExceptions(props); UCHAR_EAST_ASIAN_WIDTH);
uint32 firstExceptionValue = *exceptions;
if (haveExceptionValue(firstExceptionValue, EXC_DIGIT_VALUE)) {
int16 index = EXC_DIGIT_VALUE;
addExceptionOffset(firstExceptionValue, index, &++exceptions);
int32 value = (int32)(int16)*exceptions;
// the digit value is in the lower 16 bits
if (value != -1)
return value;
}
}
// If there is no value in the properties table,
// then check for some special characters
switch (c) {
case 0x3007: return 0;
case 0x4e00: return 1;
case 0x4e8c: return 2;
case 0x4e09: return 3;
case 0x56d8: return 4;
case 0x4e94: return 5;
case 0x516d: return 6;
case 0x4e03: return 7;
case 0x516b: return 8;
case 0x4e5d: return 9;
default: return -1;
}
} }
void void
BUnicodeChar::ToUTF8(uint32 c, char **out) BUnicodeChar::ToUTF8(uint32 c, char **out)
{ {
char *s = *out; int i = 0;
U8_APPEND_UNSAFE(*out, i, c);
if (c < 0x80)
*(s++) = c;
else if (c < 0x800) {
*(s++) = 0xc0 | (c >> 6);
*(s++) = 0x80 | (c & 0x3f);
} else if (c < 0x10000) {
*(s++) = 0xe0 | (c >> 12);
*(s++) = 0x80 | ((c >> 6) & 0x3f);
*(s++) = 0x80 | (c & 0x3f);
} else if (c <= 0x10ffff) {
*(s++) = 0xf0 | (c >> 18);
*(s++) = 0x80 | ((c >> 12) & 0x3f);
*(s++) = 0x80 | ((c >> 6) & 0x3f);
*(s++) = 0x80 | (c & 0x3f);
}
*out = s;
} }
// Decodes one UTF-8 encoded code point starting at *in, advances *in past
// the bytes that were consumed and returns the decoded code point.
// Returns 0 without touching *in if *in is NULL.
//
// Decoding is done by ICU's U8_NEXT_UNSAFE, which performs no validation:
// the input is expected to be well-formed UTF-8.
uint32
BUnicodeChar::FromUTF8(const char **in)
{
	// Keep the NULL guard of the previous implementation.
	if (*in == NULL)
		return 0;

	int i = 0;
	uint32 c = 0;

	// U8_NEXT_UNSAFE (unlike U8_GET_UNSAFE) post-increments the index, which
	// tells us how many bytes the sequence occupied.
	U8_NEXT_UNSAFE(*in, i, c);
	*in += i;

	return c;
}
size_t size_t
BUnicodeChar::UTF8StringLength(const char *str) BUnicodeChar::UTF8StringLength(const char *str)
{ {
@ -623,6 +269,7 @@ BUnicodeChar::UTF8StringLength(const char *str)
return len; return len;
} }
size_t size_t
BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength) BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength)
{ {
@ -633,4 +280,3 @@ BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength)
} }
return len; return len;
} }