Switch BUnicodeChar to wrap the ICU's UChar32 one
Improve the Unicode character processing and classification routines by wrapping the UChar32 functions from ICU. This fixes the functional regression introduced in hrev38017 and makes it possible to fix the East Asian Width problems in the Terminal.
This commit is contained in:
parent
4e17bdd83f
commit
b6fd91b409
@ -8,6 +8,7 @@ enum unicode_char_category
|
||||
// Non-category for unassigned and non-character code points.
|
||||
B_UNICODE_UNASSIGNED = 0,
|
||||
|
||||
B_UNICODE_GENERAL_OTHER_TYPES = 0, // Cn
|
||||
B_UNICODE_UPPERCASE_LETTER = 1, // Lu
|
||||
B_UNICODE_LOWERCASE_LETTER = 2, // Ll
|
||||
B_UNICODE_TITLECASE_LETTER = 3, // Lt
|
||||
@ -37,152 +38,289 @@ enum unicode_char_category
|
||||
B_UNICODE_OTHER_SYMBOL = 27, // So
|
||||
B_UNICODE_INITIAL_PUNCTUATION = 28, // Pi
|
||||
B_UNICODE_FINAL_PUNCTUATION = 29, // Pf
|
||||
B_UNICODE_GENERAL_OTHER_TYPES = 30, // Cn
|
||||
|
||||
B_UNICODE_CATEGORY_COUNT
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* This specifies the language directional property of a character set.
|
||||
*/
|
||||
// This specifies the language directional property of a character set.
|
||||
|
||||
enum unicode_char_direction {
|
||||
B_UNICODE_LEFT_TO_RIGHT = 0,
|
||||
B_UNICODE_RIGHT_TO_LEFT = 1,
|
||||
B_UNICODE_EUROPEAN_NUMBER = 2,
|
||||
B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3,
|
||||
B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4,
|
||||
B_UNICODE_ARABIC_NUMBER = 5,
|
||||
B_UNICODE_COMMON_NUMBER_SEPARATOR = 6,
|
||||
B_UNICODE_BLOCK_SEPARATOR = 7,
|
||||
B_UNICODE_SEGMENT_SEPARATOR = 8,
|
||||
B_UNICODE_WHITE_SPACE_NEUTRAL = 9,
|
||||
B_UNICODE_OTHER_NEUTRAL = 10,
|
||||
B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11,
|
||||
B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12,
|
||||
B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13,
|
||||
B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14,
|
||||
B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15,
|
||||
B_UNICODE_POP_DIRECTIONAL_FORMAT = 16,
|
||||
B_UNICODE_DIR_NON_SPACING_MARK = 17,
|
||||
B_UNICODE_BOUNDARY_NEUTRAL = 18,
|
||||
B_UNICODE_LEFT_TO_RIGHT = 0,
|
||||
B_UNICODE_RIGHT_TO_LEFT = 1,
|
||||
B_UNICODE_EUROPEAN_NUMBER = 2,
|
||||
B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3,
|
||||
B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4,
|
||||
B_UNICODE_ARABIC_NUMBER = 5,
|
||||
B_UNICODE_COMMON_NUMBER_SEPARATOR = 6,
|
||||
B_UNICODE_BLOCK_SEPARATOR = 7,
|
||||
B_UNICODE_SEGMENT_SEPARATOR = 8,
|
||||
B_UNICODE_WHITE_SPACE_NEUTRAL = 9,
|
||||
B_UNICODE_OTHER_NEUTRAL = 10,
|
||||
B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11,
|
||||
B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12,
|
||||
B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13,
|
||||
B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14,
|
||||
B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15,
|
||||
B_UNICODE_POP_DIRECTIONAL_FORMAT = 16,
|
||||
B_UNICODE_DIR_NON_SPACING_MARK = 17,
|
||||
B_UNICODE_BOUNDARY_NEUTRAL = 18,
|
||||
|
||||
B_UNICODE_DIRECTION_COUNT
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Script range as defined in the Unicode standard.
|
||||
*/
|
||||
// Script range as defined in the Unicode standard.
|
||||
|
||||
enum unicode_char_script {
|
||||
// Script names
|
||||
B_UNICODE_BASIC_LATIN,
|
||||
B_UNICODE_LATIN_1_SUPPLEMENT,
|
||||
B_UNICODE_LATIN_EXTENDED_A,
|
||||
B_UNICODE_LATIN_EXTENDED_B,
|
||||
B_UNICODE_IPA_EXTENSIONS,
|
||||
B_UNICODE_SPACING_MODIFIER_LETTERS,
|
||||
B_UNICODE_COMBINING_DIACRITICAL_MARKS,
|
||||
B_UNICODE_GREEK,
|
||||
B_UNICODE_CYRILLIC,
|
||||
B_UNICODE_ARMENIAN,
|
||||
B_UNICODE_HEBREW,
|
||||
B_UNICODE_ARABIC,
|
||||
B_UNICODE_SYRIAC,
|
||||
B_UNICODE_THAANA,
|
||||
B_UNICODE_DEVANAGARI,
|
||||
B_UNICODE_BENGALI,
|
||||
B_UNICODE_GURMUKHI,
|
||||
B_UNICODE_GUJARATI,
|
||||
B_UNICODE_ORIYA,
|
||||
B_UNICODE_TAMIL,
|
||||
B_UNICODE_TELUGU,
|
||||
B_UNICODE_KANNADA,
|
||||
B_UNICODE_MALAYALAM,
|
||||
B_UNICODE_SINHALA,
|
||||
B_UNICODE_THAI,
|
||||
B_UNICODE_LAO,
|
||||
B_UNICODE_TIBETAN,
|
||||
B_UNICODE_MYANMAR,
|
||||
B_UNICODE_GEORGIAN,
|
||||
B_UNICODE_HANGUL_JAMO,
|
||||
B_UNICODE_ETHIOPIC,
|
||||
B_UNICODE_CHEROKEE,
|
||||
B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
|
||||
B_UNICODE_OGHAM,
|
||||
B_UNICODE_RUNIC,
|
||||
B_UNICODE_KHMER,
|
||||
B_UNICODE_MONGOLIAN,
|
||||
B_UNICODE_LATIN_EXTENDED_ADDITIONAL,
|
||||
B_UNICODE_GREEK_EXTENDED,
|
||||
B_UNICODE_GENERAL_PUNCTUATION,
|
||||
B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS,
|
||||
B_UNICODE_CURRENCY_SYMBOLS,
|
||||
B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS,
|
||||
B_UNICODE_LETTERLIKE_SYMBOLS,
|
||||
B_UNICODE_NUMBER_FORMS,
|
||||
B_UNICODE_ARROWS,
|
||||
B_UNICODE_MATHEMATICAL_OPERATORS,
|
||||
B_UNICODE_MISCELLANEOUS_TECHNICAL,
|
||||
B_UNICODE_CONTROL_PICTURES,
|
||||
B_UNICODE_OPTICAL_CHARACTER_RECOGNITION,
|
||||
B_UNICODE_ENCLOSED_ALPHANUMERICS,
|
||||
B_UNICODE_BOX_DRAWING,
|
||||
B_UNICODE_BLOCK_ELEMENTS,
|
||||
B_UNICODE_GEOMETRIC_SHAPES,
|
||||
B_UNICODE_MISCELLANEOUS_SYMBOLS,
|
||||
B_UNICODE_DINGBATS,
|
||||
B_UNICODE_BRAILLE_PATTERNS,
|
||||
B_UNICODE_CJK_RADICALS_SUPPLEMENT,
|
||||
B_UNICODE_KANGXI_RADICALS,
|
||||
B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
|
||||
B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION,
|
||||
B_UNICODE_HIRAGANA,
|
||||
B_UNICODE_KATAKANA,
|
||||
B_UNICODE_BOPOMOFO,
|
||||
B_UNICODE_HANGUL_COMPATIBILITY_JAMO,
|
||||
B_UNICODE_KANBUN,
|
||||
B_UNICODE_BOPOMOFO_EXTENDED,
|
||||
B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS,
|
||||
B_UNICODE_CJK_COMPATIBILITY,
|
||||
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
|
||||
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS,
|
||||
B_UNICODE_YI_SYLLABLES,
|
||||
B_UNICODE_YI_RADICALS,
|
||||
B_UNICODE_HANGUL_SYLLABLES,
|
||||
B_UNICODE_HIGH_SURROGATES,
|
||||
B_UNICODE_HIGH_PRIVATE_USE_SURROGATES,
|
||||
B_UNICODE_LOW_SURROGATES,
|
||||
B_UNICODE_PRIVATE_USE_AREA,
|
||||
B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS,
|
||||
B_UNICODE_ALPHABETIC_PRESENTATION_FORMS,
|
||||
B_UNICODE_ARABIC_PRESENTATION_FORMS_A,
|
||||
B_UNICODE_COMBINING_HALF_MARKS,
|
||||
B_UNICODE_CJK_COMPATIBILITY_FORMS,
|
||||
B_UNICODE_SMALL_FORM_VARIANTS,
|
||||
B_UNICODE_ARABIC_PRESENTATION_FORMS_B,
|
||||
B_UNICODE_SPECIALS,
|
||||
B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS,
|
||||
// New No_Block value in Unicode 4.
|
||||
B_UNICODE_NO_BLOCK = 0, // [none] Special range
|
||||
B_UNICODE_BASIC_LATIN = 1, // [0000]
|
||||
B_UNICODE_LATIN_1_SUPPLEMENT = 2, // [0080]
|
||||
B_UNICODE_LATIN_EXTENDED_A = 3, // [0100]
|
||||
B_UNICODE_LATIN_EXTENDED_B = 4, // [0180]
|
||||
B_UNICODE_IPA_EXTENSIONS = 5, // [0250]
|
||||
B_UNICODE_SPACING_MODIFIER_LETTERS = 6, // [02B0]
|
||||
B_UNICODE_COMBINING_DIACRITICAL_MARKS = 7, // [0300]
|
||||
B_UNICODE_GREEK = 8, // [0370]
|
||||
B_UNICODE_CYRILLIC = 9, // [0400]
|
||||
B_UNICODE_ARMENIAN = 10, // [0530]
|
||||
B_UNICODE_HEBREW = 11, // [0590]
|
||||
B_UNICODE_ARABIC = 12, // [0600]
|
||||
B_UNICODE_SYRIAC = 13, // [0700]
|
||||
B_UNICODE_THAANA = 14, // [0780]
|
||||
B_UNICODE_DEVANAGARI = 15, // [0900]
|
||||
B_UNICODE_BENGALI = 16, // [0980]
|
||||
B_UNICODE_GURMUKHI = 17, // [0A00]
|
||||
B_UNICODE_GUJARATI = 18, // [0A80]
|
||||
B_UNICODE_ORIYA = 19, // [0B00]
|
||||
B_UNICODE_TAMIL = 20, // [0B80]
|
||||
B_UNICODE_TELUGU = 21, // [0C00]
|
||||
B_UNICODE_KANNADA = 22, // [0C80]
|
||||
B_UNICODE_MALAYALAM = 23, // [0D00]
|
||||
B_UNICODE_SINHALA = 24, // [0D80]
|
||||
B_UNICODE_THAI = 25, // [0E00]
|
||||
B_UNICODE_LAO = 26, // [0E80]
|
||||
B_UNICODE_TIBETAN = 27, // [0F00]
|
||||
B_UNICODE_MYANMAR = 28, // [1000]
|
||||
B_UNICODE_GEORGIAN = 29, // [10A0]
|
||||
B_UNICODE_HANGUL_JAMO = 30, // [1100]
|
||||
B_UNICODE_ETHIOPIC = 31, // [1200]
|
||||
B_UNICODE_CHEROKEE = 32, // [13A0]
|
||||
B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, // [1400]
|
||||
B_UNICODE_OGHAM = 34, // [1680]
|
||||
B_UNICODE_RUNIC = 35, // [16A0]
|
||||
B_UNICODE_KHMER = 36, // [1780]
|
||||
B_UNICODE_MONGOLIAN = 37, // [1800]
|
||||
B_UNICODE_LATIN_EXTENDED_ADDITIONAL = 38, // [1E00]
|
||||
B_UNICODE_GREEK_EXTENDED = 39, // [1F00]
|
||||
B_UNICODE_GENERAL_PUNCTUATION = 40, // [2000]
|
||||
B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS = 41, // [2070]
|
||||
B_UNICODE_CURRENCY_SYMBOLS = 42, // [20A0]
|
||||
B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS = 43, // [20D0]
|
||||
B_UNICODE_LETTERLIKE_SYMBOLS = 44, // [2100]
|
||||
B_UNICODE_NUMBER_FORMS = 45, // [2150]
|
||||
B_UNICODE_ARROWS = 46, // [2190]
|
||||
B_UNICODE_MATHEMATICAL_OPERATORS = 47, // [2200]
|
||||
B_UNICODE_MISCELLANEOUS_TECHNICAL = 48, // [2300]
|
||||
B_UNICODE_CONTROL_PICTURES = 49, // [2400]
|
||||
B_UNICODE_OPTICAL_CHARACTER_RECOGNITION = 50, // [2440]
|
||||
B_UNICODE_ENCLOSED_ALPHANUMERICS = 51, // [2460]
|
||||
B_UNICODE_BOX_DRAWING = 52, // [2500]
|
||||
B_UNICODE_BLOCK_ELEMENTS = 53, // [2580]
|
||||
B_UNICODE_GEOMETRIC_SHAPES = 54, // [25A0]
|
||||
B_UNICODE_MISCELLANEOUS_SYMBOLS = 55, // [2600]
|
||||
B_UNICODE_DINGBATS = 56, // [2700]
|
||||
B_UNICODE_BRAILLE_PATTERNS = 57, // [2800]
|
||||
B_UNICODE_CJK_RADICALS_SUPPLEMENT = 58, // [2E80]
|
||||
B_UNICODE_KANGXI_RADICALS = 59, // [2F00]
|
||||
B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, // [2FF0]
|
||||
B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION = 61, // [3000]
|
||||
B_UNICODE_HIRAGANA = 62, // [3040]
|
||||
B_UNICODE_KATAKANA = 63, // [30A0]
|
||||
B_UNICODE_BOPOMOFO = 64, // [3100]
|
||||
B_UNICODE_HANGUL_COMPATIBILITY_JAMO = 65, // [3130]
|
||||
B_UNICODE_KANBUN = 66, // [3190]
|
||||
B_UNICODE_BOPOMOFO_EXTENDED = 67, // [31A0]
|
||||
B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, // [3200]
|
||||
B_UNICODE_CJK_COMPATIBILITY = 69, // [3300]
|
||||
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, // [3400]
|
||||
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS = 71, // [4E00]
|
||||
B_UNICODE_YI_SYLLABLES = 72, // [A000]
|
||||
B_UNICODE_YI_RADICALS = 73, // [A490]
|
||||
B_UNICODE_HANGUL_SYLLABLES = 74, // [AC00]
|
||||
B_UNICODE_HIGH_SURROGATES = 75, // [D800]
|
||||
B_UNICODE_HIGH_PRIVATE_USE_SURROGATES = 76, // [DB80]
|
||||
B_UNICODE_LOW_SURROGATES = 77, // [DC00]
|
||||
B_UNICODE_PRIVATE_USE = 78,
|
||||
B_UNICODE_PRIVATE_USE_AREA = B_UNICODE_PRIVATE_USE, // [E000]
|
||||
B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS = 79, // [F900]
|
||||
B_UNICODE_ALPHABETIC_PRESENTATION_FORMS = 80, // [FB00]
|
||||
B_UNICODE_ARABIC_PRESENTATION_FORMS_A = 81, // [FB50]
|
||||
B_UNICODE_COMBINING_HALF_MARKS = 82, // [FE20]
|
||||
B_UNICODE_CJK_COMPATIBILITY_FORMS = 83, // [FE30]
|
||||
B_UNICODE_SMALL_FORM_VARIANTS = 84, // [FE50]
|
||||
B_UNICODE_ARABIC_PRESENTATION_FORMS_B = 85, // [FE70]
|
||||
B_UNICODE_SPECIALS = 86, // [FFF0]
|
||||
B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS = 87, // [FF00]
|
||||
|
||||
B_UNICODE_SCRIPT_COUNT,
|
||||
B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT
|
||||
// New blocks in Unicode 3.1
|
||||
B_UNICODE_OLD_ITALIC = 88, // [10300]
|
||||
B_UNICODE_GOTHIC = 89, // [10330]
|
||||
B_UNICODE_DESERET = 90, // [10400]
|
||||
B_UNICODE_BYZANTINE_MUSICAL_SYMBOLS = 91, // [1D000]
|
||||
B_UNICODE_MUSICAL_SYMBOLS = 92, // [1D100]
|
||||
B_UNICODE_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, // [1D400]
|
||||
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94, // [20000]
|
||||
B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, // [2F800]
|
||||
B_UNICODE_TAGS = 96, // [E0000]
|
||||
|
||||
// New blocks in Unicode 3.2
|
||||
B_UNICODE_CYRILLIC_SUPPLEMENTARY = 97,
|
||||
B_UNICODE_CYRILLIC_SUPPLEMENT = B_UNICODE_CYRILLIC_SUPPLEMENTARY, // [0500]
|
||||
B_UNICODE_TAGALOG = 98, // [1700]
|
||||
B_UNICODE_HANUNOO = 99, // [1720]
|
||||
B_UNICODE_BUHID = 100, // [1740]
|
||||
B_UNICODE_TAGBANWA = 101, // [1760]
|
||||
B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102, // [27C0]
|
||||
B_UNICODE_SUPPLEMENTAL_ARROWS_A = 103, // [27F0]
|
||||
B_UNICODE_SUPPLEMENTAL_ARROWS_B = 104, // [2900]
|
||||
B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105, // [2980]
|
||||
B_UNICODE_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106, // [2A00]
|
||||
B_UNICODE_KATAKANA_PHONETIC_EXTENSIONS = 107, // [31F0]
|
||||
B_UNICODE_VARIATION_SELECTORS = 108, // [FE00]
|
||||
B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109, // [F0000]
|
||||
B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110, // [100000]
|
||||
|
||||
// New blocks in Unicode 4
|
||||
B_UNICODE_LIMBU = 111, // [1900]
|
||||
B_UNICODE_TAI_LE = 112, // [1950]
|
||||
B_UNICODE_KHMER_SYMBOLS = 113, // [19E0]
|
||||
B_UNICODE_PHONETIC_EXTENSIONS = 114, // [1D00]
|
||||
B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115, // [2B00]
|
||||
B_UNICODE_YIJING_HEXAGRAM_SYMBOLS = 116, // [4DC0]
|
||||
B_UNICODE_LINEAR_B_SYLLABARY = 117, // [10000]
|
||||
B_UNICODE_LINEAR_B_IDEOGRAMS = 118, // [10080]
|
||||
B_UNICODE_AEGEAN_NUMBERS = 119, // [10100]
|
||||
B_UNICODE_UGARITIC = 120, // [10380]
|
||||
B_UNICODE_SHAVIAN = 121, // [10450]
|
||||
B_UNICODE_OSMANYA = 122, // [10480]
|
||||
B_UNICODE_CYPRIOT_SYLLABARY = 123, // [10800]
|
||||
B_UNICODE_TAI_XUAN_JING_SYMBOLS = 124, // [1D300]
|
||||
B_UNICODE_VARIATION_SELECTORS_SUPPLEMENT = 125, // [E0100]
|
||||
|
||||
// New blocks in Unicode 4.1
|
||||
B_UNICODE_ANCIENT_GREEK_MUSICAL_NOTATION = 126, // [1D200]
|
||||
B_UNICODE_ANCIENT_GREEK_NUMBERS = 127, // [10140]
|
||||
B_UNICODE_ARABIC_SUPPLEMENT = 128, // [0750]
|
||||
B_UNICODE_BUGINESE = 129, // [1A00]
|
||||
B_UNICODE_CJK_STROKES = 130, // [31C0]
|
||||
B_UNICODE_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131, // [1DC0]
|
||||
B_UNICODE_COPTIC = 132, // [2C80]
|
||||
B_UNICODE_ETHIOPIC_EXTENDED = 133, // [2D80]
|
||||
B_UNICODE_ETHIOPIC_SUPPLEMENT = 134, // [1380]
|
||||
B_UNICODE_GEORGIAN_SUPPLEMENT = 135, // [2D00]
|
||||
B_UNICODE_GLAGOLITIC = 136, // [2C00]
|
||||
B_UNICODE_KHAROSHTHI = 137, // [10A00]
|
||||
B_UNICODE_MODIFIER_TONE_LETTERS = 138, // [A700]
|
||||
B_UNICODE_NEW_TAI_LUE = 139, // [1980]
|
||||
B_UNICODE_OLD_PERSIAN = 140, // [103A0]
|
||||
B_UNICODE_PHONETIC_EXTENSIONS_SUPPLEMENT = 141, // [1D80]
|
||||
B_UNICODE_SUPPLEMENTAL_PUNCTUATION = 142, // [2E00]
|
||||
B_UNICODE_SYLOTI_NAGRI = 143, // [A800]
|
||||
B_UNICODE_TIFINAGH = 144, // [2D30]
|
||||
B_UNICODE_VERTICAL_FORMS = 145, // [FE10]
|
||||
|
||||
// New blocks in Unicode 5.0
|
||||
B_UNICODE_NKO = 146, // [07C0]
|
||||
B_UNICODE_BALINESE = 147, // [1B00]
|
||||
B_UNICODE_LATIN_EXTENDED_C = 148, // [2C60]
|
||||
B_UNICODE_LATIN_EXTENDED_D = 149, // [A720]
|
||||
B_UNICODE_PHAGS_PA = 150, // [A840]
|
||||
B_UNICODE_PHOENICIAN = 151, // [10900]
|
||||
B_UNICODE_CUNEIFORM = 152, // [12000]
|
||||
B_UNICODE_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153, // [12400]
|
||||
B_UNICODE_COUNTING_ROD_NUMERALS = 154, // [1D360]
|
||||
|
||||
// New blocks in Unicode 5.1
|
||||
B_UNICODE_SUNDANESE = 155, // [1B80]
|
||||
B_UNICODE_LEPCHA = 156, // [1C00]
|
||||
B_UNICODE_OL_CHIKI = 157, // [1C50]
|
||||
B_UNICODE_CYRILLIC_EXTENDED_A = 158, // [2DE0]
|
||||
B_UNICODE_VAI = 159, // [A500]
|
||||
B_UNICODE_CYRILLIC_EXTENDED_B = 160, // [A640]
|
||||
B_UNICODE_SAURASHTRA = 161, // [A880]
|
||||
B_UNICODE_KAYAH_LI = 162, // [A900]
|
||||
B_UNICODE_REJANG = 163, // [A930]
|
||||
B_UNICODE_CHAM = 164, // [AA00]
|
||||
B_UNICODE_ANCIENT_SYMBOLS = 165, // [10190]
|
||||
B_UNICODE_PHAISTOS_DISC = 166, // [101D0]
|
||||
B_UNICODE_LYCIAN = 167, // [10280]
|
||||
B_UNICODE_CARIAN = 168, // [102A0]
|
||||
B_UNICODE_LYDIAN = 169, // [10920]
|
||||
B_UNICODE_MAHJONG_TILES = 170, // [1F000]
|
||||
B_UNICODE_DOMINO_TILES = 171, // [1F030]
|
||||
|
||||
// New blocks in Unicode 5.2
|
||||
B_UNICODE_SAMARITAN = 172, // [0800]
|
||||
B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173, // [18B0]
|
||||
B_UNICODE_TAI_THAM = 174, // [1A20]
|
||||
B_UNICODE_VEDIC_EXTENSIONS = 175, // [1CD0]
|
||||
B_UNICODE_LISU = 176, // [A4D0]
|
||||
B_UNICODE_BAMUM = 177, // [A6A0]
|
||||
B_UNICODE_COMMON_INDIC_NUMBER_FORMS = 178, // [A830]
|
||||
B_UNICODE_DEVANAGARI_EXTENDED = 179, // [A8E0]
|
||||
B_UNICODE_HANGUL_JAMO_EXTENDED_A = 180, // [A960]
|
||||
B_UNICODE_JAVANESE = 181, // [A980]
|
||||
B_UNICODE_MYANMAR_EXTENDED_A = 182, // [AA60]
|
||||
B_UNICODE_TAI_VIET = 183, // [AA80]
|
||||
B_UNICODE_MEETEI_MAYEK = 184, // [ABC0]
|
||||
B_UNICODE_HANGUL_JAMO_EXTENDED_B = 185, // [D7B0]
|
||||
B_UNICODE_IMPERIAL_ARAMAIC = 186, // [10840]
|
||||
B_UNICODE_OLD_SOUTH_ARABIAN = 187, // [10A60]
|
||||
B_UNICODE_AVESTAN = 188, // [10B00]
|
||||
B_UNICODE_INSCRIPTIONAL_PARTHIAN = 189, // [10B40]
|
||||
B_UNICODE_INSCRIPTIONAL_PAHLAVI = 190, // [10B60]
|
||||
B_UNICODE_OLD_TURKIC = 191, // [10C00]
|
||||
B_UNICODE_RUMI_NUMERAL_SYMBOLS = 192, // [10E60]
|
||||
B_UNICODE_KAITHI = 193, // [11080]
|
||||
B_UNICODE_EGYPTIAN_HIEROGLYPHS = 194, // [13000]
|
||||
B_UNICODE_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195, // [1F100]
|
||||
B_UNICODE_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196, // [1F200]
|
||||
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197, // [2A700]
|
||||
|
||||
// New blocks in Unicode 6.0
|
||||
B_UNICODE_MANDAIC = 198, // [0840]
|
||||
B_UNICODE_BATAK = 199, // [1BC0]
|
||||
B_UNICODE_ETHIOPIC_EXTENDED_A = 200, // [AB00]
|
||||
B_UNICODE_BRAHMI = 201, // [11000]
|
||||
B_UNICODE_BAMUM_SUPPLEMENT = 202, // [16800]
|
||||
B_UNICODE_KANA_SUPPLEMENT = 203, // [1B000]
|
||||
B_UNICODE_PLAYING_CARDS = 204, // [1F0A0]
|
||||
B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205, // [1F300]
|
||||
B_UNICODE_EMOTICONS = 206, // [1F600]
|
||||
B_UNICODE_TRANSPORT_AND_MAP_SYMBOLS = 207, // [1F680]
|
||||
B_UNICODE_ALCHEMICAL_SYMBOLS = 208, // [1F700]
|
||||
B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209, // [2B740]
|
||||
|
||||
B_UNICODE_SCRIPT_COUNT = 210,
|
||||
B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT,
|
||||
|
||||
B_UNICODE_INVALID_CODE = -1
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Values returned by the u_getCellWidth() function.
|
||||
*/
|
||||
// East Asian Width constants.
|
||||
|
||||
enum unicode_cell_width
|
||||
enum unicode_east_asian_width
|
||||
{
|
||||
B_UNICODE_ZERO_WIDTH = 0,
|
||||
B_UNICODE_HALF_WIDTH = 1,
|
||||
B_UNICODE_FULL_WIDTH = 2,
|
||||
B_UNICODE_NEUTRAL_WIDTH = 3,
|
||||
|
||||
B_UNICODE_CELL_WIDTH_COUNT
|
||||
B_UNICODE_EA_NEUTRAL, // [N]
|
||||
B_UNICODE_EA_AMBIGUOUS, // [A]
|
||||
B_UNICODE_EA_HALFWIDTH, // [H]
|
||||
B_UNICODE_EA_FULLWIDTH, // [F]
|
||||
B_UNICODE_EA_NARROW, // [Na]
|
||||
B_UNICODE_EA_WIDE, // [W]
|
||||
B_UNICODE_EA_COUNT
|
||||
};
|
||||
|
||||
|
||||
@ -209,6 +347,7 @@ class BUnicodeChar {
|
||||
static uint32 ToUpper(uint32 c);
|
||||
static uint32 ToTitle(uint32 c);
|
||||
static int32 DigitValue(uint32 c);
|
||||
static unicode_east_asian_width EastAsianWidth(uint32 c);
|
||||
|
||||
static void ToUTF8(uint32 c, char **out);
|
||||
static uint32 FromUTF8(const char **in);
|
||||
@ -230,4 +369,4 @@ BUnicodeChar::FromUTF8(const char *in)
|
||||
}
|
||||
|
||||
|
||||
#endif /* _UNICODE_CHAR_H_ */
|
||||
#endif // _UNICODE_CHAR_H_
|
||||
|
@ -1,234 +1,18 @@
|
||||
/*
|
||||
** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
|
||||
** Distributed under the terms of the OpenBeOS License.
|
||||
*/
|
||||
|
||||
/* Reads the information out of the data files created by (an edited version of)
|
||||
* IBM's ICU genprops utility. The BUnicodeChar class is mostly the counterpart
|
||||
* to ICU's uchar module, but is not as huge or broad as that one.
|
||||
/*
|
||||
* Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
|
||||
* Distributed under the terms of the MIT License.
|
||||
*
|
||||
* Note, it probably won't be able to handle the output of the orginal genprops
|
||||
* tool and vice versa - only use the tool provided with this project to create
|
||||
* the Unicode property file.
|
||||
* However, the algorithmic idea behind the property file is still the same as
|
||||
* found in ICU - nothing important has been changed, so more recent versions
|
||||
* of genprops tool/data can probably be ported without too much effort.
|
||||
* Authors:
|
||||
* Axel Dörfler, axeld@pinc-software.de
|
||||
* Siarzhuk Zharski, zharik@gmx.li
|
||||
*
|
||||
* In case no property file can be found it will still provide basic services
|
||||
* for the Latin-1 part of the character tables.
|
||||
*/
|
||||
|
||||
|
||||
#include <OS.h>
|
||||
|
||||
#include <UnicodeChar.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#define FLAG(n) ((uint32)1 << (n))
|
||||
enum {
|
||||
UF_UPPERCASE = FLAG(B_UNICODE_UPPERCASE_LETTER),
|
||||
UF_LOWERCASE = FLAG(B_UNICODE_LOWERCASE_LETTER),
|
||||
UF_TITLECASE = FLAG(B_UNICODE_TITLECASE_LETTER),
|
||||
UF_MODIFIER_LETTER = FLAG(B_UNICODE_MODIFIER_LETTER),
|
||||
UF_OTHER_LETTER = FLAG(B_UNICODE_OTHER_LETTER),
|
||||
UF_DECIMAL_NUMBER = FLAG(B_UNICODE_DECIMAL_DIGIT_NUMBER),
|
||||
UF_OTHER_NUMBER = FLAG(B_UNICODE_OTHER_NUMBER),
|
||||
UF_LETTER_NUMBER = FLAG(B_UNICODE_LETTER_NUMBER)
|
||||
};
|
||||
|
||||
|
||||
static uint32 gStaticProps32Table[] = {
|
||||
/* 0x00 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x04 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x08 */ 0x48f, 0x20c, 0x1ce, 0x20c,
|
||||
/* 0x0c */ 0x24d, 0x1ce, 0x48f, 0x48f,
|
||||
/* 0x10 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x14 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x18 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x1c */ 0x1ce, 0x1ce, 0x1ce, 0x20c,
|
||||
/* 0x20 */ 0x24c, 0x297, 0x297, 0x117,
|
||||
/* 0x24 */ 0x119, 0x117, 0x297, 0x297,
|
||||
/* 0x28 */ 0x100a94, 0xfff00a95, 0x297, 0x118,
|
||||
/* 0x2c */ 0x197, 0x113, 0x197, 0xd7,
|
||||
/* 0x30 */ 0x89, 0x100089, 0x200089, 0x300089,
|
||||
/* 0x34 */ 0x400089, 0x500089, 0x600089, 0x700089,
|
||||
/* 0x38 */ 0x800089, 0x900089, 0x197, 0x297,
|
||||
/* 0x3c */ 0x200a98, 0x298, 0xffe00a98, 0x297,
|
||||
/* 0x40 */ 0x297, 0x2000001, 0x2000001, 0x2000001,
|
||||
/* 0x44 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
|
||||
/* 0x48 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
|
||||
/* 0x4c */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
|
||||
/* 0x50 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
|
||||
/* 0x54 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
|
||||
/* 0x58 */ 0x2000001, 0x2000001, 0x2000001, 0x200a94,
|
||||
/* 0x5c */ 0x297, 0xffe00a95, 0x29a, 0x296,
|
||||
/* 0x60 */ 0x29a, 0x2000002, 0x2000002, 0x2000002,
|
||||
/* 0x64 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
|
||||
/* 0x68 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
|
||||
/* 0x6c */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
|
||||
/* 0x70 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
|
||||
/* 0x74 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
|
||||
/* 0x78 */ 0x2000002, 0x2000002, 0x2000002, 0x200a94,
|
||||
/* 0x7c */ 0x298, 0xffe00a95, 0x298, 0x48f,
|
||||
/* 0x80 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x84 */ 0x48f, 0x1ce, 0x48f, 0x48f,
|
||||
/* 0x88 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x8c */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x90 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x94 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x98 */ 0x48f, 0x48f, 0x48f, 0x48f,
|
||||
/* 0x9c */ 0x48f, 0x48f, 0x48f, 0x48f
|
||||
};
|
||||
|
||||
enum {
|
||||
INDEX_STAGE_2_BITS,
|
||||
INDEX_STAGE_3_BITS,
|
||||
INDEX_EXCEPTIONS,
|
||||
INDEX_STAGE_3_INDEX,
|
||||
INDEX_PROPS,
|
||||
INDEX_UCHARS
|
||||
};
|
||||
|
||||
/* constants and macros for access to the data */
|
||||
enum {
|
||||
EXC_UPPERCASE,
|
||||
EXC_LOWERCASE,
|
||||
EXC_TITLECASE,
|
||||
EXC_DIGIT_VALUE,
|
||||
EXC_NUMERIC_VALUE,
|
||||
EXC_DENOMINATOR_VALUE,
|
||||
EXC_MIRROR_MAPPING,
|
||||
EXC_SPECIAL_CASING,
|
||||
EXC_CASE_FOLDING
|
||||
};
|
||||
|
||||
enum {
|
||||
EXCEPTION_SHIFT = 5,
|
||||
BIDI_SHIFT,
|
||||
MIRROR_SHIFT = BIDI_SHIFT + 5,
|
||||
VALUE_SHIFT = 20,
|
||||
|
||||
VALUE_BITS = 32 - VALUE_SHIFT
|
||||
};
|
||||
|
||||
/* number of bits in an 8-bit integer value */
|
||||
#define EXC_GROUP 8
|
||||
static uint8 gFlagsOffset[256] = {
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
#ifdef UCHAR_VARIABLE_TRIE_BITS
|
||||
// access values calculated from indices
|
||||
static uint16_t stage23Bits, stage2Mask, stage3Mask;
|
||||
# define sStage3Bits indexes[INDEX_STAGE_3_BITS]
|
||||
#else
|
||||
// Use hardcoded bit distribution for the trie table access
|
||||
# define sStage23Bits 10
|
||||
# define sStage2Mask 0x3f
|
||||
# define sStage3Mask 0xf
|
||||
# define sStage3Bits 4
|
||||
#endif
|
||||
|
||||
|
||||
/** We need to change the char category for ISO 8 controls, since the
|
||||
* genprops utility we got from IBM's ICU apparently changes it for
|
||||
* some characters.
|
||||
*/
|
||||
|
||||
static inline bool
|
||||
isISO8Control(uint32 c)
|
||||
{
|
||||
return ((uint32)c < 0x20 || (uint32)(c - 0x7f) <= 0x20);
|
||||
}
|
||||
|
||||
|
||||
static inline uint32
|
||||
getProperties(uint32 c)
|
||||
{
|
||||
if (c > 0x10ffff)
|
||||
return 0;
|
||||
|
||||
// TODO : Data from unicode
|
||||
|
||||
return c > 0x9f ? 0 : gStaticProps32Table[c];
|
||||
}
|
||||
|
||||
|
||||
static inline uint8
|
||||
getCategory(uint32 properties)
|
||||
{
|
||||
return properties & 0x1f;
|
||||
}
|
||||
|
||||
|
||||
static inline bool
|
||||
propertyIsException(uint32 properties)
|
||||
{
|
||||
return properties & (1UL << EXCEPTION_SHIFT);
|
||||
}
|
||||
|
||||
|
||||
static inline uint32
|
||||
getUnsignedValue(uint32 properties)
|
||||
{
|
||||
return properties >> VALUE_SHIFT;
|
||||
}
|
||||
|
||||
|
||||
static inline uint32
|
||||
getSignedValue(uint32 properties)
|
||||
{
|
||||
return (int32)properties >> VALUE_SHIFT;
|
||||
}
|
||||
|
||||
|
||||
static inline uint32 *
|
||||
getExceptions(uint32 properties)
|
||||
{
|
||||
// TODO : data from unicode
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static inline bool
|
||||
haveExceptionValue(uint32 flags,int16 index)
|
||||
{
|
||||
return flags & (1UL << index);
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
addExceptionOffset(uint32 &flags, int16 &index, uint32 **offset)
|
||||
{
|
||||
if (index >= EXC_GROUP) {
|
||||
*offset += gFlagsOffset[flags & ((1 << EXC_GROUP) - 1)];
|
||||
flags >>= EXC_GROUP;
|
||||
index -= EXC_GROUP;
|
||||
}
|
||||
*offset += gFlagsOffset[flags & ((1 << index) - 1)];
|
||||
}
|
||||
|
||||
|
||||
// #pragma mark -
|
||||
#include <unicode/uchar.h>
|
||||
#include <unicode/utf8.h>
|
||||
|
||||
|
||||
BUnicodeChar::BUnicodeChar()
|
||||
@ -236,382 +20,244 @@ BUnicodeChar::BUnicodeChar()
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
BUnicodeChar::IsAlpha(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return (FLAG(getCategory(getProperties(c)))
|
||||
& (UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
|
||||
) != 0;
|
||||
}
|
||||
|
||||
|
||||
/** Returns the type code of the specified unicode character */
|
||||
// Returns the general category value for the code point.
|
||||
int8
|
||||
BUnicodeChar::Type(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return (int8)getCategory(getProperties(c));
|
||||
return u_charType(c);
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
BUnicodeChar::IsLower(uint32 c)
|
||||
// Determines whether the specified code point is a letter character.
|
||||
// True for general categories "L" (letters).
|
||||
bool
|
||||
BUnicodeChar::IsAlpha(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return getCategory(getProperties(c)) == B_UNICODE_LOWERCASE_LETTER;
|
||||
return u_isalpha(c);
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
BUnicodeChar::IsUpper(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return getCategory(getProperties(c)) == B_UNICODE_UPPERCASE_LETTER;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
BUnicodeChar::IsTitle(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return getCategory(getProperties(c)) == B_UNICODE_TITLECASE_LETTER;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
BUnicodeChar::IsDigit(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return (FLAG(getCategory(getProperties(c)))
|
||||
& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER)
|
||||
) != 0;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
// Determines whether the specified code point is an alphanumeric character
|
||||
// (letter or digit).
|
||||
// True for characters with general categories
|
||||
// "L" (letters) and "Nd" (decimal digit numbers).
|
||||
bool
|
||||
BUnicodeChar::IsAlNum(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return (FLAG(getCategory(getProperties(c)))
|
||||
& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER | UF_UPPERCASE
|
||||
| UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
|
||||
) != 0;
|
||||
return u_isalnum(c);
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
// Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE).
|
||||
bool
|
||||
BUnicodeChar::IsLower(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return u_isULowercase(c);
|
||||
}
|
||||
|
||||
|
||||
// Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE).
|
||||
bool
|
||||
BUnicodeChar::IsUpper(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return u_isUUppercase(c);
|
||||
}
|
||||
|
||||
|
||||
// Determines whether the specified code point is a titlecase letter.
|
||||
// True for general category "Lt" (titlecase letter).
|
||||
bool
|
||||
BUnicodeChar::IsTitle(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return u_istitle(c);
|
||||
}
|
||||
|
||||
|
||||
// Determines whether the specified code point is a digit character.
|
||||
// True for characters with general category "Nd" (decimal digit numbers).
|
||||
// Beginning with Unicode 4, this is the same as
|
||||
// testing for the Numeric_Type of Decimal.
|
||||
bool
|
||||
BUnicodeChar::IsDigit(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return u_isdigit(c);
|
||||
}
|
||||
|
||||
|
||||
// Determines whether the specified code point is a hexadecimal digit.
|
||||
// This is equivalent to u_digit(c, 16)>=0.
|
||||
// True for characters with general category "Nd" (decimal digit numbers)
|
||||
// as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII.
|
||||
// (That is, for letters with code points
|
||||
// 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.)
|
||||
bool
|
||||
BUnicodeChar::IsHexDigit(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return u_isxdigit(c);
|
||||
}
|
||||
|
||||
|
||||
// Determines whether the specified code point is "defined",
|
||||
// which usually means that it is assigned a character.
|
||||
// True for general categories other than "Cn" (other, not assigned),
|
||||
// i.e., true for all code points mentioned in UnicodeData.txt.
|
||||
bool
|
||||
BUnicodeChar::IsDefined(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return getProperties(c) != 0;
|
||||
return u_isdefined(c);
|
||||
}
|
||||
|
||||
|
||||
/** Returns true if the specified unicode character is a base
|
||||
* form character that can be used with a diacritic.
|
||||
* This doesn't mean that the character has to be distinct,
|
||||
* though.
|
||||
*/
|
||||
|
||||
bool
|
||||
// Determines whether the specified code point is a base character.
|
||||
// True for general categories "L" (letters), "N" (numbers),
|
||||
// "Mc" (spacing combining marks), and "Me" (enclosing marks).
|
||||
bool
|
||||
BUnicodeChar::IsBase(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return (FLAG(getCategory(getProperties(c)))
|
||||
& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER
|
||||
| UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE
|
||||
| UF_MODIFIER_LETTER | UF_OTHER_LETTER | FLAG(B_UNICODE_NON_SPACING_MARK)
|
||||
| FLAG(B_UNICODE_ENCLOSING_MARK) | FLAG(B_UNICODE_COMBINING_SPACING_MARK))
|
||||
) != 0;
|
||||
return u_isbase(c);
|
||||
}
|
||||
|
||||
|
||||
/** Returns true if the specified unicode character is a
|
||||
* control character.
|
||||
*/
|
||||
|
||||
bool
|
||||
// Determines whether the specified code point is a control character
|
||||
// (as defined by this function).
|
||||
// A control character is one of the following:
|
||||
// - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f)
|
||||
// - U_CONTROL_CHAR (Cc)
|
||||
// - U_FORMAT_CHAR (Cf)
|
||||
// - U_LINE_SEPARATOR (Zl)
|
||||
// - U_PARAGRAPH_SEPARATOR (Zp)
|
||||
bool
|
||||
BUnicodeChar::IsControl(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return isISO8Control(c)
|
||||
|| (FLAG(getCategory(getProperties(c)))
|
||||
& (FLAG(B_UNICODE_CONTROL_CHAR) | FLAG(B_UNICODE_FORMAT_CHAR)
|
||||
| FLAG(B_UNICODE_LINE_SEPARATOR) | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
|
||||
) != 0;
|
||||
return u_iscntrl(c);
|
||||
}
|
||||
|
||||
|
||||
/** Returns true if the specified unicode character is a
|
||||
* punctuation character.
|
||||
*/
|
||||
|
||||
// Determines whether the specified code point is a punctuation character.
|
||||
// True for characters with general categories "P" (punctuation).
|
||||
bool
|
||||
BUnicodeChar::IsPunctuation(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return (FLAG(getCategory(getProperties(c)))
|
||||
& (FLAG(B_UNICODE_DASH_PUNCTUATION)
|
||||
| FLAG(B_UNICODE_START_PUNCTUATION)
|
||||
| FLAG(B_UNICODE_END_PUNCTUATION)
|
||||
| FLAG(B_UNICODE_CONNECTOR_PUNCTUATION)
|
||||
| FLAG(B_UNICODE_OTHER_PUNCTUATION))
|
||||
) != 0;
|
||||
return u_ispunct(c);
|
||||
}
|
||||
|
||||
|
||||
/** Returns true if the specified unicode character is some
|
||||
* kind of a space character.
|
||||
*/
|
||||
|
||||
bool
|
||||
// Determine if the specified code point is a space character according to Java.
|
||||
// True for characters with general categories "Z" (separators),
|
||||
// which does not include control codes (e.g., TAB or Line Feed).
|
||||
bool
|
||||
BUnicodeChar::IsSpace(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return (FLAG(getCategory(getProperties(c)))
|
||||
& (FLAG(B_UNICODE_SPACE_SEPARATOR)
|
||||
| FLAG(B_UNICODE_LINE_SEPARATOR)
|
||||
| FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
|
||||
) != 0;
|
||||
return u_isJavaSpaceChar(c);
|
||||
}
|
||||
|
||||
|
||||
/** Returns true if the specified unicode character is a white
|
||||
* space character.
|
||||
* This is essentially the same as IsSpace(), but excludes all
|
||||
* non-breakable spaces.
|
||||
*/
|
||||
|
||||
bool
|
||||
// Determines if the specified code point is a whitespace character
|
||||
// A character is considered to be a whitespace character if and only
|
||||
// if it satisfies one of the following criteria:
|
||||
// - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"),
|
||||
// but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space
|
||||
// or U+202F Narrow NBSP).
|
||||
// - It is U+0009 HORIZONTAL TABULATION.
|
||||
// - It is U+000A LINE FEED.
|
||||
// - It is U+000B VERTICAL TABULATION.
|
||||
// - It is U+000C FORM FEED.
|
||||
// - It is U+000D CARRIAGE RETURN.
|
||||
// - It is U+001C FILE SEPARATOR.
|
||||
// - It is U+001D GROUP SEPARATOR.
|
||||
// - It is U+001E RECORD SEPARATOR.
|
||||
// - It is U+001F UNIT SEPARATOR.
|
||||
bool
|
||||
BUnicodeChar::IsWhitespace(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return (FLAG(getCategory(getProperties(c)))
|
||||
& (FLAG(B_UNICODE_SPACE_SEPARATOR)
|
||||
| FLAG(B_UNICODE_LINE_SEPARATOR)
|
||||
| FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
|
||||
) != 0 && c != 0xa0 && c != 0x202f && c != 0xfeff; // exclude non-breakable spaces
|
||||
return u_isWhitespace(c);
|
||||
}
|
||||
|
||||
|
||||
/** Returns true if the specified unicode character is printable.
|
||||
*/
|
||||
|
||||
bool
|
||||
// Determines whether the specified code point is a printable character.
|
||||
// True for general categories other than "C" (controls).
|
||||
bool
|
||||
BUnicodeChar::IsPrintable(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
return !isISO8Control(c)
|
||||
&& (FLAG(getCategory(getProperties(c)))
|
||||
& ~(FLAG(B_UNICODE_UNASSIGNED) | FLAG(B_UNICODE_CONTROL_CHAR)
|
||||
| FLAG(B_UNICODE_FORMAT_CHAR) | FLAG(B_UNICODE_PRIVATE_USE_CHAR)
|
||||
| FLAG(B_UNICODE_SURROGATE) | FLAG(B_UNICODE_GENERAL_OTHER_TYPES)
|
||||
| FLAG(31))
|
||||
) != 0;
|
||||
return u_isprint(c);
|
||||
}
|
||||
|
||||
|
||||
// #pragma mark -
|
||||
|
||||
|
||||
/** Transforms the specified unicode character to lowercase.
|
||||
*/
|
||||
|
||||
uint32
|
||||
uint32
|
||||
BUnicodeChar::ToLower(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
|
||||
uint32 props = getProperties(c);
|
||||
|
||||
if (!propertyIsException(props)) {
|
||||
if (FLAG(getCategory(props)) & (UF_UPPERCASE | UF_TITLECASE))
|
||||
return c + getSignedValue(props);
|
||||
} else {
|
||||
uint32 *exceptions = getExceptions(props);
|
||||
uint32 firstExceptionValue = *exceptions;
|
||||
|
||||
if (haveExceptionValue(firstExceptionValue, EXC_LOWERCASE)) {
|
||||
int16 index = EXC_LOWERCASE;
|
||||
addExceptionOffset(firstExceptionValue, index, &++exceptions);
|
||||
return *exceptions;
|
||||
}
|
||||
}
|
||||
// no mapping found, just return the character unchanged
|
||||
return c;
|
||||
return u_tolower(c);
|
||||
}
|
||||
|
||||
|
||||
/** Transforms the specified unicode character to uppercase.
|
||||
*/
|
||||
|
||||
uint32
|
||||
uint32
|
||||
BUnicodeChar::ToUpper(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
|
||||
uint32 props = getProperties(c);
|
||||
|
||||
if (!propertyIsException(props)) {
|
||||
if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER)
|
||||
return c - getSignedValue(props);
|
||||
} else {
|
||||
uint32 *exceptions = getExceptions(props);
|
||||
uint32 firstExceptionValue = *exceptions;
|
||||
|
||||
if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) {
|
||||
int16 index = EXC_UPPERCASE;
|
||||
++exceptions;
|
||||
addExceptionOffset(firstExceptionValue, index, &exceptions);
|
||||
return *exceptions;
|
||||
}
|
||||
}
|
||||
// no mapping found, just return the character unchanged
|
||||
return c;
|
||||
return u_toupper(c);
|
||||
}
|
||||
|
||||
|
||||
/** Transforms the specified unicode character to title case.
|
||||
*/
|
||||
|
||||
uint32
|
||||
uint32
|
||||
BUnicodeChar::ToTitle(uint32 c)
|
||||
{
|
||||
BUnicodeChar();
|
||||
|
||||
uint32 props = getProperties(c);
|
||||
|
||||
if (!propertyIsException(props)) {
|
||||
if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) {
|
||||
// here, titlecase is the same as uppercase
|
||||
return c - getSignedValue(props);
|
||||
}
|
||||
} else {
|
||||
uint32 *exceptions = getExceptions(props);
|
||||
uint32 firstExceptionValue = *exceptions;
|
||||
|
||||
if (haveExceptionValue(firstExceptionValue, EXC_TITLECASE)) {
|
||||
int16 index = EXC_TITLECASE;
|
||||
addExceptionOffset(firstExceptionValue, index, &++exceptions);
|
||||
return (uint32)*exceptions;
|
||||
} else if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) {
|
||||
// here, titlecase is the same as uppercase
|
||||
int16 index = EXC_UPPERCASE;
|
||||
addExceptionOffset(firstExceptionValue, index, &++exceptions);
|
||||
return *exceptions;
|
||||
}
|
||||
}
|
||||
// no mapping found, just return the character unchanged
|
||||
return c;
|
||||
return u_totitle(c);
|
||||
}
|
||||
|
||||
|
||||
// Returns the decimal digit value of the specified code point, as computed
// by ICU's u_digit() with radix 10.
// NOTE(review): the previous table-based implementation special-cased a
// handful of Han numerals (U+3007, U+4E00, ...); that handling is not
// carried over here -- confirm no caller depends on it.
int32
BUnicodeChar::DigitValue(uint32 c)
{
	// Temporary object, discarded at once (presumably for constructor
	// side effects only -- TODO confirm).
	BUnicodeChar();

	return u_digit(c, 10);
}
|
||||
// Returns the East Asian Width property of the given code point: the value
// ICU reports for UCHAR_EAST_ASIAN_WIDTH, cast to unicode_east_asian_width.
unicode_east_asian_width
BUnicodeChar::EastAsianWidth(uint32 c)
{
	const int32 widthValue = u_getIntPropertyValue(c, UCHAR_EAST_ASIAN_WIDTH);
	return (unicode_east_asian_width)widthValue;
}
|
||||
|
||||
|
||||
void
|
||||
BUnicodeChar::ToUTF8(uint32 c, char **out)
|
||||
{
|
||||
char *s = *out;
|
||||
|
||||
if (c < 0x80)
|
||||
*(s++) = c;
|
||||
else if (c < 0x800) {
|
||||
*(s++) = 0xc0 | (c >> 6);
|
||||
*(s++) = 0x80 | (c & 0x3f);
|
||||
} else if (c < 0x10000) {
|
||||
*(s++) = 0xe0 | (c >> 12);
|
||||
*(s++) = 0x80 | ((c >> 6) & 0x3f);
|
||||
*(s++) = 0x80 | (c & 0x3f);
|
||||
} else if (c <= 0x10ffff) {
|
||||
*(s++) = 0xf0 | (c >> 18);
|
||||
*(s++) = 0x80 | ((c >> 12) & 0x3f);
|
||||
*(s++) = 0x80 | ((c >> 6) & 0x3f);
|
||||
*(s++) = 0x80 | (c & 0x3f);
|
||||
}
|
||||
*out = s;
|
||||
int i = 0;
|
||||
U8_APPEND_UNSAFE(*out, i, c);
|
||||
}
|
||||
|
||||
|
||||
// Decodes one code point from the UTF-8 text at *in and advances *in past
// the bytes consumed, so consecutive calls walk through the string.
//
// in - in/out read cursor. If *in is NULL, 0 is returned and the cursor is
//      left untouched. U8_NEXT_UNSAFE performs no validation, so the text
//      must be well-formed UTF-8.
//
// Returns the decoded code point.
uint32
BUnicodeChar::FromUTF8(const char **in)
{
	if (*in == NULL)
		return 0;

	int i = 0;
	uint32 c = 0;

	// U8_NEXT_UNSAFE (unlike U8_GET_UNSAFE) moves the index past the
	// sequence it has read, which tells us how far to advance the cursor.
	U8_NEXT_UNSAFE(*in, i, c);
	*in += i;

	return c;
}
|
||||
|
||||
|
||||
size_t
|
||||
BUnicodeChar::UTF8StringLength(const char *str)
|
||||
{
|
||||
@ -623,6 +269,7 @@ BUnicodeChar::UTF8StringLength(const char *str)
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
size_t
|
||||
BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength)
|
||||
{
|
||||
@ -633,4 +280,3 @@ BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength)
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user