diff --git a/headers/os/locale/UnicodeChar.h b/headers/os/locale/UnicodeChar.h index d902a79c23..3031866aaf 100644 --- a/headers/os/locale/UnicodeChar.h +++ b/headers/os/locale/UnicodeChar.h @@ -8,6 +8,7 @@ enum unicode_char_category // Non-category for unassigned and non-character code points. B_UNICODE_UNASSIGNED = 0, + B_UNICODE_GENERAL_OTHER_TYPES = 0, // Cn B_UNICODE_UPPERCASE_LETTER = 1, // Lu B_UNICODE_LOWERCASE_LETTER = 2, // Ll B_UNICODE_TITLECASE_LETTER = 3, // Lt @@ -37,152 +38,289 @@ enum unicode_char_category B_UNICODE_OTHER_SYMBOL = 27, // So B_UNICODE_INITIAL_PUNCTUATION = 28, // Pi B_UNICODE_FINAL_PUNCTUATION = 29, // Pf - B_UNICODE_GENERAL_OTHER_TYPES = 30, // Cn B_UNICODE_CATEGORY_COUNT }; -/** - * This specifies the language directional property of a character set. - */ +// This specifies the language directional property of a character set. enum unicode_char_direction { - B_UNICODE_LEFT_TO_RIGHT = 0, - B_UNICODE_RIGHT_TO_LEFT = 1, - B_UNICODE_EUROPEAN_NUMBER = 2, - B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3, - B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4, - B_UNICODE_ARABIC_NUMBER = 5, - B_UNICODE_COMMON_NUMBER_SEPARATOR = 6, - B_UNICODE_BLOCK_SEPARATOR = 7, - B_UNICODE_SEGMENT_SEPARATOR = 8, - B_UNICODE_WHITE_SPACE_NEUTRAL = 9, - B_UNICODE_OTHER_NEUTRAL = 10, - B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11, - B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12, - B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13, - B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14, - B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15, - B_UNICODE_POP_DIRECTIONAL_FORMAT = 16, - B_UNICODE_DIR_NON_SPACING_MARK = 17, - B_UNICODE_BOUNDARY_NEUTRAL = 18, + B_UNICODE_LEFT_TO_RIGHT = 0, + B_UNICODE_RIGHT_TO_LEFT = 1, + B_UNICODE_EUROPEAN_NUMBER = 2, + B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3, + B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4, + B_UNICODE_ARABIC_NUMBER = 5, + B_UNICODE_COMMON_NUMBER_SEPARATOR = 6, + B_UNICODE_BLOCK_SEPARATOR = 7, + B_UNICODE_SEGMENT_SEPARATOR = 8, + B_UNICODE_WHITE_SPACE_NEUTRAL = 9, + B_UNICODE_OTHER_NEUTRAL 
= 10, + B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11, + B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12, + B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13, + B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14, + B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15, + B_UNICODE_POP_DIRECTIONAL_FORMAT = 16, + B_UNICODE_DIR_NON_SPACING_MARK = 17, + B_UNICODE_BOUNDARY_NEUTRAL = 18, B_UNICODE_DIRECTION_COUNT }; -/** - * Script range as defined in the Unicode standard. - */ +// Script range as defined in the Unicode standard. enum unicode_char_script { - // Script names - B_UNICODE_BASIC_LATIN, - B_UNICODE_LATIN_1_SUPPLEMENT, - B_UNICODE_LATIN_EXTENDED_A, - B_UNICODE_LATIN_EXTENDED_B, - B_UNICODE_IPA_EXTENSIONS, - B_UNICODE_SPACING_MODIFIER_LETTERS, - B_UNICODE_COMBINING_DIACRITICAL_MARKS, - B_UNICODE_GREEK, - B_UNICODE_CYRILLIC, - B_UNICODE_ARMENIAN, - B_UNICODE_HEBREW, - B_UNICODE_ARABIC, - B_UNICODE_SYRIAC, - B_UNICODE_THAANA, - B_UNICODE_DEVANAGARI, - B_UNICODE_BENGALI, - B_UNICODE_GURMUKHI, - B_UNICODE_GUJARATI, - B_UNICODE_ORIYA, - B_UNICODE_TAMIL, - B_UNICODE_TELUGU, - B_UNICODE_KANNADA, - B_UNICODE_MALAYALAM, - B_UNICODE_SINHALA, - B_UNICODE_THAI, - B_UNICODE_LAO, - B_UNICODE_TIBETAN, - B_UNICODE_MYANMAR, - B_UNICODE_GEORGIAN, - B_UNICODE_HANGUL_JAMO, - B_UNICODE_ETHIOPIC, - B_UNICODE_CHEROKEE, - B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, - B_UNICODE_OGHAM, - B_UNICODE_RUNIC, - B_UNICODE_KHMER, - B_UNICODE_MONGOLIAN, - B_UNICODE_LATIN_EXTENDED_ADDITIONAL, - B_UNICODE_GREEK_EXTENDED, - B_UNICODE_GENERAL_PUNCTUATION, - B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS, - B_UNICODE_CURRENCY_SYMBOLS, - B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS, - B_UNICODE_LETTERLIKE_SYMBOLS, - B_UNICODE_NUMBER_FORMS, - B_UNICODE_ARROWS, - B_UNICODE_MATHEMATICAL_OPERATORS, - B_UNICODE_MISCELLANEOUS_TECHNICAL, - B_UNICODE_CONTROL_PICTURES, - B_UNICODE_OPTICAL_CHARACTER_RECOGNITION, - B_UNICODE_ENCLOSED_ALPHANUMERICS, - B_UNICODE_BOX_DRAWING, - B_UNICODE_BLOCK_ELEMENTS, - B_UNICODE_GEOMETRIC_SHAPES, - B_UNICODE_MISCELLANEOUS_SYMBOLS, - 
B_UNICODE_DINGBATS, - B_UNICODE_BRAILLE_PATTERNS, - B_UNICODE_CJK_RADICALS_SUPPLEMENT, - B_UNICODE_KANGXI_RADICALS, - B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS, - B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION, - B_UNICODE_HIRAGANA, - B_UNICODE_KATAKANA, - B_UNICODE_BOPOMOFO, - B_UNICODE_HANGUL_COMPATIBILITY_JAMO, - B_UNICODE_KANBUN, - B_UNICODE_BOPOMOFO_EXTENDED, - B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS, - B_UNICODE_CJK_COMPATIBILITY, - B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, - B_UNICODE_CJK_UNIFIED_IDEOGRAPHS, - B_UNICODE_YI_SYLLABLES, - B_UNICODE_YI_RADICALS, - B_UNICODE_HANGUL_SYLLABLES, - B_UNICODE_HIGH_SURROGATES, - B_UNICODE_HIGH_PRIVATE_USE_SURROGATES, - B_UNICODE_LOW_SURROGATES, - B_UNICODE_PRIVATE_USE_AREA, - B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS, - B_UNICODE_ALPHABETIC_PRESENTATION_FORMS, - B_UNICODE_ARABIC_PRESENTATION_FORMS_A, - B_UNICODE_COMBINING_HALF_MARKS, - B_UNICODE_CJK_COMPATIBILITY_FORMS, - B_UNICODE_SMALL_FORM_VARIANTS, - B_UNICODE_ARABIC_PRESENTATION_FORMS_B, - B_UNICODE_SPECIALS, - B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS, + // New No_Block value in Unicode 4. 
+ B_UNICODE_NO_BLOCK = 0, // [none] Special range + B_UNICODE_BASIC_LATIN = 1, // [0000] + B_UNICODE_LATIN_1_SUPPLEMENT = 2, // [0080] + B_UNICODE_LATIN_EXTENDED_A = 3, // [0100] + B_UNICODE_LATIN_EXTENDED_B = 4, // [0180] + B_UNICODE_IPA_EXTENSIONS = 5, // [0250] + B_UNICODE_SPACING_MODIFIER_LETTERS = 6, // [02B0] + B_UNICODE_COMBINING_DIACRITICAL_MARKS = 7, // [0300] + B_UNICODE_GREEK = 8, // [0370] + B_UNICODE_CYRILLIC = 9, // [0400] + B_UNICODE_ARMENIAN = 10, // [0530] + B_UNICODE_HEBREW = 11, // [0590] + B_UNICODE_ARABIC = 12, // [0600] + B_UNICODE_SYRIAC = 13, // [0700] + B_UNICODE_THAANA = 14, // [0780] + B_UNICODE_DEVANAGARI = 15, // [0900] + B_UNICODE_BENGALI = 16, // [0980] + B_UNICODE_GURMUKHI = 17, // [0A00] + B_UNICODE_GUJARATI = 18, // [0A80] + B_UNICODE_ORIYA = 19, // [0B00] + B_UNICODE_TAMIL = 20, // [0B80] + B_UNICODE_TELUGU = 21, // [0C00] + B_UNICODE_KANNADA = 22, // [0C80] + B_UNICODE_MALAYALAM = 23, // [0D00] + B_UNICODE_SINHALA = 24, // [0D80] + B_UNICODE_THAI = 25, // [0E00] + B_UNICODE_LAO = 26, // [0E80] + B_UNICODE_TIBETAN = 27, // [0F00] + B_UNICODE_MYANMAR = 28, // [1000] + B_UNICODE_GEORGIAN = 29, // [10A0] + B_UNICODE_HANGUL_JAMO = 30, // [1100] + B_UNICODE_ETHIOPIC = 31, // [1200] + B_UNICODE_CHEROKEE = 32, // [13A0] + B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, // [1400] + B_UNICODE_OGHAM = 34, // [1680] + B_UNICODE_RUNIC = 35, // [16A0] + B_UNICODE_KHMER = 36, // [1780] + B_UNICODE_MONGOLIAN = 37, // [1800] + B_UNICODE_LATIN_EXTENDED_ADDITIONAL = 38, // [1E00] + B_UNICODE_GREEK_EXTENDED = 39, // [1F00] + B_UNICODE_GENERAL_PUNCTUATION = 40, // [2000] + B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS = 41, // [2070] + B_UNICODE_CURRENCY_SYMBOLS = 42, // [20A0] + B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS = 43, // [20D0] + B_UNICODE_LETTERLIKE_SYMBOLS = 44, // [2100] + B_UNICODE_NUMBER_FORMS = 45, // [2150] + B_UNICODE_ARROWS = 46, // [2190] + B_UNICODE_MATHEMATICAL_OPERATORS = 47, // [2200] + B_UNICODE_MISCELLANEOUS_TECHNICAL = 48, // 
[2300] + B_UNICODE_CONTROL_PICTURES = 49, // [2400] + B_UNICODE_OPTICAL_CHARACTER_RECOGNITION = 50, // [2440] + B_UNICODE_ENCLOSED_ALPHANUMERICS = 51, // [2460] + B_UNICODE_BOX_DRAWING = 52, // [2500] + B_UNICODE_BLOCK_ELEMENTS = 53, // [2580] + B_UNICODE_GEOMETRIC_SHAPES = 54, // [25A0] + B_UNICODE_MISCELLANEOUS_SYMBOLS = 55, // [2600] + B_UNICODE_DINGBATS = 56, // [2700] + B_UNICODE_BRAILLE_PATTERNS = 57, // [2800] + B_UNICODE_CJK_RADICALS_SUPPLEMENT = 58, // [2E80] + B_UNICODE_KANGXI_RADICALS = 59, // [2F00] + B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, // [2FF0] + B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION = 61, // [3000] + B_UNICODE_HIRAGANA = 62, // [3040] + B_UNICODE_KATAKANA = 63, // [30A0] + B_UNICODE_BOPOMOFO = 64, // [3100] + B_UNICODE_HANGUL_COMPATIBILITY_JAMO = 65, // [3130] + B_UNICODE_KANBUN = 66, // [3190] + B_UNICODE_BOPOMOFO_EXTENDED = 67, // [31A0] + B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, // [3200] + B_UNICODE_CJK_COMPATIBILITY = 69, // [3300] + B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, // [3400] + B_UNICODE_CJK_UNIFIED_IDEOGRAPHS = 71, // [4E00] + B_UNICODE_YI_SYLLABLES = 72, // [A000] + B_UNICODE_YI_RADICALS = 73, // [A490] + B_UNICODE_HANGUL_SYLLABLES = 74, // [AC00] + B_UNICODE_HIGH_SURROGATES = 75, // [D800] + B_UNICODE_HIGH_PRIVATE_USE_SURROGATES = 76, // [DB80] + B_UNICODE_LOW_SURROGATES = 77, // [DC00] + B_UNICODE_PRIVATE_USE = 78, + B_UNICODE_PRIVATE_USE_AREA = B_UNICODE_PRIVATE_USE, // [E000] + B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS = 79, // [F900] + B_UNICODE_ALPHABETIC_PRESENTATION_FORMS = 80, // [FB00] + B_UNICODE_ARABIC_PRESENTATION_FORMS_A = 81, // [FB50] + B_UNICODE_COMBINING_HALF_MARKS = 82, // [FE20] + B_UNICODE_CJK_COMPATIBILITY_FORMS = 83, // [FE30] + B_UNICODE_SMALL_FORM_VARIANTS = 84, // [FE50] + B_UNICODE_ARABIC_PRESENTATION_FORMS_B = 85, // [FE70] + B_UNICODE_SPECIALS = 86, // [FFF0] + B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS = 87, // [FF00] - B_UNICODE_SCRIPT_COUNT, - B_UNICODE_NO_SCRIPT = 
B_UNICODE_SCRIPT_COUNT + // New blocks in Unicode 3.1 + B_UNICODE_OLD_ITALIC = 88, // [10300] + B_UNICODE_GOTHIC = 89, // [10330] + B_UNICODE_DESERET = 90, // [10400] + B_UNICODE_BYZANTINE_MUSICAL_SYMBOLS = 91, // [1D000] + B_UNICODE_MUSICAL_SYMBOLS = 92, // [1D100] + B_UNICODE_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, // [1D400] + B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94, // [20000] + B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, // [2F800] + B_UNICODE_TAGS = 96, // [E0000] + + // New blocks in Unicode 3.2 + B_UNICODE_CYRILLIC_SUPPLEMENTARY = 97, + B_UNICODE_CYRILLIC_SUPPLEMENT = B_UNICODE_CYRILLIC_SUPPLEMENTARY, // [0500] + B_UNICODE_TAGALOG = 98, // [1700] + B_UNICODE_HANUNOO = 99, // [1720] + B_UNICODE_BUHID = 100, // [1740] + B_UNICODE_TAGBANWA = 101, // [1760] + B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102, // [27C0] + B_UNICODE_SUPPLEMENTAL_ARROWS_A = 103, // [27F0] + B_UNICODE_SUPPLEMENTAL_ARROWS_B = 104, // [2900] + B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105, // [2980] + B_UNICODE_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106, // [2A00] + B_UNICODE_KATAKANA_PHONETIC_EXTENSIONS = 107, // [31F0] + B_UNICODE_VARIATION_SELECTORS = 108, // [FE00] + B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109, // [F0000] + B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110, // [100000] + + // New blocks in Unicode 4 + B_UNICODE_LIMBU = 111, // [1900] + B_UNICODE_TAI_LE = 112, // [1950] + B_UNICODE_KHMER_SYMBOLS = 113, // [19E0] + B_UNICODE_PHONETIC_EXTENSIONS = 114, // [1D00] + B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115, // [2B00] + B_UNICODE_YIJING_HEXAGRAM_SYMBOLS = 116, // [4DC0] + B_UNICODE_LINEAR_B_SYLLABARY = 117, // [10000] + B_UNICODE_LINEAR_B_IDEOGRAMS = 118, // [10080] + B_UNICODE_AEGEAN_NUMBERS = 119, // [10100] + B_UNICODE_UGARITIC = 120, // [10380] + B_UNICODE_SHAVIAN = 121, // [10450] + B_UNICODE_OSMANYA = 122, // [10480] + B_UNICODE_CYPRIOT_SYLLABARY = 123, // [10800] + B_UNICODE_TAI_XUAN_JING_SYMBOLS = 124, // [1D300] +
B_UNICODE_VARIATION_SELECTORS_SUPPLEMENT = 125, // [E0100] + + // New blocks in Unicode 4.1 + B_UNICODE_ANCIENT_GREEK_MUSICAL_NOTATION = 126, // [1D200] + B_UNICODE_ANCIENT_GREEK_NUMBERS = 127, // [10140] + B_UNICODE_ARABIC_SUPPLEMENT = 128, // [0750] + B_UNICODE_BUGINESE = 129, // [1A00] + B_UNICODE_CJK_STROKES = 130, // [31C0] + B_UNICODE_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131, // [1DC0] + B_UNICODE_COPTIC = 132, // [2C80] + B_UNICODE_ETHIOPIC_EXTENDED = 133, // [2D80] + B_UNICODE_ETHIOPIC_SUPPLEMENT = 134, // [1380] + B_UNICODE_GEORGIAN_SUPPLEMENT = 135, // [2D00] + B_UNICODE_GLAGOLITIC = 136, // [2C00] + B_UNICODE_KHAROSHTHI = 137, // [10A00] + B_UNICODE_MODIFIER_TONE_LETTERS = 138, // [A700] + B_UNICODE_NEW_TAI_LUE = 139, // [1980] + B_UNICODE_OLD_PERSIAN = 140, // [103A0] + B_UNICODE_PHONETIC_EXTENSIONS_SUPPLEMENT = 141, // [1D80] + B_UNICODE_SUPPLEMENTAL_PUNCTUATION = 142, // [2E00] + B_UNICODE_SYLOTI_NAGRI = 143, // [A800] + B_UNICODE_TIFINAGH = 144, // [2D30] + B_UNICODE_VERTICAL_FORMS = 145, // [FE10] + + // New blocks in Unicode 5.0 + B_UNICODE_NKO = 146, // [07C0] + B_UNICODE_BALINESE = 147, // [1B00] + B_UNICODE_LATIN_EXTENDED_C = 148, // [2C60] + B_UNICODE_LATIN_EXTENDED_D = 149, // [A720] + B_UNICODE_PHAGS_PA = 150, // [A840] + B_UNICODE_PHOENICIAN = 151, // [10900] + B_UNICODE_CUNEIFORM = 152, // [12000] + B_UNICODE_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153, // [12400] + B_UNICODE_COUNTING_ROD_NUMERALS = 154, // [1D360] + + // New blocks in Unicode 5.1 + B_UNICODE_SUNDANESE = 155, // [1B80] + B_UNICODE_LEPCHA = 156, // [1C00] + B_UNICODE_OL_CHIKI = 157, // [1C50] + B_UNICODE_CYRILLIC_EXTENDED_A = 158, // [2DE0] + B_UNICODE_VAI = 159, // [A500] + B_UNICODE_CYRILLIC_EXTENDED_B = 160, // [A640] + B_UNICODE_SAURASHTRA = 161, // [A880] + B_UNICODE_KAYAH_LI = 162, // [A900] + B_UNICODE_REJANG = 163, // [A930] + B_UNICODE_CHAM = 164, // [AA00] + B_UNICODE_ANCIENT_SYMBOLS = 165, // [10190] + B_UNICODE_PHAISTOS_DISC = 166, // [101D0] + B_UNICODE_LYCIAN = 
167, // [10280] + B_UNICODE_CARIAN = 168, // [102A0] + B_UNICODE_LYDIAN = 169, // [10920] + B_UNICODE_MAHJONG_TILES = 170, // [1F000] + B_UNICODE_DOMINO_TILES = 171, // [1F030] + + // New blocks in Unicode 5.2 + B_UNICODE_SAMARITAN = 172, // [0800] + B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173, // [18B0] + B_UNICODE_TAI_THAM = 174, // [1A20] + B_UNICODE_VEDIC_EXTENSIONS = 175, // [1CD0] + B_UNICODE_LISU = 176, // [A4D0] + B_UNICODE_BAMUM = 177, // [A6A0] + B_UNICODE_COMMON_INDIC_NUMBER_FORMS = 178, // [A830] + B_UNICODE_DEVANAGARI_EXTENDED = 179, // [A8E0] + B_UNICODE_HANGUL_JAMO_EXTENDED_A = 180, // [A960] + B_UNICODE_JAVANESE = 181, // [A980] + B_UNICODE_MYANMAR_EXTENDED_A = 182, // [AA60] + B_UNICODE_TAI_VIET = 183, // [AA80] + B_UNICODE_MEETEI_MAYEK = 184, // [ABC0] + B_UNICODE_HANGUL_JAMO_EXTENDED_B = 185, // [D7B0] + B_UNICODE_IMPERIAL_ARAMAIC = 186, // [10840] + B_UNICODE_OLD_SOUTH_ARABIAN = 187, // [10A60] + B_UNICODE_AVESTAN = 188, // [10B00] + B_UNICODE_INSCRIPTIONAL_PARTHIAN = 189, // [10B40] + B_UNICODE_INSCRIPTIONAL_PAHLAVI = 190, // [10B60] + B_UNICODE_OLD_TURKIC = 191, // [10C00] + B_UNICODE_RUMI_NUMERAL_SYMBOLS = 192, // [10E60] + B_UNICODE_KAITHI = 193, // [11080] + B_UNICODE_EGYPTIAN_HIEROGLYPHS = 194, // [13000] + B_UNICODE_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195, // [1F100] + B_UNICODE_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196, // [1F200] + B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197, // [2A700] + + // New blocks in Unicode 6.0 + B_UNICODE_MANDAIC = 198, // [0840] + B_UNICODE_BATAK = 199, // [1BC0] + B_UNICODE_ETHIOPIC_EXTENDED_A = 200, // [AB00] + B_UNICODE_BRAHMI = 201, // [11000] + B_UNICODE_BAMUM_SUPPLEMENT = 202, // [16800] + B_UNICODE_KANA_SUPPLEMENT = 203, // [1B000] + B_UNICODE_PLAYING_CARDS = 204, // [1F0A0] + B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205, // [1F300] + B_UNICODE_EMOTICONS = 206, // [1F600] + B_UNICODE_TRANSPORT_AND_MAP_SYMBOLS = 207, // [1F680] + B_UNICODE_ALCHEMICAL_SYMBOLS = 208, // 
[1F700] + B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209, // [2B740] + + B_UNICODE_SCRIPT_COUNT = 210, + B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT, + + B_UNICODE_INVALID_CODE = -1 }; -/** - * Values returned by the u_getCellWidth() function. - */ +// East Asian Width constants. -enum unicode_cell_width +enum unicode_east_asian_width { - B_UNICODE_ZERO_WIDTH = 0, - B_UNICODE_HALF_WIDTH = 1, - B_UNICODE_FULL_WIDTH = 2, - B_UNICODE_NEUTRAL_WIDTH = 3, - - B_UNICODE_CELL_WIDTH_COUNT + B_UNICODE_EA_NEUTRAL, // [N] + B_UNICODE_EA_AMBIGUOUS, // [A] + B_UNICODE_EA_HALFWIDTH, // [H] + B_UNICODE_EA_FULLWIDTH, // [F] + B_UNICODE_EA_NARROW, // [Na] + B_UNICODE_EA_WIDE, // [W] + B_UNICODE_EA_COUNT }; @@ -209,6 +347,7 @@ class BUnicodeChar { static uint32 ToUpper(uint32 c); static uint32 ToTitle(uint32 c); static int32 DigitValue(uint32 c); + static unicode_east_asian_width EastAsianWidth(uint32 c); static void ToUTF8(uint32 c, char **out); static uint32 FromUTF8(const char **in); @@ -230,4 +369,4 @@ BUnicodeChar::FromUTF8(const char *in) } -#endif /* _UNICODE_CHAR_H_ */ +#endif // _UNICODE_CHAR_H_ diff --git a/src/kits/locale/UnicodeChar.cpp b/src/kits/locale/UnicodeChar.cpp index a86192e782..242e16d5c8 100644 --- a/src/kits/locale/UnicodeChar.cpp +++ b/src/kits/locale/UnicodeChar.cpp @@ -1,234 +1,18 @@ -/* -** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. -** Distributed under the terms of the OpenBeOS License. -*/ - -/* Reads the information out of the data files created by (an edited version of) - * IBM's ICU genprops utility. The BUnicodeChar class is mostly the counterpart - * to ICU's uchar module, but is not as huge or broad as that one. +/* + * Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. + * Distributed under the terms of the MIT License. 
* - * Note, it probably won't be able to handle the output of the orginal genprops - * tool and vice versa - only use the tool provided with this project to create - * the Unicode property file. - * However, the algorithmic idea behind the property file is still the same as - * found in ICU - nothing important has been changed, so more recent versions - * of genprops tool/data can probably be ported without too much effort. + * Authors: + * Axel Dörfler, axeld@pinc-software.de + * Siarzhuk Zharski, zharik@gmx.li * - * In case no property file can be found it will still provide basic services - * for the Latin-1 part of the character tables. */ -#include - #include -#include -#include -#include - - -#define FLAG(n) ((uint32)1 << (n)) -enum { - UF_UPPERCASE = FLAG(B_UNICODE_UPPERCASE_LETTER), - UF_LOWERCASE = FLAG(B_UNICODE_LOWERCASE_LETTER), - UF_TITLECASE = FLAG(B_UNICODE_TITLECASE_LETTER), - UF_MODIFIER_LETTER = FLAG(B_UNICODE_MODIFIER_LETTER), - UF_OTHER_LETTER = FLAG(B_UNICODE_OTHER_LETTER), - UF_DECIMAL_NUMBER = FLAG(B_UNICODE_DECIMAL_DIGIT_NUMBER), - UF_OTHER_NUMBER = FLAG(B_UNICODE_OTHER_NUMBER), - UF_LETTER_NUMBER = FLAG(B_UNICODE_LETTER_NUMBER) -}; - - -static uint32 gStaticProps32Table[] = { - /* 0x00 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x04 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x08 */ 0x48f, 0x20c, 0x1ce, 0x20c, - /* 0x0c */ 0x24d, 0x1ce, 0x48f, 0x48f, - /* 0x10 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x14 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x18 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x1c */ 0x1ce, 0x1ce, 0x1ce, 0x20c, - /* 0x20 */ 0x24c, 0x297, 0x297, 0x117, - /* 0x24 */ 0x119, 0x117, 0x297, 0x297, - /* 0x28 */ 0x100a94, 0xfff00a95, 0x297, 0x118, - /* 0x2c */ 0x197, 0x113, 0x197, 0xd7, - /* 0x30 */ 0x89, 0x100089, 0x200089, 0x300089, - /* 0x34 */ 0x400089, 0x500089, 0x600089, 0x700089, - /* 0x38 */ 0x800089, 0x900089, 0x197, 0x297, - /* 0x3c */ 0x200a98, 0x298, 0xffe00a98, 0x297, - /* 0x40 */ 0x297, 0x2000001, 0x2000001, 0x2000001, - /* 0x44 */ 0x2000001, 
0x2000001, 0x2000001, 0x2000001, - /* 0x48 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, - /* 0x4c */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, - /* 0x50 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, - /* 0x54 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, - /* 0x58 */ 0x2000001, 0x2000001, 0x2000001, 0x200a94, - /* 0x5c */ 0x297, 0xffe00a95, 0x29a, 0x296, - /* 0x60 */ 0x29a, 0x2000002, 0x2000002, 0x2000002, - /* 0x64 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, - /* 0x68 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, - /* 0x6c */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, - /* 0x70 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, - /* 0x74 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, - /* 0x78 */ 0x2000002, 0x2000002, 0x2000002, 0x200a94, - /* 0x7c */ 0x298, 0xffe00a95, 0x298, 0x48f, - /* 0x80 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x84 */ 0x48f, 0x1ce, 0x48f, 0x48f, - /* 0x88 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x8c */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x90 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x94 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x98 */ 0x48f, 0x48f, 0x48f, 0x48f, - /* 0x9c */ 0x48f, 0x48f, 0x48f, 0x48f -}; - -enum { - INDEX_STAGE_2_BITS, - INDEX_STAGE_3_BITS, - INDEX_EXCEPTIONS, - INDEX_STAGE_3_INDEX, - INDEX_PROPS, - INDEX_UCHARS -}; - -/* constants and macros for access to the data */ -enum { - EXC_UPPERCASE, - EXC_LOWERCASE, - EXC_TITLECASE, - EXC_DIGIT_VALUE, - EXC_NUMERIC_VALUE, - EXC_DENOMINATOR_VALUE, - EXC_MIRROR_MAPPING, - EXC_SPECIAL_CASING, - EXC_CASE_FOLDING -}; - -enum { - EXCEPTION_SHIFT = 5, - BIDI_SHIFT, - MIRROR_SHIFT = BIDI_SHIFT + 5, - VALUE_SHIFT = 20, - - VALUE_BITS = 32 - VALUE_SHIFT -}; - -/* number of bits in an 8-bit integer value */ -#define EXC_GROUP 8 -static uint8 gFlagsOffset[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 
4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 -}; - -#ifdef UCHAR_VARIABLE_TRIE_BITS - // access values calculated from indices - static uint16_t stage23Bits, stage2Mask, stage3Mask; -# define sStage3Bits indexes[INDEX_STAGE_3_BITS] -#else - // Use hardcoded bit distribution for the trie table access -# define sStage23Bits 10 -# define sStage2Mask 0x3f -# define sStage3Mask 0xf -# define sStage3Bits 4 -#endif - - -/** We need to change the char category for ISO 8 controls, since the - * genprops utility we got from IBM's ICU apparently changes it for - * some characters. - */ - -static inline bool -isISO8Control(uint32 c) -{ - return ((uint32)c < 0x20 || (uint32)(c - 0x7f) <= 0x20); -} - - -static inline uint32 -getProperties(uint32 c) -{ - if (c > 0x10ffff) - return 0; - - // TODO : Data from unicode - - return c > 0x9f ? 
0 : gStaticProps32Table[c]; -} - - -static inline uint8 -getCategory(uint32 properties) -{ - return properties & 0x1f; -} - - -static inline bool -propertyIsException(uint32 properties) -{ - return properties & (1UL << EXCEPTION_SHIFT); -} - - -static inline uint32 -getUnsignedValue(uint32 properties) -{ - return properties >> VALUE_SHIFT; -} - - -static inline uint32 -getSignedValue(uint32 properties) -{ - return (int32)properties >> VALUE_SHIFT; -} - - -static inline uint32 * -getExceptions(uint32 properties) -{ - // TODO : data from unicode - return 0; -} - - -static inline bool -haveExceptionValue(uint32 flags,int16 index) -{ - return flags & (1UL << index); -} - - -static inline void -addExceptionOffset(uint32 &flags, int16 &index, uint32 **offset) -{ - if (index >= EXC_GROUP) { - *offset += gFlagsOffset[flags & ((1 << EXC_GROUP) - 1)]; - flags >>= EXC_GROUP; - index -= EXC_GROUP; - } - *offset += gFlagsOffset[flags & ((1 << index) - 1)]; -} - - -// #pragma mark - +#include +#include BUnicodeChar::BUnicodeChar() @@ -236,382 +20,244 @@ BUnicodeChar::BUnicodeChar() } -bool -BUnicodeChar::IsAlpha(uint32 c) -{ - BUnicodeChar(); - return (FLAG(getCategory(getProperties(c))) - & (UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER) - ) != 0; -} - - -/** Returns the type code of the specified unicode character */ +// Returns the general category value for the code point. int8 BUnicodeChar::Type(uint32 c) { BUnicodeChar(); - return (int8)getCategory(getProperties(c)); + return u_charType(c); } -bool -BUnicodeChar::IsLower(uint32 c) +// Determines whether the specified code point is a letter character. +// True for general categories "L" (letters). 
+bool +BUnicodeChar::IsAlpha(uint32 c) { BUnicodeChar(); - return getCategory(getProperties(c)) == B_UNICODE_LOWERCASE_LETTER; + return u_isalpha(c); } -bool -BUnicodeChar::IsUpper(uint32 c) -{ - BUnicodeChar(); - return getCategory(getProperties(c)) == B_UNICODE_UPPERCASE_LETTER; -} - - -bool -BUnicodeChar::IsTitle(uint32 c) -{ - BUnicodeChar(); - return getCategory(getProperties(c)) == B_UNICODE_TITLECASE_LETTER; -} - - -bool -BUnicodeChar::IsDigit(uint32 c) -{ - BUnicodeChar(); - return (FLAG(getCategory(getProperties(c))) - & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER) - ) != 0; -} - - -bool +// Determines whether the specified code point is an alphanumeric character +// (letter or digit). +// True for characters with general categories +// "L" (letters) and "Nd" (decimal digit numbers). +bool BUnicodeChar::IsAlNum(uint32 c) { BUnicodeChar(); - return (FLAG(getCategory(getProperties(c))) - & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER | UF_UPPERCASE - | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER) - ) != 0; + return u_isalnum(c); } -bool +// Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE). +bool +BUnicodeChar::IsLower(uint32 c) +{ + BUnicodeChar(); + return u_isULowercase(c); +} + + +// Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE). +bool +BUnicodeChar::IsUpper(uint32 c) +{ + BUnicodeChar(); + return u_isUUppercase(c); +} + + +// Determines whether the specified code point is a titlecase letter. +// True for general category "Lt" (titlecase letter). +bool +BUnicodeChar::IsTitle(uint32 c) +{ + BUnicodeChar(); + return u_istitle(c); +} + + +// Determines whether the specified code point is a digit character. +// True for characters with general category "Nd" (decimal digit numbers). +// Beginning with Unicode 4, this is the same as +// testing for the Numeric_Type of Decimal. 
+bool +BUnicodeChar::IsDigit(uint32 c) +{ + BUnicodeChar(); + return u_isdigit(c); +} + + +// Determines whether the specified code point is a hexadecimal digit. +// This is equivalent to u_digit(c, 16)>=0. +// True for characters with general category "Nd" (decimal digit numbers) +// as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII. +// (That is, for letters with code points +// 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.) +bool +BUnicodeChar::IsHexDigit(uint32 c) +{ + BUnicodeChar(); + return u_isxdigit(c); +} + + +// Determines whether the specified code point is "defined", +// which usually means that it is assigned a character. +// True for general categories other than "Cn" (other, not assigned), +// i.e., true for all code points mentioned in UnicodeData.txt. +bool BUnicodeChar::IsDefined(uint32 c) { BUnicodeChar(); - return getProperties(c) != 0; + return u_isdefined(c); } -/** Returns true if the specified unicode character is a base - * form character that can be used with a diacritic. - * This doesn't mean that the character has to be distinct, - * though. - */ - -bool +// Determines whether the specified code point is a base character. +// True for general categories "L" (letters), "N" (numbers), +// "Mc" (spacing combining marks), and "Me" (enclosing marks). +bool BUnicodeChar::IsBase(uint32 c) { BUnicodeChar(); - return (FLAG(getCategory(getProperties(c))) - & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER - | UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE - | UF_MODIFIER_LETTER | UF_OTHER_LETTER | FLAG(B_UNICODE_NON_SPACING_MARK) - | FLAG(B_UNICODE_ENCLOSING_MARK) | FLAG(B_UNICODE_COMBINING_SPACING_MARK)) - ) != 0; + return u_isbase(c); } -/** Returns true if the specified unicode character is a - * control character. - */ - -bool +// Determines whether the specified code point is a control character +// (as defined by this function). 
+// A control character is one of the following: +// - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) +// - U_CONTROL_CHAR (Cc) +// - U_FORMAT_CHAR (Cf) +// - U_LINE_SEPARATOR (Zl) +// - U_PARAGRAPH_SEPARATOR (Zp) +bool BUnicodeChar::IsControl(uint32 c) { BUnicodeChar(); - return isISO8Control(c) - || (FLAG(getCategory(getProperties(c))) - & (FLAG(B_UNICODE_CONTROL_CHAR) | FLAG(B_UNICODE_FORMAT_CHAR) - | FLAG(B_UNICODE_LINE_SEPARATOR) | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) - ) != 0; + return u_iscntrl(c); } -/** Returns true if the specified unicode character is a - * punctuation character. - */ - +// Determines whether the specified code point is a punctuation character. +// True for characters with general categories "P" (punctuation). bool BUnicodeChar::IsPunctuation(uint32 c) { BUnicodeChar(); - return (FLAG(getCategory(getProperties(c))) - & (FLAG(B_UNICODE_DASH_PUNCTUATION) - | FLAG(B_UNICODE_START_PUNCTUATION) - | FLAG(B_UNICODE_END_PUNCTUATION) - | FLAG(B_UNICODE_CONNECTOR_PUNCTUATION) - | FLAG(B_UNICODE_OTHER_PUNCTUATION)) - ) != 0; + return u_ispunct(c); } -/** Returns true if the specified unicode character is some - * kind of a space character. - */ - -bool +// Determine if the specified code point is a space character according to Java. +// True for characters with general categories "Z" (separators), +// which does not include control codes (e.g., TAB or Line Feed). +bool BUnicodeChar::IsSpace(uint32 c) { BUnicodeChar(); - return (FLAG(getCategory(getProperties(c))) - & (FLAG(B_UNICODE_SPACE_SEPARATOR) - | FLAG(B_UNICODE_LINE_SEPARATOR) - | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) - ) != 0; + return u_isJavaSpaceChar(c); } -/** Returns true if the specified unicode character is a white - * space character. - * This is essentially the same as IsSpace(), but excludes all - * non-breakable spaces. 
- */ - -bool +// Determines if the specified code point is a whitespace character +// A character is considered to be a whitespace character if and only +// if it satisfies one of the following criteria: +// - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), +// but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space +// or U+202F Narrow NBSP). +// - It is U+0009 HORIZONTAL TABULATION. +// - It is U+000A LINE FEED. +// - It is U+000B VERTICAL TABULATION. +// - It is U+000C FORM FEED. +// - It is U+000D CARRIAGE RETURN. +// - It is U+001C FILE SEPARATOR. +// - It is U+001D GROUP SEPARATOR. +// - It is U+001E RECORD SEPARATOR. +// - It is U+001F UNIT SEPARATOR. +bool BUnicodeChar::IsWhitespace(uint32 c) { BUnicodeChar(); - return (FLAG(getCategory(getProperties(c))) - & (FLAG(B_UNICODE_SPACE_SEPARATOR) - | FLAG(B_UNICODE_LINE_SEPARATOR) - | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) - ) != 0 && c != 0xa0 && c != 0x202f && c != 0xfeff; // exclude non-breakable spaces + return u_isWhitespace(c); } -/** Returns true if the specified unicode character is printable. - */ - -bool +// Determines whether the specified code point is a printable character. +// True for general categories other than "C" (controls). +bool BUnicodeChar::IsPrintable(uint32 c) { BUnicodeChar(); - return !isISO8Control(c) - && (FLAG(getCategory(getProperties(c))) - & ~(FLAG(B_UNICODE_UNASSIGNED) | FLAG(B_UNICODE_CONTROL_CHAR) - | FLAG(B_UNICODE_FORMAT_CHAR) | FLAG(B_UNICODE_PRIVATE_USE_CHAR) - | FLAG(B_UNICODE_SURROGATE) | FLAG(B_UNICODE_GENERAL_OTHER_TYPES) - | FLAG(31)) - ) != 0; + return u_isprint(c); } // #pragma mark - - -/** Transforms the specified unicode character to lowercase. 
- */ - -uint32 +uint32 BUnicodeChar::ToLower(uint32 c) { BUnicodeChar(); - - uint32 props = getProperties(c); - - if (!propertyIsException(props)) { - if (FLAG(getCategory(props)) & (UF_UPPERCASE | UF_TITLECASE)) - return c + getSignedValue(props); - } else { - uint32 *exceptions = getExceptions(props); - uint32 firstExceptionValue = *exceptions; - - if (haveExceptionValue(firstExceptionValue, EXC_LOWERCASE)) { - int16 index = EXC_LOWERCASE; - addExceptionOffset(firstExceptionValue, index, &++exceptions); - return *exceptions; - } - } - // no mapping found, just return the character unchanged - return c; + return u_tolower(c); } -/** Transforms the specified unicode character to uppercase. - */ - -uint32 +uint32 BUnicodeChar::ToUpper(uint32 c) { BUnicodeChar(); - - uint32 props = getProperties(c); - - if (!propertyIsException(props)) { - if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) - return c - getSignedValue(props); - } else { - uint32 *exceptions = getExceptions(props); - uint32 firstExceptionValue = *exceptions; - - if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) { - int16 index = EXC_UPPERCASE; - ++exceptions; - addExceptionOffset(firstExceptionValue, index, &exceptions); - return *exceptions; - } - } - // no mapping found, just return the character unchanged - return c; + return u_toupper(c); } -/** Transforms the specified unicode character to title case. 
- */ - -uint32 +// Transforms the specified code point to title case. +// The mapping itself is performed by u_totitle(). +uint32 BUnicodeChar::ToTitle(uint32 c) { BUnicodeChar(); - - uint32 props = getProperties(c); - - if (!propertyIsException(props)) { - if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) { - // here, titlecase is the same as uppercase - return c - getSignedValue(props); - } - } else { - uint32 *exceptions = getExceptions(props); - uint32 firstExceptionValue = *exceptions; - - if (haveExceptionValue(firstExceptionValue, EXC_TITLECASE)) { - int16 index = EXC_TITLECASE; - addExceptionOffset(firstExceptionValue, index, &++exceptions); - return (uint32)*exceptions; - } else if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) { - // here, titlecase is the same as uppercase - int16 index = EXC_UPPERCASE; - addExceptionOffset(firstExceptionValue, index, &++exceptions); - return *exceptions; - } - } - // no mapping found, just return the character unchanged - return c; + return u_totitle(c); } -int32 +// Returns the value of the specified code point as a digit in radix 10, +// as computed by u_digit(c, 10). +int32 BUnicodeChar::DigitValue(uint32 c) { BUnicodeChar(); + return u_digit(c, 10); +} - uint32 props = getProperties(c); - if (!propertyIsException(props)) { - if (getCategory(props) == B_UNICODE_DECIMAL_DIGIT_NUMBER) - return getSignedValue(props); - } else { - uint32 *exceptions = getExceptions(props); - uint32 firstExceptionValue = *exceptions; - - if (haveExceptionValue(firstExceptionValue, EXC_DIGIT_VALUE)) { - int16 index = EXC_DIGIT_VALUE; - addExceptionOffset(firstExceptionValue, index, &++exceptions); - - int32 value = (int32)(int16)*exceptions; - // the digit value is in the lower 16 bits - if (value != -1) - return value; - } - } - - // If there is no value in the properties table, - // then check for some special characters - switch (c) { - case 0x3007: return 0; - case 0x4e00: return 1; - case 0x4e8c: return 2; - case 0x4e09: return 3; - case 0x56d8: return 4; - case 0x4e94: return 5; - case 0x516d: return 6; - case 0x4e03: return 7; - case 0x516b: return 8; - case 0x4e5d: return 9; - default: return -1; - } +// Returns the UCHAR_EAST_ASIAN_WIDTH property value of the specified code +// point, converted to unicode_east_asian_width. +unicode_east_asian_width 
+BUnicodeChar::EastAsianWidth(uint32 c) +{ + // NOTE(review): the cast assumes the unicode_east_asian_width enumerators + // match the values u_getIntPropertyValue() returns for this property; + // confirm against the enum declaration. + return (unicode_east_asian_width)u_getIntPropertyValue(c, + UCHAR_EAST_ASIAN_WIDTH); } +// Encodes the code point c as UTF-8 at *out and advances *out past the +// bytes that were written. U8_APPEND_UNSAFE does no validation or bounds +// checking (see the ICU documentation), so the caller must pass a valid +// code point and a buffer with enough room. void BUnicodeChar::ToUTF8(uint32 c, char **out) { - char *s = *out; - - if (c < 0x80) - *(s++) = c; - else if (c < 0x800) { - *(s++) = 0xc0 | (c >> 6); - *(s++) = 0x80 | (c & 0x3f); - } else if (c < 0x10000) { - *(s++) = 0xe0 | (c >> 12); - *(s++) = 0x80 | ((c >> 6) & 0x3f); - *(s++) = 0x80 | (c & 0x3f); - } else if (c <= 0x10ffff) { - *(s++) = 0xf0 | (c >> 18); - *(s++) = 0x80 | ((c >> 12) & 0x3f); - *(s++) = 0x80 | ((c >> 6) & 0x3f); - *(s++) = 0x80 | (c & 0x3f); - } - *out = s; + int i = 0; + U8_APPEND_UNSAFE(*out, i, c); + // The macro only advances the index i; move the caller's pointer as the + // previous implementation did. + *out += i; } -uint32 +// Decodes the UTF-8 sequence at *in, advances *in past it and returns the +// decoded code point. Returns 0 without moving *in when *in is NULL. +// U8_NEXT_UNSAFE does not validate its input (see the ICU documentation), +// so the string is expected to be well-formed UTF-8. +uint32 BUnicodeChar::FromUTF8(const char **in) { - uint8 *bytes = (uint8 *)*in; - if (bytes == NULL) - return 0; - - int32 length; - uint8 mask = 0x1f; - - switch (bytes[0] & 0xf0) { - case 0xc0: - case 0xd0: length = 2; break; - case 0xe0: length = 3; break; - case 0xf0: - mask = 0x0f; - length = 4; - break; - default: - // valid 1-byte character - // and invalid characters - (*in)++; - return bytes[0]; - } - uint32 c = bytes[0] & mask; - int32 i = 1; - for (;i < length && (bytes[i] & 0x80) > 0;i++) - c = (c << 6) | (bytes[i] & 0x3f); - - if (i < length) { - // invalid character - (*in)++; - return (uint32)bytes[0]; - } - *in += length; + // Keep the previous contract: a NULL string yields 0 and is left untouched. + if (*in == NULL) + return 0; + int i = 0; + uint32 c = 0; + // U8_NEXT_UNSAFE, unlike U8_GET_UNSAFE, advances i past the decoded + // sequence, which is what lets *in move forward below. + U8_NEXT_UNSAFE(*in, i, c); + *in += i; return c; } + size_t BUnicodeChar::UTF8StringLength(const char *str) { @@ -623,6 +269,7 @@ BUnicodeChar::UTF8StringLength(const char *str) return len; } + size_t BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength) { @@ -633,4 +280,3 @@ BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength) } return len; } -