Switch BUnicodeChar to wrap the ICU's UChar32 one

Improve the Unicode character processing and classifying routines by
wrapping the UChar32 procedures from ICU. That fixes the functional
regression introduced in hrev38017 and makes it possible to fix the East
Asian Width problems in the Terminal.
This commit is contained in:
Siarzhuk Zharski 2013-04-02 07:47:26 +02:00
parent 4e17bdd83f
commit b6fd91b409
2 changed files with 413 additions and 628 deletions

View File

@ -8,6 +8,7 @@ enum unicode_char_category
// Non-category for unassigned and non-character code points.
B_UNICODE_UNASSIGNED = 0,
B_UNICODE_GENERAL_OTHER_TYPES = 0, // Cn
B_UNICODE_UPPERCASE_LETTER = 1, // Lu
B_UNICODE_LOWERCASE_LETTER = 2, // Ll
B_UNICODE_TITLECASE_LETTER = 3, // Lt
@ -37,15 +38,12 @@ enum unicode_char_category
B_UNICODE_OTHER_SYMBOL = 27, // So
B_UNICODE_INITIAL_PUNCTUATION = 28, // Pi
B_UNICODE_FINAL_PUNCTUATION = 29, // Pf
B_UNICODE_GENERAL_OTHER_TYPES = 30, // Cn
B_UNICODE_CATEGORY_COUNT
};
/**
* This specifies the language directional property of a character set.
*/
// This specifies the language directional property of a character set.
enum unicode_char_direction {
B_UNICODE_LEFT_TO_RIGHT = 0,
@ -72,117 +70,257 @@ enum unicode_char_direction {
};
// Script range as defined in the Unicode standard.
//
// Every enumerator carries an explicit value; the bracketed hexadecimal
// number in each trailing comment is the first code point of that block.
// NOTE(review): the numbering looks like it mirrors ICU's UBlockCode so that
// values can be passed through unchanged - confirm against <unicode/uchar.h>.
enum unicode_char_script {
	// New No_Block value in Unicode 4.
	B_UNICODE_NO_BLOCK = 0, // [none] Special range
	B_UNICODE_BASIC_LATIN = 1, // [0000]
	B_UNICODE_LATIN_1_SUPPLEMENT = 2, // [0080]
	B_UNICODE_LATIN_EXTENDED_A = 3, // [0100]
	B_UNICODE_LATIN_EXTENDED_B = 4, // [0180]
	B_UNICODE_IPA_EXTENSIONS = 5, // [0250]
	B_UNICODE_SPACING_MODIFIER_LETTERS = 6, // [02B0]
	B_UNICODE_COMBINING_DIACRITICAL_MARKS = 7, // [0300]
	B_UNICODE_GREEK = 8, // [0370]
	B_UNICODE_CYRILLIC = 9, // [0400]
	B_UNICODE_ARMENIAN = 10, // [0530]
	B_UNICODE_HEBREW = 11, // [0590]
	B_UNICODE_ARABIC = 12, // [0600]
	B_UNICODE_SYRIAC = 13, // [0700]
	B_UNICODE_THAANA = 14, // [0780]
	B_UNICODE_DEVANAGARI = 15, // [0900]
	B_UNICODE_BENGALI = 16, // [0980]
	B_UNICODE_GURMUKHI = 17, // [0A00]
	B_UNICODE_GUJARATI = 18, // [0A80]
	B_UNICODE_ORIYA = 19, // [0B00]
	B_UNICODE_TAMIL = 20, // [0B80]
	B_UNICODE_TELUGU = 21, // [0C00]
	B_UNICODE_KANNADA = 22, // [0C80]
	B_UNICODE_MALAYALAM = 23, // [0D00]
	B_UNICODE_SINHALA = 24, // [0D80]
	B_UNICODE_THAI = 25, // [0E00]
	B_UNICODE_LAO = 26, // [0E80]
	B_UNICODE_TIBETAN = 27, // [0F00]
	B_UNICODE_MYANMAR = 28, // [1000]
	B_UNICODE_GEORGIAN = 29, // [10A0]
	B_UNICODE_HANGUL_JAMO = 30, // [1100]
	B_UNICODE_ETHIOPIC = 31, // [1200]
	B_UNICODE_CHEROKEE = 32, // [13A0]
	B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, // [1400]
	B_UNICODE_OGHAM = 34, // [1680]
	B_UNICODE_RUNIC = 35, // [16A0]
	B_UNICODE_KHMER = 36, // [1780]
	B_UNICODE_MONGOLIAN = 37, // [1800]
	B_UNICODE_LATIN_EXTENDED_ADDITIONAL = 38, // [1E00]
	B_UNICODE_GREEK_EXTENDED = 39, // [1F00]
	B_UNICODE_GENERAL_PUNCTUATION = 40, // [2000]
	B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS = 41, // [2070]
	B_UNICODE_CURRENCY_SYMBOLS = 42, // [20A0]
	B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS = 43, // [20D0]
	B_UNICODE_LETTERLIKE_SYMBOLS = 44, // [2100]
	B_UNICODE_NUMBER_FORMS = 45, // [2150]
	B_UNICODE_ARROWS = 46, // [2190]
	B_UNICODE_MATHEMATICAL_OPERATORS = 47, // [2200]
	B_UNICODE_MISCELLANEOUS_TECHNICAL = 48, // [2300]
	B_UNICODE_CONTROL_PICTURES = 49, // [2400]
	B_UNICODE_OPTICAL_CHARACTER_RECOGNITION = 50, // [2440]
	B_UNICODE_ENCLOSED_ALPHANUMERICS = 51, // [2460]
	B_UNICODE_BOX_DRAWING = 52, // [2500]
	B_UNICODE_BLOCK_ELEMENTS = 53, // [2580]
	B_UNICODE_GEOMETRIC_SHAPES = 54, // [25A0]
	B_UNICODE_MISCELLANEOUS_SYMBOLS = 55, // [2600]
	B_UNICODE_DINGBATS = 56, // [2700]
	B_UNICODE_BRAILLE_PATTERNS = 57, // [2800]
	B_UNICODE_CJK_RADICALS_SUPPLEMENT = 58, // [2E80]
	B_UNICODE_KANGXI_RADICALS = 59, // [2F00]
	B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, // [2FF0]
	B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION = 61, // [3000]
	B_UNICODE_HIRAGANA = 62, // [3040]
	B_UNICODE_KATAKANA = 63, // [30A0]
	B_UNICODE_BOPOMOFO = 64, // [3100]
	B_UNICODE_HANGUL_COMPATIBILITY_JAMO = 65, // [3130]
	B_UNICODE_KANBUN = 66, // [3190]
	B_UNICODE_BOPOMOFO_EXTENDED = 67, // [31A0]
	B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, // [3200]
	B_UNICODE_CJK_COMPATIBILITY = 69, // [3300]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, // [3400]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS = 71, // [4E00]
	B_UNICODE_YI_SYLLABLES = 72, // [A000]
	B_UNICODE_YI_RADICALS = 73, // [A490]
	B_UNICODE_HANGUL_SYLLABLES = 74, // [AC00]
	B_UNICODE_HIGH_SURROGATES = 75, // [D800]
	B_UNICODE_HIGH_PRIVATE_USE_SURROGATES = 76, // [DB80]
	B_UNICODE_LOW_SURROGATES = 77, // [DC00]
	B_UNICODE_PRIVATE_USE = 78,
	B_UNICODE_PRIVATE_USE_AREA = B_UNICODE_PRIVATE_USE, // [E000]
	B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS = 79, // [F900]
	B_UNICODE_ALPHABETIC_PRESENTATION_FORMS = 80, // [FB00]
	B_UNICODE_ARABIC_PRESENTATION_FORMS_A = 81, // [FB50]
	B_UNICODE_COMBINING_HALF_MARKS = 82, // [FE20]
	B_UNICODE_CJK_COMPATIBILITY_FORMS = 83, // [FE30]
	B_UNICODE_SMALL_FORM_VARIANTS = 84, // [FE50]
	B_UNICODE_ARABIC_PRESENTATION_FORMS_B = 85, // [FE70]
	B_UNICODE_SPECIALS = 86, // [FFF0]
	B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS = 87, // [FF00]

	// New blocks in Unicode 3.1
	B_UNICODE_OLD_ITALIC = 88, // [10300]
	B_UNICODE_GOTHIC = 89, // [10330]
	B_UNICODE_DESERET = 90, // [10400]
	B_UNICODE_BYZANTINE_MUSICAL_SYMBOLS = 91, // [1D000]
	B_UNICODE_MUSICAL_SYMBOLS = 92, // [1D100]
	B_UNICODE_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, // [1D400]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94, // [20000]
	B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, // [2F800]
	B_UNICODE_TAGS = 96, // [E0000]

	// New blocks in Unicode 3.2
	B_UNICODE_CYRILLIC_SUPPLEMENTARY = 97,
	B_UNICODE_CYRILLIC_SUPPLEMENT = B_UNICODE_CYRILLIC_SUPPLEMENTARY, // [0500]
	B_UNICODE_TAGALOG = 98, // [1700]
	B_UNICODE_HANUNOO = 99, // [1720]
	B_UNICODE_BUHID = 100, // [1740]
	B_UNICODE_TAGBANWA = 101, // [1760]
	B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102, // [27C0]
	B_UNICODE_SUPPLEMENTAL_ARROWS_A = 103, // [27F0]
	B_UNICODE_SUPPLEMENTAL_ARROWS_B = 104, // [2900]
	B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105, // [2980]
	B_UNICODE_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106, // [2A00]
	B_UNICODE_KATAKANA_PHONETIC_EXTENSIONS = 107, // [31F0]
	B_UNICODE_VARIATION_SELECTORS = 108, // [FE00]
	B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109, // [F0000]
	B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110, // [100000]

	// New blocks in Unicode 4
	B_UNICODE_LIMBU = 111, // [1900]
	B_UNICODE_TAI_LE = 112, // [1950]
	B_UNICODE_KHMER_SYMBOLS = 113, // [19E0]
	B_UNICODE_PHONETIC_EXTENSIONS = 114, // [1D00]
	B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115, // [2B00]
	B_UNICODE_YIJING_HEXAGRAM_SYMBOLS = 116, // [4DC0]
	B_UNICODE_LINEAR_B_SYLLABARY = 117, // [10000]
	B_UNICODE_LINEAR_B_IDEOGRAMS = 118, // [10080]
	B_UNICODE_AEGEAN_NUMBERS = 119, // [10100]
	B_UNICODE_UGARITIC = 120, // [10380]
	B_UNICODE_SHAVIAN = 121, // [10450]
	B_UNICODE_OSMANYA = 122, // [10480]
	B_UNICODE_CYPRIOT_SYLLABARY = 123, // [10800]
	B_UNICODE_TAI_XUAN_JING_SYMBOLS = 124, // [1D300]
	B_UNICODE_VARIATION_SELECTORS_SUPPLEMENT = 125, // [E0100]

	// New blocks in Unicode 4.1
	B_UNICODE_ANCIENT_GREEK_MUSICAL_NOTATION = 126, // [1D200]
	B_UNICODE_ANCIENT_GREEK_NUMBERS = 127, // [10140]
	B_UNICODE_ARABIC_SUPPLEMENT = 128, // [0750]
	B_UNICODE_BUGINESE = 129, // [1A00]
	B_UNICODE_CJK_STROKES = 130, // [31C0]
	B_UNICODE_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131, // [1DC0]
	B_UNICODE_COPTIC = 132, // [2C80]
	B_UNICODE_ETHIOPIC_EXTENDED = 133, // [2D80]
	B_UNICODE_ETHIOPIC_SUPPLEMENT = 134, // [1380]
	B_UNICODE_GEORGIAN_SUPPLEMENT = 135, // [2D00]
	B_UNICODE_GLAGOLITIC = 136, // [2C00]
	B_UNICODE_KHAROSHTHI = 137, // [10A00]
	B_UNICODE_MODIFIER_TONE_LETTERS = 138, // [A700]
	B_UNICODE_NEW_TAI_LUE = 139, // [1980]
	B_UNICODE_OLD_PERSIAN = 140, // [103A0]
	B_UNICODE_PHONETIC_EXTENSIONS_SUPPLEMENT = 141, // [1D80]
	B_UNICODE_SUPPLEMENTAL_PUNCTUATION = 142, // [2E00]
	B_UNICODE_SYLOTI_NAGRI = 143, // [A800]
	B_UNICODE_TIFINAGH = 144, // [2D30]
	B_UNICODE_VERTICAL_FORMS = 145, // [FE10]

	// New blocks in Unicode 5.0
	B_UNICODE_NKO = 146, // [07C0]
	B_UNICODE_BALINESE = 147, // [1B00]
	B_UNICODE_LATIN_EXTENDED_C = 148, // [2C60]
	B_UNICODE_LATIN_EXTENDED_D = 149, // [A720]
	B_UNICODE_PHAGS_PA = 150, // [A840]
	B_UNICODE_PHOENICIAN = 151, // [10900]
	B_UNICODE_CUNEIFORM = 152, // [12000]
	B_UNICODE_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153, // [12400]
	B_UNICODE_COUNTING_ROD_NUMERALS = 154, // [1D360]

	// New blocks in Unicode 5.1
	B_UNICODE_SUNDANESE = 155, // [1B80]
	B_UNICODE_LEPCHA = 156, // [1C00]
	B_UNICODE_OL_CHIKI = 157, // [1C50]
	B_UNICODE_CYRILLIC_EXTENDED_A = 158, // [2DE0]
	B_UNICODE_VAI = 159, // [A500]
	B_UNICODE_CYRILLIC_EXTENDED_B = 160, // [A640]
	B_UNICODE_SAURASHTRA = 161, // [A880]
	B_UNICODE_KAYAH_LI = 162, // [A900]
	B_UNICODE_REJANG = 163, // [A930]
	B_UNICODE_CHAM = 164, // [AA00]
	B_UNICODE_ANCIENT_SYMBOLS = 165, // [10190]
	B_UNICODE_PHAISTOS_DISC = 166, // [101D0]
	B_UNICODE_LYCIAN = 167, // [10280]
	B_UNICODE_CARIAN = 168, // [102A0]
	B_UNICODE_LYDIAN = 169, // [10920]
	B_UNICODE_MAHJONG_TILES = 170, // [1F000]
	B_UNICODE_DOMINO_TILES = 171, // [1F030]

	// New blocks in Unicode 5.2
	B_UNICODE_SAMARITAN = 172, // [0800]
	B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173, // [18B0]
	B_UNICODE_TAI_THAM = 174, // [1A20]
	B_UNICODE_VEDIC_EXTENSIONS = 175, // [1CD0]
	B_UNICODE_LISU = 176, // [A4D0]
	B_UNICODE_BAMUM = 177, // [A6A0]
	B_UNICODE_COMMON_INDIC_NUMBER_FORMS = 178, // [A830]
	B_UNICODE_DEVANAGARI_EXTENDED = 179, // [A8E0]
	B_UNICODE_HANGUL_JAMO_EXTENDED_A = 180, // [A960]
	B_UNICODE_JAVANESE = 181, // [A980]
	B_UNICODE_MYANMAR_EXTENDED_A = 182, // [AA60]
	B_UNICODE_TAI_VIET = 183, // [AA80]
	B_UNICODE_MEETEI_MAYEK = 184, // [ABC0]
	B_UNICODE_HANGUL_JAMO_EXTENDED_B = 185, // [D7B0]
	B_UNICODE_IMPERIAL_ARAMAIC = 186, // [10840]
	B_UNICODE_OLD_SOUTH_ARABIAN = 187, // [10A60]
	B_UNICODE_AVESTAN = 188, // [10B00]
	B_UNICODE_INSCRIPTIONAL_PARTHIAN = 189, // [10B40]
	B_UNICODE_INSCRIPTIONAL_PAHLAVI = 190, // [10B60]
	B_UNICODE_OLD_TURKIC = 191, // [10C00]
	B_UNICODE_RUMI_NUMERAL_SYMBOLS = 192, // [10E60]
	B_UNICODE_KAITHI = 193, // [11080]
	B_UNICODE_EGYPTIAN_HIEROGLYPHS = 194, // [13000]
	B_UNICODE_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195, // [1F100]
	B_UNICODE_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196, // [1F200]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197, // [2A700]

	// New blocks in Unicode 6.0
	B_UNICODE_MANDAIC = 198, // [0840]
	B_UNICODE_BATAK = 199, // [1BC0]
	B_UNICODE_ETHIOPIC_EXTENDED_A = 200, // [AB00]
	B_UNICODE_BRAHMI = 201, // [11000]
	B_UNICODE_BAMUM_SUPPLEMENT = 202, // [16800]
	B_UNICODE_KANA_SUPPLEMENT = 203, // [1B000]
	B_UNICODE_PLAYING_CARDS = 204, // [1F0A0]
	B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205, // [1F300]
	B_UNICODE_EMOTICONS = 206, // [1F600]
	B_UNICODE_TRANSPORT_AND_MAP_SYMBOLS = 207, // [1F680]
	B_UNICODE_ALCHEMICAL_SYMBOLS = 208, // [1F700]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209, // [2B740]

	// One past the highest block value; also used as the "no script" marker.
	B_UNICODE_SCRIPT_COUNT = 210,
	B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT,

	B_UNICODE_INVALID_CODE = -1
};
// Values returned by the u_getCellWidth() function.
// NOTE(review): superseded by unicode_east_asian_width below; kept so that
// existing users of these names still compile - confirm it can be dropped.
enum unicode_cell_width
{
	B_UNICODE_ZERO_WIDTH = 0,
	B_UNICODE_HALF_WIDTH = 1,
	B_UNICODE_FULL_WIDTH = 2,
	B_UNICODE_NEUTRAL_WIDTH = 3,
	B_UNICODE_CELL_WIDTH_COUNT
};


// East Asian Width constants. The bracketed abbreviation is the property
// value name. BUnicodeChar::EastAsianWidth() casts the integer it gets from
// ICU directly to this type, so the values are spelled out explicitly.
enum unicode_east_asian_width
{
	B_UNICODE_EA_NEUTRAL = 0, // [N]
	B_UNICODE_EA_AMBIGUOUS = 1, // [A]
	B_UNICODE_EA_HALFWIDTH = 2, // [H]
	B_UNICODE_EA_FULLWIDTH = 3, // [F]
	B_UNICODE_EA_NARROW = 4, // [Na]
	B_UNICODE_EA_WIDE = 5, // [W]
	B_UNICODE_EA_COUNT
};
@ -209,6 +347,7 @@ class BUnicodeChar {
static uint32 ToUpper(uint32 c);
static uint32 ToTitle(uint32 c);
static int32 DigitValue(uint32 c);
static unicode_east_asian_width EastAsianWidth(uint32 c);
static void ToUTF8(uint32 c, char **out);
static uint32 FromUTF8(const char **in);
@ -230,4 +369,4 @@ BUnicodeChar::FromUTF8(const char *in)
}
#endif /* _UNICODE_CHAR_H_ */
#endif // _UNICODE_CHAR_H_

View File

@ -1,234 +1,18 @@
/*
** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
** Distributed under the terms of the OpenBeOS License.
* Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
* Distributed under the terms of the MIT License.
*
* Authors:
* Axel Dörfler, axeld@pinc-software.de
* Siarzhuk Zharski, zharik@gmx.li
*
*/
/* Reads the information out of the data files created by (an edited version of)
* IBM's ICU genprops utility. The BUnicodeChar class is mostly the counterpart
* to ICU's uchar module, but is not as huge or broad as that one.
*
* Note, it probably won't be able to handle the output of the orginal genprops
* tool and vice versa - only use the tool provided with this project to create
* the Unicode property file.
* However, the algorithmic idea behind the property file is still the same as
* found in ICU - nothing important has been changed, so more recent versions
* of genprops tool/data can probably be ported without too much effort.
*
* In case no property file can be found it will still provide basic services
* for the Latin-1 part of the character tables.
*/
#include <OS.h>
#include <UnicodeChar.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Builds a 32-bit mask that has only bit number n set.
#define FLAG(n) ((uint32)1 << (n))
// Single-bit masks for the letter and number general categories. The bit
// position of each mask is the matching unicode_char_category value, so a
// category can be tested against several of them at once with
// FLAG(category) & (UF_A | UF_B | ...).
enum {
UF_UPPERCASE = FLAG(B_UNICODE_UPPERCASE_LETTER),
UF_LOWERCASE = FLAG(B_UNICODE_LOWERCASE_LETTER),
UF_TITLECASE = FLAG(B_UNICODE_TITLECASE_LETTER),
UF_MODIFIER_LETTER = FLAG(B_UNICODE_MODIFIER_LETTER),
UF_OTHER_LETTER = FLAG(B_UNICODE_OTHER_LETTER),
UF_DECIMAL_NUMBER = FLAG(B_UNICODE_DECIMAL_DIGIT_NUMBER),
UF_OTHER_NUMBER = FLAG(B_UNICODE_OTHER_NUMBER),
UF_LETTER_NUMBER = FLAG(B_UNICODE_LETTER_NUMBER)
};
// Built-in property words for the code points U+0000..U+009F, indexed by
// code point (four entries per row; the leading comment gives the index of
// the first entry in the row). getProperties() returns 0 for anything
// outside this range.
// Layout of a property word as used by the accessors in this file:
//  - bits 0..4: general category (getCategory())
//  - bit 5: exception flag (propertyIsException())
//  - bits 20..31: value field (getSignedValue()/getUnsignedValue())
static uint32 gStaticProps32Table[] = {
/* 0x00 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x04 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x08 */ 0x48f, 0x20c, 0x1ce, 0x20c,
/* 0x0c */ 0x24d, 0x1ce, 0x48f, 0x48f,
/* 0x10 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x14 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x18 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x1c */ 0x1ce, 0x1ce, 0x1ce, 0x20c,
/* 0x20 */ 0x24c, 0x297, 0x297, 0x117,
/* 0x24 */ 0x119, 0x117, 0x297, 0x297,
/* 0x28 */ 0x100a94, 0xfff00a95, 0x297, 0x118,
/* 0x2c */ 0x197, 0x113, 0x197, 0xd7,
/* 0x30 */ 0x89, 0x100089, 0x200089, 0x300089,
/* 0x34 */ 0x400089, 0x500089, 0x600089, 0x700089,
/* 0x38 */ 0x800089, 0x900089, 0x197, 0x297,
/* 0x3c */ 0x200a98, 0x298, 0xffe00a98, 0x297,
/* 0x40 */ 0x297, 0x2000001, 0x2000001, 0x2000001,
/* 0x44 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x48 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x4c */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x50 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x54 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001,
/* 0x58 */ 0x2000001, 0x2000001, 0x2000001, 0x200a94,
/* 0x5c */ 0x297, 0xffe00a95, 0x29a, 0x296,
/* 0x60 */ 0x29a, 0x2000002, 0x2000002, 0x2000002,
/* 0x64 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x68 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x6c */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x70 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x74 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002,
/* 0x78 */ 0x2000002, 0x2000002, 0x2000002, 0x200a94,
/* 0x7c */ 0x298, 0xffe00a95, 0x298, 0x48f,
/* 0x80 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x84 */ 0x48f, 0x1ce, 0x48f, 0x48f,
/* 0x88 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x8c */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x90 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x94 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x98 */ 0x48f, 0x48f, 0x48f, 0x48f,
/* 0x9c */ 0x48f, 0x48f, 0x48f, 0x48f
};
// Slot numbers of the entries in the property data's index array.
enum {
	INDEX_STAGE_2_BITS = 0,
	INDEX_STAGE_3_BITS = 1,
	INDEX_EXCEPTIONS = 2,
	INDEX_STAGE_3_INDEX = 3,
	INDEX_PROPS = 4,
	INDEX_UCHARS = 5
};
/* constants and macros for access to the data */
// Bit numbers inside the first word of an exception record; a set bit means
// that the corresponding value is present (see haveExceptionValue()).
enum {
	EXC_UPPERCASE = 0,
	EXC_LOWERCASE = 1,
	EXC_TITLECASE = 2,
	EXC_DIGIT_VALUE = 3,
	EXC_NUMERIC_VALUE = 4,
	EXC_DENOMINATOR_VALUE = 5,
	EXC_MIRROR_MAPPING = 6,
	EXC_SPECIAL_CASING = 7,
	EXC_CASE_FOLDING = 8
};
// Bit positions of the fields packed into a 32-bit property word.
enum {
	EXCEPTION_SHIFT = 5,
	BIDI_SHIFT = EXCEPTION_SHIFT + 1,
	MIRROR_SHIFT = BIDI_SHIFT + 5,
	VALUE_SHIFT = 20,
	// width of the value field that occupies the top of the word
	VALUE_BITS = 32 - VALUE_SHIFT
};
/* number of bits in an 8-bit integer value */
#define EXC_GROUP 8
// Bit-count lookup table: gFlagsOffset[b] is the number of set bits in the
// byte value b. addExceptionOffset() uses it to count how many exception
// values are announced by a group of flag bits.
static uint8 gFlagsOffset[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
// Bit distribution used to split a code point into trie stage indices.
// With UCHAR_VARIABLE_TRIE_BITS defined the values are meant to come from the
// data's index array, otherwise they are compile-time constants.
// NOTE(review): the variable branch declares stage23Bits/stage2Mask/stage3Mask
// while the fixed branch defines sStage23Bits/sStage2Mask/sStage3Mask, and it
// refers to an "indexes" array that is not declared in the visible code -
// confirm whether the variable branch is still expected to build.
#ifdef UCHAR_VARIABLE_TRIE_BITS
// access values calculated from indices
static uint16_t stage23Bits, stage2Mask, stage3Mask;
# define sStage3Bits indexes[INDEX_STAGE_3_BITS]
#else
// Use hardcoded bit distribution for the trie table access
# define sStage23Bits 10
# define sStage2Mask 0x3f
# define sStage3Mask 0xf
# define sStage3Bits 4
#endif
// Tells whether c is an ISO 8-bit control code, i.e. lies in U+0000..U+001F
// or U+007F..U+009F.
// The character category has to be overridden for these, since the genprops
// utility taken from IBM's ICU apparently changes it for some characters.
static inline bool
isISO8Control(uint32 c)
{
	if (c < 0x20)
		return true;

	return c >= 0x7f && c <= 0x9f;
}
// Returns the packed property word for c, or 0 when none is available.
// Only U+0000..U+009F are backed by gStaticProps32Table; every other value
// (including those beyond U+10FFFF) yields 0.
static inline uint32
getProperties(uint32 c)
{
	// TODO : Data from unicode
	const uint32 kLastStaticCodePoint = 0x9f;
	if (c > kLastStaticCodePoint)
		return 0;

	return gStaticProps32Table[c];
}
// Extracts the general category, stored in the low five bits of a property
// word.
static inline uint8
getCategory(uint32 properties)
{
	const uint32 kCategoryMask = 0x1f;
	return (uint8)(properties & kCategoryMask);
}
// Tells whether the exception bit (bit EXCEPTION_SHIFT) of a property word
// is set.
static inline bool
propertyIsException(uint32 properties)
{
	const unsigned long exceptionBit = 1UL << EXCEPTION_SHIFT;
	return (properties & exceptionBit) != 0;
}
// Returns the value field (bits VALUE_SHIFT and up) of a property word,
// zero-extended.
static inline uint32
getUnsignedValue(uint32 properties)
{
	const uint32 value = properties >> VALUE_SHIFT;
	return value;
}
// Returns the value field (bits VALUE_SHIFT and up) of a property word,
// shifted as a signed quantity and then converted back to uint32.
static inline uint32
getSignedValue(uint32 properties)
{
	const int32 signedProperties = (int32)properties;
	return signedProperties >> VALUE_SHIFT;
}
// Placeholder for the exception record lookup of a property word.
// No exception data is wired up yet, so the result is always a null pointer
// and the argument is ignored.
static inline uint32 *
getExceptions(uint32 properties)
{
	// TODO : data from unicode
	uint32 *noExceptions = 0;
	return noExceptions;
}
// Tells whether bit number "index" is set in an exception flags word.
static inline bool
haveExceptionValue(uint32 flags,int16 index)
{
	const unsigned long valueBit = 1UL << index;
	return (flags & valueBit) != 0;
}
// Advances *offset by the number of flag bits that are set below bit "index".
// When index reaches into the second group of EXC_GROUP bits, all set bits of
// the first group are counted first; flags and index are then rebased onto
// the second group, and both are modified for the caller as well.
static inline void
addExceptionOffset(uint32 &flags, int16 &index, uint32 **offset)
{
	uint32 *cursor = *offset;

	if (index >= EXC_GROUP) {
		const uint32 firstGroup = flags & ((1 << EXC_GROUP) - 1);
		cursor += gFlagsOffset[firstGroup];
		flags >>= EXC_GROUP;
		index -= EXC_GROUP;
	}

	const uint32 bitsBelowIndex = flags & ((1 << index) - 1);
	cursor += gFlagsOffset[bitsBelowIndex];

	*offset = cursor;
}
// #pragma mark -
#include <unicode/uchar.h>
#include <unicode/utf8.h>
BUnicodeChar::BUnicodeChar()
@ -236,274 +20,207 @@ BUnicodeChar::BUnicodeChar()
}
bool
BUnicodeChar::IsAlpha(uint32 c)
{
BUnicodeChar();
return (FLAG(getCategory(getProperties(c)))
& (UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
) != 0;
}
/** Returns the type code of the specified unicode character */
// Returns the general category value for the code point.
int8
BUnicodeChar::Type(uint32 c)
{
BUnicodeChar();
return (int8)getCategory(getProperties(c));
return u_charType(c);
}
// Determines whether the specified code point is a letter character.
// True for general categories "L" (letters).
bool
BUnicodeChar::IsLower(uint32 c)
BUnicodeChar::IsAlpha(uint32 c)
{
BUnicodeChar();
return getCategory(getProperties(c)) == B_UNICODE_LOWERCASE_LETTER;
}
bool
BUnicodeChar::IsUpper(uint32 c)
{
BUnicodeChar();
return getCategory(getProperties(c)) == B_UNICODE_UPPERCASE_LETTER;
}
bool
BUnicodeChar::IsTitle(uint32 c)
{
BUnicodeChar();
return getCategory(getProperties(c)) == B_UNICODE_TITLECASE_LETTER;
}
bool
BUnicodeChar::IsDigit(uint32 c)
{
BUnicodeChar();
return (FLAG(getCategory(getProperties(c)))
& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER)
) != 0;
return u_isalpha(c);
}
// Determines whether the specified code point is an alphanumeric character
// (letter or digit).
// True for characters with general categories
// "L" (letters) and "Nd" (decimal digit numbers).
// Delegates to ICU's u_isalnum() so that code points beyond the Latin-1
// range are classified as well.
bool
BUnicodeChar::IsAlNum(uint32 c)
{
	BUnicodeChar();
	return u_isalnum(c);
}
// Reports whether the code point carries the Lowercase Unicode property
// (UCHAR_LOWERCASE), as answered by u_isULowercase().
bool
BUnicodeChar::IsLower(uint32 c)
{
	BUnicodeChar();
	const bool hasLowercaseProperty = u_isULowercase(c) != 0;
	return hasLowercaseProperty;
}
// Reports whether the code point carries the Uppercase Unicode property
// (UCHAR_UPPERCASE), as answered by u_isUUppercase().
bool
BUnicodeChar::IsUpper(uint32 c)
{
	BUnicodeChar();
	const bool hasUppercaseProperty = u_isUUppercase(c) != 0;
	return hasUppercaseProperty;
}
// Reports whether the code point is a titlecase letter, i.e. has the general
// category "Lt", as answered by u_istitle().
bool
BUnicodeChar::IsTitle(uint32 c)
{
	BUnicodeChar();
	const bool isTitlecaseLetter = u_istitle(c) != 0;
	return isTitlecaseLetter;
}
// Reports whether the code point is a digit character, as answered by
// u_isdigit().
// That is the case for the general category "Nd" (decimal digit numbers);
// beginning with Unicode 4 this equals testing for the Numeric_Type Decimal.
bool
BUnicodeChar::IsDigit(uint32 c)
{
	BUnicodeChar();
	const bool isDecimalDigit = u_isdigit(c) != 0;
	return isDecimalDigit;
}
// Reports whether the code point is a hexadecimal digit, as answered by
// u_isxdigit(); this is equivalent to u_digit(c, 16) >= 0.
// Accepted are the characters with general category "Nd" (decimal digit
// numbers) plus the Latin letters a-f and A-F, both in ASCII and in
// Fullwidth ASCII (code points 0041..0046, 0061..0066, FF21..FF26,
// FF41..FF46).
bool
BUnicodeChar::IsHexDigit(uint32 c)
{
	BUnicodeChar();
	const bool isHexadecimalDigit = u_isxdigit(c) != 0;
	return isHexadecimalDigit;
}
// Determines whether the specified code point is "defined",
// which usually means that it is assigned a character.
// True for general categories other than "Cn" (other, not assigned),
// i.e., true for all code points mentioned in UnicodeData.txt.
// Delegates to ICU's u_isdefined(); the built-in property table would report
// every code point above U+009F as undefined.
bool
BUnicodeChar::IsDefined(uint32 c)
{
	BUnicodeChar();
	return u_isdefined(c);
}
// Determines whether the specified code point is a base character, i.e. a
// base form that can be used with a diacritic. This doesn't mean that the
// character has to be distinct, though.
// True for general categories "L" (letters), "N" (numbers),
// "Mc" (spacing combining marks), and "Me" (enclosing marks).
// Delegates to ICU's u_isbase() so that code points beyond the Latin-1 range
// are classified as well.
bool
BUnicodeChar::IsBase(uint32 c)
{
	BUnicodeChar();
	return u_isbase(c);
}
// Determines whether the specified code point is a control character
// (as defined by this function).
// A control character is one of the following:
// - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f)
// - U_CONTROL_CHAR (Cc)
// - U_FORMAT_CHAR (Cf)
// - U_LINE_SEPARATOR (Zl)
// - U_PARAGRAPH_SEPARATOR (Zp)
// Delegates to ICU's u_iscntrl() so that format characters and separators
// beyond the Latin-1 range are recognized as well.
bool
BUnicodeChar::IsControl(uint32 c)
{
	BUnicodeChar();
	return u_iscntrl(c);
}
// Determines whether the specified code point is a punctuation character.
// True for characters with general categories "P" (punctuation).
// Delegates to ICU's u_ispunct() so that punctuation beyond the Latin-1
// range is recognized as well.
bool
BUnicodeChar::IsPunctuation(uint32 c)
{
	BUnicodeChar();
	return u_ispunct(c);
}
// Determine if the specified code point is a space character according to Java.
// True for characters with general categories "Z" (separators),
// which does not include control codes (e.g., TAB or Line Feed).
// Delegates to ICU's u_isJavaSpaceChar() so that separators beyond the
// Latin-1 range are recognized as well.
bool
BUnicodeChar::IsSpace(uint32 c)
{
	BUnicodeChar();
	return u_isJavaSpaceChar(c);
}
// Determines if the specified code point is a whitespace character
// A character is considered to be a whitespace character if and only
// if it satisfies one of the following criteria:
// - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"),
//   but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space
//   or U+202F Narrow NBSP).
// - It is U+0009 HORIZONTAL TABULATION.
// - It is U+000A LINE FEED.
// - It is U+000B VERTICAL TABULATION.
// - It is U+000C FORM FEED.
// - It is U+000D CARRIAGE RETURN.
// - It is U+001C FILE SEPARATOR.
// - It is U+001D GROUP SEPARATOR.
// - It is U+001E RECORD SEPARATOR.
// - It is U+001F UNIT SEPARATOR.
// Delegates to ICU's u_isWhitespace(), which implements the list above.
bool
BUnicodeChar::IsWhitespace(uint32 c)
{
	BUnicodeChar();
	return u_isWhitespace(c);
}
// Determines whether the specified code point is a printable character.
// True for general categories other than "C" (controls).
// Delegates to ICU's u_isprint(); the built-in property table would treat
// every code point above U+009F as unassigned and thus as not printable.
bool
BUnicodeChar::IsPrintable(uint32 c)
{
	BUnicodeChar();
	return u_isprint(c);
}
// #pragma mark -
// Transforms the specified unicode character to lowercase.
// Uses ICU's u_tolower(); a character without a lowercase mapping is
// returned unchanged.
// The table-based lookup is gone on purpose: for property words flagged as
// exceptions it dereferenced the null pointer getExceptions() hands out.
uint32
BUnicodeChar::ToLower(uint32 c)
{
	BUnicodeChar();
	return u_tolower(c);
}
// Transforms the specified unicode character to uppercase.
// Uses ICU's u_toupper(); a character without an uppercase mapping is
// returned unchanged.
// The table-based lookup is gone on purpose: for property words flagged as
// exceptions it dereferenced the null pointer getExceptions() hands out.
uint32
BUnicodeChar::ToUpper(uint32 c)
{
	BUnicodeChar();
	return u_toupper(c);
}
// Transforms the specified unicode character to title case.
// Uses ICU's u_totitle(); a character without a titlecase mapping is
// returned unchanged.
// The table-based lookup is gone on purpose: for property words flagged as
// exceptions it dereferenced the null pointer getExceptions() hands out.
uint32
BUnicodeChar::ToTitle(uint32 c)
{
	BUnicodeChar();
	return u_totitle(c);
}
@ -511,107 +228,36 @@ int32
BUnicodeChar::DigitValue(uint32 c)
{
	// Returns the decimal digit value (0..9) of the code point, or -1 if it
	// has none.
	BUnicodeChar();

	// Ask ICU first; u_digit() yields a negative value for anything that is
	// not a digit in radix 10.
	const int32 value = u_digit(c, 10);
	if (value >= 0)
		return value;

	// Not a decimal digit for ICU: still recognize the ideographic digits
	// that have always been special-cased here.
	switch (c) {
		case 0x3007: return 0;
		case 0x4e00: return 1;
		case 0x4e8c: return 2;
		case 0x4e09: return 3;
		case 0x56d8: return 4;
		case 0x4e94: return 5;
		case 0x516d: return 6;
		case 0x4e03: return 7;
		case 0x516b: return 8;
		case 0x4e5d: return 9;
		default: return -1;
	}
}
// Returns the East Asian Width property of the code point: the integer value
// ICU reports for UCHAR_EAST_ASIAN_WIDTH, cast to unicode_east_asian_width.
unicode_east_asian_width
BUnicodeChar::EastAsianWidth(uint32 c)
{
	const int32 width = u_getIntPropertyValue(c, UCHAR_EAST_ASIAN_WIDTH);
	return (unicode_east_asian_width)width;
}
// Encodes the code point c as UTF-8 at *out and advances *out past the bytes
// written (1 to 4). No terminating null byte is added.
// The caller has to provide room for up to four bytes.
void
BUnicodeChar::ToUTF8(uint32 c, char **out)
{
	// U8_APPEND_UNSAFE does not validate its input, so values beyond the
	// Unicode range are rejected here: nothing is written and *out is left
	// untouched.
	if (c > 0x10ffff)
		return;

	int i = 0;
	U8_APPEND_UNSAFE(*out, i, c);

	// The macro only advances the local offset; hand the new position back.
	*out += i;
}
// Decodes the UTF-8 sequence at *in, advances *in past the bytes consumed and
// returns the code point. Returns 0 without touching *in when there is no
// input string.
// NOTE(review): U8_NEXT_UNSAFE assumes well-formed UTF-8 and performs no
// bounds checks - confirm that callers never pass truncated sequences.
uint32
BUnicodeChar::FromUTF8(const char **in)
{
	if (in == NULL || *in == NULL)
		return 0;

	int i = 0;
	uint32 c = 0;
	// U8_NEXT_UNSAFE (unlike U8_GET_UNSAFE) moves the offset past the
	// decoded sequence, which is what allows advancing *in below.
	U8_NEXT_UNSAFE(*in, i, c);
	*in += i;

	return c;
}
size_t
BUnicodeChar::UTF8StringLength(const char *str)
{
@ -623,6 +269,7 @@ BUnicodeChar::UTF8StringLength(const char *str)
return len;
}
size_t
BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength)
{
@ -633,4 +280,3 @@ BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength)
}
return len;
}