mirror of
https://github.com/frida/tinycc
synced 2025-01-01 17:44:26 +03:00
Fix wide char handling in wide string literal
This commit fixed the problem that TCC directly cast each byte in wide string literal to wchar_t, which is wrong when wide string literal contains real wide chars. It fixed the problem by assuming input charset is UTF-8, and wchar_t stores wide chars in UTF-16 (Windows) or UTF-32 (others). The UTF-8 decoder is coded according to The Unicode Standard Version 10.
This commit is contained in:
parent
b8fe8fc210
commit
a82c11f4b4
68
tccpp.c
68
tccpp.c
@ -2105,13 +2105,79 @@ static void parse_escape_string(CString *outstr, const uint8_t *buf, int is_long
|
||||
tcc_warning("unknown escape sequence: \'\\x%x\'", c);
|
||||
break;
|
||||
}
|
||||
} else if (is_long && c >= 0x80) {
|
||||
/* assume we are processing UTF-8 sequence */
|
||||
/* reference: The Unicode Standard, Version 10.0, ch3.9 */
|
||||
|
||||
int cont; /* count of continuation bytes */
|
||||
int skip; /* how many bytes should skip when error occured */
|
||||
int i;
|
||||
|
||||
/* decode leading byte */
|
||||
if (c < 0xC2) {
|
||||
skip = 1; goto invalid_utf8_sequence;
|
||||
} else if (c <= 0xDF) {
|
||||
cont = 1; n = c & 0x1f;
|
||||
} else if (c <= 0xEF) {
|
||||
cont = 2; n = c & 0xf;
|
||||
} else if (c <= 0xF4) {
|
||||
cont = 3; n = c & 0x7;
|
||||
} else {
|
||||
skip = 1; goto invalid_utf8_sequence;
|
||||
}
|
||||
|
||||
/* decode continuation bytes */
|
||||
for (i = 1; i <= cont; i++) {
|
||||
int l = 0x80, h = 0xBF;
|
||||
|
||||
/* adjust limit for second byte */
|
||||
if (i == 1) {
|
||||
switch (c) {
|
||||
case 0xE0: l = 0xA0; break;
|
||||
case 0xED: h = 0x9F; break;
|
||||
case 0xF0: l = 0x90; break;
|
||||
case 0xF4: h = 0x8F; break;
|
||||
}
|
||||
}
|
||||
|
||||
if (p[i] < l || p[i] > h) {
|
||||
skip = i; goto invalid_utf8_sequence;
|
||||
}
|
||||
|
||||
n = (n << 6) | (p[i] & 0x3f);
|
||||
}
|
||||
|
||||
/* advance pointer */
|
||||
p += 1 + cont;
|
||||
c = n;
|
||||
goto add_char_nonext;
|
||||
|
||||
/* error handling */
|
||||
invalid_utf8_sequence:
|
||||
tcc_warning("ill-formed UTF-8 subsequence starting with: \'\\x%x\'", c);
|
||||
c = 0xFFFD;
|
||||
p += skip;
|
||||
goto add_char_nonext;
|
||||
|
||||
}
|
||||
p++;
|
||||
add_char_nonext:
|
||||
if (!is_long)
|
||||
cstr_ccat(outstr, c);
|
||||
else
|
||||
else {
|
||||
#ifdef TCC_TARGET_PE
|
||||
/* store as UTF-16 */
|
||||
if (c < 0x10000) {
|
||||
cstr_wccat(outstr, c);
|
||||
} else {
|
||||
c -= 0x10000;
|
||||
cstr_wccat(outstr, (c >> 10) + 0xD800);
|
||||
cstr_wccat(outstr, (c & 0x3FF) + 0xDC00);
|
||||
}
|
||||
#else
|
||||
cstr_wccat(outstr, c);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
/* add a trailing '\0' */
|
||||
if (!is_long)
|
||||
|
Loading…
Reference in New Issue
Block a user