Fix wide char handling in wide string literal

This commit fixes a bug where TCC cast each byte of a wide string literal directly to wchar_t, which is wrong when the literal contains real wide (non-ASCII) characters. The fix assumes that the input charset is UTF-8 and that wchar_t stores wide characters as UTF-16 (on Windows) or UTF-32 (on other platforms). The UTF-8 decoder is written according to The Unicode Standard, Version 10.0.
2025-01-01 17:44:26 +03:00 · 2017-09-09 20:37:43 +08:00 · 2017-09-09 20:37:43 +08:00 · a82c11f4b4
commit a82c11f4b4
parent b8fe8fc210
1 changed files with 67 additions and 1 deletions
--- a/tccpp.c
+++ b/tccpp.c
@ -2105,13 +2105,79 @@ static void parse_escape_string(CString *outstr, const uint8_t *buf, int is_long
                    tcc_warning("unknown escape sequence: \'\\x%x\'", c);
                break;
            }
+        } else if (is_long && c >= 0x80) {
+            /* assume we are processing UTF-8 sequence */
+            /* reference: The Unicode Standard, Version 10.0, ch3.9 */
+
+            int cont; /* count of continuation bytes */
+            int skip; /* how many bytes should skip when error occured */
+            int i;
+
+            /* decode leading byte */
+            if (c < 0xC2) {
+	            skip = 1; goto invalid_utf8_sequence;
+            } else if (c <= 0xDF) {
+	            cont = 1; n = c & 0x1f;
+            } else if (c <= 0xEF) {
+	            cont = 2; n = c & 0xf;
+            } else if (c <= 0xF4) {
+	            cont = 3; n = c & 0x7;
+            } else {
+	            skip = 1; goto invalid_utf8_sequence;
+            }
+
+            /* decode continuation bytes */
+            for (i = 1; i <= cont; i++) {
+                int l = 0x80, h = 0xBF;
+
+                /* adjust limit for second byte */
+                if (i == 1) {
+                    switch (c) {
+                    case 0xE0: l = 0xA0; break;
+                    case 0xED: h = 0x9F; break;
+                    case 0xF0: l = 0x90; break;
+                    case 0xF4: h = 0x8F; break;
+                    }
+                }
+
+                if (p[i] < l || p[i] > h) {
+                    skip = i; goto invalid_utf8_sequence;
+                }
+
+                n = (n << 6) | (p[i] & 0x3f);
+            }
+
+            /* advance pointer */
+            p += 1 + cont;
+            c = n;
+            goto add_char_nonext;
+
+            /* error handling */
+        invalid_utf8_sequence:
+            tcc_warning("ill-formed UTF-8 subsequence starting with: \'\\x%x\'", c);
+            c = 0xFFFD;
+            p += skip;
+            goto add_char_nonext;
+
        }
        p++;
    add_char_nonext:
        if (!is_long)
            cstr_ccat(outstr, c);
-        else
+        else {
+#ifdef TCC_TARGET_PE
+            /* store as UTF-16 */
+            if (c < 0x10000) {
+                cstr_wccat(outstr, c);
+            } else {
+                c -= 0x10000;
+                cstr_wccat(outstr, (c >> 10) + 0xD800);
+                cstr_wccat(outstr, (c & 0x3FF) + 0xDC00);
+            }
+#else
            cstr_wccat(outstr, c);
+#endif
+        }
    }
    /* add a trailing '\0' */
    if (!is_long)