From 0463e552dcbfad1e4bb47135d2599cf4a1e4da4f Mon Sep 17 00:00:00 2001 From: matt335672 <30179339+matt335672@users.noreply.github.com> Date: Tue, 19 Sep 2023 09:19:17 +0100 Subject: [PATCH] Add UTF-8 / UTF-32 conversion routines These are intended to replace non-UTF-16 uses of mbstowcs() / wcstombs() --- common/Makefile.am | 1 + common/arch.h | 11 + common/string_calls.c | 304 ++++++++- common/string_calls.h | 81 +++ common/unicode_defines.h | 71 ++ configure.ac | 2 +- tests/common/Makefile.am | 1 + tests/common/UTF-8-test.txt | Bin 0 -> 22781 bytes tests/common/test_common.h | 1 + tests/common/test_common_main.c | 1 + tests/common/test_string_calls_unicode.c | 835 +++++++++++++++++++++++ xrdp/xrdp_font.c | 7 +- 12 files changed, 1308 insertions(+), 7 deletions(-) create mode 100644 common/unicode_defines.h create mode 100644 tests/common/UTF-8-test.txt create mode 100644 tests/common/test_string_calls_unicode.c diff --git a/common/Makefile.am b/common/Makefile.am index c929baab..a206a21f 100644 --- a/common/Makefile.am +++ b/common/Makefile.am @@ -68,6 +68,7 @@ libcommon_la_SOURCES = \ thread_calls.h \ trans.c \ trans.h \ + unicode_defines.h \ $(PIXMAN_SOURCES) libcommon_la_LIBADD = \ diff --git a/common/arch.h b/common/arch.h index 38e79d5c..e66b2479 100644 --- a/common/arch.h +++ b/common/arch.h @@ -46,6 +46,17 @@ typedef unsigned long uintptr_t; typedef int bool_t; +// Define Unicode character types +#if defined(HAVE_UCHAR_H) +#include +#elif defined(HAVE_STDINT_H) +typedef uint_least16_t char16_t; +typedef uint_least32_t char32_t; +#else +typedef uint16_t char16_t; +typedef uint32_t char32_t; +#endif + /* you can define L_ENDIAN or B_ENDIAN and NEED_ALIGN or NO_NEED_ALIGN in the makefile to override */ diff --git a/common/string_calls.c b/common/string_calls.c index 1ba35613..a702a8a4 100644 --- a/common/string_calls.c +++ b/common/string_calls.c @@ -27,11 +27,11 @@ #include #include - #include "log.h" #include "os_calls.h" #include "string_calls.h" #include 
"defines.h" +#include "unicode_defines.h" unsigned int g_format_info_string(char *dest, unsigned int len, @@ -1288,3 +1288,305 @@ g_sig2text(int signum, char sigstr[]) g_snprintf(sigstr, MAXSTRSIGLEN, "SIG#%d", signum); return sigstr; } + +/*****************************************************************************/ +char32_t +utf8_get_next_char(const char **utf8str_ref, unsigned int *len_ref) +{ + /* + * Macro used to parse a continuation character + * @param cp Character Pointer (incremented on success) + * @param end One character past end of input string + * @param value The value we're constructing + * @param finish_label Where to go in the event of an error */ +#define PARSE_CONTINUATION_CHARACTER(cp, end, value, finish_label) \ + { \ + /* Error if we're out of data, or this char isn't a continuation */ \ + if (cp == end || !IS_VALID_CONTINUATION_CHAR(*cp)) \ + { \ + value = UCS_REPLACEMENT_CHARACTER; \ + goto finish_label; \ + } \ + value = (value) << 6 | (*cp & 0x3f); \ + ++cp; \ + } + + char32_t rv; + + /* Easier to work with unsigned chars and no indirection */ + const unsigned char *cp = (const unsigned char *)*utf8str_ref; + const unsigned char *end = (len_ref != NULL) ? 
cp + *len_ref : cp + 6; + + if (cp == end) + { + return 0; // Pathological case + } + + unsigned int c0 = *cp++; + + if (c0 < 0x80) + { + rv = c0; + } + else if (c0 < 0xc0) + { + /* Unexpected continuation character */ + rv = UCS_REPLACEMENT_CHARACTER; + } + else if (c0 < 0xe0) + { + /* Valid start character for sequence of length 2 + * U-00000080 – U-000007FF */ + rv = (c0 & 0x1f); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + + if (rv < 0x80 || INVALID_UNICODE_80_TO_7FF(rv)) + { + rv = UCS_REPLACEMENT_CHARACTER; + } + } + else if (c0 < 0xf0) + { + /* Valid start character for sequence of length 3 + * U-00000800 – U-0000FFFF */ + rv = (c0 & 0xf); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + if (rv < 0x800 || INVALID_UNICODE_800_TO_FFFF(rv)) + { + rv = UCS_REPLACEMENT_CHARACTER; + } + } + else if (c0 < 0xf8) + { + /* Valid start character for sequence of length 4 + * U-00010000 – U-001FFFFF */ + rv = (c0 & 0x7); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + if (rv < 0x10000 || INVALID_UNICODE_10000_TO_1FFFFF(rv)) + { + rv = UCS_REPLACEMENT_CHARACTER; + } + } + else if (c0 < 0xfc) + { + /* Valid start character for sequence of length 5 + * U-00200000 – U-03FFFFFF */ + rv = (c0 & 0x3); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + + // These values are currently unsupported + rv = UCS_REPLACEMENT_CHARACTER; + } + + else if (c0 < 0xfe) + { + /* Valid start character for sequence of length 6 + * U-04000000 – U-7FFFFFFF */ + rv = (c0 & 0x1); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); +
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish); + + // These values are currently unsupported + rv = UCS_REPLACEMENT_CHARACTER; + } + else + { + // Invalid characters + rv = UCS_REPLACEMENT_CHARACTER; + } + +finish: + + if (len_ref) + { + *len_ref -= ((const char *)cp - *utf8str_ref); + } + *utf8str_ref = (const char *)cp; + + return rv; +#undef PARSE_CONTINUATION_CHARACTER +} + +/*****************************************************************************/ +unsigned int +utf_char32_to_utf8(char32_t c32, char *u8str) +{ + unsigned int rv; + + if (INVALID_UNICODE(c32)) + { + c32 = UCS_REPLACEMENT_CHARACTER; + } + + if (c32 < 0x80) + { + rv = 1; + if (u8str != NULL) + { + u8str[0] = (char)c32; + } + } + else if (c32 < 0x800) + { + rv = 2; + // 11 bits. Five in first byte, six in second + if (u8str != NULL) + { + u8str[1] = (c32 & 0x3f) | 0x80; + c32 >>= 6; + u8str[0] = (c32 & 0x1f) | 0xc0; + } + } + else if (c32 < 0xffff) + { + rv = 3; + // 16 bits. Four in first byte, six in second and third + if (u8str != NULL) + { + u8str[2] = (c32 & 0x3f) | 0x80; + c32 >>= 6; + u8str[1] = (c32 & 0x3f) | 0x80; + c32 >>= 6; + u8str[0] = (c32 & 0xf) | 0xe0; + } + } + else + { + rv = 4; + // 21 bits. 
Three in first byte, six in second, third and fourth + if (u8str != NULL) + { + u8str[3] = (c32 & 0x3f) | 0x80; + c32 >>= 6; + u8str[2] = (c32 & 0x3f) | 0x80; + c32 >>= 6; + u8str[1] = (c32 & 0x3f) | 0x80; + c32 >>= 6; + u8str[0] = (c32 & 0x7) | 0xf0; + } + } + + return rv; +} + +/*****************************************************************************/ +unsigned int +utf8_char_count(const char *utf8str) +{ + unsigned int rv = 0; + char32_t c; + + if (utf8str != NULL) + { + while ((c = utf8_get_next_char(&utf8str, NULL)) != 0) + { + ++rv; + } + } + + return rv; +} + +/*****************************************************************************/ +unsigned int +utf8_as_utf16_word_count(const char *utf8str, unsigned int len) +{ + unsigned int rv = 0; + while (len > 0) + { + char32_t c = utf8_get_next_char(&utf8str, &len); + // Characters not in the BMP (i.e. over 0xffff) need a high/low + // surrogate pair + rv += (c >= 0x10000) ? 2 : 1; + } + + return rv; +} + +/*****************************************************************************/ +int +utf8_add_char_at(char *utf8str, unsigned int len, char32_t c32, + unsigned int index) +{ + int rv = 0; + + char c8[MAXLEN_UTF8_CHAR]; + unsigned int c8len = utf_char32_to_utf8(c32, c8); + + // Find out where to insert the character + char *insert_pos = utf8str; + + while (index > 0 && *insert_pos != '\0') + { + utf8_get_next_char((const char **)&insert_pos, NULL); + --index; + } + + // Did we get to where we need to be? + if (index == 0) + { + unsigned int bytes_to_move = strlen(insert_pos) + 1; // Include terminator + // Is there room to insert the character? 
+ // + // <----------- len ----------> + // <--> (bytes_to_move) + // +----------------------------+ + // |ABCDEFGHIJLMN\0 | + // +----------------------------+ + // ^ ^ + // +-utf8str +-insert_pos + // + if ((insert_pos - utf8str) + bytes_to_move + c8len <= len) + { + memmove(insert_pos + c8len, insert_pos, bytes_to_move); + memcpy(insert_pos, c8, c8len); + rv = 1; + } + } + + return rv; +} + +/*****************************************************************************/ +char32_t +utf8_remove_char_at(char *utf8str, unsigned int index) +{ + int rv = 0; + + // Find out where to remove the character + char *remove_pos = utf8str; + + while (index > 0) + { + // Any characters left in string? + if (*remove_pos == '\0') + { + break; + } + + utf8_get_next_char((const char **)&remove_pos, NULL); + --index; + } + + // Did we get to where we need to be? + if (index == 0) + { + // Find the position after the character + char *after_pos = remove_pos; + rv = utf8_get_next_char((const char **)&after_pos, NULL); + + // Move everything up + memmove(remove_pos, after_pos, strlen(after_pos) + 1); + } + + return rv; +} diff --git a/common/string_calls.h b/common/string_calls.h index b4ab8c96..759c0566 100644 --- a/common/string_calls.h +++ b/common/string_calls.h @@ -87,6 +87,15 @@ enum MAXSTRSIGLEN = (3 + 1 + 1 + ((sizeof(int) * 5 + 1) / 2) + 1) }; +/* + * Significant Universal Character Set (Unicode) characters + */ +enum +{ + UCS_WHITE_SQUARE = 0x25a1, + UCS_REPLACEMENT_CHARACTER = 0xfffd +}; + /** * Processes a format string for general info * @@ -317,4 +326,76 @@ int g_strtrim(char *str, int trim_flags); * The string "SIG#" is returned for unrecognised signums */ char *g_sig2text(int signum, char sigstr[]); + +/** + * Get the next Unicode character from a UTF-8 string + * + * @param utf8str_ref UTF 8 string [by reference] + * @param len_ref Length of string [by reference] or NULL + * @return Unicode character + * + * On return, utf8str and len are updated to point past the 
decoded character. + * Unrecognised characters are mapped to UCS_REPLACEMENT_CHARACTER + * + * len is not needed if your utf8str has a terminator, or is known to + * be well-formed. + */ +char32_t +utf8_get_next_char(const char **utf8str_ref, unsigned int *len_ref); + +/** + * Convert a Unicode character to UTF-8 + * @param c32 Unicode character + * @param u8str buffer containing at least MAXLEN_UTF8_CHAR bytes for result, + * or NULL if only the length is needed + * @return Number of bytes in the UTF-8 encoding (written to u8str if non-NULL) + * + * The bytes written to u8str are unterminated + */ +#define MAXLEN_UTF8_CHAR 4 +unsigned int +utf_char32_to_utf8(char32_t c32, char *u8str); + +/** + * Returns the number of Unicode characters in a UTF-8 string + * @param utf8str UTF-8 string + * @result Number of Unicode characters in the string (terminator not included) + */ +unsigned int +utf8_char_count(const char *utf8str); + +/** + * Returns the number of UTF-16 words required to store a UTF-8 string + * @param utf8str UTF-8 string + * @param len Length of UTF-8 string + * @result number of words to store UTF-8 string as UTF-16. + */ +unsigned int +utf8_as_utf16_word_count(const char *utf8str, unsigned int len); + +/** + * Add a Unicode character into a UTF-8 string + * @param utf8str Pointer to UTF-8 string + * @param len Length of buffer for UTF-8 string (includes the NUL terminator) + * @param c32 character to add + * @param index Where to add the codepoint + * @return 1 for success, 0 if no character was inserted + * + * This routine has to parse the string as it goes, so can be slow. + */ +int +utf8_add_char_at(char *utf8str, unsigned int len, char32_t c32, + unsigned int index); + +/** + * Remove a Unicode character from a UTF-8 string + * @param utf8str Pointer to UTF-8 string + * @param index Where to remove the codepoint from (0-based) + * @return Character removed, or 0 if no character was removed + * + * This routine has to parse the string as it goes, so can be slow.
+ */ +char32_t +utf8_remove_char_at(char *utf8str, unsigned int index); + #endif diff --git a/common/unicode_defines.h b/common/unicode_defines.h new file mode 100644 index 00000000..cf9df2db --- /dev/null +++ b/common/unicode_defines.h @@ -0,0 +1,71 @@ +/** + * xrdp: A Remote Desktop Protocol server. + * + * Copyright (C) Jay Sorg 2004-2023 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file common/unicode_defines.h + * + * Defines used internally by the implementations of the Unicode routines + */ + +#if !defined(UNICODE_DEFINES_H) +#define UNICODE_DEFINES_H + +/** + * Is this byte a valid UTF-8 continuation character? + */ +#define IS_VALID_CONTINUATION_CHAR(c) ((c) >= 0x80 && (c) < 0xc0) + +/** + * Is this character one of the end-of-plane non-characters? + * + * These are U+xFFFE and U+xFFFF for x in {0..10} (hexadecimal) + */ +#define IS_PLANE_END_NON_CHARACTER(c32) (((c32) & 0xfffe) == 0xfffe) + +/** + * Is this character one of the additional non-characters?
+ * + * 32 additional non-characters are defined in the + * "Arabic Presentation Forms-A" Unicode block */ +#define IS_ARABIC_NON_CHARACTER(c32) ((c32) >= 0xfdd0 && (c32) <= 0xfdef) + +// Invalid characters, based on UTF-8 decoding range +// +// By 'invalid' we mean characters that should not be encoded or +// decoded when switching between UTF-8 and UTF-32 +// +// See "UTF-8 decoder capability and stress test" Markus Kuhn 2015-08-28 +#define INVALID_UNICODE_0_TO_7F(c) (0) // No invalid characters +#define INVALID_UNICODE_80_TO_7FF(c) (0) // No invalid characters +#define INVALID_UNICODE_800_TO_FFFF(c) \ + (((c) >= 0xd800 && (c) <= 0xdfff) || /* Surrogate pairs */ \ + IS_ARABIC_NON_CHARACTER(c) || \ + IS_PLANE_END_NON_CHARACTER(c)) + +#define INVALID_UNICODE_10000_TO_1FFFFF(c) \ + (IS_PLANE_END_NON_CHARACTER(c) || (c) > 0x10ffff) + +// Returns true for all 'invalid' Unicode chars +#define INVALID_UNICODE(c) \ + ( \ + INVALID_UNICODE_0_TO_7F(c) || \ + INVALID_UNICODE_80_TO_7FF(c) || \ + INVALID_UNICODE_800_TO_FFFF(c) || \ + INVALID_UNICODE_10000_TO_1FFFFF(c) \ + ) + +#endif // UNICODE_DEFINES_H diff --git a/configure.ac b/configure.ac index 2f3d6938..189d9493 100644 --- a/configure.ac +++ b/configure.ac @@ -569,7 +569,7 @@ AC_SUBST([pamconfdir], [$with_pamconfdir]) PKG_INSTALLDIR -AC_CHECK_HEADERS([sys/prctl.h]) +AC_CHECK_HEADERS([sys/prctl.h uchar.h]) AC_CONFIG_FILES([ common/Makefile diff --git a/tests/common/Makefile.am b/tests/common/Makefile.am index 8294afcb..ff888bcc 100644 --- a/tests/common/Makefile.am +++ b/tests/common/Makefile.am @@ -17,6 +17,7 @@ test_common_SOURCES = \ test_fifo_calls.c \ test_list_calls.c \ test_string_calls.c \ + test_string_calls_unicode.c \ test_os_calls.c \ test_os_calls_signals.c \ test_ssl_calls.c \ diff --git a/tests/common/UTF-8-test.txt b/tests/common/UTF-8-test.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5b5d50e6b61eb9a3b751b3954f83e61bb59db9b GIT binary patch literal 22781
zcmdU1X_Fh*b>CYjiIrHEbIyIeVd1bb#8j*y4MbNJE8ngr{T%> zH=?M0*NGF|Zns=*maBHFY*)*j-4j3B+Sy$_dEy6TNmiFvPA)BPEUj+fmviUj>}zDb zylhzyHeB;;sk==Fw0Y8Snr+$lJK|ijTdwCUO2hB+4}n}98At=QQVnO2tbk<@(9!SA*KTMb-()7zJ}9Yk)m)3ovMPE_?< z4&m`=AYfdi2gL9x(zW;T2&>e#!>~kZOLg0AmhhcCgBHcvb3FXf@9Z>qKS;P_9xq{M zX9r^v+X4}FC$KAEXd?8A)3Eozr9kXRLJ%VDrme$0ABAGOEs(JYzJ!rugIk~^6$2>n zcEzq>1dFOMR23y23`tPI^(hkaJx~+w1GHs>5#U|33BkDdJ8tNBov>p!@zi!F0^_$X zwVlAF6hNWW!L)ctp%pbicFV`;>TVE5OOcBa*d6d8P>GKyMu;y#v2jP!v2ly zk#^vNEf_7C=(s`3piW3-P;OK9WvZqTur#5BA;#K-8dm*1m=Jj3)$CxO%20DT9xyLe zwsDmL*Fxz!%_e`wy4l5ZQiI8()+YdS1n&OdNLR+dk#pITbEYjt^fc~L?miBLAnr&X@2<+Unk z1xxnas$;=~9Ct4@5j@+*5Y%*_V9+xuWkXKLj~OeW4FN%?jpW=YyARqX)rK9|RoI{z z6*L^m6}lKPRobrk?FftsJC)FhN~FyyrMJYKuwgkgxUNh6oN`}GFDwzt;8txQTCib0 zpaX3vNd^8-vO$_rm*s*-zPDPw2N~g;P^31r#;!I1WQ{2rZPN#c8k3rQ#uD?@lrel9 zc&guQIdCZHh-Bg7@$K#H&0~wwZMb+&)Ax7DrV9Ahba&}X@N9^jV}PH6M+Px?o*Hkbs)2c49sb4l;8pIYPlARu zoGPT(Z8+q>K>~n^Evy`)jv=&wc~r>Af(RF7VRiKHP%*^JY6M2Y=RqsH9fW|J=p5L9 znCI@cwPR{0b+~K4%VEB*<45q|vDS%Qr|sIF1&&Z+V?K-_EmasQ z8kByYLD(%L0K%By=V1!rzE^DozUTgkfsVl1(8MKiTZq~&WQ?&SPutJLna1mUxDM_8 zP)ZNpy(L@sZI)m$I*2Ai09`mdo-eM(DM*@X6gpHP8)DPgmBh+wV22I*AB0J!9+fy; z8{>>!u-$}rV^pxJHiRsQ=#+ES2`J?=&=eCklnU@e6AC)5L_(+U%sfB{lLwa<+D)c0 z@KC(hsyj~Ym{ISVi82sR3hY2obbeCmm0&TKs!kix5mfO0s0&XAKVcanR?92p6&3Aj z8SfK619~GH1bV)=)P}u8YO7}+ICuU$#}Wm8Gqf%qN3>YhoB8(iHYkdjXatJ~CgW3j zY>f4iQ(Nk~HBd)WUqlpu?vm2{m{U#~UP-4-nIT8$v^)WO76SEQ@N+P=V`Y;of&qRv z)cla%oCy!KxVqci&#Y-mCc=zBS!*Ss((AM;P-3Qwi3bb$e1=vyB2^fTxfonxrVgS2 zoKKuoB7(+ABNaY2vzm!32iYF<<33-#7gY~A0sMvCghhF zq-e|{8C-$Ugm&8k%c>);(`u^J2$kppjT%GKyf1X4j@nzOxnwR`JcchcJNi(NkUAXU zsBwD@$v}mutT zh$~}5e&&(OGeVveIhlv#Txx{K7vFe!R?Jf(r*fF<3Yq8RORop!41|tK>vn&mbRg_| zm`Ke*yX7d<8wxo^y-#oF)cePZYNK$x)71O)b{_BPK~X?uV0lu#G3m<#4hJ(Lj;r?y z0rQC89Lx$irrv9cn1}opAZH|VRK3AOy>gQIHZf-ayJ7t0~3rOy}M?jxyqUWGiivnIv=ZUUoW)yJ@%RT>A zZ|6_n7AGs61L#Y#T@(gG2L*alV=(oPCW|Yqyd_s}%50DMoQs02-}TDL_lo|U|9IgU zC=ru;a?(B`3|YRkv3!@5islZ`Ka<&Mu%8qz@G5oS74LhKL_mwUeB`g-0 
z885ud+RgBaa?Su6Jh+(ko8rwxp}|7{J$ZAyStvAkr{x|eJ@}@Vx<*jdf`#awGu6MuZz3+SfqlZB<_ZYm#KJdX0efT3E z{n*Dp@ySnp`tc{O+%Dc{KKqNm^vj?7mHw}O{tLhM#V4=MWD%p!p1(ZSbgF~$IZ9|@ zg!S8ed2saM1%5jc`snXjU|w4_tD$k&MOa#0(%M{7dBDVc`BIg7T&ttBKjlj4(;A;C z%OM8p(pcEW0&zC_v`#PuCjC>2m;UAU-Y8z_ntVyVEMJkY%CF0B$ZyJT$#2W=$nVPU z$?wZk^0gTrVd(n<`9t|5`D6JL`BV8b`E&UT`AhjL`D^(b`CIurd0L7ms`DQ&^p6@y z`L3KAm+#sX`8t%Z^NN=5b@{sdz5Ijxqx_Tnv;2$vtNfe%yZndzr`(Wl$ngJN9>_N+ zkBxD0tWOa~o#LpUo^Ebg{P?c6w`b&8c}|{}7v#U>zvY`V@PFJCkH$WbuTdWB&64GgT zysDZJJZ*k%+6;6}KSAQkmiF_oRc4|oB7=iG(#2q13uZ9^c@>u(QBsJ|L8XgMXgXOq zm#rG3WWd$cxU&sS8R<2D{;i>eh5TVo7);F63rof4=zI7naP@a<-sQ;!47FNLX26 z$t>m86H2;}Fo~I(<6PcQ(on~9X!b$vkUblwOT$G!=a?r8i#%}tK7u}$E(?52e5$%z z*gDHJG1+X$Lc>MgN8*O8*Ce66mo|5>-H8DsEtVFV6DByx&-srR$fax7FT;7z+SNRt z9uA?!e{cTuJqZ2D`v2HZW++5+qqcUo+YTB| z5%$vF(4+}4($33K3g=HB_{TU{ceuFp^0>{9KK<6e(WXyw-d)#qrSHrK4`C4W@n1a# ze);9^9145>{DTL(fKE5hGUG_Lmbq~a<|`Q&!6_2p=}QBWMm2ea0_}P(FGrd?Dc08U2^rTakV#X z(Xa@!XT)ZkA}VQUq(i@Jt;5YC4X0I>Gg1m|sD$YQ%kbjn(^BG&1Lw8KC-o-N)2xl) z*{Cy)XwN6?Gk^8aUTcTgmT;Fg4`YW4w{vK-y7n%!r@hIhc*HIwuXW+JhA~|rCz7kD zn_I9`2oX=0$j~ z30Q8Z3fn^)s82;V(zUh!)zHLRu!j}ZoYp3d(oJZC#sC`E$OSyN?QmdVhS@=Y=}{-( zrZe5oLhIafo$g}7J)};MR=h6_ACB8crtjR2W%^xdvJl5O|J8_AObq4NWVKM1)X&S53})42gHvR~Io zCU5;QAF6wAcnkNvJ|GL31u)a|b4=={0}XCp4!onNK0W z_(((4S{kxYI#EJ5wb`r%y)(juZ6z1&m-~8h>Gr6A7gfNC(|{@KZm0;o+q!9y_RIk$ z7vs^vE!Ql8fE`MnZe=scf6dnoFWAP%kJAe|{{-W{H~n<>H}jV-^bZ=y^2RC~gR0n} zkc;=A5NYAHD~0p8o<+db4A9Y@pgbl3uV0=OaVhN4#N5@hu%i?v24T zJ(F?mj%30|pjAJ?#2fmEhk#j-o4F^<;Vp>-Cod#(nAA=*Rwo7g`!3#-%}x${>ipI5 z!O2`BF>Lq(lPBm7t^@2UBe@rEL(U*LH;rLfodHTRS5xE6G8Hmm;>xXhvJ0b73K;#4*}HwR>dIZnepaCUAE6hTQO%}vGzUetHYUmh$4KVTMi zNfR2UFij8q9n>fC1Idl)ptnatpK)KC*w6+8W0gf`1F*Flj}by43 zC48);eEiH|X~d!}__&>n8g^^E;Ek(?MZ~f#5tC7yM2Vv=mLH8tX5`n~{)LiWmHTZs zfIPlSt;OW-$JJWE$W++IXh3%+^~WyXxV7MQ%qHF^n5l3w%v3jld22x#%-CXJgClgR zDUQ(j_@FSIvCSCFjLsA@r4yJ%=`>bD%O0UqReglcCyMja*iDR?(V1eVbON&|oyKzL zFGlE8s1Z7^6z3;;9sm|&W^|^QDV@M9Lg#Ip;0OTqXie_JvV5|DfU-W8I1Zch%s+aK 
zW;-RE7Lt;R(lwWfN<5k-*n@yrRL~=$aGDH@-g?Z^*m(16PpHNsaT!^{^P4=S4s$u4 zn4_g-PG6ET$^-GbU5eSWO3BPePsM`!RsujH>i4x$Ilc89&MIe9-DdQnA3T!WrWLlMIw zp{lN#KWL=_YYXWDsV)UqntpW`df>;3Z59c&tui(!b9f~fie^igAJ#+iN8yc(*PM%o zgQ+lQ#JPjq$ur$Tjq@=p!HGQLh0Y9+wa(;}L%?JP$Ax_^nZjg87@pla_1*mCaesQs zBt{CNc)%f4raCGP@*pJw5#o&C&kib4X2*-!d*E^m64IZa^19wsZbc|NX=Mqr?7V z!~O?`{SOZN9~$;QJnVmD*#GFT|FL2J|ZGz&1f`h z_Eg0#9_P<(SztEeVN5IK^2&0qE&E{3&!4{dd8KC_9bCn0@XQtbyn>$t{6x|G*@p*L z@ftjf&+!^Oi_h^IJd4lq8a#*3@fsXVxA`=q{yCuGHFyqacnzKd8eW6vfri)Md3=u7 p;CXzG*Wh`4j@JOEDqJmkvO@N>BDCH5J3T8tzwg#xE%c3^^?x0SNE-kE literal 0 HcmV?d00001 diff --git a/tests/common/test_common.h b/tests/common/test_common.h index ea700c30..936dda1b 100644 --- a/tests/common/test_common.h +++ b/tests/common/test_common.h @@ -10,6 +10,7 @@ bin_to_hex(const char *input, int length); Suite *make_suite_test_fifo(void); Suite *make_suite_test_list(void); Suite *make_suite_test_string(void); +Suite *make_suite_test_string_unicode(void); Suite *make_suite_test_os_calls(void); Suite *make_suite_test_ssl_calls(void); Suite *make_suite_test_base64(void); diff --git a/tests/common/test_common_main.c b/tests/common/test_common_main.c index 05eacd27..d5471781 100644 --- a/tests/common/test_common_main.c +++ b/tests/common/test_common_main.c @@ -49,6 +49,7 @@ int main (void) sr = srunner_create (make_suite_test_fifo()); srunner_add_suite(sr, make_suite_test_list()); srunner_add_suite(sr, make_suite_test_string()); + srunner_add_suite(sr, make_suite_test_string_unicode()); srunner_add_suite(sr, make_suite_test_os_calls()); srunner_add_suite(sr, make_suite_test_ssl_calls()); srunner_add_suite(sr, make_suite_test_base64()); diff --git a/tests/common/test_string_calls_unicode.c b/tests/common/test_string_calls_unicode.c new file mode 100644 index 00000000..e678ba30 --- /dev/null +++ b/tests/common/test_string_calls_unicode.c @@ -0,0 +1,835 @@ +/* + * The UTF-8 decoder tests are based on the UTF-8 decoder capability + * and stress test" 2015-08-26 by Markus 
Kuhn. A copy of that file + * named "UTF-8-test.txt" should be in the source directory for this file */ + +#if defined(HAVE_CONFIG_H) +#include "config_ac.h" +#endif + +#include "string_calls.h" + +#include "test_common.h" + +// Abbreviate UCS_REPLACEMENT_CHARACTER for utf8_decode_sub_test arrays +#define URC UCS_REPLACEMENT_CHARACTER + +struct utf8_decode_sub_test +{ + const char *testref; + const char *utf8str; + // This array will contain 0 values after the initialised part + const char32_t expected[65]; +}; + +// Abbreviate UCS_REPLACEMENT_CHARACTER for utf8_encode_sub_test arrays +#define E_URC { 0xef, 0xbf, 0xbd } + +struct utf8_encode_sub_test +{ + const char *testref; + char32_t c32; + unsigned int expected_len; + char expected_str[MAXLEN_UTF8_CHAR]; +}; + +// Used as the simple test in UTF-8-test.txt +const static char greek_kosme[] = + "\xce\xba" // GREEK SMALL LETTER KAPPA + "\xe1\xbd\xb9" // GREEK SMALL LETTER OMICRON WITH OXIA + "\xcf\x83" // GREEK SMALL LETTER SIGMA + "\xce\xbc" // GREEK SMALL LETTER MU + "\xce\xb5"; // GREEK SMALL LETTER EPSILON + +// See Issue #2603 +const static char simple_test_with_emoji[] = + "Simple Test." 
+ "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face + +/******************************************************************************/ +/** + * Function to decode a UTF-8 string and check the expected result + * + * @param st Pointer to the sub-test to run + */ +static void +run_utf8_decode_sub_test(const struct utf8_decode_sub_test *st) +{ + char32_t c; + const char *p = st->utf8str; + unsigned int index = 0; + + do + { + c = utf8_get_next_char(&p, NULL); + + if (c != st->expected[index]) + { + ck_abort_msg("Sub-test section %s Index %u expected %x, got %x", + st->testref, + index, st->expected[index], c); + } + ++index; + } + while (c != 0); +} + +/******************************************************************************/ +/** + * Function to run an array of decode sub-tests + * + * @param st Pointer to the first sub-test to run + */ +static void +run_decode_sub_test_array(const struct utf8_decode_sub_test *st) +{ + while (st->utf8str != NULL) + { + run_utf8_decode_sub_test(st++); + } +} + +/******************************************************************************/ +/** + * Function to encode a UTF-8 value and check the expected result + * + * @param st Pointer to the sub-test to run + */ +static void +run_utf8_encode_sub_test(const struct utf8_encode_sub_test *st) +{ + char actual_str[MAXLEN_UTF8_CHAR]; + unsigned int index; + unsigned int actual_len = utf_char32_to_utf8(st->c32, actual_str); + + if (actual_len != st->expected_len) + { + ck_abort_msg("Sub-test %s Expected length of %u, got %u", + st->testref, + st->expected_len, actual_len); + } + + for (index = 0 ; index < actual_len; ++index) + { + if (actual_str[index] != st->expected_str[index]) + { + ck_abort_msg("Sub-test %s Character %u, expected %02x got %02x", + st->testref, index, + (int)(unsigned char)st->expected_str[index], + (int)(unsigned char)actual_str[index]); + } + } +} + +/******************************************************************************/ +/** + * Function 
to run an array of encode sub-tests + * + * @param st Pointer to the first sub-test to run + */ +static void +run_encode_sub_test_array(const struct utf8_encode_sub_test *st) +{ + while (st->expected_len > 0) + { + run_utf8_encode_sub_test(st++); + } +} + + +/******************************************************************************/ +START_TEST(test_get_next_char__section_1) +{ + const struct utf8_decode_sub_test st = + { + "1", + greek_kosme, + { + 0x03ba, // GREEK SMALL LETTER KAPPA + 0x1f79, // GREEK SMALL LETTER OMICRON WITH OXIA + 0x03c3, // GREEK SMALL LETTER SIGMA + 0x03bc, // GREEK SMALL LETTER MU + 0x03b5 // GREEK SMALL LETTER EPSILON + } + }; + + run_utf8_decode_sub_test(&st); +} +END_TEST + +/******************************************************************************/ +START_TEST(test_get_next_char__section_2) +{ + struct utf8_decode_sub_test tests[] = + { + // 2.1 First possible sequence of a certain length + // + // (2.1.1 Is tested separately) + { "2.1.2", "\xc2\x80", { 0x80 } }, + { "2.1.3", "\xe0\xa0\x80", { 0x800 } }, + { "2.1.4", "\xf0\x90\x80\x80", { 0x10000 } }, + { "2.1.5", "\xf8\x88\x80\x80\x80", { URC } }, + { "2.1.6", "\xfc\x84\x80\x80\x80\x80", { URC } }, + + // 2.2 Last possible sequence of a certain length + { "2.2.1", "\x7f", { 0x7f } }, + { "2.2.2", "\xdf\xbf", { 0x7ff } }, + // Use U+0000FFFC instead of U+0000FFFF as our decoder + // treats non-characters as an input error + { "2.2.3", "\xef\xbf\xbc", { 0xfffc } }, + // U+001FFFFF is out-of-range + { "2.2.4", "\xf7\xbf\xbf\xbf", { URC } }, + { "2.2.5", "\xfb\xbf\xbf\xbf\xbf", { URC } }, + { "2.2.6", "\xfd\xbf\xbf\xbf\xbf\xbf", { URC } }, + + // 2.3 Other boundary conditions + { "2.3.1", "\xed\x9f\xbf", { 0xd7ff } }, + { "2.3.2", "\xee\x80\x80", { 0xe000 } }, + { "2.3.3", "\xef\xbf\xbd", { 0xfffd } }, + // Don't use U+10FFFF (non-character) + { "2.3.4", "\xf4\x8f\xbf\xbd", { 0x10fffd } }, + { "2.3.5", "\xf4\x90\x80\x80", { URC } }, + // Terminator + { 0 } + }; + + // 2.1.1 is a 
'\0' which we use to terminate our strings. Test + // it separately + { + const char *p = ""; + + ck_assert_int_eq(utf8_get_next_char(&p, NULL), 0); + } + + // Do the rest of the section 2 tests + run_decode_sub_test_array(tests); +} +END_TEST + +/******************************************************************************/ +START_TEST(test_get_next_char__section_3) +{ + struct utf8_decode_sub_test tests[] = + { + // 3.1 Unexpected continuation bytes + // + // Each unexpected continuation byte should be separately + // signalled as a malformed sequence of its own. + { "3.1.1", "\x80", { URC } }, + { "3.1.2", "\xbf", { URC } }, + { "3.1.3", "\x80\xbf", { URC, URC } }, + { "3.1.4", "\x80\xbf\x80", { URC, URC, URC } }, + { "3.1.5", "\x80\xbf\x80\xbf", { URC, URC, URC, URC } }, + { "3.1.6", "\x80\xbf\x80\xbf\x80", { URC, URC, URC, URC, URC } }, + { + "3.1.7", + "\x80\xbf\x80\xbf\x80\xbf", + { URC, URC, URC, URC, URC, URC } + }, + { + "3.1.8", + "\x80\xbf\x80\xbf\x80\xbf\x80", + { URC, URC, URC, URC, URC, URC, URC } + }, + { + "3.1.9", + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf", + { + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC + } + }, + + // 3.2 Lonely start characters + { + "3.2.1", + "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 " + "\xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf " + "\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 " + "\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf ", + { + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, 
' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ' + } + }, + { + "3.2.2", + "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 " + "\xe8 \xe9 \xea \xeb \xec \xed \xee \xef ", + { + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ' + } + }, + { + "3.2.3", + "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 ", + { + URC, ' ', URC, ' ', URC, ' ', URC, ' ', + URC, ' ', URC, ' ', URC, ' ', URC, ' ' + } + }, + { + "3.2.4", + "\xf8 \xf9 \xfa \xfb ", + { + URC, ' ', URC, ' ', URC, ' ', URC, ' ' + } + }, + { + "3.2.5", "\xfc \xfd ", { URC, ' ', URC, ' ' } + }, + + // 3.3 Sequences with last continuation byte missing + // + // From UTF-8-test.txt:- + // All bytes of an incomplete sequence should be signalled as + // a single malformed sequence, i.e., you should see only a + // single replacement character in each of the next 10 tests. 
+ { "3.3.1", "\xc0", { URC } }, + { "3.3.2", "\xe0\x80", { URC } }, + { "3.3.3", "\xf0\x80\x80", { URC } }, + { "3.3.4", "\xf8\x80\x80\x80", { URC } }, + { "3.3.5", "\xfc\x80\x80\x80\x80", { URC } }, + + { "3.3.6", "\xdf", { URC } }, + { "3.3.7", "\xef\xbf", { URC } }, + { "3.3.8", "\xf7\xbf\xbf", { URC} }, + { "3.3.9", "\xfb\xbf\xbf\xbf", { URC } }, + { "3.3.10", "\xfd\xbf\xbf\xbf\xbf", { URC } }, + + // 3.4 Concatenation of incomplete sequences + { + "3,4", + "\xc0" + "\xe0\x80" + "\xf0\x80\x80" + "\xf8\x80\x80\x80" + "\xfc\x80\x80\x80\x80" + "\xdf" + "\xef\xbf" + "\xf7\xbf\xbf" + "\xfb\xbf\xbf\xbf" + "\xfd\xbf\xbf\xbf\xbf", + { + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC + } + }, + + // 3.5 Impossible bytes + { "3.5.1", "\xfe", { URC } }, + { "3.5.2", "\xff", { URC } }, + { "3.5.3", "\xfe\xfe\xff\xff", { URC, URC, URC, URC } }, + // Terminator + { 0 } + }; + + run_decode_sub_test_array(tests); +} +END_TEST + +/******************************************************************************/ +START_TEST(test_get_next_char__section_4) +{ + struct utf8_decode_sub_test tests[] = + { + // 4.1 Examples of an overlong ASCII character + // + // With a safe UTF-8 decoder, all of the following five + // overlong representations of the ASCII character slash ("/") + // should be rejected like a malformed UTF-8 sequence, for + // instance by substituting it with a replacement character. If + // you see a slash below, you do not have a safe UTF-8 decoder! + { "4.1.1", "\xc0\xaf", { URC } }, + { "4.1.2", "\xe0\x80\xaf", { URC } }, + { "4.1.3", "\xf0\x80\x80\xaf", { URC } }, + { "4.1.4", "\xf8\x80\x80\x80\xaf", { URC } }, + { "4.1.5", "\xfc\x80\x80\x80\x80\xaf", { URC } }, + + // 4.2 Maximum overlong sequences + + // Below you see the highest Unicode value that is still resulting + // in an overlong sequence if represented with the given number of + // bytes. This is a boundary test for safe UTF-8 decoders. 
All + // five characters should be rejected like malformed UTF-8 + // sequences. + { "4.2.1", "\xc1\xbf", { URC } }, + { "4.2.2", "\xe0\x9f\xbf", { URC } }, + { "4.2.3", "\xf0\x8f\xbf\xbf", { URC } }, + { "4.2.4", "\xf8\x87\xbf\xbf\xbf", { URC } }, + { "4.2.5", "\xfc\x83\xbf\xbf\xbf\xbf", { URC } }, + + // 4.3 Overlong representation of the NUL character + + // The following five sequences should also be rejected like + // malformed UTF-8 sequences and should not be treated like the + // ASCII NUL character. + { "4.3.1", "\xc0\x80", { URC } }, + { "4.3.2", "\xe0\x80\x80", { URC } }, + { "4.3.3", "\xf0\x80\x80\x80", { URC } }, + { "4.3.4", "\xf8\x80\x80\x80\x80", { URC } }, + { "4.3.5", "\xfc\x80\x80\x80\x80\x80", { URC } }, + + // Terminator + { 0 } + }; + + run_decode_sub_test_array(tests); +} +END_TEST + +/******************************************************************************/ +START_TEST(test_get_next_char__section_5) +{ + struct utf8_decode_sub_test tests[] = + { + // 5 Illegal code positions + + // The following UTF-8 sequences should be rejected like + // malformed sequences, because they never represent valid + // ISO 10646 characters and a UTF-8 decoder that accepts them + // might introduce security problems comparable to overlong + // UTF-8 sequences. 
+ + // 5.1 Single UTF-16 surrogates + { "5.1.1", "\xed\xa0\x80", { URC } }, + { "5.1.2", "\xed\xad\xbf", { URC } }, + { "5.1.3", "\xed\xae\x80", { URC } }, + { "5.1.4", "\xed\xaf\xbf", { URC } }, + { "5.1.5", "\xed\xb0\x80", { URC } }, + { "5.1.6", "\xed\xbe\x80", { URC } }, + { "5.1.7", "\xed\xbf\xbf", { URC } }, + + // 5.2 Paired UTF-16 surrogates + { "5.2.1", "\xed\xa0\x80\xed\xb0\x80", { URC, URC } }, + { "5.2.2", "\xed\xa0\x80\xed\xbf\xbf", { URC, URC } }, + { "5.2.3", "\xed\xad\xbf\xed\xb0\x80", { URC, URC } }, + { "5.2.4", "\xed\xad\xbf\xed\xbf\xbf", { URC, URC } }, + { "5.2.5", "\xed\xae\x80\xed\xb0\x80", { URC, URC } }, + { "5.2.6", "\xed\xae\x80\xed\xbf\xbf", { URC, URC } }, + { "5.2.7", "\xed\xaf\xbf\xed\xb0\x80", { URC, URC } }, + { "5.2.8", "\xed\xaf\xbf\xed\xbf\xbf", { URC, URC } }, + + // 5.3 Noncharacter code positions + + // The following "noncharacters" are "reserved for internal + // use" by applications, and according to older versions of + // the Unicode Standard "should never be interchanged". Unicode + // Corrigendum #9 dropped the latter restriction. Nevertheless, + // their presence in incoming UTF-8 data can remain a potential + // security risk, depending on what use is made of these codes + // subsequently. Examples of such internal use: + // + // - Some file APIs with 16-bit characters may use the integer + // value -1 = U+FFFF to signal an end-of-file (EOF) or error + // condition. + // + // - In some UTF-16 receivers, code point U+FFFE might trigger + // a byte-swap operation (to convert between UTF-16LE and + // UTF-16BE). + // With such internal use of noncharacters, it may be desirable + // and safer to block those code points in UTF-8 decoders, as + // they should never occur legitimately in incoming UTF-8 data, + // and could trigger unsafe behaviour in subsequent processing. 
+ + // Particularly problematic noncharacters in 16-bit applications: + { "5.3.1", "\xef\xbf\xbe", { URC } }, + { "5.3.2", "\xef\xbf\xbf", { URC } }, + + // Other noncharacters: + { + "5.3.3", + // Non-characters in "Arabic Presentation Forms-A" (BMP) + "\xef\xb7\x90" "\xef\xb7\x91" "\xef\xb7\x92" "\xef\xb7\x93" + "\xef\xb7\x94" "\xef\xb7\x95" "\xef\xb7\x96" "\xef\xb7\x97" + "\xef\xb7\x98" "\xef\xb7\x99" "\xef\xb7\x9a" "\xef\xb7\x9b" + "\xef\xb7\x9c" "\xef\xb7\x9d" "\xef\xb7\x9e" "\xef\xb7\x9f" + "\xef\xb7\xa0" "\xef\xb7\xa1" "\xef\xb7\xa2" "\xef\xb7\xa3" + "\xef\xb7\xa4" "\xef\xb7\xa5" "\xef\xb7\xa6" "\xef\xb7\xa7" + "\xef\xb7\xa8" "\xef\xb7\xa9" "\xef\xb7\xaa" "\xef\xb7\xab" + "\xef\xb7\xac" "\xef\xb7\xad" "\xef\xb7\xae" "\xef\xb7\xaf", + { + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC + } + }, + + { + "5.3.4", + "\xf0\x9f\xbf\xbe" "\xf0\x9f\xbf\xbf" // U+0001FFFE U+0001FFFF + "\xf0\xaf\xbf\xbe" "\xf0\xaf\xbf\xbf" // U+0002FFFE U+0002FFFF + "\xf0\xbf\xbf\xbe" "\xf0\xbf\xbf\xbf" // U+0003FFFE U+0003FFFF + "\xf1\x8f\xbf\xbe" "\xf1\x8f\xbf\xbf" // U+0004FFFE U+0004FFFF + "\xf1\x9f\xbf\xbe" "\xf1\x9f\xbf\xbf" // U+0005FFFE U+0005FFFF + "\xf1\xaf\xbf\xbe" "\xf1\xaf\xbf\xbf" // U+0006FFFE U+0006FFFF + "\xf1\xbf\xbf\xbe" "\xf1\xbf\xbf\xbf" // U+0007FFFE U+0007FFFF + "\xf2\x8f\xbf\xbe" "\xf2\x8f\xbf\xbf" // U+0008FFFE U+0008FFFF + "\xf2\x9f\xbf\xbe" "\xf2\x9f\xbf\xbf" // U+0009FFFE U+0009FFFF + "\xf2\xaf\xbf\xbe" "\xf2\xaf\xbf\xbf" // U+000AFFFE U+000AFFFF + "\xf2\xbf\xbf\xbe" "\xf2\xbf\xbf\xbf" // U+000BFFFE U+000BFFFF + "\xf3\x8f\xbf\xbe" "\xf3\x8f\xbf\xbf" // U+000CFFFE U+000CFFFF + "\xf3\x9f\xbf\xbe" "\xf3\x9f\xbf\xbf" // U+000DFFFE U+000DFFFF + "\xf3\xaf\xbf\xbe" "\xf3\xaf\xbf\xbf" // U+000EFFFE U+000EFFFF + "\xf3\xbf\xbf\xbe" "\xf3\xbf\xbf\xbf" // U+000FFFFE U+000FFFFF + "\xf4\x8f\xbf\xbe" "\xf4\x8f\xbf\xbf",// U+0010FFFE U+0010FFFF + { + URC, 
URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC, + URC, URC, URC, URC, URC, URC, URC, URC + } + }, + + // Last line of UTF8-test.txt + { "TheEnd", "THE END\n", { 'T', 'H', 'E', ' ', 'E', 'N', 'D', '\n'} }, + + // Terminator + { 0 } + + }; + + run_decode_sub_test_array(tests); +} +END_TEST + +/******************************************************************************/ +START_TEST(test_utf_char32_to_utf8) +{ + struct utf8_encode_sub_test tests[] = + { + + // E2.1 First possible sequence of a certain length + // + { "E2.1.1", 0, 1, { 0 } }, + { "E2.1.2", 0x80, 2, { 0xc2, 0x80 } }, + { "E2.1.3", 0x800, 3, { 0xe0, 0xa0, 0x80 } }, + { "E2.1.4", 0x10000, 4, { 0xf0, 0x90, 0x80, 0x80 } }, + + // E2.2 Last possible sequence of a certain length + { "E2.2.1", 0x7f, 1, { 0x7f } }, + { "E2.2.2", 0x7ff, 2, { 0xdf, 0xbf } }, + { "E2.2.3", 0xfffc, 3, { 0xef, 0xbf, 0xbc } }, // See 2.1.3 above + { "E2.2.4", 0x1FFFFF, 3, E_URC }, // out-of-range + + // E2.3 Other boundary conditions + { "E2.3.1", 0xd7ff, 3, { 0xed, 0x9f, 0xbf } }, + { "E2.3.2", 0xe000, 3, { 0xee, 0x80, 0x80 } }, + { "E2.3.3", 0xfffd, 3, { 0xef, 0xbf, 0xbd } }, + { "E2.3.4", 0x10fffd, 4, { 0xf4, 0x8f, 0xbf, 0xbd } }, // See 2.3.4 above + // E2.3.5 - not tested + + // E5.1 Single UTF-16 surrogates + { "E5.1.1", 0xd800, 3, E_URC }, + { "E5.1.2", 0xdb7f, 3, E_URC }, + { "E5.1.3", 0xdb80, 3, E_URC }, + { "E5.1.4", 0xdbff, 3, E_URC }, + { "E5.1.5", 0xdc00, 3, E_URC }, + { "E5.1.6", 0xdf80, 3, E_URC }, + { "E5.1.7", 0xdfff, 3, E_URC }, + + // E5.3 Non-character code positions + { "E5.3.3(0)", 0xfdd0, 3, E_URC }, + { "E5.3.3(1)", 0xfdd1, 3, E_URC }, + { "E5.3.3(2)", 0xfdd2, 3, E_URC }, + { "E5.3.3(3)", 0xfdd3, 3, E_URC }, + { "E5.3.3(4)", 0xfdd4, 3, E_URC }, + { "E5.3.3(5)", 0xfdd5, 3, E_URC }, + { "E5.3.3(6)", 0xfdd6, 3, E_URC }, + { "E5.3.3(7)", 0xfdd7, 3, E_URC }, + { "E5.3.3(8)", 0xfdd8, 3, E_URC }, + { "E5.3.3(9)", 0xfdd9, 3, E_URC }, + { 
"E5.3.3(10)", 0xfdda, 3, E_URC }, + { "E5.3.3(11)", 0xfddb, 3, E_URC }, + { "E5.3.3(12)", 0xfddc, 3, E_URC }, + { "E5.3.3(13)", 0xfddd, 3, E_URC }, + { "E5.3.3(14)", 0xfdde, 3, E_URC }, + { "E5.3.3(15)", 0xfddf, 3, E_URC }, + { "E5.3.3(16)", 0xfde0, 3, E_URC }, + { "E5.3.3(17)", 0xfde1, 3, E_URC }, + { "E5.3.3(18)", 0xfde2, 3, E_URC }, + { "E5.3.3(19)", 0xfde3, 3, E_URC }, + { "E5.3.3(20)", 0xfde4, 3, E_URC }, + { "E5.3.3(21)", 0xfde5, 3, E_URC }, + { "E5.3.3(22)", 0xfde6, 3, E_URC }, + { "E5.3.3(23)", 0xfde7, 3, E_URC }, + { "E5.3.3(24)", 0xfde8, 3, E_URC }, + { "E5.3.3(25)", 0xfde9, 3, E_URC }, + { "E5.3.3(26)", 0xfdea, 3, E_URC }, + { "E5.3.3(27)", 0xfdeb, 3, E_URC }, + { "E5.3.3(28)", 0xfdec, 3, E_URC }, + { "E5.3.3(29)", 0xfded, 3, E_URC }, + { "E5.3.3(30)", 0xfdee, 3, E_URC }, + { "E5.3.3(31)", 0xfdef, 3, E_URC }, + { "E5.3.4(0)", 0x1fffe, 3, E_URC }, + { "E5.3.4(1)", 0x1ffff, 3, E_URC }, + { "E5.3.4(2)", 0x2fffe, 3, E_URC }, + { "E5.3.4(3)", 0x2ffff, 3, E_URC }, + { "E5.3.4(4)", 0x3fffe, 3, E_URC }, + { "E5.3.4(5)", 0x3ffff, 3, E_URC }, + { "E5.3.4(6)", 0x4fffe, 3, E_URC }, + { "E5.3.4(7)", 0x4ffff, 3, E_URC }, + { "E5.3.4(8)", 0x5fffe, 3, E_URC }, + { "E5.3.4(9)", 0x5ffff, 3, E_URC }, + { "E5.3.4(10)", 0x6fffe, 3, E_URC }, + { "E5.3.4(11)", 0x6ffff, 3, E_URC }, + { "E5.3.4(12)", 0x7fffe, 3, E_URC }, + { "E5.3.4(13)", 0x7ffff, 3, E_URC }, + { "E5.3.4(14)", 0x8fffe, 3, E_URC }, + { "E5.3.4(15)", 0x8ffff, 3, E_URC }, + { "E5.3.4(16)", 0x9fffe, 3, E_URC }, + { "E5.3.4(17)", 0x9ffff, 3, E_URC }, + { "E5.3.4(18)", 0xafffe, 3, E_URC }, + { "E5.3.4(19)", 0xaffff, 3, E_URC }, + { "E5.3.4(20)", 0xbfffe, 3, E_URC }, + { "E5.3.4(21)", 0xbffff, 3, E_URC }, + { "E5.3.4(22)", 0xcfffe, 3, E_URC }, + { "E5.3.4(23)", 0xcffff, 3, E_URC }, + { "E5.3.4(24)", 0xdfffe, 3, E_URC }, + { "E5.3.4(25)", 0xdffff, 3, E_URC }, + { "E5.3.4(26)", 0xefffe, 3, E_URC }, + { "E5.3.4(27)", 0xeffff, 3, E_URC }, + { "E5.3.4(28)", 0xffffe, 3, E_URC }, + { "E5.3.4(29)", 0xfffff, 3, E_URC }, + { 
"E5.3.4(30)", 0x10fffe, 3, E_URC }, + { "E5.3.4(31)", 0x10ffff, 3, E_URC }, + { "E5.99.0", 'T', 1, { 'T' } }, + { "E5.99.1", 'H', 1, { 'H' } }, + { "E5.99.2", 'E', 1, { 'E' } }, + { "E5.99.3", ' ', 1, { ' ' } }, + { "E5.99.4", 'E', 1, { 'E' } }, + { "E5.99.5", 'N', 1, { 'N' } }, + { "E5.99.6", 'D', 1, { 'D' } }, + + // Terminator + { 0 } + }; + + run_encode_sub_test_array(tests); +} +END_TEST + +/******************************************************************************/ +START_TEST(test_utf8_char_count) +{ + // Check function can cope with NULL argument + ck_assert_int_eq(utf8_char_count(NULL), 0); + + unsigned int kosme_strlen = strlen(greek_kosme); + unsigned int kosme_len = utf8_char_count(greek_kosme); + + // All characters map to two bytes except for the 'omicrom with oxia' + // which maps to three + ck_assert_int_eq(kosme_strlen, 2 + 3 + 2 + 2 + 2); + ck_assert_int_eq(kosme_len, 5); + + unsigned int simple_test_strlen = strlen(simple_test_with_emoji); + unsigned int simple_test_len = utf8_char_count(simple_test_with_emoji); + + ck_assert_int_eq(simple_test_strlen, + (1 + 1 + 1 + 1 + 1 + 1 ) + // Simple + 1 + + (1 + 1 + 1 + 1 ) + // Test + 1 + + 4); // emoji + // The emoji is 4 bytes - all others are 1 + ck_assert_int_eq(simple_test_len, simple_test_strlen - 3); +} +END_TEST + +/******************************************************************************/ +START_TEST(test_utf8_as_utf16_word_count) +{ + unsigned int kosme_count = + utf8_as_utf16_word_count(greek_kosme, strlen(greek_kosme)); + + ck_assert_int_eq(kosme_count, 5); // All characters in BMP + + unsigned int simple_test_count = + utf8_as_utf16_word_count(simple_test_with_emoji, + strlen(simple_test_with_emoji)); + + ck_assert_int_eq(simple_test_count, + (1 + 1 + 1 + 1 + 1 + 1 ) + // Simple + 1 + + (1 + 1 + 1 + 1 ) + // Test + 1 + + 2); // emoji +} +END_TEST + +/******************************************************************************/ +START_TEST(test_utf8_add_char_at) +{ +#define 
TEST_SIZE sizeof(simple_test_with_emoji) + + // Type pairing a string position with a Unicode char + struct pos_to_char_map + { + unsigned int pos; + char32_t c32; + }; + + // Buffer for constructing the string + char buff[TEST_SIZE]; + + // A pseudo-random map of the characters in simple_test_with_emoji + const struct pos_to_char_map map[] = + { + { 0, 'l' }, + { 0, 'S' }, + { 1, 'i' }, + { 2, 'm' }, + { 4, 0x1f625 }, + { 4, '.' }, + { 4, 'e' }, + { 5, 'T' }, + { 3, 'p' }, + { 7, 't' }, + { 7, 'e' }, + { 8, 's' }, + { 6, ' ' }, + { 0 } + }; + + buff[0] = '\0'; + + // Construct the string in a pseudo-random fashion + + const struct pos_to_char_map *p; + for (p = map; p->c32 != 0 ; ++p) + { + if (!utf8_add_char_at(buff, TEST_SIZE, p->c32, p->pos)) + { + ck_abort_msg("test_utf8_add_char_at: " + "Can't insert char %x at pos %u", + p->c32, + p->pos); + } + } + + // Should have reached the buffer size by now + ck_assert_int_eq(strlen(buff), TEST_SIZE - 1); + + // Check the string is what we expect + ck_assert_int_eq(strcmp(buff, simple_test_with_emoji), 0); + + // Try to insert another character + if (utf8_add_char_at(buff, TEST_SIZE, ' ', 0)) + { + ck_abort_msg("test_utf8_add_char_at: " + "Insert succeeded but should have failed"); + } + +#undef TEST_SIZE +} +END_TEST + +/******************************************************************************/ +START_TEST(test_utf8_remove_char_at) +{ +#define TEST_SIZE sizeof(simple_test_with_emoji) + // Type pairing a string position with a Unicode char + struct pos_to_char_map + { + unsigned int pos; + char32_t c32; + }; + + // Buffer for deconstructing the string + char buff[TEST_SIZE]; + + // A pseudo-random map of the characters in simple_test_with_emoji + const struct pos_to_char_map map[] = + { + { 2, 'm' }, + { 7, 'e' }, + { 5, ' ' }, + { 1, 'i' }, + { 2, 'l' }, + { 3, 'T' }, + { 6, 0x1f625 }, + { 2, 'e' }, + { 3, 't' }, + { 3, '.' 
}, + { 2, 's' }, + { 1, 'p' }, + { 0, 'S' }, + { 0 } + }; + + char32_t c32; + + strcpy(buff, simple_test_with_emoji); + + // Deconstruct the string in a pseudo-random fashion + const struct pos_to_char_map *p; + for (p = map; p->c32 != 0 ; ++p) + { + c32 = utf8_remove_char_at(buff, p->pos); + if (c32 != p->c32) + { + ck_abort_msg("test_utf8_remove_char_at: " + "remove char at pos %u was %x, expected %x", + p->pos, c32, p->c32); + } + } + + // Should have emptied the buffer by now + ck_assert_int_eq(buff[0], '\0'); + + // Try to remove other characters + c32 = utf8_remove_char_at(buff, 0); + ck_assert_int_eq(c32, 0); + c32 = utf8_remove_char_at(buff, 99); + ck_assert_int_eq(c32, 0); + ck_assert_int_eq(buff[0], '\0'); + +#undef TEST_SIZE +} +END_TEST + +/******************************************************************************/ + +Suite * +make_suite_test_string_unicode(void) +{ + Suite *s; + TCase *tc_unicode; + + s = suite_create("String"); + + tc_unicode = tcase_create("Unicode"); + suite_add_tcase(s, tc_unicode); + tcase_add_test(tc_unicode, test_get_next_char__section_1); + tcase_add_test(tc_unicode, test_get_next_char__section_2); + tcase_add_test(tc_unicode, test_get_next_char__section_3); + tcase_add_test(tc_unicode, test_get_next_char__section_4); + tcase_add_test(tc_unicode, test_get_next_char__section_5); + tcase_add_test(tc_unicode, test_utf_char32_to_utf8); + tcase_add_test(tc_unicode, test_utf8_char_count); + tcase_add_test(tc_unicode, test_utf8_as_utf16_word_count); + tcase_add_test(tc_unicode, test_utf8_add_char_at); + tcase_add_test(tc_unicode, test_utf8_remove_char_at); + + return s; +} diff --git a/xrdp/xrdp_font.c b/xrdp/xrdp_font.c index f88ec93c..7eb0893a 100644 --- a/xrdp/xrdp_font.c +++ b/xrdp/xrdp_font.c @@ -52,9 +52,6 @@ static char w_char[] = }; #endif -// Unicode definitions -#define UNICODE_WHITE_SQUARE 0x25a1 - // First character allocated in the 'struct xrdp_font.chars' array #define FIRST_CHAR ' ' @@ -354,9 +351,9 @@ 
xrdp_font_create(struct xrdp_wm *wm, unsigned int dpi) } // Find a default glyph - if (char_count > UNICODE_WHITE_SQUARE) + if (char_count > UCS_WHITE_SQUARE) { - self->default_char = &self->chars[UNICODE_WHITE_SQUARE]; + self->default_char = &self->chars[UCS_WHITE_SQUARE]; } else if (char_count > '?') {