From 0463e552dcbfad1e4bb47135d2599cf4a1e4da4f Mon Sep 17 00:00:00 2001
From: matt335672 <30179339+matt335672@users.noreply.github.com>
Date: Tue, 19 Sep 2023 09:19:17 +0100
Subject: [PATCH] Add UTF-8 / UTF-32 conversion routines

These are intended to replace non-UTF-16 uses of mbstowcs() / wcstombs()
---
 common/Makefile.am                       |   1 +
 common/arch.h                            |  11 +
 common/string_calls.c                    | 304 ++++++++-
 common/string_calls.h                    |  81 +++
 common/unicode_defines.h                 |  71 ++
 configure.ac                             |   2 +-
 tests/common/Makefile.am                 |   1 +
 tests/common/UTF-8-test.txt              | Bin 0 -> 22781 bytes
 tests/common/test_common.h               |   1 +
 tests/common/test_common_main.c          |   1 +
 tests/common/test_string_calls_unicode.c | 835 +++++++++++++++++++++++
 xrdp/xrdp_font.c                         |   7 +-
 12 files changed, 1308 insertions(+), 7 deletions(-)
 create mode 100644 common/unicode_defines.h
 create mode 100644 tests/common/UTF-8-test.txt
 create mode 100644 tests/common/test_string_calls_unicode.c

diff --git a/common/Makefile.am b/common/Makefile.am
index c929baab..a206a21f 100644
--- a/common/Makefile.am
+++ b/common/Makefile.am
@@ -68,6 +68,7 @@ libcommon_la_SOURCES = \
   thread_calls.h \
   trans.c \
   trans.h \
+  unicode_defines.h \
   $(PIXMAN_SOURCES)
 
 libcommon_la_LIBADD = \
diff --git a/common/arch.h b/common/arch.h
index 38e79d5c..e66b2479 100644
--- a/common/arch.h
+++ b/common/arch.h
@@ -46,6 +46,17 @@ typedef unsigned long uintptr_t;
 
 typedef int bool_t;
 
+// Unicode character types: use <uchar.h> if we have it, else define our own
+#if defined(HAVE_UCHAR_H)
+#include <uchar.h>
+#elif defined(HAVE_STDINT_H) // Fall back to the least-width stdint types
+typedef uint_least16_t char16_t;
+typedef uint_least32_t char32_t;
+#else
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+#endif
+
 /* you can define L_ENDIAN or B_ENDIAN and NEED_ALIGN or NO_NEED_ALIGN
    in the makefile to override */
 
diff --git a/common/string_calls.c b/common/string_calls.c
index 1ba35613..a702a8a4 100644
--- a/common/string_calls.c
+++ b/common/string_calls.c
@@ -27,11 +27,11 @@
 #include <stdlib.h>
 #include <ctype.h>
 
-
 #include "log.h"
 #include "os_calls.h"
 #include "string_calls.h"
 #include "defines.h"
+#include "unicode_defines.h"
 
 unsigned int
 g_format_info_string(char *dest, unsigned int len,
@@ -1288,3 +1288,305 @@ g_sig2text(int signum, char sigstr[])
     g_snprintf(sigstr, MAXSTRSIGLEN, "SIG#%d", signum);
     return sigstr;
 }
+
+/*****************************************************************************/
+char32_t
+utf8_get_next_char(const char **utf8str_ref, unsigned int *len_ref)
+{
+    /*
+     * Macro used to parse a continuation character
+     * @param cp Character Pointer (incremented on success)
+     * @param end One character past end of input, or NULL if unbounded
+     * @param value The value we're constructing
+     * @param finish_label Where to go in the event of an error */
+#define PARSE_CONTINUATION_CHARACTER(cp, end, value, finish_label) \
+    { \
+        /* Error if we're out of data, or this char isn't a continuation */ \
+        if (cp == end || !IS_VALID_CONTINUATION_CHAR(*cp)) \
+        { \
+            value = UCS_REPLACEMENT_CHARACTER; \
+            goto finish_label; \
+        } \
+        value = (value) << 6 | (*cp & 0x3f); \
+        ++cp; \
+    }
+
+    char32_t rv;
+
+    /* Easier to work with unsigned chars and no indirection.
+     * With no length the terminator stops us (it is never a continuation
+     * char), so 'end' is NULL. 'cp + 6' may lie outside a short string */
+    const unsigned char *cp = (const unsigned char *)*utf8str_ref;
+    const unsigned char *end = (len_ref != NULL) ? cp + *len_ref : NULL;
+
+    if (cp == end)
+    {
+        return 0; // Pathological case (nothing to decode)
+    }
+
+    unsigned int c0 = *cp++;
+
+    if (c0 < 0x80)
+    {
+        rv = c0;
+    }
+    else if (c0 < 0xc0)
+    {
+        /* Unexpected continuation character */
+        rv = UCS_REPLACEMENT_CHARACTER;
+    }
+    else if (c0 < 0xe0)
+    {
+        /* Valid start character for sequence of length 2
+         * U-00000080 – U-000007FF (lower values are over-long encodings) */
+        rv = (c0 & 0x1f);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        if (rv < 0x80 || INVALID_UNICODE_80_TO_7FF(rv))
+        {
+            rv = UCS_REPLACEMENT_CHARACTER;
+        }
+    }
+    else if (c0 < 0xf0)
+    {
+        /* Valid start character for sequence of length 3
+         * U-00000800 – U-0000FFFF (lower values are over-long encodings) */
+        rv = (c0 & 0xf);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        if (rv < 0x800 || INVALID_UNICODE_800_TO_FFFF(rv))
+        {
+            rv = UCS_REPLACEMENT_CHARACTER;
+        }
+    }
+    else if (c0 < 0xf8)
+    {
+        /* Valid start character for sequence of length 4
+         * U-00010000 – U-001FFFFF (lower values are over-long encodings) */
+        rv = (c0 & 0x7);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        if (rv < 0x10000 || INVALID_UNICODE_10000_TO_1FFFFF(rv))
+        {
+            rv = UCS_REPLACEMENT_CHARACTER;
+        }
+    }
+    else if (c0 < 0xfc)
+    {
+        /* Valid start character for sequence of length 5
+         * U-00200000 – U-03FFFFFF */
+        rv = (c0 & 0x3);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+
+        // Sequence consumed, but these values are currently unsupported
+        rv = UCS_REPLACEMENT_CHARACTER;
+    }
+    else if (c0 < 0xfe)
+    {
+        /* Valid start character for sequence of length 6
+         * U-04000000 – U-7FFFFFFF */
+        rv = (c0 & 0x1);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+        PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
+
+        // Sequence consumed, but these values are currently unsupported
+        rv = UCS_REPLACEMENT_CHARACTER;
+    }
+    else
+    {
+        // Invalid characters
+        rv = UCS_REPLACEMENT_CHARACTER;
+    }
+
+finish:
+
+    if (len_ref)
+    {
+        *len_ref -= ((const char *)cp - *utf8str_ref);
+    }
+    *utf8str_ref = (const char *)cp;
+
+    return rv;
+#undef PARSE_CONTINUATION_CHARACTER
+}
+
+/*****************************************************************************/
+unsigned int
+utf_char32_to_utf8(char32_t c32, char *u8str)
+{
+    unsigned int rv;
+
+    if (INVALID_UNICODE(c32))
+    {
+        c32 = UCS_REPLACEMENT_CHARACTER; // Never encode invalid characters
+    }
+
+    if (c32 < 0x80)
+    {
+        rv = 1;
+        if (u8str != NULL) // NULL means the caller only wants the length
+        {
+            u8str[0] = (char)c32;
+        }
+    }
+    else if (c32 < 0x800)
+    {
+        rv = 2;
+        // 11 bits. Five in first byte, six in second
+        if (u8str != NULL)
+        {
+            u8str[1] = (c32 & 0x3f) | 0x80;
+            c32 >>= 6;
+            u8str[0] = (c32 & 0x1f) | 0xc0;
+        }
+    }
+    else if (c32 < 0x10000) // i.e. everything up to and including U+FFFF
+    {
+        rv = 3;
+        // 16 bits. Four in first byte, six in second and third
+        if (u8str != NULL)
+        {
+            u8str[2] = (c32 & 0x3f) | 0x80;
+            c32 >>= 6;
+            u8str[1] = (c32 & 0x3f) | 0x80;
+            c32 >>= 6;
+            u8str[0] = (c32 & 0xf) | 0xe0;
+        }
+    }
+    else
+    {
+        rv = 4;
+        // 21 bits. Three in first byte, six in second, third and fourth
+        if (u8str != NULL)
+        {
+            u8str[3] = (c32 & 0x3f) | 0x80;
+            c32 >>= 6;
+            u8str[2] = (c32 & 0x3f) | 0x80;
+            c32 >>= 6;
+            u8str[1] = (c32 & 0x3f) | 0x80;
+            c32 >>= 6;
+            u8str[0] = (c32 & 0x7) | 0xf0;
+        }
+    }
+
+    return rv; // Bytes needed for the encoding, written or not
+}
+
+/*****************************************************************************/
+unsigned int
+utf8_char_count(const char *utf8str)
+{
+    unsigned int count = 0;
+
+    // A NULL string has no characters. Otherwise decode up to the terminator
+    if (utf8str == NULL)
+    {
+        return 0;
+    }
+    while (utf8_get_next_char(&utf8str, NULL) != 0)
+    {
+        ++count;
+    }
+    return count;
+}
+
+/*****************************************************************************/
+unsigned int
+utf8_as_utf16_word_count(const char *utf8str, unsigned int len)
+{
+    unsigned int words = 0;
+
+    // Every character needs one word. Those outside the BMP (i.e. over
+    // 0xffff) need a second, as they are stored as a surrogate pair
+    for (; len > 0; ++words)
+    {
+        words += (utf8_get_next_char(&utf8str, &len) > 0xffff) ? 1 : 0;
+    }
+
+    return words;
+}
+
+/*****************************************************************************/
+int
+utf8_add_char_at(char *utf8str, unsigned int len, char32_t c32,
+                 unsigned int index)
+{
+    char encoded[MAXLEN_UTF8_CHAR];
+    unsigned int encoded_len = utf_char32_to_utf8(c32, encoded);
+    char *pos = utf8str;
+    unsigned int tail_len;
+
+    // Step over 'index' characters to reach the insertion point, giving
+    // up early if the string runs out first
+    for (; index > 0 && *pos != '\0'; --index)
+    {
+        utf8_get_next_char((const char **)&pos, NULL);
+    }
+
+    if (index != 0)
+    {
+        return 0; // The string has fewer than 'index' characters
+    }
+
+    // Everything from the insertion point onwards, terminator included,
+    // has to move up to make room:-
+    //  <----------- len ---------->
+    //            <--> (tail_len)
+    // +----------------------------+
+    // |ABCDEFGHIJLMN\0             |
+    // +----------------------------+
+    //  ^         ^
+    //  +-utf8str +-pos
+    tail_len = strlen(pos) + 1;
+
+    // Is there room in the buffer for the extra bytes?
+    if ((pos - utf8str) + tail_len + encoded_len > len)
+    {
+        return 0;
+    }
+
+    memmove(pos + encoded_len, pos, tail_len);
+    memcpy(pos, encoded, encoded_len);
+    return 1;
+}
+
+/*****************************************************************************/
+char32_t
+utf8_remove_char_at(char *utf8str, unsigned int index)
+{
+    // Returns 0, leaving the string untouched, if 'index' is not the
+    // position of a character in the string
+    char32_t rv = 0;
+
+    // Find out where to remove the character
+    char *remove_pos = utf8str;
+
+    // Stop early if we run out of characters before reaching 'index'
+    while (index > 0 && *remove_pos != '\0')
+    {
+        utf8_get_next_char((const char **)&remove_pos, NULL);
+        --index;
+    }
+
+    // Did we get to where we need to be? There must also be a character
+    // here to remove. Decoding the terminator would step over it, and
+    // the strlen() / memmove() below would then run off the end of the
+    // string.
+    if (index == 0 && *remove_pos != '\0')
+    {
+        // Find the position after the character
+        char *after_pos = remove_pos;
+        rv = utf8_get_next_char((const char **)&after_pos, NULL);
+
+        // Move everything up, including the terminator
+        memmove(remove_pos, after_pos, strlen(after_pos) + 1);
+    }
+
+    return rv;
+}
diff --git a/common/string_calls.h b/common/string_calls.h
index b4ab8c96..759c0566 100644
--- a/common/string_calls.h
+++ b/common/string_calls.h
@@ -87,6 +87,15 @@ enum
     MAXSTRSIGLEN =  (3 + 1 + 1 + ((sizeof(int) * 5 + 1) / 2) + 1)
 };
 
+/*
+ * Significant Universal Character Set (Unicode) characters
+ */
+enum
+{
+    UCS_WHITE_SQUARE  = 0x25a1, // U+25A1 WHITE SQUARE
+    UCS_REPLACEMENT_CHARACTER  = 0xfffd // U+FFFD; stands in for bad input
+};
+
 /**
  * Processes a format string for general info
  *
@@ -317,4 +326,76 @@ int      g_strtrim(char *str, int trim_flags);
  * The string "SIG#<num>" is returned for unrecognised signums
  */
 char    *g_sig2text(int signum, char sigstr[]);
+
+/**
+ * Get the next Unicode character from a UTF-8 string
+ *
+ * @param utf8str_ref UTF-8 string [by reference]
+ * @param len_ref Length of string in bytes [by reference] or NULL
+ * @return Unicode character, or 0 if there is no input (*len_ref is 0)
+ *
+ * On return, utf8str and len are updated to point past the decoded character
+ * (a '\0' is decoded, and stepped over, like any other character).
+ * Unrecognised characters are mapped to UCS_REPLACEMENT_CHARACTER
+ *
+ * len is not needed if utf8str has a terminator, or is known well-formed.
+ */
+char32_t
+utf8_get_next_char(const char **utf8str_ref, unsigned int *len_ref);
+
+/**
+ * Convert a Unicode character to UTF-8
+ * @param c32 Unicode character (invalid ones become U+FFFD)
+ * @param u8str buffer containing at least MAXLEN_UTF8_CHAR bytes for the
+ *        result. Can be NULL if only the length is needed.
+ * @return Number of bytes needed to encode c32 (1 to MAXLEN_UTF8_CHAR)
+ *
+ * The bytes written to u8str are unterminated
+ */
+#define MAXLEN_UTF8_CHAR 4
+unsigned int
+utf_char32_to_utf8(char32_t c32, char *u8str);
+
+/**
+ * Returns the number of Unicode characters in a UTF-8 string
+ * @param utf8str Terminated UTF-8 string. NULL is treated as empty
+ * @return Number of Unicode characters in the string (terminator not included)
+ */
+unsigned int
+utf8_char_count(const char *utf8str);
+
+/**
+ * Returns the number of UTF-16 words required to store a UTF-8 string
+ * @param utf8str UTF-8 string
+ * @param len Length of UTF-8 string in bytes
+ * @return number of UTF-16 words, counting 2 for each non-BMP character
+ */
+unsigned int
+utf8_as_utf16_word_count(const char *utf8str, unsigned int len);
+
+/**
+ * Add a Unicode character into a UTF-8 string
+ * @param utf8str Pointer to UTF-8 string
+ * @param len Length of buffer for UTF-8 string (includes terminator)
+ * @param c32 character to add
+ * @param index Where to add the codepoint (0-based, counted in characters)
+ * @return 1 for success, 0 if no character was inserted
+ *
+ * This routine has to parse the string as it goes, so can be slow.
+ */
+int
+utf8_add_char_at(char *utf8str, unsigned int len, char32_t c32,
+                 unsigned int index);
+
+/**
+ * Remove a Unicode character from a UTF-8 string
+ * @param utf8str Pointer to UTF-8 string (modified in place)
+ * @param index Where to remove the codepoint from (0-based)
+ * @return Character removed, or 0 if no character was removed
+ *
+ * This routine has to parse the string as it goes, so can be slow.
+ */
+char32_t
+utf8_remove_char_at(char *utf8str, unsigned int index);
+
 #endif
diff --git a/common/unicode_defines.h b/common/unicode_defines.h
new file mode 100644
index 00000000..cf9df2db
--- /dev/null
+++ b/common/unicode_defines.h
@@ -0,0 +1,71 @@
+/**
+ * xrdp: A Remote Desktop Protocol server.
+ *
+ * Copyright (C) Jay Sorg 2004-2023
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file common/unicode_defines.h
+ *
+ * Defines used internally by the implementations of the Unicode routines
+ */
+
+#if !defined(UNICODE_DEFINES_H)
+#define UNICODE_DEFINES_H
+
+/**
+ * Is this byte a valid UTF-8 continuation character?
+ */
+#define IS_VALID_CONTINUATION_CHAR(c) ((c) >= 0x80 && (c) < 0xc0)
+
+/**
+ * Is this character one of the end-of-plane non-characters?
+ * (Only the low 16 bits are tested, so this matches in every plane)
+ * These are U+xFFFE and U+xFFFF for x in (0..0x10)
+ */
+#define IS_PLANE_END_NON_CHARACTER(c32) (((c32) & 0xfffe) == 0xfffe)
+
+/**
+ * Is this character one of the additional non-characters?
+ *
+ * 32 additional non-characters are defined in the
+ * "Arabic Presentation Forms-A" Unicode block */
+#define IS_ARABIC_NON_CHARACTER(c32) ((c32) >= 0xfdd0 && (c32) <= 0xfdef)
+
+// Invalid characters, based on UTF-8 decoding range
+//
+// By 'invalid' we mean characters that should not be encoded or
+// decoded when switching between UTF-8 and UTF-32
+//
+// See "UTF-8 decoder capability and stress test" Markus Kuhn 2015-08-28
+#define INVALID_UNICODE_0_TO_7F(c) (0)   // No invalid characters
+#define INVALID_UNICODE_80_TO_7FF(c) (0) // No invalid characters
+#define INVALID_UNICODE_800_TO_FFFF(c) \
+    (((c) >= 0xd800 && (c) <= 0xdfff) || /* Surrogate pairs */ \
+     IS_ARABIC_NON_CHARACTER(c) || \
+     IS_PLANE_END_NON_CHARACTER(c))
+
+#define INVALID_UNICODE_10000_TO_1FFFFF(c) \
+    (IS_PLANE_END_NON_CHARACTER(c) || (c) > 0x10ffff)
+
+// True for all 'invalid' Unicode chars. Evaluates 'c' more than once
+#define INVALID_UNICODE(c) \
+    ( \
+      INVALID_UNICODE_0_TO_7F(c) || \
+      INVALID_UNICODE_80_TO_7FF(c) || \
+      INVALID_UNICODE_800_TO_FFFF(c) || \
+      INVALID_UNICODE_10000_TO_1FFFFF(c) \
+    )
+
+#endif // UNICODE_DEFINES_H
diff --git a/configure.ac b/configure.ac
index 2f3d6938..189d9493 100644
--- a/configure.ac
+++ b/configure.ac
@@ -569,7 +569,7 @@ AC_SUBST([pamconfdir], [$with_pamconfdir])
 
 PKG_INSTALLDIR
 
-AC_CHECK_HEADERS([sys/prctl.h])
+AC_CHECK_HEADERS([sys/prctl.h uchar.h])
 
 AC_CONFIG_FILES([
   common/Makefile
diff --git a/tests/common/Makefile.am b/tests/common/Makefile.am
index 8294afcb..ff888bcc 100644
--- a/tests/common/Makefile.am
+++ b/tests/common/Makefile.am
@@ -17,6 +17,7 @@ test_common_SOURCES = \
     test_fifo_calls.c \
     test_list_calls.c \
     test_string_calls.c \
+    test_string_calls_unicode.c \
     test_os_calls.c \
     test_os_calls_signals.c \
     test_ssl_calls.c \
diff --git a/tests/common/UTF-8-test.txt b/tests/common/UTF-8-test.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a5b5d50e6b61eb9a3b751b3954f83e61bb59db9b
GIT binary patch
literal 22781
zcmdU1X_Fh*b<OAg6*tHS4b2&FHj*P%oGOwt6R{$Rs*+eKe{2AaL3@C1xVtgnq)-VH
zIkB@!-gishSBZ8K2eD%j^pZW1v+s%M>>CYjiIrHEbIyIeVd1bb#8j*<f`b{LPv3X<
zci(;Q;)B~u8&Y$se$5G_YPao*+jOISvAvpvQQ(B3L{1o4O9d~>y4MbNJE8ngr{T%>
zH=?M0*NGF|Zns=*maBHFY*)*j-4j3B+Sy$_dEy6TNmiFvPA)BPEUj+fmviUj>}zDb
zylhzyHeB;;sk==Fw0Y8Snr+$lJK|ijTdwCUO2hB+4}n}98At=<H=WShvjf)$G0}e8
z3B|8V)ei!v8ZAo8Zr1&v<<u6%55$iePSABjM_ey*FzU$lb_}K!I<M<EUeyW9q9iFb
zAkFh5X*sqRN#sk#5gXthF>QQVnO2tbk<@(9!SA*KTMb-()7zJ}9Yk)m)3ovMPE_?<
z4&m`=AYfdi2gL9x(zW;T2&>e#!>~kZOLg0AmhhcCgBHcvb3FXf@9Z>qKS;P_9xq{M
zX9r^v+X4}FC$KAEXd?8A)3Eozr9kXRLJ%VDrme$0ABAGOEs(JYzJ!rugIk~^6$2>n
zcEzq>1dFOMR23y23`tPI^(hkaJx~+w1GHs>5#U|33BkDdJ8tNBov>p!@zi!F0^_$X
zwVlAF6hNWW!L)ctp%pbicFV`;>TVE5OOcBa*d6d8P>GKyMu;y#v2jP!v2ly<Igqc>
zk#^vNEf_7C=(s`3piW3-P;OK9WvZqTur#5BA;#K-8dm*1m=Jj3)$CxO%20DT9xyLe
zwsDmL*Fxz!%_e`wy4l<p3>5ZQi<LT6Kyv1y=kharx#n1HAL<J2CJRCczF89wS^$-*
zRv}M}gi5IzA7jM>I8()+YdS1n&OdNLR+dk#pITbEYjt^fc~L?miBLAnr&X@2<+Unk
z1xxnas$;=~9Ct4@5j@+*5Y%*_V9+xuWkXKLj~OeW4FN%?jpW=YyARqX)rK9|RoI{z
z6*L^m6}lKPRobrk?FftsJC)FhN~FyyrMJYKuwgkgxUNh6oN`}GFDwzt;8txQTCib0
zpaX3vNd^8-vO$_rm*s*-zPDPw2N~g;P^31r#;!I1WQ{2rZPN#c8k3rQ#uD?@lrel9
zc&guQIdCZHh-Bg7@$K#H&0~wwZMb+&)Ax7DrV9Ahba&}X@N9^jV}PH6<AMdX;DXh!
z*%G=fmrf?!Z&m!JdZd=S(};}6+VTCGfVno!de5cJVQ4j1{a?oo8+g=B$*u=}ixM*m
z*4(#JwM}$%LYA-X!00F&dk-#5-B2tc=xczXTXi4fuXJFV5QlNe^$v+@Iy-iA)XC)(
zSKX|4M`XeK)*w&_(}mwS4-0^^t=b`csy^J^@%(_An++?G15_=rI$lOvDuz(umGIcd
zQ?}{jAsiR_-?DcdtL9YTZ1n+f<X~mJPB$O>M+Px?o*Hkbs)2c49sb4l;8pIYPlARu
zoGPT(Z8+q>K>~n^Evy`)jv=&wc~r>Af(RF7VRiKHP%*^JY6M2Y=RqsH9fW|J=p5L9
znCI@<O)6sumf1~cAX9>cwPR{0b+~K4%VEB*<45q|vDS%Qr|sIF1&&Z+V?K-_EmasQ
z8kByYLD(%L0K%By=V1!rzE^DozUTgkfsVl1(8MKiTZq~&WQ?&SPutJLna1mUxDM_8
zP)ZNpy(L@sZI)m$I*2Ai09`mdo-eM(DM*@X6gpHP8)DPgmBh+wV22I*AB0J!9+fy;
z8{>>!u-$}rV^pxJHiRsQ=#+ES2`J?=&=eCklnU@e6AC)5L_(+U%sfB{lLwa<+D)c0
z@KC(hsyj~Ym{ISVi82sR3hY2obbeCmm0&TKs!kix5mfO0s0&XAKVcanR?92p6&3Aj
z8SfK619~GH1bV)=)P}u8YO7}+ICuU$#}Wm8Gqf%qN3>YhoB8(iHYkdjXatJ~CgW3j
zY>f4iQ(Nk~HBd)WUqlpu?vm2{m{U#~UP-4-nIT8$v^)WO76SEQ@N+P=V`Y;of&qRv
z)cla%oCy!KxVqci&#Y-mCc=zBS!*Ss((AM;P-3Qwi3bb$e1=vyB2^fTxfonxrVgS2
zoKKuoB7(+ABNaY2vzm!32iYF<<33-#7gY~A0sMvCghhF<gFOh(L@9Vs8XO3me4+*>
zq-e|{8C-$Ugm&8k%c>);(`u^J2$kppjT%GKyf1X4j@nzOxnwR`JcchcJNi(NkUAXU
zsBwD@$v}<yT4f)^gRETCNbS+6B;)FoK$mKcaV$o0F#mv=@9g6VR^ALX!}Q<|)L*Jc
zr?tjfq)GxtT6Z&rW@c^l9{Ld8C5OIx)}e2mdSlO85qW^rC2D7~0?pc|-f{3B=TF~j
zFwu$)(YQlLelT#HUFjlslsk6)u;tv5aJ2N)7ry@F=fC#QQ(t`QbETV8I)5rxMb1v-
z+{XGD_QRXgGVPNipbzd<wmCJCwjYKrr(TJ7Fs6dQpuQH-b_Z_p!0SlpBZVu*c9C!{
z(W&z~mdI90-*bRSN9nZ~P#}|4#Y2LU;BDmbzI?eTU{3+pMApUtH%17&{@9F&>mutT
zh$~}5e&&(OGeVveIhlv#Txx{K7vFe!R?Jf(r*fF<3Yq8RORop!41|tK>vn&mbRg_|
zm`Ke*yX7d<8wxo^y-#oF)cePZYNK$x)71O)b{_BPK~X?uV0lu#G3m<#4hJ(Lj;r?y
z0rQC89Lx$irrv9cn1}opAZH|VRK3AOy>gQIHZf-ay<LSKy`R#7Y%U<Tm|EEx{r0VY
zI|}t)qk0=By9ox$s|E9uJL^)Z<8A52qlKBL<m-s3;0AFE{)UK-Y%KF15Wi8}+vE_z
zMS}<dZxT@ntkj4&96VXnTyluea3ZcOQ)K{CUzZI7ssn%F;o@o|@H-H685*vT8VU^L
zojPnY@)u@^2!y<~KFsoso&_G>J7t0~3rOy}M?jxyqUWGiivnIv=ZUUoW)yJ@%RT>A
zZ|6_n7AGs61L#Y#T@(gG2L*alV=(oPCW|Yqyd_s}%50DMoQs02-}TDL_lo|U|9IgU
zC=ru;a?(B`3|YRkv3!@5islZ`Ka<&Mu%8qz@G5oS74<hqGhW!p>LhKL_mwUeB`g-0
z885ud+RgBaa?Su6Jh+(ko8rwxp}|7{J$ZAyStvAkr<rd~=!)Ur<-sgGG<X{#rw_s_
zPT|q$%bl2m$E6ybjG=~hYCTJ3CpjStw3{U=dzPk9!t<vOpry+{{u2+q{wIIx4R8GE
zpZVFJ`}tpZ*yz$>{x|eJ@}@Vx<*jdf`#awGu6MuZz3+SfqlZB<_ZYm#KJdX0efT3E
z{n*Dp@ySnp`tc{O+%Dc{KKqNm^vj?7mHw}O{tLhM#V4=MWD%p!p1(ZSbgF~$IZ9|@
zg!S8ed2saM1%5jc`snXjU|w4_tD$k&MOa#0(%M{7dBDVc`BIg7T&ttBKjlj4(;A;C
z%OM8p(pcEW0&zC_v`#PuCjC>2m;UAU-Y8z_ntVyVEMJkY%CF0B$ZyJT$#2W=$nVPU
z$?wZk^0gTrVd(n<`9t|5`D6JL`BV8b`E&UT`AhjL`D^(b`CIurd0L7ms`DQ&^p6@y
z`L3KAm+#sX`8t%Z^NN=5b@{sdz5Ijxqx_Tnv;2$vtNfe%yZndzr`(Wl$ngJN9>_N+
zkBxD0tWOa~o#LpUo^Ebg{P?c6w`b&8c}|{}7v#U>zvY`V@PFJCkH$WbuTdWB<MKE;
zO&%MRN97eQj~C^C<bUN`w;_i_E=MB|<hzu|L^wJ%O&(RsqxOoH$4m0<Imm)CISP5e
zTP4{pRxY_htV`e%S+2BPtW;px`G9Nw^iffmq-fig_O)Q~4vTzPQcTj8{0v>&64GgT
zysDZJJZ*k%+6;6}KSAQkmiF_oRc4|oB7=iG(#2q13uZ9^c@>u(QBsJ|L8XgMXgXOq
zm#rG3WWd$cxU&sS8R<2D{;i>eh5T<U)62<OGr&Z&=q$=DPn52WXEvO_X~sL2Sro*(
zet9||RB$Xd76g1|vhHX|gv%fE?gbHFoU(9YAmO#f98^Kbm!>Vo7);F63rof<!I?FM
z($mu!iJ5v~$tWJ{q?odz+&Uc)Q}x1<Vd0f{3W%kI>4=zI7naP@a<-sQ;!47FNLX26
z$t>m86H2;}Fo~I(<6PcQ(on~9X!b$vkUblwOT$G!=a?r8i#%}tK7u}$E(?52e5$%z
z*gDHJG1+X$Lc>MgN8*O8*Ce66mo|5>-H8DsEtVFV6DByx&-srR$fax7FT;7z+SNRt
z9uA<fRI#Z#k<e1~=lp5bg^2U5bUoF?#G>?!e{cTuJqZ2D`v2HZW++5+qqcUo+YTB|
z5%$vF(4+}4($33K3g=HB_{TU{ceuFp^0>{9KK<6e(WXyw-d)#qrSHrK4`C4W@n1a#
ze);9^9145>{DTL(fKE5<JItKKtvNOQ8}p}cRR1yxlfLiFPodE*YjdaDLo&(sROD^x
z)nf-@+E%|>hGUG_Lmbq~a<|`Q&!6_2p=}QBWMm2ea0_}P(FGrd?Dc08U2^rTakV#X
z(Xa@!XT)ZkA}VQUq(i@Jt;5YC4X0I>Gg1m|sD$YQ%kbjn(^BG&1Lw8KC-o-N)2xl)
z*{Cy)XwN6?Gk^8aUTcTgmT;Fg4`YW4w{vK-y7n%!r@hIhc*HIwuXW+JhA~|rCz7kD
zn_I9`2oX=0$j~<I*0LVK7ziv+^Jnacs5quKRvSbO0Wl!t;Y--o4KunSBgF0wJTGqL
zM0$-ZdWwF6muN!F=*sBJH&CXni_^){uFk4%qyRtNDmM?I+KUCj^y*pK`41v1``nR(
zN&_1-*+Gh37}~sTCKP0T+%TKY2g<UIzm8lzi(6~BXNVgt;uJCIq-U=wyaX7#aL@KJ
zG~VeJ5AX&Qfq~k`5B19f_O!7V!MwwrOfgmb1#zDH7M3q9WBUN7S&#zy?9n4?VA1o>
z30Q8Z3fn^)s82;V(zUh!)zHLRu!j}ZoYp3d(oJZC#sC`E$OSyN?QmdVhS@=Y=}{-(
zrZe5oLhIafo$g}7J)};MR=h6_ACB8crtjR2W%^xdvJl5O|J8<HMLxWW{nCjY<_Ast
z$olCqM1x^M9D*7L6xi?r<RmM4r|al6_pxUu>_AObq4NWVKM1)X&S53})42gHvR~Io
zCU5;QAF<y{$B_dheQN)yU5@T1fkHyXOhY>6wAcnkNvJ|GL31u)a|b4=={0}<yw2#m
zg&lgByC-w`l-Mzeq-5Om&HU;6VHQ5Xn#|Y6+%`S*kP~H`WjJ+qUZ(>XCp4!onNK0W
z_(((4S{kxYI#EJ5wb`r%y)(juZ6z1&m-~8h>Gr6A7gfNC(|{@KZm0;o+q!9y_RIk$
z7vs^vE!Ql8fE`MnZe=scf6dnoFWAP%kJAe|{{-W{H~n<>H}jV-^bZ=y^2RC~gR0n}
zkc;=A5NYAHD~0p8o<+db4A9Y@pgbl3uV0=OaV<lvCtXG<C=fAcnH6$92e}cqY@v6_
zpaPO}pjk0b<}j-nrhx^f&Z%Yron1u^x`Wa|a(=IU$!&F7nVDum;h?)GKbyZgp4Hu<
zI-Q20(K9r#d4;=ElD(YBm^%Cf0Sh^ziNn|fg<WIx*qlgtDWN?p^+d@KnWW`omYP4G
zvjfM4G&SALqA@Jq7L@=Z{a{y9x@}dr%`k#YS~;Qy%LnQXvs^CmWiepS$~(2O%yiU|
z<)ctS&cx68%LDqt0A<<{xoe3&0`2b;KH^IgHK+;d@CQ{us$dKl_h2r^J(%e&^by6H
zF%2SShz5~mHjqdkk@PnjQ2L01U5*A46Fcp1B%(sXN1W!s%|D>hN4#N5@hu%i?v24T
zJ(F?mj%30|pjAJ?#2fmEhk#j-o4F^<;Vp>-Cod#(nAA=*Rwo7g`!3#-%}x${>ipI5
z!O2`BF>Lq(lPBm7t^@2UBe@rEL(U*LH;rLfodHTRS5xE6G8Hmm;>xXh<Z{M|uND5o
z{P_p@syUmzh#Kw@oAMVF@Y=*`ME(o;_o^^z&IzCi_jybJUY}fz$fM-n8$_COf|!Q;
z6cmUE_cIbnb50?1;ohJElHq<vOqz3wIU4Q(3rr37Gl1qfar^Ik76?2a?=04H5KtvG
zrz{)_eE#Y<YR+Q;FEPYIrpy853heosiL)P{^N_kTF;$mzOM2$c5Z$xt6ek0Qrevb7
zS%=vWs+(Y2So>vJ0b73K;#4*}HwR>dIZnepaCUAE6hTQO%}vGzUetHYUmh$4KVTMi
zNfR2UFij8q9n>fC1Idl)ptnatpK)KC*w6+8W0gf`1F*Flj}<nwlxSFpY16)VeDkd4
zHbg|<o(AF@j~9x<DH`7xMD<^rXVnQ1QFnxp#wQL-qq?unvzmzzQBwvYUO6m{>by43
zC48);eEiH|X~d!}__&>n8g^^E;Ek(?MZ~f#5tC7yM2Vv=mLH8tX5`n~{)LiWmHTZs
zfIPlSt;OW-$JJWE$W++IXh3%+^~WyXxV7MQ%qHF^n5l3w%v3jld22x#%-CXJgClgR
zDUQ(j_@FSIvCSCFjLsA@r4yJ%=`>bD%O0UqReglcCyMja*iDR?(V1eVbON&|oyKzL
zFGlE8s1Z7^6z3;;9sm|&W^|^QDV@M9Lg#Ip;0OTqXie_JvV5|DfU-W8I1Zch%s+aK
zW;-RE7Lt;R(lwWfN<5k-*n@yrRL~=$aGDH@-g?Z^*m(16PpHNsaT!^{^P4=S4s$u4
zn4_g-PG6ET$^-GbU5eSWO3BPePsM`!RsujH>i4x$Ilc89&M<O!aBd1pNZ)s-)B+!i
z*jS<idhSAL*4Fhyj0sEGdRkdo?jtt7iDEaFRPY8zhnWQgtn8t9qs<>I<iv@W9VOLN
zWKXE<1ULar7t68ch4Xe?a3@W<0yo4_KsBcg(g=Zek}k_cTr)u3re#ALP_K?JUCP8K
zR1(Y`c}5mbA;FO!JV7n1$A=v~wFBz{u_$iRtTBIm!3lqQ<b@u2bLRf@aTnqWdU{Z|
z&U6PO(-XE-0+bmST}_#63ALMaym<3!STHPio9n5j`-J5(@N3DhFX?j@wk}}p4BF=)
z@zbMt%#k@0N$HV{_k4V~N*qtksbbLJ#l)5|JTru?IQ52V;kZ7Y{4{nceJtWFg<ZQX
zJS)kRF%~d(bnj3PF~N_z4$6M!dG~Co6`5boZsE+J+%^C4>e9-DdQnA3T!WrWLlMIw
zp{lN#KWL=_YYXWDsV)UqntpW`df>;3Z59c&tui(!b9f~fie^igAJ#+iN8yc(*PM%o
zgQ+lQ#JPjq$ur$Tjq@=p!HGQLh0Y9+wa(;}L%?JP$Ax_^nZjg87@pla_1*mCaesQs
zBt{CNc)%f4raCGP@*p<ByaS%_3iGEA@y*PBV8C0P{b<02Rv;rD4z3nnqe<AQ5z}OI
z>Jw5#o&C&kib4X2*-!d*E^m64IZa^19ws<l{896#k22BRb$fGjS;`2K$G11PST8OO
z`;QF!ZyNUBJnX+^*njJ=|F&WO?Zf^%hW&RA`|ld|-#zTVXV`!5u>Zbc|NX=Mqr?7V
z!~O?`{SOZN9~$;QJnVmD*#GFT|FL2J<HP<ZhW$?t`=1*2KRxU}KI}g+>|ZGz&1f`h
z_Eg0#9_P<(SztEeVN5IK^2&0qE&E{3&!4{dd8KC_9bCn0@XQtbyn>$t{6x|G*@p*L
z@ftjf&+!^Oi_h^IJd4lq8a#*3@fsXVxA`=q{yCuGHFyqacnzKd8eW6vfri)Md3=u7
p;CXzG*Wh`4j@JOEDqJmkvO@N>BDCH5J3T8tzwg#xE%c3^^?x0SNE-kE

literal 0
HcmV?d00001

diff --git a/tests/common/test_common.h b/tests/common/test_common.h
index ea700c30..936dda1b 100644
--- a/tests/common/test_common.h
+++ b/tests/common/test_common.h
@@ -10,6 +10,7 @@ bin_to_hex(const char *input, int length);
 Suite *make_suite_test_fifo(void);
 Suite *make_suite_test_list(void);
 Suite *make_suite_test_string(void);
+Suite *make_suite_test_string_unicode(void);
 Suite *make_suite_test_os_calls(void);
 Suite *make_suite_test_ssl_calls(void);
 Suite *make_suite_test_base64(void);
diff --git a/tests/common/test_common_main.c b/tests/common/test_common_main.c
index 05eacd27..d5471781 100644
--- a/tests/common/test_common_main.c
+++ b/tests/common/test_common_main.c
@@ -49,6 +49,7 @@ int main (void)
     sr = srunner_create (make_suite_test_fifo());
     srunner_add_suite(sr, make_suite_test_list());
     srunner_add_suite(sr, make_suite_test_string());
+    srunner_add_suite(sr, make_suite_test_string_unicode());
     srunner_add_suite(sr, make_suite_test_os_calls());
     srunner_add_suite(sr, make_suite_test_ssl_calls());
     srunner_add_suite(sr, make_suite_test_base64());
diff --git a/tests/common/test_string_calls_unicode.c b/tests/common/test_string_calls_unicode.c
new file mode 100644
index 00000000..e678ba30
--- /dev/null
+++ b/tests/common/test_string_calls_unicode.c
@@ -0,0 +1,835 @@
+/*
+ * The UTF-8 decoder tests are based on the "UTF-8 decoder capability
+ * and stress test" 2015-08-28 by Markus Kuhn. A copy of that file
+ * named "UTF-8-test.txt" should be in the source directory for this file */
+
+#if defined(HAVE_CONFIG_H)
+#include "config_ac.h"
+#endif
+
+#include "string_calls.h"
+
+#include "test_common.h"
+
+// Abbreviate UCS_REPLACEMENT_CHARACTER for utf8_decode_sub_test arrays
+#define URC UCS_REPLACEMENT_CHARACTER
+
+struct utf8_decode_sub_test
+{
+    const char *testref; // Label reported if the sub-test fails
+    const char *utf8str; // Input to decode (NULL ends an array of tests)
+    // This array will contain 0 values after the initialised part
+    const char32_t expected[65];
+};
+
+// Abbreviate UCS_REPLACEMENT_CHARACTER for utf8_encode_sub_test arrays
+#define E_URC { 0xef, 0xbf, 0xbd }
+
+struct utf8_encode_sub_test
+{
+    const char *testref; // Label reported if the sub-test fails
+    char32_t c32; // Character to encode
+    unsigned int expected_len; // Expected encoded length (0 ends an array)
+    char expected_str[MAXLEN_UTF8_CHAR]; // Expected bytes (unterminated)
+};
+
+// Used as the simple test in UTF-8-test.txt (11 bytes plus terminator)
+static const char greek_kosme[] =
+    "\xce\xba" // GREEK SMALL LETTER KAPPA
+    "\xe1\xbd\xb9" // GREEK SMALL LETTER OMICRON WITH OXIA
+    "\xcf\x83" // GREEK SMALL LETTER SIGMA
+    "\xce\xbc" // GREEK SMALL LETTER MU
+    "\xce\xb5"; // GREEK SMALL LETTER EPSILON
+
+// See Issue #2603. ASCII text followed by a 4-byte (non-BMP) character
+static const char simple_test_with_emoji[] =
+    "Simple Test."
+    "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face
+
+/******************************************************************************/
+/**
+ * Function to decode a UTF-8 string and check the expected result
+ *
+ * @param st Pointer to the sub-test to run
+ */
+static void
+run_utf8_decode_sub_test(const struct utf8_decode_sub_test *st)
+{
+    const char *cursor = st->utf8str;
+    const char32_t *want = st->expected;
+    char32_t got;
+
+    // Check every decoded character, terminator included
+    do
+    {
+        got = utf8_get_next_char(&cursor, NULL);
+        if (got != *want)
+        {
+            ck_abort_msg("Sub-test section %s Index %u expected %x, got %x",
+                         st->testref, (unsigned int)(want - st->expected),
+                         *want, got);
+        }
+        ++want;
+    }
+    while (got != 0);
+}
+
+/******************************************************************************/
+/**
+ * Function to run an array of decode sub-tests
+ *
+ * @param st Pointer to the first sub-test to run
+ */
+static void
+run_decode_sub_test_array(const struct utf8_decode_sub_test *st)
+{
+    for (; st->utf8str != NULL; ++st) // A NULL utf8str ends the array
+    {
+        run_utf8_decode_sub_test(st);
+    }
+}
+
+/******************************************************************************/
+/**
+ * Function to encode a UTF-8 value and check the expected result
+ *
+ * @param st Pointer to the sub-test to run
+ */
+static void
+run_utf8_encode_sub_test(const struct utf8_encode_sub_test *st)
+{
+    char encoded[MAXLEN_UTF8_CHAR];
+    unsigned int encoded_len = utf_char32_to_utf8(st->c32, encoded);
+    unsigned int i;
+
+    if (encoded_len != st->expected_len)
+    {
+        ck_abort_msg("Sub-test %s Expected length of %u, got %u",
+                     st->testref, st->expected_len, encoded_len);
+    }
+
+    // Report the first encoded byte which differs from the expected one
+    for (i = 0; i < encoded_len; ++i)
+    {
+        if (encoded[i] == st->expected_str[i])
+        {
+            continue;
+        }
+        ck_abort_msg("Sub-test %s Character %u, expected %02x got %02x",
+                     st->testref, i, (int)(unsigned char)st->expected_str[i],
+                     (int)(unsigned char)encoded[i]);
+    }
+}
+
+/******************************************************************************/
+/**
+ * Function to run an array of encode sub-tests
+ *
+ * @param st Pointer to the first sub-test to run
+ */
+static void
+run_encode_sub_test_array(const struct utf8_encode_sub_test *st)
+{
+    for (; st->expected_len > 0; ++st) // Zero expected_len ends the array
+    {
+        run_utf8_encode_sub_test(st);
+    }
+}
+
+
+/******************************************************************************/
+START_TEST(test_get_next_char__section_1)
+{
+    // Section 1: some correct UTF-8 text, using 2 and 3 byte sequences
+    const struct utf8_decode_sub_test kosme_test =
+    {
+        "1", greek_kosme,
+        {
+            0x03ba, // GREEK SMALL LETTER KAPPA
+            0x1f79, // GREEK SMALL LETTER OMICRON WITH OXIA
+            0x03c3, // GREEK SMALL LETTER SIGMA
+            0x03bc, // GREEK SMALL LETTER MU
+            0x03b5  // GREEK SMALL LETTER EPSILON
+        }
+    };
+
+    run_utf8_decode_sub_test(&kosme_test);
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_get_next_char__section_2)
+{
+    struct utf8_decode_sub_test tests[] =
+    {
+        // 2.1  First possible sequence of a certain length
+        //
+        // (2.1.1 Is tested separately)
+        { "2.1.2", "\xc2\x80", { 0x80 } },
+        { "2.1.3", "\xe0\xa0\x80", { 0x800 } },
+        { "2.1.4", "\xf0\x90\x80\x80", { 0x10000 } },
+        { "2.1.5", "\xf8\x88\x80\x80\x80", { URC } }, // 5 bytes: unsupported
+        { "2.1.6", "\xfc\x84\x80\x80\x80\x80", { URC } }, // 6 bytes: ditto
+
+        // 2.2  Last possible sequence of a certain length
+        { "2.2.1", "\x7f", { 0x7f } },
+        { "2.2.2", "\xdf\xbf", { 0x7ff } },
+        // Use U+0000FFFC instead of U+0000FFFF as our decoder
+        // treats non-characters as an input error
+        { "2.2.3", "\xef\xbf\xbc", { 0xfffc } },
+        // U+001FFFFF is out-of-range
+        { "2.2.4", "\xf7\xbf\xbf\xbf", { URC } },
+        { "2.2.5", "\xfb\xbf\xbf\xbf\xbf", { URC } },
+        { "2.2.6", "\xfd\xbf\xbf\xbf\xbf\xbf", { URC } },
+
+        // 2.3  Other boundary conditions
+        { "2.3.1", "\xed\x9f\xbf", { 0xd7ff } },
+        { "2.3.2", "\xee\x80\x80", { 0xe000 } },
+        { "2.3.3", "\xef\xbf\xbd", { 0xfffd } },
+        // Don't use U+10FFFF (non-character)
+        { "2.3.4", "\xf4\x8f\xbf\xbd", { 0x10fffd } },
+        { "2.3.5", "\xf4\x90\x80\x80", { URC } }, // U+110000: out-of-range
+        // Terminator (NULL utf8str)
+        { 0 }
+    };
+
+    // 2.1.1 is a '\0' which we use to terminate our strings. Test
+    // it separately
+    {
+        const char *p = "";
+
+        ck_assert_int_eq(utf8_get_next_char(&p, NULL), 0);
+    }
+
+    // Do the rest of the section 2 tests
+    run_decode_sub_test_array(tests);
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_get_next_char__section_3)
+{
+    struct utf8_decode_sub_test tests[] = // Section 3: malformed sequences
+    {
+        // 3.1  Unexpected continuation bytes
+        //
+        // Each unexpected continuation byte should be separately
+        // signalled as a malformed sequence of its own.
+        { "3.1.1", "\x80", { URC } },
+        { "3.1.2", "\xbf", { URC } },
+        { "3.1.3", "\x80\xbf", { URC, URC } },
+        { "3.1.4", "\x80\xbf\x80", { URC, URC, URC } },
+        { "3.1.5", "\x80\xbf\x80\xbf", { URC, URC, URC, URC } },
+        { "3.1.6", "\x80\xbf\x80\xbf\x80", { URC, URC, URC, URC, URC } },
+        {
+            "3.1.7",
+            "\x80\xbf\x80\xbf\x80\xbf",
+            { URC, URC, URC, URC, URC, URC }
+        },
+        {
+            "3.1.8",
+            "\x80\xbf\x80\xbf\x80\xbf\x80",
+            { URC, URC, URC, URC, URC, URC, URC }
+        },
+        {
+            "3.1.9",
+            "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+            "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+            "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+            "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf",
+            {
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC
+            }
+        },
+
+        // 3.2  Lonely start characters (each one is followed by a space)
+        {
+            "3.2.1",
+            "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 "
+            "\xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf "
+            "\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 "
+            "\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf ",
+            {
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' '
+            }
+        },
+        {
+            "3.2.2",
+            "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 "
+            "\xe8 \xe9 \xea \xeb \xec \xed \xee \xef ",
+            {
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' '
+            }
+        },
+        {
+            "3.2.3",
+            "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 ",
+            {
+                URC, ' ', URC, ' ', URC, ' ', URC, ' ',
+                URC, ' ', URC, ' ', URC, ' ', URC, ' '
+            }
+        },
+        {
+            "3.2.4",
+            "\xf8 \xf9 \xfa \xfb ",
+            {
+                URC, ' ', URC, ' ', URC, ' ', URC, ' '
+            }
+        },
+        {
+            "3.2.5", "\xfc \xfd ", { URC, ' ', URC, ' ' }
+        },
+
+        // 3.3  Sequences with last continuation byte missing
+        //
+        // From  UTF-8-test.txt:-
+        //     All bytes of an incomplete sequence should be signalled as
+        //     a single malformed sequence, i.e., you should see only a
+        //     single replacement character in each of the next 10 tests.
+        { "3.3.1", "\xc0", { URC } },
+        { "3.3.2", "\xe0\x80", { URC } },
+        { "3.3.3", "\xf0\x80\x80", { URC } },
+        { "3.3.4", "\xf8\x80\x80\x80", { URC } },
+        { "3.3.5", "\xfc\x80\x80\x80\x80", { URC } },
+
+        { "3.3.6", "\xdf", { URC } },
+        { "3.3.7", "\xef\xbf", { URC } },
+        { "3.3.8", "\xf7\xbf\xbf", { URC } },
+        { "3.3.9", "\xfb\xbf\xbf\xbf", { URC } },
+        { "3.3.10", "\xfd\xbf\xbf\xbf\xbf", { URC } },
+
+        // 3.4  Concatenation of incomplete sequences
+        {
+            "3.4", // The ten inputs of 3.3 joined into one string
+            "\xc0"
+            "\xe0\x80"
+            "\xf0\x80\x80"
+            "\xf8\x80\x80\x80"
+            "\xfc\x80\x80\x80\x80"
+            "\xdf"
+            "\xef\xbf"
+            "\xf7\xbf\xbf"
+            "\xfb\xbf\xbf\xbf"
+            "\xfd\xbf\xbf\xbf\xbf",
+            {
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC
+            }
+        },
+
+        // 3.5  Impossible bytes
+        { "3.5.1", "\xfe", { URC } },
+        { "3.5.2", "\xff", { URC } },
+        { "3.5.3", "\xfe\xfe\xff\xff", { URC, URC, URC, URC } },
+        // Terminator
+        { 0 }
+    };
+
+    run_decode_sub_test_array(tests);
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_get_next_char__section_4)
+{
+    struct utf8_decode_sub_test tests[] = // Section 4: overlong sequences
+    {
+        // 4.1  Examples of an overlong ASCII character
+        //
+        // With a safe UTF-8 decoder, all of the following five
+        // overlong representations of the ASCII character slash ("/")
+        // should be rejected like a malformed UTF-8 sequence, for
+        // instance by substituting it with a replacement character. If
+        // you see a slash below, you do not have a safe UTF-8 decoder!
+        { "4.1.1", "\xc0\xaf", { URC } },
+        { "4.1.2", "\xe0\x80\xaf", { URC } },
+        { "4.1.3", "\xf0\x80\x80\xaf", { URC } },
+        { "4.1.4", "\xf8\x80\x80\x80\xaf", { URC } },
+        { "4.1.5", "\xfc\x80\x80\x80\x80\xaf", { URC } },
+
+        // 4.2  Maximum overlong sequences
+
+        // Below you see the highest Unicode value that is still resulting
+        // in an overlong sequence if represented with the given number of
+        // bytes. This is a boundary test for safe UTF-8 decoders. All
+        // five characters should be rejected like malformed UTF-8
+        // sequences.
+        { "4.2.1", "\xc1\xbf", { URC } },
+        { "4.2.2", "\xe0\x9f\xbf", { URC } },
+        { "4.2.3", "\xf0\x8f\xbf\xbf", { URC } },
+        { "4.2.4", "\xf8\x87\xbf\xbf\xbf", { URC } },
+        { "4.2.5", "\xfc\x83\xbf\xbf\xbf\xbf", { URC } },
+
+        // 4.3  Overlong representation of the NUL character
+
+        // The following five sequences should also be rejected like
+        // malformed UTF-8 sequences and should not be treated like the
+        // ASCII NUL character.
+        { "4.3.1", "\xc0\x80", { URC } },
+        { "4.3.2", "\xe0\x80\x80", { URC } },
+        { "4.3.3", "\xf0\x80\x80\x80", { URC } },
+        { "4.3.4", "\xf8\x80\x80\x80\x80", { URC } },
+        { "4.3.5", "\xfc\x80\x80\x80\x80\x80", { URC } },
+
+        // Terminator
+        { 0 }
+    };
+
+    run_decode_sub_test_array(tests);
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_get_next_char__section_5)
+{
+    struct utf8_decode_sub_test tests[] =
+    {
+        // 5  Illegal code positions
+
+        // The following UTF-8 sequences should be rejected like
+        // malformed sequences, because they never represent valid
+        // ISO 10646 characters and a UTF-8 decoder that accepts them
+        // might introduce security problems comparable to overlong
+        // UTF-8 sequences.
+
+        // 5.1 Single UTF-16 surrogates
+        { "5.1.1", "\xed\xa0\x80", { URC } },
+        { "5.1.2", "\xed\xad\xbf", { URC } },
+        { "5.1.3", "\xed\xae\x80", { URC } },
+        { "5.1.4", "\xed\xaf\xbf", { URC } },
+        { "5.1.5", "\xed\xb0\x80", { URC } },
+        { "5.1.6", "\xed\xbe\x80", { URC } },
+        { "5.1.7", "\xed\xbf\xbf", { URC } },
+
+        // 5.2 Paired UTF-16 surrogates
+        { "5.2.1", "\xed\xa0\x80\xed\xb0\x80", { URC, URC } },
+        { "5.2.2", "\xed\xa0\x80\xed\xbf\xbf", { URC, URC } },
+        { "5.2.3", "\xed\xad\xbf\xed\xb0\x80", { URC, URC } },
+        { "5.2.4", "\xed\xad\xbf\xed\xbf\xbf", { URC, URC } },
+        { "5.2.5", "\xed\xae\x80\xed\xb0\x80", { URC, URC } },
+        { "5.2.6", "\xed\xae\x80\xed\xbf\xbf", { URC, URC } },
+        { "5.2.7", "\xed\xaf\xbf\xed\xb0\x80", { URC, URC } },
+        { "5.2.8", "\xed\xaf\xbf\xed\xbf\xbf", { URC, URC } },
+
+        // 5.3 Noncharacter code positions
+
+        // The following "noncharacters" are "reserved for internal
+        // use" by applications, and according to older versions of
+        // the Unicode Standard "should never be interchanged". Unicode
+        // Corrigendum #9 dropped the latter restriction. Nevertheless,
+        // their presence in incoming UTF-8 data can remain a potential
+        // security risk, depending on what use is made of these codes
+        // subsequently. Examples of such internal use:
+        //
+        //  - Some file APIs with 16-bit characters may use the integer
+        //    value -1 = U+FFFF to signal an end-of-file (EOF) or error
+        //    condition.
+        //
+        //  - In some UTF-16 receivers, code point U+FFFE might trigger
+        //    a byte-swap operation (to convert between UTF-16LE and
+        //    UTF-16BE).
+        // With such internal use of noncharacters, it may be desirable
+        // and safer to block those code points in UTF-8 decoders, as
+        // they should never occur legitimately in incoming UTF-8 data,
+        // and could trigger unsafe behaviour in subsequent processing.
+
+        // Particularly problematic noncharacters in 16-bit applications:
+        { "5.3.1", "\xef\xbf\xbe", { URC } },
+        { "5.3.2", "\xef\xbf\xbf", { URC } },
+
+        // Other noncharacters:
+        {
+            "5.3.3",
+            // Non-characters in "Arabic Presentation Forms-A" (BMP)
+            "\xef\xb7\x90" "\xef\xb7\x91" "\xef\xb7\x92" "\xef\xb7\x93"
+            "\xef\xb7\x94" "\xef\xb7\x95" "\xef\xb7\x96" "\xef\xb7\x97"
+            "\xef\xb7\x98" "\xef\xb7\x99" "\xef\xb7\x9a" "\xef\xb7\x9b"
+            "\xef\xb7\x9c" "\xef\xb7\x9d" "\xef\xb7\x9e" "\xef\xb7\x9f"
+            "\xef\xb7\xa0" "\xef\xb7\xa1" "\xef\xb7\xa2" "\xef\xb7\xa3"
+            "\xef\xb7\xa4" "\xef\xb7\xa5" "\xef\xb7\xa6" "\xef\xb7\xa7"
+            "\xef\xb7\xa8" "\xef\xb7\xa9" "\xef\xb7\xaa" "\xef\xb7\xab"
+            "\xef\xb7\xac" "\xef\xb7\xad" "\xef\xb7\xae" "\xef\xb7\xaf",
+            {
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC
+            }
+        },
+
+        {
+            "5.3.4",
+            "\xf0\x9f\xbf\xbe" "\xf0\x9f\xbf\xbf" // U+0001FFFE U+0001FFFF
+            "\xf0\xaf\xbf\xbe" "\xf0\xaf\xbf\xbf" // U+0002FFFE U+0002FFFF
+            "\xf0\xbf\xbf\xbe" "\xf0\xbf\xbf\xbf" // U+0003FFFE U+0003FFFF
+            "\xf1\x8f\xbf\xbe" "\xf1\x8f\xbf\xbf" // U+0004FFFE U+0004FFFF
+            "\xf1\x9f\xbf\xbe" "\xf1\x9f\xbf\xbf" // U+0005FFFE U+0005FFFF
+            "\xf1\xaf\xbf\xbe" "\xf1\xaf\xbf\xbf" // U+0006FFFE U+0006FFFF
+            "\xf1\xbf\xbf\xbe" "\xf1\xbf\xbf\xbf" // U+0007FFFE U+0007FFFF
+            "\xf2\x8f\xbf\xbe" "\xf2\x8f\xbf\xbf" // U+0008FFFE U+0008FFFF
+            "\xf2\x9f\xbf\xbe" "\xf2\x9f\xbf\xbf" // U+0009FFFE U+0009FFFF
+            "\xf2\xaf\xbf\xbe" "\xf2\xaf\xbf\xbf" // U+000AFFFE U+000AFFFF
+            "\xf2\xbf\xbf\xbe" "\xf2\xbf\xbf\xbf" // U+000BFFFE U+000BFFFF
+            "\xf3\x8f\xbf\xbe" "\xf3\x8f\xbf\xbf" // U+000CFFFE U+000CFFFF
+            "\xf3\x9f\xbf\xbe" "\xf3\x9f\xbf\xbf" // U+000DFFFE U+000DFFFF
+            "\xf3\xaf\xbf\xbe" "\xf3\xaf\xbf\xbf" // U+000EFFFE U+000EFFFF
+            "\xf3\xbf\xbf\xbe" "\xf3\xbf\xbf\xbf" // U+000FFFFE U+000FFFFF
+            "\xf4\x8f\xbf\xbe" "\xf4\x8f\xbf\xbf",// U+0010FFFE U+0010FFFF
+            {
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC,
+                URC, URC, URC, URC, URC, URC, URC, URC
+            }
+        },
+
+        // Last line of UTF-8-test.txt
+        { "TheEnd", "THE END\n", { 'T', 'H', 'E', ' ', 'E', 'N', 'D', '\n'} },
+
+        // Terminator
+        { 0 }
+
+    };
+
+    run_decode_sub_test_array(tests);
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_utf_char32_to_utf8)
+{
+    struct utf8_encode_sub_test tests[] =
+    {
+
+        // E2.1  First possible sequence of a certain length
+        //
+        { "E2.1.1", 0, 1, { 0 } },
+        { "E2.1.2", 0x80, 2, { 0xc2, 0x80 } },
+        { "E2.1.3", 0x800, 3, { 0xe0, 0xa0, 0x80 } },
+        { "E2.1.4", 0x10000, 4, { 0xf0, 0x90, 0x80, 0x80 } },
+
+        // E2.2  Last possible sequence of a certain length
+        { "E2.2.1", 0x7f, 1, { 0x7f } },
+        { "E2.2.2", 0x7ff, 2, { 0xdf, 0xbf } },
+        { "E2.2.3", 0xfffc, 3, { 0xef, 0xbf, 0xbc } }, // See 2.2.3 above
+        { "E2.2.4", 0x1FFFFF, 3, E_URC }, // out-of-range
+
+        // E2.3  Other boundary conditions
+        { "E2.3.1", 0xd7ff, 3, { 0xed, 0x9f, 0xbf } },
+        { "E2.3.2", 0xe000, 3, { 0xee, 0x80, 0x80 } },
+        { "E2.3.3", 0xfffd, 3, { 0xef, 0xbf, 0xbd } },
+        { "E2.3.4", 0x10fffd, 4, { 0xf4, 0x8f, 0xbf, 0xbd } }, // See 2.3.4 above
+        // E2.3.5 - not tested
+
+        // E5.1 Single UTF-16 surrogates
+        { "E5.1.1", 0xd800, 3, E_URC },
+        { "E5.1.2", 0xdb7f, 3, E_URC },
+        { "E5.1.3", 0xdb80, 3, E_URC },
+        { "E5.1.4", 0xdbff, 3, E_URC },
+        { "E5.1.5", 0xdc00, 3, E_URC },
+        { "E5.1.6", 0xdf80, 3, E_URC },
+        { "E5.1.7", 0xdfff, 3, E_URC },
+
+        // E5.3 Non-character code positions
+        { "E5.3.3(0)", 0xfdd0, 3, E_URC },
+        { "E5.3.3(1)", 0xfdd1, 3, E_URC },
+        { "E5.3.3(2)", 0xfdd2, 3, E_URC },
+        { "E5.3.3(3)", 0xfdd3, 3, E_URC },
+        { "E5.3.3(4)", 0xfdd4, 3, E_URC },
+        { "E5.3.3(5)", 0xfdd5, 3, E_URC },
+        { "E5.3.3(6)", 0xfdd6, 3, E_URC },
+        { "E5.3.3(7)", 0xfdd7, 3, E_URC },
+        { "E5.3.3(8)", 0xfdd8, 3, E_URC },
+        { "E5.3.3(9)", 0xfdd9, 3, E_URC },
+        { "E5.3.3(10)", 0xfdda, 3, E_URC },
+        { "E5.3.3(11)", 0xfddb, 3, E_URC },
+        { "E5.3.3(12)", 0xfddc, 3, E_URC },
+        { "E5.3.3(13)", 0xfddd, 3, E_URC },
+        { "E5.3.3(14)", 0xfdde, 3, E_URC },
+        { "E5.3.3(15)", 0xfddf, 3, E_URC },
+        { "E5.3.3(16)", 0xfde0, 3, E_URC },
+        { "E5.3.3(17)", 0xfde1, 3, E_URC },
+        { "E5.3.3(18)", 0xfde2, 3, E_URC },
+        { "E5.3.3(19)", 0xfde3, 3, E_URC },
+        { "E5.3.3(20)", 0xfde4, 3, E_URC },
+        { "E5.3.3(21)", 0xfde5, 3, E_URC },
+        { "E5.3.3(22)", 0xfde6, 3, E_URC },
+        { "E5.3.3(23)", 0xfde7, 3, E_URC },
+        { "E5.3.3(24)", 0xfde8, 3, E_URC },
+        { "E5.3.3(25)", 0xfde9, 3, E_URC },
+        { "E5.3.3(26)", 0xfdea, 3, E_URC },
+        { "E5.3.3(27)", 0xfdeb, 3, E_URC },
+        { "E5.3.3(28)", 0xfdec, 3, E_URC },
+        { "E5.3.3(29)", 0xfded, 3, E_URC },
+        { "E5.3.3(30)", 0xfdee, 3, E_URC },
+        { "E5.3.3(31)", 0xfdef, 3, E_URC },
+        { "E5.3.4(0)", 0x1fffe, 3, E_URC },
+        { "E5.3.4(1)", 0x1ffff, 3, E_URC },
+        { "E5.3.4(2)", 0x2fffe, 3, E_URC },
+        { "E5.3.4(3)", 0x2ffff, 3, E_URC },
+        { "E5.3.4(4)", 0x3fffe, 3, E_URC },
+        { "E5.3.4(5)", 0x3ffff, 3, E_URC },
+        { "E5.3.4(6)", 0x4fffe, 3, E_URC },
+        { "E5.3.4(7)", 0x4ffff, 3, E_URC },
+        { "E5.3.4(8)", 0x5fffe, 3, E_URC },
+        { "E5.3.4(9)", 0x5ffff, 3, E_URC },
+        { "E5.3.4(10)", 0x6fffe, 3, E_URC },
+        { "E5.3.4(11)", 0x6ffff, 3, E_URC },
+        { "E5.3.4(12)", 0x7fffe, 3, E_URC },
+        { "E5.3.4(13)", 0x7ffff, 3, E_URC },
+        { "E5.3.4(14)", 0x8fffe, 3, E_URC },
+        { "E5.3.4(15)", 0x8ffff, 3, E_URC },
+        { "E5.3.4(16)", 0x9fffe, 3, E_URC },
+        { "E5.3.4(17)", 0x9ffff, 3, E_URC },
+        { "E5.3.4(18)", 0xafffe, 3, E_URC },
+        { "E5.3.4(19)", 0xaffff, 3, E_URC },
+        { "E5.3.4(20)", 0xbfffe, 3, E_URC },
+        { "E5.3.4(21)", 0xbffff, 3, E_URC },
+        { "E5.3.4(22)", 0xcfffe, 3, E_URC },
+        { "E5.3.4(23)", 0xcffff, 3, E_URC },
+        { "E5.3.4(24)", 0xdfffe, 3, E_URC },
+        { "E5.3.4(25)", 0xdffff, 3, E_URC },
+        { "E5.3.4(26)", 0xefffe, 3, E_URC },
+        { "E5.3.4(27)", 0xeffff, 3, E_URC },
+        { "E5.3.4(28)", 0xffffe, 3, E_URC },
+        { "E5.3.4(29)", 0xfffff, 3, E_URC },
+        { "E5.3.4(30)", 0x10fffe, 3, E_URC },
+        { "E5.3.4(31)", 0x10ffff, 3, E_URC },
+        { "E5.99.0", 'T', 1, { 'T' } }, // Single-byte ASCII from here on
+        { "E5.99.1", 'H', 1, { 'H' } },
+        { "E5.99.2", 'E', 1, { 'E' } },
+        { "E5.99.3", ' ', 1, { ' ' } },
+        { "E5.99.4", 'E', 1, { 'E' } },
+        { "E5.99.5", 'N', 1, { 'N' } },
+        { "E5.99.6", 'D', 1, { 'D' } },
+
+        // Terminator
+        { 0 }
+    };
+
+    run_encode_sub_test_array(tests);
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_utf8_char_count)
+{
+    // Check function can cope with NULL argument
+    ck_assert_int_eq(utf8_char_count(NULL), 0);
+
+    unsigned int kosme_strlen = strlen(greek_kosme);
+    unsigned int kosme_len = utf8_char_count(greek_kosme);
+
+    // All characters map to two bytes except for the 'omicron with oxia'
+    // which maps to three
+    ck_assert_int_eq(kosme_strlen, 2 + 3 + 2 + 2 + 2);
+    ck_assert_int_eq(kosme_len, 5);
+
+    unsigned int simple_test_strlen = strlen(simple_test_with_emoji);
+    unsigned int simple_test_len = utf8_char_count(simple_test_with_emoji);
+
+    ck_assert_int_eq(simple_test_strlen,
+                     (1 + 1 + 1 + 1 + 1 + 1 ) + // Simple
+                     1 +
+                     (1 + 1 + 1 + 1 ) + // Test
+                     1 +
+                     4);               // emoji
+    // The emoji is 4 bytes but one character - all others are 1 byte
+    ck_assert_int_eq(simple_test_len, simple_test_strlen - 3);
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_utf8_as_utf16_word_count)
+{
+    unsigned int kosme_count =
+        utf8_as_utf16_word_count(greek_kosme, strlen(greek_kosme));
+
+    ck_assert_int_eq(kosme_count, 5); // All characters in BMP
+
+    unsigned int simple_test_count =
+        utf8_as_utf16_word_count(simple_test_with_emoji,
+                                 strlen(simple_test_with_emoji));
+
+    ck_assert_int_eq(simple_test_count,
+                     (1 + 1 + 1 + 1 + 1 + 1 ) + // Simple
+                     1 +
+                     (1 + 1 + 1 + 1 ) + // Test
+                     1 +
+                     2); // emoji is expected to need two UTF-16 words
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_utf8_add_char_at)
+{
+#define TEST_SIZE sizeof(simple_test_with_emoji)
+
+    // Type pairing a string position with a Unicode char
+    struct pos_to_char_map
+    {
+        unsigned int pos;
+        char32_t c32;
+    };
+
+    // Buffer for constructing the string
+    char buff[TEST_SIZE];
+
+    // A pseudo-random map of the characters in simple_test_with_emoji
+    const struct pos_to_char_map map[] =
+    {
+        { 0, 'l' },
+        { 0, 'S' },
+        { 1, 'i' },
+        { 2, 'm' },
+        { 4, 0x1f625 },
+        { 4, '.' },
+        { 4, 'e' },
+        { 5, 'T' },
+        { 3, 'p' },
+        { 7, 't' },
+        { 7, 'e' },
+        { 8, 's' },
+        { 6, ' ' },
+        { 0 } // Terminator: a c32 of zero ends the loop below
+    };
+
+    buff[0] = '\0';
+
+    // Construct the string in a pseudo-random fashion
+
+    const struct pos_to_char_map *p;
+    for (p = map; p->c32 != 0 ; ++p)
+    {
+        if (!utf8_add_char_at(buff, TEST_SIZE, p->c32, p->pos))
+        {
+            ck_abort_msg("test_utf8_add_char_at: "
+                         "Can't insert char %x at pos %u",
+                         (unsigned int)p->c32, // char32_t need not be 'unsigned'
+                         p->pos);
+        }
+    }
+
+    // Should have reached the buffer size by now
+    ck_assert_int_eq(strlen(buff), TEST_SIZE - 1);
+
+    // Check the string is what we expect
+    ck_assert_int_eq(strcmp(buff, simple_test_with_emoji), 0);
+
+    // Try to insert another character
+    if (utf8_add_char_at(buff, TEST_SIZE, ' ', 0))
+    {
+        ck_abort_msg("test_utf8_add_char_at: "
+                     "Insert succeeded but should have failed");
+    }
+
+#undef TEST_SIZE
+}
+END_TEST
+
+/******************************************************************************/
+START_TEST(test_utf8_remove_char_at)
+{
+#define TEST_SIZE sizeof(simple_test_with_emoji)
+    // Type pairing a string position with a Unicode char
+    struct pos_to_char_map
+    {
+        unsigned int pos;
+        char32_t c32;
+    };
+
+    // Buffer for deconstructing the string
+    char buff[TEST_SIZE];
+
+    // A pseudo-random map of the characters in simple_test_with_emoji
+    const struct pos_to_char_map map[] =
+    {
+        { 2, 'm' },
+        { 7, 'e' },
+        { 5, ' ' },
+        { 1, 'i' },
+        { 2, 'l' },
+        { 3, 'T' },
+        { 6, 0x1f625 },
+        { 2, 'e' },
+        { 3, 't' },
+        { 3, '.' },
+        { 2, 's' },
+        { 1, 'p' },
+        { 0, 'S' },
+        { 0 } // Terminator: a c32 of zero ends the loop below
+    };
+
+    char32_t c32;
+
+    strcpy(buff, simple_test_with_emoji);
+
+    // Deconstruct the string in a pseudo-random fashion
+    const struct pos_to_char_map *p;
+    for (p = map; p->c32 != 0 ; ++p)
+    {
+        c32 = utf8_remove_char_at(buff, p->pos);
+        if (c32 != p->c32)
+        {
+            ck_abort_msg("test_utf8_remove_char_at: "
+                         "remove char at pos %u was %x, expected %x",
+                         p->pos, (unsigned int)c32, (unsigned int)p->c32);
+        }
+    }
+
+    // Should have emptied the buffer by now
+    ck_assert_int_eq(buff[0], '\0');
+
+    // Removing from an empty string should return 0 and leave it empty
+    c32 = utf8_remove_char_at(buff, 0);
+    ck_assert_int_eq(c32, 0);
+    c32 = utf8_remove_char_at(buff, 99);
+    ck_assert_int_eq(c32, 0);
+    ck_assert_int_eq(buff[0], '\0');
+
+#undef TEST_SIZE
+}
+END_TEST
+
+/******************************************************************************/
+
+Suite *
+make_suite_test_string_unicode(void)
+{
+    Suite *suite = suite_create("String");
+    TCase *tc = tcase_create("Unicode");
+
+    suite_add_tcase(suite, tc);
+
+    // utf8_get_next_char() decoder tests, sections 1 to 5
+    tcase_add_test(tc, test_get_next_char__section_1);
+    tcase_add_test(tc, test_get_next_char__section_2);
+    tcase_add_test(tc, test_get_next_char__section_3);
+    tcase_add_test(tc, test_get_next_char__section_4);
+    tcase_add_test(tc, test_get_next_char__section_5);
+    // Encoder, counting and string editing tests
+    tcase_add_test(tc, test_utf_char32_to_utf8);
+    tcase_add_test(tc, test_utf8_char_count);
+    tcase_add_test(tc, test_utf8_as_utf16_word_count);
+    tcase_add_test(tc, test_utf8_add_char_at);
+    tcase_add_test(tc, test_utf8_remove_char_at);
+
+    return suite;
+}
diff --git a/xrdp/xrdp_font.c b/xrdp/xrdp_font.c
index f88ec93c..7eb0893a 100644
--- a/xrdp/xrdp_font.c
+++ b/xrdp/xrdp_font.c
@@ -52,9 +52,6 @@ static char w_char[] =
 };
 #endif
 
-// Unicode definitions
-#define UNICODE_WHITE_SQUARE 0x25a1
-
 // First character allocated in the 'struct xrdp_font.chars' array
 #define FIRST_CHAR ' '
 
@@ -354,9 +351,9 @@ xrdp_font_create(struct xrdp_wm *wm, unsigned int dpi)
                 }
 
                 // Find a default glyph
-                if (char_count > UNICODE_WHITE_SQUARE)
+                if (char_count > UCS_WHITE_SQUARE)
                 {
-                    self->default_char = &self->chars[UNICODE_WHITE_SQUARE];
+                    self->default_char = &self->chars[UCS_WHITE_SQUARE];
                 }
                 else if (char_count > '?')
                 {