Add UTF-16 LE I/O routines

These are intended to replace UTF-16 uses of mbstowcs() / wcstombs()
2023-09-19 11:36:13 +01:00 · 2023-09-19 11:36:13 +01:00 · 0758fe03a6
commit 0758fe03a6
parent 0463e552dc
7 changed files with 705 additions and 0 deletions
--- a/common/parse.c
+++ b/common/parse.c
@ -27,7 +27,47 @@
 #include "arch.h"
 #include "parse.h"
 #include "log.h"
+#include "string_calls.h"
+#include "unicode_defines.h"

+/******************************************************************************/
+
+/*
+ * Writes a 16-bit value to stream s in little-endian order and advances
+ * s->p by two octets. No check is made that the stream has room for the
+ * write; callers are responsible for that.
+ *
+ * Octet-at-a-time stores are used when B_ENDIAN or NEED_ALIGN is defined;
+ * otherwise a single 16-bit store is made through s->p.
+ */
+#if defined(B_ENDIAN) || defined(NEED_ALIGN)
+#define out_uint16_le_unchecked(s, v) do \
+    { \
+        *((s)->p) = (unsigned char)((v) >> 0); \
+        (s)->p++; \
+        *((s)->p) = (unsigned char)((v) >> 8); \
+        (s)->p++; \
+    } while (0)
+#else
+#define out_uint16_le_unchecked(s, v) do \
+    { \
+        *((unsigned short*)((s)->p)) = (unsigned short)(v); \
+        (s)->p += 2; \
+    } while (0)
+#endif
+
+/******************************************************************************/
+/*
+ * Reads a 16-bit little-endian value from stream s into v and advances
+ * s->p by two octets. No check is made that two octets are available;
+ * callers are responsible for that.
+ *
+ * The value is assembled from individual octets when B_ENDIAN or
+ * NEED_ALIGN is defined; otherwise a single 16-bit load is made through
+ * s->p.
+ */
+#if defined(B_ENDIAN) || defined(NEED_ALIGN)
+#define in_uint16_le_unchecked(s, v) do \
+    { \
+        (v) = (unsigned short) \
+              ( \
+                (*((unsigned char*)((s)->p + 0)) << 0) | \
+                (*((unsigned char*)((s)->p + 1)) << 8) \
+              ); \
+        (s)->p += 2; \
+    } while (0)
+#else
+#define in_uint16_le_unchecked(s, v) do \
+    { \
+        (v) = *((unsigned short*)((s)->p)); \
+        (s)->p += 2; \
+    } while (0)
+#endif
+
+/******************************************************************************/
 void
 parser_stream_overflow_check(const struct stream *s, int n, int is_out,
                             const char *file, int line)
@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
        }
    }
 }
+
+/******************************************************************************/
+void
+out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
+                          unsigned int vn,
+                          const char *file, int line)
+{
+#ifdef USE_DEVEL_STREAMCHECK
+    /* Equivalent of S_CHECK_REM_OUT(s, <octet_count>), but reported
+     * against the caller's file and line rather than this one */
+    int needed_octets = utf8_as_utf16_word_count(v, vn) * 2;
+    parser_stream_overflow_check(s, needed_octets, 1, file, line);
+#endif
+
+    /* Convert one Unicode character per iteration until the UTF-8
+     * input is used up */
+    while (vn > 0)
+    {
+        const char32_t c32 = utf8_get_next_char(&v, &vn);
+
+        if (c32 >= 0x10000)
+        {
+            /* Outside the BMP - emit a surrogate pair, high word first */
+            char16_t high = HIGH_SURROGATE_FROM_C32(c32);
+            char16_t low = LOW_SURROGATE_FROM_C32(c32);
+            out_uint16_le_unchecked(s, high);
+            out_uint16_le_unchecked(s, low);
+        }
+        else
+        {
+            /* Fits in a single UTF-16 word */
+            char16_t single = (char16_t)c32;
+            out_uint16_le_unchecked(s, single);
+        }
+    }
+}
+
+/******************************************************************************/
+/**
+ * Gets the next Unicode character from a UTF-16 LE code stream
+ * @param s Stream
+ * @return Unicode character
+ *
+ * Non-characters and illegally coded characters are mapped to
+ * UCS_REPLACEMENT_CHARACTER
+ *
+ * A high surrogate which is followed by something other than a low
+ * surrogate consumes only the high surrogate; the following word is
+ * left on the stream to be read by the next call.
+ *
+ * @pre Two bytes are assumed to be available on the stream on entry
+ */
+static char32_t
+get_c32_from_stream(struct stream *s)
+{
+    char32_t c32 = UCS_REPLACEMENT_CHARACTER; // Assume failure
+    char16_t w;
+
+    in_uint16_le_unchecked(s, w);
+
+    if (IS_HIGH_SURROGATE(w))
+    {
+        /* The second half of the pair may be missing at end-of-stream */
+        if (s_check_rem(s, 2))
+        {
+            char16_t low;
+            in_uint16_le_unchecked(s, low);
+            if (IS_LOW_SURROGATE(low))
+            {
+                /* Valid surrogate pair */
+                char32_t v = C32_FROM_SURROGATE_PAIR(low, w);
+
+                /* Ignore some values which can be successfully encoded
+                 * in this way. The decoded value is what must be tested;
+                 * c32 still holds the replacement character here */
+                if (!IS_PLANE_END_NON_CHARACTER(v))
+                {
+                    c32 = v;
+                }
+            }
+            else
+            {
+                /* Invalid low surrogate  - pop character back */
+                s->p -= 2;
+            }
+        }
+    }
+    else if (!IS_LOW_SURROGATE(w) &&
+             !IS_PLANE_END_NON_CHARACTER(w) &&
+             !IS_ARABIC_NON_CHARACTER(w))
+    {
+        /* Character from the Basic Multilingual Plane */
+        c32 = (char32_t)w;
+    }
+
+    return c32;
+}
+
+/******************************************************************************/
+unsigned int
+in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
+                               char *v, unsigned int vn,
+                               const char *file, int line)
+{
+    /* The count starts at one to account for the terminator */
+    unsigned int total = 1;
+    char *const real_end = s->end;
+    const int field_octets = (int)(n * 2);
+
+#ifdef USE_DEVEL_STREAMCHECK
+    // Equivalent of S_CHECK_REM(s, n*2), reported against the caller's
+    // file and line
+    parser_stream_overflow_check(s, n * 2, 0, file, line);
+#endif
+
+    // Clamp the stream end to the fixed-size field, so s_check_rem()
+    // can be used to detect the end of the UTF-16 input
+    if (s->end - s->p > field_octets)
+    {
+        s->end = s->p + field_octets;
+    }
+
+    while (s_check_rem(s, 2))
+    {
+        char utf8[MAXLEN_UTF8_CHAR];
+        const char32_t c32 = get_c32_from_stream(s);
+        const unsigned int len = utf_char32_to_utf8(c32, utf8);
+
+        /* The total is accumulated whether or not the character fits */
+        total += len;
+
+        if (len + 1 > vn)
+        {
+            /* No room for this character plus a terminator. If more
+             * than one byte is left in the output buffer, mark it as
+             * full so that a later, shorter character cannot be
+             * squeezed into the remaining space */
+            if (vn > 1)
+            {
+                vn = 1;
+            }
+        }
+        else
+        {
+            /* Append the character, leaving room for the terminator */
+            unsigned int i;
+            for (i = 0 ; i < len ; ++i)
+            {
+                *v++ = utf8[i];
+            }
+            vn -= len;
+        }
+    }
+
+    // Put the real end-of-stream pointer back
+    s->end = real_end;
+
+    if (vn > 0)
+    {
+        *v = '\0';
+    }
+
+    return total;
+}
+
+/******************************************************************************/
+unsigned int
+in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
+{
+    /* Run the conversion with no output buffer to obtain the size
+     * needed, then rewind so the caller sees an unmoved stream */
+    char *const start = s->p;
+    const unsigned int needed = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);
+
+    s->p = start;
+    return needed;
+}
+
+/******************************************************************************/
+unsigned int
+in_utf16_le_terminated_as_utf8(struct stream *s,
+                               char *v, unsigned int vn)
+{
+    /* The count starts at one to account for the terminator */
+    unsigned int total = 1;
+    char32_t c32;
+
+    /* Stop at the end of the stream, or as soon as a terminator word
+     * has been read (the terminator itself is consumed) */
+    while (s_check_rem(s, 2) && (c32 = get_c32_from_stream(s)) != 0)
+    {
+        char utf8[MAXLEN_UTF8_CHAR];
+        const unsigned int len = utf_char32_to_utf8(c32, utf8);
+
+        /* The total is accumulated whether or not the character fits */
+        total += len;
+
+        if (len + 1 > vn)
+        {
+            /* No room for this character plus a terminator. If more
+             * than one byte is left in the output buffer, mark it as
+             * full so that a later, shorter character cannot be
+             * squeezed into the remaining space */
+            if (vn > 1)
+            {
+                vn = 1;
+            }
+        }
+        else
+        {
+            /* Append the character, leaving room for the terminator */
+            unsigned int i;
+            for (i = 0 ; i < len ; ++i)
+            {
+                *v++ = utf8[i];
+            }
+            vn -= len;
+        }
+    }
+
+    if (vn > 0)
+    {
+        *v = '\0';
+    }
+
+    return total;
+}
+
+/******************************************************************************/
+unsigned int
+in_utf16_le_terminated_as_utf8_length(struct stream *s)
+{
+    /* Run the conversion with no output buffer to obtain the size
+     * needed, then rewind so the caller sees an unmoved stream */
+    char *const start = s->p;
+    const unsigned int needed = in_utf16_le_terminated_as_utf8(s, NULL, 0);
+
+    s->p = start;
+    return needed;
+}
--- a/common/parse.h
+++ b/common/parse.h
@ -89,6 +89,102 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
 #   define S_CHECK_REM_OUT(s,n)
 #endif

+/******************************************************************************/
+/**
+ * Copies a UTF-8 string to a stream as little-endian UTF-16
+ *
+ * @param s Stream
+ * @param v UTF-8 string
+ * @param vn Length of UTF-8 string in bytes. Conversion continues until
+ *           this many bytes are consumed, so a terminator is only
+ *           written to the stream if it is included in this length.
+ * @param file Caller location (from __FILE__)
+ * @param line Caller location (from __LINE__)
+ *
+ * Caller is expected to check there is room for the result in s.
+ * file and line are only used to report a stream overflow when
+ * USE_DEVEL_STREAMCHECK is defined.
+ */
+void
+out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
+                          unsigned int vn,
+                          const char *file, int line);
+
+/* Calls out_utf8_as_utf16_le_proc() with the location of the call site */
+#define out_utf8_as_utf16_le(s,v,vn) \
+    out_utf8_as_utf16_le_proc((s), (v), (vn), __FILE__, __LINE__)
+
+
+/******************************************************************************/
+/**
+ * Copies a fixed-size little-endian UTF-16 string from a stream as UTF-8
+ *
+ * @param s Stream
+ * @param n Number of 16-bit words to copy
+ * @param v Pointer to result buffer
+ * @param vn Max size of result buffer
+ * @param file Caller location (from __FILE__)
+ * @param line Caller location (from __LINE__)
+ *
+ * @return number of chars which would be written to v, INCLUDING
+ *         an additional terminator. This can be used to check for a buffer
+ *         overflow. A terminator is added whether or not the input
+ *         includes one.
+ *
+ * Output is NUL-terminated whenever vn is greater than zero.
+ * Input is not checked for NULs - these are copied verbatim
+ *
+ * If fewer than n words remain on the stream, reading stops at the
+ * end of the stream.
+ */
+unsigned int
+in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
+                               char *v, unsigned int vn,
+                               const char *file, int line);
+
+/* Calls in_utf16_le_fixed_as_utf8_proc() with the location of the call site */
+#define in_utf16_le_fixed_as_utf8(s,n,v,vn) \
+    in_utf16_le_fixed_as_utf8_proc((s), (n), (v), (vn), __FILE__, __LINE__)
+
+/******************************************************************************/
+/**
+ * Returns the size of the buffer needed to store a fixed-size
+ * little-endian UTF-16 string in a stream as a UTF-8 string
+ *
+ * @param s Stream
+ * @param n Number of 16-bit words to consider
+ * @return number of chars needed to store the UTF-8 string. This
+ *         includes a terminator, which is written whether the parsed
+ *         string includes one or not.
+ * @post Stream position is not moved between start and end of this call
+ *
+ * The value is the one in_utf16_le_fixed_as_utf8() returns for the
+ * same stream contents and word count.
+ */
+unsigned int
+in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n);
+
+/******************************************************************************/
+/**
+ * Copies a terminated little-endian UTF-16 string from a stream as UTF-8
+ *
+ * @param s Stream
+ * @param v Pointer to result buffer
+ * @param vn Max size of result buffer
+ *
+ * @return number of chars which would be written to v, INCLUDING
+ *         the terminator. This can be used to check for a buffer overflow.
+ *
+ * Output is NUL-terminated whenever vn is greater than zero.
+ * Input processing stops when a NUL is encountered, or the end of the buffer
+ * is reached. A NUL which is encountered is consumed from the stream.
+ */
+unsigned int
+in_utf16_le_terminated_as_utf8(struct stream *s,
+                               char *v, unsigned int vn);
+
+/******************************************************************************/
+/**
+ * Returns the size of the buffer needed to store a terminated
+ * little-endian UTF-16 string in a stream as a terminated UTF-8 string
+ *
+ * @param s Stream
+ * @return number of chars needed to store the UTF-8 string,
+ *         including the terminator
+ * @post Stream position is not moved between start and end of this call
+ *
+ * Input processing stops when a NUL is encountered, or the end of the buffer
+ * is reached.
+ */
+unsigned int
+in_utf16_le_terminated_as_utf8_length(struct stream *s);
+
 /******************************************************************************/
 #define s_check_rem(s, n) ((s)->p + (n) <= (s)->end)

--- a/common/unicode_defines.h
+++ b/common/unicode_defines.h
@ -68,4 +68,33 @@
      INVALID_UNICODE_10000_TO_1FFFFF(c) \
    )

+/**
+ * Is this character a UTF-16 high surrogate?
+ *
+ * True for values in the range 0xd800 - 0xdbff
+ */
+#define IS_HIGH_SURROGATE(u16) (((u16) & 0xfc00) == 0xd800)
+
+/**
+ * Is this character a UTF-16 low surrogate?
+ *
+ * True for values in the range 0xdc00 - 0xdfff
+ */
+#define IS_LOW_SURROGATE(u16) (((u16) & 0xfc00) == 0xdc00)
+
+/**
+ * Extract the UTF-16 high surrogate from a character
+ *
+ * The 0x10000 offset is removed before the upper 10 bits are taken
+ */
+#define HIGH_SURROGATE_FROM_C32(c32) \
+    (((((c32) - 0x10000) >> 10) & 0x3ff) | 0xd800)
+
+/**
+ * Extract the UTF-16 low surrogate from a character
+ *
+ * Subtracting 0x10000 would not alter the lower 10 bits, so the
+ * offset does not need to be removed here
+ */
+#define LOW_SURROGATE_FROM_C32(c32) (((c32) & 0x3ff) | 0xdc00)
+
+/**
+ * Reconstruct a character from a UTF-16 surrogate pair
+ *
+ * Note the argument order - the low surrogate is passed first.
+ *
+ * This macro cannot return values higher than 0x10ffff
+ */
+#define C32_FROM_SURROGATE_PAIR(low,high) \
+    ((char32_t)(((high) & 0x3ff) << 10) + ((low) & 0x3ff) + 0x10000)
+
 #endif // UNICODE_DEFINES_H
--- a/tests/common/Makefile.am
+++ b/tests/common/Makefile.am
@ -16,6 +16,7 @@ test_common_SOURCES = \
    test_common_main.c \
    test_fifo_calls.c \
    test_list_calls.c \
+    test_parse.c \
    test_string_calls.c \
    test_string_calls_unicode.c \
    test_os_calls.c \
--- a/tests/common/test_common.h
+++ b/tests/common/test_common.h
@ -9,6 +9,7 @@ bin_to_hex(const char *input, int length);

 Suite *make_suite_test_fifo(void);
 Suite *make_suite_test_list(void);
+Suite *make_suite_test_parse(void);
 Suite *make_suite_test_string(void);
 Suite *make_suite_test_string_unicode(void);
 Suite *make_suite_test_os_calls(void);
--- a/tests/common/test_common_main.c
+++ b/tests/common/test_common_main.c
@ -48,6 +48,7 @@ int main (void)

    sr = srunner_create (make_suite_test_fifo());
    srunner_add_suite(sr, make_suite_test_list());
+    srunner_add_suite(sr, make_suite_test_parse());
    srunner_add_suite(sr, make_suite_test_string());
    srunner_add_suite(sr, make_suite_test_string_unicode());
    srunner_add_suite(sr, make_suite_test_os_calls());
--- a/tests/common/test_parse.c
+++ b/tests/common/test_parse.c
@ -0,0 +1,319 @@
+#if defined(HAVE_CONFIG_H)
+#include "config_ac.h"
+#endif
+
+#include "arch.h"
+#include "os_calls.h"
+#include "string_calls.h"
+#include "parse.h"
+
+#include "test_common.h"
+
+/* Number of elements in an array. Only valid for true arrays - a pointer
+ * argument gives a meaningless result */
+#define ELEMENTS(x) (sizeof(x) / sizeof((x)[0]))
+
+/* UTF-8 test string: ASCII text followed by a character outside the BMP,
+ * which needs four bytes in UTF-8 */
+static const char
+utf8_simple_test_with_emoji[] =
+    "Simple Test."
+    "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face
+
+/* The same string in UTF-16. The final character needs a surrogate pair */
+static const char16_t
+utf16_simple_test_with_emoji[] =
+{
+    'S', 'i', 'm', 'p', 'l', 'e', ' ', 'T', 'e', 's', 't', '.',
+    0xd83d, 0xde25, // U+1F625
+    0 // terminator
+};
+
+/******************************************************************************/
+/* Writes the UTF-8 test string (with its terminator) to a stream using
+ * out_utf8_as_utf16_le(), and checks the stream then contains exactly
+ * the words of the UTF-16 test string */
+START_TEST(test_out_utf8_as_utf16_le)
+{
+    struct stream *s;
+    make_stream(s);
+    init_stream(s, 8192);
+    out_utf8_as_utf16_le(s, utf8_simple_test_with_emoji,
+                         sizeof(utf8_simple_test_with_emoji)); // Include term
+    s_mark_end(s);
+
+    // Rewind the stream
+    s->p = s->data;
+    unsigned int i;
+
+    // Compare the stream contents word-by-word with the expected UTF-16
+    for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji); ++i)
+    {
+        char16_t val;
+        in_uint16_le(s, val);
+        if (val != utf16_simple_test_with_emoji[i])
+        {
+            ck_abort_msg("test_out_utf8_as_utf16_le: "
+                         "Index %u expected %x, got %x",
+                         i, utf16_simple_test_with_emoji[i], val);
+        }
+    }
+
+    // No additional words should have been written
+    ck_assert_int_eq(s_check_end(s), 1);
+
+    free_stream(s);
+}
+END_TEST
+
+/******************************************************************************/
+/* Writes the UTF-16 test string to a stream without its terminator, and
+ * checks that both the length call and the read call for fixed-size
+ * strings give the results expected for the UTF-8 test string */
+START_TEST(test_in_utf16_le_fixed_as_utf8)
+{
+    struct stream *s;
+    make_stream(s);
+    init_stream(s, 8192);
+
+    // Write the stream without a terminator
+    unsigned int i;
+    for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji) - 1; ++i)
+    {
+        out_uint16_le(s, utf16_simple_test_with_emoji[i]);
+    }
+    s_mark_end(s);
+
+    // Rewind the stream
+    s->p = s->data;
+
+    char buff[256];
+    unsigned int len;
+
+    // Check the length call. i is now the number of words written.
+    // The expected length includes the terminator added on output
+    len = in_utf16_le_fixed_as_utf8_length(s, i);
+    ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));
+
+    // Now read the string, checking for the same length
+    unsigned int read_len;
+    read_len = in_utf16_le_fixed_as_utf8(s, i, buff, sizeof(buff));
+    ck_assert_int_eq(len, read_len);
+
+    // Should be at the end of the buffer
+    ck_assert_int_eq(s_check_end(s), 1);
+
+    // Check the contents are as expected
+    int cmp = memcmp(buff, utf8_simple_test_with_emoji,
+                     sizeof(utf8_simple_test_with_emoji));
+    ck_assert_int_eq(cmp, 0);
+
+    free_stream(s);
+}
+END_TEST
+
+/******************************************************************************/
+/* Writes the UTF-16 test string to a stream with its terminator, and
+ * checks that both the length call and the read call for terminated
+ * strings give the results expected for the UTF-8 test string */
+START_TEST(test_in_utf16_le_terminated_as_utf8)
+{
+    struct stream *s;
+    make_stream(s);
+    init_stream(s, 8192);
+
+    // Write the stream with the terminator
+    unsigned int i;
+    for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji); ++i)
+    {
+        out_uint16_le(s, utf16_simple_test_with_emoji[i]);
+    }
+    s_mark_end(s);
+
+    // Rewind the stream
+    s->p = s->data;
+
+    char buff[256];
+    unsigned int len;
+
+    // Check the length call
+    len = in_utf16_le_terminated_as_utf8_length(s);
+    ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));
+
+    // Now read the string, checking for the same length
+    unsigned int read_len;
+    read_len = in_utf16_le_terminated_as_utf8(s, buff, sizeof(buff));
+    ck_assert_int_eq(len, read_len);
+
+    // Should be at the end of the buffer, as the terminator is read too
+    ck_assert_int_eq(s_check_end(s), 1);
+
+    // Check the contents are as expected
+    int cmp = memcmp(buff, utf8_simple_test_with_emoji,
+                     sizeof(utf8_simple_test_with_emoji));
+    ck_assert_int_eq(cmp, 0);
+
+    free_stream(s);
+}
+END_TEST
+
+/******************************************************************************/
+/* Table-driven test of UTF-16 decoding at significant code points.
+ *
+ * Each table entry is written to a stream as one or two UTF-16 words,
+ * read back as UTF-8 with in_utf16_le_fixed_as_utf8(), and the first
+ * character of the result is compared with the expected value */
+START_TEST(test_in_utf16_le_significant_chars)
+{
+    struct stream *s;
+    make_stream(s);
+    init_stream(s, 8192);
+
+    const struct
+    {
+        struct
+        {
+            char16_t high; // Set to 0 for a single UTF-16 word
+            char16_t low;
+        } pair;
+        char32_t expected;
+    } tests[] =
+    {
+        // Single high surrogates are bad
+        { { 0, 0xd800 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xdbff }, UCS_REPLACEMENT_CHARACTER },
+        // Single low surrogates are bad
+        { { 0, 0xdc00 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xdfff }, UCS_REPLACEMENT_CHARACTER },
+        // Values before and after surrogate range
+        { { 0, 0xd7ff }, 0xd7ff },
+        { { 0, 0xe000 }, 0xe000 },
+        // First and last non-surrogate pair values (don't use
+        // 0xfffe and 0xffff for this test as they are non-characters,
+        // and 0xfffd is the replacement character)
+        { { 0, 0 }, 0 },
+        { { 0, 0xfffc }, 0xfffc },
+        { { 0, 0xfffd }, UCS_REPLACEMENT_CHARACTER },
+        // First and last surrogate pair values (don't use
+        // 0x10fffe and 0x10ffff for this test as they are non-characters)
+        { { 0xd800, 0xdc00 }, 0x10000 },
+        { { 0xdbff, 0xdffd }, 0x10fffd },
+        // End-of-plane non-characters in the supplementary planes
+        // (U+1FFFE to U+10FFFF) and the characters before them
+        { { 0xd83f, 0xdffd }, 0x1fffd },
+        { { 0xd83f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFE
+        { { 0xd83f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFF
+        { { 0xd87f, 0xdffd }, 0x2fffd },
+        { { 0xd87f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFE
+        { { 0xd87f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFF
+        { { 0xd8bf, 0xdffd }, 0x3fffd },
+        { { 0xd8bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFE
+        { { 0xd8bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFF
+        { { 0xd8ff, 0xdffd }, 0x4fffd },
+        { { 0xd8ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFE
+        { { 0xd8ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFF
+        { { 0xd93f, 0xdffd }, 0x5fffd },
+        { { 0xd93f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFE
+        { { 0xd93f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFF
+        { { 0xd97f, 0xdffd }, 0x6fffd },
+        { { 0xd97f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFE
+        { { 0xd97f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFF
+        { { 0xd9bf, 0xdffd }, 0x7fffd },
+        { { 0xd9bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFE
+        { { 0xd9bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFF
+        { { 0xd9ff, 0xdffd }, 0x8fffd },
+        { { 0xd9ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFE
+        { { 0xd9ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFF
+        { { 0xda3f, 0xdffd }, 0x9fffd },
+        { { 0xda3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFE
+        { { 0xda3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFF
+        { { 0xda7f, 0xdffd }, 0xafffd },
+        { { 0xda7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFE
+        { { 0xda7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFF
+        { { 0xdabf, 0xdffd }, 0xbfffd },
+        { { 0xdabf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFE
+        { { 0xdabf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFF
+        { { 0xdaff, 0xdffd }, 0xcfffd },
+        { { 0xdaff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFE
+        { { 0xdaff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFF
+        { { 0xdb3f, 0xdffd }, 0xdfffd },
+        { { 0xdb3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFE
+        { { 0xdb3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFF
+        { { 0xdb7f, 0xdffd }, 0xefffd },
+        { { 0xdb7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFE
+        { { 0xdb7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFF
+        { { 0xdbbf, 0xdffd }, 0xffffd },
+        { { 0xdbbf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFE
+        { { 0xdbbf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFF
+        { { 0xdbff, 0xdffd }, 0x10fffd },
+        { { 0xdbff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFE
+        { { 0xdbff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFF
+        // Non-characters in "Arabic Presentation Forms-A"
+        { { 0, 0xfdd0 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd1 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd2 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd3 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd4 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd5 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd6 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd7 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd8 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdd9 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdda }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfddb }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfddc }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfddd }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdde }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfddf }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde0 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde1 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde2 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde3 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde4 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde5 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde6 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde7 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde8 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfde9 }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdea }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdeb }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdec }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfded }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdee }, UCS_REPLACEMENT_CHARACTER },
+        { { 0, 0xfdef }, UCS_REPLACEMENT_CHARACTER }
+    };
+
+    unsigned int i;
+    for (i = 0; i < ELEMENTS(tests); ++i)
+    {
+        char buff[256];
+        unsigned int word_count;
+        // The same stream is re-initialised for each table entry
+        init_stream(s, 8192);
+
+        // Write one word, or two if a high surrogate is specified
+        word_count = 0;
+        if (tests[i].pair.high != 0)
+        {
+            out_uint16_le(s, tests[i].pair.high);
+            ++word_count;
+        }
+        out_uint16_le(s, tests[i].pair.low);
+        ++word_count;
+        s_mark_end(s);
+
+        // Rewind the stream
+        s->p = s->data;
+
+        // Read in the UTF-16 LE word(s) as UTF-8, then decode the first
+        // UTF-8 character of the result as UTF-32
+        in_utf16_le_fixed_as_utf8(s, word_count, buff, sizeof(buff));
+        const char *p = buff;
+        char32_t c32 = utf8_get_next_char(&p, NULL);
+
+        if (c32 != tests[i].expected)
+        {
+            ck_abort_msg("test_in_utf16_le_significant_chars: "
+                         "Index %u for {%x, %x}, expected %x, got %x",
+                         i,  tests[i].pair.high, tests[i].pair.low,
+                         tests[i].expected, c32);
+        }
+    }
+
+    free_stream(s);
+}
+END_TEST
+
+/******************************************************************************/
+
+/* Builds the "Parse" test suite, which holds a single "Unicode" test
+ * case containing the UTF-16 stream I/O tests */
+Suite *
+make_suite_test_parse(void)
+{
+    Suite *suite = suite_create("Parse");
+    TCase *unicode_case = tcase_create("Unicode");
+
+    suite_add_tcase(suite, unicode_case);
+
+    tcase_add_test(unicode_case, test_out_utf8_as_utf16_le);
+    tcase_add_test(unicode_case, test_in_utf16_le_fixed_as_utf8);
+    tcase_add_test(unicode_case, test_in_utf16_le_terminated_as_utf8);
+    tcase_add_test(unicode_case, test_in_utf16_le_significant_chars);
+
+    return suite;
+}