Add UTF-16 LE I/O routines

These are intended to replace UTF-16 uses of mbstowcs() / wcstombs()
2023-09-19 11:36:13 +01:00 · 2023-09-19 11:36:13 +01:00 · 0758fe03a6
commit 0758fe03a6
parent 0463e552dc
7 changed files with 705 additions and 0 deletions
--- a/common/parse.c
+++ b/common/parse.c
@ -27,7 +27,47 @@
 #include "arch.h"
 #include "parse.h"
 #include "log.h"
 #include "string_calls.h"
 #include "unicode_defines.h"
 /******************************************************************************/
 /**
 * Writes a 16-bit value to a stream as two little-endian octets and
 * advances the stream pointer by two
 *
 * No check is made that the stream has room for the value.
 *
 * When B_ENDIAN or NEED_ALIGN is defined the value is stored one octet
 * at a time, low octet first. Otherwise it is stored with a single
 * unsigned short write through the stream pointer.
 *
 * The macro arguments may be evaluated more than once.
 */
 #if defined(B_ENDIAN) || defined(NEED_ALIGN)
 #define out_uint16_le_unchecked(s, v) do \
    { \
        *((s)->p) = (unsigned char)((v) >> 0); \
        (s)->p++; \
        *((s)->p) = (unsigned char)((v) >> 8); \
        (s)->p++; \
    } while (0)
 #else
 #define out_uint16_le_unchecked(s, v) do \
    { \
        *((unsigned short*)((s)->p)) = (unsigned short)(v); \
        (s)->p += 2; \
    } while (0)
 #endif
 /******************************************************************************/
 /**
 * Reads a little-endian 16-bit value from a stream into v and advances
 * the stream pointer by two
 *
 * No check is made that two octets are available on the stream.
 *
 * When B_ENDIAN or NEED_ALIGN is defined the value is assembled from two
 * separate octet reads (low octet first). Otherwise it is read with a
 * single unsigned short load through the stream pointer.
 *
 * The stream argument may be evaluated more than once.
 */
 #if defined(B_ENDIAN) || defined(NEED_ALIGN)
 #define in_uint16_le_unchecked(s, v) do \
    { \
        (v) = (unsigned short) \
              ( \
                (*((unsigned char*)((s)->p + 0)) << 0) | \
                (*((unsigned char*)((s)->p + 1)) << 8) \
              ); \
        (s)->p += 2; \
    } while (0)
 #else
 #define in_uint16_le_unchecked(s, v) do \
    { \
        (v) = *((unsigned short*)((s)->p)); \
        (s)->p += 2; \
    } while (0)
 #endif
 /******************************************************************************/
 void
 parser_stream_overflow_check(const struct stream *s, int n, int is_out,
                             const char *file, int line)
@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
        }
    }
 }
 /******************************************************************************/
 void
 out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
                           unsigned int vn,
                           const char *file, int line)
 {
 #ifdef USE_DEVEL_STREAMCHECK
    /* Same effect as S_CHECK_REM_OUT(s, <octet_count>), except that the
     * file and line reported are the ones handed in by the caller */
    const int needed_octets = utf8_as_utf16_word_count(v, vn) * 2;
    parser_stream_overflow_check(s, needed_octets, 1, file, line);
 #endif

    /* Decode one Unicode character at a time until the input is used up.
     * Each character becomes either one UTF-16 word, or two when it lies
     * above the Basic Multilingual Plane. */
    while (vn > 0)
    {
        const char32_t c32 = utf8_get_next_char(&v, &vn);

        if (c32 >= 0x10000)
        {
            /* Surrogate pair needed - high word goes out first */
            const char16_t lead = HIGH_SURROGATE_FROM_C32(c32);
            const char16_t trail = LOW_SURROGATE_FROM_C32(c32);
            out_uint16_le_unchecked(s, lead);
            out_uint16_le_unchecked(s, trail);
        }
        else
        {
            const char16_t unit = (char16_t)c32;
            out_uint16_le_unchecked(s, unit);
        }
    }
 }
 /******************************************************************************/
 /**
 * Gets the next Unicode character from a code stream
 * @param s Stream
 * @return Unicode character
 *
 * Non-characters and illegally coded characters are mapped to
 * UCS_REPLACEMENT_CHARACTER. This covers:-
 * - a high surrogate which is the last word available on the stream
 * - a high surrogate which is not followed by a low surrogate. In this
 *   case only the high surrogate is consumed, so the following word is
 *   processed by the next call
 * - a low surrogate with no preceding high surrogate
 * - end-of-plane non-characters, whether coded as a single word or
 *   as a surrogate pair
 * - the non-characters tested for by IS_ARABIC_NON_CHARACTER()
 *
 * @pre Two bytes are assumed to be available on the stream on entry
 */
 static char32_t
 get_c32_from_stream(struct stream *s)
 {
    char32_t c32 = UCS_REPLACEMENT_CHARACTER; // Assume failure
    char16_t w;
    in_uint16_le_unchecked(s, w);
    if (IS_HIGH_SURROGATE(w))
    {
        /* The low surrogate is only read if the stream still holds it */
        if (s_check_rem(s, 2))
        {
            char16_t low;
            in_uint16_le_unchecked(s, low);
            if (IS_LOW_SURROGATE(low))
            {
                /* Valid surrogate pair */
                char32_t v = C32_FROM_SURROGATE_PAIR(low, w);
                /* Ignore some values which can be successfully encoded
                 * in this way. The test has to be made on the decoded
                 * value; c32 still holds the replacement character here */
                if (!IS_PLANE_END_NON_CHARACTER(v))
                {
                    c32 = v;
                }
            }
            else
            {
                /* Invalid low surrogate  - pop character back */
                s->p -= 2;
            }
        }
    }
    else if (!IS_LOW_SURROGATE(w) &&
             !IS_PLANE_END_NON_CHARACTER(w) &&
             !IS_ARABIC_NON_CHARACTER(w))
    {
        /* Character from the Basic Multilingual Plane */
        c32 = (char32_t)w;
    }
    return c32;
 }
 /******************************************************************************/
 unsigned int
 in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
                                char *v, unsigned int vn,
                                const char *file, int line)
 {
    unsigned int total = 0;
    char encoded[MAXLEN_UTF8_CHAR];
    char *const real_end = s->end;

 #ifdef USE_DEVEL_STREAMCHECK
    /* Same effect as S_CHECK_REM(s, n*2), except that the file and line
     * reported are the ones handed in by the caller */
    parser_stream_overflow_check(s, n * 2, 0, file, line);
 #endif

    /* Pull the end of the stream in to the end of the string for the
     * duration of the conversion. s_check_rem() can then be used to
     * bound every UTF-16 word read, including the second word of a
     * surrogate pair. */
    if (s->end - s->p > (int)(n * 2))
    {
        s->end = s->p + (int)(n * 2);
    }

    while (s_check_rem(s, 2))
    {
        const char32_t c32 = get_c32_from_stream(s);
        const unsigned int len = utf_char32_to_utf8(c32, encoded);

        if (vn >= len + 1)
        {
            /* The character fits with a terminator to spare - copy it */
            const char *src = encoded;
            const char *const src_end = encoded + len;
            while (src != src_end)
            {
                *v++ = *src++;
            }
            vn -= len;
        }
        else if (vn > 1)
        {
            /* This character was dropped. Leave room for the terminator
             * only, so that a later, shorter character is not slipped
             * into the space this one could not use. */
            vn = 1;
        }

        /* Dropped characters still count towards the required size */
        total += len;
    }

    /* Put the real end of the stream back */
    s->end = real_end;

    if (vn > 0)
    {
        *v = '\0';
    }

    /* Account for the terminator */
    return total + 1;
 }
 /******************************************************************************/
 unsigned int
 in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
 {
    /* Run the conversion with no output buffer to get the size needed,
     * then wind the stream back to where it was on entry */
    char *const start = s->p;
    const unsigned int needed = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);

    s->p = start;
    return needed;
 }
 /******************************************************************************/
 unsigned int
 in_utf16_le_terminated_as_utf8(struct stream *s,
                                char *v, unsigned int vn)
 {
    unsigned int total = 0;
    char encoded[MAXLEN_UTF8_CHAR];
    char32_t c32;

    /* Keep going until the stream runs out of whole UTF-16 words, or
     * until a terminator has been read (and consumed) */
    while (s_check_rem(s, 2) && (c32 = get_c32_from_stream(s)) != 0)
    {
        const unsigned int len = utf_char32_to_utf8(c32, encoded);

        if (vn >= len + 1)
        {
            /* The character fits with a terminator to spare - copy it */
            const char *src = encoded;
            const char *const src_end = encoded + len;
            while (src != src_end)
            {
                *v++ = *src++;
            }
            vn -= len;
        }
        else if (vn > 1)
        {
            /* This character was dropped. Leave room for the terminator
             * only, so that a later, shorter character is not slipped
             * into the space this one could not use. */
            vn = 1;
        }

        /* Dropped characters still count towards the required size */
        total += len;
    }

    if (vn > 0)
    {
        *v = '\0';
    }

    /* Account for the terminator */
    return total + 1;
 }
 /******************************************************************************/
 unsigned int
 in_utf16_le_terminated_as_utf8_length(struct stream *s)
 {
    /* Run the conversion with no output buffer to get the size needed,
     * then wind the stream back to where it was on entry */
    char *const start = s->p;
    const unsigned int needed = in_utf16_le_terminated_as_utf8(s, NULL, 0);

    s->p = start;
    return needed;
 }
--- a/common/parse.h
+++ b/common/parse.h
@ -89,6 +89,102 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
 #   define S_CHECK_REM_OUT(s,n)
 #endif
 /******************************************************************************/
 /**
 * Copies a UTF-8 string to a stream as little-endian UTF-16
 *
 * @param s Stream
 * @param v UTF-8 string
 * @param vn Length of UTF-8 string in bytes. Conversion is driven by this
 *           count alone, so a terminator is only written to the stream
 *           if it is included in the count.
 * @param file Caller location (from __FILE__)
 * @param line Caller location (from __LINE__)
 *
 * Characters of value 0x10000 and above are written as a surrogate pair,
 * high surrogate first.
 *
 * Caller is expected to check there is room for the result in s. The
 * writes themselves are unchecked; file and line are only used to report
 * an overflow when USE_DEVEL_STREAMCHECK is defined.
 */
 void
 out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
                           unsigned int vn,
                           const char *file, int line);
 /* Wrapper which fills in the caller location automatically */
 #define out_utf8_as_utf16_le(s,v,vn) \
    out_utf8_as_utf16_le_proc((s), (v), (vn), __FILE__, __LINE__)
 /******************************************************************************/
 /**
 * Copies a fixed-size little-endian UTF-16 string from a stream as UTF-8
 *
 * @param s Stream
 * @param n Number of 16-bit words to copy
 * @param v Pointer to result buffer
 * @param vn Max size of result buffer in bytes
 * @param file Caller location (from __FILE__)
 * @param line Caller location (from __LINE__)
 *
 * @return number of bytes which would be written to v if it were big
 *         enough, INCLUDING an additional terminator. A value greater
 *         than vn means the output was truncated, so this can be used
 *         to check for a buffer overflow. A terminator is added whether
 *         or not the input includes one.
 *
 * Output is always NUL-terminated, provided vn is greater than zero.
 * If vn is zero nothing is written to v.
 * A character which does not fit in the buffer (along with a terminator)
 * is dropped, as are all the characters following it.
 * Input is not checked for NULs - these are copied verbatim
 *
 * Reading stops after n words, or at the end of the stream if that
 * comes first.
 */
 unsigned int
 in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
                                char *v, unsigned int vn,
                                const char *file, int line);
 /* Wrapper which fills in the caller location automatically */
 #define in_utf16_le_fixed_as_utf8(s,n,v,vn) \
    in_utf16_le_fixed_as_utf8_proc((s), (n), (v), (vn), __FILE__, __LINE__)
 /******************************************************************************/
 /**
 * Returns the size of the buffer needed to store a fixed-size
 * little-endian UTF-16 string in a stream as a UTF-8 string
 *
 * @param s Stream
 * @param n Number of 16-bit words to consider
 * @return number of bytes needed to store the UTF-8 string. This
 *         includes a terminator, which is written whether the parsed
 *         string includes one or not.
 * @post Stream position is not moved between start and end of this call
 *
 * The result is obtained by calling in_utf16_le_fixed_as_utf8() with no
 * output buffer, and then restoring the stream position.
 */
 unsigned int
 in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n);
 /******************************************************************************/
 /**
 * Copies a terminated little-endian UTF-16 string from a stream as UTF-8
 *
 * @param s Stream
 * @param v Pointer to result buffer
 * @param vn Max size of result buffer in bytes
 *
 * @return number of bytes which would be written to v if it were big
 *         enough, INCLUDING the terminator. A value greater than vn
 *         means the output was truncated, so this can be used to check
 *         for a buffer overflow.
 *
 * Output is always NUL-terminated, provided vn is greater than zero.
 * If vn is zero nothing is written to v.
 * A character which does not fit in the buffer (along with a terminator)
 * is dropped, as are all the characters following it.
 * Input processing stops when a NUL is encountered, or the end of the buffer
 * is reached. A NUL which is encountered is consumed from the stream.
 */
 unsigned int
 in_utf16_le_terminated_as_utf8(struct stream *s,
                                char *v, unsigned int vn);
 /******************************************************************************/
 /**
 * Returns the size of the buffer needed to store a terminated
 * little-endian UTF-16 string in a stream as a terminated UTF-8 string
 *
 * @param s Stream
 * @return number of bytes needed to store the UTF-8 string,
 *         including the terminator
 * @post Stream position is not moved between start and end of this call
 *
 * Input processing stops when a NUL is encountered, or the end of the buffer
 * is reached.
 *
 * The result is obtained by calling in_utf16_le_terminated_as_utf8() with
 * no output buffer, and then restoring the stream position.
 */
 unsigned int
 in_utf16_le_terminated_as_utf8_length(struct stream *s);
 /******************************************************************************/
 /**
 * Evaluates to non-zero if advancing the stream pointer p by n bytes
 * would not take it past the stream end pointer
 *
 * The stream argument is evaluated twice.
 */
 #define s_check_rem(s, n) ((s)->p + (n) <= (s)->end)
--- a/common/unicode_defines.h
+++ b/common/unicode_defines.h
@ -68,4 +68,33 @@
      INVALID_UNICODE_10000_TO_1FFFFF(c) \
    )
 /**
 * Is this character a UTF-16 high surrogate?
 *
 * True for values whose top six bits (of 16) match 0xd800, i.e. the
 * range 0xd800 - 0xdbff
 */
 #define IS_HIGH_SURROGATE(u16) (((u16) & 0xfc00) == 0xd800)
 /**
 * Is this character a UTF-16 low surrogate?
 *
 * True for values whose top six bits (of 16) match 0xdc00, i.e. the
 * range 0xdc00 - 0xdfff
 */
 #define IS_LOW_SURROGATE(u16) (((u16) & 0xfc00) == 0xdc00)
 /**
 * Extract the UTF-16 high surrogate from a character
 *
 * 0x10000 is subtracted from the character before the upper 10 bits of
 * the result are taken, so this is only meaningful for characters of
 * value 0x10000 and above
 */
 #define HIGH_SURROGATE_FROM_C32(c32) \
    (((((c32) - 0x10000) >> 10) & 0x3ff) | 0xd800)
 /**
 * Extract the UTF-16 low surrogate from a character
 *
 * The low 10 bits of the character are used. These are not affected by
 * the 0x10000 offset applied when forming the high surrogate.
 */
 #define LOW_SURROGATE_FROM_C32(c32) (((c32) & 0x3ff) | 0xdc00)
 /**
 * Reconstruct a character from a UTF-16 surrogate pair
 *
 * Note the argument order - the low surrogate is passed first.
 *
 * Both arguments are masked to 10 bits before use, so this macro
 * cannot return values higher than 0x10ffff
 */
 #define C32_FROM_SURROGATE_PAIR(low,high) \
    ((char32_t)(((high) & 0x3ff) << 10) + ((low) & 0x3ff) + 0x10000)
 #endif // UNICODE_DEFINES_H
--- a/tests/common/Makefile.am
+++ b/tests/common/Makefile.am
@ -16,6 +16,7 @@ test_common_SOURCES = \
    test_common_main.c \
    test_fifo_calls.c \
    test_list_calls.c \
    test_parse.c \
    test_string_calls.c \
    test_string_calls_unicode.c \
    test_os_calls.c \
--- a/tests/common/test_common.h
+++ b/tests/common/test_common.h
@ -9,6 +9,7 @@ bin_to_hex(const char *input, int length);
 Suite *make_suite_test_fifo(void);
 Suite *make_suite_test_list(void);
 Suite *make_suite_test_parse(void);
 Suite *make_suite_test_string(void);
 Suite *make_suite_test_string_unicode(void);
 Suite *make_suite_test_os_calls(void);
--- a/tests/common/test_common_main.c
+++ b/tests/common/test_common_main.c
@ -48,6 +48,7 @@ int main (void)
    sr = srunner_create (make_suite_test_fifo());
    srunner_add_suite(sr, make_suite_test_list());
    srunner_add_suite(sr, make_suite_test_parse());
    srunner_add_suite(sr, make_suite_test_string());
    srunner_add_suite(sr, make_suite_test_string_unicode());
    srunner_add_suite(sr, make_suite_test_os_calls());
--- a/tests/common/test_parse.c
+++ b/tests/common/test_parse.c
@ -0,0 +1,319 @@
 #if defined(HAVE_CONFIG_H)
 #include "config_ac.h"
 #endif
 #include "arch.h"
 #include "os_calls.h"
 #include "string_calls.h"
 #include "parse.h"
 #include "test_common.h"
 /* Number of elements in an array. Only valid for true arrays, not for
  * pointers. The argument is parenthesised so that an expression argument
  * is indexed as a whole. */
 #define ELEMENTS(x) (sizeof(x) / sizeof((x)[0]))
 /* UTF-8 test string containing one character from outside the Basic
  * Multilingual Plane. The array includes the implicit string terminator. */
 static const char
 utf8_simple_test_with_emoji[] =
    "Simple Test."
    "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face
 /* The same test string as UTF-16 words. The emoji needs a surrogate pair,
  * and the terminator is listed explicitly. */
 static const char16_t
 utf16_simple_test_with_emoji[] =
 {
    'S', 'i', 'm', 'p', 'l', 'e', ' ', 'T', 'e', 's', 't', '.',
    0xd83d, 0xde25, // U+1F625
    0 // terminator
 };
 /******************************************************************************/
 /* Converts the UTF-8 test string to UTF-16 LE on a stream, and checks the
  * stream contents word-by-word against the expected UTF-16 string */
 START_TEST(test_out_utf8_as_utf16_le)
 {
    struct stream *s;
    unsigned int idx = 0;

    make_stream(s);
    init_stream(s, 8192);

    // sizeof() is used for the length so the terminator is converted too
    out_utf8_as_utf16_le(s, utf8_simple_test_with_emoji,
                         sizeof(utf8_simple_test_with_emoji));
    s_mark_end(s);

    // Back to the start of the stream to read the result
    s->p = s->data;

    while (idx < ELEMENTS(utf16_simple_test_with_emoji))
    {
        const char16_t expected = utf16_simple_test_with_emoji[idx];
        char16_t actual;

        in_uint16_le(s, actual);
        if (actual != expected)
        {
            ck_abort_msg("test_out_utf8_as_utf16_le: "
                         "Index %u expected %x, got %x",
                         idx, expected, actual);
        }
        ++idx;
    }

    // Nothing else should have been written
    ck_assert_int_eq(s_check_end(s), 1);
    free_stream(s);
 }
 END_TEST
 /******************************************************************************/
 /* Places the UTF-16 test string on a stream without its terminator, and
  * checks both the length call and the conversion for a fixed-size read */
 START_TEST(test_in_utf16_le_fixed_as_utf8)
 {
    // The terminator in the test data is not written to the stream
    const unsigned int word_count =
        ELEMENTS(utf16_simple_test_with_emoji) - 1;
    char utf8_out[256];
    unsigned int len;
    unsigned int read_len;
    unsigned int w;
    int cmp;
    struct stream *s;

    make_stream(s);
    init_stream(s, 8192);

    for (w = 0; w < word_count; ++w)
    {
        out_uint16_le(s, utf16_simple_test_with_emoji[w]);
    }
    s_mark_end(s);

    // Back to the start of the stream for reading
    s->p = s->data;

    // The length call has to report the UTF-8 size with a terminator
    len = in_utf16_le_fixed_as_utf8_length(s, word_count);
    ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));

    // The real read has to agree with the length call...
    read_len = in_utf16_le_fixed_as_utf8(s, word_count,
                                         utf8_out, sizeof(utf8_out));
    ck_assert_int_eq(len, read_len);

    // ...and has to use up the whole stream
    ck_assert_int_eq(s_check_end(s), 1);

    // Result, terminator included, must match the UTF-8 test string
    cmp = memcmp(utf8_out, utf8_simple_test_with_emoji,
                 sizeof(utf8_simple_test_with_emoji));
    ck_assert_int_eq(cmp, 0);

    free_stream(s);
 }
 END_TEST
 /******************************************************************************/
 /* Places the UTF-16 test string on a stream with its terminator, and
  * checks both the length call and the conversion for a terminated read */
 START_TEST(test_in_utf16_le_terminated_as_utf8)
 {
    char utf8_out[256];
    unsigned int len;
    unsigned int read_len;
    unsigned int w = 0;
    int cmp;
    struct stream *s;

    make_stream(s);
    init_stream(s, 8192);

    // Every word goes on the stream, the terminator included
    while (w < ELEMENTS(utf16_simple_test_with_emoji))
    {
        out_uint16_le(s, utf16_simple_test_with_emoji[w]);
        ++w;
    }
    s_mark_end(s);

    // Back to the start of the stream for reading
    s->p = s->data;

    // The length call has to report the UTF-8 size with a terminator
    len = in_utf16_le_terminated_as_utf8_length(s);
    ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));

    // The real read has to agree with the length call...
    read_len = in_utf16_le_terminated_as_utf8(s, utf8_out, sizeof(utf8_out));
    ck_assert_int_eq(len, read_len);

    // ...and has to use up the whole stream
    ck_assert_int_eq(s_check_end(s), 1);

    // Result, terminator included, must match the UTF-8 test string
    cmp = memcmp(utf8_out, utf8_simple_test_with_emoji,
                 sizeof(utf8_simple_test_with_emoji));
    ck_assert_int_eq(cmp, 0);

    free_stream(s);
 }
 END_TEST
 /******************************************************************************/
 /* Table-driven check of how individual UTF-16 code units and surrogate
  * pairs are decoded. Each table entry is written to a stream on its own,
  * read back with in_utf16_le_fixed_as_utf8(), and the first character
  * of the UTF-8 result is compared with the expected value. */
 START_TEST(test_in_utf16_le_significant_chars)
 {
    struct stream *s;
    make_stream(s);
    init_stream(s, 8192);
    const struct
    {
        struct
        {
            char16_t high; // Set to 0 for a single UTF-16 word
            char16_t low;
        } pair;
        char32_t expected;
    } tests[] =
    {
        // Single high surrogates are bad
        { { 0, 0xd800 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xdbff }, UCS_REPLACEMENT_CHARACTER },
        // Single low surrogates are bad
        { { 0, 0xdc00 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xdfff }, UCS_REPLACEMENT_CHARACTER },
        // Values before and after surrogate range
        { { 0, 0xd7ff }, 0xd7ff },
        { { 0, 0xe000 }, 0xe000 },
        // First and last values coded as a single word (don't use
        // 0xfffe and 0xffff for this test as they are non-characters,
        // and 0xfffd is the replacement character)
        { { 0, 0 }, 0 },
        { { 0, 0xfffc }, 0xfffc },
        { { 0, 0xfffd }, UCS_REPLACEMENT_CHARACTER },
        // First and last surrogate pair values (don't use
        // 0x10fffe and 0x10ffff for this test as they are non-characters)
        { { 0xd800, 0xdc00 }, 0x10000 },
        { { 0xdbff, 0xdffd }, 0x10fffd },
        // End-of-plane non-characters coded as surrogate pairs (planes
        // 1 to 16) and the characters before them
        { { 0xd83f, 0xdffd }, 0x1fffd },
        { { 0xd83f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFE
        { { 0xd83f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFF
        { { 0xd87f, 0xdffd }, 0x2fffd },
        { { 0xd87f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFE
        { { 0xd87f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFF
        { { 0xd8bf, 0xdffd }, 0x3fffd },
        { { 0xd8bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFE
        { { 0xd8bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFF
        { { 0xd8ff, 0xdffd }, 0x4fffd },
        { { 0xd8ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFE
        { { 0xd8ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFF
        { { 0xd93f, 0xdffd }, 0x5fffd },
        { { 0xd93f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFE
        { { 0xd93f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFF
        { { 0xd97f, 0xdffd }, 0x6fffd },
        { { 0xd97f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFE
        { { 0xd97f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFF
        { { 0xd9bf, 0xdffd }, 0x7fffd },
        { { 0xd9bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFE
        { { 0xd9bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFF
        { { 0xd9ff, 0xdffd }, 0x8fffd },
        { { 0xd9ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFE
        { { 0xd9ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFF
        { { 0xda3f, 0xdffd }, 0x9fffd },
        { { 0xda3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFE
        { { 0xda3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFF
        { { 0xda7f, 0xdffd }, 0xafffd },
        { { 0xda7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFE
        { { 0xda7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFF
        { { 0xdabf, 0xdffd }, 0xbfffd },
        { { 0xdabf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFE
        { { 0xdabf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFF
        { { 0xdaff, 0xdffd }, 0xcfffd },
        { { 0xdaff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFE
        { { 0xdaff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFF
        { { 0xdb3f, 0xdffd }, 0xdfffd },
        { { 0xdb3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFE
        { { 0xdb3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFF
        { { 0xdb7f, 0xdffd }, 0xefffd },
        { { 0xdb7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFE
        { { 0xdb7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFF
        { { 0xdbbf, 0xdffd }, 0xffffd },
        { { 0xdbbf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFE
        { { 0xdbbf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFF
        { { 0xdbff, 0xdffd }, 0x10fffd },
        { { 0xdbff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFE
        { { 0xdbff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFF
        // Non-characters in "Arabic Presentation Forms-A"
        { { 0, 0xfdd0 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd1 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd2 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd3 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd4 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd5 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd6 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd7 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd8 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd9 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdda }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfddb }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfddc }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfddd }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdde }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfddf }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde0 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde1 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde2 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde3 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde4 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde5 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde6 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde7 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde8 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde9 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdea }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdeb }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdec }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfded }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdee }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdef }, UCS_REPLACEMENT_CHARACTER }
    };
    unsigned int i;
    for (i = 0; i < ELEMENTS(tests); ++i)
    {
        char buff[256];
        unsigned int word_count;
        // Start each test with a freshly initialised stream
        init_stream(s, 8192);
        word_count = 0;
        // A high value of zero means the entry is a single word
        if (tests[i].pair.high != 0)
        {
            out_uint16_le(s, tests[i].pair.high);
            ++word_count;
        }
        out_uint16_le(s, tests[i].pair.low);
        ++word_count;
        s_mark_end(s);
        // Rewind the stream
        s->p = s->data;
        // Read in the UTF-16 LE word(s) as UTF-8, then decode the first
        // character of the result so it can be compared as UTF-32
        in_utf16_le_fixed_as_utf8(s, word_count, buff, sizeof(buff));
        const char *p = buff;
        char32_t c32 = utf8_get_next_char(&p, NULL);
        if (c32 != tests[i].expected)
        {
            ck_abort_msg("test_in_utf16_le_significant_chars: "
                         "Index %u for {%x, %x}, expected %x, got %x",
                         i,  tests[i].pair.high, tests[i].pair.low,
                         tests[i].expected, c32);
        }
    }
    free_stream(s);
 }
 END_TEST
 /******************************************************************************/
 /* Builds the "Parse" test suite, which holds a single "Unicode" test case
  * containing the UTF-16 stream I/O tests */
 Suite *
 make_suite_test_parse(void)
 {
    Suite *suite = suite_create("Parse");
    TCase *unicode_case = tcase_create("Unicode");

    suite_add_tcase(suite, unicode_case);

    tcase_add_test(unicode_case, test_out_utf8_as_utf16_le);
    tcase_add_test(unicode_case, test_in_utf16_le_fixed_as_utf8);
    tcase_add_test(unicode_case, test_in_utf16_le_terminated_as_utf8);
    tcase_add_test(unicode_case, test_in_utf16_le_significant_chars);

    return suite;
 }