Add UTF-16 LE I/O routines

These are intended to replace UTF-16 uses of mbstowcs() / wcstombs()
This commit is contained in:
matt335672 2023-09-19 11:36:13 +01:00
parent 0463e552dc
commit 0758fe03a6
7 changed files with 705 additions and 0 deletions

View File

@ -27,7 +27,47 @@
#include "arch.h" #include "arch.h"
#include "parse.h" #include "parse.h"
#include "log.h" #include "log.h"
#include "string_calls.h"
#include "unicode_defines.h"
/******************************************************************************/
/*
 * out_uint16_le_unchecked(s, v)
 *
 * Appends the 16-bit value v to stream s in little-endian octet order and
 * advances s->p by two. No check is made that the stream has room for
 * the two octets; that is the caller's responsibility.
 */
#if defined(B_ENDIAN) || defined(NEED_ALIGN)
/* Portable variant: store the two octets individually, low octet first */
#define out_uint16_le_unchecked(s, v) do \
    { \
        (s)->p[0] = (unsigned char)((v) >> 0); \
        (s)->p[1] = (unsigned char)((v) >> 8); \
        (s)->p += 2; \
    } while (0)
#else
/* Variant used when neither B_ENDIAN nor NEED_ALIGN is defined: the value
 * is stored as a single 16-bit word */
#define out_uint16_le_unchecked(s, v) do \
    { \
        ((unsigned short *)((s)->p))[0] = (unsigned short)(v); \
        (s)->p += 2; \
    } while (0)
#endif
/******************************************************************************/
/*
 * in_uint16_le_unchecked(s, v)
 *
 * Reads a little-endian 16-bit value from stream s into v and advances
 * s->p by two. No check is made that two octets are available; that is
 * the caller's responsibility.
 */
#if defined(B_ENDIAN) || defined(NEED_ALIGN)
/* Portable variant: assemble the value from two separately-read octets */
#define in_uint16_le_unchecked(s, v) do \
    { \
        (v) = (unsigned short) \
              ( \
                ((unsigned char)(s)->p[0] << 0) | \
                ((unsigned char)(s)->p[1] << 8) \
              ); \
        (s)->p += 2; \
    } while (0)
#else
/* Variant used when neither B_ENDIAN nor NEED_ALIGN is defined: the value
 * is loaded as a single 16-bit word */
#define in_uint16_le_unchecked(s, v) do \
    { \
        (v) = ((unsigned short *)((s)->p))[0]; \
        (s)->p += 2; \
    } while (0)
#endif
/******************************************************************************/
void void
parser_stream_overflow_check(const struct stream *s, int n, int is_out, parser_stream_overflow_check(const struct stream *s, int n, int is_out,
const char *file, int line) const char *file, int line)
@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
} }
} }
} }
/******************************************************************************/
/*
 * Writes a UTF-8 string to a stream as little-endian UTF-16.
 *
 * s     Stream to append to
 * v     UTF-8 input
 * vn    Number of octets of input at v
 * file  Caller location, only used when USE_DEVEL_STREAMCHECK is defined
 * line  Caller location, only used when USE_DEVEL_STREAMCHECK is defined
 *
 * Code points below 0x10000 are written as one 16-bit word; all others
 * are written as a high surrogate followed by a low surrogate.
 */
void
out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
                          unsigned int vn,
                          const char *file, int line)
{
    // Equivalent of S_CHECK_REM_OUT(s, <octet_count>), but reporting the
    // location handed in by the caller
#ifdef USE_DEVEL_STREAMCHECK
    int octet_cnt = utf8_as_utf16_word_count(v, vn) * 2;
    parser_stream_overflow_check(s, octet_cnt, 1, file, line);
#endif

    while (vn > 0)
    {
        /* The helper advances v and reduces vn for us */
        const char32_t code_point = utf8_get_next_char(&v, &vn);

        if (code_point >= 0x10000)
        {
            /* Outside the 16-bit range: emit a surrogate pair, high first */
            const char16_t lead = HIGH_SURROGATE_FROM_C32(code_point);
            const char16_t trail = LOW_SURROGATE_FROM_C32(code_point);
            out_uint16_le_unchecked(s, lead);
            out_uint16_le_unchecked(s, trail);
        }
        else
        {
            const char16_t single = (char16_t)code_point;
            out_uint16_le_unchecked(s, single);
        }
    }
}
/******************************************************************************/
/**
 * Gets the next Unicode character from a code stream
 * @param s Stream
 * @return Unicode character
 *
 * Non-characters and illegally coded characters are mapped to
 * UCS_REPLACEMENT_CHARACTER
 *
 * One or two 16-bit words are consumed from the stream. If a high
 * surrogate is followed by a word which is not a low surrogate, only the
 * high surrogate is consumed, so the following word is decoded on the
 * next call.
 *
 * @pre Two bytes are assumed to be available on the stream on entry
 */
static char32_t
get_c32_from_stream(struct stream *s)
{
    char32_t c32 = UCS_REPLACEMENT_CHARACTER; // Assume failure
    char16_t w;

    in_uint16_le_unchecked(s, w);

    if (IS_HIGH_SURROGATE(w))
    {
        /* A high surrogate at the very end of the data has no partner,
         * and is left as the replacement character */
        if (s_check_rem(s, 2))
        {
            char16_t low;
            in_uint16_le_unchecked(s, low);
            if (IS_LOW_SURROGATE(low))
            {
                /* Valid surrogate pair */
                char32_t v = C32_FROM_SURROGATE_PAIR(low, w);

                /* Ignore some values which can be successfully encoded
                 * in this way. The test must be made on the decoded
                 * value, not on the default held in c32 */
                if (!IS_PLANE_END_NON_CHARACTER(v))
                {
                    c32 = v;
                }
            }
            else
            {
                /* Invalid low surrogate - pop character back */
                s->p -= 2;
            }
        }
    }
    else if (!IS_LOW_SURROGATE(w) &&
             !IS_PLANE_END_NON_CHARACTER(w) &&
             !IS_ARABIC_NON_CHARACTER(w))
    {
        /* Character from the Basic Multilingual Plane */
        c32 = (char32_t)w;
    }

    return c32;
}
/******************************************************************************/
/*
 * Reads at most n little-endian UTF-16 words from a stream, storing the
 * result as UTF-8.
 *
 * s     Stream to read from
 * n     Maximum number of 16-bit words to read
 * v     Result buffer. Not written to when vn is 0
 * vn    Size of the result buffer in bytes
 * file  Caller location, only used when USE_DEVEL_STREAMCHECK is defined
 * line  Caller location, only used when USE_DEVEL_STREAMCHECK is defined
 *
 * Returns the number of bytes needed to hold the whole converted string,
 * including a terminator. A value greater than vn means the result was
 * truncated. A terminator is stored whenever vn is non-zero.
 *
 * Reading stops early if the stream holds fewer than n words.
 */
unsigned int
in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
                               char *v, unsigned int vn,
                               const char *file, int line)
{
    unsigned int rv = 0;
    char32_t c32;
    char u8str[MAXLEN_UTF8_CHAR];
    unsigned int u8len;
    char *saved_s_end = s->end;
    /* Octet limit computed in a wide type, so a large n can neither wrap
     * round to a small value nor become negative */
    const unsigned long long max_octets = (unsigned long long)n * 2;

    // Expansion of S_CHECK_REM(s, n*2) using passed-in file and line
#ifdef USE_DEVEL_STREAMCHECK
    parser_stream_overflow_check(s, n * 2, 0, file, line);
#endif

    // Temporarily set the stream end pointer to allow us to use
    // s_check_rem() when reading in UTF-16 words
    if (s->end > s->p &&
            (unsigned long long)(s->end - s->p) > max_octets)
    {
        /* max_octets is less than the data remaining, so this stays
         * within the stream */
        s->end = s->p + max_octets;
    }

    while (s_check_rem(s, 2))
    {
        c32 = get_c32_from_stream(s);
        u8len = utf_char32_to_utf8(c32, u8str);

        if (u8len + 1 <= vn)
        {
            /* Room for this character and a terminator. Add the character */
            unsigned int i;
            for (i = 0 ; i < u8len ; ++i)
            {
                v[i] = u8str[i];
            }
            vn -= u8len;
            v += u8len;
        }
        else if (vn > 1)
        {
            /* We've skipped a character, but there's more than one byte
             * remaining in the output buffer. Mark the output buffer as
             * full so we don't get a smaller character being squeezed into
             * the remaining space */
            vn = 1;
        }

        /* Count the character whether or not it was stored, so the caller
         * can detect truncation */
        rv += u8len;
    }

    // Restore stream to full length
    s->end = saved_s_end;

    if (vn > 0)
    {
        *v = '\0';
    }
    ++rv; /* Account for the terminator */

    return rv;
}
/******************************************************************************/
/*
 * Works out how many bytes a fixed-size UTF-16 LE string in the stream
 * needs when stored as terminated UTF-8, leaving the read position of
 * the stream where it was on entry.
 */
unsigned int
in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
{
    /* Remember where we started; the conversion below moves s->p */
    char *const start_pos = s->p;

    /* Convert with no output buffer - only the returned size is wanted */
    const unsigned int required = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);

    s->p = start_pos;
    return required;
}
/******************************************************************************/
/*
 * Reads a little-endian UTF-16 string from a stream, storing it as UTF-8.
 * Reading stops after a zero character has been consumed, or when fewer
 * than two bytes remain in the stream.
 *
 * s   Stream to read from
 * v   Result buffer. Not written to when vn is 0
 * vn  Size of the result buffer in bytes
 *
 * Returns the number of bytes needed to hold the whole converted string,
 * including the terminator. A terminator is stored whenever vn is
 * non-zero.
 */
unsigned int
in_utf16_le_terminated_as_utf8(struct stream *s,
                               char *v, unsigned int vn)
{
    unsigned int total = 0;
    char encoded[MAXLEN_UTF8_CHAR];

    for (;;)
    {
        if (!s_check_rem(s, 2))
        {
            break; // Ran out of input
        }

        const char32_t code_point = get_c32_from_stream(s);
        if (code_point == 0)
        {
            break; // Terminator encountered
        }

        const unsigned int encoded_len = utf_char32_to_utf8(code_point,
                                                            encoded);
        /* Counted whether or not the character ends up being stored */
        total += encoded_len;

        if (encoded_len < vn)
        {
            /* The character fits with a byte to spare for the terminator */
            const char *src = encoded;
            const char *const src_end = encoded + encoded_len;
            while (src != src_end)
            {
                *v++ = *src++;
            }
            vn -= encoded_len;
        }
        else if (vn > 1)
        {
            /* This character was dropped. Leave room for the terminator
             * only, so that a later, shorter character cannot be slipped
             * into the space left over */
            vn = 1;
        }
    }

    if (vn > 0)
    {
        *v = '\0';
    }

    return total + 1;
}
/******************************************************************************/
/*
 * Works out how many bytes a terminated UTF-16 LE string in the stream
 * needs when stored as terminated UTF-8, leaving the read position of
 * the stream where it was on entry.
 */
unsigned int
in_utf16_le_terminated_as_utf8_length(struct stream *s)
{
    /* Remember where we started; the conversion below moves s->p */
    char *const start_pos = s->p;

    /* Convert with no output buffer - only the returned size is wanted */
    const unsigned int required = in_utf16_le_terminated_as_utf8(s, NULL, 0);

    s->p = start_pos;
    return required;
}

View File

@ -89,6 +89,102 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
# define S_CHECK_REM_OUT(s,n) # define S_CHECK_REM_OUT(s,n)
#endif #endif
/******************************************************************************/
/**
 * Copies a UTF-8 string to a stream as little-endian UTF-16
 *
 * @param s Stream
 * @param v UTF-8 string
 * @param vn Length of UTF-8 string, in bytes
 * @param file Caller location (from __FILE__)
 * @param line Caller location (from __LINE__)
 *
 * Caller is expected to check there is room for the result in s.
 * Characters which do not fit in a single 16-bit word are written
 * as a surrogate pair (two words).
 *
 * Use the out_utf8_as_utf16_le() macro rather than calling this
 * function directly; the macro fills in the caller location.
 */
void
out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
unsigned int vn,
const char *file, int line);
#define out_utf8_as_utf16_le(s,v,vn) \
out_utf8_as_utf16_le_proc((s), (v), (vn), __FILE__, __LINE__)
/******************************************************************************/
/**
 * Copies a fixed-size little-endian UTF-16 string from a stream as UTF-8
 *
 * @param s Stream
 * @param n Number of 16-bit words to copy
 * @param v Pointer to result buffer
 * @param vn Max size of result buffer, in bytes
 * @param file Caller location (from __FILE__)
 * @param line Caller location (from __LINE__)
 *
 * @return number of bytes which would be written to v if it were large
 *         enough, INCLUDING an additional terminator. This can be used
 *         to check for a buffer overflow. A terminator is counted
 *         whether or not the input includes one.
 *
 * Output is NUL-terminated whenever vn is non-zero. When vn is zero
 * nothing is written to v.
 * Input is not checked for NULs - these are copied verbatim.
 * A character which does not fit in the result buffer is dropped, along
 * with all the characters following it.
 *
 * Use the in_utf16_le_fixed_as_utf8() macro rather than calling this
 * function directly; the macro fills in the caller location.
 */
unsigned int
in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
char *v, unsigned int vn,
const char *file, int line);
#define in_utf16_le_fixed_as_utf8(s,n,v,vn) \
in_utf16_le_fixed_as_utf8_proc((s), (n), (v), (vn), __FILE__, __LINE__)
/******************************************************************************/
/**
 * Returns the size of the buffer needed to store a fixed-size
 * little-endian UTF-16 string in a stream as a UTF-8 string
 *
 * @param s Stream
 * @param n Number of 16-bit words to consider
 * @return number of bytes needed to store the UTF-8 string. This
 *         includes a terminator, which is counted whether the parsed
 *         string includes one or not.
 * @post Stream position is the same at the end of this call as it
 *       was at the start
 */
unsigned int
in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n);
/******************************************************************************/
/**
 * Copies a terminated little-endian UTF-16 string from a stream as UTF-8
 *
 * @param s Stream
 * @param v Pointer to result buffer
 * @param vn Max size of result buffer, in bytes
 *
 * @return number of bytes which would be written to v if it were large
 *         enough, INCLUDING the terminator. This can be used to check
 *         for a buffer overflow.
 *
 * Output is NUL-terminated whenever vn is non-zero. When vn is zero
 * nothing is written to v.
 * Input processing stops when a NUL is encountered, or the end of the buffer
 * is reached. A NUL which stops processing is consumed from the stream.
 * A character which does not fit in the result buffer is dropped, along
 * with all the characters following it.
 */
unsigned int
in_utf16_le_terminated_as_utf8(struct stream *s,
char *v, unsigned int vn);
/******************************************************************************/
/**
 * Returns the size of the buffer needed to store a terminated
 * little-endian UTF-16 string in a stream as a terminated UTF-8 string
 *
 * @param s Stream
 * @return number of bytes needed to store the UTF-8 string,
 *         including the terminator
 * @post Stream position is the same at the end of this call as it
 *       was at the start
 *
 * Input processing stops when a NUL is encountered, or the end of the buffer
 * is reached.
 */
unsigned int
in_utf16_le_terminated_as_utf8_length(struct stream *s);
/******************************************************************************/ /******************************************************************************/
#define s_check_rem(s, n) ((s)->p + (n) <= (s)->end) #define s_check_rem(s, n) ((s)->p + (n) <= (s)->end)

View File

@ -68,4 +68,33 @@
INVALID_UNICODE_10000_TO_1FFFFF(c) \ INVALID_UNICODE_10000_TO_1FFFFF(c) \
) )
/**
 * Is this character a UTF-16 high surrogate?
 *
 * True for 16-bit values in the range 0xd800 to 0xdbff inclusive
 */
#define IS_HIGH_SURROGATE(u16) (((u16) & 0xfc00) == 0xd800)
/**
 * Is this character a UTF-16 low surrogate?
 *
 * True for 16-bit values in the range 0xdc00 to 0xdfff inclusive
 */
#define IS_LOW_SURROGATE(u16) (((u16) & 0xfc00) == 0xdc00)
/**
 * Extract the UTF-16 high surrogate from a character
 *
 * 0x10000 is subtracted from the character before the top ten bits of
 * the result are used, so this is only meaningful for characters of
 * 0x10000 or above
 */
#define HIGH_SURROGATE_FROM_C32(c32) \
(((((c32) - 0x10000) >> 10) & 0x3ff) | 0xd800)
/**
 * Extract the UTF-16 low surrogate from a character
 *
 * Uses the bottom ten bits of the character
 */
#define LOW_SURROGATE_FROM_C32(c32) (((c32) & 0x3ff) | 0xdc00)
/**
 * Reconstruct a character from a UTF-16 surrogate pair
 *
 * Note the argument order - the low surrogate is passed first.
 * Only the bottom ten bits of each argument are used, so the arguments
 * are not checked for being surrogates.
 *
 * This macro cannot return values higher than 0x10ffff
 */
#define C32_FROM_SURROGATE_PAIR(low,high) \
((char32_t)(((high) & 0x3ff) << 10) + ((low) & 0x3ff) + 0x10000)
#endif // UNICODE_DEFINES_H #endif // UNICODE_DEFINES_H

View File

@ -16,6 +16,7 @@ test_common_SOURCES = \
test_common_main.c \ test_common_main.c \
test_fifo_calls.c \ test_fifo_calls.c \
test_list_calls.c \ test_list_calls.c \
test_parse.c \
test_string_calls.c \ test_string_calls.c \
test_string_calls_unicode.c \ test_string_calls_unicode.c \
test_os_calls.c \ test_os_calls.c \

View File

@ -9,6 +9,7 @@ bin_to_hex(const char *input, int length);
Suite *make_suite_test_fifo(void); Suite *make_suite_test_fifo(void);
Suite *make_suite_test_list(void); Suite *make_suite_test_list(void);
Suite *make_suite_test_parse(void);
Suite *make_suite_test_string(void); Suite *make_suite_test_string(void);
Suite *make_suite_test_string_unicode(void); Suite *make_suite_test_string_unicode(void);
Suite *make_suite_test_os_calls(void); Suite *make_suite_test_os_calls(void);

View File

@ -48,6 +48,7 @@ int main (void)
sr = srunner_create (make_suite_test_fifo()); sr = srunner_create (make_suite_test_fifo());
srunner_add_suite(sr, make_suite_test_list()); srunner_add_suite(sr, make_suite_test_list());
srunner_add_suite(sr, make_suite_test_parse());
srunner_add_suite(sr, make_suite_test_string()); srunner_add_suite(sr, make_suite_test_string());
srunner_add_suite(sr, make_suite_test_string_unicode()); srunner_add_suite(sr, make_suite_test_string_unicode());
srunner_add_suite(sr, make_suite_test_os_calls()); srunner_add_suite(sr, make_suite_test_os_calls());

319
tests/common/test_parse.c Normal file
View File

@ -0,0 +1,319 @@
#if defined(HAVE_CONFIG_H)
#include "config_ac.h"
#endif
#include "arch.h"
#include "os_calls.h"
#include "string_calls.h"
#include "parse.h"
#include "test_common.h"
/* Number of elements in an array. Only valid for a true array, not for a
 * pointer. The argument is parenthesised so that any expression can be
 * passed safely */
#define ELEMENTS(x) (sizeof(x) / sizeof((x)[0]))

/* Reference string in UTF-8. sizeof() includes the implicit terminator */
static const char
utf8_simple_test_with_emoji[] =
    "Simple Test."
    "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face

/* The same reference string in UTF-16, with an explicit terminator. The
 * emoji is outside the 16-bit range so needs a surrogate pair */
static const char16_t
utf16_simple_test_with_emoji[] =
{
    'S', 'i', 'm', 'p', 'l', 'e', ' ', 'T', 'e', 's', 't', '.',
    0xd83d, 0xde25, // U+1F625
    0 // terminator
};
/******************************************************************************/
/* Writes the UTF-8 reference string (terminator included) to a stream as
 * UTF-16 LE, then reads the words back and compares them, one at a time,
 * with the UTF-16 reference string */
START_TEST(test_out_utf8_as_utf16_le)
{
    const unsigned int expected_words = ELEMENTS(utf16_simple_test_with_emoji);
    struct stream *s;

    make_stream(s);
    init_stream(s, 8192);

    out_utf8_as_utf16_le(s, utf8_simple_test_with_emoji,
                         sizeof(utf8_simple_test_with_emoji)); // Include term
    s_mark_end(s);

    // Back to the start of the data, ready to read what was written
    s->p = s->data;

    unsigned int idx = 0;
    while (idx < expected_words)
    {
        char16_t got;
        in_uint16_le(s, got);

        if (got != utf16_simple_test_with_emoji[idx])
        {
            ck_abort_msg("test_out_utf8_as_utf16_le: "
                         "Index %u expected %x, got %x",
                         idx, utf16_simple_test_with_emoji[idx], got);
        }
        ++idx;
    }

    // Nothing other than the expected words should have been written
    ck_assert_int_eq(s_check_end(s), 1);

    free_stream(s);
}
END_TEST
/******************************************************************************/
/* Places the UTF-16 reference string on a stream with no terminator, and
 * checks both the length query and the fixed-size read against the UTF-8
 * reference string */
START_TEST(test_in_utf16_le_fixed_as_utf8)
{
    // Every word of the reference string apart from the terminator
    const unsigned int word_count =
        ELEMENTS(utf16_simple_test_with_emoji) - 1;
    struct stream *s;
    char utf8_out[256];
    unsigned int len;
    unsigned int read_len;
    unsigned int idx;

    make_stream(s);
    init_stream(s, 8192);

    for (idx = 0; idx != word_count; ++idx)
    {
        out_uint16_le(s, utf16_simple_test_with_emoji[idx]);
    }
    s_mark_end(s);

    // Back to the start of the data, ready to read what was written
    s->p = s->data;

    // The length query must report the size of the UTF-8 reference
    // string, terminator included
    len = in_utf16_le_fixed_as_utf8_length(s, word_count);
    ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));

    // The read itself must report the same size
    read_len = in_utf16_le_fixed_as_utf8(s, word_count,
                                         utf8_out, sizeof(utf8_out));
    ck_assert_int_eq(len, read_len);

    // All the input should have been consumed
    ck_assert_int_eq(s_check_end(s), 1);

    // The converted text, terminator included, must match the reference
    int cmp = memcmp(utf8_out, utf8_simple_test_with_emoji,
                     sizeof(utf8_simple_test_with_emoji));
    ck_assert_int_eq(cmp, 0);

    free_stream(s);
}
END_TEST
/******************************************************************************/
/* Places the UTF-16 reference string on a stream, terminator included,
 * and checks both the length query and the terminated read against the
 * UTF-8 reference string */
START_TEST(test_in_utf16_le_terminated_as_utf8)
{
    const unsigned int word_count = ELEMENTS(utf16_simple_test_with_emoji);
    struct stream *s;
    char utf8_out[256];
    unsigned int len;
    unsigned int read_len;
    unsigned int idx;

    make_stream(s);
    init_stream(s, 8192);

    for (idx = 0; idx != word_count; ++idx)
    {
        out_uint16_le(s, utf16_simple_test_with_emoji[idx]);
    }
    s_mark_end(s);

    // Back to the start of the data, ready to read what was written
    s->p = s->data;

    // The length query must report the size of the UTF-8 reference
    // string, terminator included
    len = in_utf16_le_terminated_as_utf8_length(s);
    ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));

    // The read itself must report the same size
    read_len = in_utf16_le_terminated_as_utf8(s, utf8_out, sizeof(utf8_out));
    ck_assert_int_eq(len, read_len);

    // All the input, terminator included, should have been consumed
    ck_assert_int_eq(s_check_end(s), 1);

    // The converted text, terminator included, must match the reference
    int cmp = memcmp(utf8_out, utf8_simple_test_with_emoji,
                     sizeof(utf8_simple_test_with_emoji));
    ck_assert_int_eq(cmp, 0);

    free_stream(s);
}
END_TEST
/******************************************************************************/
/* Table-driven test of UTF-16 decoding for boundary values, unpaired
 * surrogates and non-characters.
 *
 * Each table entry is written to a stream as one or two UTF-16 LE words,
 * read back as UTF-8 with in_utf16_le_fixed_as_utf8(), and the first
 * character of the result is compared with the expected value. */
START_TEST(test_in_utf16_le_significant_chars)
{
struct stream *s;
make_stream(s);
init_stream(s, 8192);
const struct
{
struct
{
char16_t high; // Set to 0 for a single UTF-16 word
char16_t low;
} pair;
char32_t expected;
} tests[] =
{
// Single high surrogates are bad
{ { 0, 0xd800 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xdbff }, UCS_REPLACEMENT_CHARACTER },
// Single low surrogates are bad
{ { 0, 0xdc00 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xdfff }, UCS_REPLACEMENT_CHARACTER },
// Values before and after surrogate range
{ { 0, 0xd7ff }, 0xd7ff },
{ { 0, 0xe000 }, 0xe000 },
// First and last non-surrogate pair values (don't use
// 0xfffe and 0xffff for this test as they are non-characters,
// and 0xfffd is the replacement character)
{ { 0, 0 }, 0 },
{ { 0, 0xfffc }, 0xfffc },
{ { 0, 0xfffd }, UCS_REPLACEMENT_CHARACTER },
// First and last surrogate pair values (don't use
// 0x10fffe and 0x10ffff for this test as they are non-characters)
{ { 0xd800, 0xdc00 }, 0x10000 },
{ { 0xdbff, 0xdffd }, 0x10fffd },
// End-of-plane non-characters which need a surrogate pair (i.e.
// those outside the BMP), and the characters before them
{ { 0xd83f, 0xdffd }, 0x1fffd },
{ { 0xd83f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFE
{ { 0xd83f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFF
{ { 0xd87f, 0xdffd }, 0x2fffd },
{ { 0xd87f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFE
{ { 0xd87f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFF
{ { 0xd8bf, 0xdffd }, 0x3fffd },
{ { 0xd8bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFE
{ { 0xd8bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFF
{ { 0xd8ff, 0xdffd }, 0x4fffd },
{ { 0xd8ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFE
{ { 0xd8ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFF
{ { 0xd93f, 0xdffd }, 0x5fffd },
{ { 0xd93f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFE
{ { 0xd93f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFF
{ { 0xd97f, 0xdffd }, 0x6fffd },
{ { 0xd97f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFE
{ { 0xd97f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFF
{ { 0xd9bf, 0xdffd }, 0x7fffd },
{ { 0xd9bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFE
{ { 0xd9bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFF
{ { 0xd9ff, 0xdffd }, 0x8fffd },
{ { 0xd9ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFE
{ { 0xd9ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFF
{ { 0xda3f, 0xdffd }, 0x9fffd },
{ { 0xda3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFE
{ { 0xda3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFF
{ { 0xda7f, 0xdffd }, 0xafffd },
{ { 0xda7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFE
{ { 0xda7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFF
{ { 0xdabf, 0xdffd }, 0xbfffd },
{ { 0xdabf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFE
{ { 0xdabf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFF
{ { 0xdaff, 0xdffd }, 0xcfffd },
{ { 0xdaff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFE
{ { 0xdaff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFF
{ { 0xdb3f, 0xdffd }, 0xdfffd },
{ { 0xdb3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFE
{ { 0xdb3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFF
{ { 0xdb7f, 0xdffd }, 0xefffd },
{ { 0xdb7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFE
{ { 0xdb7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFF
{ { 0xdbbf, 0xdffd }, 0xffffd },
{ { 0xdbbf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFE
{ { 0xdbbf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFF
{ { 0xdbff, 0xdffd }, 0x10fffd },
{ { 0xdbff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFE
{ { 0xdbff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFF
// Non-characters in "Arabic Presentation Forms-A"
{ { 0, 0xfdd0 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd1 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd2 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd3 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd4 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd5 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd6 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd7 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd8 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdd9 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdda }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfddb }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfddc }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfddd }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdde }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfddf }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde0 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde1 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde2 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde3 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde4 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde5 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde6 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde7 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde8 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfde9 }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdea }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdeb }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdec }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfded }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdee }, UCS_REPLACEMENT_CHARACTER },
{ { 0, 0xfdef }, UCS_REPLACEMENT_CHARACTER }
};
unsigned int i;
for (i = 0; i < ELEMENTS(tests); ++i)
{
char buff[256];
unsigned int word_count;
// Start each test with an empty stream
init_stream(s, 8192);
// Write one word, or two if a high surrogate is specified
word_count = 0;
if (tests[i].pair.high != 0)
{
out_uint16_le(s, tests[i].pair.high);
++word_count;
}
out_uint16_le(s, tests[i].pair.low);
++word_count;
s_mark_end(s);
// Rewind the stream
s->p = s->data;
// Read in one UTF-16 LE character as UTF-8, then decode the UTF-8
// to get a single value to compare with the expected one
in_utf16_le_fixed_as_utf8(s, word_count, buff, sizeof(buff));
const char *p = buff;
char32_t c32 = utf8_get_next_char(&p, NULL);
if (c32 != tests[i].expected)
{
ck_abort_msg("test_in_utf16_le_significant_chars: "
"Index %u for {%x, %x}, expected %x, got %x",
i, tests[i].pair.high, tests[i].pair.low,
tests[i].expected, c32);
}
}
free_stream(s);
}
END_TEST
/******************************************************************************/
/* Builds the "Parse" test suite, which holds a single "Unicode" test case
 * containing the UTF-16 stream I/O tests in this file */
Suite *
make_suite_test_parse(void)
{
    Suite *suite = suite_create("Parse");
    TCase *unicode_case = tcase_create("Unicode");

    suite_add_tcase(suite, unicode_case);

    tcase_add_test(unicode_case, test_out_utf8_as_utf16_le);
    tcase_add_test(unicode_case, test_in_utf16_le_fixed_as_utf8);
    tcase_add_test(unicode_case, test_in_utf16_le_terminated_as_utf8);
    tcase_add_test(unicode_case, test_in_utf16_le_significant_chars);

    return suite;
}