Add UTF-16 LE I/O routines
These are intended to replace UTF-16 uses of mbstowcs() / wcstombs()
This commit is contained in:
parent
0463e552dc
commit
0758fe03a6
258
common/parse.c
258
common/parse.c
@ -27,7 +27,47 @@
|
|||||||
#include "arch.h"
|
#include "arch.h"
|
||||||
#include "parse.h"
|
#include "parse.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
#include "string_calls.h"
|
||||||
|
#include "unicode_defines.h"
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
|
||||||
|
/*
 * out_uint16_le_unchecked(s, v)
 *
 * Stores the low 16 bits of v at the current position of stream s in
 * little-endian octet order, then moves (s)->p on by two octets.
 *
 * "Unchecked" means exactly that: nothing here verifies that two octets
 * of space remain. The caller must have established this already.
 *
 * Note that v is evaluated more than once in the octet-wise variant, so
 * it should not have side effects.
 */
#if defined(B_ENDIAN) || defined(NEED_ALIGN)
/* Octet-at-a-time variant; low octet first */
#define out_uint16_le_unchecked(s, v) do \
    { \
        *((s)->p) = (unsigned char)(v); \
        ++(s)->p; \
        *((s)->p) = (unsigned char)((v) >> 8); \
        ++(s)->p; \
    } while (0)
#else
/* Single 16-bit store variant */
#define out_uint16_le_unchecked(s, v) do \
    { \
        ((unsigned short *)((s)->p))[0] = (unsigned short)(v); \
        (s)->p += 2; \
    } while (0)
#endif
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
/*
 * in_uint16_le_unchecked(s, v)
 *
 * Loads a little-endian 16-bit value from the current position of
 * stream s into v, then moves (s)->p on by two octets.
 *
 * "Unchecked" means exactly that: nothing here verifies that two octets
 * remain to be read. The caller must have established this already.
 */
#if defined(B_ENDIAN) || defined(NEED_ALIGN)
/* Octet-at-a-time variant; the first octet is the least significant */
#define in_uint16_le_unchecked(s, v) do \
    { \
        (v) = (unsigned short) \
              ( \
                  (((unsigned char *)((s)->p))[0]) | \
                  (((unsigned char *)((s)->p))[1] << 8) \
              ); \
        (s)->p += 2; \
    } while (0)
#else
/* Single 16-bit load variant */
#define in_uint16_le_unchecked(s, v) do \
    { \
        (v) = ((unsigned short *)((s)->p))[0]; \
        (s)->p += 2; \
    } while (0)
#endif
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
void
|
void
|
||||||
parser_stream_overflow_check(const struct stream *s, int n, int is_out,
|
parser_stream_overflow_check(const struct stream *s, int n, int is_out,
|
||||||
const char *file, int line)
|
const char *file, int line)
|
||||||
@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
void
out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
                          unsigned int vn,
                          const char *file, int line)
{
    /* In development builds, perform the equivalent of
     * S_CHECK_REM_OUT(s, <octet_count>), but report the caller's
     * location (file / line) rather than this one */
#ifdef USE_DEVEL_STREAMCHECK
    int octet_cnt = utf8_as_utf16_word_count(v, vn) * 2;
    parser_stream_overflow_check(s, octet_cnt, 1, file, line);
#endif

    /* Consume the UTF-8 input one character at a time until
     * utf8_get_next_char() has reduced the remaining length to zero */
    while (vn > 0)
    {
        const char32_t ch = utf8_get_next_char(&v, &vn);

        if (ch >= 0x10000)
        {
            /* Outside the BMP: emit a surrogate pair, high word first */
            const char16_t lead = HIGH_SURROGATE_FROM_C32(ch);
            const char16_t trail = LOW_SURROGATE_FROM_C32(ch);

            out_uint16_le_unchecked(s, lead);
            out_uint16_le_unchecked(s, trail);
        }
        else
        {
            /* Fits in a single UTF-16 word */
            const char16_t unit = (char16_t)ch;

            out_uint16_le_unchecked(s, unit);
        }
    }
}
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
/**
|
||||||
|
* Gets the next Unicode character from a code stream
|
||||||
|
* @param s Stream
|
||||||
|
* @return Unicode character
|
||||||
|
*
|
||||||
|
* Non-characters and illegally coded characters are mapped to
|
||||||
|
* UCS_REPLACEMENT_CHARACTER
|
||||||
|
*
|
||||||
|
* @pre Two bytes are assumed to be available on the stram on entry
|
||||||
|
*/
|
||||||
|
static char32_t
|
||||||
|
get_c32_from_stream(struct stream *s)
|
||||||
|
{
|
||||||
|
char32_t c32 = UCS_REPLACEMENT_CHARACTER; // Assume failure
|
||||||
|
char16_t w;
|
||||||
|
|
||||||
|
in_uint16_le_unchecked(s, w);
|
||||||
|
|
||||||
|
if (IS_HIGH_SURROGATE(w))
|
||||||
|
{
|
||||||
|
if (s_check_rem(s, 2))
|
||||||
|
{
|
||||||
|
char16_t low;
|
||||||
|
in_uint16_le_unchecked(s, low);
|
||||||
|
if (IS_LOW_SURROGATE(low))
|
||||||
|
{
|
||||||
|
/* Valid surrogate pair */
|
||||||
|
char32_t v = C32_FROM_SURROGATE_PAIR(low, w);
|
||||||
|
|
||||||
|
/* Ignore some values which can be successfully encoded
|
||||||
|
* in this way */
|
||||||
|
if (!IS_PLANE_END_NON_CHARACTER(c32))
|
||||||
|
{
|
||||||
|
c32 = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Invalid low surrogate - pop character back */
|
||||||
|
s->p -= 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (!IS_LOW_SURROGATE(w) &&
|
||||||
|
!IS_PLANE_END_NON_CHARACTER(w) &&
|
||||||
|
!IS_ARABIC_NON_CHARACTER(w))
|
||||||
|
{
|
||||||
|
/* Character from the Basic Multilingual Plane */
|
||||||
|
c32 = (char32_t)w;
|
||||||
|
}
|
||||||
|
|
||||||
|
return c32;
|
||||||
|
}
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
unsigned int
|
||||||
|
in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
|
||||||
|
char *v, unsigned int vn,
|
||||||
|
const char *file, int line)
|
||||||
|
{
|
||||||
|
unsigned int rv = 0;
|
||||||
|
char32_t c32;
|
||||||
|
char u8str[MAXLEN_UTF8_CHAR];
|
||||||
|
unsigned int u8len;
|
||||||
|
char *saved_s_end = s->end;
|
||||||
|
|
||||||
|
// Expansion of S_CHECK_REM(s, n*2) using passed-in file and line
|
||||||
|
#ifdef USE_DEVEL_STREAMCHECK
|
||||||
|
parser_stream_overflow_check(s, n * 2, 0, file, line);
|
||||||
|
#endif
|
||||||
|
// Temporarily set the stream end pointer to allow us to use
|
||||||
|
// s_check_rem() when reading in UTF-16 words
|
||||||
|
if (s->end - s->p > (int)(n * 2))
|
||||||
|
{
|
||||||
|
s->end = s->p + (int)(n * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (s_check_rem(s, 2))
|
||||||
|
{
|
||||||
|
c32 = get_c32_from_stream(s);
|
||||||
|
|
||||||
|
u8len = utf_char32_to_utf8(c32, u8str);
|
||||||
|
if (u8len + 1 <= vn)
|
||||||
|
{
|
||||||
|
/* Room for this character and a terminator. Add the character */
|
||||||
|
unsigned int i;
|
||||||
|
for (i = 0 ; i < u8len ; ++i)
|
||||||
|
{
|
||||||
|
v[i] = u8str[i];
|
||||||
|
}
|
||||||
|
vn -= u8len;
|
||||||
|
v += u8len;
|
||||||
|
}
|
||||||
|
else if (vn > 1)
|
||||||
|
{
|
||||||
|
/* We've skipped a character, but there's more than one byte
|
||||||
|
* remaining in the output buffer. Mark the output buffer as
|
||||||
|
* full so we don't get a smaller character being squeezed into
|
||||||
|
* the remaining space */
|
||||||
|
vn = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
rv += u8len;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Restore stream to full length
|
||||||
|
s->end = saved_s_end;
|
||||||
|
|
||||||
|
if (vn > 0)
|
||||||
|
{
|
||||||
|
*v = '\0';
|
||||||
|
}
|
||||||
|
++rv;
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
unsigned int
|
||||||
|
in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
|
||||||
|
{
|
||||||
|
char *saved_s_p = s->p;
|
||||||
|
unsigned int rv = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);
|
||||||
|
s->p = saved_s_p;
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
unsigned int
|
||||||
|
in_utf16_le_terminated_as_utf8(struct stream *s,
|
||||||
|
char *v, unsigned int vn)
|
||||||
|
{
|
||||||
|
unsigned int rv = 0;
|
||||||
|
char32_t c32;
|
||||||
|
char u8str[MAXLEN_UTF8_CHAR];
|
||||||
|
unsigned int u8len;
|
||||||
|
while (s_check_rem(s, 2))
|
||||||
|
{
|
||||||
|
c32 = get_c32_from_stream(s);
|
||||||
|
if (c32 == 0)
|
||||||
|
{
|
||||||
|
break; // Terminator encountered
|
||||||
|
}
|
||||||
|
|
||||||
|
u8len = utf_char32_to_utf8(c32, u8str);
|
||||||
|
if (u8len + 1 <= vn)
|
||||||
|
{
|
||||||
|
/* Room for this character and a terminator. Add the character */
|
||||||
|
unsigned int i;
|
||||||
|
for (i = 0 ; i < u8len ; ++i)
|
||||||
|
{
|
||||||
|
v[i] = u8str[i];
|
||||||
|
}
|
||||||
|
vn -= u8len;
|
||||||
|
v += u8len;
|
||||||
|
}
|
||||||
|
else if (vn > 1)
|
||||||
|
{
|
||||||
|
/* We've skipped a character, but there's more than one byte
|
||||||
|
* remaining in the output buffer. Mark the output buffer as
|
||||||
|
* full so we don't get a smaller character being squeezed into
|
||||||
|
* the remaining space */
|
||||||
|
vn = 1;
|
||||||
|
}
|
||||||
|
rv += u8len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vn > 0)
|
||||||
|
{
|
||||||
|
*v = '\0';
|
||||||
|
}
|
||||||
|
++rv;
|
||||||
|
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
unsigned int
|
||||||
|
in_utf16_le_terminated_as_utf8_length(struct stream *s)
|
||||||
|
{
|
||||||
|
char *saved_s_p = s->p;
|
||||||
|
unsigned int rv = in_utf16_le_terminated_as_utf8(s, NULL, 0);
|
||||||
|
s->p = saved_s_p;
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
@ -89,6 +89,102 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
|
|||||||
# define S_CHECK_REM_OUT(s,n)
|
# define S_CHECK_REM_OUT(s,n)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
/**
|
||||||
|
* Copies a UTF-8 string to a stream as little-endian UTF-16
|
||||||
|
*
|
||||||
|
* @param s Stream
|
||||||
|
* @param v UTF-8 string
|
||||||
|
* @param vn Length of UTF-8 string.
|
||||||
|
* @param file Caller location (from __FILE__)
|
||||||
|
* @param line Caller location (from __LINE__)
|
||||||
|
*
|
||||||
|
* Caller is expected to check there is room for the result in s
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
|
||||||
|
unsigned int vn,
|
||||||
|
const char *file, int line);
|
||||||
|
|
||||||
|
#define out_utf8_as_utf16_le(s,v,vn) \
|
||||||
|
out_utf8_as_utf16_le_proc((s), (v), (vn), __FILE__, __LINE__)
|
||||||
|
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
/**
|
||||||
|
* Copies a fixed-size little-endian UTF-16 string from a stream as UTF-8
|
||||||
|
*
|
||||||
|
* @param s Stream
|
||||||
|
* @param n Number of 16-bit words to copy
|
||||||
|
* @param v Pointer to result buffer
|
||||||
|
* @param vn Max size of result buffer
|
||||||
|
*
|
||||||
|
* @return number of characters which would be written to v, INCLUDING
|
||||||
|
* an additional terminator. This can be used to check for a buffer
|
||||||
|
* overflow. A terminator is added whether or not the input
|
||||||
|
* includes one.
|
||||||
|
*
|
||||||
|
 * Output is NUL-terminated whenever vn is greater than zero.
|
||||||
|
* Input is not checked for NULLs - these are copied verbatim
|
||||||
|
*/
|
||||||
|
unsigned int
|
||||||
|
in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
|
||||||
|
char *v, unsigned int vn,
|
||||||
|
const char *file, int line);
|
||||||
|
|
||||||
|
#define in_utf16_le_fixed_as_utf8(s,n,v,vn) \
|
||||||
|
in_utf16_le_fixed_as_utf8_proc((s), (n), (v), (vn), __FILE__, __LINE__)
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
/**
|
||||||
|
* Returns the size of the buffer needed to store a fixed-size
|
||||||
|
* little-endian UTF-16 string in a stream as a UTF-8 string
|
||||||
|
*
|
||||||
|
* @param s Stream
|
||||||
|
* @param n Number of 16-bit words to consider
|
||||||
|
* @return number of characters needed to store the UTF-8 string. This
|
||||||
|
* includes a terminator, which is written whether the parsed
|
||||||
|
* string includes one or not.
|
||||||
|
* @post Stream position is not moved between start and end of this call
|
||||||
|
*/
|
||||||
|
unsigned int
|
||||||
|
in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n);
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
/**
|
||||||
|
* Copies a terminated little-endian UTF-16 string from a stream as UTF-8
|
||||||
|
*
|
||||||
|
* @param s Stream
|
||||||
|
* @param v Pointer to result buffer
|
||||||
|
* @param vn Max size of result buffer
|
||||||
|
*
|
||||||
|
* @return number of characters which would be written to v, INCLUDING
|
||||||
|
* the terminator. This can be used to check for a buffer overflow.
|
||||||
|
*
|
||||||
|
 * Output is NUL-terminated whenever vn is greater than zero.
|
||||||
|
* Input processing stops when a NULL is encountered, or the end of the buffer
|
||||||
|
* is reached.
|
||||||
|
*/
|
||||||
|
unsigned int
|
||||||
|
in_utf16_le_terminated_as_utf8(struct stream *s,
|
||||||
|
char *v, unsigned int vn);
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
/**
|
||||||
|
* Returns the size of the buffer needed to store a terminated
|
||||||
|
* little-endian UTF-16 string in a stream as a terminated UTF-8 string
|
||||||
|
*
|
||||||
|
* @param s Stream
|
||||||
|
* @return number of characters needed to store the UTF-8 string,
|
||||||
|
* including the terminator
|
||||||
|
* @post Stream position is not moved between start and end of this call
|
||||||
|
*
|
||||||
|
* Input processing stops when a NULL is encountered, or the end of the buffer
|
||||||
|
* is reached.
|
||||||
|
*/
|
||||||
|
unsigned int
|
||||||
|
in_utf16_le_terminated_as_utf8_length(struct stream *s);
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
#define s_check_rem(s, n) ((s)->p + (n) <= (s)->end)
|
#define s_check_rem(s, n) ((s)->p + (n) <= (s)->end)
|
||||||
|
|
||||||
|
@ -68,4 +68,33 @@
|
|||||||
INVALID_UNICODE_10000_TO_1FFFFF(c) \
|
INVALID_UNICODE_10000_TO_1FFFFF(c) \
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/**
 * Tests whether a UTF-16 word is a high (leading) surrogate
 *
 * The top six bits of the word are compared with those of 0xd800, so
 * for a 16-bit value this matches the range 0xd800 - 0xdbff.
 */
#define IS_HIGH_SURROGATE(u16) (0xd800 == ((u16) & 0xfc00))

/**
 * Tests whether a UTF-16 word is a low (trailing) surrogate
 *
 * The top six bits of the word are compared with those of 0xdc00, so
 * for a 16-bit value this matches the range 0xdc00 - 0xdfff.
 */
#define IS_LOW_SURROGATE(u16) (0xdc00 == ((u16) & 0xfc00))

/**
 * Gets the high surrogate needed to encode a character as UTF-16
 *
 * The character is expected to be 0x10000 or above. The result carries
 * the upper ten bits of (c32 - 0x10000).
 */
#define HIGH_SURROGATE_FROM_C32(c32) \
    (0xd800 | ((((c32) - 0x10000) >> 10) & 0x3ff))

/**
 * Gets the low surrogate needed to encode a character as UTF-16
 *
 * The result carries the lower ten bits of the character.
 */
#define LOW_SURROGATE_FROM_C32(c32) (0xdc00 | ((c32) & 0x3ff))

/**
 * Combines a UTF-16 surrogate pair into a single character
 *
 * Only the bottom ten bits of each argument are used, so this macro
 * cannot return values higher than 0x10ffff. Note the argument order:
 * the low surrogate is passed first.
 */
#define C32_FROM_SURROGATE_PAIR(low,high) \
    (0x10000 + (char32_t)(((high) & 0x3ff) << 10) + ((low) & 0x3ff))
|
||||||
|
|
||||||
#endif // UNICODE_DEFINES_H
|
#endif // UNICODE_DEFINES_H
|
||||||
|
@ -16,6 +16,7 @@ test_common_SOURCES = \
|
|||||||
test_common_main.c \
|
test_common_main.c \
|
||||||
test_fifo_calls.c \
|
test_fifo_calls.c \
|
||||||
test_list_calls.c \
|
test_list_calls.c \
|
||||||
|
test_parse.c \
|
||||||
test_string_calls.c \
|
test_string_calls.c \
|
||||||
test_string_calls_unicode.c \
|
test_string_calls_unicode.c \
|
||||||
test_os_calls.c \
|
test_os_calls.c \
|
||||||
|
@ -9,6 +9,7 @@ bin_to_hex(const char *input, int length);
|
|||||||
|
|
||||||
Suite *make_suite_test_fifo(void);
|
Suite *make_suite_test_fifo(void);
|
||||||
Suite *make_suite_test_list(void);
|
Suite *make_suite_test_list(void);
|
||||||
|
Suite *make_suite_test_parse(void);
|
||||||
Suite *make_suite_test_string(void);
|
Suite *make_suite_test_string(void);
|
||||||
Suite *make_suite_test_string_unicode(void);
|
Suite *make_suite_test_string_unicode(void);
|
||||||
Suite *make_suite_test_os_calls(void);
|
Suite *make_suite_test_os_calls(void);
|
||||||
|
@ -48,6 +48,7 @@ int main (void)
|
|||||||
|
|
||||||
sr = srunner_create (make_suite_test_fifo());
|
sr = srunner_create (make_suite_test_fifo());
|
||||||
srunner_add_suite(sr, make_suite_test_list());
|
srunner_add_suite(sr, make_suite_test_list());
|
||||||
|
srunner_add_suite(sr, make_suite_test_parse());
|
||||||
srunner_add_suite(sr, make_suite_test_string());
|
srunner_add_suite(sr, make_suite_test_string());
|
||||||
srunner_add_suite(sr, make_suite_test_string_unicode());
|
srunner_add_suite(sr, make_suite_test_string_unicode());
|
||||||
srunner_add_suite(sr, make_suite_test_os_calls());
|
srunner_add_suite(sr, make_suite_test_os_calls());
|
||||||
|
319
tests/common/test_parse.c
Normal file
319
tests/common/test_parse.c
Normal file
@ -0,0 +1,319 @@
|
|||||||
|
#if defined(HAVE_CONFIG_H)
|
||||||
|
#include "config_ac.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "arch.h"
|
||||||
|
#include "os_calls.h"
|
||||||
|
#include "string_calls.h"
|
||||||
|
#include "parse.h"
|
||||||
|
|
||||||
|
#include "test_common.h"
|
||||||
|
|
||||||
|
/* Number of elements in an array. Only valid for true arrays, not for
 * pointers */
#define ELEMENTS(x) (sizeof(x) / sizeof((x)[0]))

/* Test string in UTF-8. The emoji lies outside the Basic Multilingual
 * Plane, so needs four octets here and a surrogate pair in UTF-16 */
static const char
utf8_simple_test_with_emoji[] =
    "Simple Test."
    "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face

/* The same test string as UTF-16 words, with an explicit terminator */
static const char16_t
utf16_simple_test_with_emoji[] =
{
    'S', 'i', 'm', 'p', 'l', 'e', ' ', 'T', 'e', 's', 't', '.',
    0xd83d, 0xde25, // U+1F625
    0 // terminator
};
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
START_TEST(test_out_utf8_as_utf16_le)
|
||||||
|
{
|
||||||
|
struct stream *s;
|
||||||
|
make_stream(s);
|
||||||
|
init_stream(s, 8192);
|
||||||
|
out_utf8_as_utf16_le(s, utf8_simple_test_with_emoji,
|
||||||
|
sizeof(utf8_simple_test_with_emoji)); // Include term
|
||||||
|
s_mark_end(s);
|
||||||
|
|
||||||
|
// Rewind the stream
|
||||||
|
s->p = s->data;
|
||||||
|
unsigned int i;
|
||||||
|
|
||||||
|
for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji); ++i)
|
||||||
|
{
|
||||||
|
char16_t val;
|
||||||
|
in_uint16_le(s, val);
|
||||||
|
if (val != utf16_simple_test_with_emoji[i])
|
||||||
|
{
|
||||||
|
ck_abort_msg("test_out_utf8_as_utf16_le: "
|
||||||
|
"Index %u expected %x, got %x",
|
||||||
|
i, utf16_simple_test_with_emoji[i], val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ck_assert_int_eq(s_check_end(s), 1);
|
||||||
|
|
||||||
|
free_stream(s);
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
START_TEST(test_in_utf16_le_fixed_as_utf8)
|
||||||
|
{
|
||||||
|
struct stream *s;
|
||||||
|
make_stream(s);
|
||||||
|
init_stream(s, 8192);
|
||||||
|
|
||||||
|
// Write the stream without a terminator
|
||||||
|
unsigned int i;
|
||||||
|
for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji) - 1; ++i)
|
||||||
|
{
|
||||||
|
out_uint16_le(s, utf16_simple_test_with_emoji[i]);
|
||||||
|
}
|
||||||
|
s_mark_end(s);
|
||||||
|
|
||||||
|
// Rewind the stream
|
||||||
|
s->p = s->data;
|
||||||
|
|
||||||
|
char buff[256];
|
||||||
|
unsigned int len;
|
||||||
|
|
||||||
|
// Check the length call
|
||||||
|
len = in_utf16_le_fixed_as_utf8_length(s, i);
|
||||||
|
ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));
|
||||||
|
|
||||||
|
// Now read the string, checking for the same length
|
||||||
|
unsigned int read_len;
|
||||||
|
read_len = in_utf16_le_fixed_as_utf8(s, i, buff, sizeof(buff));
|
||||||
|
ck_assert_int_eq(len, read_len);
|
||||||
|
|
||||||
|
// Should be at the end of the buffer
|
||||||
|
ck_assert_int_eq(s_check_end(s), 1);
|
||||||
|
|
||||||
|
// Check the contents are as expected
|
||||||
|
int cmp = memcmp(buff, utf8_simple_test_with_emoji,
|
||||||
|
sizeof(utf8_simple_test_with_emoji));
|
||||||
|
ck_assert_int_eq(cmp, 0);
|
||||||
|
|
||||||
|
free_stream(s);
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
START_TEST(test_in_utf16_le_terminated_as_utf8)
|
||||||
|
{
|
||||||
|
struct stream *s;
|
||||||
|
make_stream(s);
|
||||||
|
init_stream(s, 8192);
|
||||||
|
|
||||||
|
// Write the stream with the terminator
|
||||||
|
unsigned int i;
|
||||||
|
for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji); ++i)
|
||||||
|
{
|
||||||
|
out_uint16_le(s, utf16_simple_test_with_emoji[i]);
|
||||||
|
}
|
||||||
|
s_mark_end(s);
|
||||||
|
|
||||||
|
// Rewind the stream
|
||||||
|
s->p = s->data;
|
||||||
|
|
||||||
|
char buff[256];
|
||||||
|
unsigned int len;
|
||||||
|
|
||||||
|
// Check the length call
|
||||||
|
len = in_utf16_le_terminated_as_utf8_length(s);
|
||||||
|
ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));
|
||||||
|
|
||||||
|
// Now read the string, checking for the same length
|
||||||
|
unsigned int read_len;
|
||||||
|
read_len = in_utf16_le_terminated_as_utf8(s, buff, sizeof(buff));
|
||||||
|
ck_assert_int_eq(len, read_len);
|
||||||
|
|
||||||
|
// Should be at the end of the buffer
|
||||||
|
ck_assert_int_eq(s_check_end(s), 1);
|
||||||
|
|
||||||
|
// Check the contents are as expected
|
||||||
|
int cmp = memcmp(buff, utf8_simple_test_with_emoji,
|
||||||
|
sizeof(utf8_simple_test_with_emoji));
|
||||||
|
ck_assert_int_eq(cmp, 0);
|
||||||
|
|
||||||
|
free_stream(s);
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
START_TEST(test_in_utf16_le_significant_chars)
|
||||||
|
{
|
||||||
|
struct stream *s;
|
||||||
|
make_stream(s);
|
||||||
|
init_stream(s, 8192);
|
||||||
|
|
||||||
|
const struct
|
||||||
|
{
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
char16_t high; // Set to 0 for a single UTF-16 word
|
||||||
|
char16_t low;
|
||||||
|
} pair;
|
||||||
|
char32_t expected;
|
||||||
|
} tests[] =
|
||||||
|
{
|
||||||
|
// Single high surrogates are bad
|
||||||
|
{ { 0, 0xd800 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xdbff }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
// Single low surrogates are bad
|
||||||
|
{ { 0, 0xdc00 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xdfff }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
// Values before and after surrogate range
|
||||||
|
{ { 0, 0xd7ff }, 0xd7ff },
|
||||||
|
{ { 0, 0xe000 }, 0xe000 },
|
||||||
|
// First and last non-surrogate pair values (don't use
|
||||||
|
// 0xfffe and 0xffff for this test as they are non-characters,
|
||||||
|
// and 0xfffd is the replacement character)
|
||||||
|
{ { 0, 0 }, 0 },
|
||||||
|
{ { 0, 0xfffc }, 0xfffc },
|
||||||
|
{ { 0, 0xfffd }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
// First and last surrogate pair values (don't use
|
||||||
|
// 0x10fffe and 0x10ffff for this test as they are non-characters)
|
||||||
|
{ { 0xd800, 0xdc00 }, 0x10000 },
|
||||||
|
{ { 0xdbff, 0xdffd }, 0x10fffd },
|
||||||
|
// End-of-plane non-characters (BMP) and the characters before them
|
||||||
|
{ { 0xd83f, 0xdffd }, 0x1fffd },
|
||||||
|
{ { 0xd83f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFE
|
||||||
|
{ { 0xd83f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFF
|
||||||
|
{ { 0xd87f, 0xdffd }, 0x2fffd },
|
||||||
|
{ { 0xd87f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFE
|
||||||
|
{ { 0xd87f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFF
|
||||||
|
{ { 0xd8bf, 0xdffd }, 0x3fffd },
|
||||||
|
{ { 0xd8bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFE
|
||||||
|
{ { 0xd8bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFF
|
||||||
|
{ { 0xd8ff, 0xdffd }, 0x4fffd },
|
||||||
|
{ { 0xd8ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFE
|
||||||
|
{ { 0xd8ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFF
|
||||||
|
{ { 0xd93f, 0xdffd }, 0x5fffd },
|
||||||
|
{ { 0xd93f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFE
|
||||||
|
{ { 0xd93f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFF
|
||||||
|
{ { 0xd97f, 0xdffd }, 0x6fffd },
|
||||||
|
{ { 0xd97f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFE
|
||||||
|
{ { 0xd97f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFF
|
||||||
|
{ { 0xd9bf, 0xdffd }, 0x7fffd },
|
||||||
|
{ { 0xd9bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFE
|
||||||
|
{ { 0xd9bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFF
|
||||||
|
{ { 0xd9ff, 0xdffd }, 0x8fffd },
|
||||||
|
{ { 0xd9ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFE
|
||||||
|
{ { 0xd9ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFF
|
||||||
|
{ { 0xda3f, 0xdffd }, 0x9fffd },
|
||||||
|
{ { 0xda3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFE
|
||||||
|
{ { 0xda3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFF
|
||||||
|
{ { 0xda7f, 0xdffd }, 0xafffd },
|
||||||
|
{ { 0xda7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFE
|
||||||
|
{ { 0xda7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFF
|
||||||
|
{ { 0xdabf, 0xdffd }, 0xbfffd },
|
||||||
|
{ { 0xdabf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFE
|
||||||
|
{ { 0xdabf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFF
|
||||||
|
{ { 0xdaff, 0xdffd }, 0xcfffd },
|
||||||
|
{ { 0xdaff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFE
|
||||||
|
{ { 0xdaff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFF
|
||||||
|
{ { 0xdb3f, 0xdffd }, 0xdfffd },
|
||||||
|
{ { 0xdb3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFE
|
||||||
|
{ { 0xdb3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFF
|
||||||
|
{ { 0xdb7f, 0xdffd }, 0xefffd },
|
||||||
|
{ { 0xdb7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFE
|
||||||
|
{ { 0xdb7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFF
|
||||||
|
{ { 0xdbbf, 0xdffd }, 0xffffd },
|
||||||
|
{ { 0xdbbf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFE
|
||||||
|
{ { 0xdbbf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFF
|
||||||
|
{ { 0xdbff, 0xdffd }, 0x10fffd },
|
||||||
|
{ { 0xdbff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFE
|
||||||
|
{ { 0xdbff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFF
|
||||||
|
// Non-characters in "Arabic Presentation Forms-A"
|
||||||
|
{ { 0, 0xfdd0 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd1 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd2 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd3 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd4 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd5 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd6 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd7 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd8 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdd9 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdda }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfddb }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfddc }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfddd }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdde }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfddf }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde0 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde1 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde2 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde3 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde4 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde5 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde6 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde7 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde8 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfde9 }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdea }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdeb }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdec }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfded }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdee }, UCS_REPLACEMENT_CHARACTER },
|
||||||
|
{ { 0, 0xfdef }, UCS_REPLACEMENT_CHARACTER }
|
||||||
|
};
|
||||||
|
|
||||||
|
unsigned int i;
|
||||||
|
for (i = 0; i < ELEMENTS(tests); ++i)
|
||||||
|
{
|
||||||
|
char buff[256];
|
||||||
|
unsigned int word_count;
|
||||||
|
init_stream(s, 8192);
|
||||||
|
|
||||||
|
word_count = 0;
|
||||||
|
if (tests[i].pair.high != 0)
|
||||||
|
{
|
||||||
|
out_uint16_le(s, tests[i].pair.high);
|
||||||
|
++word_count;
|
||||||
|
}
|
||||||
|
out_uint16_le(s, tests[i].pair.low);
|
||||||
|
++word_count;
|
||||||
|
s_mark_end(s);
|
||||||
|
|
||||||
|
// Rewind the stream
|
||||||
|
s->p = s->data;
|
||||||
|
|
||||||
|
// Read in one UTF-16 LE character as UTF-32
|
||||||
|
in_utf16_le_fixed_as_utf8(s, word_count, buff, sizeof(buff));
|
||||||
|
const char *p = buff;
|
||||||
|
char32_t c32 = utf8_get_next_char(&p, NULL);
|
||||||
|
|
||||||
|
if (c32 != tests[i].expected)
|
||||||
|
{
|
||||||
|
ck_abort_msg("test_in_utf16_le_significant_chars: "
|
||||||
|
"Index %u for {%x, %x}, expected %x, got %x",
|
||||||
|
i, tests[i].pair.high, tests[i].pair.low,
|
||||||
|
tests[i].expected, c32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
free_stream(s);
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
|
||||||
|
Suite *
|
||||||
|
make_suite_test_parse(void)
|
||||||
|
{
|
||||||
|
Suite *s;
|
||||||
|
TCase *tc_unicode;
|
||||||
|
|
||||||
|
s = suite_create("Parse");
|
||||||
|
|
||||||
|
tc_unicode = tcase_create("Unicode");
|
||||||
|
suite_add_tcase(s, tc_unicode);
|
||||||
|
tcase_add_test(tc_unicode, test_out_utf8_as_utf16_le);
|
||||||
|
tcase_add_test(tc_unicode, test_in_utf16_le_fixed_as_utf8);
|
||||||
|
tcase_add_test(tc_unicode, test_in_utf16_le_terminated_as_utf8);
|
||||||
|
tcase_add_test(tc_unicode, test_in_utf16_le_significant_chars);
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user