xrdp/tests/common/test_parse.c
matt335672 0758fe03a6 Add UTF-16 LE I/O routines
These are intended to replace UTF-16 uses of mbstowcs() / wcstombs()
2023-10-18 10:07:49 +01:00

320 lines
11 KiB
C

#if defined(HAVE_CONFIG_H)
#include "config_ac.h"
#endif
#include "arch.h"
#include "os_calls.h"
#include "string_calls.h"
#include "parse.h"
#include "test_common.h"
/* Number of elements in an array object (must not be used on a pointer) */
#define ELEMENTS(x) (sizeof(x) / sizeof((x)[0]))
/*
 * UTF-8 reference string: 12 ASCII characters followed by one 4-byte
 * sequence for a character outside the Basic Multilingual Plane.
 */
static const char
utf8_simple_test_with_emoji[] =
    "Simple Test."
    "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face
/*
 * UTF-16 form of the same reference string. The final character needs a
 * surrogate pair, so the array holds 12 + 2 words plus the terminator.
 */
static const char16_t
utf16_simple_test_with_emoji[] =
{
    'S', 'i', 'm', 'p', 'l', 'e', ' ', 'T', 'e', 's', 't', '.',
    0xd83d, 0xde25, // U+1F625
    0 // terminator
};
/******************************************************************************/
/*
 * Writes the UTF-8 reference string (terminator included) to a stream with
 * out_utf8_as_utf16_le(), then reads the stream back one 16-bit
 * little-endian word at a time and compares each word with the UTF-16
 * reference array.
 */
START_TEST(test_out_utf8_as_utf16_le)
{
    struct stream *s;
    make_stream(s);
    init_stream(s, 8192);

    // sizeof() rather than a string length, so the terminator is passed too
    out_utf8_as_utf16_le(s, utf8_simple_test_with_emoji,
                         sizeof(utf8_simple_test_with_emoji));
    s_mark_end(s);

    // Rewind the stream
    s->p = s->data;

    unsigned int i;
    for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji); ++i)
    {
        char16_t val;
        in_uint16_le(s, val);
        if (val != utf16_simple_test_with_emoji[i])
        {
            // %x takes an unsigned int, so convert the char16_t values
            // explicitly instead of relying on integer promotion
            ck_abort_msg("test_out_utf8_as_utf16_le: "
                         "Index %u expected %x, got %x",
                         i,
                         (unsigned int)utf16_simple_test_with_emoji[i],
                         (unsigned int)val);
        }
    }

    // Should be at the end of the buffer
    ck_assert_int_eq(s_check_end(s), 1);
    free_stream(s);
}
END_TEST
/******************************************************************************/
/*
 * Exercises the fixed-length UTF-16 LE reader. The UTF-16 reference words
 * (minus the terminator) are written to a stream; the test then checks the
 * length reported by in_utf16_le_fixed_as_utf8_length(), the length
 * returned by in_utf16_le_fixed_as_utf8(), the final stream position and
 * the converted bytes.
 */
START_TEST(test_in_utf16_le_fixed_as_utf8)
{
    // Words to transfer: everything except the trailing terminator
    const unsigned int n_words = ELEMENTS(utf16_simple_test_with_emoji) - 1;
    char utf8_out[256];
    unsigned int len;
    unsigned int read_len;
    unsigned int w = 0;
    struct stream *s;

    make_stream(s);
    init_stream(s, 8192);

    while (w < n_words)
    {
        out_uint16_le(s, utf16_simple_test_with_emoji[w]);
        ++w;
    }
    s_mark_end(s);
    s->p = s->data; // back to the start, ready for reading

    // The reported length must match the UTF-8 reference, whose sizeof()
    // counts the terminator
    len = in_utf16_le_fixed_as_utf8_length(s, n_words);
    ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));

    // The conversion itself must report that same length...
    read_len = in_utf16_le_fixed_as_utf8(s, n_words,
                                         utf8_out, sizeof(utf8_out));
    ck_assert_int_eq(len, read_len);

    // ...and leave the stream at the end of the buffer
    ck_assert_int_eq(s_check_end(s), 1);

    // Converted bytes (terminator included) must equal the reference
    int cmp = memcmp(utf8_out, utf8_simple_test_with_emoji,
                     sizeof(utf8_simple_test_with_emoji));
    ck_assert_int_eq(cmp, 0);

    free_stream(s);
}
END_TEST
/******************************************************************************/
/*
 * Exercises the terminated UTF-16 LE reader. The full UTF-16 reference
 * array, terminator included, is written to a stream; the test then checks
 * the length reported by in_utf16_le_terminated_as_utf8_length(), the
 * length returned by in_utf16_le_terminated_as_utf8(), the final stream
 * position and the converted bytes.
 */
START_TEST(test_in_utf16_le_terminated_as_utf8)
{
    // Every word of the reference array, terminator included
    const unsigned int n_words = ELEMENTS(utf16_simple_test_with_emoji);
    char utf8_out[256];
    unsigned int len;
    unsigned int read_len;
    struct stream *s;

    make_stream(s);
    init_stream(s, 8192);

    for (unsigned int w = 0; w < n_words; ++w)
    {
        out_uint16_le(s, utf16_simple_test_with_emoji[w]);
    }
    s_mark_end(s);
    s->p = s->data; // back to the start, ready for reading

    // The reported length must match the UTF-8 reference, whose sizeof()
    // counts the terminator
    len = in_utf16_le_terminated_as_utf8_length(s);
    ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji));

    // The conversion itself must report that same length...
    read_len = in_utf16_le_terminated_as_utf8(s, utf8_out, sizeof(utf8_out));
    ck_assert_int_eq(len, read_len);

    // ...and leave the stream at the end of the buffer
    ck_assert_int_eq(s_check_end(s), 1);

    // Converted bytes (terminator included) must equal the reference
    int cmp = memcmp(utf8_out, utf8_simple_test_with_emoji,
                     sizeof(utf8_simple_test_with_emoji));
    ck_assert_int_eq(cmp, 0);

    free_stream(s);
}
END_TEST
/******************************************************************************/
/*
 * Table-driven check of how in_utf16_le_fixed_as_utf8() treats boundary
 * and invalid UTF-16 input. Each table entry is written to the stream as
 * one word (or two, when 'high' is non-zero) and converted to UTF-8. The
 * first character decoded from the result is then compared with the
 * expected code point.
 */
START_TEST(test_in_utf16_le_significant_chars)
{
    struct stream *s;
    make_stream(s);
    init_stream(s, 8192);

    const struct
    {
        struct
        {
            char16_t high; // Set to 0 for a single UTF-16 word
            char16_t low;
        } pair;
        char32_t expected;
    } tests[] =
    {
        // Single high surrogates are bad
        { { 0, 0xd800 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xdbff }, UCS_REPLACEMENT_CHARACTER },
        // Single low surrogates are bad
        { { 0, 0xdc00 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xdfff }, UCS_REPLACEMENT_CHARACTER },
        // Values before and after surrogate range
        { { 0, 0xd7ff }, 0xd7ff },
        { { 0, 0xe000 }, 0xe000 },
        // First and last non-surrogate pair values (don't use
        // 0xfffe and 0xffff for this test as they are non-characters,
        // and 0xfffd is the replacement character)
        { { 0, 0 }, 0 },
        { { 0, 0xfffc }, 0xfffc },
        { { 0, 0xfffd }, UCS_REPLACEMENT_CHARACTER },
        // First and last surrogate pair values (don't use
        // 0x10fffe and 0x10ffff for this test as they are non-characters)
        { { 0xd800, 0xdc00 }, 0x10000 },
        { { 0xdbff, 0xdffd }, 0x10fffd },
        // End-of-plane non-characters for the supplementary planes
        // (1 to 16), each preceded by the code point just before them
        { { 0xd83f, 0xdffd }, 0x1fffd },
        { { 0xd83f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFE
        { { 0xd83f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFF
        { { 0xd87f, 0xdffd }, 0x2fffd },
        { { 0xd87f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFE
        { { 0xd87f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFF
        { { 0xd8bf, 0xdffd }, 0x3fffd },
        { { 0xd8bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFE
        { { 0xd8bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFF
        { { 0xd8ff, 0xdffd }, 0x4fffd },
        { { 0xd8ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFE
        { { 0xd8ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFF
        { { 0xd93f, 0xdffd }, 0x5fffd },
        { { 0xd93f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFE
        { { 0xd93f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFF
        { { 0xd97f, 0xdffd }, 0x6fffd },
        { { 0xd97f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFE
        { { 0xd97f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFF
        { { 0xd9bf, 0xdffd }, 0x7fffd },
        { { 0xd9bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFE
        { { 0xd9bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFF
        { { 0xd9ff, 0xdffd }, 0x8fffd },
        { { 0xd9ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFE
        { { 0xd9ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFF
        { { 0xda3f, 0xdffd }, 0x9fffd },
        { { 0xda3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFE
        { { 0xda3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFF
        { { 0xda7f, 0xdffd }, 0xafffd },
        { { 0xda7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFE
        { { 0xda7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFF
        { { 0xdabf, 0xdffd }, 0xbfffd },
        { { 0xdabf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFE
        { { 0xdabf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFF
        { { 0xdaff, 0xdffd }, 0xcfffd },
        { { 0xdaff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFE
        { { 0xdaff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFF
        { { 0xdb3f, 0xdffd }, 0xdfffd },
        { { 0xdb3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFE
        { { 0xdb3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFF
        { { 0xdb7f, 0xdffd }, 0xefffd },
        { { 0xdb7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFE
        { { 0xdb7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFF
        { { 0xdbbf, 0xdffd }, 0xffffd },
        { { 0xdbbf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFE
        { { 0xdbbf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFF
        { { 0xdbff, 0xdffd }, 0x10fffd },
        { { 0xdbff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFE
        { { 0xdbff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFF
        // Non-characters in "Arabic Presentation Forms-A"
        { { 0, 0xfdd0 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd1 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd2 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd3 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd4 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd5 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd6 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd7 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd8 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdd9 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdda }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfddb }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfddc }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfddd }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdde }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfddf }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde0 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde1 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde2 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde3 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde4 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde5 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde6 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde7 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde8 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfde9 }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdea }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdeb }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdec }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfded }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdee }, UCS_REPLACEMENT_CHARACTER },
        { { 0, 0xfdef }, UCS_REPLACEMENT_CHARACTER }
    };

    unsigned int i;
    for (i = 0; i < ELEMENTS(tests); ++i)
    {
        char buff[256];
        unsigned int word_count;

        // Reset the stream pointers so each entry starts from empty
        init_stream(s, 8192);

        // Write the entry as either a single word or a surrogate pair
        word_count = 0;
        if (tests[i].pair.high != 0)
        {
            out_uint16_le(s, tests[i].pair.high);
            ++word_count;
        }
        out_uint16_le(s, tests[i].pair.low);
        ++word_count;
        s_mark_end(s);

        // Rewind the stream
        s->p = s->data;

        // Convert the UTF-16 LE input to UTF-8, then decode the first
        // UTF-8 character of the result as UTF-32 for comparison
        in_utf16_le_fixed_as_utf8(s, word_count, buff, sizeof(buff));
        const char *p = buff;
        char32_t c32 = utf8_get_next_char(&p, NULL);
        if (c32 != tests[i].expected)
        {
            // %x takes an unsigned int, so convert the char16_t and
            // char32_t values explicitly
            ck_abort_msg("test_in_utf16_le_significant_chars: "
                         "Index %u for {%x, %x}, expected %x, got %x",
                         i,
                         (unsigned int)tests[i].pair.high,
                         (unsigned int)tests[i].pair.low,
                         (unsigned int)tests[i].expected,
                         (unsigned int)c32);
        }
    }

    free_stream(s);
}
END_TEST
/******************************************************************************/
/*
 * Creates the "Parse" suite. It contains a single "Unicode" test case to
 * which the four UTF-16 LE stream tests in this file are added.
 */
Suite *
make_suite_test_parse(void)
{
    Suite *parse_suite = suite_create("Parse");
    TCase *unicode_case = tcase_create("Unicode");

    suite_add_tcase(parse_suite, unicode_case);

    // Tests are added in the order they are defined in this file
    tcase_add_test(unicode_case, test_out_utf8_as_utf16_le);
    tcase_add_test(unicode_case, test_in_utf16_le_fixed_as_utf8);
    tcase_add_test(unicode_case, test_in_utf16_le_terminated_as_utf8);
    tcase_add_test(unicode_case, test_in_utf16_le_significant_chars);

    return parse_suite;
}