#if defined(HAVE_CONFIG_H) #include "config_ac.h" #endif #include "arch.h" #include "os_calls.h" #include "string_calls.h" #include "parse.h" #include "test_common.h" #define ELEMENTS(x) (sizeof(x) / sizeof(x[0])) const static char utf8_simple_test_with_emoji[] = "Simple Test." "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face const static char16_t utf16_simple_test_with_emoji[] = { 'S', 'i', 'm', 'p', 'l', 'e', ' ', 'T', 'e', 's', 't', '.', 0xd83d, 0xde25, // U+1F625 0 // terminator }; /******************************************************************************/ START_TEST(test_out_utf8_as_utf16_le) { struct stream *s; make_stream(s); init_stream(s, 8192); out_utf8_as_utf16_le(s, utf8_simple_test_with_emoji, sizeof(utf8_simple_test_with_emoji)); // Include term s_mark_end(s); // Rewind the stream s->p = s->data; unsigned int i; for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji); ++i) { char16_t val; in_uint16_le(s, val); if (val != utf16_simple_test_with_emoji[i]) { ck_abort_msg("test_out_utf8_as_utf16_le: " "Index %u expected %x, got %x", i, utf16_simple_test_with_emoji[i], val); } } ck_assert_int_eq(s_check_end(s), 1); free_stream(s); } END_TEST /******************************************************************************/ START_TEST(test_in_utf16_le_fixed_as_utf8) { struct stream *s; make_stream(s); init_stream(s, 8192); // Write the stream without a terminator unsigned int i; for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji) - 1; ++i) { out_uint16_le(s, utf16_simple_test_with_emoji[i]); } s_mark_end(s); // Rewind the stream s->p = s->data; char buff[256]; unsigned int len; // Check the length call len = in_utf16_le_fixed_as_utf8_length(s, i); ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji)); // Now read the string, checking for the same length unsigned int read_len; read_len = in_utf16_le_fixed_as_utf8(s, i, buff, sizeof(buff)); ck_assert_int_eq(len, read_len); // Should be at the end of the buffer ck_assert_int_eq(s_check_end(s), 1); // Check the contents are as expected int cmp = memcmp(buff, utf8_simple_test_with_emoji, sizeof(utf8_simple_test_with_emoji)); ck_assert_int_eq(cmp, 0); free_stream(s); } END_TEST /******************************************************************************/ START_TEST(test_in_utf16_le_terminated_as_utf8) { struct stream *s; make_stream(s); init_stream(s, 8192); // Write the stream with the terminator unsigned int i; for (i = 0; i < ELEMENTS(utf16_simple_test_with_emoji); ++i) { out_uint16_le(s, utf16_simple_test_with_emoji[i]); } s_mark_end(s); // Rewind the stream s->p = s->data; char buff[256]; unsigned int len; // Check the length call len = in_utf16_le_terminated_as_utf8_length(s); ck_assert_int_eq(len, sizeof(utf8_simple_test_with_emoji)); // Now read the string, checking for the same length unsigned int read_len; read_len = in_utf16_le_terminated_as_utf8(s, buff, sizeof(buff)); ck_assert_int_eq(len, read_len); // Should be at the end of the buffer ck_assert_int_eq(s_check_end(s), 1); // Check the contents are as expected int cmp = memcmp(buff, utf8_simple_test_with_emoji, sizeof(utf8_simple_test_with_emoji)); ck_assert_int_eq(cmp, 0); free_stream(s); } END_TEST /******************************************************************************/ START_TEST(test_in_utf16_le_significant_chars) { struct stream *s; make_stream(s); init_stream(s, 8192); const struct { struct { char16_t high; // Set to 0 for a single UTF-16 word char16_t low; } pair; char32_t expected; } tests[] = { // Single high surrogates are bad { { 0, 0xd800 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xdbff }, UCS_REPLACEMENT_CHARACTER }, // Single low surrogates are bad { { 0, 0xdc00 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // Values before and after surrogate range { { 0, 0xd7ff }, 0xd7ff }, { { 0, 0xe000 }, 0xe000 }, // First and last non-surrogate pair values (don't use // 0xfffe and 0xffff for this test as they are non-characters, // and 0xfffd is the replacement character) { { 0, 0 }, 0 }, { { 0, 0xfffc }, 0xfffc }, { { 0, 0xfffd }, UCS_REPLACEMENT_CHARACTER }, // First and last surrogate pair values (don't use // 0x10fffe and 0x10ffff for this test as they are non-characters) { { 0xd800, 0xdc00 }, 0x10000 }, { { 0xdbff, 0xdffd }, 0x10fffd }, // End-of-plane non-characters (BMP) and the characters before them { { 0xd83f, 0xdffd }, 0x1fffd }, { { 0xd83f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFE { { 0xd83f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+1FFFF { { 0xd87f, 0xdffd }, 0x2fffd }, { { 0xd87f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFE { { 0xd87f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+2FFFF { { 0xd8bf, 0xdffd }, 0x3fffd }, { { 0xd8bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFE { { 0xd8bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+3FFFF { { 0xd8ff, 0xdffd }, 0x4fffd }, { { 0xd8ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFE { { 0xd8ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+4FFFF { { 0xd93f, 0xdffd }, 0x5fffd }, { { 0xd93f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFE { { 0xd93f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+5FFFF { { 0xd97f, 0xdffd }, 0x6fffd }, { { 0xd97f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFE { { 0xd97f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+6FFFF { { 0xd9bf, 0xdffd }, 0x7fffd }, { { 0xd9bf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFE { { 0xd9bf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+7FFFF { { 0xd9ff, 0xdffd }, 0x8fffd }, { { 0xd9ff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFE { { 0xd9ff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+8FFFF { { 0xda3f, 0xdffd }, 0x9fffd }, { { 0xda3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFE { { 0xda3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+9FFFF { { 0xda7f, 0xdffd }, 0xafffd }, { { 0xda7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFE { { 0xda7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+AFFFF { { 0xdabf, 0xdffd }, 0xbfffd }, { { 0xdabf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFE { { 0xdabf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+BFFFF { { 0xdaff, 0xdffd }, 0xcfffd }, { { 0xdaff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFE { { 0xdaff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+CFFFF { { 0xdb3f, 0xdffd }, 0xdfffd }, { { 0xdb3f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFE { { 0xdb3f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+DFFFF { { 0xdb7f, 0xdffd }, 0xefffd }, { { 0xdb7f, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFE { { 0xdb7f, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+EFFFF { { 0xdbbf, 0xdffd }, 0xffffd }, { { 0xdbbf, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFE { { 0xdbbf, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+FFFFF { { 0xdbff, 0xdffd }, 0x10fffd }, { { 0xdbff, 0xdffe }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFE { { 0xdbff, 0xdfff }, UCS_REPLACEMENT_CHARACTER }, // U+10FFFF // Non-characters in "Arabic Presentation Forms-A" { { 0, 0xfdd0 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd1 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd2 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd3 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd4 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd5 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd6 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd7 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd8 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdd9 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdda }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfddb }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfddc }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfddd }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdde }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfddf }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde0 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde1 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde2 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde3 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde4 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde5 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde6 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde7 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde8 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfde9 }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdea }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdeb }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdec }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfded }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdee }, UCS_REPLACEMENT_CHARACTER }, { { 0, 0xfdef }, UCS_REPLACEMENT_CHARACTER } }; unsigned int i; for (i = 0; i < ELEMENTS(tests); ++i) { char buff[256]; unsigned int word_count; init_stream(s, 8192); word_count = 0; if (tests[i].pair.high != 0) { out_uint16_le(s, tests[i].pair.high); ++word_count; } out_uint16_le(s, tests[i].pair.low); ++word_count; s_mark_end(s); // Rewind the stream s->p = s->data; // Read in one UTF-16 LE character as UTF-32 in_utf16_le_fixed_as_utf8(s, word_count, buff, sizeof(buff)); const char *p = buff; char32_t c32 = utf8_get_next_char(&p, NULL); if (c32 != tests[i].expected) { ck_abort_msg("test_in_utf16_le_significant_chars: " "Index %u for {%x, %x}, expected %x, got %x", i, tests[i].pair.high, tests[i].pair.low, tests[i].expected, c32); } } free_stream(s); } END_TEST /******************************************************************************/ Suite * make_suite_test_parse(void) { Suite *s; TCase *tc_unicode; s = suite_create("Parse"); tc_unicode = tcase_create("Unicode"); suite_add_tcase(s, tc_unicode); tcase_add_test(tc_unicode, test_out_utf8_as_utf16_le); tcase_add_test(tc_unicode, test_in_utf16_le_fixed_as_utf8); tcase_add_test(tc_unicode, test_in_utf16_le_terminated_as_utf8); tcase_add_test(tc_unicode, test_in_utf16_le_significant_chars); return s; }