Add UTF-8 / UTF-32 conversion routines
These are intended to replace non-UTF-16 uses of mbstowcs() / wcstombs()
This commit is contained in:
parent
da8fa059cd
commit
0463e552dc
@ -68,6 +68,7 @@ libcommon_la_SOURCES = \
|
||||
thread_calls.h \
|
||||
trans.c \
|
||||
trans.h \
|
||||
unicode_defines.h \
|
||||
$(PIXMAN_SOURCES)
|
||||
|
||||
libcommon_la_LIBADD = \
|
||||
|
@ -46,6 +46,17 @@ typedef unsigned long uintptr_t;
|
||||
|
||||
typedef int bool_t;
|
||||
|
||||
// Define Unicode character types
|
||||
#if defined(HAVE_UCHAR_H)
|
||||
#include <uchar.h>
|
||||
#elif defined(HAVE_STDINT_H)
|
||||
typedef uint_least16_t char16_t;
|
||||
typedef uint_least32_t char32_t;
|
||||
#else
|
||||
typedef uint16_t char16_t;
|
||||
typedef uint32_t char32_t;
|
||||
#endif
|
||||
|
||||
/* you can define L_ENDIAN or B_ENDIAN and NEED_ALIGN or NO_NEED_ALIGN
|
||||
in the makefile to override */
|
||||
|
||||
|
@ -27,11 +27,11 @@
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
|
||||
|
||||
#include "log.h"
|
||||
#include "os_calls.h"
|
||||
#include "string_calls.h"
|
||||
#include "defines.h"
|
||||
#include "unicode_defines.h"
|
||||
|
||||
unsigned int
|
||||
g_format_info_string(char *dest, unsigned int len,
|
||||
@ -1288,3 +1288,305 @@ g_sig2text(int signum, char sigstr[])
|
||||
g_snprintf(sigstr, MAXSTRSIGLEN, "SIG#%d", signum);
|
||||
return sigstr;
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
char32_t
|
||||
utf8_get_next_char(const char **utf8str_ref, unsigned int *len_ref)
|
||||
{
|
||||
/*
|
||||
* Macro used to parse a continuation character
|
||||
* @param cp Character Pointer (incremented on success)
|
||||
* @param end One character past end of input string
|
||||
* @param value The value we're constructing
|
||||
* @param finish_label Where to go in the event of an error */
|
||||
#define PARSE_CONTINUATION_CHARACTER(cp, end, value, finish_label) \
|
||||
{ \
|
||||
/* Error if we're out of data, or this char isn't a continuation */ \
|
||||
if (cp == end || !IS_VALID_CONTINUATION_CHAR(*cp)) \
|
||||
{ \
|
||||
value = UCS_REPLACEMENT_CHARACTER; \
|
||||
goto finish_label; \
|
||||
} \
|
||||
value = (value) << 6 | (*cp & 0x3f); \
|
||||
++cp; \
|
||||
}
|
||||
|
||||
char32_t rv;
|
||||
|
||||
/* Easier to work with unsigned chars and no indirection */
|
||||
const unsigned char *cp = (const unsigned char *)*utf8str_ref;
|
||||
const unsigned char *end = (len_ref != NULL) ? cp + *len_ref : cp + 6;
|
||||
|
||||
if (cp == end)
|
||||
{
|
||||
return 0; // Pathological case
|
||||
}
|
||||
|
||||
unsigned int c0 = *cp++;
|
||||
|
||||
if (c0 < 0x80)
|
||||
{
|
||||
rv = c0;
|
||||
}
|
||||
else if (c0 < 0xc0)
|
||||
{
|
||||
/* Unexpected continuation character */
|
||||
rv = UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
else if (c0 < 0xe0)
|
||||
{
|
||||
/* Valid start character for sequence of length 2
|
||||
* U-00000080 – U-000007FF */
|
||||
rv = (c0 & 0x1f);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
|
||||
if (rv < 0x80 || INVALID_UNICODE_80_TO_7FF(rv))
|
||||
{
|
||||
rv = UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
}
|
||||
else if (c0 < 0xf0)
|
||||
{
|
||||
/* Valid start character for sequence of length 3
|
||||
* U-00000800 – U-0000FFFF */
|
||||
rv = (c0 & 0xf);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
if (rv < 0x800 || INVALID_UNICODE_800_TO_FFFF(rv))
|
||||
{
|
||||
rv = UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
}
|
||||
else if (c0 < 0xf8)
|
||||
{
|
||||
/* Valid start character for sequence of length 4
|
||||
* U-00010000 – U-0001FFFFF */
|
||||
rv = (c0 & 0x7);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
if (rv < 0x10000 || INVALID_UNICODE_10000_TO_1FFFFF(rv))
|
||||
{
|
||||
rv = UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
}
|
||||
else if (c0 < 0xfc)
|
||||
{
|
||||
/* Valid start character for sequence of length 5
|
||||
* U-00200000 – U-03FFFFFF */
|
||||
rv = (c0 & 0x3);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
|
||||
// These values are currently unsupported
|
||||
rv = UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
|
||||
else if (c0 < 0xfe)
|
||||
{
|
||||
/* Valid start character for sequence of length 6
|
||||
* U-04000000 – U-7FFFFFFF */
|
||||
rv = (c0 & 0x1);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
PARSE_CONTINUATION_CHARACTER(cp, end, rv, finish);
|
||||
|
||||
// These values are currently unsupported
|
||||
rv = UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Invalid characters
|
||||
rv = UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
|
||||
finish:
|
||||
|
||||
if (len_ref)
|
||||
{
|
||||
*len_ref -= ((const char *)cp - *utf8str_ref);
|
||||
}
|
||||
*utf8str_ref = (const char *)cp;
|
||||
|
||||
return rv;
|
||||
#undef PARSE_CONTINUATION_CHARACTER
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
unsigned int
|
||||
utf_char32_to_utf8(char32_t c32, char *u8str)
|
||||
{
|
||||
unsigned int rv;
|
||||
|
||||
if (INVALID_UNICODE(c32))
|
||||
{
|
||||
c32 = UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
|
||||
if (c32 < 0x80)
|
||||
{
|
||||
rv = 1;
|
||||
if (u8str != NULL)
|
||||
{
|
||||
u8str[0] = (char)c32;
|
||||
}
|
||||
}
|
||||
else if (c32 < 0x800)
|
||||
{
|
||||
rv = 2;
|
||||
// 11 bits. Five in first byte, six in second
|
||||
if (u8str != NULL)
|
||||
{
|
||||
u8str[1] = (c32 & 0x3f) | 0x80;
|
||||
c32 >>= 6;
|
||||
u8str[0] = (c32 & 0x1f) | 0xc0;
|
||||
}
|
||||
}
|
||||
else if (c32 < 0xffff)
|
||||
{
|
||||
rv = 3;
|
||||
// 16 bits. Four in first byte, six in second and third
|
||||
if (u8str != NULL)
|
||||
{
|
||||
u8str[2] = (c32 & 0x3f) | 0x80;
|
||||
c32 >>= 6;
|
||||
u8str[1] = (c32 & 0x3f) | 0x80;
|
||||
c32 >>= 6;
|
||||
u8str[0] = (c32 & 0xf) | 0xe0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
rv = 4;
|
||||
// 21 bits. Three in first byte, six in second, third and fourth
|
||||
if (u8str != NULL)
|
||||
{
|
||||
u8str[3] = (c32 & 0x3f) | 0x80;
|
||||
c32 >>= 6;
|
||||
u8str[2] = (c32 & 0x3f) | 0x80;
|
||||
c32 >>= 6;
|
||||
u8str[1] = (c32 & 0x3f) | 0x80;
|
||||
c32 >>= 6;
|
||||
u8str[0] = (c32 & 0x7) | 0xf0;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/*
 * Count the Unicode characters in a '\0'-terminated UTF-8 string
 *
 * A NULL string contains no characters. The terminator is not counted.
 */
unsigned int
utf8_char_count(const char *utf8str)
{
    unsigned int count = 0;

    if (utf8str == NULL)
    {
        return 0;
    }

    /* The decoder returns 0 when it reads the terminator */
    while (utf8_get_next_char(&utf8str, NULL) != 0)
    {
        ++count;
    }

    return count;
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/*
 * Count the UTF-16 words needed to hold 'len' bytes of UTF-8
 *
 * Every decoded character counts, including replacement characters
 * produced for malformed input.
 */
unsigned int
utf8_as_utf16_word_count(const char *utf8str, unsigned int len)
{
    unsigned int words = 0;

    /* The decoder reduces len by the number of bytes it consumes */
    while (len > 0)
    {
        if (utf8_get_next_char(&utf8str, &len) >= 0x10000)
        {
            // Characters not in the BMP (i.e. over 0xffff) need a
            // high/low surrogate pair
            words += 2;
        }
        else
        {
            ++words;
        }
    }

    return words;
}
|
||||
|
||||
/*****************************************************************************/
|
||||
int
|
||||
utf8_add_char_at(char *utf8str, unsigned int len, char32_t c32,
|
||||
unsigned int index)
|
||||
{
|
||||
int rv = 0;
|
||||
|
||||
char c8[MAXLEN_UTF8_CHAR];
|
||||
unsigned int c8len = utf_char32_to_utf8(c32, c8);
|
||||
|
||||
// Find out where to insert the character
|
||||
char *insert_pos = utf8str;
|
||||
|
||||
while (index > 0 && *insert_pos != '\0')
|
||||
{
|
||||
utf8_get_next_char((const char **)&insert_pos, NULL);
|
||||
--index;
|
||||
}
|
||||
|
||||
// Did we get to where we need to be?
|
||||
if (index == 0)
|
||||
{
|
||||
unsigned int bytes_to_move = strlen(insert_pos) + 1; // Include terminator
|
||||
// Is there room to insert the character?
|
||||
//
|
||||
// <----------- len ---------->
|
||||
// <--> (bytes_to_move)
|
||||
// +----------------------------+
|
||||
// |ABCDEFGHIJLMN\0 |
|
||||
// +----------------------------+
|
||||
// ^ ^
|
||||
// +-utf8str +-insert_pos
|
||||
//
|
||||
if ((insert_pos - utf8str) + bytes_to_move + c8len <= len)
|
||||
{
|
||||
memmove(insert_pos + c8len, insert_pos, bytes_to_move);
|
||||
memcpy(insert_pos, c8, c8len);
|
||||
rv = 1;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/*
 * Remove a Unicode character from a '\0'-terminated UTF-8 string
 *
 * utf8str  String to modify
 * index    Character (not byte) position of the character to remove
 *          (0-based)
 *
 * Returns the character removed. Returns 0, leaving the string untouched,
 * if there is no character at 'index'. This includes the case where
 * 'index' addresses the terminator.
 */
char32_t
utf8_remove_char_at(char *utf8str, unsigned int index)
{
    char32_t rv = 0;
    const char *remove_pos = utf8str;

    // Find out where to remove the character
    while (index > 0 && *remove_pos != '\0')
    {
        utf8_get_next_char(&remove_pos, NULL);
        --index;
    }

    // Did we get to where we need to be, and is there a character there?
    //
    // The terminator must not be 'removed'. The decoder would step over
    // it, and we would then read whatever follows the string in memory.
    if (index == 0 && *remove_pos != '\0')
    {
        // Find the position after the character
        const char *after_pos = remove_pos;
        rv = utf8_get_next_char(&after_pos, NULL);

        // Move everything up, including the terminator
        memmove(utf8str + (remove_pos - utf8str), after_pos,
                strlen(after_pos) + 1);
    }

    return rv;
}
|
||||
|
@ -87,6 +87,15 @@ enum
|
||||
MAXSTRSIGLEN = (3 + 1 + 1 + ((sizeof(int) * 5 + 1) / 2) + 1)
|
||||
};
|
||||
|
||||
/*
|
||||
* Significant Universal Character Set (Unicode) characters
|
||||
*/
|
||||
enum
|
||||
{
|
||||
UCS_WHITE_SQUARE = 0x25a1,
|
||||
UCS_REPLACEMENT_CHARACTER = 0xfffd
|
||||
};
|
||||
|
||||
/**
|
||||
* Processes a format string for general info
|
||||
*
|
||||
@ -317,4 +326,76 @@ int g_strtrim(char *str, int trim_flags);
|
||||
* The string "SIG#<num>" is returned for unrecognised signums
|
||||
*/
|
||||
char *g_sig2text(int signum, char sigstr[]);
|
||||
|
||||
/**
|
||||
* Get the next Unicode character from a UTF-8 string
|
||||
*
|
||||
* @param utf8str_ref UTF 8 string [by reference]
|
||||
* @param len_ref Length of string [by reference] or NULL
|
||||
* @return Unicode character
|
||||
*
|
||||
* On return, utf8str and len are updated to point past the decoded character.
|
||||
* Unrecognised characters are mapped to UCS_REPLACEMENT_CHARACTER
|
||||
*
|
||||
* len is not needed if your utf8str has a terminator, or is known to
|
||||
* be well-formed.
|
||||
*/
|
||||
char32_t
|
||||
utf8_get_next_char(const char **utf8str_ref, unsigned int *len_ref);
|
||||
|
||||
/**
|
||||
* Convert a Unicode character to UTF-8
|
||||
* @param c32 Unicode character
|
||||
* @param u8str buffer containing at least MAXLEN_UTF8_CHAR bytes for result.
|
||||
* Can be NULL if only the length is needed.
|
||||
* @return Number of bytes needed to encode c32 (written to u8str if not NULL)
|
||||
*
|
||||
* The bytes written to u8str are unterminated
|
||||
*/
|
||||
#define MAXLEN_UTF8_CHAR 4
|
||||
unsigned int
|
||||
utf_char32_to_utf8(char32_t c32, char *u8str);
|
||||
|
||||
/**
|
||||
* Returns the number of Unicode characters in a UTF-8 string
|
||||
* @param utf8str UTF-8 string
|
||||
* @result Number of Unicode characters in the string (terminator not included)
|
||||
*/
|
||||
unsigned int
|
||||
utf8_char_count(const char *utf8str);
|
||||
|
||||
/**
|
||||
* Returns the number of UTF-16 words required to store a UTF-8 string
|
||||
* @param utf8str UTF-8 string
|
||||
* @param len Length of UTF-8 string
|
||||
* @result number of words to store UTF-8 string as UTF-16.
|
||||
*/
|
||||
unsigned int
|
||||
utf8_as_utf16_word_count(const char *utf8str, unsigned int len);
|
||||
|
||||
/**
|
||||
* Add a Unicode character into a UTF-8 string
|
||||
* @param utf8str Pointer to UTF-8 string
|
||||
* @param len Length of buffer for UTF-8 string (includes the '\0' terminator)
|
||||
* @param c32 character to add
|
||||
* @param index Where to add the codepoint
|
||||
* @return 1 for success, 0 if no character was inserted
|
||||
*
|
||||
* This routine has to parse the string as it goes, so can be slow.
|
||||
*/
|
||||
int
|
||||
utf8_add_char_at(char *utf8str, unsigned int len, char32_t c32,
|
||||
unsigned int index);
|
||||
|
||||
/**
|
||||
* Remove a Unicode character from a UTF-8 string
|
||||
* @param utf8str Pointer to UTF-8 string
|
||||
* @param index Where to remove the codepoint from (0-based)
|
||||
* @return Character removed, or 0 if no character was removed
|
||||
*
|
||||
* This routine has to parse the string as it goes, so can be slow.
|
||||
*/
|
||||
char32_t
|
||||
utf8_remove_char_at(char *utf8str, unsigned int index);
|
||||
|
||||
#endif
|
||||
|
71
common/unicode_defines.h
Normal file
71
common/unicode_defines.h
Normal file
@ -0,0 +1,71 @@
|
||||
/**
|
||||
* xrdp: A Remote Desktop Protocol server.
|
||||
*
|
||||
* Copyright (C) Jay Sorg 2004-2023
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/**
|
||||
* @file common/unicode_defines.h
|
||||
*
|
||||
* Defines used internally by the implementations of the Unicode routines
|
||||
*/
|
||||
|
||||
#if !defined(UNICODE_DEFINES_H)
|
||||
#define UNICODE_DEFINES_H
|
||||
|
||||
/**
|
||||
* Is this byte a valid UTF-8 continuation character?
|
||||
*/
|
||||
#define IS_VALID_CONTINUATION_CHAR(c) ((c) >= 0x80 && (c) < 0xc0)
|
||||
|
||||
/**
|
||||
* Is this character one of the end-of-plane non-characters?
|
||||
*
|
||||
* These are U+xFFFE and U+xFFFF for x in {0x0..0x10}
|
||||
*/
|
||||
#define IS_PLANE_END_NON_CHARACTER(c32) (((c32) & 0xfffe) == 0xfffe)
|
||||
|
||||
/**
|
||||
* Is this character one of the additional non-characters?
|
||||
*
|
||||
* 32 additional non-characters are defined in the
|
||||
* "Arabic Presentation Forms-A" Unicode block */
|
||||
#define IS_ARABIC_NON_CHARACTER(c32) ((c32) >= 0xfdd0 && (c32) <= 0xfdef)
|
||||
|
||||
// Invalid characters, based on UTF-8 decoding range
|
||||
//
|
||||
// By 'invalid' we mean characters that should not be encoded or
|
||||
// decoded when switching between UTF-8 and UTF-32
|
||||
//
|
||||
// See "UTF-8 decoder capability and stress test" Markus Kuhn 2015-08-28
|
||||
#define INVALID_UNICODE_0_TO_7F(c) (0) // No invalid characters
|
||||
#define INVALID_UNICODE_80_TO_7FF(c) (0) // No invalid characters
|
||||
#define INVALID_UNICODE_800_TO_FFFF(c) \
|
||||
(((c) >= 0xd800 && (c) <= 0xdfff) || /* Surrogate pairs */ \
|
||||
IS_ARABIC_NON_CHARACTER(c) || \
|
||||
IS_PLANE_END_NON_CHARACTER(c))
|
||||
|
||||
#define INVALID_UNICODE_10000_TO_1FFFFF(c) \
|
||||
(IS_PLANE_END_NON_CHARACTER(c) || (c) > 0x10ffff)
|
||||
|
||||
// Returns true for all 'invalid' Unicode chars
|
||||
#define INVALID_UNICODE(c) \
|
||||
( \
|
||||
INVALID_UNICODE_0_TO_7F(c) || \
|
||||
INVALID_UNICODE_80_TO_7FF(c) || \
|
||||
INVALID_UNICODE_800_TO_FFFF(c) || \
|
||||
INVALID_UNICODE_10000_TO_1FFFFF(c) \
|
||||
)
|
||||
|
||||
#endif // UNICODE_DEFINES_H
|
@ -569,7 +569,7 @@ AC_SUBST([pamconfdir], [$with_pamconfdir])
|
||||
|
||||
PKG_INSTALLDIR
|
||||
|
||||
AC_CHECK_HEADERS([sys/prctl.h])
|
||||
AC_CHECK_HEADERS([sys/prctl.h uchar.h])
|
||||
|
||||
AC_CONFIG_FILES([
|
||||
common/Makefile
|
||||
|
@ -17,6 +17,7 @@ test_common_SOURCES = \
|
||||
test_fifo_calls.c \
|
||||
test_list_calls.c \
|
||||
test_string_calls.c \
|
||||
test_string_calls_unicode.c \
|
||||
test_os_calls.c \
|
||||
test_os_calls_signals.c \
|
||||
test_ssl_calls.c \
|
||||
|
BIN
tests/common/UTF-8-test.txt
Normal file
BIN
tests/common/UTF-8-test.txt
Normal file
Binary file not shown.
@ -10,6 +10,7 @@ bin_to_hex(const char *input, int length);
|
||||
Suite *make_suite_test_fifo(void);
|
||||
Suite *make_suite_test_list(void);
|
||||
Suite *make_suite_test_string(void);
|
||||
Suite *make_suite_test_string_unicode(void);
|
||||
Suite *make_suite_test_os_calls(void);
|
||||
Suite *make_suite_test_ssl_calls(void);
|
||||
Suite *make_suite_test_base64(void);
|
||||
|
@ -49,6 +49,7 @@ int main (void)
|
||||
sr = srunner_create (make_suite_test_fifo());
|
||||
srunner_add_suite(sr, make_suite_test_list());
|
||||
srunner_add_suite(sr, make_suite_test_string());
|
||||
srunner_add_suite(sr, make_suite_test_string_unicode());
|
||||
srunner_add_suite(sr, make_suite_test_os_calls());
|
||||
srunner_add_suite(sr, make_suite_test_ssl_calls());
|
||||
srunner_add_suite(sr, make_suite_test_base64());
|
||||
|
835
tests/common/test_string_calls_unicode.c
Normal file
835
tests/common/test_string_calls_unicode.c
Normal file
@ -0,0 +1,835 @@
|
||||
/*
|
||||
* The UTF-8 decoder tests are based on the "UTF-8 decoder capability
|
||||
* and stress test" 2015-08-28 by Markus Kuhn. A copy of that file
|
||||
* named "UTF-8-test.txt" should be in the source directory for this file */
|
||||
|
||||
#if defined(HAVE_CONFIG_H)
|
||||
#include "config_ac.h"
|
||||
#endif
|
||||
|
||||
#include "string_calls.h"
|
||||
|
||||
#include "test_common.h"
|
||||
|
||||
// Abbreviate UCS_REPLACEMENT_CHARACTER for utf8_decode_sub_test arrays
|
||||
#define URC UCS_REPLACEMENT_CHARACTER
|
||||
|
||||
struct utf8_decode_sub_test
|
||||
{
|
||||
const char *testref;
|
||||
const char *utf8str;
|
||||
// This array will contain 0 values after the initialised part
|
||||
const char32_t expected[65];
|
||||
};
|
||||
|
||||
// Abbreviate UCS_REPLACEMENT_CHARACTER for utf8_encode_sub_test arrays
|
||||
#define E_URC { 0xef, 0xbf, 0xbd }
|
||||
|
||||
struct utf8_encode_sub_test
|
||||
{
|
||||
const char *testref;
|
||||
char32_t c32;
|
||||
unsigned int expected_len;
|
||||
char expected_str[MAXLEN_UTF8_CHAR];
|
||||
};
|
||||
|
||||
// Used as the simple test in UTF-8-test.txt
|
||||
// The Greek word 'kosme', UTF-8 encoded. Storage class comes first;
// placing it after a qualifier is an obsolescent form
static const char greek_kosme[] =
    "\xce\xba"       // GREEK SMALL LETTER KAPPA
    "\xe1\xbd\xb9"   // GREEK SMALL LETTER OMICRON WITH OXIA
    "\xcf\x83"       // GREEK SMALL LETTER SIGMA
    "\xce\xbc"       // GREEK SMALL LETTER MU
    "\xce\xb5";      // GREEK SMALL LETTER EPSILON
|
||||
|
||||
// See Issue #2603
|
||||
// ASCII text followed by a character outside the BMP. Storage class
// comes first; placing it after a qualifier is an obsolescent form
static const char simple_test_with_emoji[] =
    "Simple Test."
    "\xf0\x9f\x98\xa5"; // U+1F625 Disappointed But Relieved Face
|
||||
|
||||
/******************************************************************************/
|
||||
/**
|
||||
* Function to decode a UTF-8 string and check the expected result
|
||||
*
|
||||
* @param st Pointer to the sub-test to run
|
||||
*/
|
||||
static void
|
||||
run_utf8_decode_sub_test(const struct utf8_decode_sub_test *st)
|
||||
{
|
||||
char32_t c;
|
||||
const char *p = st->utf8str;
|
||||
unsigned int index = 0;
|
||||
|
||||
do
|
||||
{
|
||||
c = utf8_get_next_char(&p, NULL);
|
||||
|
||||
if (c != st->expected[index])
|
||||
{
|
||||
ck_abort_msg("Sub-test section %s Index %u expected %x, got %x",
|
||||
st->testref,
|
||||
index, st->expected[index], c);
|
||||
}
|
||||
++index;
|
||||
}
|
||||
while (c != 0);
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/**
|
||||
* Function to run an array of decode sub-tests
|
||||
*
|
||||
* @param st Pointer to the first sub-test to run
|
||||
*/
|
||||
static void
|
||||
run_decode_sub_test_array(const struct utf8_decode_sub_test *st)
|
||||
{
|
||||
while (st->utf8str != NULL)
|
||||
{
|
||||
run_utf8_decode_sub_test(st++);
|
||||
}
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/**
|
||||
* Function to encode a UTF-8 value and check the expected result
|
||||
*
|
||||
* @param st Pointer to the sub-test to run
|
||||
*/
|
||||
static void
|
||||
run_utf8_encode_sub_test(const struct utf8_encode_sub_test *st)
|
||||
{
|
||||
char actual_str[MAXLEN_UTF8_CHAR];
|
||||
unsigned int index;
|
||||
unsigned int actual_len = utf_char32_to_utf8(st->c32, actual_str);
|
||||
|
||||
if (actual_len != st->expected_len)
|
||||
{
|
||||
ck_abort_msg("Sub-test %s Expected length of %u, got %u",
|
||||
st->testref,
|
||||
st->expected_len, actual_len);
|
||||
}
|
||||
|
||||
for (index = 0 ; index < actual_len; ++index)
|
||||
{
|
||||
if (actual_str[index] != st->expected_str[index])
|
||||
{
|
||||
ck_abort_msg("Sub-test %s Character %u, expected %02x got %02x",
|
||||
st->testref, index,
|
||||
(int)(unsigned char)st->expected_str[index],
|
||||
(int)(unsigned char)actual_str[index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/**
|
||||
* Function to run an array of encode sub-tests
|
||||
*
|
||||
* @param st Pointer to the first sub-test to run
|
||||
*/
|
||||
static void
|
||||
run_encode_sub_test_array(const struct utf8_encode_sub_test *st)
|
||||
{
|
||||
while (st->expected_len > 0)
|
||||
{
|
||||
run_utf8_encode_sub_test(st++);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_get_next_char__section_1)
|
||||
{
|
||||
const struct utf8_decode_sub_test st =
|
||||
{
|
||||
"1",
|
||||
greek_kosme,
|
||||
{
|
||||
0x03ba, // GREEK SMALL LETTER KAPPA
|
||||
0x1f79, // GREEK SMALL LETTER OMICRON WITH OXIA
|
||||
0x03c3, // GREEK SMALL LETTER SIGMA
|
||||
0x03bc, // GREEK SMALL LETTER MU
|
||||
0x03b5 // GREEK SMALL LETTER EPSILON
|
||||
}
|
||||
};
|
||||
|
||||
run_utf8_decode_sub_test(&st);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_get_next_char__section_2)
|
||||
{
|
||||
struct utf8_decode_sub_test tests[] =
|
||||
{
|
||||
// 2.1 First possible sequence of a certain length
|
||||
//
|
||||
// (2.1.1 Is tested separately)
|
||||
{ "2.1.2", "\xc2\x80", { 0x80 } },
|
||||
{ "2.1.3", "\xe0\xa0\x80", { 0x800 } },
|
||||
{ "2.1.4", "\xf0\x90\x80\x80", { 0x10000 } },
|
||||
{ "2.1.5", "\xf8\x88\x80\x80\x80", { URC } },
|
||||
{ "2.1.6", "\xfc\x84\x80\x80\x80\x80", { URC } },
|
||||
|
||||
// 2.2 Last possible sequence of a certain length
|
||||
{ "2.2.1", "\x7f", { 0x7f } },
|
||||
{ "2.2.2", "\xdf\xbf", { 0x7ff } },
|
||||
// Use U+0000FFFC instead of U+0000FFFF as our decoder
|
||||
// treats non-characters as an input error
|
||||
{ "2.2.3", "\xef\xbf\xbc", { 0xfffc } },
|
||||
// U+001FFFFF is out-of-range
|
||||
{ "2.2.4", "\xf7\xbf\xbf\xbf", { URC } },
|
||||
{ "2.2.5", "\xfb\xbf\xbf\xbf\xbf", { URC } },
|
||||
{ "2.2.6", "\xfd\xbf\xbf\xbf\xbf\xbf", { URC } },
|
||||
|
||||
// 2.3 Other boundary conditions
|
||||
{ "2.3.1", "\xed\x9f\xbf", { 0xd7ff } },
|
||||
{ "2.3.2", "\xee\x80\x80", { 0xe000 } },
|
||||
{ "2.3.3", "\xef\xbf\xbd", { 0xfffd } },
|
||||
// Don't use U+10FFFF (non-character)
|
||||
{ "2.3.4", "\xf4\x8f\xbf\xbd", { 0x10fffd } },
|
||||
{ "2.3.5", "\xf4\x90\x80\x80", { URC } },
|
||||
// Terminator
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
// 2.1.1 is a '\0' which we use to terminate our strings. Test
|
||||
// it separately
|
||||
{
|
||||
const char *p = "";
|
||||
|
||||
ck_assert_int_eq(utf8_get_next_char(&p, NULL), 0);
|
||||
}
|
||||
|
||||
// Do the rest of the section 2 tests
|
||||
run_decode_sub_test_array(tests);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_get_next_char__section_3)
|
||||
{
|
||||
struct utf8_decode_sub_test tests[] =
|
||||
{
|
||||
// 3.1 Unexpected continuation bytes
|
||||
//
|
||||
// Each unexpected continuation byte should be separately
|
||||
// signalled as a malformed sequence of its own.
|
||||
{ "3.1.1", "\x80", { URC } },
|
||||
{ "3.1.2", "\xbf", { URC } },
|
||||
{ "3.1.3", "\x80\xbf", { URC, URC } },
|
||||
{ "3.1.4", "\x80\xbf\x80", { URC, URC, URC } },
|
||||
{ "3.1.5", "\x80\xbf\x80\xbf", { URC, URC, URC, URC } },
|
||||
{ "3.1.6", "\x80\xbf\x80\xbf\x80", { URC, URC, URC, URC, URC } },
|
||||
{
|
||||
"3.1.7",
|
||||
"\x80\xbf\x80\xbf\x80\xbf",
|
||||
{ URC, URC, URC, URC, URC, URC }
|
||||
},
|
||||
{
|
||||
"3.1.8",
|
||||
"\x80\xbf\x80\xbf\x80\xbf\x80",
|
||||
{ URC, URC, URC, URC, URC, URC, URC }
|
||||
},
|
||||
{
|
||||
"3.1.9",
|
||||
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
|
||||
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
|
||||
"\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
|
||||
"\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf",
|
||||
{
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC
|
||||
}
|
||||
},
|
||||
|
||||
// 3.2 Lonely start characters
|
||||
{
|
||||
"3.2.1",
|
||||
"\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 "
|
||||
"\xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf "
|
||||
"\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 "
|
||||
"\xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf ",
|
||||
{
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' '
|
||||
}
|
||||
},
|
||||
{
|
||||
"3.2.2",
|
||||
"\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 "
|
||||
"\xe8 \xe9 \xea \xeb \xec \xed \xee \xef ",
|
||||
{
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' '
|
||||
}
|
||||
},
|
||||
{
|
||||
"3.2.3",
|
||||
"\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 ",
|
||||
{
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' ',
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' '
|
||||
}
|
||||
},
|
||||
{
|
||||
"3.2.4",
|
||||
"\xf8 \xf9 \xfa \xfb ",
|
||||
{
|
||||
URC, ' ', URC, ' ', URC, ' ', URC, ' '
|
||||
}
|
||||
},
|
||||
{
|
||||
"3.2.5", "\xfc \xfd ", { URC, ' ', URC, ' ' }
|
||||
},
|
||||
|
||||
// 3.3 Sequences with last continuation byte missing
|
||||
//
|
||||
// From UTF-8-test.txt:-
|
||||
// All bytes of an incomplete sequence should be signalled as
|
||||
// a single malformed sequence, i.e., you should see only a
|
||||
// single replacement character in each of the next 10 tests.
|
||||
{ "3.3.1", "\xc0", { URC } },
|
||||
{ "3.3.2", "\xe0\x80", { URC } },
|
||||
{ "3.3.3", "\xf0\x80\x80", { URC } },
|
||||
{ "3.3.4", "\xf8\x80\x80\x80", { URC } },
|
||||
{ "3.3.5", "\xfc\x80\x80\x80\x80", { URC } },
|
||||
|
||||
{ "3.3.6", "\xdf", { URC } },
|
||||
{ "3.3.7", "\xef\xbf", { URC } },
|
||||
{ "3.3.8", "\xf7\xbf\xbf", { URC} },
|
||||
{ "3.3.9", "\xfb\xbf\xbf\xbf", { URC } },
|
||||
{ "3.3.10", "\xfd\xbf\xbf\xbf\xbf", { URC } },
|
||||
|
||||
// 3.4 Concatenation of incomplete sequences
|
||||
{
|
||||
"3,4",
|
||||
"\xc0"
|
||||
"\xe0\x80"
|
||||
"\xf0\x80\x80"
|
||||
"\xf8\x80\x80\x80"
|
||||
"\xfc\x80\x80\x80\x80"
|
||||
"\xdf"
|
||||
"\xef\xbf"
|
||||
"\xf7\xbf\xbf"
|
||||
"\xfb\xbf\xbf\xbf"
|
||||
"\xfd\xbf\xbf\xbf\xbf",
|
||||
{
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC
|
||||
}
|
||||
},
|
||||
|
||||
// 3.5 Impossible bytes
|
||||
{ "3.5.1", "\xfe", { URC } },
|
||||
{ "3.5.2", "\xff", { URC } },
|
||||
{ "3.5.3", "\xfe\xfe\xff\xff", { URC, URC, URC, URC } },
|
||||
// Terminator
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
run_decode_sub_test_array(tests);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_get_next_char__section_4)
|
||||
{
|
||||
struct utf8_decode_sub_test tests[] =
|
||||
{
|
||||
// 4.1 Examples of an overlong ASCII character
|
||||
//
|
||||
// With a safe UTF-8 decoder, all of the following five
|
||||
// overlong representations of the ASCII character slash ("/")
|
||||
// should be rejected like a malformed UTF-8 sequence, for
|
||||
// instance by substituting it with a replacement character. If
|
||||
// you see a slash below, you do not have a safe UTF-8 decoder!
|
||||
{ "4.1.1", "\xc0\xaf", { URC } },
|
||||
{ "4.1.2", "\xe0\x80\xaf", { URC } },
|
||||
{ "4.1.3", "\xf0\x80\x80\xaf", { URC } },
|
||||
{ "4.1.4", "\xf8\x80\x80\x80\xaf", { URC } },
|
||||
{ "4.1.5", "\xfc\x80\x80\x80\x80\xaf", { URC } },
|
||||
|
||||
// 4.2 Maximum overlong sequences
|
||||
|
||||
// Below you see the highest Unicode value that is still resulting
|
||||
// in an overlong sequence if represented with the given number of
|
||||
// bytes. This is a boundary test for safe UTF-8 decoders. All
|
||||
// five characters should be rejected like malformed UTF-8
|
||||
// sequences.
|
||||
{ "4.2.1", "\xc1\xbf", { URC } },
|
||||
{ "4.2.2", "\xe0\x9f\xbf", { URC } },
|
||||
{ "4.2.3", "\xf0\x8f\xbf\xbf", { URC } },
|
||||
{ "4.2.4", "\xf8\x87\xbf\xbf\xbf", { URC } },
|
||||
{ "4.2.5", "\xfc\x83\xbf\xbf\xbf\xbf", { URC } },
|
||||
|
||||
// 4.3 Overlong representation of the NUL character
|
||||
|
||||
// The following five sequences should also be rejected like
|
||||
// malformed UTF-8 sequences and should not be treated like the
|
||||
// ASCII NUL character.
|
||||
{ "4.3.1", "\xc0\x80", { URC } },
|
||||
{ "4.3.2", "\xe0\x80\x80", { URC } },
|
||||
{ "4.3.3", "\xf0\x80\x80\x80", { URC } },
|
||||
{ "4.3.4", "\xf8\x80\x80\x80\x80", { URC } },
|
||||
{ "4.3.5", "\xfc\x80\x80\x80\x80\x80", { URC } },
|
||||
|
||||
// Terminator
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
run_decode_sub_test_array(tests);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_get_next_char__section_5)
|
||||
{
|
||||
struct utf8_decode_sub_test tests[] =
|
||||
{
|
||||
// 5 Illegal code positions
|
||||
|
||||
// The following UTF-8 sequences should be rejected like
|
||||
// malformed sequences, because they never represent valid
|
||||
// ISO 10646 characters and a UTF-8 decoder that accepts them
|
||||
// might introduce security problems comparable to overlong
|
||||
// UTF-8 sequences.
|
||||
|
||||
// 5.1 Single UTF-16 surrogates
|
||||
{ "5.1.1", "\xed\xa0\x80", { URC } },
|
||||
{ "5.1.2", "\xed\xad\xbf", { URC } },
|
||||
{ "5.1.3", "\xed\xae\x80", { URC } },
|
||||
{ "5.1.4", "\xed\xaf\xbf", { URC } },
|
||||
{ "5.1.5", "\xed\xb0\x80", { URC } },
|
||||
{ "5.1.6", "\xed\xbe\x80", { URC } },
|
||||
{ "5.1.7", "\xed\xbf\xbf", { URC } },
|
||||
|
||||
// 5.2 Paired UTF-16 surrogates
|
||||
{ "5.2.1", "\xed\xa0\x80\xed\xb0\x80", { URC, URC } },
|
||||
{ "5.2.2", "\xed\xa0\x80\xed\xbf\xbf", { URC, URC } },
|
||||
{ "5.2.3", "\xed\xad\xbf\xed\xb0\x80", { URC, URC } },
|
||||
{ "5.2.4", "\xed\xad\xbf\xed\xbf\xbf", { URC, URC } },
|
||||
{ "5.2.5", "\xed\xae\x80\xed\xb0\x80", { URC, URC } },
|
||||
{ "5.2.6", "\xed\xae\x80\xed\xbf\xbf", { URC, URC } },
|
||||
{ "5.2.7", "\xed\xaf\xbf\xed\xb0\x80", { URC, URC } },
|
||||
{ "5.2.8", "\xed\xaf\xbf\xed\xbf\xbf", { URC, URC } },
|
||||
|
||||
// 5.3 Noncharacter code positions
|
||||
|
||||
// The following "noncharacters" are "reserved for internal
|
||||
// use" by applications, and according to older versions of
|
||||
// the Unicode Standard "should never be interchanged". Unicode
|
||||
// Corrigendum #9 dropped the latter restriction. Nevertheless,
|
||||
// their presence in incoming UTF-8 data can remain a potential
|
||||
// security risk, depending on what use is made of these codes
|
||||
// subsequently. Examples of such internal use:
|
||||
//
|
||||
// - Some file APIs with 16-bit characters may use the integer
|
||||
// value -1 = U+FFFF to signal an end-of-file (EOF) or error
|
||||
// condition.
|
||||
//
|
||||
// - In some UTF-16 receivers, code point U+FFFE might trigger
|
||||
// a byte-swap operation (to convert between UTF-16LE and
|
||||
// UTF-16BE).
|
||||
// With such internal use of noncharacters, it may be desirable
|
||||
// and safer to block those code points in UTF-8 decoders, as
|
||||
// they should never occur legitimately in incoming UTF-8 data,
|
||||
// and could trigger unsafe behaviour in subsequent processing.
|
||||
|
||||
// Particularly problematic noncharacters in 16-bit applications:
|
||||
{ "5.3.1", "\xef\xbf\xbe", { URC } },
|
||||
{ "5.3.2", "\xef\xbf\xbf", { URC } },
|
||||
|
||||
// Other noncharacters:
|
||||
{
|
||||
"5.3.3",
|
||||
// Non-characters in "Arabic Presentation Forms-A" (BMP)
|
||||
"\xef\xb7\x90" "\xef\xb7\x91" "\xef\xb7\x92" "\xef\xb7\x93"
|
||||
"\xef\xb7\x94" "\xef\xb7\x95" "\xef\xb7\x96" "\xef\xb7\x97"
|
||||
"\xef\xb7\x98" "\xef\xb7\x99" "\xef\xb7\x9a" "\xef\xb7\x9b"
|
||||
"\xef\xb7\x9c" "\xef\xb7\x9d" "\xef\xb7\x9e" "\xef\xb7\x9f"
|
||||
"\xef\xb7\xa0" "\xef\xb7\xa1" "\xef\xb7\xa2" "\xef\xb7\xa3"
|
||||
"\xef\xb7\xa4" "\xef\xb7\xa5" "\xef\xb7\xa6" "\xef\xb7\xa7"
|
||||
"\xef\xb7\xa8" "\xef\xb7\xa9" "\xef\xb7\xaa" "\xef\xb7\xab"
|
||||
"\xef\xb7\xac" "\xef\xb7\xad" "\xef\xb7\xae" "\xef\xb7\xaf",
|
||||
{
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"5.3.4",
|
||||
"\xf0\x9f\xbf\xbe" "\xf0\x9f\xbf\xbf" // U+0001FFFE U+0001FFFF
|
||||
"\xf0\xaf\xbf\xbe" "\xf0\xaf\xbf\xbf" // U+0002FFFE U+0002FFFF
|
||||
"\xf0\xbf\xbf\xbe" "\xf0\xbf\xbf\xbf" // U+0003FFFE U+0003FFFF
|
||||
"\xf1\x8f\xbf\xbe" "\xf1\x8f\xbf\xbf" // U+0004FFFE U+0004FFFF
|
||||
"\xf1\x9f\xbf\xbe" "\xf1\x9f\xbf\xbf" // U+0005FFFE U+0005FFFF
|
||||
"\xf1\xaf\xbf\xbe" "\xf1\xaf\xbf\xbf" // U+0006FFFE U+0006FFFF
|
||||
"\xf1\xbf\xbf\xbe" "\xf1\xbf\xbf\xbf" // U+0007FFFE U+0007FFFF
|
||||
"\xf2\x8f\xbf\xbe" "\xf2\x8f\xbf\xbf" // U+0008FFFE U+0008FFFF
|
||||
"\xf2\x9f\xbf\xbe" "\xf2\x9f\xbf\xbf" // U+0009FFFE U+0009FFFF
|
||||
"\xf2\xaf\xbf\xbe" "\xf2\xaf\xbf\xbf" // U+000AFFFE U+000AFFFF
|
||||
"\xf2\xbf\xbf\xbe" "\xf2\xbf\xbf\xbf" // U+000BFFFE U+000BFFFF
|
||||
"\xf3\x8f\xbf\xbe" "\xf3\x8f\xbf\xbf" // U+000CFFFE U+000CFFFF
|
||||
"\xf3\x9f\xbf\xbe" "\xf3\x9f\xbf\xbf" // U+000DFFFE U+000DFFFF
|
||||
"\xf3\xaf\xbf\xbe" "\xf3\xaf\xbf\xbf" // U+000EFFFE U+000EFFFF
|
||||
"\xf3\xbf\xbf\xbe" "\xf3\xbf\xbf\xbf" // U+000FFFFE U+000FFFFF
|
||||
"\xf4\x8f\xbf\xbe" "\xf4\x8f\xbf\xbf",// U+0010FFFE U+0010FFFF
|
||||
{
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC,
|
||||
URC, URC, URC, URC, URC, URC, URC, URC
|
||||
}
|
||||
},
|
||||
|
||||
// Last line of UTF8-test.txt
|
||||
{ "TheEnd", "THE END\n", { 'T', 'H', 'E', ' ', 'E', 'N', 'D', '\n'} },
|
||||
|
||||
// Terminator
|
||||
{ 0 }
|
||||
|
||||
};
|
||||
|
||||
run_decode_sub_test_array(tests);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_utf_char32_to_utf8)
|
||||
{
|
||||
struct utf8_encode_sub_test tests[] =
|
||||
{
|
||||
|
||||
// E2.1 First possible sequence of a certain length
|
||||
//
|
||||
{ "E2.1.1", 0, 1, { 0 } },
|
||||
{ "E2.1.2", 0x80, 2, { 0xc2, 0x80 } },
|
||||
{ "E2.1.3", 0x800, 3, { 0xe0, 0xa0, 0x80 } },
|
||||
{ "E2.1.4", 0x10000, 4, { 0xf0, 0x90, 0x80, 0x80 } },
|
||||
|
||||
// E2.2 Last possible sequence of a certain length
|
||||
{ "E2.2.1", 0x7f, 1, { 0x7f } },
|
||||
{ "E2.2.2", 0x7ff, 2, { 0xdf, 0xbf } },
|
||||
{ "E2.2.3", 0xfffc, 3, { 0xef, 0xbf, 0xbc } }, // See 2.1.3 above
|
||||
{ "E2.2.4", 0x1FFFFF, 3, E_URC }, // out-of-range
|
||||
|
||||
// E2.3 Other boundary conditions
|
||||
{ "E2.3.1", 0xd7ff, 3, { 0xed, 0x9f, 0xbf } },
|
||||
{ "E2.3.2", 0xe000, 3, { 0xee, 0x80, 0x80 } },
|
||||
{ "E2.3.3", 0xfffd, 3, { 0xef, 0xbf, 0xbd } },
|
||||
{ "E2.3.4", 0x10fffd, 4, { 0xf4, 0x8f, 0xbf, 0xbd } }, // See 2.3.4 above
|
||||
// E2.3.5 - not tested
|
||||
|
||||
// E5.1 Single UTF-16 surrogates
|
||||
{ "E5.1.1", 0xd800, 3, E_URC },
|
||||
{ "E5.1.2", 0xdb7f, 3, E_URC },
|
||||
{ "E5.1.3", 0xdb80, 3, E_URC },
|
||||
{ "E5.1.4", 0xdbff, 3, E_URC },
|
||||
{ "E5.1.5", 0xdc00, 3, E_URC },
|
||||
{ "E5.1.6", 0xdf80, 3, E_URC },
|
||||
{ "E5.1.7", 0xdfff, 3, E_URC },
|
||||
|
||||
// E5.3 Non-character code positions
|
||||
{ "E5.3.3(0)", 0xfdd0, 3, E_URC },
|
||||
{ "E5.3.3(1)", 0xfdd1, 3, E_URC },
|
||||
{ "E5.3.3(2)", 0xfdd2, 3, E_URC },
|
||||
{ "E5.3.3(3)", 0xfdd3, 3, E_URC },
|
||||
{ "E5.3.3(4)", 0xfdd4, 3, E_URC },
|
||||
{ "E5.3.3(5)", 0xfdd5, 3, E_URC },
|
||||
{ "E5.3.3(6)", 0xfdd6, 3, E_URC },
|
||||
{ "E5.3.3(7)", 0xfdd7, 3, E_URC },
|
||||
{ "E5.3.3(8)", 0xfdd8, 3, E_URC },
|
||||
{ "E5.3.3(9)", 0xfdd9, 3, E_URC },
|
||||
{ "E5.3.3(10)", 0xfdda, 3, E_URC },
|
||||
{ "E5.3.3(11)", 0xfddb, 3, E_URC },
|
||||
{ "E5.3.3(12)", 0xfddc, 3, E_URC },
|
||||
{ "E5.3.3(13)", 0xfddd, 3, E_URC },
|
||||
{ "E5.3.3(14)", 0xfdde, 3, E_URC },
|
||||
{ "E5.3.3(15)", 0xfddf, 3, E_URC },
|
||||
{ "E5.3.3(16)", 0xfde0, 3, E_URC },
|
||||
{ "E5.3.3(17)", 0xfde1, 3, E_URC },
|
||||
{ "E5.3.3(18)", 0xfde2, 3, E_URC },
|
||||
{ "E5.3.3(19)", 0xfde3, 3, E_URC },
|
||||
{ "E5.3.3(20)", 0xfde4, 3, E_URC },
|
||||
{ "E5.3.3(21)", 0xfde5, 3, E_URC },
|
||||
{ "E5.3.3(22)", 0xfde6, 3, E_URC },
|
||||
{ "E5.3.3(23)", 0xfde7, 3, E_URC },
|
||||
{ "E5.3.3(24)", 0xfde8, 3, E_URC },
|
||||
{ "E5.3.3(25)", 0xfde9, 3, E_URC },
|
||||
{ "E5.3.3(26)", 0xfdea, 3, E_URC },
|
||||
{ "E5.3.3(27)", 0xfdeb, 3, E_URC },
|
||||
{ "E5.3.3(28)", 0xfdec, 3, E_URC },
|
||||
{ "E5.3.3(29)", 0xfded, 3, E_URC },
|
||||
{ "E5.3.3(30)", 0xfdee, 3, E_URC },
|
||||
{ "E5.3.3(31)", 0xfdef, 3, E_URC },
|
||||
{ "E5.3.4(0)", 0x1fffe, 3, E_URC },
|
||||
{ "E5.3.4(1)", 0x1ffff, 3, E_URC },
|
||||
{ "E5.3.4(2)", 0x2fffe, 3, E_URC },
|
||||
{ "E5.3.4(3)", 0x2ffff, 3, E_URC },
|
||||
{ "E5.3.4(4)", 0x3fffe, 3, E_URC },
|
||||
{ "E5.3.4(5)", 0x3ffff, 3, E_URC },
|
||||
{ "E5.3.4(6)", 0x4fffe, 3, E_URC },
|
||||
{ "E5.3.4(7)", 0x4ffff, 3, E_URC },
|
||||
{ "E5.3.4(8)", 0x5fffe, 3, E_URC },
|
||||
{ "E5.3.4(9)", 0x5ffff, 3, E_URC },
|
||||
{ "E5.3.4(10)", 0x6fffe, 3, E_URC },
|
||||
{ "E5.3.4(11)", 0x6ffff, 3, E_URC },
|
||||
{ "E5.3.4(12)", 0x7fffe, 3, E_URC },
|
||||
{ "E5.3.4(13)", 0x7ffff, 3, E_URC },
|
||||
{ "E5.3.4(14)", 0x8fffe, 3, E_URC },
|
||||
{ "E5.3.4(15)", 0x8ffff, 3, E_URC },
|
||||
{ "E5.3.4(16)", 0x9fffe, 3, E_URC },
|
||||
{ "E5.3.4(17)", 0x9ffff, 3, E_URC },
|
||||
{ "E5.3.4(18)", 0xafffe, 3, E_URC },
|
||||
{ "E5.3.4(19)", 0xaffff, 3, E_URC },
|
||||
{ "E5.3.4(20)", 0xbfffe, 3, E_URC },
|
||||
{ "E5.3.4(21)", 0xbffff, 3, E_URC },
|
||||
{ "E5.3.4(22)", 0xcfffe, 3, E_URC },
|
||||
{ "E5.3.4(23)", 0xcffff, 3, E_URC },
|
||||
{ "E5.3.4(24)", 0xdfffe, 3, E_URC },
|
||||
{ "E5.3.4(25)", 0xdffff, 3, E_URC },
|
||||
{ "E5.3.4(26)", 0xefffe, 3, E_URC },
|
||||
{ "E5.3.4(27)", 0xeffff, 3, E_URC },
|
||||
{ "E5.3.4(28)", 0xffffe, 3, E_URC },
|
||||
{ "E5.3.4(29)", 0xfffff, 3, E_URC },
|
||||
{ "E5.3.4(30)", 0x10fffe, 3, E_URC },
|
||||
{ "E5.3.4(31)", 0x10ffff, 3, E_URC },
|
||||
{ "E5.99.0", 'T', 1, { 'T' } },
|
||||
{ "E5.99.1", 'H', 1, { 'H' } },
|
||||
{ "E5.99.2", 'E', 1, { 'E' } },
|
||||
{ "E5.99.3", ' ', 1, { ' ' } },
|
||||
{ "E5.99.4", 'E', 1, { 'E' } },
|
||||
{ "E5.99.5", 'N', 1, { 'N' } },
|
||||
{ "E5.99.6", 'D', 1, { 'D' } },
|
||||
|
||||
// Terminator
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
run_encode_sub_test_array(tests);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_utf8_char_count)
|
||||
{
|
||||
// Check function can cope with NULL argument
|
||||
ck_assert_int_eq(utf8_char_count(NULL), 0);
|
||||
|
||||
unsigned int kosme_strlen = strlen(greek_kosme);
|
||||
unsigned int kosme_len = utf8_char_count(greek_kosme);
|
||||
|
||||
// All characters map to two bytes except for the 'omicrom with oxia'
|
||||
// which maps to three
|
||||
ck_assert_int_eq(kosme_strlen, 2 + 3 + 2 + 2 + 2);
|
||||
ck_assert_int_eq(kosme_len, 5);
|
||||
|
||||
unsigned int simple_test_strlen = strlen(simple_test_with_emoji);
|
||||
unsigned int simple_test_len = utf8_char_count(simple_test_with_emoji);
|
||||
|
||||
ck_assert_int_eq(simple_test_strlen,
|
||||
(1 + 1 + 1 + 1 + 1 + 1 ) + // Simple
|
||||
1 +
|
||||
(1 + 1 + 1 + 1 ) + // Test
|
||||
1 +
|
||||
4); // emoji
|
||||
// The emoji is 4 bytes - all others are 1
|
||||
ck_assert_int_eq(simple_test_len, simple_test_strlen - 3);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_utf8_as_utf16_word_count)
|
||||
{
|
||||
unsigned int kosme_count =
|
||||
utf8_as_utf16_word_count(greek_kosme, strlen(greek_kosme));
|
||||
|
||||
ck_assert_int_eq(kosme_count, 5); // All characters in BMP
|
||||
|
||||
unsigned int simple_test_count =
|
||||
utf8_as_utf16_word_count(simple_test_with_emoji,
|
||||
strlen(simple_test_with_emoji));
|
||||
|
||||
ck_assert_int_eq(simple_test_count,
|
||||
(1 + 1 + 1 + 1 + 1 + 1 ) + // Simple
|
||||
1 +
|
||||
(1 + 1 + 1 + 1 ) + // Test
|
||||
1 +
|
||||
2); // emoji
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_utf8_add_char_at)
|
||||
{
|
||||
#define TEST_SIZE sizeof(simple_test_with_emoji)
|
||||
|
||||
// Type pairing a string position with a Unicode char
|
||||
struct pos_to_char_map
|
||||
{
|
||||
unsigned int pos;
|
||||
char32_t c32;
|
||||
};
|
||||
|
||||
// Buffer for constructing the string
|
||||
char buff[TEST_SIZE];
|
||||
|
||||
// A pseudo-random map of the characters in simple_test_with_emoji
|
||||
const struct pos_to_char_map map[] =
|
||||
{
|
||||
{ 0, 'l' },
|
||||
{ 0, 'S' },
|
||||
{ 1, 'i' },
|
||||
{ 2, 'm' },
|
||||
{ 4, 0x1f625 },
|
||||
{ 4, '.' },
|
||||
{ 4, 'e' },
|
||||
{ 5, 'T' },
|
||||
{ 3, 'p' },
|
||||
{ 7, 't' },
|
||||
{ 7, 'e' },
|
||||
{ 8, 's' },
|
||||
{ 6, ' ' },
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
buff[0] = '\0';
|
||||
|
||||
// Construct the string in a pseudo-random fashion
|
||||
|
||||
const struct pos_to_char_map *p;
|
||||
for (p = map; p->c32 != 0 ; ++p)
|
||||
{
|
||||
if (!utf8_add_char_at(buff, TEST_SIZE, p->c32, p->pos))
|
||||
{
|
||||
ck_abort_msg("test_utf8_add_char_at: "
|
||||
"Can't insert char %x at pos %u",
|
||||
p->c32,
|
||||
p->pos);
|
||||
}
|
||||
}
|
||||
|
||||
// Should have reached the buffer size by now
|
||||
ck_assert_int_eq(strlen(buff), TEST_SIZE - 1);
|
||||
|
||||
// Check the string is what we expect
|
||||
ck_assert_int_eq(strcmp(buff, simple_test_with_emoji), 0);
|
||||
|
||||
// Try to insert another character
|
||||
if (utf8_add_char_at(buff, TEST_SIZE, ' ', 0))
|
||||
{
|
||||
ck_abort_msg("test_utf8_add_char_at: "
|
||||
"Insert succeeded but should have failed");
|
||||
}
|
||||
|
||||
#undef TEST_SIZE
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
START_TEST(test_utf8_remove_char_at)
|
||||
{
|
||||
#define TEST_SIZE sizeof(simple_test_with_emoji)
|
||||
// Type pairing a string position with a Unicode char
|
||||
struct pos_to_char_map
|
||||
{
|
||||
unsigned int pos;
|
||||
char32_t c32;
|
||||
};
|
||||
|
||||
// Buffer for deconstructing the string
|
||||
char buff[TEST_SIZE];
|
||||
|
||||
// A pseudo-random map of the characters in simple_test_with_emoji
|
||||
const struct pos_to_char_map map[] =
|
||||
{
|
||||
{ 2, 'm' },
|
||||
{ 7, 'e' },
|
||||
{ 5, ' ' },
|
||||
{ 1, 'i' },
|
||||
{ 2, 'l' },
|
||||
{ 3, 'T' },
|
||||
{ 6, 0x1f625 },
|
||||
{ 2, 'e' },
|
||||
{ 3, 't' },
|
||||
{ 3, '.' },
|
||||
{ 2, 's' },
|
||||
{ 1, 'p' },
|
||||
{ 0, 'S' },
|
||||
{ 0 }
|
||||
};
|
||||
|
||||
char32_t c32;
|
||||
|
||||
strcpy(buff, simple_test_with_emoji);
|
||||
|
||||
// Deconstruct the string in a pseudo-random fashion
|
||||
const struct pos_to_char_map *p;
|
||||
for (p = map; p->c32 != 0 ; ++p)
|
||||
{
|
||||
c32 = utf8_remove_char_at(buff, p->pos);
|
||||
if (c32 != p->c32)
|
||||
{
|
||||
ck_abort_msg("test_utf8_remove_char_at: "
|
||||
"remove char at pos %u was %x, expected %x",
|
||||
p->pos, c32, p->c32);
|
||||
}
|
||||
}
|
||||
|
||||
// Should have emptied the buffer by now
|
||||
ck_assert_int_eq(buff[0], '\0');
|
||||
|
||||
// Try to remove other characters
|
||||
c32 = utf8_remove_char_at(buff, 0);
|
||||
ck_assert_int_eq(c32, 0);
|
||||
c32 = utf8_remove_char_at(buff, 99);
|
||||
ck_assert_int_eq(c32, 0);
|
||||
ck_assert_int_eq(buff[0], '\0');
|
||||
|
||||
#undef TEST_SIZE
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
Suite *
|
||||
make_suite_test_string_unicode(void)
|
||||
{
|
||||
Suite *s;
|
||||
TCase *tc_unicode;
|
||||
|
||||
s = suite_create("String");
|
||||
|
||||
tc_unicode = tcase_create("Unicode");
|
||||
suite_add_tcase(s, tc_unicode);
|
||||
tcase_add_test(tc_unicode, test_get_next_char__section_1);
|
||||
tcase_add_test(tc_unicode, test_get_next_char__section_2);
|
||||
tcase_add_test(tc_unicode, test_get_next_char__section_3);
|
||||
tcase_add_test(tc_unicode, test_get_next_char__section_4);
|
||||
tcase_add_test(tc_unicode, test_get_next_char__section_5);
|
||||
tcase_add_test(tc_unicode, test_utf_char32_to_utf8);
|
||||
tcase_add_test(tc_unicode, test_utf8_char_count);
|
||||
tcase_add_test(tc_unicode, test_utf8_as_utf16_word_count);
|
||||
tcase_add_test(tc_unicode, test_utf8_add_char_at);
|
||||
tcase_add_test(tc_unicode, test_utf8_remove_char_at);
|
||||
|
||||
return s;
|
||||
}
|
@ -52,9 +52,6 @@ static char w_char[] =
|
||||
};
|
||||
#endif
|
||||
|
||||
// Unicode definitions
|
||||
#define UNICODE_WHITE_SQUARE 0x25a1
|
||||
|
||||
// First character allocated in the 'struct xrdp_font.chars' array
|
||||
#define FIRST_CHAR ' '
|
||||
|
||||
@ -354,9 +351,9 @@ xrdp_font_create(struct xrdp_wm *wm, unsigned int dpi)
|
||||
}
|
||||
|
||||
// Find a default glyph
|
||||
if (char_count > UNICODE_WHITE_SQUARE)
|
||||
if (char_count > UCS_WHITE_SQUARE)
|
||||
{
|
||||
self->default_char = &self->chars[UNICODE_WHITE_SQUARE];
|
||||
self->default_char = &self->chars[UCS_WHITE_SQUARE];
|
||||
}
|
||||
else if (char_count > '?')
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user