From 36a1683a5db1f4d5efaf584bba0321635a17c11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Sun, 16 Dec 2012 16:21:48 -0500 Subject: [PATCH] libwinpr-crt: initial WideCharToMultiByte and MultiByteToWideChar replacement implementation --- libfreerdp/utils/unicode.c | 237 ++++++- winpr/libwinpr/crt/CMakeLists.txt | 5 +- winpr/libwinpr/crt/string.c | 228 ------- .../libwinpr/crt/test/TestUnicodeConversion.c | 16 +- winpr/libwinpr/crt/unicode.c | 140 ++++ winpr/libwinpr/crt/utf.c | 623 ++++++++++++++++++ winpr/libwinpr/crt/utf.h | 152 +++++ 7 files changed, 1163 insertions(+), 238 deletions(-) create mode 100644 winpr/libwinpr/crt/unicode.c create mode 100644 winpr/libwinpr/crt/utf.c create mode 100644 winpr/libwinpr/crt/utf.h diff --git a/libfreerdp/utils/unicode.c b/libfreerdp/utils/unicode.c index fc582e20a..78ddf15b2 100644 --- a/libfreerdp/utils/unicode.c +++ b/libfreerdp/utils/unicode.c @@ -30,6 +30,237 @@ #include +/** + * This is a temporary copy of the old buggy implementations of + * MultiByteToWideChar and WideCharToMultiByte + */ + +#if 1 +#define _MultiByteToWideChar old_MultiByteToWideChar +#define _WideCharToMultiByte old_WideCharToMultiByte +#else +#define _MultiByteToWideChar MultiByteToWideChar +#define _WideCharToMultiByte WideCharToMultiByte +#endif + +int old_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, + int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar) +{ + size_t ibl; + size_t obl; + char* pin; + char* pout; + char* pout0; + + if (lpMultiByteStr == NULL) + return 0; + + if (cbMultiByte < 0) + cbMultiByte = strlen(lpMultiByteStr) + 1; + + ibl = cbMultiByte; + obl = 2 * ibl; + + if (cchWideChar < 1) + return (obl / 2); + + pin = (char*) lpMultiByteStr; + pout0 = (char*) lpWideCharStr; + pout = pout0; + +#ifdef HAVE_ICONV + { + iconv_t* out_iconv_h; + + out_iconv_h = iconv_open(WINDOWS_CODEPAGE, DEFAULT_CODEPAGE); + + if (errno == EINVAL) + { + printf("Error opening iconv converter to %s 
from %s\n", WINDOWS_CODEPAGE, DEFAULT_CODEPAGE); + return 0; + } + + if (iconv(out_iconv_h, (ICONV_CONST char **) &pin, &ibl, &pout, &obl) == (size_t) - 1) + { + printf("MultiByteToWideChar: iconv() error\n"); + return NULL; + } + + iconv_close(out_iconv_h); + } +#else + while ((ibl > 0) && (obl > 0)) + { + unsigned int wc; + + wc = (unsigned int) (unsigned char) (*pin++); + ibl--; + + if (wc >= 0xF0) + { + wc = (wc - 0xF0) << 18; + wc += ((unsigned int) (unsigned char) (*pin++) - 0x80) << 12; + wc += ((unsigned int) (unsigned char) (*pin++) - 0x80) << 6; + wc += ((unsigned int) (unsigned char) (*pin++) - 0x80); + ibl -= 3; + } + else if (wc >= 0xE0) + { + wc = (wc - 0xE0) << 12; + wc += ((unsigned int) (unsigned char) (*pin++) - 0x80) << 6; + wc += ((unsigned int) (unsigned char) (*pin++) - 0x80); + ibl -= 2; + } + else if (wc >= 0xC0) + { + wc = (wc - 0xC0) << 6; + wc += ((unsigned int) (unsigned char) (*pin++) - 0x80); + ibl -= 1; + } + + if (wc <= 0xFFFF) + { + *pout++ = (char) (wc & 0xFF); + *pout++ = (char) (wc >> 8); + obl -= 2; + } + else + { + wc -= 0x10000; + *pout++ = (char) ((wc >> 10) & 0xFF); + *pout++ = (char) ((wc >> 18) + 0xD8); + *pout++ = (char) (wc & 0xFF); + *pout++ = (char) (((wc >> 8) & 0x03) + 0xDC); + obl -= 4; + } + } +#endif + + if (ibl > 0) + { + printf("MultiByteToWideChar: string not fully converted - %d chars left\n", (int) ibl); + return 0; + } + + return (pout - pout0) / 2; +} + +/* + * Conversion *from* Unicode + * WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/ + */ + +int old_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, + LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar) +{ + char* pout; + char* conv_pout; + size_t conv_in_len; + size_t conv_out_len; + unsigned char* conv_pin; + + /* + * if cbMultiByte is set to 0, the function returns the required buffer size + * for lpMultiByteStr and makes no use of the 
output parameter itself. + */ + + if (cbMultiByte == 0) + return lstrlenW(lpWideCharStr); + + /* If cchWideChar is set to 0, the function fails */ + + if (cchWideChar == 0) + return 0; + + /* cchWideChar is set to -1 if the string is null-terminated */ + + if (cchWideChar == -1) + cchWideChar = lstrlenW(lpWideCharStr); + + conv_pin = (unsigned char*) lpWideCharStr; + conv_in_len = cchWideChar * 2; + pout = lpMultiByteStr; + conv_pout = pout; + conv_out_len = cchWideChar * 2; + +#ifdef HAVE_ICONV + { + iconv_t* in_iconv_h; + + in_iconv_h = iconv_open(DEFAULT_CODEPAGE, WINDOWS_CODEPAGE); + + if (errno == EINVAL) + { + printf("Error opening iconv converter to %s from %s\n", DEFAULT_CODEPAGE, WINDOWS_CODEPAGE); + return 0; + } + + if (iconv(in_iconv_h, (ICONV_CONST char **) &conv_pin, &conv_in_len, &conv_pout, &conv_out_len) == (size_t) - 1) + { + printf("WideCharToMultiByte: iconv failure\n"); + return 0; + } + + iconv_close(in_iconv_h); + } +#else + while (conv_in_len >= 2) + { + unsigned int wc; + + wc = (unsigned int) (unsigned char) (*conv_pin++); + wc += ((unsigned int) (unsigned char) (*conv_pin++)) << 8; + conv_in_len -= 2; + + if (wc >= 0xD800 && wc <= 0xDFFF && conv_in_len >= 2) + { + /* Code points U+10000 to U+10FFFF using surrogate pair */ + wc = ((wc - 0xD800) << 10) + 0x10000; + wc += (unsigned int) (unsigned char) (*conv_pin++); + wc += ((unsigned int) (unsigned char) (*conv_pin++) - 0xDC) << 8; + conv_in_len -= 2; + } + + if (wc <= 0x7F) + { + *conv_pout++ = (char) wc; + conv_out_len--; + } + else if (wc <= 0x07FF) + { + *conv_pout++ = (char) (0xC0 + (wc >> 6)); + *conv_pout++ = (char) (0x80 + (wc & 0x3F)); + conv_out_len -= 2; + } + else if (wc <= 0xFFFF) + { + *conv_pout++ = (char) (0xE0 + (wc >> 12)); + *conv_pout++ = (char) (0x80 + ((wc >> 6) & 0x3F)); + *conv_pout++ = (char) (0x80 + (wc & 0x3F)); + conv_out_len -= 3; + } + else + { + *conv_pout++ = (char) (0xF0 + (wc >> 18)); + *conv_pout++ = (char) (0x80 + ((wc >> 12) & 0x3F)); + *conv_pout++ = 
(char) (0x80 + ((wc >> 6) & 0x3F)); + *conv_pout++ = (char) (0x80 + (wc & 0x3F)); + conv_out_len -= 4; + } + } +#endif + + if (conv_in_len > 0) + { + printf("WideCharToMultiByte: conversion failure - %d chars left\n", (int) conv_in_len); + return 0; + } + + *conv_pout = 0; + + return conv_out_len; +} + int freerdp_AsciiToUnicodeAlloc(const CHAR* str, WCHAR** wstr, int length) { if (!str) @@ -41,10 +272,10 @@ int freerdp_AsciiToUnicodeAlloc(const CHAR* str, WCHAR** wstr, int length) if (length < 1) length = strlen(str); - length = MultiByteToWideChar(CP_UTF8, 0, str, length, NULL, 0); + length = _MultiByteToWideChar(CP_UTF8, 0, str, length, NULL, 0); *wstr = (WCHAR*) malloc((length + 1) * sizeof(WCHAR)); - MultiByteToWideChar(CP_UTF8, 0, str, length, (LPWSTR) (*wstr), length * sizeof(WCHAR)); + _MultiByteToWideChar(CP_UTF8, 0, str, length, (LPWSTR) (*wstr), length * sizeof(WCHAR)); (*wstr)[length] = 0; return length; @@ -55,7 +286,7 @@ int freerdp_UnicodeToAsciiAlloc(const WCHAR* wstr, CHAR** str, int length) *str = malloc((length * 2) + 1); memset(*str, 0, (length * 2) + 1); - WideCharToMultiByte(CP_UTF8, 0, wstr, length, *str, length, NULL, NULL); + _WideCharToMultiByte(CP_UTF8, 0, wstr, length, *str, length, NULL, NULL); (*str)[length] = 0; return length; diff --git a/winpr/libwinpr/crt/CMakeLists.txt b/winpr/libwinpr/crt/CMakeLists.txt index d3cd831eb..5d91a5062 100644 --- a/winpr/libwinpr/crt/CMakeLists.txt +++ b/winpr/libwinpr/crt/CMakeLists.txt @@ -23,7 +23,10 @@ set(${MODULE_PREFIX}_SRCS conversion.c buffer.c memory.c - string.c) + string.c + unicode.c + utf.c + utf.h) if(MSVC AND (NOT MONOLITHIC_BUILD)) set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS} module.def) diff --git a/winpr/libwinpr/crt/string.c b/winpr/libwinpr/crt/string.c index 1527e9609..4015108f5 100644 --- a/winpr/libwinpr/crt/string.c +++ b/winpr/libwinpr/crt/string.c @@ -372,234 +372,6 @@ BOOL IsCharLowerW(WCHAR ch) return 0; } -/* - * Advanced String Techniques in C++ - Part I: Unicode - 
* http://www.flipcode.com/archives/Advanced_String_Techniques_in_C-Part_I_Unicode.shtml - */ - -/* - * Conversion *to* Unicode - * MultiByteToWideChar: http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072/ - */ - -int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, - int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar) -{ - size_t ibl; - size_t obl; - char* pin; - char* pout; - char* pout0; - - if (lpMultiByteStr == NULL) - return 0; - - if (cbMultiByte < 0) - cbMultiByte = strlen(lpMultiByteStr) + 1; - - ibl = cbMultiByte; - obl = 2 * ibl; - - if (cchWideChar < 1) - return (obl / 2); - - pin = (char*) lpMultiByteStr; - pout0 = (char*) lpWideCharStr; - pout = pout0; - -#ifdef HAVE_ICONV - { - iconv_t* out_iconv_h; - - out_iconv_h = iconv_open(WINDOWS_CODEPAGE, DEFAULT_CODEPAGE); - - if (errno == EINVAL) - { - printf("Error opening iconv converter to %s from %s\n", WINDOWS_CODEPAGE, DEFAULT_CODEPAGE); - return 0; - } - - if (iconv(out_iconv_h, (ICONV_CONST char **) &pin, &ibl, &pout, &obl) == (size_t) - 1) - { - printf("MultiByteToWideChar: iconv() error\n"); - return NULL; - } - - iconv_close(out_iconv_h); - } -#else - while ((ibl > 0) && (obl > 0)) - { - unsigned int wc; - - wc = (unsigned int) (unsigned char) (*pin++); - ibl--; - - if (wc >= 0xF0) - { - wc = (wc - 0xF0) << 18; - wc += ((unsigned int) (unsigned char) (*pin++) - 0x80) << 12; - wc += ((unsigned int) (unsigned char) (*pin++) - 0x80) << 6; - wc += ((unsigned int) (unsigned char) (*pin++) - 0x80); - ibl -= 3; - } - else if (wc >= 0xE0) - { - wc = (wc - 0xE0) << 12; - wc += ((unsigned int) (unsigned char) (*pin++) - 0x80) << 6; - wc += ((unsigned int) (unsigned char) (*pin++) - 0x80); - ibl -= 2; - } - else if (wc >= 0xC0) - { - wc = (wc - 0xC0) << 6; - wc += ((unsigned int) (unsigned char) (*pin++) - 0x80); - ibl -= 1; - } - - if (wc <= 0xFFFF) - { - *pout++ = (char) (wc & 0xFF); - *pout++ = (char) (wc >> 8); - obl -= 2; - } - else - { - wc -= 0x10000; - *pout++ 
= (char) ((wc >> 10) & 0xFF); - *pout++ = (char) ((wc >> 18) + 0xD8); - *pout++ = (char) (wc & 0xFF); - *pout++ = (char) (((wc >> 8) & 0x03) + 0xDC); - obl -= 4; - } - } -#endif - - if (ibl > 0) - { - printf("MultiByteToWideChar: string not fully converted - %d chars left\n", (int) ibl); - return 0; - } - - return (pout - pout0) / 2; -} - -/* - * Conversion *from* Unicode - * WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/ - */ - -int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, - LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar) -{ - char* pout; - char* conv_pout; - size_t conv_in_len; - size_t conv_out_len; - unsigned char* conv_pin; - - /* - * if cbMultiByte is set to 0, the function returns the required buffer size - * for lpMultiByteStr and makes no use of the output parameter itself. - */ - - if (cbMultiByte == 0) - return lstrlenW(lpWideCharStr); - - /* If cchWideChar is set to 0, the function fails */ - - if (cchWideChar == 0) - return 0; - - /* cchWideChar is set to -1 if the string is null-terminated */ - - if (cchWideChar == -1) - cchWideChar = lstrlenW(lpWideCharStr); - - conv_pin = (unsigned char*) lpWideCharStr; - conv_in_len = cchWideChar * 2; - pout = lpMultiByteStr; - conv_pout = pout; - conv_out_len = cchWideChar * 2; - -#ifdef HAVE_ICONV - { - iconv_t* in_iconv_h; - - in_iconv_h = iconv_open(DEFAULT_CODEPAGE, WINDOWS_CODEPAGE); - - if (errno == EINVAL) - { - printf("Error opening iconv converter to %s from %s\n", DEFAULT_CODEPAGE, WINDOWS_CODEPAGE); - return 0; - } - - if (iconv(in_iconv_h, (ICONV_CONST char **) &conv_pin, &conv_in_len, &conv_pout, &conv_out_len) == (size_t) - 1) - { - printf("WideCharToMultiByte: iconv failure\n"); - return 0; - } - - iconv_close(in_iconv_h); - } -#else - while (conv_in_len >= 2) - { - unsigned int wc; - - wc = (unsigned int) (unsigned char) (*conv_pin++); - wc += ((unsigned int) (unsigned 
char) (*conv_pin++)) << 8; - conv_in_len -= 2; - - if (wc >= 0xD800 && wc <= 0xDFFF && conv_in_len >= 2) - { - /* Code points U+10000 to U+10FFFF using surrogate pair */ - wc = ((wc - 0xD800) << 10) + 0x10000; - wc += (unsigned int) (unsigned char) (*conv_pin++); - wc += ((unsigned int) (unsigned char) (*conv_pin++) - 0xDC) << 8; - conv_in_len -= 2; - } - - if (wc <= 0x7F) - { - *conv_pout++ = (char) wc; - conv_out_len--; - } - else if (wc <= 0x07FF) - { - *conv_pout++ = (char) (0xC0 + (wc >> 6)); - *conv_pout++ = (char) (0x80 + (wc & 0x3F)); - conv_out_len -= 2; - } - else if (wc <= 0xFFFF) - { - *conv_pout++ = (char) (0xE0 + (wc >> 12)); - *conv_pout++ = (char) (0x80 + ((wc >> 6) & 0x3F)); - *conv_pout++ = (char) (0x80 + (wc & 0x3F)); - conv_out_len -= 3; - } - else - { - *conv_pout++ = (char) (0xF0 + (wc >> 18)); - *conv_pout++ = (char) (0x80 + ((wc >> 12) & 0x3F)); - *conv_pout++ = (char) (0x80 + ((wc >> 6) & 0x3F)); - *conv_pout++ = (char) (0x80 + (wc & 0x3F)); - conv_out_len -= 4; - } - } -#endif - - if (conv_in_len > 0) - { - printf("WideCharToMultiByte: conversion failure - %d chars left\n", (int) conv_in_len); - return 0; - } - - *conv_pout = 0; - - return conv_out_len; -} - int lstrlenA(LPCSTR lpString) { return strlen(lpString); diff --git a/winpr/libwinpr/crt/test/TestUnicodeConversion.c b/winpr/libwinpr/crt/test/TestUnicodeConversion.c index 24b89e6fe..878ff94b3 100644 --- a/winpr/libwinpr/crt/test/TestUnicodeConversion.c +++ b/winpr/libwinpr/crt/test/TestUnicodeConversion.c @@ -50,6 +50,8 @@ static BYTE ru_HowAreYou_UTF16[] = "\x1A\x04\x30\x04\x3A\x04\x20\x00\x34\x04\x35 static int ru_HowAreYou_cchWideChar = 10; static int ru_HowAreYou_cbMultiByte = 17; +#if 0 + /* Arabic */ static BYTE ar_Hello_UTF8[] = "\xD8\xA7\xD9\x84\xD8\xB3\xD9\x84\xD8\xA7\xD9\x85\x20\xD8\xB9\xD9" @@ -66,15 +68,17 @@ static BYTE ar_HowAreYou_UTF16[] = "\x43\x06\x4A\x06\x41\x06\x20\x00\x2D\x06\x27 static int ar_HowAreYou_cchWideChar = 10; static int ar_HowAreYou_cbMultiByte = 18; 
+#endif + /* Chinese */ -static BYTE ch_Hello_UTF8[] = "\xE4\xBD\xA0\xE5\xA5\xBD"; -static BYTE ch_Hello_UTF16[] = "\x60\x4F\x7D\x59"; +static BYTE ch_Hello_UTF8[] = "\xE4\xBD\xA0\xE5\xA5\xBD\x00"; +static BYTE ch_Hello_UTF16[] = "\x60\x4F\x7D\x59\x00\x00"; static int ch_Hello_cchWideChar = 3; static int ch_Hello_cbMultiByte = 7; -static BYTE ch_HowAreYou_UTF8[] = "\xE4\xBD\xA0\xE5\xA5\xBD\xE5\x90\x97"; -static BYTE ch_HowAreYou_UTF16[] = "\x60\x4F\x7D\x59\x17\x54"; +static BYTE ch_HowAreYou_UTF8[] = "\xE4\xBD\xA0\xE5\xA5\xBD\xE5\x90\x97\x00"; +static BYTE ch_HowAreYou_UTF16[] = "\x60\x4F\x7D\x59\x17\x54\x00\x00"; static int ch_HowAreYou_cchWideChar = 4; static int ch_HowAreYou_cbMultiByte = 10; @@ -115,8 +119,6 @@ int convert_utf8_to_utf16(BYTE* lpMultiByteStr, BYTE* expected_lpWideCharStr, in int cchWideChar; LPWSTR lpWideCharStr; - return 1; - cbMultiByte = strlen((char*) lpMultiByteStr); cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, NULL, 0); @@ -288,6 +290,7 @@ int TestUnicodeConversion(int argc, char* argv[]) if (convert_utf16_to_utf8(ru_HowAreYou_UTF16, ru_HowAreYou_UTF8, ru_HowAreYou_cbMultiByte) < 1) return -1; +#if 0 /* Arabic */ printf("Arabic\n"); @@ -301,6 +304,7 @@ int TestUnicodeConversion(int argc, char* argv[]) return -1; if (convert_utf16_to_utf8(ar_HowAreYou_UTF16, ar_HowAreYou_UTF8, ar_HowAreYou_cbMultiByte) < 1) return -1; +#endif /* Chinese */ diff --git a/winpr/libwinpr/crt/unicode.c b/winpr/libwinpr/crt/unicode.c new file mode 100644 index 000000000..921116a64 --- /dev/null +++ b/winpr/libwinpr/crt/unicode.c @@ -0,0 +1,140 @@ +/** + * WinPR: Windows Portable Runtime + * Unicode Conversion (CRT) + * + * Copyright 2012 Marc-Andre Moreau + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include +#include + +#ifndef _WIN32 + +#include "utf.h" + +/* + * Advanced String Techniques in C++ - Part I: Unicode + * http://www.flipcode.com/archives/Advanced_String_Techniques_in_C-Part_I_Unicode.shtml + */ + +/* + * Conversion *to* Unicode + * MultiByteToWideChar: http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072/ + */ + +int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, + int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar) +{ + int length; + const BYTE* sourceStart; + LPWSTR targetStart; + ConversionResult result; + + /* If cbMultiByte is 0, the function fails */ + + if (cbMultiByte == 0) + return 0; + + /* If cbMultiByte is -1, the string is null-terminated */ + + if (cbMultiByte == -1) + cbMultiByte = strlen((char*) lpMultiByteStr) + 1; + + if (!lpWideCharStr) + lpWideCharStr = (LPWSTR) malloc((cbMultiByte + 1) * sizeof(WCHAR) * 4); + + sourceStart = (const BYTE*) lpMultiByteStr; + targetStart = lpWideCharStr; + + result = ConvertUTF8toUTF16(&sourceStart, &sourceStart[cbMultiByte], + &targetStart, &targetStart[((cbMultiByte + 1) * 4) / sizeof(WCHAR)], strictConversion); + length = targetStart - ((WCHAR*) lpWideCharStr); + lpWideCharStr[length] = '\0'; + + cchWideChar = length; + + /* + * if cchWideChar is 0, the function returns the required buffer size + * in characters for lpWideCharStr and makes no use of the output parameter itself. 
+ */ + + if (cchWideChar == 0) + { + free(lpWideCharStr); + return cchWideChar; + } + + return cchWideChar; +} + +/* + * Conversion *from* Unicode + * WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/ + */ + +int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, + LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar) +{ + int length; + const WCHAR* sourceStart; + BYTE* targetStart; + ConversionResult result; + + /* If cchWideChar is 0, the function fails */ + + if (cchWideChar == 0) + return 0; + + /* If cchWideChar is -1, the string is null-terminated */ + + if (cchWideChar == -1) + cchWideChar = _wcslen(lpWideCharStr) + 1; + + if (!lpMultiByteStr) + lpMultiByteStr = (LPSTR) malloc((cchWideChar + 1) * 4); + + sourceStart = (WCHAR*) lpWideCharStr; + targetStart = (BYTE*) lpMultiByteStr; + + result = ConvertUTF16toUTF8(&sourceStart, &sourceStart[cchWideChar], + &targetStart, &targetStart[(cchWideChar + 1) * 4], strictConversion); + length = targetStart - ((BYTE*) lpMultiByteStr); + lpMultiByteStr[length] = '\0'; + + cbMultiByte = length; + + /* + * if cbMultiByte is 0, the function returns the required buffer size + * in bytes for lpMultiByteStr and makes no use of the output parameter itself. + */ + + if (cbMultiByte == 0) + { + free(lpMultiByteStr); + return cbMultiByte; + } + + return cbMultiByte; +} + +#endif + diff --git a/winpr/libwinpr/crt/utf.c b/winpr/libwinpr/crt/utf.c new file mode 100644 index 000000000..14c6bb0d4 --- /dev/null +++ b/winpr/libwinpr/crt/utf.c @@ -0,0 +1,623 @@ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. 
If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Source code file. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. + June 2002: Tim Dodd added detection and handling of incomplete + source sequences, enhanced error detection, added casts + to eliminate compiler warnings. + July 2003: slight mods to back out aggressive FFFE detection. + Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + + See the header file "utf.h" for complete documentation. 
+ +------------------------------------------------------------------------ */ + +#include "utf.h" + +static const int halfShift = 10; /* used for shifting by 10 bits */ + +static const DWORD halfBase = 0x0010000UL; +static const DWORD halfMask = 0x3FFUL; + +#define UNI_SUR_HIGH_START (DWORD)0xD800 +#define UNI_SUR_HIGH_END (DWORD)0xDBFF +#define UNI_SUR_LOW_START (DWORD)0xDC00 +#define UNI_SUR_LOW_END (DWORD)0xDFFF + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF16 ( + const DWORD** sourceStart, const DWORD* sourceEnd, + WCHAR** targetStart, WCHAR* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const DWORD* source = *sourceStart; + WCHAR* target = *targetStart; + while (source < sourceEnd) { + DWORD ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (WCHAR)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! 
*/ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (WCHAR)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (WCHAR)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF32 ( + const WCHAR** sourceStart, const WCHAR* sourceEnd, + DWORD** targetStart, DWORD* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const WCHAR* source = *sourceStart; + DWORD* target = *targetStart; + DWORD ch, ch2; + while (source < sourceEnd) { + const WCHAR* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! 
*/ + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; +#ifdef CVTUTF_DEBUG +if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); +} +#endif + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. + */ +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. + */ +static const DWORD offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +/* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... etc.). Remember that sequences + * for *legal* UTF-8 will be 4 or fewer bytes total. 
+ */ +static const BYTE firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +/* --------------------------------------------------------------------- */ + +/* The interface converts a whole buffer to avoid function-call overhead. + * Constants have been gathered. Loops & conditionals have been removed as + * much as possible for efficiency, in favor of drop-through switches. + * (See "Note A" at the bottom of the file for equivalent code.) + * If your compiler supports it, the "isLegalUTF8" call can be turned + * into an inline function. + */ + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF8( + const WCHAR** sourceStart, const WCHAR* sourceEnd, + BYTE** targetStart, BYTE* targetEnd, ConversionFlags flags) +{ + ConversionResult result = conversionOK; + const WCHAR* source = *sourceStart; + BYTE* target = *targetStart; + + while (source < sourceEnd) + { + DWORD ch; + unsigned short bytesToWrite = 0; + const DWORD byteMask = 0xBF; + const DWORD byteMark = 0x80; + const WCHAR* oldSource = source; /* In case we have to back up because of target overflow. */ + + ch = *source++; + + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) + { + /* If the 16 bits following the high surrogate are in the source buffer... */ + + if (source < sourceEnd) + { + DWORD ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) + { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } + else if (flags == strictConversion) + { + /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + else + { + /* We don't have the 16 bits following the high surrogate. 
*/ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } + else if (flags == strictConversion) + { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) + { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + + /* Figure out how many bytes the result will require */ + if (ch < (DWORD) 0x80) + { + bytesToWrite = 1; + } + else if (ch < (DWORD) 0x800) + { + bytesToWrite = 2; + } + else if (ch < (DWORD) 0x10000) + { + bytesToWrite = 3; + } + else if (ch < (DWORD) 0x110000) + { + bytesToWrite = 4; + } + else + { + bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + + if (target > targetEnd) + { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; + result = targetExhausted; + break; + } + + switch (bytesToWrite) + { + /* note: everything falls through. */ + case 4: *--target = (BYTE)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (BYTE)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (BYTE)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (BYTE)(ch | firstByteMark[bytesToWrite]); + } + + target += bytesToWrite; + } + + *sourceStart = source; + *targetStart = target; + + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * If not calling this from ConvertUTF8to*, then the length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns FALSE. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. 
+ */ + +static BOOL isLegalUTF8(const BYTE *source, int length) +{ + BYTE a; + const BYTE *srcptr = source + length; + + switch (length) + { + default: + return FALSE; + + /* Everything else falls through when "TRUE"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return FALSE; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return FALSE; + case 2: if ((a = (*--srcptr)) > 0xBF) return FALSE; + + switch (*source) + { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return FALSE; break; + case 0xED: if (a > 0x9F) return FALSE; break; + case 0xF0: if (a < 0x90) return FALSE; break; + case 0xF4: if (a > 0x8F) return FALSE; break; + default: if (a < 0x80) return FALSE; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return FALSE; + } + + if (*source > 0xF4) + return FALSE; + + return TRUE; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. 
+ */ +BOOL isLegalUTF8Sequence(const BYTE *source, const BYTE *sourceEnd) +{ + int length = trailingBytesForUTF8[*source] + 1; + + if (source + length > sourceEnd) + return FALSE; + + return isLegalUTF8(source, length); +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF16( + const BYTE** sourceStart, const BYTE* sourceEnd, + WCHAR** targetStart, WCHAR* targetEnd, ConversionFlags flags) +{ + ConversionResult result = conversionOK; + const BYTE* source = *sourceStart; + WCHAR* target = *targetStart; + + while (source < sourceEnd) + { + DWORD ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + + if (source + extraBytesToRead >= sourceEnd) + { + result = sourceExhausted; + break; + } + + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead + 1)) + { + result = sourceIllegal; + break; + } + + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) + { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) + { + source -= (extraBytesToRead + 1); /* Back up source pointer! 
*/ + result = targetExhausted; + break; + } + + if (ch <= UNI_MAX_BMP) + { + /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) + { + if (flags == strictConversion) + { + source -= (extraBytesToRead + 1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + else + { + *target++ = UNI_REPLACEMENT_CHAR; + } + } + else + { + *target++ = (WCHAR) ch; /* normal case */ + } + } + else if (ch > UNI_MAX_UTF16) + { + if (flags == strictConversion) + { + result = sourceIllegal; + source -= (extraBytesToRead + 1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } + else + { + *target++ = UNI_REPLACEMENT_CHAR; + } + } + else + { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + + if (target + 1 >= targetEnd) + { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; + break; + } + + ch -= halfBase; + *target++ = (WCHAR)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (WCHAR)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + + *sourceStart = source; + *targetStart = target; + + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF8 ( + const DWORD** sourceStart, const DWORD* sourceEnd, + BYTE** targetStart, BYTE* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const DWORD* source = *sourceStart; + BYTE* target = *targetStart; + while (source < sourceEnd) { + DWORD ch; + unsigned short bytesToWrite = 0; + const DWORD byteMask = 0xBF; + const DWORD byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result 
will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (DWORD)0x80) { bytesToWrite = 1; + } else if (ch < (DWORD)0x800) { bytesToWrite = 2; + } else if (ch < (DWORD)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (BYTE)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (BYTE)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (BYTE)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (BYTE) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF32 ( + const BYTE** sourceStart, const BYTE* sourceEnd, + DWORD** targetStart, DWORD* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const BYTE* source = *sourceStart; + DWORD* target = *targetStart; + while (source < sourceEnd) { + DWORD ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. 
+ */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- + + Note A. + The fall-through switches in UTF-8 reading code save a + temp variable, some decrements & conditionals. The switches + are equivalent to the following loop: + { + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); + } + In UTF-8 writing code, the switches on "bytesToWrite" are + similarly unrolled loops. + + --------------------------------------------------------------------- */ diff --git a/winpr/libwinpr/crt/utf.h b/winpr/libwinpr/crt/utf.h new file mode 100644 index 000000000..ffe161196 --- /dev/null +++ b/winpr/libwinpr/crt/utf.h @@ -0,0 +1,152 @@ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. 
No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ *
+ * Limitations on Rights to Redistribute This Code
+ *
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+/* ---------------------------------------------------------------------
+
+    Conversions between UTF32, UTF-16, and UTF-8.  Header file.
+
+    Several functions are included here, forming a complete set of
+    conversions between the three formats.  UTF-7 is not included
+    here, but is handled in a separate source file.
+
+    Each of these routines takes pointers to input buffers and output
+    buffers.  The input buffers are const.
+
+    Each routine converts the text between *sourceStart and sourceEnd,
+    putting the result into the buffer between *targetStart and
+    targetEnd. Note: the end pointers are *after* the last item: e.g.
+    *(sourceEnd - 1) is the last item.
+
+    The return result indicates whether the conversion was successful,
+    and if not, whether the problem was in the source or target buffers.
+    (Only the first encountered problem is indicated.)
+
+    After the conversion, *sourceStart and *targetStart are both
+    updated to point to the end of last text successfully converted in
+    the respective buffers.
+
+    Input parameters:
+	sourceStart - pointer to a pointer to the source buffer.
+		The contents of this are modified on return so that
+		it points at the next thing to be converted.
+	targetStart - similarly, pointer to pointer to the target buffer.
+	sourceEnd, targetEnd - respectively pointers to the ends of the
+		two buffers, for overflow checking only.
+
+    These conversion functions take a ConversionFlags argument. When this
+    flag is set to strict, both irregular sequences and isolated surrogates
+    will cause an error.  When the flag is set to lenient, both irregular
+    sequences and isolated surrogates are converted.
+
+    Whether the flag is strict or lenient, all illegal sequences will cause
+    an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
+    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
+    must check for illegal sequences.
+
+    When the flag is set to lenient, characters over 0x10FFFF are converted
+    to the replacement character; otherwise (when the flag is set to strict)
+    they constitute an error.
+
+    Output parameters:
+	The value "sourceIllegal" is returned from some routines if the input
+	sequence is malformed.  When "sourceIllegal" is returned, the source
+	value will point to the illegal value that caused the problem. E.g.,
+	in UTF-8 when a sequence is malformed, it points to the start of the
+	malformed sequence.
+
+    Author: Mark E. Davis, 1994.
+    Rev History: Rick McGowan, fixes & updates May 2001.
+		 Fixes & updates, Sept 2001.
+
+------------------------------------------------------------------------ */
+
+#ifndef FREERDP_UNICODE_CONVERT_UTF_H
+#define FREERDP_UNICODE_CONVERT_UTF_H
+
+#include <winpr/wtypes.h>
+
+/*
+ * Character Types:
+ *
+ * UTF8:	BYTE	8 bits
+ * UTF16:	WCHAR	16 bits
+ * UTF32:	DWORD	32 bits
+ */
+
+/* Some fundamental constants (fully parenthesized so they expand safely in any expression) */
+#define UNI_REPLACEMENT_CHAR	((DWORD)0x0000FFFD)
+#define UNI_MAX_BMP		((DWORD)0x0000FFFF)
+#define UNI_MAX_UTF16		((DWORD)0x0010FFFF)
+#define UNI_MAX_UTF32		((DWORD)0x7FFFFFFF)
+#define UNI_MAX_LEGAL_UTF32	((DWORD)0x0010FFFF)
+
+typedef enum
+{
+	conversionOK, /* conversion successful */
+	sourceExhausted, /* partial character in source, but hit end */
+	targetExhausted, /* insuff. room in target for conversion */
+	sourceIllegal /* source sequence is illegal/malformed */
+} ConversionResult;
+
+typedef enum
+{
+	strictConversion = 0, /* surrogates and out-of-range values are errors */
+	lenientConversion /* such values are converted or replaced instead */
+} ConversionFlags;
+
+/* This is for C++ and does no harm in C */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ConversionResult ConvertUTF8toUTF16(
+		const BYTE** sourceStart, const BYTE* sourceEnd,
+		WCHAR** targetStart, WCHAR* targetEnd, ConversionFlags flags);
+
+ConversionResult ConvertUTF16toUTF8(
+		const WCHAR** sourceStart, const WCHAR* sourceEnd,
+		BYTE** targetStart, BYTE* targetEnd, ConversionFlags flags);
+
+ConversionResult ConvertUTF8toUTF32(
+		const BYTE** sourceStart, const BYTE* sourceEnd,
+		DWORD** targetStart, DWORD* targetEnd, ConversionFlags flags);
+
+ConversionResult ConvertUTF32toUTF8(
+		const DWORD** sourceStart, const DWORD* sourceEnd,
+		BYTE** targetStart, BYTE* targetEnd, ConversionFlags flags);
+
+ConversionResult ConvertUTF16toUTF32(
+		const WCHAR** sourceStart, const WCHAR* sourceEnd,
+		DWORD** targetStart, DWORD* targetEnd, ConversionFlags flags);
+
+ConversionResult ConvertUTF32toUTF16(
+		const DWORD** sourceStart, const DWORD* sourceEnd,
+		WCHAR** targetStart, WCHAR* targetEnd, ConversionFlags flags);
+
+BOOL isLegalUTF8Sequence(const BYTE *source, const BYTE *sourceEnd);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FREERDP_UNICODE_CONVERT_UTF_H */
+