stdlib: Improve Unicode support and consistency in string comparison functions.

SDL_strcasecmp (even when calling into a C runtime) does not work with
Unicode chars, and depending on the user's locale, might not work with
even basic ASCII strings.

This implements the function from scratch, using "case-folding,"
which is a more robust method that deals with various languages. It
involves a hashtable of a few hundred codepoints that are "uppercase" and
how to map them to lowercase equivalents (possibly increasing the size of
the string in the process). The vast majority of human languages (and
Unicode) do not have letters with different cases, but still, this static
table takes about 10 kilobytes on a 64-bit machine.

Even this will fail in one known case: the Turkish 'i' folds differently
if you're writing in Turkish vs other languages. Generally this is seen as
unfortunate collateral damage in cases where you can't specify the language
in use.

In addition to case-folding the codepoints, the new functions also know how
to decode the various formats to turn them into codepoints in the first
place, instead of blindly stepping by one byte (or one wchar_t) per
character.

Also included is casefolding.txt from the Unicode Consortium and a perl
script to generate the hashtable from that text file, so we can trivially
update this if new languages are added in the future.

A simple test using the new function:

```c
 #include <SDL3/SDL.h>

 int main(void)
 {
     const char *a = "α ε η";
     const char *b = "Α Ε Η";
     SDL_Log("    strcasecmp(\"%s\", \"%s\") == %d\n", a, b, strcasecmp(a, b));
     SDL_Log("SDL_strcasecmp(\"%s\", \"%s\") == %d\n", a, b, SDL_strcasecmp(a, b));
     return 0;
 }
```

Produces:

```
INFO:     strcasecmp("α ε η", "Α Ε Η") == 32
INFO: SDL_strcasecmp("α ε η", "Α Ε Η") == 0
```

glibc strcasecmp() fails to compare a Greek lowercase string to its uppercase
equivalent, even with a UTF-8 locale, but SDL_strcasecmp() works.

Other SDL_stdinc.h functions are changed to be more consistent, which is to
say they now ignore any C runtime and often dictate that only English-based
low-ASCII works with them.

Fixes Issue #9313.
This commit is contained in:
Ryan C. Gordon 2024-03-26 13:22:38 -04:00
parent 4659a84bd1
commit a5c892d2c3
17 changed files with 4971 additions and 210 deletions

View File

@ -1028,7 +1028,6 @@ if(SDL_LIBC)
set(available_headers)
set(HAVE_LIBC TRUE)
set(headers_to_check
ctype.h
float.h
iconv.h
inttypes.h
@ -1081,11 +1080,11 @@ if(SDL_LIBC)
)
if(WINDOWS)
list(APPEND symbols_to_check
_stricmp _strlwr _strnicmp _strrev _strupr _ui64toa _uitoa _ultoa _wcsdup _wcsicmp _wcsnicmp
_strrev _ui64toa _uitoa _ultoa _wcsdup
)
else()
list(APPEND symbols_to_check
strcasecmp strcasestr strncasecmp wcscasecmp wcsncasecmp
strcasestr
)
endif()
check_library_exists(m pow "" HAVE_LIBM)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,322 @@
#!/usr/bin/perl -w
# Simple DirectMedia Layer
# Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
#
# This software is provided 'as-is', without any express or implied
# warranty. In no event will the authors be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
# 1. The origin of this software must not be misrepresented; you must not
# claim that you wrote the original software. If you use this software
# in a product, an acknowledgment in the product documentation would be
# appreciated but is not required.
# 2. Altered source versions must be plainly marked as such, and must not be
# misrepresented as being the original software.
# 3. This notice may not be removed or altered from any source distribution.
# This script was originally written by Ryan C. Gordon for PhysicsFS
# ( https://icculus.org/physfs/ ), under the zlib license (the same license
# that SDL itself uses).
use warnings;
use strict;
# Number of buckets in each generated hashtable. The bucket index is computed
# below as (codepoint ^ (codepoint >> 8)) & (count - 1), so each count should
# be a power of two for every bucket to be reachable.
# NOTE(review): the C lookup code appears to hard-code the matching masks
# (0xFF, 15 and 3) -- confirm both sides are updated together.
my $HASHBUCKETS1_16 = 256;  # 16-bit codepoints that fold to one codepoint
my $HASHBUCKETS1_32 = 16;   # codepoints > 0xFFFF that fold to one codepoint
my $HASHBUCKETS2_16 = 16;   # 16-bit codepoints that fold to two codepoints
my $HASHBUCKETS3_16 = 4;    # 16-bit codepoints that fold to three codepoints
# Running estimate, in bytes, of the generated tables; reported on STDERR at exit.
my $mem_used = 0;
# Emit the fixed prologue of the generated C header: license text, the
# "generated file" warning, the opening include guard, and the struct types
# used by the mapping arrays and hash buckets printed later in this script.
# (The heredoc interpolates, hence the escaped '@' in the e-mail address.)
print <<__EOF__;
/*
Simple DirectMedia Layer
Copyright (C) 1997-2024 Sam Lantinga <slouken\@libsdl.org>
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
/*
* This data was generated by SDL/build-scripts/makecasefoldhashtable.pl
*
* Do not manually edit this file!
*/
#ifndef SDL_casefolding_h_
#define SDL_casefolding_h_
/* We build three simple hashmaps here: one that maps Unicode codepoints to
a one, two, or three lowercase codepoints. To retrieve this info: look at
case_fold_hashX, where X is 1, 2, or 3. Most foldable codepoints fold to one,
a few dozen fold to two, and a handful fold to three. If the codepoint isn't
in any of these hashes, it doesn't fold (no separate upper and lowercase).
Almost all these codepoints fit into 16 bits, so we hash them as such to save
memory. If a codepoint is > 0xFFFF, we have separate hashes for them,
since there are (currently) only about 120 of them and (currently) all of them
map to a single lowercase codepoint. */
typedef struct CaseFoldMapping1_32
{
Uint32 from;
Uint32 to0;
} CaseFoldMapping1_32;
typedef struct CaseFoldMapping1_16
{
Uint16 from;
Uint16 to0;
} CaseFoldMapping1_16;
typedef struct CaseFoldMapping2_16
{
Uint16 from;
Uint16 to0;
Uint16 to1;
} CaseFoldMapping2_16;
typedef struct CaseFoldMapping3_16
{
Uint16 from;
Uint16 to0;
Uint16 to1;
Uint16 to2;
} CaseFoldMapping3_16;
typedef struct CaseFoldHashBucket1_16
{
const CaseFoldMapping1_16 *list;
const Uint8 count;
} CaseFoldHashBucket1_16;
typedef struct CaseFoldHashBucket1_32
{
const CaseFoldMapping1_32 *list;
const Uint8 count;
} CaseFoldHashBucket1_32;
typedef struct CaseFoldHashBucket2_16
{
const CaseFoldMapping2_16 *list;
const Uint8 count;
} CaseFoldHashBucket2_16;
typedef struct CaseFoldHashBucket3_16
{
const CaseFoldMapping3_16 *list;
const Uint8 count;
} CaseFoldHashBucket3_16;
__EOF__
# One accumulator string per hash bucket. Each starts out empty and later
# collects the C initializer lines for every mapping that hashes to it.
my @foldPairs1_16 = ('') x $HASHBUCKETS1_16;
my @foldPairs2_16 = ('') x $HASHBUCKETS2_16;
my @foldPairs3_16 = ('') x $HASHBUCKETS3_16;
my @foldPairs1_32 = ('') x $HASHBUCKETS1_32;
# Read casefolding.txt from the current directory and sort every usable
# mapping into the per-bucket accumulator strings, formatted as C initializers.
open(FH,'<','casefolding.txt') or die("failed to open casefolding.txt: $!\n");
while (<FH>) {
chomp;
# strip comments from textfile...
s/\#.*\Z//;
# strip whitespace...
s/\A\s+//;
s/\s+\Z//;
# Data lines look like "<code>; <status>; <mapping>;" -- skip anything else
# (which includes the lines that were only a comment or blank).
next if not /\A([a-fA-F0-9]+)\;\s*(.)\;\s*(.+)\;/;
my ($code, $status, $mapping) = ($1, $2, $3);
my $hexxed = hex($code);
#print("// code '$code' status '$status' mapping '$mapping'\n");
# Only statuses 'C' and 'F' are used; lines with any other status are ignored.
if (($status eq 'C') or ($status eq 'F')) {
my ($map1, $map2, $map3) = (undef, undef, undef);
# Peel up to three hex codepoints off the front of $mapping, one per
# substitution; anything left over afterwards means a fold longer than three.
$map1 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
$map2 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
$map3 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
die("mapping space too small for '$code'\n") if ($mapping ne '');
die("problem parsing mapping for '$code'\n") if (not defined($map1));
if ($hexxed < 128) {
# Just ignore these, we'll handle the low-ASCII ones ourselves.
} elsif ($hexxed > 0xFFFF) {
# We just need to add the 32-bit 2 and/or 3 codepoint maps if this die()'s here.
die("Uhoh, a codepoint > 0xFFFF that folds to multiple codepoints! Fixme.") if defined($map2);
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_32-1));
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
$foldPairs1_32[$hashed] .= " { 0x$code, 0x$map1 },\n";
# two Uint32 fields per CaseFoldMapping1_32 entry
$mem_used += 8;
} elsif (not defined($map2)) {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_16-1));
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
$foldPairs1_16[$hashed] .= " { 0x$code, 0x$map1 },\n";
# two Uint16 fields per CaseFoldMapping1_16 entry
$mem_used += 4;
} elsif (not defined($map3)) {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS2_16-1));
#print("// hexxed '$hexxed' hashed2 '$hashed'\n");
$foldPairs2_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2 },\n";
# three Uint16 fields per CaseFoldMapping2_16 entry
$mem_used += 6;
} else {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS3_16-1));
#print("// hexxed '$hexxed' hashed3 '$hashed'\n");
$foldPairs3_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
# four Uint16 fields per CaseFoldMapping3_16 entry
$mem_used += 8;
}
}
}
close(FH);
# Print one "static const CaseFoldMapping<suffix> case_fold<suffix>_NNN[]"
# array for every non-empty bucket of a table.
#   $suffix  - table name suffix, e.g. '1_16'
#   $buckets - number of buckets in that table
#   $pairs   - reference to the per-bucket accumulator strings; the trailing
#              ",\n" of each entry is stripped in place
sub emit_mapping_arrays {
    my ($suffix, $buckets, $pairs) = @_;
    for (my $i = 0; $i < $buckets; $i++) {
        $pairs->[$i] =~ s/,\n\Z//;
        my $str = $pairs->[$i];
        next if $str eq '';
        # Bucket index, zero-padded to three digits, makes the symbol unique.
        my $sym = sprintf('case_fold%s_%03d', $suffix, $i);
        print("static const CaseFoldMapping${suffix} ${sym}[] = {\n$str\n};\n\n");
    }
}

# Same emission order as before, so the generated header is unchanged.
emit_mapping_arrays('1_16', $HASHBUCKETS1_16, \@foldPairs1_16);
emit_mapping_arrays('1_32', $HASHBUCKETS1_32, \@foldPairs1_32);
emit_mapping_arrays('2_16', $HASHBUCKETS2_16, \@foldPairs2_16);
emit_mapping_arrays('3_16', $HASHBUCKETS3_16, \@foldPairs3_16);
# Print the "case_fold_hash<suffix>" bucket table for one mapping table.
# Each element points at that bucket's mapping array, or is { NULL, 0 } when
# the bucket is empty.
#   $suffix  - table name suffix, e.g. '1_16'
#   $buckets - number of buckets in that table
#   $pairs   - reference to the per-bucket accumulator strings
sub emit_hash_table {
    my ($suffix, $buckets, $pairs) = @_;
    print("static const CaseFoldHashBucket${suffix} case_fold_hash${suffix}[] = {\n");
    for (my $i = 0; $i < $buckets; $i++) {
        if ($pairs->[$i] eq '') {
            print(" { NULL, 0 },\n");
        } else {
            my $sym = sprintf('case_fold%s_%03d', $suffix, $i);
            print(" { $sym, SDL_arraysize($sym) },\n");
        }
        # NOTE(review): 12 bytes per bucket is the existing estimate; a pointer
        # plus a Uint8 is probably 16 bytes with padding on 64-bit -- confirm.
        $mem_used += 12;
    }
    print("};\n\n");
}

# Same emission order as before, so the generated header is unchanged.
emit_hash_table('1_16', $HASHBUCKETS1_16, \@foldPairs1_16);
emit_hash_table('1_32', $HASHBUCKETS1_32, \@foldPairs1_32);
emit_hash_table('2_16', $HASHBUCKETS2_16, \@foldPairs2_16);
emit_hash_table('3_16', $HASHBUCKETS3_16, \@foldPairs3_16);
# Close the include guard of the generated header.
print("#endif /* SDL_casefolding_h_ */\n");
# The size estimate goes to STDERR so it never ends up in the header itself.
print {*STDERR} "Memory required for case-folding hashtable: $mem_used bytes\n";
exit(0);
# end of makecasefoldhashtable.pl ...

View File

@ -1401,6 +1401,18 @@ This header has been removed and a simplified version of this API has been added
The standard C headers like stdio.h and stdlib.h are no longer included, you should include them directly in your project if you use non-SDL C runtime functions.
M_PI is no longer defined in SDL_stdinc.h, you can use the new symbols SDL_PI_D (double) and SDL_PI_F (float) instead.
SDL3 attempts to apply consistency to case-insensitive string functions. In SDL2, things like SDL_strcasecmp() would usually only work on English letters, and depending on the user's locale, possibly not even those. In SDL3, consistency is applied:
- Many things that don't care about case-insensitivity, like SDL_strcmp(), continue to work with any null-terminated string of bytes, even if it happens to be malformed UTF-8.
- SDL_strcasecmp() expects valid UTF-8 strings, and will attempt to support _most_ Unicode characters with a technique known as "case-folding," which is to say it can match 'A' and 'a', and also 'Η' and 'η', but ALSO 'ß' and "ss". This is _probably_ how most apps assumed it worked in SDL2 and won't need any changes.
- SDL_strncasecmp() works the same, but the third parameter takes _bytes_, as before, so SDL_strlen() can continue to be used with it. If a string hits the limit in the middle of a codepoint, the half-processed bytes of the codepoint will be treated as a collection of U+FFFD (REPLACEMENT CHARACTER) codepoints, which you probably don't want.
- SDL_wcscasecmp() and SDL_wcsncasecmp() work the same way but operate on UTF-16 or UTF-32 encoded strings, depending on what the platform considers "wchar_t" to be. SDL_wcsncasecmp's third parameter is number of wchar_t values, not bytes, but UTF-16 has the same concerns as UTF-8 for variable-length codepoints.
- SDL_strcasestr() expects valid UTF-8 strings, and will compare codepoints using case-folding.
- SDL_tolower() and SDL_toupper() continue to only work on single bytes (even though the parameter is an `int`) and _only_ convert low-ASCII English A through Z.
- SDL_strlwr() and SDL_strupr() operate on individual bytes (not UTF-8 codepoints) and only change low-ASCII English 'A' through 'Z'. These functions do not check the string for valid UTF-8 encoding.
- The ctype.h replacement SDL_is*() functions (SDL_isalpha, SDL_isdigit, etc) only work on low-ASCII characters and ignore user locale, assuming English. This makes these functions consistent in SDL3, but applications need to be careful to understand their limits.
Please note that the case-folding technique used by SDL3 will not produce correct results for the "Turkish 'I'"; this one letter is a surprisingly hard problem in the Unicode world, and since these functions do not specify the human language in use, we have chosen to ignore this problem.
The following functions have been renamed:
* SDL_strtokr() => SDL_strtok_r()

View File

@ -50,7 +50,6 @@
/* Useful headers */
#cmakedefine HAVE_ALLOCA_H 1
#cmakedefine HAVE_CTYPE_H 1
#cmakedefine HAVE_FLOAT_H 1
#cmakedefine HAVE_ICONV_H 1
#cmakedefine HAVE_INTTYPES_H 1
@ -97,10 +96,6 @@
#cmakedefine HAVE_WCSSTR 1
#cmakedefine HAVE_WCSCMP 1
#cmakedefine HAVE_WCSNCMP 1
#cmakedefine HAVE_WCSCASECMP 1
#cmakedefine HAVE__WCSICMP 1
#cmakedefine HAVE_WCSNCASECMP 1
#cmakedefine HAVE__WCSNICMP 1
#cmakedefine HAVE_WCSTOL 1
#cmakedefine HAVE_STRLEN 1
#cmakedefine HAVE_STRNLEN 1
@ -131,10 +126,6 @@
#cmakedefine HAVE_ATOF 1
#cmakedefine HAVE_STRCMP 1
#cmakedefine HAVE_STRNCMP 1
#cmakedefine HAVE__STRICMP 1
#cmakedefine HAVE_STRCASECMP 1
#cmakedefine HAVE__STRNICMP 1
#cmakedefine HAVE_STRNCASECMP 1
#cmakedefine HAVE_STRCASESTR 1
#cmakedefine HAVE_SSCANF 1
#cmakedefine HAVE_VSSCANF 1

View File

@ -36,7 +36,6 @@
#define HAVE_GCC_ATOMICS 1
#define HAVE_ALLOCA_H 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_INTTYPES_H 1
#define HAVE_LIMITS_H 1
@ -85,8 +84,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_STRCASESTR 1
#define HAVE_VSSCANF 1
#define HAVE_VSNPRINTF 1

View File

@ -35,7 +35,6 @@
/* Useful headers */
#define HAVE_ALLOCA_H 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_ICONV_H 1
#define HAVE_INTTYPES_H 1
@ -76,8 +75,6 @@
#define HAVE_WCSSTR 1
#define HAVE_WCSCMP 1
#define HAVE_WCSNCMP 1
#define HAVE_WCSCASECMP 1
#define HAVE_WCSNCASECMP 1
#define HAVE_STRLEN 1
#define HAVE_STRLCPY 1
#define HAVE_STRLCAT 1
@ -94,8 +91,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_SSCANF 1
#define HAVE_VSSCANF 1
#define HAVE_VSNPRINTF 1

View File

@ -28,7 +28,6 @@
#define HAVE_GCC_ATOMICS 1
#define HAVE_ALLOCA_H 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_INTTYPES_H 1
#define HAVE_LIMITS_H 1
@ -77,8 +76,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_STRCASESTR 1
#define HAVE_VSSCANF 1
#define HAVE_VSNPRINTF 1

View File

@ -32,7 +32,6 @@
/* Useful headers */
#define HAVE_ALLOCA_H 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_INTTYPES_H 1
#define HAVE_LIBUNWIND_H 1
@ -81,8 +80,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_STRCASESTR 1
#define HAVE_VSSCANF 1
#define HAVE_VSNPRINTF 1

View File

@ -118,7 +118,6 @@ typedef unsigned int uintptr_t;
#if HAVE_LIBC
/* Useful headers */
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_LIMITS_H 1
#define HAVE_MATH_H 1
@ -159,10 +158,6 @@ typedef unsigned int uintptr_t;
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE__STRICMP 1
#define HAVE__STRNICMP 1
#define HAVE__WCSICMP 1
#define HAVE__WCSNICMP 1
#define HAVE__WCSDUP 1
#define HAVE_SSCANF 1
#define HAVE_VSSCANF 1

View File

@ -57,7 +57,6 @@
#endif
/* Useful headers */
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_LIMITS_H 1
#define HAVE_MATH_H 1
@ -100,10 +99,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE__STRICMP 1
#define HAVE__STRNICMP 1
#define HAVE__WCSICMP 1
#define HAVE__WCSNICMP 1
#define HAVE__WCSDUP 1
#define HAVE_ACOS 1
#define HAVE_ASIN 1

View File

@ -57,7 +57,6 @@
#define HAVE_TPCSHRD_H 1
#define HAVE_LIBC 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_LIMITS_H 1
#define HAVE_MATH_H 1
@ -96,8 +95,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE__STRICMP 1
#define HAVE__STRNICMP 1
#define HAVE_VSNPRINTF 1
/* TODO, WinRT: consider using ??_s versions of the following */
/* #undef HAVE__STRLWR */

View File

@ -57,7 +57,6 @@
#endif
/* Useful headers */
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_LIMITS_H 1
#define HAVE_MATH_H 1
@ -100,10 +99,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE__STRICMP 1
#define HAVE__STRNICMP 1
#define HAVE__WCSICMP 1
#define HAVE__WCSNICMP 1
#define HAVE__WCSDUP 1
#define HAVE_ACOS 1
#define HAVE_ASIN 1

View File

@ -114,9 +114,6 @@
#elif defined(HAVE_STDINT_H)
#include <stdint.h>
#endif
#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#ifdef HAVE_MATH_H
#include <math.h>
#endif

2769
src/stdlib/SDL_casefolding.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -474,28 +474,7 @@ int SDL_abs(int x)
#endif
}
#ifdef HAVE_CTYPE_H
int SDL_isalpha(int x)
{
return isalpha(x);
}
int SDL_isalnum(int x) { return isalnum(x); }
int SDL_isdigit(int x) { return isdigit(x); }
int SDL_isxdigit(int x) { return isxdigit(x); }
int SDL_ispunct(int x) { return ispunct(x); }
int SDL_isspace(int x) { return isspace(x); }
int SDL_isupper(int x) { return isupper(x); }
int SDL_islower(int x) { return islower(x); }
int SDL_isprint(int x) { return isprint(x); }
int SDL_isgraph(int x) { return isgraph(x); }
int SDL_iscntrl(int x) { return iscntrl(x); }
int SDL_toupper(int x) { return toupper(x); }
int SDL_tolower(int x) { return tolower(x); }
#else
int SDL_isalpha(int x)
{
return (SDL_isupper(x)) || (SDL_islower(x));
}
int SDL_isalpha(int x) { return (SDL_isupper(x)) || (SDL_islower(x)); }
int SDL_isalnum(int x) { return (SDL_isalpha(x)) || (SDL_isdigit(x)); }
int SDL_isdigit(int x) { return ((x) >= '0') && ((x) <= '9'); }
int SDL_isxdigit(int x) { return (((x) >= 'A') && ((x) <= 'F')) || (((x) >= 'a') && ((x) <= 'f')) || (SDL_isdigit(x)); }
@ -508,19 +487,7 @@ int SDL_isgraph(int x) { return (SDL_isprint(x)) && ((x) != ' '); }
int SDL_iscntrl(int x) { return (((x) >= '\0') && ((x) <= '\x1f')) || ((x) == '\x7f'); }
int SDL_toupper(int x) { return ((x) >= 'a') && ((x) <= 'z') ? ('A' + ((x) - 'a')) : (x); }
int SDL_tolower(int x) { return ((x) >= 'A') && ((x) <= 'Z') ? ('a' + ((x) - 'A')) : (x); }
#endif
#if defined(HAVE_CTYPE_H) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
int SDL_isblank(int x)
{
return isblank(x);
}
#else
int SDL_isblank(int x)
{
return ((x) == ' ') || ((x) == '\t');
}
#endif
int SDL_isblank(int x) { return ((x) == ' ') || ((x) == '\t'); }
void *SDL_aligned_alloc(size_t alignment, size_t size)
{

View File

@ -28,6 +28,205 @@
#include <psp2/kernel/clib.h>
#endif
#include "SDL_casefolding.h"
// this is the Unicode REPLACEMENT CHARACTER, used for invalid codepoint values.
#define INVALID_UNICODE_CODEPOINT 0xFFFD
// SDL_SIZEOF_WCHAR_T is the size of wchar_t in bytes as a preprocessor
// constant, so it can be tested with #if (sizeof cannot). Prefer the
// compiler-provided __SIZEOF_WCHAR_T__, otherwise guess from the platform.
#if defined(__SIZEOF_WCHAR_T__)
#define SDL_SIZEOF_WCHAR_T __SIZEOF_WCHAR_T__
#elif defined(SDL_PLATFORM_WINDOWS)
#define SDL_SIZEOF_WCHAR_T 2
#else // assume everything else is UTF-32 (add more tests if compiler-assert fails below!)
#define SDL_SIZEOF_WCHAR_T 4
#endif
// Fail the build, rather than misdecode strings, if the guess above is wrong.
SDL_COMPILE_TIME_ASSERT(sizeof_wchar_t, sizeof(wchar_t) == SDL_SIZEOF_WCHAR_T);
// Case-fold a single codepoint. Both `from` and the output are UTF-32.
//
// `to` receives the folded codepoint(s); a fold can expand to as many as
// three codepoints, so `to` must have room for three Uint32 values.
// Returns how many codepoints were written (1, 2 or 3). A codepoint with no
// case-folding entry is copied to `to[0]` unchanged.
static int SDL_UnicodeCaseFold(const Uint32 from, Uint32 *to)
{
    // !!! FIXME: since the hashtable is static, maybe we should binary
    // !!! FIXME: search it instead of walking the whole bucket.

    if (from < 128) {
        // low-ASCII is handled directly instead of through the tables.
        if ((from >= 'A') && (from <= 'Z')) {
            *to = from + ('a' - 'A');
            return 1;
        }
    } else {
        // Same hash the table generator uses; each table masks it down to
        // its own bucket count.
        const Uint8 hash = ((from ^ (from >> 8)) & 0xFF);

        if (from > 0xFFFF) {
            // Outside the Basic Multilingual Plane: only single-codepoint folds exist.
            const CaseFoldHashBucket1_32 *wide = &case_fold_hash1_32[hash & 15];
            const CaseFoldMapping1_32 *entry = wide->list;
            for (int left = (int) wide->count; left > 0; left--, entry++) {
                if (entry->from == from) {
                    *to = entry->to0;
                    return 1;
                }
            }
        } else {
            const Uint16 key = (Uint16) from;

            // Most common case: folds to exactly one codepoint.
            const CaseFoldHashBucket1_16 *single = &case_fold_hash1_16[hash];
            const CaseFoldMapping1_16 *one = single->list;
            for (int left = (int) single->count; left > 0; left--, one++) {
                if (one->from == key) {
                    *to = one->to0;
                    return 1;
                }
            }

            // Less common: folds to two codepoints.
            const CaseFoldHashBucket2_16 *pairs = &case_fold_hash2_16[hash & 15];
            const CaseFoldMapping2_16 *two = pairs->list;
            for (int left = (int) pairs->count; left > 0; left--, two++) {
                if (two->from == key) {
                    to[0] = two->to0;
                    to[1] = two->to1;
                    return 2;
                }
            }

            // Rarest: folds to three codepoints.
            const CaseFoldHashBucket3_16 *triples = &case_fold_hash3_16[hash & 3];
            const CaseFoldMapping3_16 *three = triples->list;
            for (int left = (int) triples->count; left > 0; left--, three++) {
                if (three->from == key) {
                    to[0] = three->to0;
                    to[1] = three->to1;
                    to[2] = three->to2;
                    return 3;
                }
            }
        }
    }

    // No table entry: this codepoint folds to itself.
    *to = from;
    return 1;
}
// Expands to the complete body of a case-insensitive string compare.
//
// The macro is not self-contained: it must be expanded inside a function that
// has string pointers named `str1` and `str2` in scope, and it `return`s from
// that function (-1 or 1 at the first differing folded codepoint, 0 when both
// strings reach a 0 codepoint together). The expansion ends in `return 0`
// without a semicolon, so the use site supplies it.
//
//  bits         - 8, 16 or 32; selects SDL_StepUTF##bits as the decoder.
//  slen1/slen2  - expressions passed to the decoder as its length argument.
//  update_slen1/update_slen2 - statements run right after each decode; they
//                 may use `str1start`/`str2start`, which hold the string
//                 position from before that decode.
//
// Each decoded codepoint is case-folded into folded1/folded2 (up to three
// codepoints). head is the number of folded codepoints, tail is the next one
// to hand out; a new codepoint is decoded only once head == tail.
#define UNICODE_STRCASECMP(bits, slen1, slen2, update_slen1, update_slen2) \
Uint32 folded1[3], folded2[3]; \
int head1 = 0, tail1 = 0, head2 = 0, tail2 = 0; \
while (SDL_TRUE) { \
Uint32 cp1, cp2; \
if (head1 != tail1) { \
cp1 = folded1[tail1++]; \
} else { \
const Uint##bits *str1start = (const Uint##bits *) str1; \
head1 = SDL_UnicodeCaseFold(SDL_StepUTF##bits(&str1, slen1), folded1); \
update_slen1; \
cp1 = folded1[0]; \
tail1 = 1; \
} \
if (head2 != tail2) { \
cp2 = folded2[tail2++]; \
} else { \
const Uint##bits *str2start = (const Uint##bits *) str2; \
head2 = SDL_UnicodeCaseFold(SDL_StepUTF##bits(&str2, slen2), folded2); \
update_slen2; \
cp2 = folded2[0]; \
tail2 = 1; \
} \
if (cp1 < cp2) { \
return -1; \
} else if (cp1 > cp2) { \
return 1; \
} else if (cp1 == 0) { \
break; /* complete match. */ \
} \
} \
return 0
static Uint32 SDL_StepUTF8(const char **_str, const size_t slen)
{
const char *str = *_str;
const Uint32 octet = (Uint32) (slen ? ((Uint8) *str) : 0);
// !!! FIXME: this could have _way_ more error checking! Illegal surrogate codepoints, unexpected bit patterns, etc.
if (octet == 0) { // null terminator, end of string.
return 0; // don't advance `*_str`.
} else if ((octet & 0x80) == 0) { // 0xxxxxxx: one byte codepoint.
(*_str)++;
return octet;
} else if (((octet & 0xE0) == 0xC0) && (slen >= 2)) { // 110xxxxx 10xxxxxx: two byte codepoint.
if (slen >= 2) {
*_str += 2;
return ((octet & 0x1F) << 6) | (((Uint8) str[1]) & 0x3F);
}
} else if (((octet & 0xF0) == 0xE0) && (slen >= 3)) { // 1110xxxx 10xxxxxx 10xxxxxx: three byte codepoint.
*_str += 3;
const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 6;
const Uint32 octet3 = (Uint32) (((Uint8) str[2]) & 0x3F);
return ((octet & 0x0F) << 12) | octet2 | octet3;
} else if (((octet & 0xF8) == 0xF0) && (slen >= 4)) { // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx: four byte codepoint.
*_str += 4;
const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 12;
const Uint32 octet3 = ((Uint32) (((Uint8) str[2]) & 0x3F)) << 6;
const Uint32 octet4 = (Uint32) (((Uint8) str[3]) & 0x3F);
return ((octet & 0x07) << 18) | octet2 | octet3 | octet4;
}
// bogus byte, skip ahead, return a REPLACEMENT CHARACTER.
(*_str)++;
return INVALID_UNICODE_CODEPOINT;
}
#if (SDL_SIZEOF_WCHAR_T == 2)
// Decode one UTF-16 codepoint from `*_str` and advance `*_str` past it.
//
// `slen` is the number of 16-bit units that may be examined starting at
// `*_str`. Returns 0, without advancing, at a null terminator or when `slen`
// is zero. An orphaned low surrogate, or a high surrogate that is not
// followed (within `slen`) by a low surrogate, consumes one unit and yields
// INVALID_UNICODE_CODEPOINT.
static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen)
{
    if (!slen) {
        return 0; // out of input; don't read or advance.
    }

    const Uint16 *str = *_str;
    Uint32 cp = (Uint32) *(str++);
    if (cp == 0) {
        return 0; // don't advance string pointer.
    } else if ((cp >= 0xDC00) && (cp <= 0xDFFF)) {
        cp = INVALID_UNICODE_CODEPOINT; // Orphaned second half of surrogate pair
    } else if ((cp >= 0xD800) && (cp <= 0xDBFF)) { // start of surrogate pair!
        // Only look at the second unit if the caller's limit allows it; 0 is
        // outside the low-surrogate range, so it falls into the invalid path.
        const Uint32 pair = (slen >= 2) ? ((Uint32) *str) : 0;
        if ((pair < 0xDC00) || (pair > 0xDFFF)) {
            cp = INVALID_UNICODE_CODEPOINT;
        } else {
            str++; // eat the other surrogate.
            cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00));
        }
    }

    *_str = str;
    return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp;
}
#elif (SDL_SIZEOF_WCHAR_T == 4)
// Decode one UTF-32 codepoint from `*_str` and advance `*_str` past it.
//
// `slen` is the number of 32-bit units that may be examined. Returns 0,
// without advancing, at a null terminator or when `slen` is zero. Values
// above U+10FFFF are consumed and reported as INVALID_UNICODE_CODEPOINT.
static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen)
{
    // With no input left the string is not read at all.
    const Uint32 value = slen ? **_str : 0;

    if (value == 0) {
        return 0; // end of input: leave the string pointer where it is.
    }

    *_str += 1;
    if (value > 0x10FFFF) {
        return INVALID_UNICODE_CODEPOINT;
    }
    return value;
}
#endif
#if !defined(HAVE_VSSCANF) || !defined(HAVE_STRTOL) || !defined(HAVE_WCSTOL) || !defined(HAVE_STRTOUL) || !defined(HAVE_STRTOD) || !defined(HAVE_STRTOLL) || !defined(HAVE_STRTOULL)
#define SDL_isupperhex(X) (((X) >= 'A') && ((X) <= 'F'))
#define SDL_islowerhex(X) (((X) >= 'a') && ((X) <= 'f'))
@ -507,83 +706,41 @@ int SDL_wcsncmp(const wchar_t *str1, const wchar_t *str2, size_t maxlen)
#endif /* HAVE_WCSNCMP */
}
int SDL_wcscasecmp(const wchar_t *str1, const wchar_t *str2)
int SDL_wcscasecmp(const wchar_t *wstr1, const wchar_t *wstr2)
{
#ifdef HAVE_WCSCASECMP
return wcscasecmp(str1, str2);
#elif defined(HAVE__WCSICMP)
return _wcsicmp(str1, str2);
#if (SDL_SIZEOF_WCHAR_T == 2)
const Uint16 *str1 = (const Uint16 *) wstr1;
const Uint16 *str2 = (const Uint16 *) wstr2;
UNICODE_STRCASECMP(16, 2, 2, (void) str1start, (void) str2start); // always NULL-terminated, no need to adjust lengths.
#elif (SDL_SIZEOF_WCHAR_T == 4)
const Uint32 *str1 = (const Uint32 *) wstr1;
const Uint32 *str2 = (const Uint32 *) wstr2;
UNICODE_STRCASECMP(32, 1, 1, (void) str1start, (void) str2start); // always NULL-terminated, no need to adjust lengths.
#else
wchar_t a = 0;
wchar_t b = 0;
while (*str1 && *str2) {
/* FIXME: This doesn't actually support wide characters */
if (*str1 >= 0x80 || *str2 >= 0x80) {
a = *str1;
b = *str2;
} else {
a = (wchar_t)SDL_toupper((unsigned char)*str1);
b = (wchar_t)SDL_toupper((unsigned char)*str2);
}
if (a != b) {
break;
}
++str1;
++str2;
}
#error Unexpected wchar_t size
#endif
/* FIXME: This doesn't actually support wide characters */
if (*str1 >= 0x80 || *str2 >= 0x80) {
a = *str1;
b = *str2;
} else {
a = (wchar_t)SDL_toupper((unsigned char)*str1);
b = (wchar_t)SDL_toupper((unsigned char)*str2);
}
return (int)((unsigned int)a - (unsigned int)b);
#endif /* HAVE__WCSICMP */
return -1;
}
int SDL_wcsncasecmp(const wchar_t *str1, const wchar_t *str2, size_t maxlen)
int SDL_wcsncasecmp(const wchar_t *wstr1, const wchar_t *wstr2, size_t maxlen)
{
#ifdef HAVE_WCSNCASECMP
return wcsncasecmp(str1, str2, maxlen);
#elif defined(HAVE__WCSNICMP)
return _wcsnicmp(str1, str2, maxlen);
#else
wchar_t a = 0;
wchar_t b = 0;
while (*str1 && *str2 && maxlen) {
/* FIXME: This doesn't actually support wide characters */
if (*str1 >= 0x80 || *str2 >= 0x80) {
a = *str1;
b = *str2;
} else {
a = (wchar_t)SDL_toupper((unsigned char)*str1);
b = (wchar_t)SDL_toupper((unsigned char)*str2);
}
if (a != b) {
break;
}
++str1;
++str2;
--maxlen;
}
size_t slen1 = maxlen;
size_t slen2 = maxlen;
if (maxlen == 0) {
return 0;
} else {
/* FIXME: This doesn't actually support wide characters */
if (*str1 >= 0x80 || *str2 >= 0x80) {
a = *str1;
b = *str2;
} else {
a = (wchar_t)SDL_toupper((unsigned char)*str1);
b = (wchar_t)SDL_toupper((unsigned char)*str2);
}
return (int)((unsigned int)a - (unsigned int)b);
}
#endif /* HAVE__WCSNICMP */
#if (SDL_SIZEOF_WCHAR_T == 2)
const Uint16 *str1 = (const Uint16 *) wstr1;
const Uint16 *str2 = (const Uint16 *) wstr2;
UNICODE_STRCASECMP(16, slen1, slen2, slen1 -= (size_t) (str1 - str1start), slen2 -= (size_t) (str2 - str2start));
#elif (SDL_SIZEOF_WCHAR_T == 4)
const Uint32 *str1 = (const Uint32 *) wstr1;
const Uint32 *str2 = (const Uint32 *) wstr2;
UNICODE_STRCASECMP(32, slen1, slen2, slen1 -= (size_t) (str1 - str1start), slen2 -= (size_t) (str2 - str2start));
#else
#error Unexpected wchar_t size
#endif
return -1;
}
long SDL_wcstol(const wchar_t *string, wchar_t **endp, int base)
@ -733,7 +890,7 @@ char *SDL_strrev(char *string)
char *b = &string[len - 1];
len /= 2;
while (len--) {
char c = *a; /* NOLINT(clang-analyzer-core.uninitialized.Assign) */
const char c = *a; /* NOLINT(clang-analyzer-core.uninitialized.Assign) */
*a++ = *b;
*b-- = c;
}
@ -743,30 +900,22 @@ char *SDL_strrev(char *string)
char *SDL_strupr(char *string)
{
#ifdef HAVE__STRUPR
return _strupr(string);
#else
char *bufp = string;
while (*bufp) {
*bufp = (char)SDL_toupper((unsigned char)*bufp);
++bufp;
}
return string;
#endif /* HAVE__STRUPR */
}
char *SDL_strlwr(char *string)
{
#ifdef HAVE__STRLWR
return _strlwr(string);
#else
char *bufp = string;
while (*bufp) {
*bufp = (char)SDL_tolower((unsigned char)*bufp);
++bufp;
}
return string;
#endif /* HAVE__STRLWR */
}
char *SDL_strchr(const char *string, int c)
@ -838,18 +987,14 @@ char *SDL_strstr(const char *haystack, const char *needle)
char *SDL_strcasestr(const char *haystack, const char *needle)
{
#ifdef HAVE_STRCASESTR
return SDL_const_cast(char *, strcasestr(haystack, needle));
#else
size_t length = SDL_strlen(needle);
while (*haystack) {
const size_t length = SDL_strlen(needle);
do {
if (SDL_strncasecmp(haystack, needle, length) == 0) {
return (char *)haystack;
}
++haystack;
}
} while (SDL_StepUTF8(&haystack, 4)); // move ahead by a full codepoint at a time, regardless of bytes.
return NULL;
#endif /* HAVE_STRCASESTR */
}
#if !defined(HAVE__LTOA) || !defined(HAVE__I64TOA) || \
@ -1079,8 +1224,7 @@ Uint64 SDL_strtoull(const char *string, char **endp, int base)
#endif /* HAVE_STRTOULL */
}
double
SDL_strtod(const char *string, char **endp)
double SDL_strtod(const char *string, char **endp)
{
#ifdef HAVE_STRTOD
return strtod(string, endp);
@ -1137,49 +1281,14 @@ int SDL_strncmp(const char *str1, const char *str2, size_t maxlen)
int SDL_strcasecmp(const char *str1, const char *str2)
{
#ifdef HAVE_STRCASECMP
return strcasecmp(str1, str2);
#elif defined(HAVE__STRICMP)
return _stricmp(str1, str2);
#else
int a, b, result;
while (1) {
a = SDL_toupper((unsigned char)*str1);
b = SDL_toupper((unsigned char)*str2);
result = a - b;
if (result != 0 || a == 0 /*&& b == 0*/) {
break;
}
++str1;
++str2;
}
return result;
#endif /* HAVE_STRCASECMP */
UNICODE_STRCASECMP(8, 4, 4, (void) str1start, (void) str2start); // always NULL-terminated, no need to adjust lengths.
}
int SDL_strncasecmp(const char *str1, const char *str2, size_t maxlen)
{
#ifdef HAVE_STRNCASECMP
return strncasecmp(str1, str2, maxlen);
#elif defined(HAVE__STRNICMP)
return _strnicmp(str1, str2, maxlen);
#else
int a, b, result = 0;
while (maxlen) {
a = SDL_tolower((unsigned char)*str1);
b = SDL_tolower((unsigned char)*str2);
result = a - b;
if (result != 0 || a == 0 /*&& b == 0*/) {
break;
}
++str1;
++str2;
--maxlen;
}
return result;
#endif /* HAVE_STRNCASECMP */
size_t slen1 = maxlen;
size_t slen2 = maxlen;
UNICODE_STRCASECMP(8, slen1, slen2, slen1 -= (size_t) (str1 - ((const char *) str1start)), slen2 -= (size_t) (str2 - ((const char *) str2start)));
}
int SDL_sscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, ...)