Add unicode_strtitle() for Unicode Default Case Conversion.
This moves the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, alongside unicode_strlower() and unicode_strupper(). The new unicode_strtitle() accepts an arbitrary word boundary callback. The implementation is simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping. Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com Reviewed-by: Peter Eisentraut
This commit is contained in:
parent
a96a8b15fa
commit
46e5441fa5
@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Iteration state consumed by initcap_wbnext(); set up by the caller. */
struct WordBoundaryState
|
||||||
|
{
|
||||||
|
const char *str; /* UTF-8 text being scanned */
|
||||||
|
size_t len; /* upper bound, in bytes, on how far into str to scan */
|
||||||
|
size_t offset; /* byte offset at which the next call resumes */
|
||||||
|
bool init; /* false until the first boundary has been returned */
|
||||||
|
bool prev_alnum; /* pg_u_isalnum() result saved at the last boundary */
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Simple word boundary iterator that draws boundaries each time the result of
|
||||||
|
* pg_u_isalnum() changes.
*
* state points to a struct WordBoundaryState. Each call resumes scanning at
* wbstate->offset and returns the byte offset of the next boundary. The first
* character examined is always reported as a boundary; after that, a boundary
* is reported at each character whose pg_u_isalnum() result differs from the
* one saved at the previously returned boundary. When no further boundary is
* found (offset reaches len, or a NUL byte is encountered), wbstate->len is
* returned.
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
|
initcap_wbnext(void *state)
|
||||||
|
{
|
||||||
|
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
|
||||||
|
|
||||||
|
while (wbstate->offset < wbstate->len &&
|
||||||
|
wbstate->str[wbstate->offset] != '\0')
|
||||||
|
{
|
||||||
|
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
|
||||||
|
wbstate->offset); /* code point at the current offset */
|
||||||
|
bool curr_alnum = pg_u_isalnum(u, true);
|
||||||
|
|
||||||
|
if (!wbstate->init || curr_alnum != wbstate->prev_alnum) /* first character, or class changed */
|
||||||
|
{
|
||||||
|
size_t prev_offset = wbstate->offset; /* the boundary is where this character starts */
|
||||||
|
|
||||||
|
wbstate->init = true;
|
||||||
|
wbstate->offset += unicode_utf8len(u); /* step past it so the next call resumes afterwards */
|
||||||
|
wbstate->prev_alnum = curr_alnum;
|
||||||
|
return prev_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
wbstate->offset += unicode_utf8len(u); /* same class as before: keep scanning */
|
||||||
|
}
|
||||||
|
|
||||||
|
return wbstate->len; /* nothing left to scan: report the string length */
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* collation-aware, wide-character-aware initcap function
|
* collation-aware, wide-character-aware initcap function
|
||||||
*
|
*
|
||||||
@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
|
|||||||
#endif
|
#endif
|
||||||
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
|
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
|
||||||
{
|
{
|
||||||
const unsigned char *src = (unsigned char *) buff;
|
const char *src = buff;
|
||||||
size_t srclen = nbytes;
|
size_t srclen = nbytes;
|
||||||
unsigned char *dst;
|
|
||||||
size_t dstsize;
|
size_t dstsize;
|
||||||
int srcoff = 0;
|
char *dst;
|
||||||
int dstoff = 0;
|
size_t needed;
|
||||||
|
struct WordBoundaryState wbstate = {
|
||||||
|
.str = src,
|
||||||
|
.len = srclen,
|
||||||
|
.offset = 0,
|
||||||
|
.init = false,
|
||||||
|
.prev_alnum = false,
|
||||||
|
};
|
||||||
|
|
||||||
Assert(GetDatabaseEncoding() == PG_UTF8);
|
Assert(GetDatabaseEncoding() == PG_UTF8);
|
||||||
|
|
||||||
/* overflow paranoia */
|
/* first try buffer of equal size plus terminating NUL */
|
||||||
if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
|
dstsize = srclen + 1;
|
||||||
ereport(ERROR,
|
dst = palloc(dstsize);
|
||||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
||||||
errmsg("out of memory")));
|
|
||||||
|
|
||||||
/* result is at most srclen codepoints plus terminating NUL */
|
needed = unicode_strtitle(dst, dstsize, src, srclen,
|
||||||
dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
|
initcap_wbnext, &wbstate);
|
||||||
dst = (unsigned char *) palloc(dstsize);
|
if (needed + 1 > dstsize)
|
||||||
|
|
||||||
while (srcoff < nbytes)
|
|
||||||
{
|
{
|
||||||
pg_wchar u1 = utf8_to_unicode(src + srcoff);
|
/* reset iterator */
|
||||||
pg_wchar u2;
|
wbstate.offset = 0;
|
||||||
int u1len = unicode_utf8len(u1);
|
wbstate.init = false;
|
||||||
int u2len;
|
|
||||||
|
|
||||||
if (wasalnum)
|
/* grow buffer if needed and retry */
|
||||||
u2 = unicode_lowercase_simple(u1);
|
dstsize = needed + 1;
|
||||||
else
|
dst = repalloc(dst, dstsize);
|
||||||
u2 = unicode_uppercase_simple(u1);
|
needed = unicode_strtitle(dst, dstsize, src, srclen,
|
||||||
|
initcap_wbnext, &wbstate);
|
||||||
u2len = unicode_utf8len(u2);
|
Assert(needed + 1 == dstsize);
|
||||||
|
|
||||||
Assert(dstoff + u2len + 1 <= dstsize);
|
|
||||||
|
|
||||||
wasalnum = pg_u_isalnum(u2, true);
|
|
||||||
|
|
||||||
unicode_to_utf8(u2, dst + dstoff);
|
|
||||||
srcoff += u1len;
|
|
||||||
dstoff += u2len;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Assert(dstoff + 1 <= dstsize);
|
result = dst;
|
||||||
*(dst + dstoff) = '\0';
|
|
||||||
dstoff++;
|
|
||||||
|
|
||||||
/* allocate result buffer of the right size and free workspace */
|
|
||||||
result = palloc(dstoff);
|
|
||||||
memcpy(result, dst, dstoff);
|
|
||||||
pfree(dst);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -21,8 +21,9 @@
|
|||||||
#include "mb/pg_wchar.h"
|
#include "mb/pg_wchar.h"
|
||||||
|
|
||||||
static const pg_case_map *find_case_map(pg_wchar ucs);
|
static const pg_case_map *find_case_map(pg_wchar ucs);
|
||||||
static size_t convert_case(char *dst, size_t dstsize, const char *src,
|
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
ssize_t srclen, CaseKind casekind);
|
CaseKind str_casekind, WordBoundaryNext wbnext,
|
||||||
|
void *wbstate);
|
||||||
|
|
||||||
pg_wchar
|
pg_wchar
|
||||||
unicode_lowercase_simple(pg_wchar code)
|
unicode_lowercase_simple(pg_wchar code)
|
||||||
@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
|
|||||||
size_t
|
size_t
|
||||||
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||||
{
|
{
|
||||||
return convert_case(dst, dstsize, src, srclen, CaseLower);
|
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* unicode_strtitle()
|
||||||
|
*
|
||||||
|
* Convert src to titlecase, and return the result length (not including
|
||||||
|
* terminating NUL).
|
||||||
|
*
|
||||||
|
* String src must be encoded in UTF-8. If srclen < 0, src must be
|
||||||
|
* NUL-terminated.
|
||||||
|
*
|
||||||
|
* Result string is stored in dst, truncating if larger than dstsize. If
|
||||||
|
* dstsize is greater than the result length, dst will be NUL-terminated;
|
||||||
|
* otherwise not.
|
||||||
|
*
|
||||||
|
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||||
|
* required buffer size before allocating.
|
||||||
|
*
|
||||||
|
* Titlecasing requires knowledge about word boundaries, which is provided by
|
||||||
|
* the callback wbnext. A word boundary is the offset of the start of a word
|
||||||
|
* or the offset of the character immediately following a word.
|
||||||
|
*
|
||||||
|
* The caller is expected to initialize and free the callback state
|
||||||
|
* wbstate. The callback should first return offset 0 for the first boundary;
|
||||||
|
* then the offset of each subsequent word boundary; then the total length of
|
||||||
|
* the string to indicate the final boundary.
|
||||||
|
*/
|
||||||
|
size_t
|
||||||
|
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
|
WordBoundaryNext wbnext, void *wbstate)
|
||||||
|
{
|
||||||
|
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
|
||||||
|
wbstate);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
|||||||
size_t
|
size_t
|
||||||
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||||
{
|
{
|
||||||
return convert_case(dst, dstsize, src, srclen, CaseUpper);
|
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Implement Unicode Default Case Conversion algorithm.
|
* If str_casekind is CaseLower or CaseUpper, map each character in the string
|
||||||
|
* for which a mapping is available.
|
||||||
*
|
*
|
||||||
* Map each character in the string for which a mapping is available.
|
* If str_casekind is CaseTitle, map characters found on a word boundary to
|
||||||
|
* uppercase and other characters to lowercase.
|
||||||
*/
|
*/
|
||||||
static size_t
|
static size_t
|
||||||
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||||
CaseKind casekind)
|
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
|
||||||
{
|
{
|
||||||
|
/* character CaseKind varies while titlecasing */
|
||||||
|
CaseKind chr_casekind = str_casekind;
|
||||||
size_t srcoff = 0;
|
size_t srcoff = 0;
|
||||||
size_t result_len = 0;
|
size_t result_len = 0;
|
||||||
|
size_t boundary = 0;
|
||||||
|
|
||||||
|
Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
|
||||||
|
(str_casekind != CaseTitle && !wbnext && !wbstate));
|
||||||
|
|
||||||
|
if (str_casekind == CaseTitle)
|
||||||
|
{
|
||||||
|
boundary = wbnext(wbstate);
|
||||||
|
Assert(boundary == 0); /* start of text is always a boundary */
|
||||||
|
}
|
||||||
|
|
||||||
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
|
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
|
||||||
{
|
{
|
||||||
@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
|||||||
int u1len = unicode_utf8len(u1);
|
int u1len = unicode_utf8len(u1);
|
||||||
const pg_case_map *casemap = find_case_map(u1);
|
const pg_case_map *casemap = find_case_map(u1);
|
||||||
|
|
||||||
|
if (str_casekind == CaseTitle)
|
||||||
|
{
|
||||||
|
if (srcoff == boundary)
|
||||||
|
{
|
||||||
|
chr_casekind = CaseUpper;
|
||||||
|
boundary = wbnext(wbstate);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
chr_casekind = CaseLower;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* perform mapping, update result_len, and write to dst */
|
||||||
if (casemap)
|
if (casemap)
|
||||||
{
|
{
|
||||||
pg_wchar u2 = casemap->simplemap[casekind];
|
pg_wchar u2 = casemap->simplemap[chr_casekind];
|
||||||
pg_wchar u2len = unicode_utf8len(u2);
|
pg_wchar u2len = unicode_utf8len(u2);
|
||||||
|
|
||||||
if (result_len + u2len <= dstsize)
|
if (result_len + u2len <= dstsize)
|
||||||
|
@ -16,11 +16,16 @@
|
|||||||
|
|
||||||
#include "mb/pg_wchar.h"
|
#include "mb/pg_wchar.h"
|
||||||
|
|
||||||
|
typedef size_t (*WordBoundaryNext) (void *wbstate);
|
||||||
|
|
||||||
pg_wchar unicode_lowercase_simple(pg_wchar ucs);
|
pg_wchar unicode_lowercase_simple(pg_wchar ucs);
|
||||||
pg_wchar unicode_titlecase_simple(pg_wchar ucs);
|
pg_wchar unicode_titlecase_simple(pg_wchar ucs);
|
||||||
pg_wchar unicode_uppercase_simple(pg_wchar ucs);
|
pg_wchar unicode_uppercase_simple(pg_wchar ucs);
|
||||||
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
|
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
|
||||||
ssize_t srclen);
|
ssize_t srclen);
|
||||||
|
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
|
||||||
|
ssize_t srclen, WordBoundaryNext wbnext,
|
||||||
|
void *wbstate);
|
||||||
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
|
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
|
||||||
ssize_t srclen);
|
ssize_t srclen);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user