Inline basic UTF-8 functions.
Shows a measurable speedup when processing UTF-8 data, such as with the
new builtin collation provider.

Discussion: https://postgr.es/m/163f4e2190cdf67f67016044e503c5004547e5a9.camel@j-davis.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
parent
2b520860c0
commit
9acae56ce0
@ -476,39 +476,6 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Map a Unicode code point to UTF-8. utf8string must have at least
|
|
||||||
* unicode_utf8len(c) bytes available.
|
|
||||||
*/
|
|
||||||
unsigned char *
|
|
||||||
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
|
|
||||||
{
|
|
||||||
if (c <= 0x7F)
|
|
||||||
{
|
|
||||||
utf8string[0] = c;
|
|
||||||
}
|
|
||||||
else if (c <= 0x7FF)
|
|
||||||
{
|
|
||||||
utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
|
|
||||||
utf8string[1] = 0x80 | (c & 0x3F);
|
|
||||||
}
|
|
||||||
else if (c <= 0xFFFF)
|
|
||||||
{
|
|
||||||
utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
|
|
||||||
utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
|
|
||||||
utf8string[2] = 0x80 | (c & 0x3F);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
|
|
||||||
utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
|
|
||||||
utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
|
|
||||||
utf8string[3] = 0x80 | (c & 0x3F);
|
|
||||||
}
|
|
||||||
|
|
||||||
return utf8string;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Trivial conversion from pg_wchar to UTF-8.
|
* Trivial conversion from pg_wchar to UTF-8.
|
||||||
* caller should allocate enough space for "to"
|
* caller should allocate enough space for "to"
|
||||||
@ -670,34 +637,6 @@ ucs_wcwidth(pg_wchar ucs)
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Convert a UTF-8 character to a Unicode code point.
|
|
||||||
* This is a one-character version of pg_utf2wchar_with_len.
|
|
||||||
*
|
|
||||||
* No error checks here, c must point to a long-enough string.
|
|
||||||
*/
|
|
||||||
pg_wchar
|
|
||||||
utf8_to_unicode(const unsigned char *c)
|
|
||||||
{
|
|
||||||
if ((*c & 0x80) == 0)
|
|
||||||
return (pg_wchar) c[0];
|
|
||||||
else if ((*c & 0xe0) == 0xc0)
|
|
||||||
return (pg_wchar) (((c[0] & 0x1f) << 6) |
|
|
||||||
(c[1] & 0x3f));
|
|
||||||
else if ((*c & 0xf0) == 0xe0)
|
|
||||||
return (pg_wchar) (((c[0] & 0x0f) << 12) |
|
|
||||||
((c[1] & 0x3f) << 6) |
|
|
||||||
(c[2] & 0x3f));
|
|
||||||
else if ((*c & 0xf8) == 0xf0)
|
|
||||||
return (pg_wchar) (((c[0] & 0x07) << 18) |
|
|
||||||
((c[1] & 0x3f) << 12) |
|
|
||||||
((c[2] & 0x3f) << 6) |
|
|
||||||
(c[3] & 0x3f));
|
|
||||||
else
|
|
||||||
/* that is an invalid code on purpose */
|
|
||||||
return 0xffffffff;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
pg_utf_dsplen(const unsigned char *s)
|
pg_utf_dsplen(const unsigned char *s)
|
||||||
{
|
{
|
||||||
|
@ -555,6 +555,67 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
|
|||||||
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
|
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Convert a UTF-8 character to a Unicode code point.
|
||||||
|
* This is a one-character version of pg_utf2wchar_with_len.
|
||||||
|
*
|
||||||
|
* No error checks here, c must point to a long-enough string.
|
||||||
|
*/
|
||||||
|
static inline pg_wchar
|
||||||
|
utf8_to_unicode(const unsigned char *c)
|
||||||
|
{
|
||||||
|
if ((*c & 0x80) == 0)
|
||||||
|
return (pg_wchar) c[0];
|
||||||
|
else if ((*c & 0xe0) == 0xc0)
|
||||||
|
return (pg_wchar) (((c[0] & 0x1f) << 6) |
|
||||||
|
(c[1] & 0x3f));
|
||||||
|
else if ((*c & 0xf0) == 0xe0)
|
||||||
|
return (pg_wchar) (((c[0] & 0x0f) << 12) |
|
||||||
|
((c[1] & 0x3f) << 6) |
|
||||||
|
(c[2] & 0x3f));
|
||||||
|
else if ((*c & 0xf8) == 0xf0)
|
||||||
|
return (pg_wchar) (((c[0] & 0x07) << 18) |
|
||||||
|
((c[1] & 0x3f) << 12) |
|
||||||
|
((c[2] & 0x3f) << 6) |
|
||||||
|
(c[3] & 0x3f));
|
||||||
|
else
|
||||||
|
/* that is an invalid code on purpose */
|
||||||
|
return 0xffffffff;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Map a Unicode code point to UTF-8. utf8string must have at least
|
||||||
|
* unicode_utf8len(c) bytes available.
|
||||||
|
*/
|
||||||
|
static inline unsigned char *
|
||||||
|
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
|
||||||
|
{
|
||||||
|
if (c <= 0x7F)
|
||||||
|
{
|
||||||
|
utf8string[0] = c;
|
||||||
|
}
|
||||||
|
else if (c <= 0x7FF)
|
||||||
|
{
|
||||||
|
utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
|
||||||
|
utf8string[1] = 0x80 | (c & 0x3F);
|
||||||
|
}
|
||||||
|
else if (c <= 0xFFFF)
|
||||||
|
{
|
||||||
|
utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
|
||||||
|
utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
|
||||||
|
utf8string[2] = 0x80 | (c & 0x3F);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
|
||||||
|
utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
|
||||||
|
utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
|
||||||
|
utf8string[3] = 0x80 | (c & 0x3F);
|
||||||
|
}
|
||||||
|
|
||||||
|
return utf8string;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Number of bytes needed to represent the given char in UTF8.
|
* Number of bytes needed to represent the given char in UTF8.
|
||||||
*/
|
*/
|
||||||
|
Loading…
x
Reference in New Issue
Block a user