consolidate duplicated conversion descriptor cache code

This commit is contained in:
Vincent Sanders 2022-11-26 14:44:08 +00:00
parent 6780766fb7
commit 1d82ef411a
1 changed files with 61 additions and 58 deletions

View File

@ -44,7 +44,7 @@ uint32_t utf8_to_ucs4(const char *s_in, size_t l)
parserutils_error perror;
perror = parserutils_charset_utf8_to_ucs4((const uint8_t *) s_in, l,
&ucs4, &len);
&ucs4, &len);
if (perror != PARSERUTILS_OK)
ucs4 = 0xfffd;
@ -106,7 +106,7 @@ size_t utf8_char_byte_length(const char *s)
parserutils_error perror;
perror = parserutils_charset_utf8_char_byte_length((const uint8_t *) s,
&len);
&len);
assert(perror == PARSERUTILS_OK);
return len;
@ -131,7 +131,7 @@ size_t utf8_next(const char *s, size_t l, size_t o)
parserutils_error perror;
perror = parserutils_charset_utf8_next((const uint8_t *) s, l, o,
&next);
&next);
assert(perror == PARSERUTILS_OK);
return next;
@ -151,6 +151,47 @@ static inline void utf8_clear_cd_cache(void)
last_cd.cd = 0;
}
/**
 * Obtain a conversion descriptor, reusing the cached one where possible.
 *
 * The most recently opened descriptor is remembered in last_cd together
 * with the encoding names it was opened for. If the requested pair matches
 * (case-insensitively) and a descriptor is cached, that descriptor is
 * returned. Otherwise a new descriptor is opened, the previously cached one
 * (if any) is closed, and the new one replaces it in the cache.
 *
 * The returned descriptor is also stored in last_cd and will be closed by
 * this function when a later call replaces it.
 *
 * \param enc_from  name of the encoding to convert from
 * \param enc_to    name of the encoding to convert to
 * \param cd_out    updated with the conversion descriptor on success
 * \return NSERROR_OK on success, NSERROR_BAD_ENCODING if iconv_open()
 *         fails with EINVAL, NSERROR_NOMEM on any other iconv_open() failure
 */
static nserror
get_cached_cd(const char *enc_from, const char *enc_to, iconv_t *cd_out)
{
	iconv_t cd;

	/* we cache the last used conversion descriptor,
	 * so check if we're trying to use it here */
	if (strncasecmp(last_cd.from, enc_from, sizeof(last_cd.from)) == 0 &&
	    strncasecmp(last_cd.to, enc_to, sizeof(last_cd.to)) == 0 &&
	    last_cd.cd != 0) {
		*cd_out = last_cd.cd;
		return NSERROR_OK;
	}

	/* no match, so create a new cd */
	cd = iconv_open(enc_to, enc_from);
	if (cd == (iconv_t) -1) {
		if (errno == EINVAL) {
			return NSERROR_BAD_ENCODING;
		}
		/* default to no memory */
		return NSERROR_NOMEM;
	}

	/* close the last cd - we don't care if this fails */
	if (last_cd.cd) {
		iconv_close(last_cd.cd);
	}

	/* and safely copy the to/from/cd data into last_cd.
	 *
	 * The encoding names must be passed as "%s" arguments, never as the
	 * format string itself: a '%' in a caller-supplied name would
	 * otherwise be interpreted as a conversion specifier. snprintf()
	 * truncates names that do not fit the cache buffers. */
	snprintf(last_cd.from, sizeof(last_cd.from), "%s", enc_from);
	snprintf(last_cd.to, sizeof(last_cd.to), "%s", enc_to);
	*cd_out = last_cd.cd = cd;

	return NSERROR_OK;
}
/* exported interface documented in utils/utf8.h */
nserror utf8_finalise(void)
{
@ -187,6 +228,7 @@ utf8_convert(const char *string,
iconv_t cd;
char *temp, *out, *in, *result;
size_t result_len;
nserror res;
assert(string && from && to && result_out);
@ -215,29 +257,9 @@ utf8_convert(const char *string,
in = (char *)string;
/* we cache the last used conversion descriptor,
* so check if we're trying to use it here */
if (strncasecmp(last_cd.from, from, sizeof(last_cd.from)) == 0 &&
strncasecmp(last_cd.to, to, sizeof(last_cd.to)) == 0) {
cd = last_cd.cd;
} else {
/* no match, so create a new cd */
cd = iconv_open(to, from);
if (cd == (iconv_t)-1) {
if (errno == EINVAL)
return NSERROR_BAD_ENCODING;
/* default to no memory */
return NSERROR_NOMEM;
}
/* close the last cd - we don't care if this fails */
if (last_cd.cd)
iconv_close(last_cd.cd);
/* and copy the to/from/cd data into last_cd */
snprintf(last_cd.from, sizeof(last_cd.from), "%s", from);
snprintf(last_cd.to, sizeof(last_cd.to), "%s", to);
last_cd.cd = cd;
res = get_cached_cd(from, to, &cd);
if (res != NSERROR_OK) {
return res;
}
/* Worst case = ASCII -> UCS4, so allocate an output buffer
@ -289,14 +311,14 @@ utf8_convert(const char *string,
/* exported interface documented in utils/utf8.h */
/* Thin wrapper around utf8_convert() with the source encoding fixed to
 * "UTF-8" and the destination encoding given by encname. */
nserror utf8_to_enc(const char *string, const char *encname,
size_t len, char **result)
size_t len, char **result)
{
/* NULL is passed as utf8_convert()'s final argument */
return utf8_convert(string, len, "UTF-8", encname, result, NULL);
}
/* exported interface documented in utils/utf8.h */
/* Thin wrapper around utf8_convert() with the source encoding given by
 * encname and the destination encoding fixed to "UTF-8". */
nserror utf8_from_enc(const char *string, const char *encname,
size_t len, char **result, size_t *result_len)
size_t len, char **result, size_t *result_len)
{
/* result and result_len are forwarded unchanged to utf8_convert() */
return utf8_convert(string, len, encname, "UTF-8", result, result_len);
}
@ -327,7 +349,7 @@ utf8_convert_html_chunk(iconv_t cd,
esclen = snprintf(escape, sizeof(escape), "&#x%06x;", ucs4);
pescape = escape;
ret = iconv(cd, (void *) &pescape, &esclen,
(void *) out, outlen);
(void *) out, outlen);
if (ret == (size_t) -1)
return NSERROR_NOMEM;
@ -339,6 +361,8 @@ utf8_convert_html_chunk(iconv_t cd,
return NSERROR_OK;
}
/* exported interface documented in utils/utf8.h */
nserror
utf8_to_html(const char *string, const char *encname, size_t len, char **result)
@ -349,35 +373,14 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
size_t off, prev_off, inlen, outlen, origoutlen, esclen;
nserror ret;
char *pescape, escape[11];
nserror res;
if (len == 0)
len = strlen(string);
/* we cache the last used conversion descriptor,
* so check if we're trying to use it here */
if (strncasecmp(last_cd.from, "UTF-8", sizeof(last_cd.from)) == 0 &&
strncasecmp(last_cd.to, encname,
sizeof(last_cd.to)) == 0 &&
last_cd.cd != 0) {
cd = last_cd.cd;
} else {
/* no match, so create a new cd */
cd = iconv_open(encname, "UTF-8");
if (cd == (iconv_t) -1) {
if (errno == EINVAL)
return NSERROR_BAD_ENCODING;
/* default to no memory */
return NSERROR_NOMEM;
}
/* close the last cd - we don't care if this fails */
if (last_cd.cd)
iconv_close(last_cd.cd);
/* and safely copy the to/from/cd data into last_cd */
snprintf(last_cd.from, sizeof(last_cd.from), "UTF-8");
snprintf(last_cd.to, sizeof(last_cd.to), "%s", encname);
last_cd.cd = cd;
res = get_cached_cd("UTF-8", encname, &cd);
if (res != NSERROR_OK) {
return res;
}
/* Worst case is ASCII -> UCS4, with all characters escaped:
@ -397,13 +400,13 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
while (off < len) {
/* Must escape '&', '<', and '>' */
if (string[off] == '&' || string[off] == '<' ||
string[off] == '>') {
string[off] == '>') {
if (off - prev_off > 0) {
/* Emit chunk */
in = string + prev_off;
inlen = off - prev_off;
ret = utf8_convert_html_chunk(cd, in, inlen,
&out, &outlen);
&out, &outlen);
if (ret != NSERROR_OK) {
free(origout);
iconv_close(cd);
@ -414,10 +417,10 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
/* Emit mandatory escape */
esclen = snprintf(escape, sizeof(escape),
"&#x%06x;", string[off]);
"&#x%06x;", string[off]);
pescape = escape;
ret = utf8_convert_html_chunk(cd, pescape, esclen,
&out, &outlen);
&out, &outlen);
if (ret != NSERROR_OK) {
free(origout);
iconv_close(cd);