consolodate duplicated conversion descriptor cache code
This commit is contained in:
parent
6780766fb7
commit
1d82ef411a
119
utils/utf8.c
119
utils/utf8.c
|
@ -44,7 +44,7 @@ uint32_t utf8_to_ucs4(const char *s_in, size_t l)
|
|||
parserutils_error perror;
|
||||
|
||||
perror = parserutils_charset_utf8_to_ucs4((const uint8_t *) s_in, l,
|
||||
&ucs4, &len);
|
||||
&ucs4, &len);
|
||||
if (perror != PARSERUTILS_OK)
|
||||
ucs4 = 0xfffd;
|
||||
|
||||
|
@ -106,7 +106,7 @@ size_t utf8_char_byte_length(const char *s)
|
|||
parserutils_error perror;
|
||||
|
||||
perror = parserutils_charset_utf8_char_byte_length((const uint8_t *) s,
|
||||
&len);
|
||||
&len);
|
||||
assert(perror == PARSERUTILS_OK);
|
||||
|
||||
return len;
|
||||
|
@ -131,7 +131,7 @@ size_t utf8_next(const char *s, size_t l, size_t o)
|
|||
parserutils_error perror;
|
||||
|
||||
perror = parserutils_charset_utf8_next((const uint8_t *) s, l, o,
|
||||
&next);
|
||||
&next);
|
||||
assert(perror == PARSERUTILS_OK);
|
||||
|
||||
return next;
|
||||
|
@ -151,6 +151,47 @@ static inline void utf8_clear_cd_cache(void)
|
|||
last_cd.cd = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* obtain a cached conversion descriptor
|
||||
*
|
||||
* either return the cached conversion descriptor or create one if required
|
||||
*/
|
||||
static nserror
|
||||
get_cached_cd(const char *enc_from, const char *enc_to, iconv_t *cd_out)
|
||||
{
|
||||
iconv_t cd;
|
||||
/* we cache the last used conversion descriptor,
|
||||
* so check if we're trying to use it here */
|
||||
if (strncasecmp(last_cd.from, enc_from, sizeof(last_cd.from)) == 0 &&
|
||||
strncasecmp(last_cd.to, enc_to, sizeof(last_cd.to)) == 0 &&
|
||||
last_cd.cd != 0) {
|
||||
*cd_out = last_cd.cd;
|
||||
return NSERROR_OK;
|
||||
}
|
||||
|
||||
/* no match, so create a new cd */
|
||||
cd = iconv_open(enc_to, enc_from);
|
||||
if (cd == (iconv_t) -1) {
|
||||
if (errno == EINVAL) {
|
||||
return NSERROR_BAD_ENCODING;
|
||||
}
|
||||
/* default to no memory */
|
||||
return NSERROR_NOMEM;
|
||||
}
|
||||
|
||||
/* close the last cd - we don't care if this fails */
|
||||
if (last_cd.cd) {
|
||||
iconv_close(last_cd.cd);
|
||||
}
|
||||
|
||||
/* and safely copy the to/from/cd data into last_cd */
|
||||
snprintf(last_cd.from, sizeof(last_cd.from), enc_from);
|
||||
snprintf(last_cd.to, sizeof(last_cd.to), "%s", enc_to);
|
||||
*cd_out = last_cd.cd = cd;
|
||||
|
||||
return NSERROR_OK;
|
||||
}
|
||||
|
||||
/* exported interface documented in utils/utf8.h */
|
||||
nserror utf8_finalise(void)
|
||||
{
|
||||
|
@ -187,6 +228,7 @@ utf8_convert(const char *string,
|
|||
iconv_t cd;
|
||||
char *temp, *out, *in, *result;
|
||||
size_t result_len;
|
||||
nserror res;
|
||||
|
||||
assert(string && from && to && result_out);
|
||||
|
||||
|
@ -215,29 +257,9 @@ utf8_convert(const char *string,
|
|||
|
||||
in = (char *)string;
|
||||
|
||||
/* we cache the last used conversion descriptor,
|
||||
* so check if we're trying to use it here */
|
||||
if (strncasecmp(last_cd.from, from, sizeof(last_cd.from)) == 0 &&
|
||||
strncasecmp(last_cd.to, to, sizeof(last_cd.to)) == 0) {
|
||||
cd = last_cd.cd;
|
||||
} else {
|
||||
/* no match, so create a new cd */
|
||||
cd = iconv_open(to, from);
|
||||
if (cd == (iconv_t)-1) {
|
||||
if (errno == EINVAL)
|
||||
return NSERROR_BAD_ENCODING;
|
||||
/* default to no memory */
|
||||
return NSERROR_NOMEM;
|
||||
}
|
||||
|
||||
/* close the last cd - we don't care if this fails */
|
||||
if (last_cd.cd)
|
||||
iconv_close(last_cd.cd);
|
||||
|
||||
/* and copy the to/from/cd data into last_cd */
|
||||
snprintf(last_cd.from, sizeof(last_cd.from), "%s", from);
|
||||
snprintf(last_cd.to, sizeof(last_cd.to), "%s", to);
|
||||
last_cd.cd = cd;
|
||||
res = get_cached_cd(from, to, &cd);
|
||||
if (res != NSERROR_OK) {
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Worst case = ASCII -> UCS4, so allocate an output buffer
|
||||
|
@ -289,14 +311,14 @@ utf8_convert(const char *string,
|
|||
|
||||
/* exported interface documented in utils/utf8.h */
|
||||
nserror utf8_to_enc(const char *string, const char *encname,
|
||||
size_t len, char **result)
|
||||
size_t len, char **result)
|
||||
{
|
||||
return utf8_convert(string, len, "UTF-8", encname, result, NULL);
|
||||
}
|
||||
|
||||
/* exported interface documented in utils/utf8.h */
|
||||
nserror utf8_from_enc(const char *string, const char *encname,
|
||||
size_t len, char **result, size_t *result_len)
|
||||
size_t len, char **result, size_t *result_len)
|
||||
{
|
||||
return utf8_convert(string, len, encname, "UTF-8", result, result_len);
|
||||
}
|
||||
|
@ -327,7 +349,7 @@ utf8_convert_html_chunk(iconv_t cd,
|
|||
esclen = snprintf(escape, sizeof(escape), "&#x%06x;", ucs4);
|
||||
pescape = escape;
|
||||
ret = iconv(cd, (void *) &pescape, &esclen,
|
||||
(void *) out, outlen);
|
||||
(void *) out, outlen);
|
||||
if (ret == (size_t) -1)
|
||||
return NSERROR_NOMEM;
|
||||
|
||||
|
@ -339,6 +361,8 @@ utf8_convert_html_chunk(iconv_t cd,
|
|||
return NSERROR_OK;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* exported interface documented in utils/utf8.h */
|
||||
nserror
|
||||
utf8_to_html(const char *string, const char *encname, size_t len, char **result)
|
||||
|
@ -349,35 +373,14 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
|
|||
size_t off, prev_off, inlen, outlen, origoutlen, esclen;
|
||||
nserror ret;
|
||||
char *pescape, escape[11];
|
||||
nserror res;
|
||||
|
||||
if (len == 0)
|
||||
len = strlen(string);
|
||||
|
||||
/* we cache the last used conversion descriptor,
|
||||
* so check if we're trying to use it here */
|
||||
if (strncasecmp(last_cd.from, "UTF-8", sizeof(last_cd.from)) == 0 &&
|
||||
strncasecmp(last_cd.to, encname,
|
||||
sizeof(last_cd.to)) == 0 &&
|
||||
last_cd.cd != 0) {
|
||||
cd = last_cd.cd;
|
||||
} else {
|
||||
/* no match, so create a new cd */
|
||||
cd = iconv_open(encname, "UTF-8");
|
||||
if (cd == (iconv_t) -1) {
|
||||
if (errno == EINVAL)
|
||||
return NSERROR_BAD_ENCODING;
|
||||
/* default to no memory */
|
||||
return NSERROR_NOMEM;
|
||||
}
|
||||
|
||||
/* close the last cd - we don't care if this fails */
|
||||
if (last_cd.cd)
|
||||
iconv_close(last_cd.cd);
|
||||
|
||||
/* and safely copy the to/from/cd data into last_cd */
|
||||
snprintf(last_cd.from, sizeof(last_cd.from), "UTF-8");
|
||||
snprintf(last_cd.to, sizeof(last_cd.to), "%s", encname);
|
||||
last_cd.cd = cd;
|
||||
res = get_cached_cd("UTF-8", encname, &cd);
|
||||
if (res != NSERROR_OK) {
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Worst case is ASCII -> UCS4, with all characters escaped:
|
||||
|
@ -397,13 +400,13 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
|
|||
while (off < len) {
|
||||
/* Must escape '&', '<', and '>' */
|
||||
if (string[off] == '&' || string[off] == '<' ||
|
||||
string[off] == '>') {
|
||||
string[off] == '>') {
|
||||
if (off - prev_off > 0) {
|
||||
/* Emit chunk */
|
||||
in = string + prev_off;
|
||||
inlen = off - prev_off;
|
||||
ret = utf8_convert_html_chunk(cd, in, inlen,
|
||||
&out, &outlen);
|
||||
&out, &outlen);
|
||||
if (ret != NSERROR_OK) {
|
||||
free(origout);
|
||||
iconv_close(cd);
|
||||
|
@ -414,10 +417,10 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
|
|||
|
||||
/* Emit mandatory escape */
|
||||
esclen = snprintf(escape, sizeof(escape),
|
||||
"&#x%06x;", string[off]);
|
||||
"&#x%06x;", string[off]);
|
||||
pescape = escape;
|
||||
ret = utf8_convert_html_chunk(cd, pescape, esclen,
|
||||
&out, &outlen);
|
||||
&out, &outlen);
|
||||
if (ret != NSERROR_OK) {
|
||||
free(origout);
|
||||
iconv_close(cd);
|
||||
|
|
Loading…
Reference in New Issue