mirror of
https://github.com/netsurf-browser/netsurf
synced 2024-12-25 05:27:00 +03:00
Rewrite utf8_[to,from]_local_encoding to not have an arbitrary limit on the
number of special characters permitted in the input. (fixes 1651343, 1649247, 1644745, 1607934) Should also be rather more efficient, as it now conducts only a single pass over the input data. svn path=/trunk/netsurf/; revision=3177
This commit is contained in:
parent
3b40e0f5fc
commit
b54332fd58
@ -441,13 +441,9 @@ utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
|
|||||||
char **result)
|
char **result)
|
||||||
{
|
{
|
||||||
os_error *error;
|
os_error *error;
|
||||||
int alphabet, i, offset_count = 0;
|
int alphabet, i;
|
||||||
struct {
|
size_t off, prev_off;
|
||||||
const struct special *local; /* local character */
|
char *temp, *cur_pos;
|
||||||
size_t offset; /* byte offset into string */
|
|
||||||
} offsets[CHAR_MAX];
|
|
||||||
size_t off;
|
|
||||||
char *temp;
|
|
||||||
const char *enc;
|
const char *enc;
|
||||||
utf8_convert_ret err;
|
utf8_convert_ret err;
|
||||||
|
|
||||||
@ -475,8 +471,18 @@ utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
|
|||||||
localencodings[CONT_ENC_END + 1]
|
localencodings[CONT_ENC_END + 1]
|
||||||
: localencodings[0]));
|
: localencodings[0]));
|
||||||
|
|
||||||
/* populate offsets array with details of characters that
|
/* create output buffer */
|
||||||
* will be stripped by iconv */
|
*(result) = malloc(len + 1);
|
||||||
|
if (!(*result))
|
||||||
|
return UTF8_CONVERT_NOMEM;
|
||||||
|
*(*result) = '\0';
|
||||||
|
|
||||||
|
prev_off = 0;
|
||||||
|
cur_pos = (*result);
|
||||||
|
|
||||||
|
/* Iterate over string, converting input between unconvertable
|
||||||
|
* characters and inserting appropriate output for characters
|
||||||
|
* that iconv can't handle. */
|
||||||
for (off = 0; off < len; off = utf8_next(string, len, off)) {
|
for (off = 0; off < len; off = utf8_next(string, len, off)) {
|
||||||
if (string[off] != 0xE2 &&
|
if (string[off] != 0xE2 &&
|
||||||
string[off] != 0xC5 && string[off] != 0xEF)
|
string[off] != 0xC5 && string[off] != 0xEF)
|
||||||
@ -484,69 +490,45 @@ utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
|
|||||||
|
|
||||||
for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
|
for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
|
||||||
if (strncmp(string + off, special_chars[i].utf,
|
if (strncmp(string + off, special_chars[i].utf,
|
||||||
special_chars[i].len) == 0) {
|
special_chars[i].len) != 0)
|
||||||
/* ensure we don't overflow our buffer */
|
continue;
|
||||||
assert(offset_count < CHAR_MAX - 1);
|
|
||||||
offsets[offset_count].local =
|
|
||||||
&special_chars[i];
|
|
||||||
offsets[offset_count].offset = off;
|
|
||||||
offset_count++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (offset_count == 0) {
|
/* 0 length has a special meaning to utf8_to_enc */
|
||||||
/* No substitutions are required, so exit here */
|
if (off - prev_off > 0) {
|
||||||
return utf8_to_enc(string, enc, len, result);
|
err = utf8_to_enc(string + prev_off, enc,
|
||||||
}
|
off - prev_off, &temp);
|
||||||
|
if (err != UTF8_CONVERT_OK) {
|
||||||
|
assert(err != UTF8_CONVERT_BADENC);
|
||||||
|
free(*result);
|
||||||
|
return UTF8_CONVERT_NOMEM;
|
||||||
|
}
|
||||||
|
|
||||||
/* create output buffer */
|
strcat(cur_pos, temp);
|
||||||
*(result) = malloc(len + 1);
|
|
||||||
if (!(*result))
|
|
||||||
return UTF8_CONVERT_NOMEM;
|
|
||||||
*(*result) = '\0';
|
|
||||||
|
|
||||||
/* convert the chunks between offsets, then copy stripped
|
cur_pos += strlen(temp);
|
||||||
* character into output string */
|
|
||||||
for (i = 0; i != offset_count; i++) {
|
|
||||||
off = (i > 0 ? offsets[i-1].offset + offsets[i-1].local->len
|
|
||||||
: 0);
|
|
||||||
|
|
||||||
/* 0 length has a special meaning to utf8_to_enc */
|
free(temp);
|
||||||
if (offsets[i].offset > off) {
|
|
||||||
err = utf8_to_enc(string + off, enc,
|
|
||||||
offsets[i].offset - off, &temp);
|
|
||||||
if (err != UTF8_CONVERT_OK) {
|
|
||||||
assert(err != UTF8_CONVERT_BADENC);
|
|
||||||
free(*result);
|
|
||||||
return UTF8_CONVERT_NOMEM;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
strcat((*result), temp);
|
*cur_pos = special_chars[i].local;
|
||||||
|
*(++cur_pos) = '\0';
|
||||||
free(temp);
|
prev_off = off + special_chars[i].len;
|
||||||
}
|
}
|
||||||
|
|
||||||
off = strlen(*result);
|
|
||||||
(*result)[off] = offsets[i].local->local;
|
|
||||||
(*result)[off+1] = '\0';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* handle last chunk
|
/* handle last chunk
|
||||||
* NB. 0 length has a special meaning to utf8_to_enc */
|
* NB. 0 length has a special meaning to utf8_to_enc */
|
||||||
|
|
||||||
off = offsets[offset_count - 1].offset +
|
if (prev_off < len) {
|
||||||
offsets[offset_count - 1].local->len;
|
err = utf8_to_enc(string + prev_off, enc, len - prev_off,
|
||||||
if (off < len) {
|
&temp);
|
||||||
err = utf8_to_enc(string + off, enc, len - off, &temp);
|
|
||||||
if (err != UTF8_CONVERT_OK) {
|
if (err != UTF8_CONVERT_OK) {
|
||||||
assert(err != UTF8_CONVERT_BADENC);
|
assert(err != UTF8_CONVERT_BADENC);
|
||||||
free(*result);
|
free(*result);
|
||||||
return UTF8_CONVERT_NOMEM;
|
return UTF8_CONVERT_NOMEM;
|
||||||
}
|
}
|
||||||
|
|
||||||
strcat((*result), temp);
|
strcat(cur_pos, temp);
|
||||||
|
|
||||||
free(temp);
|
free(temp);
|
||||||
}
|
}
|
||||||
@ -566,12 +548,9 @@ utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len,
|
|||||||
char **result)
|
char **result)
|
||||||
{
|
{
|
||||||
os_error *error;
|
os_error *error;
|
||||||
int alphabet, i, offset_count = 0;
|
int alphabet, i, num_specials = 0, result_alloc;
|
||||||
struct {
|
#define SPECIAL_CHUNK_SIZE 255
|
||||||
const struct special *local; /* utf character */
|
size_t off, prev_off, cur_off;
|
||||||
size_t offset; /* byte offset into string */
|
|
||||||
} offsets[CHAR_MAX];
|
|
||||||
size_t off;
|
|
||||||
char *temp;
|
char *temp;
|
||||||
const char *enc;
|
const char *enc;
|
||||||
utf8_convert_ret err;
|
utf8_convert_ret err;
|
||||||
@ -603,64 +582,74 @@ utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len,
|
|||||||
localencodings[CONT_ENC_END + 1]
|
localencodings[CONT_ENC_END + 1]
|
||||||
: localencodings[0]));
|
: localencodings[0]));
|
||||||
|
|
||||||
/* populate offsets array with details of characters that
|
/* create output buffer (oversized) */
|
||||||
* will be stripped by iconv */
|
result_alloc = (len * 4) + (3 * SPECIAL_CHUNK_SIZE) + 1;
|
||||||
|
|
||||||
|
*(result) = malloc(result_alloc);
|
||||||
|
if (!(*result))
|
||||||
|
return UTF8_CONVERT_NOMEM;
|
||||||
|
*(*result) = '\0';
|
||||||
|
|
||||||
|
prev_off = 0;
|
||||||
|
cur_off = 0;
|
||||||
|
|
||||||
|
/* Iterate over string, converting input between unconvertable
|
||||||
|
* characters and inserting appropriate output for characters
|
||||||
|
* that iconv can't handle. */
|
||||||
for (off = 0; off < len; off++) {
|
for (off = 0; off < len; off++) {
|
||||||
if (string[off] < 0x80 || string[off] > 0x9f)
|
if (string[off] < 0x80 || string[off] > 0x9f)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
|
for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
|
||||||
if (string[off] == special_chars[i].local) {
|
if (string[off] != special_chars[i].local)
|
||||||
/* ensure we don't overflow our buffer */
|
continue;
|
||||||
assert(offset_count < CHAR_MAX - 1);
|
|
||||||
offsets[offset_count].local =
|
|
||||||
&special_chars[i];
|
|
||||||
offsets[offset_count].offset = off;
|
|
||||||
offset_count++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (offset_count == 0) {
|
/* 0 length has a special meaning to utf8_from_enc */
|
||||||
/* No substitutions are required, so exit here */
|
if (off - prev_off > 0) {
|
||||||
return utf8_from_enc(string, enc, len, result);
|
err = utf8_from_enc(string + prev_off, enc,
|
||||||
}
|
off - prev_off, &temp);
|
||||||
|
if (err != UTF8_CONVERT_OK) {
|
||||||
|
assert(err != UTF8_CONVERT_BADENC);
|
||||||
|
LOG(("utf8_from_enc failed"));
|
||||||
|
free(*result);
|
||||||
|
return UTF8_CONVERT_NOMEM;
|
||||||
|
}
|
||||||
|
|
||||||
/* create output buffer (oversized) */
|
strcat((*result) + cur_off, temp);
|
||||||
*(result) = malloc((len * 4) + (3 * offset_count) + 1);
|
|
||||||
if (!(*result))
|
|
||||||
return UTF8_CONVERT_NOMEM;
|
|
||||||
*(*result) = '\0';
|
|
||||||
|
|
||||||
/* convert the chunks between offsets, then copy stripped
|
cur_off += strlen(temp);
|
||||||
* UTF-8 character into output string */
|
|
||||||
for (i = 0; i != offset_count; i++) {
|
|
||||||
off = (i > 0 ? offsets[i-1].offset + 1 : 0);
|
|
||||||
|
|
||||||
/* 0 length has a special meaning to utf8_from_enc */
|
free(temp);
|
||||||
if (offsets[i].offset > off) {
|
|
||||||
err = utf8_from_enc(string + off, enc,
|
|
||||||
offsets[i].offset - off, &temp);
|
|
||||||
if (err != UTF8_CONVERT_OK) {
|
|
||||||
assert(err != UTF8_CONVERT_BADENC);
|
|
||||||
LOG(("utf8_from_enc failed"));
|
|
||||||
free(*result);
|
|
||||||
return UTF8_CONVERT_NOMEM;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
strcat((*result), temp);
|
strcat((*result) + cur_off, special_chars[i].utf);
|
||||||
free(temp);
|
|
||||||
}
|
|
||||||
|
|
||||||
strcat((*result), offsets[i].local->utf);
|
cur_off += special_chars[i].len;
|
||||||
|
|
||||||
|
prev_off = off + 1;
|
||||||
|
|
||||||
|
num_specials++;
|
||||||
|
if (num_specials % SPECIAL_CHUNK_SIZE ==
|
||||||
|
SPECIAL_CHUNK_SIZE - 1) {
|
||||||
|
char *temp = realloc((*result),
|
||||||
|
result_alloc +
|
||||||
|
(3 * SPECIAL_CHUNK_SIZE));
|
||||||
|
if (!temp) {
|
||||||
|
free(*result);
|
||||||
|
return UTF8_CONVERT_NOMEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
*result = temp;
|
||||||
|
result_alloc += (3 * SPECIAL_CHUNK_SIZE);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* handle last chunk
|
/* handle last chunk
|
||||||
* NB. 0 length has a special meaning to utf8_from_enc */
|
* NB. 0 length has a special meaning to utf8_from_enc */
|
||||||
off = offsets[offset_count - 1].offset + 1;
|
if (prev_off < len) {
|
||||||
if (off < len) {
|
err = utf8_from_enc(string + prev_off, enc, len - prev_off,
|
||||||
err = utf8_from_enc(string + off, enc, len - off, &temp);
|
&temp);
|
||||||
if (err != UTF8_CONVERT_OK) {
|
if (err != UTF8_CONVERT_OK) {
|
||||||
assert(err != UTF8_CONVERT_BADENC);
|
assert(err != UTF8_CONVERT_BADENC);
|
||||||
LOG(("utf8_from_enc failed"));
|
LOG(("utf8_from_enc failed"));
|
||||||
@ -668,22 +657,18 @@ utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len,
|
|||||||
return UTF8_CONVERT_NOMEM;
|
return UTF8_CONVERT_NOMEM;
|
||||||
}
|
}
|
||||||
|
|
||||||
strcat((*result), temp);
|
strcat((*result) + cur_off, temp);
|
||||||
|
|
||||||
free(temp);
|
free(temp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* and copy into more reasonably-sized buffer */
|
/* and copy into more reasonably-sized buffer */
|
||||||
temp = malloc(strlen((*result)) + 1);
|
temp = realloc((*result), cur_off + 1);
|
||||||
if (!temp) {
|
if (!temp) {
|
||||||
LOG(("malloc failed"));
|
LOG(("realloc failed"));
|
||||||
free(*result);
|
free(*result);
|
||||||
return UTF8_CONVERT_NOMEM;
|
return UTF8_CONVERT_NOMEM;
|
||||||
}
|
}
|
||||||
*temp = '\0';
|
|
||||||
|
|
||||||
strcpy(temp, (*result));
|
|
||||||
free(*result);
|
|
||||||
*result = temp;
|
*result = temp;
|
||||||
|
|
||||||
return UTF8_CONVERT_OK;
|
return UTF8_CONVERT_OK;
|
||||||
|
Loading…
Reference in New Issue
Block a user