Replace each occurence of an invalide character. Before all invalid

characters where omitted and the substitute character was append at the end
of the input text.
Added comment how the continuation of incomplete multibyte sequences
could be solved.
Please review.


git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@35875 a95241bf-73f2-0310-859d-f6bbb57e9c96
This commit is contained in:
Michael Pfeiffer 2010-03-16 09:28:53 +00:00
parent 4e19eb33b3
commit 2367c2d78d
1 changed files with 53 additions and 6 deletions

View File

@ -29,6 +29,51 @@ using namespace BPrivate;
int iconvctl(iconv_t icd, int request, void* argument);
static void
discard_invalid_input_character(iconv_t* conversion, char** inputBuffer,
size_t* inputLeft)
{
if (*inputLeft == 0)
return;
char outputBuffer[1];
// skip the invalid input character only
size_t left = 1;
for (; left <= *inputLeft; left ++) {
// reset internal state
iconv(*conversion, NULL, NULL, NULL, NULL);
char* buffer = *inputBuffer;
char* output = outputBuffer;
size_t outputLeft = 1;
size_t size = iconv(*conversion, &buffer, &left,
&output, &outputLeft);
if (size != (size_t)-1) {
// should not reach here
break;
}
if (errno == EINVAL) {
// too few input bytes provided,
// increase input buffer size and try again
continue;
}
if (errno == EILSEQ) {
// minimal size of input buffer found
break;
}
// should not reach here
};
*inputBuffer += left;
*inputLeft -= left;
}
status_t
convert_encoding(const char* from, const char* to, const char* src,
int32* srcLen, char* dst, int32* dstLen, int32* state,
@ -71,11 +116,8 @@ convert_encoding(const char* from, const char* to, const char* src,
switch (errno) {
case EILSEQ: // unable to generate a corresponding character
{
// discard the input character
const int one = 1, zero = 0;
iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&one);
iconv(conversion, inputBuffer, &inputLeft, &dst, &outputLeft);
iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&zero);
discard_invalid_input_character(&conversion, inputBuffer,
&inputLeft);
// prepare to convert the substitute character to target encoding
char original = substitute;
@ -96,7 +138,12 @@ convert_encoding(const char* from, const char* to, const char* src,
break;
}
case EINVAL: // incomplete multibyte sequence in the input
case EINVAL: // incomplete multibyte sequence at the end of the input
// TODO inputLeft bytes from inputBuffer should
// be stored in state variable, so that conversion
// can continue when the caller provides the missing
// bytes with the next call of this method
// we just eat bad bytes, as part of robustness/best-effort
inputBuffer++;
inputLeft--;