haiku/src/kits/textencoding/utf8_conversions.cpp

/*
 * Copyright 2003-2008, Haiku, Inc. All Rights Reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Andrew Bachmann
 */


#include <CharacterSet.h>
#include <CharacterSetRoster.h>
#include <UTF8.h>

#include <errno.h>
#include <iconv.h>
#include <stdio.h>


// Uncomment to compile in the debug output of this file (DEBPRINT and the
// "#if DEBUG_CONV" sections further down).
//#define DEBUG_CONV 1

// DEBPRINT takes a complete, parenthesized printf() argument list, e.g.
// DEBPRINT(("value: %d\n", value)). With DEBUG_CONV defined it expands to a
// printf() call, otherwise to an empty statement. Note that both expansions
// already end in a semicolon.
#ifdef DEBUG_CONV
#	define DEBPRINT(ARGS) printf ARGS;
#else
#	define DEBPRINT(ARGS) ;
#endif

// NOTE(review): presumably needed for BCharacterSet/BCharacterSetRoster --
// confirm against CharacterSet.h and CharacterSetRoster.h.
using namespace BPrivate;

// Declaration of iconvctl(); nothing in the visible part of this file calls
// it. NOTE(review): presumably the libiconv control function that <iconv.h>
// does not declare here -- confirm before removing.
int iconvctl(iconv_t icd, int request, void* argument);


// Skips the invalid character at the start of *inputBuffer.
//
// The length of the offending sequence is not known up front, so the
// converter is fed prefixes of increasing length (1, 2, ... bytes) until
// iconv() reports EILSEQ for one of them; that prefix is then discarded by
// advancing *inputBuffer and decreasing *inputLeft accordingly.
//
// \param conversion   The conversion descriptor to probe with. Its shift
//                     state is reset before every probe.
// \param inputBuffer  In/out: pointer to the current read position.
// \param inputLeft    In/out: number of bytes remaining at *inputBuffer.
//
// If a probe unexpectedly converts successfully, nothing is discarded. If no
// prefix is ever rejected with EILSEQ, all remaining input is discarded; the
// amount skipped never exceeds *inputLeft.
static void
discard_invalid_input_character(iconv_t* conversion, char** inputBuffer,
	size_t* inputLeft)
{
	if (*inputLeft == 0)
		return;

	// Scratch output; the converted result of a probe is never used.
	char outputBuffer[1];

	// Number of bytes to skip. Defaults to everything that is left, which
	// is what gets dropped if the loop below runs to completion without
	// finding the end of the invalid sequence.
	size_t discard = *inputLeft;

	for (size_t length = 1; length <= *inputLeft; length++) {
		// reset internal state
		iconv(*conversion, NULL, NULL, NULL, NULL);

		// iconv() updates the pointers and counters it is given, so it only
		// ever sees copies; "length" stays a reliable loop counter.
		char* buffer = *inputBuffer;
		char* output = outputBuffer;
		size_t outputLeft = sizeof(outputBuffer);
		size_t probeLeft = length;
		size_t size = iconv(*conversion, &buffer, &probeLeft,
			&output, &outputLeft);

		if (size != (size_t)-1) {
			// should not reach here: the prefix converted fine after the
			// state reset, so there is nothing to discard
			discard = 0;
			break;
		}

		if (errno == EILSEQ) {
			// minimal size of input buffer found
			discard = length;
			break;
		}

		// EINVAL: too few input bytes provided, increase the probe size and
		// try again. Any other error should not happen; it is handled the
		// same way.
	}

	*inputBuffer += discard;
	*inputLeft -= discard;
}


// Converts the bytes in \a src from the encoding named \a from to the
// encoding named \a to, using iconv.
//
// \param from        iconv name of the source encoding.
// \param to          iconv name of the target encoding.
// \param src         Input bytes.
// \param srcLen      In: number of bytes available at \a src.
//                    Out (on B_OK): number of input bytes consumed.
// \param dst         Output buffer.
// \param dstLen      In: capacity of \a dst in bytes.
//                    Out (on B_OK): number of bytes written to \a dst.
// \param state       May be NULL. If NULL or pointing to 0, the initial
//                    shift sequence is emitted first and *state is set to 1.
// \param substitute  Character written in place of each input character
//                    that cannot be converted.
//
// \return B_OK on success (including when the output buffer filled up
//         before all input was consumed), B_ERROR if no converter could be
//         opened, or the errno value of an unexpected iconv() failure. In
//         the two error cases *srcLen and *dstLen are not updated.
status_t
convert_encoding(const char* from, const char* to, const char* src,
	int32* srcLen, char* dst, int32* dstLen, int32* state,
	char substitute)
{
	if (*srcLen == 0) {
		// nothing to do!
		*dstLen = 0;
		return B_OK;
	}

	// TODO: this doesn't work, as the state is reset every time!
	iconv_t conversion = iconv_open(to, from);
	if (conversion == (iconv_t)-1) {
		DEBPRINT(("iconv_open failed\n"));
		return B_ERROR;
	}

	size_t outputLeft = *dstLen;

	if (state == NULL || *state == 0) {
		if (state != NULL)
			*state = 1;

		// Reset the converter and write the initial shift sequence, if the
		// target encoding has one.
		iconv(conversion, NULL, NULL, &dst, &outputLeft);
	}

	// iconv() wants a non-const input pointer that it can advance; this
	// aliases the local "src" parameter.
	char** inputBuffer = const_cast<char**>(&src);
	size_t inputLeft = *srcLen;
	do {
		size_t nonReversibleConversions = iconv(conversion, inputBuffer,
			&inputLeft, &dst, &outputLeft);
		if (nonReversibleConversions == (size_t)-1) {
			// Remember the error right away, the calls made while handling
			// it may change errno.
			const int error = errno;

			if (error == E2BIG) {
				// Not enough room in the output buffer for the next converted character
				// This is not a "real" error, we just quit out.
				break;
			}

			switch (error) {
				case EILSEQ: // unable to generate a corresponding character
				{
					discard_invalid_input_character(&conversion, inputBuffer,
						&inputLeft);

					// prepare to convert the substitute character to target encoding
					char original = substitute;
					size_t len = 1;
					char* copy = &original;

					// Perform the conversion
					// We ignore any errors during this as part of robustness/best-effort
					// We use ISO-8859-1 as a source because it is a single byte encoding
					// It also overlaps UTF-8 for the lower 128 characters.  It is also
					// likely to have a mapping to almost any target encoding.
					iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
					if (iso8859_1to != (iconv_t)-1) {
						iconv(iso8859_1to, 0, 0, 0, 0);
						iconv(iso8859_1to, &copy, &len, &dst, &outputLeft);
						iconv_close(iso8859_1to);
					}
					break;
				}

				case EINVAL: // incomplete multibyte sequence at the end of the input
					// TODO inputLeft bytes from inputBuffer should
					// be stored in state variable, so that conversion
					// can continue when the caller provides the missing
					// bytes with the next call of this method

					// we just eat bad bytes, as part of robustness/best-effort
					// Advance the data pointer, not the char** itself: the
					// latter would make the next iconv() call read a bogus
					// input pointer.
					(*inputBuffer)++;
					inputLeft--;
					break;

				default:
				{
					// unknown error, completely bail
					status_t status = error;
					iconv_close(conversion);
					return status;
				}
			}
		}
	} while (inputLeft > 0 && outputLeft > 0);

	// Report how much was consumed and produced.
	*srcLen -= inputLeft;
	*dstLen -= outputLeft;
	iconv_close(conversion);

	return B_OK;
}


// Converts \a src from the character set registered under the conversion ID
// \a srcEncoding to UTF-8.
//
// Returns B_ERROR if the roster has no character set for that ID; otherwise
// all arguments are handed on to convert_encoding() and its result is
// returned.
status_t
convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen,
	char* dst, int32* dstLen, int32* state, char substitute)
{
	const BCharacterSet* sourceSet
		= BCharacterSetRoster::GetCharacterSetByConversionID(srcEncoding);
	if (sourceSet == NULL)
		return B_ERROR;

#if DEBUG_CONV
	// Dump the raw input, byte by byte, enclosed in quotes.
	fprintf(stderr, "convert_to_utf8(%s) : \"", sourceSet->GetName());
	int index = 0;
	while (index < *srcLen) {
		fprintf(stderr, "%c", src[index]);
		index++;
	}
	fprintf(stderr, "\"\n");
#endif

	return convert_encoding(sourceSet->GetName(), "UTF-8", src, srcLen,
		dst, dstLen, state, substitute);
}


// Converts the UTF-8 input \a src to the character set registered under the
// conversion ID \a dstEncoding.
//
// Returns B_ERROR if the roster has no character set for that ID; otherwise
// all arguments are handed on to convert_encoding() and its result is
// returned.
status_t
convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen,
	char* dst, int32* dstLen, int32* state, char substitute)
{
	const BCharacterSet* targetSet
		= BCharacterSetRoster::GetCharacterSetByConversionID(dstEncoding);
	if (targetSet == NULL)
		return B_ERROR;

#if DEBUG_CONV
	// Dump the raw input, byte by byte, enclosed in quotes.
	fprintf(stderr, "convert_from_utf8(%s) : \"", targetSet->GetName());
	int index = 0;
	while (index < *srcLen) {
		fprintf(stderr, "%c", src[index]);
		index++;
	}
	fprintf(stderr, "\"\n");
#endif

	return convert_encoding("UTF-8", targetSet->GetName(), src, srcLen,
		dst, dstLen, state, substitute);
}