Add implementation of mbsrtowcs() to our locale backend.

* add MultibyteStringToWchar() to ICU locale backend
* implement mbsrtowcs() and mbsnrtowcs() on top of MultibyteStringToWchar()
* drop respective glibc files
2011-12-07 18:20:34 +01:00 · 2011-12-07 18:20:34 +01:00 · 73186b2fcd
commit 73186b2fcd
parent d0e7bc307c
11 changed files with 216 additions and 312 deletions
--- a/headers/private/libroot/locale/ICUCtypeData.h
+++ b/headers/private/libroot/locale/ICUCtypeData.h
@ -33,6 +33,10 @@ public:
 			status_t			MultibyteToWchar(wchar_t* wcOut, const char* mb,
 									size_t mbLength, mbstate_t* mbState,
 									size_t& lengthOut);
+			// Converts the multibyte string at *mbSource (reading at most
+			// mbSourceLength bytes) into at most wcDestLength wide
+			// characters; lengthOut receives the number of wide characters
+			// converted. wcDest may be NULL to only count.
+			status_t			MultibyteStringToWchar(wchar_t* wcDest,
+									size_t wcDestLength, const char** mbSource,
+									size_t mbSourceLength, mbstate_t* mbState,
+									size_t& lengthOut);
 			status_t			WcharToMultibyte(char* mbOut, wchar_t wc,
 									mbstate_t* mbState, size_t& lengthOut);

--- a/headers/private/libroot/locale/ICULocaleBackend.h
+++ b/headers/private/libroot/locale/ICULocaleBackend.h
@ -44,6 +44,10 @@ public:
 	virtual status_t			MultibyteToWchar(wchar_t* wcOut, const char* mb,
 									size_t mbLength, mbstate_t* mbState,
 									size_t& lengthOut);
+	// String variant of MultibyteToWchar(); the implementation forwards to
+	// the ICUCtypeData method of the same name.
+	virtual status_t			MultibyteStringToWchar(wchar_t* wcDest,
+									size_t wcDestLength, const char** mbSource,
+									size_t mbSourceLength, mbstate_t* mbState,
+									size_t& lengthOut);
 	virtual status_t			WcharToMultibyte(char* mbOut, wchar_t wc,
 									mbstate_t* mbState, size_t& lengthOut);

--- a/headers/private/libroot/locale/LocaleBackend.h
+++ b/headers/private/libroot/locale/LocaleBackend.h
@ -126,6 +126,10 @@ public:
 	virtual status_t			MultibyteToWchar(wchar_t* wcOut, const char* mb,
 									size_t mbLength, mbstate_t* mbState,
 									size_t& lengthOut) = 0;
+	// Backend hook for converting a whole multibyte string to wide
+	// characters; pure virtual, so every locale backend has to provide it.
+	// mbsnrtowcs() calls it as (dst, len, src, nmc, ps, result).
+	virtual status_t			MultibyteStringToWchar(wchar_t* wcDest,
+									size_t wcDestLength, const char** mbSource,
+									size_t mbSourceLength, mbstate_t* mbState,
+									size_t& lengthOut) = 0;
 	virtual status_t			WcharToMultibyte(char* mbOut, wchar_t wc,
 									mbstate_t* mbState, size_t& lengthOut) = 0;

--- a/src/system/libroot/add-ons/icu/ICUCtypeData.cpp
+++ b/src/system/libroot/add-ons/icu/ICUCtypeData.cpp
@ -268,6 +268,77 @@ ICUCtypeData::MultibyteToWchar(wchar_t* wcOut, const char* mb, size_t mbLen,
 }


+/*!	Converts the multibyte string \c *mbSource into wide characters, one
+	character at a time, using the ICU converter associated with \a mbState.
+
+	\param wcDest Destination buffer; may be \c NULL, in which case the
+		characters are only counted and \c *mbSource is left untouched.
+	\param wcDestLength Maximum number of wide characters to produce (ignored
+		if \a wcDest is \c NULL).
+	\param mbSource In/out pointer to the multibyte input. If \a wcDest is
+		not \c NULL it is advanced past the consumed bytes, or set to \c NULL
+		once the terminating null character has been converted.
+	\param mbSourceLength Maximum number of input bytes to consume.
+	\param mbState Conversion state; reset to all zeros when the terminator
+		has been stored in \a wcDest.
+	\param lengthOut Receives the number of wide characters converted, not
+		counting a terminating null character.
+	\return \c B_OK on success, \c B_BAD_DATA if ICU reported a conversion
+		failure, or the error returned by _GetConverterForMbState().
+*/
+status_t
+ICUCtypeData::MultibyteStringToWchar(wchar_t* wcDest, size_t wcDestLength,
+	const char** mbSource, size_t mbSourceLength, mbstate_t* mbState,
+	size_t& lengthOut)
+{
+	ICUConverterRef converterRef;
+	status_t result = _GetConverterForMbState(mbState, converterRef);
+	if (result != B_OK) {
+		TRACE(("MultibyteStringToWchar(): couldn't get converter for ID %d -"
+			" %lx\n", mbState->converterID, result));
+		return result;
+	}
+
+	UConverter* converter = converterRef->Converter();
+
+	bool wcsIsTerminated = false;
+	const char* source = *mbSource;
+	const char* sourceEnd = source + mbSourceLength;
+	if (sourceEnd < source) {
+		// overflow, clamp to highest possible address
+		sourceEnd = (const char*)-1;
+	}
+
+	if (wcDest == NULL) {
+		// if there's no destination buffer, there's no length limit either
+		wcDestLength = (size_t)-1;
+	}
+
+	UErrorCode icuStatus = U_ZERO_ERROR;
+	size_t sourceLengthUsed = 0;
+	for (lengthOut = 0; lengthOut < wcDestLength; ++lengthOut) {
+		// stop once all permitted input bytes have been consumed
+		if (sourceLengthUsed >= mbSourceLength)
+			break;
+		// convert a single character, offering the converter a window of at
+		// most MB_LEN_MAX bytes that never reaches beyond sourceEnd
+		UChar32 unicodeChar = ucnv_getNextUChar(converter, &source,
+			std::min(source + MB_LEN_MAX, sourceEnd), &icuStatus);
+		sourceLengthUsed = source - *mbSource;
+		TRACE(("l:%lu wl:%lu s:%p se:%p sl:%lu slu:%lu uchar:%x st:%x\n",
+			lengthOut, wcDestLength, source, sourceEnd, mbSourceLength,
+			sourceLengthUsed, unicodeChar, icuStatus));
+		if (!U_SUCCESS(icuStatus))
+			break;
+		if (wcDest != NULL)
+			*wcDest++ = unicodeChar;
+		if (unicodeChar == L'\0') {
+			// leaving the loop here skips the ++lengthOut, so the terminator
+			// is stored but not counted; the "terminated" handling below only
+			// applies when there is a destination buffer
+			if (wcDest != NULL)
+				wcsIsTerminated = true;
+			break;
+		}
+		icuStatus = U_ZERO_ERROR;
+	}
+
+	// the caller's source pointer is only advanced when actually converting
+	if (wcDest != NULL)
+		*mbSource = source;
+
+	if (!U_SUCCESS(icuStatus)) {
+		// conversion failed because of illegal character sequence
+		// NOTE(review): every ICU failure status ends up here, not only
+		// illegal sequences (e.g. a truncated trailing character) - confirm
+		// that reporting all of them as B_BAD_DATA is intended.
+		TRACE(("MultibyteStringToWchar(): illegal character sequence\n"));
+		ucnv_resetToUnicode(converter);
+		result = B_BAD_DATA;
+	} else if (wcsIsTerminated) {
+		// reset to initial state
+		_DropConverterFromMbState(mbState);
+		memset(mbState, 0, sizeof(mbstate_t));
+		*mbSource = NULL;
+	} else
+		mbState->count = 0;
+			// presumably clears the count of buffered partial-character
+			// bytes - TODO confirm against the mbstate_t definition
+
+	return result;
+}
+
+
 status_t
 ICUCtypeData::WcharToMultibyte(char* mbOut, wchar_t wc, mbstate_t* mbState,
 	size_t& lengthOut)
--- a/src/system/libroot/add-ons/icu/ICULocaleBackend.cpp
+++ b/src/system/libroot/add-ons/icu/ICULocaleBackend.cpp
@ -162,6 +162,18 @@ ICULocaleBackend::MultibyteToWchar(wchar_t* wcOut, const char* mb,
 }


+/*!	Hands the multibyte-string-to-wide-string conversion over to the ctype
+	data object. An ErrnoMaintainer is kept alive for the duration of the
+	call and the status of the conversion is passed back unchanged.
+*/
+status_t
+ICULocaleBackend::MultibyteStringToWchar(wchar_t* wcDest, size_t wcDestLength,
+	const char** mbSource, size_t mbSourceLength, mbstate_t* mbState,
+	size_t& lengthOut)
+{
+	ErrnoMaintainer errnoMaintainer;
+
+	const status_t conversionStatus = fCtypeData.MultibyteStringToWchar(
+		wcDest, wcDestLength, mbSource, mbSourceLength, mbState, lengthOut);
+	return conversionStatus;
+}
+
+
 status_t
 ICULocaleBackend::WcharToMultibyte(char* mbOut, wchar_t wc, mbstate_t* mbState,
 	size_t& lengthOut)
--- a/src/system/libroot/posix/glibc/wcsmbs/Jamfile
+++ b/src/system/libroot/posix/glibc/wcsmbs/Jamfile
@ -18,9 +18,6 @@ UsePrivateHeaders libroot ;
 SubDirCcFlags -D_GNU_SOURCE -DUSE_IN_LIBIO ;

 MergeObject posix_gnu_wcsmbs.o :
-	mbsnrtowcs.c
-	mbsrtowcs.c
-#	mbsrtowcs_l.c
 	wcpcpy.c
 	wcpncpy.c
 	wcscasecmp.c
--- a/src/system/libroot/posix/glibc/wcsmbs/mbsnrtowcs.c
+++ b/src/system/libroot/posix/glibc/wcsmbs/mbsnrtowcs.c
@ -1,139 +0,0 @@
-/* Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <dlfcn.h>
-#include <errno.h>
-#include <gconv.h>
-#include <string.h>
-#include <wchar.h>
-#include <wcsmbsload.h>
-
-#include <assert.h>
-
-#ifndef EILSEQ
-# define EILSEQ EINVAL
-#endif
-
-
-/* This is the private state used if PS is NULL.  */
-static mbstate_t state;
-
-/* This is a non-standard function but it is very useful in the
-   implementation of stdio because we have to deal with unterminated
-   buffers.  At most NMC bytes will be converted.  */
-size_t
-__mbsnrtowcs (dst, src, nmc, len, ps)
-     wchar_t *dst;
-     const char **src;
-     size_t nmc;
-     size_t len;
-     mbstate_t *ps;
-{
-  const unsigned char *srcend;
-  struct __gconv_step_data data;
-  size_t result;
-  int status;
-  struct __gconv_step *towc;
-  size_t dummy;
-
-  /* Tell where we want the result.  */
-  data.__invocation_counter = 0;
-  data.__internal_use = 1;
-  data.__flags = __GCONV_IS_LAST;
-  data.__statep = ps ?: &state;
-  data.__trans = NULL;
-
-  if (nmc == 0)
-    return 0;
-  srcend = *src + __strnlen (*src, nmc - 1) + 1;
-
-  /* Make sure we use the correct function.  */
-  update_conversion_ptrs ();
-
-  /* Get the structure with the function pointers.  */
-  towc = __wcsmbs_gconv_fcts.towc;
-
-  /* We have to handle DST == NULL special.  */
-  if (dst == NULL)
-    {
-      wchar_t buf[64];		/* Just an arbitrary size.  */
-      const unsigned char *inbuf = *src;
-
-      result = 0;
-      data.__outbufend = (unsigned char *) buf + sizeof (buf);
-      do
-	{
-	  data.__outbuf = (unsigned char *) buf;
-
-	  status = DL_CALL_FCT (towc->__fct,
-				(towc, &data, &inbuf, srcend, NULL,
-				 &dummy, 0, 1));
-
-	  result += (wchar_t *) data.__outbuf - buf;
-	}
-      while (status == __GCONV_FULL_OUTPUT);
-
-      if ((status == __GCONV_OK || status == __GCONV_EMPTY_INPUT)
-	  && ((wchar_t *) data.__outbuf)[-1] == L'\0')
-	/* Don't count the NUL character in.  */
-	--result;
-    }
-  else
-    {
-      /* This code is based on the safe assumption that all internal
-	 multi-byte encodings use the NUL byte only to mark the end
-	 of the string.  */
-      data.__outbuf = (unsigned char *) dst;
-      data.__outbufend = data.__outbuf + len * sizeof (wchar_t);
-
-      status = DL_CALL_FCT (towc->__fct,
-			    (towc, &data, (const unsigned char **) src, srcend,
-			     NULL, &dummy, 0, 1));
-
-      result = (wchar_t *) data.__outbuf - dst;
-
-      /* We have to determine whether the last character converted
-	 is the NUL character.  */
-      if ((status == __GCONV_OK || status == __GCONV_EMPTY_INPUT)
-	  && (assert (result > 0),
-	      ((wchar_t *) dst)[result - 1] == L'\0'))
-	{
-	  assert (__mbsinit (data.__statep));
-	  *src = NULL;
-	  --result;
-	}
-    }
-
-  /* There must not be any problems with the conversion but illegal input
-     characters.  */
-  assert (status == __GCONV_OK || status == __GCONV_EMPTY_INPUT
-	  || status == __GCONV_ILLEGAL_INPUT
-	  || status == __GCONV_INCOMPLETE_INPUT
-	  || status == __GCONV_FULL_OUTPUT);
-
-  if (status != __GCONV_OK && status != __GCONV_FULL_OUTPUT
-      && status != __GCONV_EMPTY_INPUT && status != __GCONV_INCOMPLETE_INPUT)
-    {
-      result = (size_t) -1;
-      __set_errno (EILSEQ);
-    }
-
-  return result;
-}
-weak_alias (__mbsnrtowcs, mbsnrtowcs)
--- a/src/system/libroot/posix/glibc/wcsmbs/mbsrtowcs.c
+++ b/src/system/libroot/posix/glibc/wcsmbs/mbsrtowcs.c
@ -1,145 +0,0 @@
-/* Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <dlfcn.h>
-#include <errno.h>
-#include <gconv.h>
-#include <stdlib.h>
-#include <string.h>
-#include <wchar.h>
-#include <wcsmbsload.h>
-
-#include <assert.h>
-
-#ifndef EILSEQ
-# define EILSEQ EINVAL
-#endif
-
-
-/* This is the private state used if PS is NULL.  */
-static mbstate_t state;
-
-size_t
-__mbsrtowcs (dst, src, len, ps)
-     wchar_t *dst;
-     const char **src;
-     size_t len;
-     mbstate_t *ps;
-{
-  struct __gconv_step_data data;
-  size_t result;
-  int status;
-  struct __gconv_step *towc;
-  size_t non_reversible;
-
-  /* Tell where we want the result.  */
-  data.__invocation_counter = 0;
-  data.__internal_use = 1;
-  data.__flags = __GCONV_IS_LAST;
-  data.__statep = ps ?: &state;
-  data.__trans = NULL;
-
-  /* Make sure we use the correct function.  */
-  update_conversion_ptrs ();
-
-  /* Get the structure with the function pointers.  */
-  towc = __wcsmbs_gconv_fcts.towc;
-
-  /* We have to handle DST == NULL special.  */
-  if (dst == NULL)
-    {
-      mbstate_t temp_state;
-      wchar_t buf[64];		/* Just an arbitrary size.  */
-      const unsigned char *inbuf = (const unsigned char *) *src;
-      const unsigned char *srcend = inbuf + strlen (inbuf) + 1;
-
-      temp_state = *data.__statep;
-      data.__statep = &temp_state;
-
-      result = 0;
-      data.__outbufend = (char *) buf + sizeof (buf);
-      do
-	{
-	  data.__outbuf = (char *) buf;
-
-	  status = DL_CALL_FCT (towc->__fct,
-				(towc, &data, &inbuf, srcend, NULL,
-				 &non_reversible, 0, 1));
-
-	  result += (wchar_t *) data.__outbuf - buf;
-	}
-      while (status == __GCONV_FULL_OUTPUT);
-
-      if (status == __GCONV_OK || status == __GCONV_EMPTY_INPUT)
-	{
-	  /* There better should be a NUL wide char at the end.  */
-	  assert (((wchar_t *) data.__outbuf)[-1] == L'\0');
-	  /* Don't count the NUL character in.  */
-	  --result;
-	}
-    }
-  else
-    {
-      /* This code is based on the safe assumption that all internal
-	 multi-byte encodings use the NUL byte only to mark the end
-	 of the string.  */
-      const unsigned char *srcend;
-
-      srcend = (const unsigned char *) (*src
-					+ __strnlen (*src, len * MB_CUR_MAX)
-					+ 1);
-
-      data.__outbuf = (unsigned char *) dst;
-      data.__outbufend = data.__outbuf + len * sizeof (wchar_t);
-
-      status = DL_CALL_FCT (towc->__fct,
-			    (towc, &data, (const unsigned char **) src, srcend,
-			     NULL, &non_reversible, 0, 1));
-
-      result = (wchar_t *) data.__outbuf - dst;
-
-      /* We have to determine whether the last character converted
-	 is the NUL character.  */
-      if ((status == __GCONV_OK || status == __GCONV_EMPTY_INPUT)
-	  && ((wchar_t *) dst)[result - 1] == L'\0')
-	{
-	  assert (result > 0);
-	  assert (__mbsinit (data.__statep));
-	  *src = NULL;
-	  --result;
-	}
-    }
-
-  /* There must not be any problems with the conversion but illegal input
-     characters.  */
-  assert (status == __GCONV_OK || status == __GCONV_EMPTY_INPUT
-	  || status == __GCONV_ILLEGAL_INPUT
-	  || status == __GCONV_INCOMPLETE_INPUT
-	  || status == __GCONV_FULL_OUTPUT);
-
-  if (status != __GCONV_OK && status != __GCONV_FULL_OUTPUT
-      && status != __GCONV_EMPTY_INPUT && status != __GCONV_INCOMPLETE_INPUT)
-    {
-      result = (size_t) -1;
-      __set_errno (EILSEQ);
-    }
-
-  return result;
-}
-weak_alias (__mbsrtowcs, mbsrtowcs)
--- a/src/system/libroot/posix/glibc/wcsmbs/mbsrtowcs_l.c
+++ b/src/system/libroot/posix/glibc/wcsmbs/mbsrtowcs_l.c
@ -1,25 +0,0 @@
-/* Copyright (C) 2002 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@gnu.org>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <ctype.h>
-#include <string.h>
-#include "wcsmbsload.h"
-
-#define USE_IN_EXTENDED_LOCALE_MODEL	1
-#include "mbsrtowcs.c"
--- a/src/system/libroot/posix/wchar/Jamfile
+++ b/src/system/libroot/posix/wchar/Jamfile
@ -11,6 +11,7 @@ MergeObject posix_wchar.o :
 	mbrlen.c
 	mbrtowc.cpp
 	mbsinit.c
+	mbsrtowcs.cpp
 	mbtowc.c
 	wcrtomb.cpp
 	wcswidth.c
--- a/src/system/libroot/posix/wchar/mbsrtowcs.cpp
+++ b/src/system/libroot/posix/wchar/mbsrtowcs.cpp
@ -0,0 +1,120 @@
+/*
+** Copyright 2011, Oliver Tappe, zooey@hirschkaefer.de. All rights reserved.
+** Distributed under the terms of the Haiku License.
+*/
+
+#include <errno.h>
+#include <string.h>
+#include <wchar.h>
+
+#include <errno_private.h>
+#include "LocaleBackend.h"
+
+
+//#define TRACE_MBSRTOWCS
+#ifdef TRACE_MBSRTOWCS
+#	include <OS.h>
+#	define TRACE(x) debug_printf x
+#else
+#	define TRACE(x) ;
+#endif
+
+
+using BPrivate::Libroot::gLocaleBackend;
+
+
+/*!	Converts at most \a nmc bytes of the multibyte string \c *src into at most
+	\a len wide characters stored at \a dst.
+
+	\param dst Destination buffer. If \c NULL, the wide characters are only
+		counted, \a len is ignored and \c *src is left untouched.
+	\param src In/out pointer to the multibyte input. If \a dst is not
+		\c NULL, it is set to \c NULL once the terminating null character has
+		been converted, otherwise it is advanced past the bytes consumed (in
+		the POSIX locale, on error it is left pointing at the offending byte).
+	\param nmc Maximum number of input bytes to examine; \c (size_t)-1 means
+		"no limit".
+	\param len Maximum number of wide characters to store at \a dst.
+	\param ps Conversion state; if \c NULL, a function-internal state is used.
+	\return The number of wide characters converted, not counting the
+		terminating null character, or \c (size_t)-1 with \c errno set to
+		\c EILSEQ (invalid input) or \c EINVAL (other backend failure).
+*/
+extern "C" size_t
+__mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len,
+	mbstate_t* ps)
+{
+	TRACE(("mbsnrtowcs(%p, %p, %lu, %lu)\n", dst, *src, nmc, len));
+
+	if (ps == NULL) {
+		static mbstate_t internalMbState;
+		ps = &internalMbState;
+	}
+
+	if (gLocaleBackend == NULL) {
+		/*
+		 * The POSIX locale is active. Since the POSIX locale only contains
+		 * chars 0-127 and those ASCII chars are compatible with the UTF32
+		 * values used in wint_t, we can just copy the bytes.
+		 */
+		const char* const source = *src;
+		// without a destination buffer there is no output limit
+		const size_t limit = dst == NULL ? (size_t)-1 : len;
+		size_t count = 0;
+		size_t consumed = 0;
+			// an index instead of an end pointer, as source + nmc would
+			// overflow for nmc == (size_t)-1
+		bool terminated = false;
+
+		while (count < limit && consumed < nmc) {
+			// inspect the byte itself (not the pointer) and do so
+			// independently of the signedness of char
+			const unsigned char c = (unsigned char)source[consumed];
+			if (c > 127) {
+				// char is non-ASCII
+				if (dst != NULL)
+					*src = source + consumed;
+				__set_errno(EILSEQ);
+				return (size_t)-1;
+			}
+			++consumed;
+			if (dst != NULL)
+				dst[count] = (wchar_t)c;
+			if (c == 0) {
+				// the terminator is stored, but not counted
+				terminated = true;
+				break;
+			}
+			++count;
+		}
+
+		if (terminated)
+			memset(ps, 0, sizeof(mbstate_t));
+		// only report progress to the caller when actually converting
+		if (dst != NULL)
+			*src = terminated ? NULL : source + consumed;
+
+		TRACE(("mbsnrtowcs returns %lx and src %p\n", count, *src));
+
+		return count;
+	}
+
+	size_t result = 0;
+	status_t status = gLocaleBackend->MultibyteStringToWchar(dst, len, src, nmc,
+		ps, result);
+
+	if (status == B_BAD_DATA) {
+		TRACE(("mbsnrtowc(): setting errno to EILSEQ\n"));
+		__set_errno(EILSEQ);
+		result = (size_t)-1;
+	} else if (status != B_OK) {
+		TRACE(("mbsnrtowc(): setting errno to EINVAL (status: %lx)\n", status));
+		__set_errno(EINVAL);
+		result = (size_t)-1;
+	}
+
+	TRACE(("mbsnrtowcs returns %lx and src %p\n", result, *src));
+
+	return result;
+}
+
+
+extern "C"
+B_DEFINE_WEAK_ALIAS(__mbsnrtowcs, mbsnrtowcs);
+
+
+/*!	Converts the null-terminated multibyte string \c *src into at most \a len
+	wide characters stored at \a dst (or only counts them if \a dst is
+	\c NULL). Implemented on top of __mbsnrtowcs() without an input byte
+	limit; see there for the handling of \a src and the return value.
+
+	\param ps Conversion state; if \c NULL, a state private to this function
+		is used.
+*/
+extern "C" size_t
+__mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps)
+{
+	if (ps == NULL) {
+		static mbstate_t internalMbState;
+		ps = &internalMbState;
+	}
+
+	// The input is null-terminated, so no byte limit is needed. Passing
+	// strlen(*src) would exclude the terminator from the permitted input,
+	// such that it would never be converted and *src never be set to NULL.
+	return __mbsnrtowcs(dst, src, (size_t)-1, len, ps);
+}
+
+
+extern "C"
+B_DEFINE_WEAK_ALIAS(__mbsrtowcs, mbsrtowcs);