/* $NetBSD: utf8.c,v 1.3 2000/12/21 12:21:38 itojun Exp $ */ /*- * Copyright (c)1999 Citrus Project, * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * citrus Id: utf8.c,v 1.11 2000/12/21 12:21:05 itojun Exp */ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Paul Borman at Krystal Technologies. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #if defined(LIBC_SCCS) && !defined(lint) #if 0 static char sccsid[] = "@(#)utf2.c 8.1 (Berkeley) 6/4/93"; #else __RCSID("$NetBSD: utf8.c,v 1.3 2000/12/21 12:21:38 itojun Exp $"); #endif #endif /* LIBC_SCCS and not lint */ #include #include #include "rune.h" #include #include #include #include int _UTF8_init __P((_RuneLocale *)); static int findlen __P((rune_t)); rune_t _UTF8_sgetrune __P((_RuneLocale *, const char *, size_t, char const **, void *)); int _UTF8_sputrune __P((_RuneLocale *, rune_t, char *, size_t, char **, void *)); static int _utf_count[256]; static _RuneState _UTF8_RuneState = { 0, /* sizestate */ NULL, /* initstate */ NULL, /* packstate */ NULL /* unpackstate */ }; static u_int32_t _UTF8_range[] = { 0, /*dummy*/ 0x00000000, 0x00000080, 0x00000800, 0x00010000, 0x00200000, 0x04000000, 0x80000000, }; int _UTF8_init(rl) _RuneLocale *rl; { int i; rl->__rune_sgetrune = _UTF8_sgetrune; rl->__rune_sputrune = _UTF8_sputrune; rl->__rune_RuneState = &_UTF8_RuneState; rl->__rune_mb_cur_max = 6; memset(_utf_count, 0, sizeof(_utf_count)); for (i = 0; i <= 0x7f; i++) _utf_count[i] = 1; for (i = 0xc0; i <= 0xdf; i++) _utf_count[i] = 2; for (i = 0xe0; i <= 0xef; i++) _utf_count[i] = 3; for (i = 0xf0; i <= 0xf7; i++) _utf_count[i] = 4; for (i = 0xf8; i <= 0xfb; i++) _utf_count[i] = 5; for (i = 0xfc; i <= 0xfd; i++) _utf_count[i] = 6; return (0); } static int findlen(v) rune_t v; { int i; u_int32_t c; c = (u_int32_t)v; /*XXX*/ for (i = 1; i < sizeof(_UTF8_range) / sizeof(_UTF8_range[0]); i++) if (c >= _UTF8_range[i] && c < _UTF8_range[i + 1]) return i; return -1; /*out of range*/ } rune_t _UTF8_sgetrune(rl, string, n, result, state) _RuneLocale *rl; const char *string; size_t n; char const **result; void *state; { int c; int i; rune_t v; if (n < 1 || (c = _utf_count[*(u_int8_t *)string]) > n) { if (result) *result = string; return (___INVALID_RUNE(rl)); } switch (c) { case 1: if (result) *result = string + 1; return (*string & 0xff); case 2: case 3: case 4: case 5: case 6: v = string[0] & (0x7f >> c); for (i = 1; i < c; i++) { if ((string[i] & 0xC0) != 0x80) goto encoding_error; v <<= 6; v |= (string[i] & 0x3f); } #if 1 /* should we do it? utf2.c does not reject redundant encodings */ /* sanity check on value range */ i = findlen(v); if (i != c) goto encoding_error; #endif if (result) *result = string + c; return v; default: encoding_error: if (result) *result = string + 1; return (___INVALID_RUNE(rl)); } } int _UTF8_sputrune(rl, c, string, n, result, state) _RuneLocale *rl; rune_t c; char *string, **result; size_t n; void *state; { int cnt; int i; cnt = findlen(c); if (cnt <= 0 || cnt > 6) { /* invalid UCS4 value */ if (result) *result = NULL; return 0; } if (n >= cnt) { if (string) { for (i = cnt - 1; i > 0; i--) { string[i] = 0x80 | (c & 0x3f); c >>= 6; } string[0] = c; if (cnt == 1) string[0] &= 0x7f; else { string[0] &= (0x7f >> cnt); string[0] |= ((0xff00 >> cnt) & 0xff); } } if (result) *result = string + cnt; } else if (result) *result = NULL; return cnt; }