--- a/usr/src/lib/libc/port/locale/euc.c Sat Jan 01 19:31:36 2011 -0500
+++ b/usr/src/lib/libc/port/locale/euc.c Thu Jan 13 08:38:20 2011 -0800
@@ -1,5 +1,5 @@
/*
- * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
@@ -39,23 +39,37 @@
#include <string.h>
#include <wchar.h>
#include <sys/types.h>
+#include <sys/euc.h>
#include "runetype.h"
#include "mblocal.h"
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
+static size_t _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
+ const char *_RESTRICT_KYWD,
+ size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
+static size_t _EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
+ mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
-static size_t _EUC_mbrtowc(wchar_t *_RESTRICT_KYWD,
+static size_t _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
+ const char *_RESTRICT_KYWD,
+ size_t, mbstate_t *_RESTRICT_KYWD);
+static size_t _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
const char *_RESTRICT_KYWD,
size_t, mbstate_t *_RESTRICT_KYWD);
-static int _EUC_mbsinit(const mbstate_t *);
-static size_t _EUC_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
+static size_t _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
+ const char *_RESTRICT_KYWD,
+ size_t, mbstate_t *_RESTRICT_KYWD);
+static size_t _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
+ const char *_RESTRICT_KYWD,
+ size_t, mbstate_t *_RESTRICT_KYWD);
+static size_t _EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
mbstate_t *_RESTRICT_KYWD);
-
-typedef struct {
- int count[4];
- wchar_t bits[4];
- wchar_t mask;
-} _EucInfo;
+static size_t _EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
+ mbstate_t *_RESTRICT_KYWD);
+static size_t _EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
+ mbstate_t *_RESTRICT_KYWD);
+static size_t _EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
+ mbstate_t *_RESTRICT_KYWD);
+static int _EUC_mbsinit(const mbstate_t *);
typedef struct {
wchar_t ch;
@@ -63,59 +77,6 @@
int want;
} _EucState;
-int
-_EUC_init(_RuneLocale *rl)
-{
- _EucInfo *ei;
- int x, new__mb_cur_max;
- char *v, *e;
-
- if (rl->__variable == NULL)
- return (EINVAL);
-
- v = (char *)rl->__variable;
-
- while (*v == ' ' || *v == '\t')
- ++v;
-
- if ((ei = malloc(sizeof (_EucInfo))) == NULL)
- return (errno == 0 ? ENOMEM : errno);
-
- new__mb_cur_max = 0;
- for (x = 0; x < 4; ++x) {
- ei->count[x] = (int)strtol(v, &e, 0);
- if (v == e || !(v = e)) {
- free(ei);
- return (EINVAL);
- }
- if (new__mb_cur_max < ei->count[x])
- new__mb_cur_max = ei->count[x];
- while (*v == ' ' || *v == '\t')
- ++v;
- ei->bits[x] = (int)strtol(v, &e, 0);
- if (v == e || !(v = e)) {
- free(ei);
- return (EINVAL);
- }
- while (*v == ' ' || *v == '\t')
- ++v;
- }
- ei->mask = (int)strtol(v, &e, 0);
- if (v == e || !(v = e)) {
- free(ei);
- return (EINVAL);
- }
- rl->__variable = ei;
- rl->__variable_len = sizeof (_EucInfo);
- _CurrentRuneLocale = rl;
- __ctype[520] = new__mb_cur_max;
- __mbrtowc = _EUC_mbrtowc;
- __wcrtomb = _EUC_wcrtomb;
- __mbsinit = _EUC_mbsinit;
- charset_is_ascii = 0;
- return (0);
-}
-
static int
_EUC_mbsinit(const mbstate_t *ps)
{
@@ -123,34 +84,147 @@
return (ps == NULL || ((const _EucState *)ps)->want == 0);
}
-#define CEI ((_EucInfo *)(_CurrentRuneLocale->__variable))
+/*
+ * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
+ */
+int
+_EUC_CN_init(_RuneLocale *rl)
+{
+ __mbrtowc = _EUC_CN_mbrtowc;
+ __wcrtomb = _EUC_CN_wcrtomb;
+ __mbsinit = _EUC_mbsinit;
+
+ _CurrentRuneLocale = rl;
-#define _SS2 0x008e
-#define _SS3 0x008f
+ __ctype[520] = 4;
+ charset_is_ascii = 0;
+ return (0);
+}
-#define GR_BITS 0x80808080 /* XXX: to be fixed */
+static size_t
+_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+ size_t n, mbstate_t *_RESTRICT_KYWD ps)
+{
+ return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
+}
-static int
-_euc_set(uint_t c)
+static size_t
+_EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
+ mbstate_t *_RESTRICT_KYWD ps)
+{
+ return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
+}
+
+/*
+ * EUC-KR uses only CS0 and CS1.
+ */
+int
+_EUC_KR_init(_RuneLocale *rl)
{
+ __mbrtowc = _EUC_KR_mbrtowc;
+ __wcrtomb = _EUC_KR_wcrtomb;
+ __mbsinit = _EUC_mbsinit;
- c &= 0xff;
- return ((c & 0x80) ? c == _SS3 ? 3 : c == _SS2 ? 2 : 1 : 0);
+ _CurrentRuneLocale = rl;
+
+ __ctype[520] = 2;
+ charset_is_ascii = 0;
+ return (0);
+}
+
+static size_t
+_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+ size_t n, mbstate_t *_RESTRICT_KYWD ps)
+{
+ return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0));
}
static size_t
-_EUC_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+_EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
+ mbstate_t *_RESTRICT_KYWD ps)
+{
+ return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
+}
+
+/*
+ * EUC-JP uses CS0, CS1, CS2, and CS3.
+ */
+int
+_EUC_JP_init(_RuneLocale *rl)
+{
+ __mbrtowc = _EUC_JP_mbrtowc;
+ __wcrtomb = _EUC_JP_wcrtomb;
+ __mbsinit = _EUC_mbsinit;
+
+ _CurrentRuneLocale = rl;
+
+ __ctype[520] = 3;
+ charset_is_ascii = 0;
+ return (0);
+}
+
+static size_t
+_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
size_t n, mbstate_t *_RESTRICT_KYWD ps)
{
+ return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
+}
+
+static size_t
+_EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
+ mbstate_t *_RESTRICT_KYWD ps)
+{
+ return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
+}
+
+/*
+ * EUC-TW uses CS0, CS1, and CS2.
+ */
+int
+_EUC_TW_init(_RuneLocale *rl)
+{
+ __mbrtowc = _EUC_TW_mbrtowc;
+ __wcrtomb = _EUC_TW_wcrtomb;
+ __mbsinit = _EUC_mbsinit;
+
+ _CurrentRuneLocale = rl;
+
+ __ctype[520] = 4;
+ charset_is_ascii = 0;
+ return (0);
+}
+
+static size_t
+_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+ size_t n, mbstate_t *_RESTRICT_KYWD ps)
+{
+ return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
+}
+
+static size_t
+_EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
+ mbstate_t *_RESTRICT_KYWD ps)
+{
+ return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
+}
+
+/*
+ * Common EUC code.
+ */
+
+static size_t
+_EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+ size_t n, mbstate_t *_RESTRICT_KYWD ps,
+ uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
+{
_EucState *es;
- int i, set, want;
+ int i, want;
wchar_t wc;
- const char *os;
+ unsigned char ch;
es = (_EucState *)ps;
- if (es->want < 0 || es->want > MB_CUR_MAX || es->set < 0 ||
- es->set > 3) {
+ if (es->want < 0 || es->want > MB_CUR_MAX) {
errno = EINVAL;
return ((size_t)-1);
}
@@ -165,58 +239,59 @@
/* Incomplete multibyte sequence */
return ((size_t)-2);
- os = s;
+ if (es->want == 0) {
+ /* Fast path for plain ASCII (CS0) */
+ if (((ch = (unsigned char)*s) & 0x80) == 0) {
+ if (pwc != NULL)
+ *pwc = ch;
+ return (ch != '\0' ? 1 : 0);
+ }
- if (es->want == 0) {
- want = CEI->count[set = _euc_set(*s)];
- if (set == 2 || set == 3) {
- --want;
- if (--n == 0) {
- /* Incomplete multibyte sequence */
- es->set = set;
- es->want = want;
- es->ch = 0;
- return ((size_t)-2);
- }
- ++s;
- if (*s == '\0') {
- errno = EILSEQ;
- return ((size_t)-1);
- }
+ if (ch >= 0xa1) {
+ /* CS1 */
+ want = 2;
+ } else if (ch == cs2) {
+ want = cs2width;
+ } else if (ch == cs3) {
+ want = cs3width;
+ } else {
+ errno = EILSEQ;
+ return ((size_t)-1);
}
- wc = (unsigned char)*s++;
+
+
+ es->want = want;
+ es->ch = 0;
} else {
- set = es->set;
want = es->want;
wc = es->ch;
}
- for (i = (es->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
- if (*s == '\0') {
- errno = EILSEQ;
- return ((size_t)-1);
- }
- wc = (wc << 8) | (unsigned char)*s++;
+
+ for (i = 0; i < MIN(want, n); i++) {
+ wc <<= 8;
+ wc |= *s;
+ s++;
}
if (i < want) {
/* Incomplete multibyte sequence */
- es->set = set;
es->want = want - i;
es->ch = wc;
return ((size_t)-2);
}
- wc = (wc & ~CEI->mask) | CEI->bits[set];
if (pwc != NULL)
*pwc = wc;
es->want = 0;
- return (wc == L'\0' ? 0 : s - os);
+ return (wc == L'\0' ? 0 : want);
}
static size_t
-_EUC_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
+_EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
+ mbstate_t *_RESTRICT_KYWD ps,
+ uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
{
_EucState *es;
- wchar_t m, nm;
int i, len;
+ wchar_t nm;
es = (_EucState *)ps;
@@ -229,38 +304,52 @@
/* Reset to initial shift state (no-op) */
return (1);
- m = wc & CEI->mask;
- nm = wc & ~m;
+ if ((wc & ~0x7f) == 0) {
+ /* Fast path for plain ASCII (CS0) */
+ *s = (char)wc;
+ return (1);
+ }
- if (m == CEI->bits[1]) {
-CodeSet1:
- /* Codeset 1: The first byte must have 0x80 in it. */
- i = len = CEI->count[1];
- while (i-- > 0) {
- *(unsigned char *)s = (nm >> (i << 3)) | 0x80;
- s++;
- }
+ /* Determine the "length" */
+ if ((unsigned)wc > 0xffffff) {
+ len = 4;
+ } else if ((unsigned)wc > 0xffff) {
+ len = 3;
+ } else if ((unsigned)wc > 0xff) {
+ len = 2;
} else {
- if (m == CEI->bits[0])
- i = len = CEI->count[0];
- else if (m == CEI->bits[2]) {
- i = len = CEI->count[2];
- *(unsigned char *)s = _SS2;
- s++;
- --i;
- /* SS2 designates G2 into GR */
- nm |= GR_BITS;
- } else if (m == CEI->bits[3]) {
- i = len = CEI->count[3];
- *(unsigned char *)s = _SS3;
- s++;
- --i;
- /* SS3 designates G3 into GR */
- nm |= GR_BITS;
- } else
- goto CodeSet1; /* Bletch */
- while (i-- > 0)
- *s++ = (nm >> (i << 3)) & 0xff;
+ len = 1;
+ }
+
+ if (len > MB_CUR_MAX) {
+ errno = EILSEQ;
+ return ((size_t)-1);
+ }
+
+ /* This first check excludes CS1, which is implicitly valid. */
+ if ((wc < 0xa100) || (wc > 0xffff)) {
+ /* Check for valid CS2 or CS3 */
+ nm = (wc >> ((len - 1) * 8));
+ if (nm == cs2) {
+ if (len != cs2width) {
+ errno = EILSEQ;
+ return ((size_t)-1);
+ }
+ } else if (nm == cs3) {
+ if (len != cs3width) {
+ errno = EILSEQ;
+ return ((size_t)-1);
+ }
+ } else {
+ errno = EILSEQ;
+ return ((size_t)-1);
+ }
+ }
+
+ /* Stash the bytes, least significant last */
+ for (i = len - 1; i >= 0; i--) {
+ s[i] = (wc & 0xff);
+ wc >>= 8;
}
return (len);
}