imported patch adding-gb18030-ucs4-iconv-modules
author yongsun
Thu, 14 Aug 2008 15:12:38 +0800
changeset 228 bd5619b243de
parent 227 460a45b341ab
child 229 605c6ba1ebf2
imported patch adding-gb18030-ucs4-iconv-modules
.hgignore
src/lib/iconv/zh/Makefile
src/lib/iconv/zh/UTF-8%zh_CN.gbk.c
src/lib/iconv/zh/uni_common.c
src/lib/iconv/zh/zh_CN.gbk%UTF-8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Thu Aug 14 15:12:38 2008 +0800
@@ -0,0 +1,4 @@
+syntax: glob
+tags
+*.o
+*.so
--- a/src/lib/iconv/zh/Makefile	Wed Aug 13 15:04:27 2008 +0200
+++ b/src/lib/iconv/zh/Makefile	Thu Aug 14 15:12:38 2008 +0800
@@ -86,6 +86,18 @@
 	$(CC) $(CFLAGS) -DUCS_2LE -c -o $@ $?
 	$(CC) $(CFLAGS_64) -DUCS_2LE -c -o $(MACH64)/$@ $?
 
+GBK%UCS-2BE.o: zh_CN.gbk%UTF-8.c	# GBK decoder compiled with -DUCS_2BE (output_char = unichr_to_ucs_2be)
+	$(CC) $(CFLAGS) -DUCS_2BE -c -o $@ $?
+	$(CC) $(CFLAGS_64) -DUCS_2BE -c -o $(MACH64)/$@ $?
+
+GBK%UCS-4LE.o: zh_CN.gbk%UTF-8.c	# GBK decoder compiled with -DUCS_4LE (output_char = unichr_to_ucs_4le)
+	$(CC) $(CFLAGS) -DUCS_4LE -c -o $@ $?
+	$(CC) $(CFLAGS_64) -DUCS_4LE -c -o $(MACH64)/$@ $?
+
+GBK%UCS-4BE.o: zh_CN.gbk%UTF-8.c	# GBK decoder compiled with -DUCS_4BE (output_char = unichr_to_ucs_4be)
+	$(CC) $(CFLAGS) -DUCS_4BE -c -o $@ $?
+	$(CC) $(CFLAGS_64) -DUCS_4BE -c -o $(MACH64)/$@ $?
+
 UCS-2LE%gb2312.o: UTF-8%zh_CN.euc.c
 	$(CC) $(CFLAGS) -DUCS_2LE -c -o $@ $?
 	$(CC) $(CFLAGS_64) -DUCS_2LE -c -o $(MACH64)/$@ $?
@@ -94,6 +106,18 @@
 	$(CC) $(CFLAGS) -DUCS_2LE -c -o $@ $?
 	$(CC) $(CFLAGS_64) -DUCS_2LE -c -o $(MACH64)/$@ $?
 
+UCS-2BE%GBK.o: UTF-8%zh_CN.gbk.c	# GBK encoder compiled with -DUCS_2BE (reads 2-byte big-endian units)
+	$(CC) $(CFLAGS) -DUCS_2BE -c -o $@ $?
+	$(CC) $(CFLAGS_64) -DUCS_2BE -c -o $(MACH64)/$@ $?
+
+UCS-4LE%GBK.o: UTF-8%zh_CN.gbk.c	# GBK encoder compiled with -DUCS_4LE (reads 4-byte little-endian units)
+	$(CC) $(CFLAGS) -DUCS_4LE -c -o $@ $?
+	$(CC) $(CFLAGS_64) -DUCS_4LE -c -o $(MACH64)/$@ $?
+
+UCS-4BE%GBK.o: UTF-8%zh_CN.gbk.c	# GBK encoder compiled with -DUCS_4BE (reads 4-byte big-endian units)
+	$(CC) $(CFLAGS) -DUCS_4BE -c -o $@ $?
+	$(CC) $(CFLAGS_64) -DUCS_4BE -c -o $(MACH64)/$@ $?
+
 cns11643%UCS-2LE.o: zh_TW-euc%UTF-8.c
 	$(CC) $(CFLAGS) -DUCS_2LE -c -o $@ $?
 	$(CC) $(CFLAGS_64) -DUCS_2LE -c -o $(MACH64)/$@ $?
@@ -213,11 +237,12 @@
 PROGS += zh_TW-cp937%UTF-8.so           UTF-8%zh_TW-cp937.so
 PROGS += zh_CN-cp935%UTF-8.so           UTF-8%zh_CN-cp935.so                
 PROGS += gb2312%UCS-2LE.so              UCS-2LE%gb2312.so 
-PROGS += GBK%UCS-2LE.so                 UCS-2LE%GBK.so
 PROGS += cns11643%UCS-2LE.so BIG5%UCS-2LE.so Big5-HKSCS%UCS-2LE.so
 PROGS += UCS-2LE%cns11643.so UCS-2LE%BIG5.so UCS-2LE%Big5-HKSCS.so
 PROGS += UTF-8%zh_TW-euc.so UTF-8%zh_TW-big5.so UTF-8%zh_HK.hkscs.so
 PROGS += zh_TW-iso2022-CN-EXT%zh_TW-euc.so zh_TW-iso2022-CN-EXT%zh_TW-big5.so
+PROGS += GBK%UCS-2LE.so GBK%UCS-2BE.so GBK%UCS-4LE.so GBK%UCS-4BE.so
+PROGS += UCS-2LE%GBK.so UCS-2BE%GBK.so UCS-4LE%GBK.so UCS-4BE%GBK.so
 
 install: all $(TARGETDIRS)
 	for i in $(PROGS); do \
--- a/src/lib/iconv/zh/UTF-8%zh_CN.gbk.c	Wed Aug 13 15:04:27 2008 +0200
+++ b/src/lib/iconv/zh/UTF-8%zh_CN.gbk.c	Thu Aug 14 15:12:38 2008 +0800
@@ -30,6 +30,7 @@
 #include <sys/types.h>
 #include <unicode_gb18030.h>	/* Unicode to GBK mapping table */
 #include "common_defs.h"
+#include "ucs4.h"
 
 #define	MSB	0x80	/* most significant bit */
 #define ONEBYTE	0xff	/* right most byte */
@@ -44,13 +45,11 @@
 	char	keepc[6];	/* maximum # byte of UTF8 code */
 	short	ustate;
 	int	_errno;		/* internal errno */
-        boolean little_endian;
-        boolean bom_written;
 } _iconv_st;
 
 enum _USTATE	{ U0, U1, U2, U3, U4, U5, U6, U7 };
 
-int get_gbk_by_unicode(char c1, char c2, int* unidx, unsigned long* gbkcode);
+int get_gbk_by_unicode(unsigned long, int*, unsigned long*);
 
 
 /*
@@ -68,12 +67,6 @@
 
 	st->ustate = U0;
 	st->_errno = 0;
-	st->little_endian = false;
-	st->bom_written = false;
-#if defined(UCS_2LE)
-	st->little_endian = true;
-	st->bom_written = true;
-#endif
 	return ((void *) st);
 }
 
@@ -90,7 +83,112 @@
 		free(st);
 }
 
+#if defined(UCS_2LE) || defined (UCS_2BE) || defined (UCS_4LE) || defined (UCS_4BE)
+/*
+ * Actual conversion for the fixed-width builds (UCS-2LE, UCS-2BE, UCS-4LE,
+ * UCS-4BE -> GBK); called from iconv().
+ *
+ * Each pass of the loop reads one code unit (2 bytes for UCS-2, 4 for
+ * UCS-4), copies values below 0x80 straight to the output and maps the
+ * rest with get_gbk_by_unicode() and unicode_to_gbk().  The input pointer
+ * is advanced only after the unit has been written.
+ *
+ * Return: count of non-identical conversions accumulated from
+ *         unicode_to_gbk(), or (size_t)-1 with errno set:
+ *           EBADF  - st is NULL
+ *           EILSEQ - get_gbk_by_unicode() returned -1 for the value
+ *           EINVAL - input left over while output space remains
+ *                    (trailing partial code unit)
+ *           E2BIG  - input left over and the output buffer is full
+ *         When unicode_to_gbk() stores nothing, errno is whatever that
+ *         routine left behind.
+ */
+size_t
+_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
+				char **outbuf, size_t *outbytesleft)
+{
+	unsigned char   c1, c2, c3, c4;
+	int		n, unidx;
+        unsigned long   unichr;
+	unsigned long	gbkcode;
+        int		uconv_num = 0;
 
+  	if (st == NULL) {
+		errno = EBADF;
+		return ((size_t) -1);
+	}
+
+	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
+		st->ustate = U0;
+		st->_errno = 0;
+		return ((size_t) 0);
+	}
+
+	st->_errno = 0;		/* reset internal errno */
+	errno = 0;		/* reset external errno */
+
+	while (*inbytesleft > ICV_FETCH_UCS_SIZE-1 && *outbytesleft > 0) {
+
+                int     size = 0;	/* bytes read for this code unit */
+	   	int	uconv_num_internal = 0;
+
+                c1 = *(*inbuf + size++);
+                c2 = *(*inbuf + size++);
+#if defined(UCS_4LE) || defined (UCS_4BE)
+                c3 = *(*inbuf + size++);
+                c4 = *(*inbuf + size++);
+#endif
+
+                /*
+                 * Widen each byte before shifting: an unsigned char promotes to
+                 * int, and shifting a byte >= 0x80 left by 24 overflows int.
+                 */
+#if defined(UCS_2LE)
+                unichr = (unsigned long) c1 | ((unsigned long) c2 << 8);
+#elif defined(UCS_2BE)
+                unichr = ((unsigned long) c1 << 8) | (unsigned long) c2;
+#elif defined(UCS_4LE)
+                unichr = (unsigned long) c1 | ((unsigned long) c2 << 8) |
+                    ((unsigned long) c3 << 16) | ((unsigned long) c4 << 24);
+#else
+                unichr = ((unsigned long) c1 << 24) | ((unsigned long) c2 << 16) |
+                    ((unsigned long) c3 << 8) | (unsigned long) c4;
+#endif
+
+                if (unichr < MSB) { /* ASCII */
+                        **outbuf = (char) unichr;
+		        (*outbuf)++;
+			(*outbytesleft)--;
+                } else {
+                	n = get_gbk_by_unicode(unichr, &unidx, &gbkcode);
+                	if ( n == -1 ) { /* invalid unicode codepoint */
+                	        st->_errno = errno = EILSEQ;
+                	        return ((size_t)-1);
+                	}
+
+                	n = unicode_to_gbk(unidx, gbkcode, *outbuf, *outbytesleft, &uconv_num_internal);
+                	if (n > 0) {
+                		(*outbuf) += n;
+                		(*outbytesleft) -= n;
+
+                	   	uconv_num += uconv_num_internal;
+                        } else {	/* nothing stored; input is not consumed */
+                                return ((size_t)-1);
+                        }
+                }
+
+                (*inbuf) += size;
+                (*inbytesleft) -= size;
+        }
+
+        if ( *inbytesleft >0 ) {	/* loop stopped with input still pending */
+                errno =  *outbytesleft? EINVAL: E2BIG;
+                return ((size_t)-1);
+        }
+
+        return uconv_num;
+}
+#else
 /*
  * Actual conversion; called from iconv()
  */
@@ -121,6 +194,7 @@
 {
 	char		c1, c2;
 	int		n, unidx;
+        unsigned long   unichr;
 	unsigned long	gbkcode;
         int		uconv_num = 0;
    	int		utf8_len;	
@@ -150,14 +224,7 @@
 	   
 		switch (st->ustate) {
 		case U0:		/* assuming ASCII in the beginning */
-		        /*
-			 * Code conversion for UCS-2LE to support Samba
-			 */
-		        if (st->little_endian) {
-			  st->ustate = U1;
-			  st->keepc[0] = **inbuf;
-			}
-			else if ((**inbuf & MSB) == 0) {	/* ASCII */
+			if ((**inbuf & MSB) == 0) {	/* ASCII */
 				**outbuf = **inbuf;
 				(*outbuf)++;
 				(*outbytesleft)--;
@@ -185,30 +252,12 @@
 			}
 			break;
 		case U1:		/* 2 byte unicode */
-			if ((**inbuf & 0xc0) == MSB || st->little_endian) {
+			if ((**inbuf & 0xc0) == MSB) {
 			   	utf8_len = 2;
 				st->keepc[1] = **inbuf;
 
-			        /*
-				 * Code conversion for UCS-2LE to support Samba
-				 */
-			        if (st->little_endian) {
-				  c1 = st->keepc[1];
-				  c2 = st->keepc[0];
-
-				  /*
-				   * It's ASCII
-				   */
-				  if (c1 == 0 && (c2 & MSB) == 0) {
-				    *(*outbuf)++ = c2;
-				    (*outbytesleft)--;
-				    st->ustate = U0;
-				    break;
-				  }
-				} else {
-				  c1 = (st->keepc[0]&0x1c)>>2;
-				  c2 = ((st->keepc[0]&0x03)<<6) | ((st->keepc[1])&0x3f);
-				}
+				c1 = (st->keepc[0]&0x1c)>>2;
+				c2 = ((st->keepc[0]&0x03)<<6) | ((st->keepc[1])&0x3f);
 
 				st->ustate = U4;
 #ifdef DEBUG
@@ -253,7 +302,8 @@
 			}
 			break;
 		case U4:
-			n = get_gbk_by_unicode(c1, c2, &unidx, &gbkcode);
+	                unichr = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
+			n = get_gbk_by_unicode(unichr, &unidx, &gbkcode);
 		        if ( n == -1 ) { /* unicode is either 0xFFFE or 0xFFFF */
 			     st->_errno = errno = EILSEQ;
 			     break;
@@ -264,7 +314,7 @@
 				break;
 			}
 */
-			n = utf8_to_gbk(unidx, gbkcode, *outbuf, *outbytesleft, &uconv_num_internal);
+			n = unicode_to_gbk(unidx, gbkcode, *outbuf, *outbytesleft, &uconv_num_internal);
 			if (n > 0) {
 				(*outbuf) += n;
 				(*outbytesleft) -= n;
@@ -384,6 +434,7 @@
 
 	return uconv_num;
 }
+#endif /* UCS_2LE || UCS_2BE || UCS_4LE || UCS_4BE */
 
 
 /*
@@ -395,14 +446,9 @@
  * Since binary search of the UTF8 to GBK table is necessary, might as well
  * return index and GBK code matching to the unicode.
  */
-int get_gbk_by_unicode(char c1, char c2, int* unidx, unsigned long* gbkcode)
+int get_gbk_by_unicode(unsigned long unicode, int* unidx, unsigned long* gbkcode)
 {
-	unsigned long	unicode;
-
-	unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
-   
-        /* 0xfffe and 0xffff should not be allowed */
-        if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
+        if ( unicode > UCS4_MAXVAL || ext_ucs4_lsw(unicode) > UCS4_PPRC_MAXVAL ) return -1;
    
 	*unidx = binsearch(unicode, unicode_gbk_tab, UNICODEMAX);
 	if ((*unidx) >= 0)
@@ -410,7 +456,7 @@
 	else
 		return(1);	/* match from unicode to GBK not found */
 #ifdef DEBUG
-    fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5=%x ", unicode, *unidx, *big5code);
+    fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5=%x ", unicode, *unidx, *gbkcode);
 #endif
 
 	return(0);
@@ -424,7 +470,7 @@
  * Return: > 0 - converted with enough space in output buffer
  *         = 0 - no space in outbuf
  */
-int utf8_to_gbk(int unidx, unsigned long gbkcode, char* buf, size_t buflen, int *uconv_num)
+int unicode_to_gbk(int unidx, unsigned long gbkcode, char* buf, size_t buflen, int *uconv_num)
 {
 	unsigned long	val;		/* GBK value */
 	char		c[GBK_LEN_MAX];
@@ -469,14 +515,14 @@
 
 
 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
-int binsearch(unsigned long x, table_t v[], int n)
+static int binsearch(unsigned long x, table_t v[], int n)
 {
 	int low, high, mid;
 
 	low = 0;
 	high = n - 1;
 	while (low <= high) {
-		mid = (low + high) / 2;
+		mid = (high - low) / 2 + low;
 		if (x < v[mid].key)
 			high = mid - 1;
 		else if (x > v[mid].key)
@@ -486,3 +532,7 @@
 	}
 	return (-1);	/* no match */
 }
+
+/*
+vi:ts=8:ai:expandtab 
+*/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/iconv/zh/uni_common.c	Thu Aug 14 15:12:38 2008 +0800
@@ -0,0 +1,229 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").  
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright(c) 2001 Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+/*
+ * unichr_to_utf8: store one Unicode value at buf as UTF-8 (1 to 4 bytes).
+ * A value above 0x10ffff is written as the three UTF8_NON_ID_CHAR bytes
+ * and *uconv_num is set to 1 (non-identical conversion).  st is not used.
+ * Return: > 0 - number of bytes stored
+ *         = 0 - buf too small for this value; errno = E2BIG
+ */
+static int unichr_to_utf8 (st, unichr, buf, buflen, uconv_num)
+_iconv_st *st;
+unsigned long unichr;
+char	*buf;
+size_t	buflen;
+int	*uconv_num;
+{
+	int	need;		/* bytes required for this value */
+	int	replaced = 0;	/* nonzero when unichr has no UTF-8 form */
+
+	if (unichr < 0x0080)
+		need = 1;
+	else if (unichr <= 0x07ff)
+		need = 2;
+	else if (unichr <= 0xffff)
+		need = 3;
+	else if (unichr <= 0x10ffff)
+		need = 4;
+	else {
+		need = 3;
+		replaced = 1;
+	}
+
+	if (buflen < (size_t)need) {
+		errno = E2BIG;
+		return(0);
+	}
+
+	if (replaced) {
+		*buf = (char)UTF8_NON_ID_CHAR1;
+		*(buf+1) = (char)UTF8_NON_ID_CHAR2;
+		*(buf+2) = (char)UTF8_NON_ID_CHAR3;
+		*uconv_num = 1;		/* non-identical conversion */
+		return(need);
+	}
+
+	switch (need) {
+	case 1:
+		*buf = (char) unichr;
+		break;
+	case 2:
+		*buf = (char)((unichr >> 6) & 0x1f) | 0xc0;
+		*(buf+1) = (char)(unichr & 0x3f) | 0x80;
+		break;
+	case 3:
+		*buf = (char)((unichr >> 12) & 0xf) | 0xe0;
+		*(buf+1) = (char)((unichr >>6) & 0x3f) | 0x80;
+		*(buf+2) = (char)(unichr & 0x3f) | 0x80;
+		break;
+	default:
+		*buf = (char)((unichr >> 18) & 0x7) | 0xf0;
+		*(buf+1) = (char)((unichr >> 12) & 0x3f) | 0x80;
+		*(buf+2) = (char)((unichr >>6) & 0x3f) | 0x80;
+		*(buf+3) = (char)(unichr & 0x3f) | 0x80;
+		break;
+	}
+	return(need);
+}
+
+/*
+ * unichr_to_ucs_2le: store unichr at buf as UCS-2LE, preceded by a BOM
+ * if st->bom_written is still false.  Values above 0xffff are replaced.
+ * Return: > 0 - bytes stored;  = 0 - no space in buf, errno = E2BIG
+ */
+static int unichr_to_ucs_2le (st, unichr, buf, buflen, uconv_num)
+_iconv_st *st;
+unsigned long unichr;
+char	*buf;
+size_t	buflen;
+int	*uconv_num;
+{
+	int size = 0;
+
+	if (unichr > 0x00ffff) {
+	  unichr = ICV_CHAR_UCS2_REPLACEMENT;
+	  *uconv_num = 1;
+	}
+	if (buflen < (size_t)(st->bom_written ? 2 : 4)) {
+	  errno = E2BIG;	/* report the short buffer, as unichr_to_utf8 does */
+	  return 0;
+	}
+	if (!st->bom_written) {
+	  *(buf + size++) = (uchar_t)0xff;
+	  *(buf + size++) = (uchar_t)0xfe;
+	  st->bom_written = true;
+	}
+	*(buf + size++) = (uchar_t)(unichr & 0xff);
+	*(buf + size++) = (uchar_t)((unichr >> 8) & 0xff);
+	return size;
+}
+
+/*
+ * unichr_to_ucs_2be: store unichr at buf as UCS-2BE, preceded by a BOM
+ * if st->bom_written is still false.  Values above 0xffff are replaced.
+ * Return: > 0 - bytes stored;  = 0 - no space in buf, errno = E2BIG
+ */
+static int unichr_to_ucs_2be (st, unichr, buf, buflen, uconv_num)
+_iconv_st *st;
+unsigned long unichr;
+char	*buf;
+size_t	buflen;
+int	*uconv_num;
+{
+	int size = 0;
+
+	if (unichr > 0x00ffff) {
+	  unichr = ICV_CHAR_UCS2_REPLACEMENT;
+	  *uconv_num = 1;
+	}
+	if (buflen < (size_t)(st->bom_written ? 2 : 4)) {
+	  errno = E2BIG;	/* report the short buffer, as unichr_to_utf8 does */
+	  return 0;
+	}
+	if (!st->bom_written) {
+	  *(buf + size++) = (uchar_t)0xfe;
+	  *(buf + size++) = (uchar_t)0xff;
+	  st->bom_written = true;
+	}
+	*(buf + size++) = (uchar_t)((unichr >> 8) & 0xff);
+	*(buf + size++) = (uchar_t)(unichr & 0xff);
+	return size;
+}
+
+/*
+ * unichr_to_ucs_4le: store unichr at buf as UCS-4LE, preceded by a BOM
+ * if st->bom_written is still false.  The value 0xffffffff is replaced.
+ * Return: > 0 - bytes stored;  = 0 - no space in buf, errno = E2BIG
+ */
+static int unichr_to_ucs_4le (st, unichr, buf, buflen, uconv_num)
+_iconv_st *st;
+unsigned long unichr;
+char	*buf;
+size_t	buflen;
+int	*uconv_num;
+{
+	int size = 0;
+
+	if (unichr == 0xffffffff) {
+	  unichr = ICV_CHAR_UCS2_REPLACEMENT;
+	  *uconv_num = 1;
+	}
+	if (buflen < (size_t)(st->bom_written ? 4 : 8)) {
+	  errno = E2BIG;	/* report the short buffer, as unichr_to_utf8 does */
+	  return 0;
+	}
+	if (!st->bom_written) {
+	  *(buf + size++) = (uchar_t)0xff;
+	  *(buf + size++) = (uchar_t)0xfe;
+	  *(buf + size++) = (uchar_t)0;
+	  *(buf + size++) = (uchar_t)0;
+	  st->bom_written = true;
+	}
+	*(buf + size++) = (uchar_t)(unichr & 0xff);
+	*(buf + size++) = (uchar_t)((unichr >> 8) & 0xff);
+	*(buf + size++) = (uchar_t)((unichr >> 16) & 0xff);
+	*(buf + size++) = (uchar_t)((unichr >> 24) & 0xff);
+	return size;
+}
+
+/*
+ * unichr_to_ucs_4be: store unichr at buf as UCS-4BE, preceded by a BOM
+ * if st->bom_written is still false.  The value 0xffffffff is replaced.
+ * Return: > 0 - bytes stored;  = 0 - no space in buf, errno = E2BIG
+ */
+static int unichr_to_ucs_4be (st, unichr, buf, buflen, uconv_num)
+_iconv_st *st;
+unsigned long unichr;
+char	*buf;
+size_t	buflen;
+int	*uconv_num;
+{
+	int size = 0;
+
+	if (unichr == 0xffffffff) {
+	  unichr = ICV_CHAR_UCS2_REPLACEMENT;
+	  *uconv_num = 1;
+	}
+	if (buflen < (size_t)(st->bom_written ? 4 : 8)) {
+	  errno = E2BIG;	/* report the short buffer, as unichr_to_utf8 does */
+	  return 0;
+	}
+	if (!st->bom_written) {
+	  *(buf + size++) = (uchar_t)0;
+	  *(buf + size++) = (uchar_t)0;
+	  *(buf + size++) = (uchar_t)0xfe;
+	  *(buf + size++) = (uchar_t)0xff;
+	  st->bom_written = true;
+	}
+	*(buf + size++) = (uchar_t)((unichr >> 24) & 0xff);
+	*(buf + size++) = (uchar_t)((unichr >> 16) & 0xff);
+	*(buf + size++) = (uchar_t)((unichr >> 8) & 0xff);
+	*(buf + size++) = (uchar_t)(unichr & 0xff);
+	return size;
+}
+
+/*
+vi:ts=8:ai:expandtab 
+*/
--- a/src/lib/iconv/zh/zh_CN.gbk%UTF-8.c	Wed Aug 13 15:04:27 2008 +0200
+++ b/src/lib/iconv/zh/zh_CN.gbk%UTF-8.c	Thu Aug 14 15:12:38 2008 +0800
@@ -46,17 +46,39 @@
 #define UTF8_NON_ID_CHAR2 0xBF
 #define UTF8_NON_ID_CHAR3 0xBD
 
+/*
+ * output_char() writes one Unicode value in the target encoding chosen at
+ * compile time (-DUCS_2LE, -DUCS_2BE, -DUCS_4LE or -DUCS_4BE); with none
+ * of those defined the output is UTF-8.
+ */
+#if defined UCS_2LE
+    #define output_char unichr_to_ucs_2le
+#elif defined UCS_2BE
+    #define output_char unichr_to_ucs_2be
+#elif defined UCS_4LE
+    #define output_char unichr_to_ucs_4le
+#elif defined UCS_4BE
+    #define output_char unichr_to_ucs_4be
+#else
+    #define output_char unichr_to_utf8
+#endif
+
 typedef struct _icv_state {
 	char	keepc[GBK_LEN_MAX];	/* maximum # byte of GBK2K code */
 	short	cstate;		/* state machine id */
 	int	_errno;		/* internal errno */
-        boolean little_endian;
         boolean bom_written;
 } _iconv_st;
 
 enum _CSTATE	{ C0, C1, C2, C3 };
 
-
+/*
+ * Forward declarations.  output_char names one of the unichr_to_*()
+ * routines from uni_common.c, which is #included only at the end of this
+ * file, so a prototype is needed before the first call.
+ */
+static unsigned long gbk_to_unicode (_iconv_st *);
+static int output_char (_iconv_st *, unsigned long, char *, size_t, int *);
 
 /*
  * Open; called from iconv_open()
@@ -73,11 +84,10 @@
 
 	st->cstate = C0;
 	st->_errno = 0;
-	st->little_endian = false;
-	st->bom_written = false;
-#if defined(UCS_2LE)
-	st->little_endian = true;
+#if defined(UCS_2LE) || defined(UCS_2BE) || defined(UCS_4LE) || defined(UCS_4BE)
 	st->bom_written = true;
+#else
+        st->bom_written = false;
 #endif
 	return ((void *) st);
 }
@@ -162,34 +172,13 @@
 					st->cstate = C1;
 				}
 			} else {	/* real ASCII */
-			        /*
-				 * Code conversion for UCS-2LE to support Samba
-				 */
-			        if (st->little_endian) {
-				    if (!st->bom_written) {
-				      if (*outbytesleft < 4)
-					errno = E2BIG;
-				      else {
-					*(*outbuf)++ = (uchar_t)0xff;
-					*(*outbuf)++ = (uchar_t)0xfe;
-					*outbytesleft -= 2;
-
-					st->bom_written = true;
-				      }
-				    }
-
-				    if (*outbytesleft < 2)
-				      errno = E2BIG;
-				    else {
-				      *(*outbuf)++ = **inbuf;
-				      *(*outbuf)++ = (uchar_t)0x0;
-				      *outbytesleft -= 2;
-				    }
-				} else {
-				  **outbuf = **inbuf;
-				  (*outbuf)++;
-				  (*outbytesleft)--;
-				}
+                                int uconv_num_internal = 0;
+                                n = output_char (st, **inbuf, *outbuf, 
+                                                 *outbytesleft, &uconv_num_internal);
+				if (n > 0) {
+					(*outbuf) += n;
+					(*outbytesleft) -= n;
+                                }
 			}
 			break;
 		case C1:		/* GBK2 characters: 2nd byte */
@@ -198,8 +187,9 @@
 			   
 				st->keepc[1] = (**inbuf);
 				st->keepc[2] = st->keepc[3] = 0;
-				n = gbk_to_utf8(st, *outbuf,
-						*outbytesleft, &uconv_num_internal);
+
+				n = output_char (st, gbk_to_unicode (st), *outbuf,
+						 *outbytesleft, &uconv_num_internal);
 				if (n > 0) {
 					(*outbuf) += n;
 					(*outbytesleft) -= n;
@@ -233,7 +223,9 @@
 			        int uconv_num_internal = 0;
 			   
 				st->keepc[3] = **inbuf;
-				n = gbk_to_utf8(st, *outbuf, *outbytesleft, &uconv_num_internal);
+
+				n = output_char (st, gbk_to_unicode (st), *outbuf, 
+                                                 *outbytesleft, &uconv_num_internal);
 
 				if ( n > 0 ) {
 					(*outbuf) += n;
@@ -289,7 +281,7 @@
  * Return: = 0 - valid GBK2 2nd byte
  *         = 1 - invalid GBK2 2nd byte
  */
-int gbk_2nd_byte(inbuf)
+static int gbk_2nd_byte(inbuf)
 char inbuf;
 {
 	unsigned int	buf = (unsigned int) (inbuf & ONEBYTE);
@@ -301,32 +293,21 @@
 	return(1);
 }
 
-
-/*
- * GBK code --> ISO/IEC 10646-2000 (Unicode)
- * Unicode --> UTF8 (FSS-UTF)
- *             (File System Safe Universal Character Set Transformation Format)
- * Return: > 0 - converted with enough space in output buffer
- *         = 0 - no space in outbuf
- */
-int gbk_to_utf8(st, buf, buflen, uconv_num)
+static unsigned long gbk_to_unicode (st)
 _iconv_st *st;
-char	*buf;
-size_t	buflen;
-int	*uconv_num;
 {
-	unsigned long	gbk_val;	/* GBK value */
-	int		unidx;		/* Unicode index */
-	unsigned long	uni_val;	/* Unicode */
+	unsigned long	gbk_val;	        /* GBK value */
+	int		unidx;		        /* Unicode index */
+	unsigned long	uni_val = 0xffffffff;	/* Unicode */
 	int		isgbk4 = 1;
 	char            *keepc = st->keepc;
 
 	if ( keepc[2] == 0 && keepc[3] == 0 ) 
 		isgbk4 = 0;
 
-	if ( ! isgbk4 ) 
+	if ( ! isgbk4 ) { 
 		gbk_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
-	else {
+        } else {
 		int  i;
 
 		gbk_val = keepc[0] & ONEBYTE;
@@ -342,82 +323,18 @@
 		if ( unidx >= 0 ) uni_val = gbk_unicode_tab[unidx].value;
 	}
 
-	/*
-	 * Code conversion for UCS-2LE to support Samba
-	 */
-	if (st->little_endian) {
-	  int size = 0;
-
-	  if (unidx < 0 || uni_val > 0x00ffff) {
-	    uni_val = ICV_CHAR_UCS2_REPLACEMENT;
-	    *uconv_num = 1;
-	  }
-
-	  if (!st->bom_written) {
-	    if (buflen < 4)
-	      return 0;
-
-	    *(buf + size++) = (uchar_t)0xff;
-	    *(buf + size++) = (uchar_t)0xfe;
-	    st->bom_written = true;
-	  }
-
-	  if (buflen < 2)
-	    return 0;
-
-	  *(buf + size++) = (uchar_t)(uni_val & 0xff);
-	  *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff);
-
-	  return size;
-	}
-
-	if (unidx >= 0) {	/* do Unicode to UTF8 conversion */
-		if (uni_val >= 0x0080 && uni_val <= 0x07ff) {
-			if (buflen < 2) {
-				errno = E2BIG;
-				return(0);
-			}
-			*buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
-			*(buf+1) = (char)(uni_val & 0x3f) | 0x80;
-			return(2);
-		}
-		if (uni_val >= 0x0800 && uni_val <= 0xffff) {
-			if (buflen < 3) {
-				errno = E2BIG;
-				return(0);
-			}
-			*buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
-			*(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
-			*(buf+2) = (char)(uni_val & 0x3f) | 0x80;
-			return(3);
-		}
-	}
-
-	/* can't find a match in GBK --> UTF8 table or illegal UTF8 code */
-	if (buflen < 3) {
-		errno = E2BIG;
-		return(0);
-	}
-	*buf = (char)UTF8_NON_ID_CHAR1;
-	*(buf+1) = (char)UTF8_NON_ID_CHAR2;
-	*(buf+2) = (char)UTF8_NON_ID_CHAR3;
-  
-        /* non-identical conversions */
-        *uconv_num = 1;
-   
-	return(3);
+        return uni_val;
 }
 
-
 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
-int binsearch(unsigned long x, table_t v[], int n)
+static int binsearch(unsigned long x, table_t v[], int n)
 {
 	int low, high, mid;
 
 	low = 0;
 	high = n - 1;
 	while (low <= high) {
-		mid = (low + high) / 2;
+		mid = (high - low) / 2 + low;
 		if (x < v[mid].key)
 			high = mid - 1;
 		else if (x > v[mid].key)
@@ -427,3 +344,9 @@
 	}
 	return (-1);	/* no match */
 }
+
+#include "uni_common.c"
+
+/*
+vi:ts=8:ai:expandtab 
+*/