From 0e2331c9b6e0c0b4f24019d4062f4c655d28cbaf Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Tue, 12 Jul 2011 20:30:04 -0400 Subject: gb18030 support in iconv (only from, not to) also support (and restrict to subsets) older chinese sets, and explicitly refuse to convert to cjk (since there's no code for it yet) --- src/locale/iconv.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) (limited to 'src/locale/iconv.c') diff --git a/src/locale/iconv.c b/src/locale/iconv.c index fb982921..a7d4fd9f 100644 --- a/src/locale/iconv.c +++ b/src/locale/iconv.c @@ -17,6 +17,9 @@ #define UTF_8 0310 #define EUC_JP 0320 #define SHIFT_JIS 0321 +#define GB18030 0330 +#define GBK 0331 +#define GB2312 0332 /* FIXME: these are not implemented yet * EUC: A1-FE A1-FE @@ -41,6 +44,9 @@ static const unsigned char charmaps[] = "ascii\0usascii\0iso646\0iso646us\0\0\306" "eucjp\0\0\320" "shiftjis\0sjis\0\0\321" +"gb18030\0\0\330" +"gbk\0\0\331" +"gb2312\0\0\332" #include "codepages.h" ; @@ -52,6 +58,10 @@ static const unsigned short jis0208[84][94] = { #include "jis0208.h" }; +static const unsigned short gb18030[126][190] = { +#include "gb18030.h" +}; + static int fuzzycmp(const unsigned char *a, const unsigned char *b) { for (; *a && *b; a++, b++) { @@ -82,7 +92,9 @@ iconv_t iconv_open(const char *to, const char *from) { size_t f, t; - if ((t = find_charmap(to))==-1 || (f = find_charmap(from))==-1) { + if ((t = find_charmap(to))==-1 + || (f = find_charmap(from))==-1 + || (t >= 0320)) { errno = EINVAL; return (iconv_t)-1; } @@ -127,7 +139,6 @@ static void put_32(unsigned char *s, unsigned c, int e) #define mbrtowc_utf8 mbrtowc #define wctomb_utf8 wctomb -#include size_t iconv(iconv_t cd0, char **in, size_t *inb, char **out, size_t *outb) { size_t x=0; @@ -229,6 +240,44 @@ size_t iconv(iconv_t cd0, char **in, size_t *inb, char **out, size_t *outb) c = jis0208[c][d]; if (!c) goto ilseq; break; + case GB2312: + if (c < 0xa1) goto ilseq; + case GBK: + case GB18030: + c -= 0x81; + if (c >= 126) goto ilseq; + l = 2; + if (*inb < 2) goto starved; + d = *((unsigned char *)*in + 1); + if (d < 0xa1 && type == GB2312) goto ilseq; + if (d-0x40>=191 || d==127) { + if (d-'0'>9 || type != GB18030) + goto ilseq; + l = 4; + if (*inb < 4) goto starved; + c = (10*c + d-'0') * 1260; + d = *((unsigned char *)*in + 2); + if (d-0x81>126) goto ilseq; + c += 10*(d-0x81); + d = *((unsigned char *)*in + 3); + if (d-'0'>9) goto ilseq; + c += d-'0'; + c += 128; + for (d=0; d<=c; ) { + k = 0; + for (int i=0; i<126; i++) + for (int j=0; j<190; j++) + if (gb18030[i][j]-d <= c-d) + k++; + d = c+1; + c += k; + } + break; + } + d -= 0x40; + if (d>63) d--; + c = gb18030[c][d]; + break; default: if (c < 128+type) break; c -= 128+type; -- cgit v1.2.1