From a223dbd27ae36fe53f9f67f86caf685b729593fc Mon Sep 17 00:00:00 2001
From: Rich Felker
Date: Tue, 14 Nov 2017 23:47:05 -0500
Subject: add reverse iconv mappings for JIS-based encodings

these encodings are still commonly used in messaging protocols and
such. the reverse mapping is implemented as a binary search of a list
of the jis 0208 characters in unicode order; the existing forward
table is used to perform the comparison in the search.
---
 src/locale/iconv.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

(limited to 'src/locale/iconv.c')

diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 01f17521..1784dc9d 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -80,6 +80,10 @@ static const unsigned short ksc[93][94] = {
 #include "ksc.h"
 };
 
+static const unsigned short rev_jis[] = {
+#include "revjis.h"
+};
+
 static int fuzzycmp(const unsigned char *a, const unsigned char *b)
 {
 	for (; *a && *b; a++, b++) {
@@ -134,7 +138,7 @@ iconv_t iconv_open(const char *to, const char *from)
 
 	if ((t = find_charmap(to))==-1
 	 || (f = find_charmap(from))==-1
-	 || (charmaps[t] >= 0320)) {
+	 || (charmaps[t] >= 0330)) {
 		errno = EINVAL;
 		return (iconv_t)-1;
 	}
@@ -192,6 +196,25 @@ static unsigned legacy_map(const unsigned char *map, unsigned c)
 	return x < 256 ? x : legacy_chars[x-256];
 }
 
+static unsigned uni_to_jis(unsigned c)
+{
+	unsigned nel = sizeof rev_jis / sizeof *rev_jis;
+	unsigned d, j, i, b = 0;
+	for (;;) {
+		i = nel/2;
+		j = rev_jis[b+i];
+		d = jis0208[j/256][j%256];
+		if (d==c) return j + 0x2121;
+		else if (nel == 1) return 0;
+		else if (c < d)
+			nel /= 2;
+		else {
+			b += i;
+			nel -= nel/2;
+		}
+	}
+}
+
 size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
 {
 	size_t x=0;
@@ -493,6 +516,79 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 				}
 			}
 			goto subst;
+		case SHIFT_JIS:
+			if (c < 128) goto revout;
+			if (c == 0xa5) {
+				x++;
+				c = '\\';
+				goto revout;
+			}
+			if (c == 0x203e) {
+				x++;
+				c = '~';
+				goto revout;
+			}
+			if (c-0xff61 <= 0xdf-0xa1) {
+				c += 0xa1 - 0xff61;
+				goto revout;
+			}
+			c = uni_to_jis(c);
+			if (!c) goto subst;
+			if (*outb < 2) goto toobig;
+			d = c%256;
+			c = c/256;
+			*(*out)++ = (c+1)/2 + (c<95 ? 112 : 176);
+			*(*out)++ = c%2 ? d + 31 + d/96 : d + 126;
+			*outb -= 2;
+			break;
+		case EUC_JP:
+			if (c < 128) goto revout;
+			if (c-0xff61 <= 0xdf-0xa1) {
+				c += 0x0e00 + 0x21 - 0xff61;
+			} else {
+				c = uni_to_jis(c);
+			}
+			if (!c) goto subst;
+			if (*outb < 2) goto toobig;
+			*(*out)++ = c/256 + 0x80;
+			*(*out)++ = c%256 + 0x80;
+			*outb -= 2;
+			break;
+		case ISO2022_JP:
+			if (c < 128) goto revout;
+			if (c-0xff61 <= 0xdf-0xa1 || c==0xa5 || c==0x203e) {
+				if (*outb < 7) goto toobig;
+				*(*out)++ = '\033';
+				*(*out)++ = '(';
+				if (c==0xa5) {
+					*(*out)++ = 'J';
+					*(*out)++ = '\\';
+				} else if (c==0x203e) {
+					*(*out)++ = 'J';
+					*(*out)++ = '~';
+				} else {
+					*(*out)++ = 'I';
+					*(*out)++ = c-0xff61+0x21;
+				}
+				*(*out)++ = '\033';
+				*(*out)++ = '(';
+				*(*out)++ = 'B';
+				*outb -= 7;
+				break;
+			}
+			c = uni_to_jis(c);
+			if (!c) goto subst;
+			if (*outb < 8) goto toobig;
+			*(*out)++ = '\033';
+			*(*out)++ = '$';
+			*(*out)++ = 'B';
+			*(*out)++ = c/256;
+			*(*out)++ = c%256;
+			*(*out)++ = '\033';
+			*(*out)++ = '(';
+			*(*out)++ = 'B';
+			*outb -= 8;
+			break;
 		case UCS2BE:
 		case UCS2LE:
 		case UTF_16BE:
-- 
cgit v1.2.1