summaryrefslogtreecommitdiff
path: root/src/locale/iconv.c
diff options
context:
space:
mode:
author: Rich Felker <dalias@aerifal.cx> 2013-08-17 16:23:22 -0400
committer: Rich Felker <dalias@aerifal.cx> 2013-08-17 16:23:22 -0400
commit: 109bd65acf5180f0a48ea4c4e5f2bc6884be202d (patch)
tree: a80cfd23f3d6b6df794c3658231f7e3c46ff2c6b /src/locale/iconv.c
parent: 453f462297062f9444ba1517d592cf31c7d7fce5 (diff)
download: musl-109bd65acf5180f0a48ea4c4e5f2bc6884be202d.tar.gz
add hkscs/big5-2003/eten extensions to iconv big5
with these changes, the character set implemented as "big5" in musl is a pure superset of cp950, the canonical "big5", and agrees with the normative parts of Unicode. this means it has minor differences from both hkscs and big5-2003:

- the range A2CC-A2CE maps to CJK ideographs rather than numerals, contrary to changes made in big5-2003.

- C6CD maps to a CJK ideograph rather than its corresponding Kangxi radical character, contrary to changes made in hkscs.

- F9FE maps to U+2593 rather than U+FFED.

of these differences, none but the last are visually distinct, and the last is a character used purely for text-based graphics, not to convey linguistic content.

should there be future demand for strict conformance to big5-2003 or hkscs mappings, the present charset aliases can be replaced with distinct variants.

reportedly there are other non-standard big5 extensions in common use in Taiwan and perhaps elsewhere, which could also be added as layers on top of the existing big5 support.

there may be additional characters which should be added to the hkscs table: the whatwg standard for big5 defines what appears to be a superset of hkscs.
Diffstat (limited to 'src/locale/iconv.c')
-rw-r--r-- src/locale/iconv.c 37
1 file changed, 33 insertions, 4 deletions
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 30ea8da6..a0b02320 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -49,7 +49,7 @@ static const unsigned char charmaps[] =
"gb18030\0\0\330"
"gbk\0\0\331"
"gb2312\0\0\332"
-"big5\0bigfive\0cp950\0\0\340"
+"big5\0bigfive\0cp950\0big5hkscs\0\0\340"
"euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
#include "codepages.h"
;
@@ -70,6 +70,10 @@ static const unsigned short big5[89][157] = {
#include "big5.h"
};
+static const unsigned short hkscs[] = {
+#include "hkscs.h"
+};
+
static const unsigned short ksc[93][94] = {
#include "ksc.h"
};
@@ -294,12 +298,37 @@ size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restr
l = 2;
if (*inb < 2) goto starved;
d = *((unsigned char *)*in + 1);
- if (c-0xa1>=0xfa-0xa1) goto ilseq;
- c -= 0xa1;
if (d-0x40>=0xff-0x40 || d-0x7f<0xa1-0x7f) goto ilseq;
d -= 0x40;
if (d > 0x3e) d -= 0x22;
- c = big5[c][d];
+ if (c-0xa1>=0xfa-0xa1) {
+ if (c-0x87>=0xff-0x87) goto ilseq;
+ if (c < 0xa1) c -= 0x87;
+ else c -= 0x87 + (0xfa-0xa1);
+ c = (hkscs[4867+(c*157+d)/16]>>(c*157+d)%16)%2<<17
+ | hkscs[c*157+d];
+ /* A few HKSCS characters map to pairs of UCS
+ * characters. These are mapped to surrogate
+ * range in the hkscs table then hard-coded
+ * here. Ugly, yes. */
+ if (c/256 == 0xdc) {
+ if (totype-0300U > 8) k = 2;
+ else k = "\10\4\4\10\4\4\10\2\4"[totype-0300];
+ if (k > *outb) goto toobig;
+ x += iconv((iconv_t)(uintptr_t)to,
+ &(char *){"\303\212\314\204"
+ "\303\212\314\214"
+ "\303\252\314\204"
+ "\303\252\314\214"
+ +c%256}, &(size_t){4},
+ out, outb);
+ continue;
+ }
+ if (!c) goto ilseq;
+ break;
+ }
+ c -= 0xa1;
+ c = big5[c][d]|(c==0x27&&(d==0x3a||d==0x3c||d==0x42))<<17;
if (!c) goto ilseq;
break;
case EUC_KR: