iconv support for legacy Korean encodings

As with other character sets, the stateful ISO-2022 form is not supported yet, but everything else should work. All charset aliases are treated the same, as Windows codepage 949, because reportedly the EUC-KR charset name is in widespread (mis?)use in email and on the web for data which actually uses the extended characters outside the standard 93x94 grid. This could easily be changed if desired. The principle this converter uses for handling the giant bulk of rare Hangul syllables outside of the standard KS X 1001 93x94 grid is the same as the GB18030 converter's treatment of non-explicitly-coded Unicode codepoints: sequences in the extension range are mapped to an integer index N, and the converter explicitly computes the Nth Hangul syllable not explicitly encoded in the character map. Empirically, this requires at most 7 passes over the grid. This approach reduces the table size required for Korean legacy encodings from roughly 44k to 17k and should have minimal performance impact on real-world text conversions, since the "slow" characters are rare. Where it does have an impact, the cost is merely a large constant time factor.
author: Rich Felker <dalias@aerifal.cx> 2013-08-05 13:14:17 -0400
committer: Rich Felker <dalias@aerifal.cx> 2013-08-05 13:14:17 -0400
commit: 734062b298e129a8f8bdae299f8d2b7b19419867 (patch)
tree: 328aa8e11d1391ba5d02515c1ed1c96fa473f687 /src/locale/iconv.c
parent: a7f18a55298ffaa287336fd0c81dcd3fe45e16b6 (diff)
download: musl-734062b298e129a8f8bdae299f8d2b7b19419867.tar.gz
1 files changed, 38 insertions, 0 deletions
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index a2332ce0..d3caafa7 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -20,6 +20,7 @@
 #define GB18030     0330
 #define GBK         0331
 #define GB2312      0332
+#define EUC_KR      0350
 
 /* FIXME: these are not implemented yet
  * EUC:   A1-FE A1-FE
@@ -47,6 +48,7 @@ static const unsigned char charmaps[] =
 "gb18030\0\0\330"
 "gbk\0\0\331"
 "gb2312\0\0\332"
+"euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
 #include "codepages.h"
 ;
 
@@ -62,6 +64,10 @@ static const unsigned short gb18030[126][190] = {
 #include "gb18030.h"
 };
 
+static const unsigned short ksc[93][94] = {
+#include "ksc.h"
+};
+
 static int fuzzycmp(const unsigned char *a, const unsigned char *b)
 {
 	for (; *a && *b; a++, b++) {
@@ -278,6 +284,38 @@ size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restr
 			if (d>63) d--;
 			c = gb18030[c][d];
 			break;
+		case EUC_KR:
+			l = 2;
+			if (*inb < 2) goto starved;
+			d = *((unsigned char *)*in + 1);
+			c -= 0xa1;
+			d -= 0xa1;
+			if (c >= 93 || d >= 94) {
+				c += (0xa1-0x81);
+				d += 0xa1;
+				if (c >= 93 || c>=0xc6-0x81 && d>0x52)
+					goto ilseq;
+				if (d-'A'<26) d = d-'A';
+				else if (d-'a'<26) d = d-'a'+26;
+				else if (d-0x81<0xff-0x81) d = d-0x81+52;
+				else goto ilseq;
+				if (c < 0x20) c = 178*c + d;
+				else c = 178*0x20 + 84*(c-0x20) + d;
+				c += 0xac00;
+				for (d=0xac00; d<=c; ) {
+					k = 0;
+					for (int i=0; i<93; i++)
+						for (int j=0; j<94; j++)
+							if (ksc[i][j]-d <= c-d)
+								k++;
+					d = c+1;
+					c += k;
+				}
+				break;
+			}
+			c = ksc[c][d];
+			if (!c) goto ilseq;
+			break;
 		default:
 			if (c < 128+type) break;
 			c -= 128+type;
author	Rich Felker <dalias@aerifal.cx>	2013-08-05 13:14:17 -0400
committer	Rich Felker <dalias@aerifal.cx>	2013-08-05 13:14:17 -0400
commit	734062b298e129a8f8bdae299f8d2b7b19419867 (patch)
tree	328aa8e11d1391ba5d02515c1ed1c96fa473f687 /src/locale/iconv.c
parent	a7f18a55298ffaa287336fd0c81dcd3fe45e16b6 (diff)
download	musl-734062b298e129a8f8bdae299f8d2b7b19419867.tar.gz