From 95c6044e2ae85846330814c4ac5ebf4102dbe02c Mon Sep 17 00:00:00 2001
From: Rich Felker <dalias@aerifal.cx>
Date: Mon, 18 Dec 2017 22:08:54 -0500
Subject: add support for BOM-determined-endian UCS2, UTF-16, and UTF-32 to
 iconv

previously, the charset names without endianness specified were always
interpreted as big endian. unicode specifies that UTF-16 and UTF-32
have BOM-determined endianness if BOM is present, and are otherwise
big endian. since commit 5b546faa67544af395d6407553762b37e9711157
added support for stateful encodings, it is now possible to implement
BOM support via the conversion descriptor state.

for conversions to these charsets, the output is always big endian and
does not have a BOM.
---
 src/locale/iconv.c | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

(limited to 'src/locale')

diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 1784dc9d..c5dd122f 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -16,6 +16,9 @@
 #define WCHAR_T     0306
 #define US_ASCII    0307
 #define UTF_8       0310
+#define UTF_16      0312
+#define UTF_32      0313
+#define UCS2        0314
 #define EUC_JP      0320
 #define SHIFT_JIS   0321
 #define ISO2022_JP  0322
@@ -35,13 +38,16 @@
 static const unsigned char charmaps[] =
 "utf8\0char\0\0\310"
 "wchart\0\0\306"
-"ucs2\0ucs2be\0\0\304"
+"ucs2be\0\0\304"
 "ucs2le\0\0\305"
-"utf16\0utf16be\0\0\302"
+"utf16be\0\0\302"
 "utf16le\0\0\301"
-"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
+"ucs4be\0utf32be\0\0\300"
 "ucs4le\0utf32le\0\0\303"
 "ascii\0usascii\0iso646\0iso646us\0\0\307"
+"utf16\0\0\312"
+"ucs4\0utf32\0\0\313"
+"ucs2\0\0\314"
 "eucjp\0\0\320"
 "shiftjis\0sjis\0\0\321"
 "iso2022jp\0\0\322"
@@ -145,6 +151,9 @@ iconv_t iconv_open(const char *to, const char *from)
 	iconv_t cd = combine_to_from(t, f);
 
 	switch (charmaps[f]) {
+	case UTF_16:
+	case UTF_32:
+	case UCS2:
 	case ISO2022_JP:
 		scd = malloc(sizeof *scd);
 		if (!scd) return (iconv_t)-1;
@@ -285,6 +294,31 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 				c = ((c-0xd7c0)<<10) + (d-0xdc00);
 			}
 			break;
+		case UCS2:
+		case UTF_16:
+			l = 0;
+			if (!scd->state) {
+				if (*inb < 2) goto starved;
+				c = get_16((void *)*in, 0);
+				scd->state = type==UCS2
+					? c==0xfffe ? UCS2LE : UCS2BE
+					: c==0xfffe ? UTF_16LE : UTF_16BE;
+				if (c == 0xfffe || c == 0xfeff)
+					l = 2;
+			}
+			type = scd->state;
+			continue;
+		case UTF_32:
+			l = 0;
+			if (!scd->state) {
+				if (*inb < 4) goto starved;
+				c = get_32((void *)*in, 0);
+				scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
+				if (c == 0xfffe0000 || c == 0xfeff)
+					l = 4;
+			}
+			type = scd->state;
+			continue;
 		case SHIFT_JIS:
 			if (c < 128) break;
 			if (c-0xa1 <= 0xdf-0xa1) {
@@ -589,8 +623,11 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			*(*out)++ = 'B';
 			*outb -= 8;
 			break;
+		case UCS2:
+			totype = UCS2BE;
 		case UCS2BE:
 		case UCS2LE:
+		case UTF_16:
 		case UTF_16BE:
 		case UTF_16LE:
 			if (c < 0x10000 || type-UCS2BE < 2U) {
-- 
cgit v1.2.1