From 95c6044e2ae85846330814c4ac5ebf4102dbe02c Mon Sep 17 00:00:00 2001
From: Rich Felker
Date: Mon, 18 Dec 2017 22:08:54 -0500
Subject: add support for BOM-determined-endian UCS2, UTF-16, and UTF-32 to iconv

previously, the charset names without endianness specified were always
interpreted as big endian. unicode specifies that UTF-16 and UTF-32
have BOM-determined endianness if BOM is present, and are otherwise
big endian. since commit 5b546faa67544af395d6407553762b37e9711157
added support for stateful encodings, it is now possible to implement
BOM support via the conversion descriptor state.

for conversions to these charsets, the output is always big endian and
does not have a BOM.
---
 src/locale/iconv.c | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

(limited to 'src/locale')

diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 1784dc9d..c5dd122f 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -16,6 +16,9 @@
 #define WCHAR_T 0306
 #define US_ASCII 0307
 #define UTF_8 0310
+#define UTF_16 0312
+#define UTF_32 0313
+#define UCS2 0314
 #define EUC_JP 0320
 #define SHIFT_JIS 0321
 #define ISO2022_JP 0322
@@ -35,13 +38,16 @@
 static const unsigned char charmaps[] =
 "utf8\0char\0\0\310"
 "wchart\0\0\306"
-"ucs2\0ucs2be\0\0\304"
+"ucs2be\0\0\304"
 "ucs2le\0\0\305"
-"utf16\0utf16be\0\0\302"
+"utf16be\0\0\302"
 "utf16le\0\0\301"
-"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
+"ucs4be\0utf32be\0\0\300"
 "ucs4le\0utf32le\0\0\303"
 "ascii\0usascii\0iso646\0iso646us\0\0\307"
+"utf16\0\0\312"
+"ucs4\0utf32\0\0\313"
+"ucs2\0\0\314"
 "eucjp\0\0\320"
 "shiftjis\0sjis\0\0\321"
 "iso2022jp\0\0\322"
@@ -145,6 +151,9 @@ iconv_t iconv_open(const char *to, const char *from)
 	iconv_t cd = combine_to_from(t, f);
 
 	switch (charmaps[f]) {
+	case UTF_16:
+	case UTF_32:
+	case UCS2:
 	case ISO2022_JP:
 		scd = malloc(sizeof *scd);
 		if (!scd) return (iconv_t)-1;
@@ -285,6 +294,31 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 				c = ((c-0xd7c0)<<10) + (d-0xdc00);
 			}
 			break;
+		case UCS2:
+		case UTF_16:
+			l = 0;
+			if (!scd->state) {
+				if (*inb < 2) goto starved;
+				c = get_16((void *)*in, 0);
+				scd->state = type==UCS2
+					? c==0xfffe ? UCS2LE : UCS2BE
+					: c==0xfffe ? UTF_16LE : UTF_16BE;
+				if (c == 0xfffe || c == 0xfeff)
+					l = 2;
+			}
+			type = scd->state;
+			continue;
+		case UTF_32:
+			l = 0;
+			if (!scd->state) {
+				if (*inb < 4) goto starved;
+				c = get_32((void *)*in, 0);
+				scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
+				if (c == 0xfffe0000 || c == 0xfeff)
+					l = 4;
+			}
+			type = scd->state;
+			continue;
 		case SHIFT_JIS:
 			if (c < 128) break;
 			if (c-0xa1 <= 0xdf-0xa1) {
@@ -589,8 +623,11 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 				*(*out)++ = 'B';
 				*outb -= 8;
 				break;
+		case UCS2:
+			totype = UCS2BE;
 		case UCS2BE:
 		case UCS2LE:
+		case UTF_16:
 		case UTF_16BE:
 		case UTF_16LE:
 			if (c < 0x10000 || type-UCS2BE < 2U) {
-- 
cgit v1.2.1