/********************************************
 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
 *
 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
 * wchar type.
 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
 * the D utf.dchar type.
 *
 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
 *
 * See_Also:
 *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)
 *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)
 *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
 *
 * Copyright: Copyright Digital Mars 2003 - 2016.
 * License:   $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   Walter Bright, Sean Kelly
 * Source:    $(DRUNTIMESRC src/rt/util/_utf.d)
 */

module rt.util.utf;

// Only needed by the trace statements in the unittests below.
debug(utf) import core.stdc.stdio : printf;

extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;

/*******************************
 * Test if c is a valid UTF-32 character.
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 *
 * Returns: true if it is, false if not.
 */

@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
     * allowed for interchange.
     * (thanks to Arcane Jill)
     */

    // Everything up to 0x10FFFF except the surrogate range D800..DFFF.
    return c < 0xD800 ||
        (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
}

unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
}


/* Indexed by the first byte of a UTF-8 sequence; yields the sequence length,
 * or 0xFF when the byte cannot start a sequence.
 */
static immutable UTF8stride =
[
    cast(ubyte)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
 * Returns:
 *  The number of bytes in the UTF-8 sequence or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
@safe @nogc pure nothrow
uint stride(in char[] s, size_t i)
{
    return UTF8stride[s[i]];
}

/**
 * stride() returns the length of a UTF-16 sequence starting at index i
 * in string s.
 */
@safe @nogc pure nothrow
uint stride(in wchar[] s, size_t i)
{
    uint u = s[i];
    // A high surrogate (D800..DBFF) announces a two-unit sequence.
    return 1 + (u >= 0xD800 && u <= 0xDBFF);
}

/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 * Returns: The return value will always be 1.
 */
@safe @nogc pure nothrow
uint stride(in dchar[] s, size_t i)
{
    return 1;
}

/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * onUnicodeError is called if a byte that cannot start a UTF-8 sequence is
 * encountered before i, or if i falls inside a sequence.
 */

@safe pure
size_t toUCSindex(in char[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        immutable step = stride(s, j);
        // 0xFF is stride()'s "not a lead byte" marker, not a length; adding
        // it to j would silently skip 255 bytes and miscount.
        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);
        j += step;
        n++;
    }
    if (j > i)
    {
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}

unittest
{
    debug(utf) printf("utf.toUCSindex.unittest\n");

    assert(toUCSindex("a\u00A9b"c, 3) == 2);

    // A stray continuation byte must be reported rather than skipped over.
    auto bad = new char[300];
    bad[] = 'a';
    bad[0] = '\x80';
    bool reported = false;
    try
    {
        toUCSindex(bad, 255);
    }
    catch (Exception e)
    {
        reported = true;
    }
    assert(reported);
}

/** ditto */
@safe pure
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        j += stride(s, j);
        n++;
    }
    if (j > i)
    {
        onUnicodeError("invalid UTF-16 sequence", j);
    }
    return n;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUCSindex(in dchar[] s, size_t i)
{
    return i;
}

/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 */

@safe pure
size_t toUTFindex(in char[] s, size_t n)
{
    size_t i;

    while (n--)
    {
        uint j = UTF8stride[s[i]];
        if (j == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", i);
        i += j;
    }
    return i;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t i;

    while (n--)
    {
        wchar u = s[i];

        i += 1 + (u >= 0xD800 && u <= 0xDBFF);
    }
    return i;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(in dchar[] s, size_t n)
{
    return n;
}

/* =================== Decode ======================= */

/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character. If the character is not well formed, onUnicodeError is
 * called and idx remains unchanged.
 */

@safe pure
dchar decode(in char[] s, ref size_t idx)
in
{
    assert(idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t len = s.length;
    dchar V;
    size_t i = idx;
    char u = s[i];

    if (u & 0x80)
    {
        uint n;
        char u2;

        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *  0xxxxxxx
         *  110xxxxx 10xxxxxx
         *  1110xxxx 10xxxxxx 10xxxxxx
         *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // Count the leading 1 bits of the first byte; that is the length n.
        for (n = 1; ; n++)
        {
            if (n > 4)
                goto Lerr;          // only do the first 4 of 6 encodings
            if (((u << n) & 0x80) == 0)
            {
                if (n == 1)
                    goto Lerr;      // 10xxxxxx cannot start a sequence
                break;
            }
        }

        // Pick off (7 - n) significant bits of B from first byte of octet
        V = cast(dchar)(u & ((1 << (7 - n)) - 1));

        if (i + (n - 1) >= len)
            goto Lerr;                      // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
         *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
         */
        u2 = s[i + 1];
        if ((u & 0xFE) == 0xC0 ||
            (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
            (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
            (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
            (u == 0xFC && (u2 & 0xFC) == 0x80))
            goto Lerr;                      // overlong combination

        for (uint j = 1; j != n; j++)
        {
            u = s[i + j];
            if ((u & 0xC0) != 0x80)
                goto Lerr;                  // trailing bytes are 10xxxxxx
            V = (V << 6) | (u & 0x3F);
        }
        if (!isValidDchar(V))
            goto Lerr;
        i += n;
    }
    else
    {
        V = cast(dchar) u;
        i++;
    }

    idx = i;
    return V;

  Lerr:
    onUnicodeError("invalid UTF-8 sequence", i);
    return V; // dummy return
}

unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static s4 =
    [
        "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (size_t j = 0; j < s4.length; j++)
    {
        try
        {
            i = 0;
            c = decode(s4[j], i);
            assert(0);
        }
        // Catch Exception, not Throwable: catching Throwable would also
        // swallow the AssertError from assert(0) above, so the test could
        // never fail when decode() wrongly accepts malformed input.
        catch (Exception o)
        {
            i = 23;
        }
        assert(i == 23);
    }
}

/** ditto */
@safe pure
dchar decode(in wchar[] s, ref size_t idx)
in
{
    assert(idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    string msg;
    dchar V;
    size_t i = idx;
    uint u = s[i];

    if (u & ~0x7F)
    {
        if (u >= 0xD800 && u <= 0xDBFF)
        {
            uint u2;

            if (i + 1 == s.length)
            {
                msg = "surrogate UTF-16 high value past end of string";
                goto Lerr;
            }
            u2 = s[i + 1];
            if (u2 < 0xDC00 || u2 > 0xDFFF)
            {
                msg = "surrogate UTF-16 low value out of range";
                goto Lerr;
            }
            // Combine the pair: 0xD7C0 == 0xD800 - (0x10000 >> 10).
            u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
            i += 2;
        }
        else if (u >= 0xDC00 && u <= 0xDFFF)
        {
            msg = "unpaired surrogate UTF-16 value";
            goto Lerr;
        }
        else if (u == 0xFFFE || u == 0xFFFF)
        {
            // NOTE(review): isValidDchar() accepts these two values, but this
            // overload rejects them; kept as-is to preserve behavior.
            msg = "illegal UTF-16 value";
            goto Lerr;
        }
        else
            i++;
    }
    else
    {
        i++;
    }

    idx = i;
    return cast(dchar)u;

  Lerr:
    onUnicodeError(msg, i);
    return cast(dchar)u; // dummy return
}

/** ditto */
@safe pure
dchar decode(in dchar[] s, ref size_t idx)
in
{
    assert(idx < s.length);
}
body
{
    size_t i = idx;
    dchar c = s[i];

    if (!isValidDchar(c))
        goto Lerr;
    idx = i + 1;
    return c;

  Lerr:
    onUnicodeError("invalid UTF-32 value", i);
    return c; // dummy return
}


/* =================== Encode ======================= */

/*******************************
 * Encodes character c and appends it to array s[].
 */
@safe pure nothrow
void encode(ref char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[] r = s;

    if (c <= 0x7F)
    {
        r ~= cast(char) c;
    }
    else
    {
        char[4] buf;
        uint L;

        if (c <= 0x7FF)
        {
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            L = 2;
        }
        else if (c <= 0xFFFF)
        {
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            L = 3;
        }
        else if (c <= 0x10FFFF)
        {
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            L = 4;
        }
        else
        {
            assert(0);
        }
        r ~= buf[0 .. L];
    }
    s = r;
}

unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}

/** ditto */
@safe pure nothrow
void encode(ref wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    wchar[] r = s;

    if (c <= 0xFFFF)
    {
        r ~= cast(wchar) c;
    }
    else
    {
        // Outside the BMP: split into a high/low surrogate pair.
        wchar[2] buf;

        buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        r ~= buf;
    }
    s = r;
}

/** ditto */
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    s ~= c;
}

/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code point. The code is returned in character count, not in bytes.
 */
@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        if (c <= 0x7F) return 1;
        if (c <= 0x7FF) return 2;
        if (c <= 0xFFFF) return 3;
        if (c <= 0x10FFFF) return 4;
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        return c <= 0xFFFF ? 1 : 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}

/* =================== Validation ======================= */

/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
 of $(D char), $(D wchar), or $(D dchar). Malformed input is reported through
 decode(), which calls onUnicodeError.
 Use to check all untrusted input for correctness.
 */
@safe pure
void validate(S)(in S s)
{
    auto len = s.length;
    for (size_t i = 0; i < len; )
    {
        // decode() advances i past each well-formed character.
        decode(s, i);
    }
}

/* =================== Conversion to UTF8 ======================= */

/*******************
 * Encodes c as UTF-8 into the start of buf and returns the slice of buf
 * that was written (1 to 4 code units).
 */
@safe pure nothrow @nogc
char[] toUTF8(char[] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        return buf[0 .. 1];
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return buf[0 .. 2];
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return buf[0 .. 3];
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return buf[0 .. 4];
    }
    assert(0);
}

/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 */
@safe pure nothrow
string toUTF8(string s)
in
{
    validate(s);
}
body
{
    return s;
}

/** ditto */
@trusted pure
string toUTF8(in wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;     // fast path for ascii
        else
        {
            // First non-ASCII unit: keep the i bytes copied so far and
            // encode the remainder one code point at a time.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}

/** ditto */
@trusted pure
string toUTF8(in dchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        dchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;     // fast path for ascii
        else
        {
            // First non-ASCII value: keep the i bytes copied so far and
            // encode the remainder one code point at a time.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}

/* =================== Conversion to UTF16 ======================= */

/****************
 * Encodes c as UTF-16 into the start of buf and returns the slice of buf
 * that was written (1 or 2 code units).
 */
@safe pure nothrow @nogc
wchar[] toUTF16(wchar[] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0xFFFF)
    {
        buf[0] = cast(wchar) c;
        return buf[0 .. 1];
    }
    else
    {
        buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }
}

/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 */
@trusted pure
wstring toUTF16(in char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    // NOTE(review): grow-then-truncate looks like a capacity hint for the
    // appends below - confirm it has that effect.
    r.length = slen;
    r.length = 0;
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            c = decode(s, i);
            encode(r, c);
        }
    }
    return cast(wstring)r;
}

alias const(wchar)* wptr;
/** ditto */
@safe pure
wptr toUTF16z(in char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    // One extra unit for the terminating zero appended at the end.
    r.length = slen + 1;
    r.length = 0;
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            c = decode(s, i);
            encode(r, c);
        }
    }
    r ~= '\000';
    return &r[0];
}

/** ditto */
@safe pure nothrow
wstring toUTF16(wstring s)
in
{
    validate(s);
}
body
{
    return s;
}

/** ditto */
@trusted pure nothrow
wstring toUTF16(in dchar[] s)
{
    wchar[] r;
    size_t slen = s.length;

    r.length = slen;
    r.length = 0;
    for (size_t i = 0; i < slen; i++)
    {
        encode(r, s[i]);
    }
    return cast(wstring)r;
}

/* =================== Conversion to UTF32 ======================= */

/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 */
@trusted pure
dstring toUTF32(in char[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t j = 0;

    r.length = slen;        // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
            c = decode(s, i);
        else
            i++;            // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}

/** ditto */
@trusted pure
dstring toUTF32(in wchar[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t j = 0;

    r.length = slen;        // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
            c = decode(s, i);
        else
            i++;            // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}

/** ditto */
@safe pure nothrow
dstring toUTF32(dstring s)
in
{
    validate(s);
}
body
{
    return s;
}

/* ================================ tests ================================== */

unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    auto c = "hello"c[];
    auto w = toUTF16(c);
    assert(w == "hello");
    auto d = toUTF32(c);
    assert(d == "hello");

    c = toUTF8(w);
    assert(c == "hello");
    d = toUTF32(w);
    assert(d == "hello");

    c = toUTF8(d);
    assert(c == "hello");
    w = toUTF16(d);
    assert(w == "hello");


    c = "hel\u1234o";
    w = toUTF16(c);
    assert(w == "hel\u1234o");
    d = toUTF32(c);
    assert(d == "hel\u1234o");

    c = toUTF8(w);
    assert(c == "hel\u1234o");
    d = toUTF32(w);
    assert(d == "hel\u1234o");

    c = toUTF8(d);
    assert(c == "hel\u1234o");
    w = toUTF16(d);
    assert(w == "hel\u1234o");


    c = "he\U000BAAAAllo";
    w = toUTF16(c);
    assert(w == "he\U000BAAAAllo");
    d = toUTF32(c);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(w);
    assert(c == "he\U000BAAAAllo");
    d = toUTF32(w);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(d);
    assert(c == "he\U000BAAAAllo");
    w = toUTF16(d);
    assert(w == "he\U000BAAAAllo");

    wchar[2] buf;
    auto ret = toUTF16(buf, '\U000BAAAA');
    assert(ret == "\U000BAAAA");
}