summaryrefslogtreecommitdiffstats
path: root/other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp')
-rw-r--r--other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp288
1 files changed, 288 insertions, 0 deletions
diff --git a/other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp b/other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp
new file mode 100644
index 000000000..b09bbcdb3
--- /dev/null
+++ b/other-licenses/7zstub/src/CPP/Common/UTFConvert.cpp
@@ -0,0 +1,288 @@
+// UTFConvert.cpp
+
+#include "StdAfx.h"
+
+#include "MyTypes.h"
+#include "UTFConvert.h"
+
+#ifdef _WIN32
+#define _WCHART_IS_16BIT 1
+#endif
+
+/*
+ _UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte
+
+ n : _UTF8_START(n) : Bits of code point
+
+ 0 : 0x80 : : unused
+ 1 : 0xC0 : 11 :
+ 2 : 0xE0 : 16 : Basic Multilingual Plane
+ 3 : 0xF0 : 21 : Unicode space
+ 3 : 0xF8 : 26 :
+ 5 : 0xFC : 31 : UCS-4
+ 6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
+ 7 : 0xFF :
+*/
+
+#define _UTF8_START(n) (0x100 - (1 << (7 - (n))))
+
+#define _UTF8_HEAD_PARSE2(n) if (c < _UTF8_START((n) + 1)) { numBytes = (n); c -= _UTF8_START(n); }
+
+#define _UTF8_HEAD_PARSE \
+ _UTF8_HEAD_PARSE2(1) \
+ else _UTF8_HEAD_PARSE2(2) \
+ else _UTF8_HEAD_PARSE2(3) \
+ else _UTF8_HEAD_PARSE2(4) \
+ else _UTF8_HEAD_PARSE2(5) \
+
+ // else _UTF8_HEAD_PARSE2(6)
+
+bool CheckUTF8(const char *src, bool allowReduced) throw()
+{
+ for (;;)
+ {
+ Byte c = *src++;
+ if (c == 0)
+ return true;
+
+ if (c < 0x80)
+ continue;
+ if (c < 0xC0) // (c < 0xC0 + 2) // if we support only optimal encoding chars
+ return false;
+
+ unsigned numBytes;
+ _UTF8_HEAD_PARSE
+ else
+ return false;
+
+ UInt32 val = c;
+
+ do
+ {
+ Byte c2 = *src++;
+ if (c2 < 0x80 || c2 >= 0xC0)
+ return allowReduced && c2 == 0;
+ val <<= 6;
+ val |= (c2 - 0x80);
+ }
+ while (--numBytes);
+
+ if (val >= 0x110000)
+ return false;
+ }
+}
+
+
+#define _ERROR_UTF8 \
+ { if (dest) dest[destPos] = (wchar_t)0xFFFD; destPos++; ok = false; continue; }
+
+static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim) throw()
+{
+ size_t destPos = 0;
+ bool ok = true;
+
+ for (;;)
+ {
+ Byte c;
+ if (src == srcLim)
+ {
+ *destLen = destPos;
+ return ok;
+ }
+ c = *src++;
+
+ if (c < 0x80)
+ {
+ if (dest)
+ dest[destPos] = (wchar_t)c;
+ destPos++;
+ continue;
+ }
+ if (c < 0xC0)
+ _ERROR_UTF8
+
+ unsigned numBytes;
+ _UTF8_HEAD_PARSE
+ else
+ _ERROR_UTF8
+
+ UInt32 val = c;
+
+ do
+ {
+ Byte c2;
+ if (src == srcLim)
+ break;
+ c2 = *src;
+ if (c2 < 0x80 || c2 >= 0xC0)
+ break;
+ src++;
+ val <<= 6;
+ val |= (c2 - 0x80);
+ }
+ while (--numBytes);
+
+ if (numBytes != 0)
+ _ERROR_UTF8
+
+ if (val < 0x10000)
+ {
+ if (dest)
+ dest[destPos] = (wchar_t)val;
+ destPos++;
+ }
+ else
+ {
+ val -= 0x10000;
+ if (val >= 0x100000)
+ _ERROR_UTF8
+ if (dest)
+ {
+ dest[destPos + 0] = (wchar_t)(0xD800 + (val >> 10));
+ dest[destPos + 1] = (wchar_t)(0xDC00 + (val & 0x3FF));
+ }
+ destPos += 2;
+ }
+ }
+}
+
+#define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))
+
+#define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + (val >> (6 * (n)))))
+#define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
+
+static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim)
+{
+ size_t size = srcLim - src;
+ for (;;)
+ {
+ if (src == srcLim)
+ return size;
+
+ UInt32 val = *src++;
+
+ if (val < 0x80)
+ continue;
+
+ if (val < _UTF8_RANGE(1))
+ {
+ size++;
+ continue;
+ }
+
+ if (val >= 0xD800 && val < 0xDC00 && src != srcLim)
+ {
+ UInt32 c2 = *src;
+ if (c2 >= 0xDC00 && c2 < 0xE000)
+ {
+ src++;
+ size += 2;
+ continue;
+ }
+ }
+
+ #ifdef _WCHART_IS_16BIT
+
+ size += 2;
+
+ #else
+
+ if (val < _UTF8_RANGE(2)) size += 2;
+ else if (val < _UTF8_RANGE(3)) size += 3;
+ else if (val < _UTF8_RANGE(4)) size += 4;
+ else if (val < _UTF8_RANGE(5)) size += 5;
+ else size += 6;
+
+ #endif
+ }
+}
+
+static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim)
+{
+ for (;;)
+ {
+ if (src == srcLim)
+ return dest;
+
+ UInt32 val = *src++;
+
+ if (val < 0x80)
+ {
+ *dest++ = (char)val;
+ continue;
+ }
+
+ if (val < _UTF8_RANGE(1))
+ {
+ dest[0] = _UTF8_HEAD(1, val);
+ dest[1] = _UTF8_CHAR(0, val);
+ dest += 2;
+ continue;
+ }
+
+ if (val >= 0xD800 && val < 0xDC00 && src != srcLim)
+ {
+ UInt32 c2 = *src;
+ if (c2 >= 0xDC00 && c2 < 0xE000)
+ {
+ src++;
+ val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000;
+ dest[0] = _UTF8_HEAD(3, val);
+ dest[1] = _UTF8_CHAR(2, val);
+ dest[2] = _UTF8_CHAR(1, val);
+ dest[3] = _UTF8_CHAR(0, val);
+ dest += 4;
+ continue;
+ }
+ }
+
+ #ifndef _WCHART_IS_16BIT
+ if (val < _UTF8_RANGE(2))
+ #endif
+ {
+ dest[0] = _UTF8_HEAD(2, val);
+ dest[1] = _UTF8_CHAR(1, val);
+ dest[2] = _UTF8_CHAR(0, val);
+ dest += 3;
+ continue;
+ }
+
+ #ifndef _WCHART_IS_16BIT
+
+ UInt32 b;
+ unsigned numBits;
+ if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); }
+ else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); }
+ else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); }
+ else { numBits = 6 * 6; b = _UTF8_START(6); }
+
+ *dest++ = (Byte)b;
+
+ do
+ {
+ numBits -= 6;
+ *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));
+ }
+ while (numBits != 0);
+
+ #endif
+ }
+}
+
+bool ConvertUTF8ToUnicode(const AString &src, UString &dest)
+{
+ dest.Empty();
+ size_t destLen = 0;
+ Utf8_To_Utf16(NULL, &destLen, src, src.Ptr(src.Len()));
+ bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src.Ptr(src.Len()));
+ dest.ReleaseBuf_SetEnd((unsigned)destLen);
+ return res;
+}
+
+void ConvertUnicodeToUTF8(const UString &src, AString &dest)
+{
+ dest.Empty();
+ size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()));
+ Utf16_To_Utf8(dest.GetBuf((unsigned)destLen), src, src.Ptr(src.Len()));
+ dest.ReleaseBuf_SetEnd((unsigned)destLen);
+}