Commit 3aeb76af authored by Chen Bill's avatar Chen Bill Committed by GitHub

fix EncodeUTF8String, DecodeUTF8String (#2585)

* add CheckStringSize

* add IsUnicodeChar

* fix EncodeUTF8String

* fix DecodeUTF8String

* add ConvertUTF8

* remove unnecessary inline
parent 397cc7a1
......@@ -6,30 +6,30 @@
class BufferIO {
public:
inline static int ReadInt32(unsigned char*& p) {
static int ReadInt32(unsigned char*& p) {
return buffer_read<int32_t>(p);
}
inline static short ReadInt16(unsigned char*& p) {
static short ReadInt16(unsigned char*& p) {
return buffer_read<int16_t>(p);
}
inline static char ReadInt8(unsigned char*& p) {
static char ReadInt8(unsigned char*& p) {
return buffer_read<char>(p);
}
inline static unsigned char ReadUInt8(unsigned char*& p) {
static unsigned char ReadUInt8(unsigned char*& p) {
return buffer_read<unsigned char>(p);
}
inline static void WriteInt32(unsigned char*& p, int val) {
static void WriteInt32(unsigned char*& p, int val) {
buffer_write<int32_t>(p, val);
}
inline static void WriteInt16(unsigned char*& p, short val) {
static void WriteInt16(unsigned char*& p, short val) {
buffer_write<int16_t>(p, val);
}
inline static void WriteInt8(unsigned char*& p, char val) {
static void WriteInt8(unsigned char*& p, char val) {
buffer_write<char>(p, val);
}
// return: string length
template<typename T1, typename T2>
inline static int CopyWStr(const T1* src, T2* pstr, int bufsize) {
static int CopyWStr(const T1* src, T2* pstr, int bufsize) {
int l = 0;
while(src[l] && l < bufsize - 1) {
pstr[l] = (T2)src[l];
......@@ -39,7 +39,7 @@ public:
return l;
}
template<typename T1, typename T2>
inline static int CopyWStrRef(const T1* src, T2*& pstr, int bufsize) {
static int CopyWStrRef(const T1* src, T2*& pstr, int bufsize) {
int l = 0;
while(src[l] && l < bufsize - 1) {
pstr[l] = (T2)src[l];
......@@ -49,22 +49,117 @@ public:
*pstr = 0;
return l;
}
template<typename T>
static bool CheckUTF8Byte(const T* str, int len) {
for (int i = 1; i < len; ++i) {
if ((str[i] & 0xc0U) != 0x80U)
return false;
}
return true;
}
static unsigned int ConvertUTF8(const char*& p) {
unsigned int cur = 0;
if ((p[0] & 0x80U) == 0) {
cur = p[0] & 0xffU;
p++;
}
else if ((p[0] & 0xe0U) == 0xc0U) {
if (!CheckUTF8Byte(p, 2)) {
p++;
return UINT32_MAX;
}
cur = ((p[0] & 0x1fU) << 6) | (p[1] & 0x3fU);
p += 2;
if(cur < 0x80U)
return UINT32_MAX;
}
else if ((p[0] & 0xf0U) == 0xe0U) {
if (!CheckUTF8Byte(p, 3)) {
p++;
return UINT32_MAX;
}
cur = ((p[0] & 0xfU) << 12) | ((p[1] & 0x3fU) << 6) | (p[2] & 0x3fU);
p += 3;
if (cur < 0x800U)
return UINT32_MAX;
}
else if ((p[0] & 0xf8U) == 0xf0U) {
if (!CheckUTF8Byte(p, 4)) {
p++;
return UINT32_MAX;
}
cur = ((p[0] & 0x7U) << 18) | ((p[1] & 0x3fU) << 12) | ((p[2] & 0x3fU) << 6) | (p[3] & 0x3fU);
p += 4;
if (cur < 0x10000U)
return UINT32_MAX;
}
else {
p++;
return UINT32_MAX;
}
return cur;
}
static bool IsHighSurrogate(unsigned int c) {
return (c >= 0xd800U && c <= 0xdbffU);
}
static bool IsLowSurrogate(unsigned int c) {
return (c >= 0xdc00U && c <= 0xdfffU);
}
static bool IsUnicodeChar(unsigned int c) {
if(IsHighSurrogate(c))
return false;
if (IsLowSurrogate(c))
return false;
if (c > 0x10ffffU)
return false;
return true;
}
// UTF-16/UTF-32 to UTF-8
// return: string length
static int EncodeUTF8String(const wchar_t* wsrc, char* str, int size) {
char* pstr = str;
while (*wsrc != 0) {
unsigned cur = *wsrc;
auto pw = wsrc;
auto pstr = str;
while (*pw != 0) {
unsigned cur = 0;
int codepoint_size = 0;
if (sizeof(wchar_t) == 2) {
if (IsHighSurrogate(pw[0])) {
if (pw[1] == 0)
break;
if (IsLowSurrogate(pw[1])) {
cur = ((pw[0] & 0x3ffU) << 10) | (pw[1] & 0x3ffU);
cur += 0x10000;
pw += 2;
}
else {
pw++;
continue;
}
}
else if (IsLowSurrogate(pw[0])) {
pw++;
continue;
}
else {
cur = *pw;
pw++;
}
}
else {
cur = *pw;
pw++;
}
if (!IsUnicodeChar(cur))
continue;
if (cur < 0x80U)
codepoint_size = 1;
else if (cur < 0x800U)
codepoint_size = 2;
else if (cur < 0x10000U && (cur < 0xd800U || cur > 0xdfffU))
else if (cur < 0x10000U)
codepoint_size = 3;
else
codepoint_size = 4;
if (pstr - str + codepoint_size > size - 1)
if ((int)(pstr - str) + codepoint_size > size - 1)
break;
switch (codepoint_size) {
case 1:
......@@ -80,13 +175,6 @@ public:
pstr[2] = (cur & 0x3f) | 0x80;
break;
case 4:
if (sizeof(wchar_t) == 2) {
cur = 0;
cur |= (*wsrc & 0x3ffU) << 10;
++wsrc;
cur |= *wsrc & 0x3ffU;
cur += 0x10000;
}
pstr[0] = ((cur >> 18) & 0x7) | 0xf0;
pstr[1] = ((cur >> 12) & 0x3f) | 0x80;
pstr[2] = ((cur >> 6) & 0x3f) | 0x80;
......@@ -96,10 +184,9 @@ public:
break;
}
pstr += codepoint_size;
wsrc++;
}
*pstr = 0;
return pstr - str;
return (int)(pstr - str);
}
// UTF-8 to UTF-16/UTF-32
// return: string length
......@@ -107,9 +194,11 @@ public:
const char* p = src;
wchar_t* wp = wstr;
while(*p != 0) {
const unsigned cur = *p & 0xffU;
unsigned int cur = ConvertUTF8(p);
int codepoint_size = 0;
if ((cur & 0xf8) == 0xf0) {
if (!IsUnicodeChar(cur))
continue;
if (cur >= 0x10000) {
if (sizeof(wchar_t) == 2)
codepoint_size = 2;
else
......@@ -117,30 +206,18 @@ public:
}
else
codepoint_size = 1;
if (wp - wstr + codepoint_size > size - 1)
if ((int)(wp - wstr) + codepoint_size > size - 1)
break;
if((cur & 0x80) == 0) {
*wp = *p;
p++;
} else if((cur & 0xe0) == 0xc0) {
*wp = ((p[0] & 0x1fU) << 6) | (p[1] & 0x3fU);
p += 2;
} else if((cur & 0xf0) == 0xe0) {
*wp = ((p[0] & 0xfU) << 12) | ((p[1] & 0x3fU) << 6) | (p[2] & 0x3fU);
p += 3;
} else if((cur & 0xf8) == 0xf0) {
if (sizeof(wchar_t) == 2) {
unsigned unicode = ((p[0] & 0x7U) << 18) | ((p[1] & 0x3fU) << 12) | ((p[2] & 0x3fU) << 6) | (p[3] & 0x3fU);
unicode -= 0x10000;
*wp++ = (unicode >> 10) | 0xd800;
*wp = (unicode & 0x3ff) | 0xdc00;
} else {
*wp = ((p[0] & 0x7U) << 18) | ((p[1] & 0x3fU) << 12) | ((p[2] & 0x3fU) << 6) | (p[3] & 0x3fU);
}
p += 4;
} else
p++;
wp++;
if (codepoint_size == 1) {
wp[0] = cur;
wp++;
}
else {
cur -= 0x10000U;
wp[0] = (cur >> 10) | 0xd800;
wp[1] = (cur & 0x3ff) | 0xdc00;
wp += 2;
}
}
*wp = 0;
return wp - wstr;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment