fix EncodeUTF8String, DecodeUTF8String (#2585)

* add CheckStringSize
* add IsUnicodeChar
* fix EncodeUTF8String
* fix DecodeUTF8String
* add ConvertUTF8
* remove unnecessary inline

fix EncodeUTF8String, DecodeUTF8String (#2585)
* add CheckStringSize
* add IsUnicodeChar
* fix EncodeUTF8String
* fix DecodeUTF8String
* add ConvertUTF8
* remove unnecessary inline
3aeb76af · salix5 · GitHub · 397cc7a1 · 3aeb76af
Commit 3aeb76af, authored Sep 07, 2024 by salix5; committed by GitHub, Sep 07, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 125 additions and 48 deletions

gframe/bufferio.h gframe/bufferio.h +125 -48

No files found.
--- a/gframe/bufferio.h
+++ b/gframe/bufferio.h
@@ -6,30 +6,30 @@

 class BufferIO {
 public:
-	inline static int ReadInt32(unsigned char*& p) {
+	static int ReadInt32(unsigned char*& p) {
 		return buffer_read<int32_t>(p);
 	}
-	inline static short ReadInt16(unsigned char*& p) {
+	static short ReadInt16(unsigned char*& p) {
 		return buffer_read<int16_t>(p);
 	}
-	inline static char ReadInt8(unsigned char*& p) {
+	static char ReadInt8(unsigned char*& p) {
 		return buffer_read<char>(p);
 	}
-	inline static unsigned char ReadUInt8(unsigned char*& p) {
+	static unsigned char ReadUInt8(unsigned char*& p) {
 		return buffer_read<unsigned char>(p);
 	}
-	inline static void WriteInt32(unsigned char*& p, int val) {
+	static void WriteInt32(unsigned char*& p, int val) {
 		buffer_write<int32_t>(p, val);
 	}
-	inline static void WriteInt16(unsigned char*& p, short val) {
+	static void WriteInt16(unsigned char*& p, short val) {
 		buffer_write<int16_t>(p, val);
 	}
-	inline static void WriteInt8(unsigned char*& p, char val) {
+	static void WriteInt8(unsigned char*& p, char val) {
 		buffer_write<char>(p, val);
 	}
 	// return: string length
 	template<typename T1, typename T2>
-	inline static int CopyWStr(const T1* src, T2* pstr, int bufsize) {
+	static int CopyWStr(const T1* src, T2* pstr, int bufsize) {
 		int l = 0;
 		while(src[l] && l < bufsize - 1) {
 			pstr[l] = (T2)src[l];
@@ -39,7 +39,7 @@ public:
 		return l;
 	}
 	template<typename T1, typename T2>
-	inline static int CopyWStrRef(const T1* src, T2*& pstr, int bufsize) {
+	static int CopyWStrRef(const T1* src, T2*& pstr, int bufsize) {
 		int l = 0;
 		while(src[l] && l < bufsize - 1) {
 			pstr[l] = (T2)src[l];
@@ -49,22 +49,117 @@ public:
 		*pstr = 0;
 		return l;
 	}
+	template<typename T>
+	static bool CheckUTF8Byte(const T* str, int len) {
+		for (int i = 1; i < len; ++i) {
+			if ((str[i] & 0xc0U) != 0x80U)
+				return false;
+		}
+		return true;
+	}
+	static unsigned int ConvertUTF8(const char*& p) {
+		unsigned int cur = 0;
+		if ((p[0] & 0x80U) == 0) {
+			cur = p[0] & 0xffU;
+			p++;
+		}
+		else if ((p[0] & 0xe0U) == 0xc0U) {
+			if (!CheckUTF8Byte(p, 2)) {
+				p++;
+				return UINT32_MAX;
+			}
+			cur = ((p[0] & 0x1fU) << 6) | (p[1] & 0x3fU);
+			p += 2;
+			if(cur < 0x80U)
+				return UINT32_MAX;
+		}
+		else if ((p[0] & 0xf0U) == 0xe0U) {
+			if (!CheckUTF8Byte(p, 3)) {
+				p++;
+				return UINT32_MAX;
+			}
+			cur = ((p[0] & 0xfU) << 12) | ((p[1] & 0x3fU) << 6) | (p[2] & 0x3fU);
+			p += 3;
+			if (cur < 0x800U)
+				return UINT32_MAX;
+		}
+		else if ((p[0] & 0xf8U) == 0xf0U) {
+			if (!CheckUTF8Byte(p, 4)) {
+				p++;
+				return UINT32_MAX;
+			}
+			cur = ((p[0] & 0x7U) << 18) | ((p[1] & 0x3fU) << 12) | ((p[2] & 0x3fU) << 6) | (p[3] & 0x3fU);
+			p += 4;
+			if (cur < 0x10000U)
+				return UINT32_MAX;
+		}
+		else {
+			p++;
+			return UINT32_MAX;
+		}
+		return cur;
+	}
+	static bool IsHighSurrogate(unsigned int c) {
+		return (c >= 0xd800U && c <= 0xdbffU);
+	}
+	static bool IsLowSurrogate(unsigned int c) {
+		return (c >= 0xdc00U && c <= 0xdfffU);
+	}
+	static bool IsUnicodeChar(unsigned int c) {
+		if(IsHighSurrogate(c))
+			return false;
+		if (IsLowSurrogate(c))
+			return false;
+		if (c > 0x10ffffU)
+			return false;
+		return true;
+	}
 	// UTF-16/UTF-32 to UTF-8
 	// return: string length
 	static int EncodeUTF8String(const wchar_t* wsrc, char* str, int size) {
-		char* pstr = str;
-		while (*wsrc != 0) {
-			unsigned cur = *wsrc;
+		auto pw = wsrc;
+		auto pstr = str;
+		while (*pw != 0) {
+			unsigned cur = 0;
 			int codepoint_size = 0;
+			if (sizeof(wchar_t) == 2) {
+				if (IsHighSurrogate(pw[0])) {
+					if (pw[1] == 0)
+						break;
+					if (IsLowSurrogate(pw[1])) {
+						cur = ((pw[0] & 0x3ffU) << 10) | (pw[1] & 0x3ffU);
+						cur += 0x10000;
+						pw += 2;
+					}
+					else {
+						pw++;
+						continue;
+					}
+				}
+				else if (IsLowSurrogate(pw[0])) {
+					pw++;
+					continue;
+				}
+				else {
+					cur = *pw;
+					pw++;
+				}
+			}
+			else {
+				cur = *pw;
+				pw++;
+			}
+			if (!IsUnicodeChar(cur))
+				continue;
 			if (cur < 0x80U)
 				codepoint_size = 1;
 			else if (cur < 0x800U)
 				codepoint_size = 2;
-			else if (cur < 0x10000U && (cur < 0xd800U || cur > 0xdfffU))
+			else if (cur < 0x10000U)
 				codepoint_size = 3;
 			else
 				codepoint_size = 4;
-			if (pstr - str + codepoint_size > size - 1)
+			if ((int)(pstr - str) + codepoint_size > size - 1)
 				break;
 			switch (codepoint_size) {
 			case 1:
@@ -80,13 +175,6 @@ public:
 				pstr[2] = (cur & 0x3f) | 0x80;
 				break;
 			case 4:
-				if (sizeof(wchar_t) == 2) {
-					cur = 0;
-					cur |= (*wsrc & 0x3ffU) << 10;
-					++wsrc;
-					cur |= *wsrc & 0x3ffU;
-					cur += 0x10000;
-				}
 				pstr[0] = ((cur >> 18) & 0x7) | 0xf0;
 				pstr[1] = ((cur >> 12) & 0x3f) | 0x80;
 				pstr[2] = ((cur >> 6) & 0x3f) | 0x80;
@@ -96,10 +184,9 @@ public:
 				break;
 			}
 			pstr += codepoint_size;
-			wsrc++;
 		}
 		*pstr = 0;
-		return pstr - str;
+		return (int)(pstr - str);
 	}
 	// UTF-8 to UTF-16/UTF-32
 	// return: string length
@@ -107,9 +194,11 @@ public:
 		const char* p = src;
 		wchar_t* wp = wstr;
 		while(*p != 0) {
-			const unsigned cur = *p & 0xffU;
+			unsigned int cur = ConvertUTF8(p);
 			int codepoint_size = 0;
-			if ((cur & 0xf8) == 0xf0) {
+			if (!IsUnicodeChar(cur))
+				continue;
+			if (cur >= 0x10000) {
 				if (sizeof(wchar_t) == 2)
 					codepoint_size = 2;
 				else
@@ -117,30 +206,18 @@ public:
 			}
 			else
 				codepoint_size = 1;
-			if (wp - wstr + codepoint_size > size - 1)
+			if ((int)(wp - wstr) + codepoint_size > size - 1)
 				break;
-			if((cur & 0x80) == 0) {
-				*wp = *p;
-				p++;
-			} else if((cur & 0xe0) == 0xc0) {
-				*wp = ((p[0] & 0x1fU) << 6) | (p[1] & 0x3fU);
-				p += 2;
-			} else if((cur & 0xf0) == 0xe0) {
-				*wp = ((p[0] & 0xfU) << 12) | ((p[1] & 0x3fU) << 6) | (p[2] & 0x3fU);
-				p += 3;
-			} else if((cur & 0xf8) == 0xf0) {
-				if (sizeof(wchar_t) == 2) {
-					unsigned unicode = ((p[0] & 0x7U) << 18) | ((p[1] & 0x3fU) << 12) | ((p[2] & 0x3fU) << 6) | (p[3] & 0x3fU);
-					unicode -= 0x10000;
-					*wp++ = (unicode >> 10) | 0xd800;
-					*wp = (unicode & 0x3ff) | 0xdc00;
-				} else {
-					*wp = ((p[0] & 0x7U) << 18) | ((p[1] & 0x3fU) << 12) | ((p[2] & 0x3fU) << 6) | (p[3] & 0x3fU);
-				}
-				p += 4;
-			} else
-				p++;
-			wp++;
+			if (codepoint_size == 1) {
+				wp[0] = cur;
+				wp++;
+			}
+			else {
+				cur -= 0x10000U;
+				wp[0] = (cur >> 10) | 0xd800;
+				wp[1] = (cur & 0x3ff) | 0xdc00;
+				wp += 2;
+			}
 		}
 		*wp = 0;
 		return wp - wstr;