use wcsrtombs in UTF-8 encoding (#2860)

* use wcsrtombs in UTF-8 encoding
* fix underflow
* static_cast

use wcsrtombs in UTF-8 encoding (#2860)
* use wcsrtombs in UTF-8 encoding
* fix underflow
* static_cast
a58e7c31 · salix5 · GitHub · a3d10c01 · a58e7c31
Commit a58e7c31 authored Jul 07, 2025 by salix5 Committed by GitHub Jul 07, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 20 additions and 150 deletions

gframe/bufferio.h gframe/bufferio.h +20 -150

No files found.
--- a/gframe/bufferio.h
+++ b/gframe/bufferio.h
@@ -91,56 +91,6 @@ public:
 		std::wcsncpy(dst, src, N - 1);
 		dst[N - 1] = 0;
 	}
-	template<typename T>
-	static bool CheckUTF8Byte(const T* str, int len) {
-		for (int i = 1; i < len; ++i) {
-			if ((str[i] & 0xc0U) != 0x80U)
-				return false;
-		}
-		return true;
-	}
-	static unsigned int ConvertUTF8(const char*& p) {
-		unsigned int cur = 0;
-		if ((p[0] & 0x80U) == 0) {
-			cur = p[0] & 0xffU;
-			p++;
-		}
-		else if ((p[0] & 0xe0U) == 0xc0U) {
-			if (!CheckUTF8Byte(p, 2)) {
-				p++;
-				return UINT32_MAX;
-			}
-			cur = ((p[0] & 0x1fU) << 6) | (p[1] & 0x3fU);
-			p += 2;
-			if(cur < 0x80U)
-				return UINT32_MAX;
-		}
-		else if ((p[0] & 0xf0U) == 0xe0U) {
-			if (!CheckUTF8Byte(p, 3)) {
-				p++;
-				return UINT32_MAX;
-			}
-			cur = ((p[0] & 0xfU) << 12) | ((p[1] & 0x3fU) << 6) | (p[2] & 0x3fU);
-			p += 3;
-			if (cur < 0x800U)
-				return UINT32_MAX;
-		}
-		else if ((p[0] & 0xf8U) == 0xf0U) {
-			if (!CheckUTF8Byte(p, 4)) {
-				p++;
-				return UINT32_MAX;
-			}
-			cur = ((p[0] & 0x7U) << 18) | ((p[1] & 0x3fU) << 12) | ((p[2] & 0x3fU) << 6) | (p[3] & 0x3fU);
-			p += 4;
-			if (cur < 0x10000U)
-				return UINT32_MAX;
-		}
-		else {
-			p++;
-			return UINT32_MAX;
-		}
-		return cur;
-	}
 	static bool IsHighSurrogate(unsigned int c) {
 		return (c >= 0xd800U && c <= 0xdbffU);
 	}
@@ -158,111 +108,31 @@ public:
 	}
 	// UTF-16/UTF-32 to UTF-8
 	// return: string length
-	static int EncodeUTF8String(const wchar_t* wsrc, char* str, int size) {
+	static int EncodeUTF8String(const wchar_t* wsrc, char* str, size_t len) {
-		auto pw = wsrc;
+		if (len == 0) {
-		auto pstr = str;
+			str[0] = 0;
-		while (*pw != 0) {
+			return 0;
-			unsigned cur = 0;
-			int codepoint_size = 0;
-			if (sizeof(wchar_t) == 2) {
-				if (IsHighSurrogate(pw[0])) {
-					if (pw[1] == 0)
-						break;
-					if (IsLowSurrogate(pw[1])) {
-						cur = ((pw[0] & 0x3ffU) << 10) | (pw[1] & 0x3ffU);
-						cur += 0x10000;
-						pw += 2;
-					}
-					else {
-						pw++;
-						continue;
-					}
-				}
-				else if (IsLowSurrogate(pw[0])) {
-					pw++;
-					continue;
-				}
-				else {
-					cur = *pw;
-					pw++;
-				}
-			}
-			else {
-				cur = *pw;
-				pw++;
-			}
-			if (!IsUnicodeChar(cur))
-				continue;
-			if (cur < 0x80U)
-				codepoint_size = 1;
-			else if (cur < 0x800U)
-				codepoint_size = 2;
-			else if (cur < 0x10000U)
-				codepoint_size = 3;
-			else
-				codepoint_size = 4;
-			if ((int)(pstr - str) + codepoint_size > size - 1)
-				break;
-			switch (codepoint_size) {
-			case 1:
-				*pstr = (char)cur;
-				break;
-			case 2:
-				pstr[0] = ((cur >> 6) & 0x1f) | 0xc0;
-				pstr[1] = (cur & 0x3f) | 0x80;
-				break;
-			case 3:
-				pstr[0] = ((cur >> 12) & 0xf) | 0xe0;
-				pstr[1] = ((cur >> 6) & 0x3f) | 0x80;
-				pstr[2] = (cur & 0x3f) | 0x80;
-				break;
-			case 4:
-				pstr[0] = ((cur >> 18) & 0x7) | 0xf0;
-				pstr[1] = ((cur >> 12) & 0x3f) | 0x80;
-				pstr[2] = ((cur >> 6) & 0x3f) | 0x80;
-				pstr[3] = (cur & 0x3f) | 0x80;
-				break;
-			default:
-				break;
-			}
-			pstr += codepoint_size;
 		}
-		*pstr = 0;
+		std::mbstate_t state{};
-		return (int)(pstr - str);
+		size_t result_len = std::wcsrtombs(str, &wsrc, len - 1, &state);
+		if (result_len == static_cast<size_t>(-1))
+			result_len = 0;
+		str[result_len] = 0;
+		return static_cast<int>(result_len);
 	}
 	// UTF-8 to UTF-16/UTF-32
 	// return: string length
-	static int DecodeUTF8String(const char* src, wchar_t* wstr, int size) {
+	static int DecodeUTF8String(const char* src, wchar_t* wstr, size_t len) {
-		const char* p = src;
+		if (len == 0) {
-		wchar_t* wp = wstr;
+			wstr[0] = 0;
-		while(*p != 0) {
+			return 0;
-			unsigned int cur = ConvertUTF8(p);
-			int codepoint_size = 0;
-			if (!IsUnicodeChar(cur))
-				continue;
-			if (cur >= 0x10000) {
-				if (sizeof(wchar_t) == 2)
-					codepoint_size = 2;
-				else
-					codepoint_size = 1;
-			}
-			else
-				codepoint_size = 1;
-			if ((int)(wp - wstr) + codepoint_size > size - 1)
-				break;
-			if (codepoint_size == 1) {
-				wp[0] = cur;
-				wp++;
-			}
-			else {
-				cur -= 0x10000U;
-				wp[0] = (cur >> 10) | 0xd800;
-				wp[1] = (cur & 0x3ff) | 0xdc00;
-				wp += 2;
-			}
 		}
-		*wp = 0;
+		std::mbstate_t state{};
-		return wp - wstr;
+		size_t result_len = std::mbsrtowcs(wstr, &src, len - 1, &state);
+		if (result_len == static_cast<size_t>(-1))
+			result_len = 0;
+		wstr[result_len] = 0;
+		return static_cast<int>(result_len);
 	}
 	template<size_t N>
 	static int EncodeUTF8(const wchar_t* src, char(&dst)[N]) {