From 40dbcf08a93760e64d9b3f72bc4df3cad6d8627e Mon Sep 17 00:00:00 2001 From: jief Date: Mon, 24 Feb 2020 16:07:24 +0300 Subject: [PATCH] UTF8 conversion for XStringW --- rEFIt_UEFI/cpp_foundation/XObjArray.h | 4 +- rEFIt_UEFI/cpp_foundation/XStringW.cpp | 35 +++- rEFIt_UEFI/cpp_foundation/XStringW.h | 18 +- rEFIt_UEFI/cpp_foundation/printf_lite.cpp | 1 + rEFIt_UEFI/cpp_foundation/printf_lite.h | 1 + rEFIt_UEFI/cpp_foundation/utf8Conversion.cpp | 185 +++++++++++++++++++ rEFIt_UEFI/cpp_foundation/utf8Conversion.h | 14 ++ rEFIt_UEFI/cpp_unit_test/XStringW_test.cpp | 65 +++++-- 8 files changed, 285 insertions(+), 38 deletions(-) create mode 120000 rEFIt_UEFI/cpp_foundation/printf_lite.cpp create mode 120000 rEFIt_UEFI/cpp_foundation/printf_lite.h create mode 100755 rEFIt_UEFI/cpp_foundation/utf8Conversion.cpp create mode 100644 rEFIt_UEFI/cpp_foundation/utf8Conversion.h diff --git a/rEFIt_UEFI/cpp_foundation/XObjArray.h b/rEFIt_UEFI/cpp_foundation/XObjArray.h index a3686628f..6409542cf 100755 --- a/rEFIt_UEFI/cpp_foundation/XObjArray.h +++ b/rEFIt_UEFI/cpp_foundation/XObjArray.h @@ -51,8 +51,8 @@ class XObjArrayNC virtual ~XObjArrayNC(); protected: - XObjArrayNC(const XObjArrayNC &anObjArrayNC) { throw "Intentionally not defined"; } - const XObjArrayNC &operator =(const XObjArrayNC &anObjArrayNC) { throw "Intentionally not defined"; } + XObjArrayNC(const XObjArrayNC &anObjArrayNC) { DebugLog(2, "Intentionally not defined"); CpuDeadLoop(); } + const XObjArrayNC &operator =(const XObjArrayNC &anObjArrayNC) { DebugLog(2, "Intentionally not defined"); CpuDeadLoop(); } xsize _getLen() const { return _Len; } public: diff --git a/rEFIt_UEFI/cpp_foundation/XStringW.cpp b/rEFIt_UEFI/cpp_foundation/XStringW.cpp index da34bea75..c28fb20dc 100755 --- a/rEFIt_UEFI/cpp_foundation/XStringW.cpp +++ b/rEFIt_UEFI/cpp_foundation/XStringW.cpp @@ -21,12 +21,8 @@ #include "XToolsCommon.h" #include "XStringW.h" -//extern "C" { -// #include -// #include -//} #include -//#include "refit/IO.h" 
+#include "printf_lite.h" UINTN XStringWGrowByDefault = 1024; const XStringW NullXStringW; @@ -83,6 +79,14 @@ DBG("Constructor(const wchar_t aChar)\n"); StrnCpy(&aChar, 1); } +XStringW::XStringW(const char* S) +{ +DBG("Constructor(const char* S)\n"); + xsize newLen = StrLenInWChar(S, AsciiStrLen(S)); + Init(newLen); + utf8ToWChar(m_data, m_size+1, S, AsciiStrLen(S)); // m_size doesn't count the NULL terminator +} + //xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx // Destructor //xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx @@ -220,8 +224,21 @@ XStringW XStringW::SubStringReplace(wchar_t c1, wchar_t c2) return Result; } -void XStringW::vSPrintf(const wchar_t *format, VA_LIST va) +static XStringW* sprintfBuf; + +void transmitSprintf(const wchar_t* buf, size_t nbyte) { + (*sprintfBuf).StrnCat(buf, nbyte); +} + +void XStringW::vSPrintf(const char* format, VA_LIST va) +{ + SetLength(0); + + sprintfBuf = this; + vprintf_with_callback(format, va, transmitSprintf); + + // This is an attempt to use _PPrint from IO.c. Problem is : you have to allocate the memory BEFORE calling it. // POOL_PRINT spc; // PRINT_STATE ps; // @@ -240,11 +257,11 @@ void XStringW::vSPrintf(const wchar_t *format, VA_LIST va) // VA_END(ps.args); } -void XStringW::SPrintf(const wchar_t *format, ...) +void XStringW::SPrintf(const char* format, ...) { VA_LIST va; - VA_START (va, format); + VA_START (va, format); vSPrintf(format, va); VA_END(va); } @@ -464,7 +481,7 @@ const XStringW &XStringW::operator +=(const wchar_t *S) // Functions //----------------------------------------------------------------------------- -XStringW SPrintf(const wchar_t *format, ...) +XStringW SPrintf(const char* format, ...) 
{ VA_LIST va; XStringW str; diff --git a/rEFIt_UEFI/cpp_foundation/XStringW.h b/rEFIt_UEFI/cpp_foundation/XStringW.h index 94642ee21..298e52202 100755 --- a/rEFIt_UEFI/cpp_foundation/XStringW.h +++ b/rEFIt_UEFI/cpp_foundation/XStringW.h @@ -11,11 +11,7 @@ #include "XToolsCommon.h" #include -//#include "XConstStringW.h" - -//extern "C" { -// #include -//} +#include "utf8Conversion.h" #define LPATH_SEPARATOR L'\\' @@ -37,6 +33,8 @@ public: XStringW(const wchar_t* S, UINTN count); XStringW(const wchar_t); + XStringW(const char*); + ~XStringW(); protected: @@ -89,8 +87,8 @@ public: void Insert(UINTN pos, const XStringW& Str); - void vSPrintf(const wchar_t *format, VA_LIST va); - void SPrintf(const wchar_t *format, ...); + void vSPrintf(const char* format, VA_LIST va); + void SPrintf(const char* format, ...) __attribute__ ((__format__ (__printf__, 2, 3))); const XStringW &operator =(const XStringW &aString); const XStringW &operator =(const wchar_t* S); @@ -101,7 +99,7 @@ public: const XStringW &operator += (wchar_t); XStringW SubString(UINTN pos, UINTN count) const; - UINTN IdxOf(wchar_t c, UINTN Pos = 0) const; + UINTN IdxOf(wchar_t c, UINTN Pos = 0) const; UINTN IdxOf(const XStringW& S, UINTN Pos = 0) const; UINTN RIdxOf(const wchar_t c, UINTN Pos = MAX_XSIZE) const; UINTN RIdxOf(const XStringW& S, UINTN Pos = MAX_XSIZE) const; @@ -116,7 +114,7 @@ public: void Replace(wchar_t c1, wchar_t c2); XStringW SubStringReplace(wchar_t c1, wchar_t c2); - int Compare(const wchar_t* S) const { return StrCmp(data(), S) ; } + int Compare(const wchar_t* S) const { return (int)StrCmp(data(), S) ; } bool Equal(const wchar_t* S) const { return Compare(S) == 0; }; bool BeginingEqual(const wchar_t* S) const { return StrnCmp(data(), S, StrLen(S)); } @@ -174,7 +172,7 @@ public: //extern const XStringW NullXStringW; -XStringW SPrintf(const wchar_t *format, ...); +XStringW SPrintf(const char* format, ...) 
__attribute__ ((__format__ (__printf__, 1, 2))); XStringW SubString(const wchar_t *S, UINTN pos, UINTN count); XStringW CleanCtrl(const XStringW &S); diff --git a/rEFIt_UEFI/cpp_foundation/printf_lite.cpp b/rEFIt_UEFI/cpp_foundation/printf_lite.cpp new file mode 120000 index 000000000..b991930c9 --- /dev/null +++ b/rEFIt_UEFI/cpp_foundation/printf_lite.cpp @@ -0,0 +1 @@ +/JiefLand/5.Devel/Embedded/Shared/printf_lite-master/printf_lite.cpp \ No newline at end of file diff --git a/rEFIt_UEFI/cpp_foundation/printf_lite.h b/rEFIt_UEFI/cpp_foundation/printf_lite.h new file mode 120000 index 000000000..3a25de4f8 --- /dev/null +++ b/rEFIt_UEFI/cpp_foundation/printf_lite.h @@ -0,0 +1 @@ +/JiefLand/5.Devel/Embedded/Shared/printf_lite-master/printf_lite.h \ No newline at end of file diff --git a/rEFIt_UEFI/cpp_foundation/utf8Conversion.cpp b/rEFIt_UEFI/cpp_foundation/utf8Conversion.cpp new file mode 100755 index 000000000..b3018d562 --- /dev/null +++ b/rEFIt_UEFI/cpp_foundation/utf8Conversion.cpp @@ -0,0 +1,185 @@ +// +// utf8Conversion.hpp +// +// Created by jief the 24 Feb 2020. +// + +#include "utf8Conversion.h" + +#include + + +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +#define uint16_t UINT16 +#define uint32_t UINT32 +#define size_t UINTN + +// Counts the wchar_t units (terminator excluded) that utf8ToWChar() needs for the NUL-terminated UTF-8 string s; a code point above 0xFFFF counts as 2 units when wchar_t is 16 bits. src_len is currently unused. +size_t StrLenInWChar(const char *s, size_t src_len) +{ + size_t dst_len = 0; + + while ( *s ) { + char32_t c; + if (*s & 0x80) { + if (*(s+1) == 0) { + // Finished in the middle of an utf8 multibyte char + return dst_len; + } + if ((*s & 0xc0) != 0xc0 || (*(s+1) & 0xc0) != 0x80) { // stray continuation byte used as a lead byte, or invalid 2nd byte : skip one byte + s += 1; + continue; + } + if ((*s & 0xe0) == 0xe0) { + if (*(s+2) == 0) { + // Finished in the middle of an utf8 multibyte char + return dst_len; + } + if ((*(s+2) & 0xc0) != 0x80) { + s += 2; + continue; + } + if ((*s & 0xf0) == 0xf0) { + if (*(s+3) == 0) { + // Finished in the middle of an utf8 multibyte char + return dst_len; + } + if ((*s & 0xf8) != 0xf0 || (*(s+3) & 0xc0) != 0x80) { + s += 3; + continue; + } + /* 4-byte code */ + c = (*s & 0x7) << 18; + c |= (*(s+1) & 0x3f) << 12; + c |= (*(s+2) & 0x3f) << 6; + c |= *(s+3) & 0x3f; + s += 4; + } else { + /* 3-byte code */ + c = (*s & 0xf) << 12; + c |= (*(s+1) & 0x3f) << 6; + c |= *(s+2) & 0x3f; + s += 3; + } + } else { + /* 2-byte code */ + c = (*s & 0x1f) << 6; + c |= *(s+1) & 0x3f; + s += 2; + } + } else { + /* 1-byte code */ + c = *s; + s += 1; + } +#if __WCHAR_MAX__ > 0xFFFFu + dst_len++; +#else + if ( c <= 0xFFFF) { + dst_len++; + } else { + dst_len++; + dst_len++; + } +#endif + } + return dst_len; +} + + + + + + +#define halfBase 0x0010000UL +#define halfMask 0x3FFUL +#define halfShift 10 /* used for shifting by 10 bits */ +#define UNI_SUR_HIGH_START 0xD800u +#define UNI_SUR_LOW_START 0xDC00u + +// Converts the NUL-terminated UTF-8 string s into dst, as UTF-32 or, when wchar_t is 16 bits, as UTF-16 with surrogate pairs. +// dst_max_len is the capacity of dst in wchar_t units, terminator included : the output is truncated to fit and is always NUL-terminated (nothing is written when dst_max_len is 0). src_len is currently unused. +void utf8ToWChar(wchar_t* dst, size_t dst_max_len, const char *s, size_t src_len) +{ + if ( dst_max_len == 0 ) return; + dst_max_len -= 1; + + size_t dst_len = 0; + + while ( *s && dst_len < dst_max_len ) { // the bound also covers a 1-slot buffer, which only has room for the terminator + char32_t c; + if (*s & 0x80) { + if (*(s+1) == 0) { + // Finished in the middle of an utf8 multibyte char + goto exit; + } + if ((*s & 0xc0) != 0xc0 || (*(s+1) & 0xc0) != 0x80) { // must skip exactly what StrLenInWChar skips : stray continuation byte as lead, or invalid 2nd byte + s += 1; + continue; + } + if ((*s & 0xe0) == 0xe0) { + if (*(s+2) == 0) { + // Finished in the middle of an utf8 
multibyte char + goto exit; + } + if ((*(s+2) & 0xc0) != 0x80) { + s += 2; + continue; + } + if ((*s & 0xf0) == 0xf0) { + if (*(s+3) == 0) { + // Finished in the middle of an utf8 multibyte char + goto exit; + } + if ((*s & 0xf8) != 0xf0 || (*(s+3) & 0xc0) != 0x80) { + s += 3; + continue; + } + /* 4-byte code */ + c = (*s & 0x7) << 18; + c |= (*(s+1) & 0x3f) << 12; + c |= (*(s+2) & 0x3f) << 6; + c |= *(s+3) & 0x3f; + s += 4; + } else { + /* 3-byte code */ + c = (*s & 0xf) << 12; + c |= (*(s+1) & 0x3f) << 6; + c |= *(s+2) & 0x3f; + s += 3; + } + } else { + /* 2-byte code */ + c = (*s & 0x1f) << 6; + c |= *(s+1) & 0x3f; + s += 2; + } + } else { + /* 1-byte code */ + c = *s; + s += 1; + } +#if __WCHAR_MAX__ > 0xFFFFu + dst[dst_len++] = c; + if ( dst_len == dst_max_len ) goto exit; +#else + if ( c <= 0xFFFF) { + dst[dst_len++] = c; + if ( dst_len == dst_max_len ) goto exit; + } else { + if ( dst_max_len - dst_len < 2 ) goto exit; // no room for both halves : stop rather than emit a lone high surrogate + c -= halfBase; + dst[dst_len++] = (wchar_t)((c >> halfShift) + UNI_SUR_HIGH_START); + dst[dst_len++] = (wchar_t)((c & halfMask) + UNI_SUR_LOW_START); + if ( dst_len == dst_max_len ) goto exit; + } +#endif + } +exit: + dst[dst_len] = 0; +} + + diff --git a/rEFIt_UEFI/cpp_foundation/utf8Conversion.h b/rEFIt_UEFI/cpp_foundation/utf8Conversion.h new file mode 100644 index 000000000..bcb491c02 --- /dev/null +++ b/rEFIt_UEFI/cpp_foundation/utf8Conversion.h @@ -0,0 +1,14 @@ +// +// utf8Conversion.hpp +// +// Created by jief the 24 Feb 2020. 
+// + +#ifndef utf816Conversion_hpp +#define utf816Conversion_hpp + + +UINTN StrLenInWChar(const char *src, UINTN src_len); +void utf8ToWChar(wchar_t* dst, UINTN dst_max_len, const char *s, UINTN src_len); + +#endif /* utf816Conversion_hpp */ diff --git a/rEFIt_UEFI/cpp_unit_test/XStringW_test.cpp b/rEFIt_UEFI/cpp_unit_test/XStringW_test.cpp index fa2626105..ba12a578f 100644 --- a/rEFIt_UEFI/cpp_unit_test/XStringW_test.cpp +++ b/rEFIt_UEFI/cpp_unit_test/XStringW_test.cpp @@ -1,6 +1,10 @@ #include "../cpp_foundation/XStringW.h" #include "global1.h" #include "global2.h" +#include "../cpp_foundation/utf8Conversion.h" + + +//#include int XStringW_tests() @@ -11,25 +15,52 @@ int XStringW_tests() #endif if ( global_str1 != L"global_str1" ) return 1; - if ( global_str2 != L"global_str2" ) return 1; + if ( global_str2 != L"global_str2" ) return 2; + + XStringW str(L"1"); + if ( str != L"1" ) return 3; + str.StrCat(L"2"); + if ( str != L"12" ) return 4; + + XStringW str2; + if ( str2.NotNull() ) return 10; + str2.StrnCpy(str.data(), 2); + if ( str2 != L"12" ) return 11; + str2.StrnCat(L"345", 2); + if ( str2 != L"1234" ) return 12; + str2.Insert(1, str); + if ( str2 != L"112234" ) return 13; + str2 += L"6"; + if ( str2 != L"1122346" ) return 14; + +//wchar_t c2 = L'Ň'; +//printf("1=%lc\n", c2); +//const char* s1 = "𐌾"; + + str2.SPrintf("%c", 'a'); // single UTF8 ascii char + if ( str2 != L"a" ) return 20; + str2.SPrintf("%ls", L"ab"); // UTF16(32) string containing ascii char + if ( str2 != L"ab" ) return 21; + + str2.SPrintf("%lc", L'Ň'); // single UTF16(32) char. (2 bytes in total if UTF16) + if ( str2 != L"Ň" ) return 22; + str2.SPrintf("%s", "Ň"); // this is a UTF8 string 2 bytes long + if ( str2 != L"Ň" ) return 23; + +#if __WCHAR_MAX__ > 0xFFFFu + str2.SPrintf("%lc", L'𐌾'); // L'𐌾' // this char cannot be converted to a single UTF16 char. 
So it doesn't compile with -fshort-wchar + if ( str2 != L'𐌾' ) return 30; +#endif + str2.SPrintf("%ls", L"𐌾"); // NOTE: this passes a wide literal, same as the next test; a UTF8 string 4 bytes long would be "%s" with "𐌾" + if ( str2 != L"𐌾" ) return 31; + str2.SPrintf("%ls", L"𐌾"); // this is a UTF16 or UTF32 string (depending on -fshort-wchar) + if ( str2 != L"𐌾" ) return 32; { - XStringW str(L"1"); - if ( str != L"1" ) return 1; - str.StrCat(L"2"); - if ( str != L"12" ) return 1; - - XStringW str2; - if ( str2.NotNull() ) return 10; - str2.StrnCpy(str.data(), 2); - if ( str2 != L"12" ) return 11; - str2.StrnCat(L"345", 2); - if ( str2 != L"1234" ) return 12; - str2.Insert(1, str); - if ( str2 != L"112234" ) return 13; - str2 += L"6"; - if ( str2 != L"1122346" ) return 14; + XStringW str3("a"); + if ( str3 != L"a" ) return 40; + XStringW str4("aŇ𐌾"); + if ( str4 != L"aŇ𐌾" ) return 41; } - return 0; }