From 42f7fc1876cbe68569771b97a8935fbca7fa3ee4 Mon Sep 17 00:00:00 2001 From: crupest Date: Mon, 7 Jun 2021 20:42:28 +0800 Subject: import(life): ... --- .../computer-network-experiment/StringUtil.cpp | 120 +++++++++++++-------- 1 file changed, 75 insertions(+), 45 deletions(-) (limited to 'works/life/computer-network-experiment/StringUtil.cpp') diff --git a/works/life/computer-network-experiment/StringUtil.cpp b/works/life/computer-network-experiment/StringUtil.cpp index 1224bdc..6bf906d 100644 --- a/works/life/computer-network-experiment/StringUtil.cpp +++ b/works/life/computer-network-experiment/StringUtil.cpp @@ -5,14 +5,14 @@ namespace cru { namespace { template -inline std::enable_if_t, ReturnType> ExtractBits( - UInt n) { +inline std::enable_if_t, ReturnType> +ExtractBits(UInt n) { return static_cast(n & ((1u << number_of_bit) - 1)); } -} // namespace +} // namespace CodePoint Utf8NextCodePoint(std::string_view str, Index current, - Index* next_position) { + Index *next_position) { CodePoint result; if (current >= static_cast(str.length())) { @@ -37,9 +37,9 @@ CodePoint Utf8NextCodePoint(std::string_view str, Index current, }; if ((1u << 7) & cu0) { - if ((1u << 6) & cu0) { // 2~4-length code point - if ((1u << 5) & cu0) { // 3~4-length code point - if ((1u << 4) & cu0) { // 4-length code point + if ((1u << 6) & cu0) { // 2~4-length code point + if ((1u << 5) & cu0) { // 3~4-length code point + if ((1u << 4) & cu0) { // 4-length code point if (cu0 & (1u << 3)) { throw TextEncodeException( "Unexpected bad-format begin byte (not 0b11110xxx) of 4-byte" @@ -52,14 +52,14 @@ CodePoint Utf8NextCodePoint(std::string_view str, Index current, const CodePoint s2 = read_next_folowing_code() << 6; const CodePoint s3 = read_next_folowing_code(); result = s0 + s1 + s2 + s3; - } else { // 3-length code point + } else { // 3-length code point const CodePoint s0 = ExtractBits(cu0) << (6 * 2); const CodePoint s1 = read_next_folowing_code() << 6; const CodePoint s2 = read_next_folowing_code(); result = s0 + s1 + s2; } - } else { // 2-length code point + } else { // 2-length code point const CodePoint s0 = ExtractBits(cu0) << 6; const CodePoint s1 = read_next_folowing_code(); @@ -74,12 +74,13 @@ CodePoint Utf8NextCodePoint(std::string_view str, Index current, } } - if (next_position != nullptr) *next_position = current; + if (next_position != nullptr) + *next_position = current; return result; } CodePoint Utf16NextCodePoint(std::u16string_view str, Index current, - Index* next_position) { + Index *next_position) { CodePoint result; if (current >= static_cast(str.length())) { @@ -87,9 +88,9 @@ CodePoint Utf16NextCodePoint(std::u16string_view str, Index current, } else { const auto cu0 = str[current++]; - if (!IsUtf16SurrogatePairCodeUnit(cu0)) { // 1-length code point + if (!IsUtf16SurrogatePairCodeUnit(cu0)) { // 1-length code point result = static_cast(cu0); - } else if (IsUtf16SurrogatePairLeading(cu0)) { // 2-length code point + } else if (IsUtf16SurrogatePairLeading(cu0)) { // 2-length code point if (current >= static_cast(str.length())) { throw TextEncodeException( "Unexpected end when reading second code unit of surrogate pair."); @@ -112,21 +113,22 @@ CodePoint Utf16NextCodePoint(std::u16string_view str, Index current, } } - if (next_position != nullptr) *next_position = current; + if (next_position != nullptr) + *next_position = current; return result; } CodePoint Utf16PreviousCodePoint(std::u16string_view str, Index current, - Index* previous_position) { + Index *previous_position) { CodePoint result; if (current <= 0) { result = k_invalid_code_point; } else { const auto cu0 = str[--current]; - if (!IsUtf16SurrogatePairCodeUnit(cu0)) { // 1-length code point + if (!IsUtf16SurrogatePairCodeUnit(cu0)) { // 1-length code point result = static_cast(cu0); - } else if (IsUtf16SurrogatePairTrailing(cu0)) { // 2-length code point + } else if (IsUtf16SurrogatePairTrailing(cu0)) { // 2-length code point if (current <= 0) { throw TextEncodeException( "Unexpected end when reading first code unit of surrogate pair."); @@ -149,11 +151,12 @@ CodePoint Utf16PreviousCodePoint(std::u16string_view str, Index current, } } - if (previous_position != nullptr) *previous_position = current; + if (previous_position != nullptr) + *previous_position = current; return result; } -void Utf8EncodeCodePointAppend(CodePoint code_point, std::string& str) { +void Utf8EncodeCodePointAppend(CodePoint code_point, std::string &str) { auto write_continue_byte = [&str](std::uint8_t byte6) { str.push_back((1u << 7) + (((1u << 6) - 1) & byte6)); }; @@ -192,7 +195,7 @@ void Utf8EncodeCodePointAppend(CodePoint code_point, std::string& str) { } } -void Utf16EncodeCodePointAppend(CodePoint code_point, std::u16string& str) { +void Utf16EncodeCodePointAppend(CodePoint code_point, std::u16string &str) { if ((code_point >= 0 && code_point <= 0xD7FF) || (code_point >= 0xE000 && code_point <= 0xFFFF)) { str.push_back(static_cast(code_point)); @@ -224,6 +227,15 @@ std::u16string ToUtf16(std::string_view s) { } #ifdef WIN32 +std::string ToUtf8(std::wstring_view s) { + std::u16string_view string{reinterpret_cast(s.data()), + s.size()}; + std::string result; + for (CodePoint cp : Utf16CodePointIterator{string}) { + Utf8EncodeCodePointAppend(cp, result); + } + return result; +} std::wstring ToUtf16WString(std::string_view s) { std::u16string result; for (CodePoint cp : Utf8CodePointIterator{s}) { @@ -236,33 +248,43 @@ std::wstring ToUtf16WString(std::string_view s) { #endif bool Utf16IsValidInsertPosition(std::u16string_view s, gsl::index position) { - if (position < 0) return false; - if (position > static_cast(s.size())) return false; - if (position == 0) return true; - if (position == static_cast(s.size())) return true; + if (position < 0) + return false; + if (position > static_cast(s.size())) + return false; + if (position == 0) + return true; + if (position == static_cast(s.size())) + return true; return !IsUtf16SurrogatePairTrailing(s[position]); } gsl::index Utf16BackwardUntil(std::u16string_view str, gsl::index position, - const std::function& predicate) { - if (position <= 0) return position; + const std::function &predicate) { + if (position <= 0) + return position; while (true) { gsl::index p = position; auto c = Utf16PreviousCodePoint(str, p, &position); - if (predicate(c)) return p; - if (c == k_invalid_code_point) return p; + if (predicate(c)) + return p; + if (c == k_invalid_code_point) + return p; } UnreachableCode(); } gsl::index Utf16ForwardUntil(std::u16string_view str, gsl::index position, - const std::function& predicate) { - if (position >= static_cast(str.size())) return position; + const std::function &predicate) { + if (position >= static_cast(str.size())) + return position; while (true) { gsl::index p = position; auto c = Utf16NextCodePoint(str, p, &position); - if (predicate(c)) return p; - if (c == k_invalid_code_point) return p; + if (predicate(c)) + return p; + if (c == k_invalid_code_point) + return p; } UnreachableCode(); } @@ -270,29 +292,35 @@ gsl::index Utf16ForwardUntil(std::u16string_view str, gsl::index position, inline bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; } gsl::index Utf16PreviousWord(std::u16string_view str, gsl::index position, - bool* is_space) { - if (position <= 0) return position; + bool *is_space) { + if (position <= 0) + return position; auto c = Utf16PreviousCodePoint(str, position, nullptr); - if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). - if (is_space) *is_space = true; + if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). + if (is_space) + *is_space = true; return Utf16BackwardUntil(str, position, [](CodePoint c) { return !IsSpace(c); }); } else { - if (is_space) *is_space = false; + if (is_space) + *is_space = false; return Utf16BackwardUntil(str, position, IsSpace); } } gsl::index Utf16NextWord(std::u16string_view str, gsl::index position, - bool* is_space) { - if (position >= static_cast(str.size())) return position; + bool *is_space) { + if (position >= static_cast(str.size())) + return position; auto c = Utf16NextCodePoint(str, position, nullptr); - if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). - if (is_space) *is_space = true; + if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). + if (is_space) + *is_space = true; return Utf16ForwardUntil(str, position, [](CodePoint c) { return !IsSpace(c); }); } else { - if (is_space) *is_space = false; + if (is_space) + *is_space = false; return Utf16ForwardUntil(str, position, IsSpace); } } @@ -313,13 +341,15 @@ char16_t ToUpper(char16_t c) { std::u16string ToLower(std::u16string_view s) { std::u16string result; - for (auto c : s) result.push_back(ToLower(c)); + for (auto c : s) + result.push_back(ToLower(c)); return result; } std::u16string ToUpper(std::u16string_view s) { std::u16string result; - for (auto c : s) result.push_back(ToUpper(c)); + for (auto c : s) + result.push_back(ToUpper(c)); return result; } -} // namespace cru +} // namespace cru -- cgit v1.2.3