From dc1f0c4c0096013799416664894c5194dc7e1f52 Mon Sep 17 00:00:00 2001 From: Yuqian Yang Date: Fri, 28 Feb 2025 23:13:39 +0800 Subject: chore(store): move everything to store. --- .../computer-network-experiment/StringUtil.cpp | 355 --------------------- 1 file changed, 355 deletions(-) delete mode 100644 works/life/computer-network-experiment/StringUtil.cpp (limited to 'works/life/computer-network-experiment/StringUtil.cpp') diff --git a/works/life/computer-network-experiment/StringUtil.cpp b/works/life/computer-network-experiment/StringUtil.cpp deleted file mode 100644 index 6bf906d..0000000 --- a/works/life/computer-network-experiment/StringUtil.cpp +++ /dev/null @@ -1,355 +0,0 @@ -#include "StringUtil.hpp" -#include "Base.hpp" -#include - -namespace cru { -namespace { -template -inline std::enable_if_t, ReturnType> -ExtractBits(UInt n) { - return static_cast(n & ((1u << number_of_bit) - 1)); -} -} // namespace - -CodePoint Utf8NextCodePoint(std::string_view str, Index current, - Index *next_position) { - CodePoint result; - - if (current >= static_cast(str.length())) { - result = k_invalid_code_point; - } else { - const auto cu0 = static_cast(str[current++]); - - auto read_next_folowing_code = [&str, ¤t]() -> CodePoint { - if (current == static_cast(str.length())) - throw TextEncodeException( - "Unexpected end when read continuing byte of multi-byte code " - "point."); - - const auto u = static_cast(str[current]); - if (!(u & (1u << 7)) || (u & (1u << 6))) { - throw TextEncodeException( - "Unexpected bad-format (not 0b10xxxxxx) continuing byte of " - "multi-byte code point."); - } - - return ExtractBits(str[current++]); - }; - - if ((1u << 7) & cu0) { - if ((1u << 6) & cu0) { // 2~4-length code point - if ((1u << 5) & cu0) { // 3~4-length code point - if ((1u << 4) & cu0) { // 4-length code point - if (cu0 & (1u << 3)) { - throw TextEncodeException( - "Unexpected bad-format begin byte (not 0b11110xxx) of 4-byte" - "code point."); - } - - const CodePoint s0 = ExtractBits(cu0) - << (6 * 3); - const CodePoint s1 = read_next_folowing_code() << (6 * 2); - const CodePoint s2 = read_next_folowing_code() << 6; - const CodePoint s3 = read_next_folowing_code(); - result = s0 + s1 + s2 + s3; - } else { // 3-length code point - const CodePoint s0 = ExtractBits(cu0) - << (6 * 2); - const CodePoint s1 = read_next_folowing_code() << 6; - const CodePoint s2 = read_next_folowing_code(); - result = s0 + s1 + s2; - } - } else { // 2-length code point - const CodePoint s0 = ExtractBits(cu0) - << 6; - const CodePoint s1 = read_next_folowing_code(); - result = s0 + s1; - } - } else { - throw TextEncodeException( - "Unexpected bad-format (0b10xxxxxx) begin byte of a code point."); - } - } else { - result = static_cast(cu0); - } - } - - if (next_position != nullptr) - *next_position = current; - return result; -} - -CodePoint Utf16NextCodePoint(std::u16string_view str, Index current, - Index *next_position) { - CodePoint result; - - if (current >= static_cast(str.length())) { - result = k_invalid_code_point; - } else { - const auto cu0 = str[current++]; - - if (!IsUtf16SurrogatePairCodeUnit(cu0)) { // 1-length code point - result = static_cast(cu0); - } else if (IsUtf16SurrogatePairLeading(cu0)) { // 2-length code point - if (current >= static_cast(str.length())) { - throw TextEncodeException( - "Unexpected end when reading second code unit of surrogate pair."); - } - const auto cu1 = str[current++]; - - if (!IsUtf16SurrogatePairTrailing(cu1)) { - throw TextEncodeException( - "Unexpected bad-range second code unit of surrogate pair."); - } - - const auto s0 = ExtractBits(cu0) << 10; - const auto s1 = ExtractBits(cu1); - - result = s0 + s1 + 0x10000; - - } else { - throw TextEncodeException( - "Unexpected bad-range first code unit of surrogate pair."); - } - } - - if (next_position != nullptr) - *next_position = current; - return result; -} - -CodePoint Utf16PreviousCodePoint(std::u16string_view str, Index current, - Index *previous_position) { - CodePoint result; - if (current <= 0) { - result = k_invalid_code_point; - } else { - const auto cu0 = str[--current]; - - if (!IsUtf16SurrogatePairCodeUnit(cu0)) { // 1-length code point - result = static_cast(cu0); - } else if (IsUtf16SurrogatePairTrailing(cu0)) { // 2-length code point - if (current <= 0) { - throw TextEncodeException( - "Unexpected end when reading first code unit of surrogate pair."); - } - const auto cu1 = str[--current]; - - if (!IsUtf16SurrogatePairLeading(cu1)) { - throw TextEncodeException( - "Unexpected bad-range first code unit of surrogate pair."); - } - - const auto s0 = ExtractBits(cu1) << 10; - const auto s1 = ExtractBits(cu0); - - result = s0 + s1 + 0x10000; - - } else { - throw TextEncodeException( - "Unexpected bad-range second code unit of surrogate pair."); - } - } - - if (previous_position != nullptr) - *previous_position = current; - return result; -} - -void Utf8EncodeCodePointAppend(CodePoint code_point, std::string &str) { - auto write_continue_byte = [&str](std::uint8_t byte6) { - str.push_back((1u << 7) + (((1u << 6) - 1) & byte6)); - }; - - if (code_point >= 0 && code_point <= 0x007F) { - str.push_back(static_cast(code_point)); - } else if (code_point >= 0x0080 && code_point <= 0x07FF) { - std::uint32_t unsigned_code_point = code_point; - str.push_back(static_cast(ExtractBits( - (unsigned_code_point >> 6)) + - 0b11000000)); - write_continue_byte( - ExtractBits(unsigned_code_point)); - } else if (code_point >= 0x0800 && code_point <= 0xFFFF) { - std::uint32_t unsigned_code_point = code_point; - str.push_back(static_cast(ExtractBits( - (unsigned_code_point >> (6 * 2))) + - 0b11100000)); - write_continue_byte( - ExtractBits(unsigned_code_point >> 6)); - write_continue_byte( - ExtractBits(unsigned_code_point)); - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - std::uint32_t unsigned_code_point = code_point; - str.push_back(static_cast(ExtractBits( - (unsigned_code_point >> (6 * 3))) + - 0b11110000)); - write_continue_byte(ExtractBits( - unsigned_code_point >> (6 * 2))); - write_continue_byte( - ExtractBits(unsigned_code_point >> 6)); - write_continue_byte( - ExtractBits(unsigned_code_point)); - } else { - throw TextEncodeException("Code point out of range."); - } -} - -void Utf16EncodeCodePointAppend(CodePoint code_point, std::u16string &str) { - if ((code_point >= 0 && code_point <= 0xD7FF) || - (code_point >= 0xE000 && code_point <= 0xFFFF)) { - str.push_back(static_cast(code_point)); - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - std::uint32_t u = code_point - 0x10000; - str.push_back(static_cast( - ExtractBits(u >> 10) + 0xD800u)); - str.push_back(static_cast( - ExtractBits(u) + 0xDC00u)); - } else { - throw TextEncodeException("Code point out of range."); - } -} - -std::string ToUtf8(std::u16string_view s) { - std::string result; - for (CodePoint cp : Utf16CodePointIterator{s}) { - Utf8EncodeCodePointAppend(cp, result); - } - return result; -} - -std::u16string ToUtf16(std::string_view s) { - std::u16string result; - for (CodePoint cp : Utf8CodePointIterator{s}) { - Utf16EncodeCodePointAppend(cp, result); - } - return result; -} - -#ifdef WIN32 -std::string ToUtf8(std::wstring_view s) { - std::u16string_view string{reinterpret_cast(s.data()), - s.size()}; - std::string result; - for (CodePoint cp : Utf16CodePointIterator{string}) { - Utf8EncodeCodePointAppend(cp, result); - } - return result; -} -std::wstring ToUtf16WString(std::string_view s) { - std::u16string result; - for (CodePoint cp : Utf8CodePointIterator{s}) { - Utf16EncodeCodePointAppend(cp, result); - } - - std::wstring r(result.cbegin(), result.cend()); - return r; -} -#endif - -bool Utf16IsValidInsertPosition(std::u16string_view s, gsl::index position) { - if (position < 0) - return false; - if (position > static_cast(s.size())) - return false; - if (position == 0) - return true; - if (position == static_cast(s.size())) - return true; - return !IsUtf16SurrogatePairTrailing(s[position]); -} - -gsl::index Utf16BackwardUntil(std::u16string_view str, gsl::index position, - const std::function &predicate) { - if (position <= 0) - return position; - while (true) { - gsl::index p = position; - auto c = Utf16PreviousCodePoint(str, p, &position); - if (predicate(c)) - return p; - if (c == k_invalid_code_point) - return p; - } - UnreachableCode(); -} - -gsl::index Utf16ForwardUntil(std::u16string_view str, gsl::index position, - const std::function &predicate) { - if (position >= static_cast(str.size())) - return position; - while (true) { - gsl::index p = position; - auto c = Utf16NextCodePoint(str, p, &position); - if (predicate(c)) - return p; - if (c == k_invalid_code_point) - return p; - } - UnreachableCode(); -} - -inline bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; } - -gsl::index Utf16PreviousWord(std::u16string_view str, gsl::index position, - bool *is_space) { - if (position <= 0) - return position; - auto c = Utf16PreviousCodePoint(str, position, nullptr); - if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). - if (is_space) - *is_space = true; - return Utf16BackwardUntil(str, position, - [](CodePoint c) { return !IsSpace(c); }); - } else { - if (is_space) - *is_space = false; - return Utf16BackwardUntil(str, position, IsSpace); - } -} - -gsl::index Utf16NextWord(std::u16string_view str, gsl::index position, - bool *is_space) { - if (position >= static_cast(str.size())) - return position; - auto c = Utf16NextCodePoint(str, position, nullptr); - if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). - if (is_space) - *is_space = true; - return Utf16ForwardUntil(str, position, - [](CodePoint c) { return !IsSpace(c); }); - } else { - if (is_space) - *is_space = false; - return Utf16ForwardUntil(str, position, IsSpace); - } -} - -char16_t ToLower(char16_t c) { - if (c >= u'A' && c <= u'Z') { - return c - u'A' + u'a'; - } - return c; -} - -char16_t ToUpper(char16_t c) { - if (c >= u'a' && c <= u'z') { - return c - u'a' + u'A'; - } - return c; -} - -std::u16string ToLower(std::u16string_view s) { - std::u16string result; - for (auto c : s) - result.push_back(ToLower(c)); - return result; -} - -std::u16string ToUpper(std::u16string_view s) { - std::u16string result; - for (auto c : s) - result.push_back(ToUpper(c)); - return result; -} -} // namespace cru -- cgit v1.2.3