diff options
-rw-r--r-- | include/cru/base/StringUtil.h | 284 | ||||
-rw-r--r-- | src/base/StringUtil.cpp | 258 | ||||
-rw-r--r-- | src/ui/controls/TextHostControlService.cpp | 39 | ||||
-rw-r--r-- | test/base/StringUtilTest.cpp | 65 |
4 files changed, 332 insertions, 314 deletions
diff --git a/include/cru/base/StringUtil.h b/include/cru/base/StringUtil.h index 7a88f7e8..54e7b6e6 100644 --- a/include/cru/base/StringUtil.h +++ b/include/cru/base/StringUtil.h @@ -6,6 +6,7 @@ #include <cctype> #include <charconv> #include <compare> +#include <cstdint> #include <format> #include <functional> #include <string> @@ -14,8 +15,7 @@ #include <type_traits> #include <vector> -namespace cru { -namespace string { +namespace cru::string { std::weak_ordering CaseInsensitiveCompare(std::string_view left, std::string_view right); std::string TrimBegin(std::string_view str); @@ -137,32 +137,158 @@ struct ImplementFormatterByToString { } }; -} // namespace string - using CodePoint = std::int32_t; +using Utf8CodeUnit = char; +using Utf16CodeUnit = char16_t; constexpr CodePoint k_invalid_code_point = -1; -inline bool IsUtf16SurrogatePairCodeUnit(char16_t c) { +inline bool IsUtf8LeadingByte(Utf8CodeUnit c) { + return !(c & 0b10000000) || c & 0b01000000; +} + +inline bool IsUtf8FollowingByte(Utf8CodeUnit c) { + return !IsUtf8LeadingByte(c); +} + +inline bool IsUtf16SurrogatePairCodeUnit(Utf16CodeUnit c) { return c >= 0xD800 && c <= 0xDFFF; } -inline bool IsUtf16SurrogatePairLeading(char16_t c) { +inline bool IsUtf16SurrogatePairLeading(Utf16CodeUnit c) { return c >= 0xD800 && c <= 0xDBFF; } -inline bool IsUtf16SurrogatePairTrailing(char16_t c) { +inline bool IsUtf16SurrogatePairTrailing(Utf16CodeUnit c) { return c >= 0xDC00 && c <= 0xDFFF; } CodePoint CRU_BASE_API Utf8NextCodePoint(const char* ptr, Index size, Index current, Index* next_position); -CodePoint CRU_BASE_API Utf16NextCodePoint(const char16_t* ptr, Index size, +CodePoint CRU_BASE_API Utf8PreviousCodePoint(const char* ptr, Index size, + Index current, + Index* previous_position); + +namespace details { +template <typename Integer, int number_of_bit, typename ReturnType> +inline ReturnType ExtractBits(Integer n) { + return static_cast<ReturnType>(n & ((1u << number_of_bit) - 1)); +} +} // namespace details 
+ +template <typename CharWriter> +std::enable_if_t<std::is_invocable_v<CharWriter, Utf8CodeUnit>, bool> +Utf8EncodeCodePointAppend(CodePoint code_point, CharWriter&& writer) { + auto write_continue_byte = [&writer](Utf8CodeUnit byte6) { + writer((1u << 7) + (((1u << 6) - 1) & byte6)); + }; + + if (code_point >= 0 && code_point <= 0x007F) { + writer(static_cast<Utf8CodeUnit>(code_point)); + return true; + } else if (code_point >= 0x0080 && code_point <= 0x07FF) { + std::uint32_t unsigned_code_point = code_point; + writer(static_cast<Utf8CodeUnit>( + details::ExtractBits<std::uint32_t, 5, Utf8CodeUnit>( + (unsigned_code_point >> 6)) + + 0b11000000)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point)); + return true; + } else if (code_point >= 0x0800 && code_point <= 0xFFFF) { + std::uint32_t unsigned_code_point = code_point; + writer(static_cast<Utf8CodeUnit>( + details::ExtractBits<std::uint32_t, 4, Utf8CodeUnit>( + (unsigned_code_point >> (6 * 2))) + + 0b11100000)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point >> 6)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point)); + return true; + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { + std::uint32_t unsigned_code_point = code_point; + writer(static_cast<Utf8CodeUnit>( + details::ExtractBits<std::uint32_t, 3, Utf8CodeUnit>( + (unsigned_code_point >> (6 * 3))) + + 0b11110000)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point >> (6 * 2))); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point >> 6)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point)); + return true; + } else { + return false; + } +} + +bool CRU_BASE_API Utf8IsValidInsertPosition(const Utf8CodeUnit* ptr, Index size, + Index position); + +// Return 
position after the character making predicate returns true or 0 if no +// character doing so. +Index CRU_BASE_API +Utf8BackwardUntil(const Utf8CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate); +// Return position before the character making predicate returns true or +// str.size() if no character doing so. +Index CRU_BASE_API +Utf8ForwardUntil(const Utf8CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate); + +Index CRU_BASE_API Utf8PreviousWord(const Utf8CodeUnit* ptr, Index size, + Index position, bool* is_space = nullptr); +Index CRU_BASE_API Utf8NextWord(const Utf8CodeUnit* ptr, Index size, + Index position, bool* is_space = nullptr); + +CodePoint CRU_BASE_API Utf16NextCodePoint(const Utf16CodeUnit* ptr, Index size, Index current, Index* next_position); -CodePoint CRU_BASE_API Utf16PreviousCodePoint(const char16_t* ptr, Index size, - Index current, +CodePoint CRU_BASE_API Utf16PreviousCodePoint(const Utf16CodeUnit* ptr, + Index size, Index current, Index* previous_position); +template <typename CharWriter> +std::enable_if_t<std::is_invocable_v<CharWriter, Utf16CodeUnit>, bool> +Utf16EncodeCodePointAppend(CodePoint code_point, CharWriter&& writer) { + if ((code_point >= 0 && code_point <= 0xD7FF) || + (code_point >= 0xE000 && code_point <= 0xFFFF)) { + writer(static_cast<Utf16CodeUnit>(code_point)); + return true; + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { + std::uint32_t u = code_point - 0x10000; + writer(static_cast<Utf16CodeUnit>( + details::ExtractBits<std::uint32_t, 10, std::uint32_t>(u >> 10) + + 0xD800u)); + writer(static_cast<Utf16CodeUnit>( + details::ExtractBits<std::uint32_t, 10, std::uint32_t>(u) + 0xDC00u)); + return true; + } else { + return false; + } +} + +// If given s is not a valid utf16 string, return value is UD. 
+bool CRU_BASE_API Utf16IsValidInsertPosition(const Utf16CodeUnit* ptr, + Index size, Index position); + +// Return position after the character making predicate returns true or 0 if no +// character doing so. +Index CRU_BASE_API +Utf16BackwardUntil(const Utf16CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate); +// Return position before the character making predicate returns true or +// str.size() if no character doing so. +Index CRU_BASE_API +Utf16ForwardUntil(const Utf16CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate); + +Index CRU_BASE_API Utf16PreviousWord(const Utf16CodeUnit* ptr, Index size, + Index position, bool* is_space = nullptr); +Index CRU_BASE_API Utf16NextWord(const Utf16CodeUnit* ptr, Index size, + Index position, bool* is_space = nullptr); + template <typename CharType> using NextCodePointFunctionType = CodePoint (*)(const CharType*, Index, Index, Index*); @@ -247,138 +373,6 @@ class CodePointIterator { }; using Utf8CodePointIterator = CodePointIterator<char, &Utf8NextCodePoint>; - -using Utf16CodePointIterator = CodePointIterator<char16_t, &Utf16NextCodePoint>; - -namespace details { -template <typename UInt, int number_of_bit, typename ReturnType> -inline std::enable_if_t<std::is_unsigned_v<UInt>, ReturnType> ExtractBits( - UInt n) { - return static_cast<ReturnType>(n & ((1u << number_of_bit) - 1)); -} -} // namespace details - -template <typename CharWriter> -std::enable_if_t<std::is_invocable_v<CharWriter, char>, bool> -Utf8EncodeCodePointAppend(CodePoint code_point, CharWriter&& writer) { - auto write_continue_byte = [&writer](std::uint8_t byte6) { - writer((1u << 7) + (((1u << 6) - 1) & byte6)); - }; - - if (code_point >= 0 && code_point <= 0x007F) { - writer(static_cast<char>(code_point)); - return true; - } else if (code_point >= 0x0080 && code_point <= 0x07FF) { - std::uint32_t unsigned_code_point = code_point; - writer( - 
static_cast<char>(details::ExtractBits<std::uint32_t, 5, std::uint8_t>( - (unsigned_code_point >> 6)) + - 0b11000000)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point)); - return true; - } else if (code_point >= 0x0800 && code_point <= 0xFFFF) { - std::uint32_t unsigned_code_point = code_point; - writer( - static_cast<char>(details::ExtractBits<std::uint32_t, 4, std::uint8_t>( - (unsigned_code_point >> (6 * 2))) + - 0b11100000)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point >> 6)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point)); - return true; - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - std::uint32_t unsigned_code_point = code_point; - writer( - static_cast<char>(details::ExtractBits<std::uint32_t, 3, std::uint8_t>( - (unsigned_code_point >> (6 * 3))) + - 0b11110000)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point >> (6 * 2))); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point >> 6)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point)); - return true; - } else { - return false; - } -} - -template <typename CharWriter> -std::enable_if_t<std::is_invocable_v<CharWriter, char16_t>, bool> -Utf16EncodeCodePointAppend(CodePoint code_point, CharWriter&& writer) { - if ((code_point >= 0 && code_point <= 0xD7FF) || - (code_point >= 0xE000 && code_point <= 0xFFFF)) { - writer(static_cast<char16_t>(code_point)); - return true; - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - std::uint32_t u = code_point - 0x10000; - writer(static_cast<char16_t>( - details::ExtractBits<std::uint32_t, 10, std::uint32_t>(u >> 10) + - 0xD800u)); - writer(static_cast<char16_t>( - details::ExtractBits<std::uint32_t, 10, std::uint32_t>(u) + 0xDC00u)); - return 
true; - } else { - return false; - } -} - -// If given s is not a valid utf16 string, return value is UD. -bool CRU_BASE_API Utf16IsValidInsertPosition(const char16_t* ptr, Index size, - Index position); - -// Return position after the character making predicate returns true or 0 if no -// character doing so. -Index CRU_BASE_API -Utf16BackwardUntil(const char16_t* ptr, Index size, Index position, - const std::function<bool(CodePoint)>& predicate); -// Return position before the character making predicate returns true or -// str.size() if no character doing so. -Index CRU_BASE_API -Utf16ForwardUntil(const char16_t* ptr, Index size, Index position, - const std::function<bool(CodePoint)>& predicate); - -Index CRU_BASE_API Utf16PreviousWord(const char16_t* ptr, Index size, - Index position, bool* is_space = nullptr); -Index CRU_BASE_API Utf16NextWord(const char16_t* ptr, Index size, - Index position, bool* is_space = nullptr); - -char16_t CRU_BASE_API ToLower(char16_t c); -char16_t CRU_BASE_API ToUpper(char16_t c); - -bool CRU_BASE_API IsWhitespace(char16_t c); -bool CRU_BASE_API IsDigit(char16_t c); - -Utf8CodePointIterator CRU_BASE_API CreateUtf8Iterator(const std::byte* buffer, - Index size); -Utf8CodePointIterator CRU_BASE_API -CreateUtf8Iterator(const std::vector<std::byte>& buffer); - -CodePoint CRU_BASE_API Utf8NextCodePoint(std::string_view str, Index current, - Index* next_position); -CodePoint CRU_BASE_API Utf8PreviousCodePoint(std::string_view str, - Index current, - Index* next_position); -// Return position after the character making predicate returns true or 0 if no -// character doing so. -Index CRU_BASE_API -Utf8BackwardUntil(std::string_view str, Index position, - const std::function<bool(CodePoint)>& predicate); -// Return position before the character making predicate returns true or -// str.size() if no character doing so. 
-Index CRU_BASE_API -Utf8ForwardUntil(std::string_view str, Index position, - const std::function<bool(CodePoint)>& predicate); - -bool CRU_BASE_API Utf8IsValidInsertPosition(std::string_view str, - Index position); - -Index CRU_BASE_API Utf8PreviousWord(std::string_view str, Index position, - bool* is_space = nullptr); -Index CRU_BASE_API Utf8NextWord(std::string_view str, Index position, - bool* is_space = nullptr); - -} // namespace cru +using Utf16CodePointIterator = + CodePointIterator<Utf16CodeUnit, &Utf16NextCodePoint>; +} // namespace cru::string diff --git a/src/base/StringUtil.cpp b/src/base/StringUtil.cpp index 4e622dab..581ebcab 100644 --- a/src/base/StringUtil.cpp +++ b/src/base/StringUtil.cpp @@ -7,8 +7,7 @@ #include <compare> #include <string_view> -namespace cru { -namespace string { +namespace cru::string { std::weak_ordering CaseInsensitiveCompare(std::string_view left, std::string_view right) { @@ -78,18 +77,58 @@ std::vector<std::string> Split(std::string_view str, std::string_view sep, return result; } -} // namespace string + +namespace { + +template <typename CharType, + NextCodePointFunctionType<CharType> NextCodePointFunction> +Index Until(const CharType* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate) { + if (position <= 0) return position; + while (true) { + Index p = position; + auto c = NextCodePointFunction(ptr, size, p, &position); + if (predicate(c)) return p; + if (c == k_invalid_code_point) return p; + } + UnreachableCode(); +} + +static bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; } + +template <typename CharType> +using UntilFunctionType = Index (*)(const CharType*, Index, Index, + const std::function<bool(CodePoint)>&); + +template <typename CharType, + NextCodePointFunctionType<CharType> NextCodePointFunction, + UntilFunctionType<CharType> UntilFunction> +Index Word(const CharType* ptr, Index size, Index position, bool* is_space) { + if (position <= 0) return position; + auto c = 
NextCodePointFunction(ptr, size, position, nullptr); + if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). + if (is_space) *is_space = true; + return UntilFunction(ptr, size, position, + [](CodePoint c) { return !IsSpace(c); }); + } else { + if (is_space) *is_space = false; + return UntilFunction(ptr, size, position, + [](CodePoint c) { return IsSpace(c); }); + } +} + +} // namespace using details::ExtractBits; -CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current, +CodePoint Utf8NextCodePoint(const Utf8CodeUnit* ptr, Index size, Index current, Index* next_position) { CodePoint result; if (current >= size) { result = k_invalid_code_point; } else { - const auto cu0 = static_cast<std::uint8_t>(ptr[current++]); + const auto cu0 = static_cast<Utf8CodeUnit>(ptr[current++]); auto read_next_folowing_code = [ptr, size, &current]() -> CodePoint { if (current == size) @@ -97,14 +136,14 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current, "Unexpected end when read continuing byte of multi-byte code " "point."); - const auto u = static_cast<std::uint8_t>(ptr[current]); + const auto u = static_cast<Utf8CodeUnit>(ptr[current]); if (!(u & (1u << 7)) || (u & (1u << 6))) { throw TextEncodeException( "Unexpected bad-format (not 0b10xxxxxx) continuing byte of " "multi-byte code point."); } - return ExtractBits<std::uint8_t, 6, CodePoint>(ptr[current++]); + return ExtractBits<Utf8CodeUnit, 6, CodePoint>(ptr[current++]); }; if ((1u << 7) & cu0) { @@ -117,21 +156,21 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current, "code point."); } - const CodePoint s0 = ExtractBits<std::uint8_t, 3, CodePoint>(cu0) + const CodePoint s0 = ExtractBits<Utf8CodeUnit, 3, CodePoint>(cu0) << (6 * 3); const CodePoint s1 = read_next_folowing_code() << (6 * 2); const CodePoint s2 = read_next_folowing_code() << 6; const CodePoint s3 = read_next_folowing_code(); result = s0 + s1 + s2 + s3; } else { // 3-length code point - const CodePoint 
s0 = ExtractBits<std::uint8_t, 4, CodePoint>(cu0) + const CodePoint s0 = ExtractBits<Utf8CodeUnit, 4, CodePoint>(cu0) << (6 * 2); const CodePoint s1 = read_next_folowing_code() << 6; const CodePoint s2 = read_next_folowing_code(); result = s0 + s1 + s2; } } else { // 2-length code point - const CodePoint s0 = ExtractBits<std::uint8_t, 5, CodePoint>(cu0) + const CodePoint s0 = ExtractBits<Utf8CodeUnit, 5, CodePoint>(cu0) << 6; const CodePoint s1 = read_next_folowing_code(); result = s0 + s1; @@ -149,8 +188,67 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current, return result; } -CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current, - Index* next_position) { +CodePoint Utf8PreviousCodePoint(const Utf8CodeUnit* ptr, Index size, + Index current, Index* previous_position) { + CRU_UNUSED(size) + + CodePoint result; + if (current <= 0) { + result = k_invalid_code_point; + } else { + current--; + int i; + for (i = 0; i < 4; i++) { + if (IsUtf8LeadingByte(ptr[current])) { + break; + } + current--; + } + if (i == 4) { + throw TextEncodeException( + "Failed to find UTF-8 leading byte in 4 previous bytes."); + } + + result = Utf8NextCodePoint(ptr, size, current, nullptr); + } + + if (previous_position != nullptr) *previous_position = current; + return result; +} + +bool Utf8IsValidInsertPosition(const Utf8CodeUnit* ptr, Index size, + Index position) { + return position == 0 || position == size || + (position > 0 && position < size && IsUtf8LeadingByte(ptr[position])); +} + +Index Utf8BackwardUntil(const Utf8CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate) { + return Until<Utf8CodeUnit, Utf8PreviousCodePoint>(ptr, size, position, + predicate); +} + +Index Utf8ForwardUntil(const Utf8CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate) { + return Until<Utf8CodeUnit, Utf8NextCodePoint>(ptr, size, position, predicate); +} + +static bool 
IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; } + +Index Utf8PreviousWord(const Utf8CodeUnit* ptr, Index size, Index position, + bool* is_space) { + return Word<Utf8CodeUnit, Utf8PreviousCodePoint, Utf8BackwardUntil>( + ptr, size, position, is_space); +} + +Index Utf8NextWord(const Utf8CodeUnit* ptr, Index size, Index position, + bool* is_space) { + return Word<Utf8CodeUnit, Utf8NextCodePoint, Utf8ForwardUntil>( + ptr, size, position, is_space); +} + +CodePoint Utf16NextCodePoint(const Utf16CodeUnit* ptr, Index size, + Index current, Index* next_position) { CodePoint result; if (current >= size) { @@ -172,8 +270,8 @@ CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current, "Unexpected bad-range second code unit of surrogate pair."); } - const auto s0 = ExtractBits<std::uint16_t, 10, CodePoint>(cu0) << 10; - const auto s1 = ExtractBits<std::uint16_t, 10, CodePoint>(cu1); + const auto s0 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu0) << 10; + const auto s1 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu1); result = s0 + s1 + 0x10000; @@ -187,8 +285,8 @@ CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current, return result; } -CodePoint Utf16PreviousCodePoint(const char16_t* ptr, Index size, Index current, - Index* previous_position) { +CodePoint Utf16PreviousCodePoint(const Utf16CodeUnit* ptr, Index size, + Index current, Index* previous_position) { CRU_UNUSED(size) CodePoint result; @@ -211,8 +309,8 @@ CodePoint Utf16PreviousCodePoint(const char16_t* ptr, Index size, Index current, "Unexpected bad-range first code unit of surrogate pair."); } - const auto s0 = ExtractBits<std::uint16_t, 10, CodePoint>(cu1) << 10; - const auto s1 = ExtractBits<std::uint16_t, 10, CodePoint>(cu0); + const auto s0 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu1) << 10; + const auto s1 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu0); result = s0 + s1 + 0x10000; @@ -226,7 +324,7 @@ CodePoint Utf16PreviousCodePoint(const char16_t* ptr, 
Index size, Index current, return result; } -bool Utf16IsValidInsertPosition(const char16_t* ptr, Index size, +bool Utf16IsValidInsertPosition(const Utf16CodeUnit* ptr, Index size, Index position) { if (position < 0) return false; if (position > size) return false; @@ -235,124 +333,28 @@ bool Utf16IsValidInsertPosition(const char16_t* ptr, Index size, return !IsUtf16SurrogatePairTrailing(ptr[position]); } -Index Utf16BackwardUntil(const char16_t* ptr, Index size, Index position, +Index Utf16BackwardUntil(const Utf16CodeUnit* ptr, Index size, Index position, const std::function<bool(CodePoint)>& predicate) { - if (position <= 0) return position; - while (true) { - Index p = position; - auto c = Utf16PreviousCodePoint(ptr, size, p, &position); - if (predicate(c)) return p; - if (c == k_invalid_code_point) return p; - } - UnreachableCode(); + return Until<Utf16CodeUnit, Utf16PreviousCodePoint>(ptr, size, position, + predicate); } -Index Utf16ForwardUntil(const char16_t* ptr, Index size, Index position, +Index Utf16ForwardUntil(const Utf16CodeUnit* ptr, Index size, Index position, const std::function<bool(CodePoint)>& predicate) { - if (position >= size) return position; - while (true) { - Index p = position; - auto c = Utf16NextCodePoint(ptr, size, p, &position); - if (predicate(c)) return p; - if (c == k_invalid_code_point) return p; - } - UnreachableCode(); + return Until<Utf16CodeUnit, Utf16NextCodePoint>(ptr, size, position, + predicate); } -inline bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; } - -Index Utf16PreviousWord(const char16_t* ptr, Index size, Index position, +Index Utf16PreviousWord(const Utf16CodeUnit* ptr, Index size, Index position, bool* is_space) { - if (position <= 0) return position; - auto c = Utf16PreviousCodePoint(ptr, size, position, nullptr); - if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). 
- if (is_space) *is_space = true; - return Utf16BackwardUntil(ptr, size, position, - [](CodePoint c) { return !IsSpace(c); }); - } else { - if (is_space) *is_space = false; - return Utf16BackwardUntil(ptr, size, position, IsSpace); - } + return Word<Utf16CodeUnit, Utf16PreviousCodePoint, Utf16BackwardUntil>( + ptr, size, position, is_space); } -Index Utf16NextWord(const char16_t* ptr, Index size, Index position, +Index Utf16NextWord(const Utf16CodeUnit* ptr, Index size, Index position, bool* is_space) { - if (position >= size) return position; - auto c = Utf16NextCodePoint(ptr, size, position, nullptr); - if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). - if (is_space) *is_space = true; - return Utf16ForwardUntil(ptr, size, position, - [](CodePoint c) { return !IsSpace(c); }); - } else { - if (is_space) *is_space = false; - return Utf16ForwardUntil(ptr, size, position, IsSpace); - } -} - -char16_t ToLower(char16_t c) { - if (c >= u'A' && c <= u'Z') { - return c - u'A' + u'a'; - } - return c; -} - -char16_t ToUpper(char16_t c) { - if (c >= u'a' && c <= u'z') { - return c - u'a' + u'A'; - } - return c; -} - -bool IsWhitespace(char16_t c) { - return c == u' ' || c == u'\t' || c == u'\n' || c == u'\r'; -} - -bool IsDigit(char16_t c) { return c >= u'0' && c <= u'9'; } - -Utf8CodePointIterator CreateUtf8Iterator(const std::byte* buffer, Index size) { - return Utf8CodePointIterator(reinterpret_cast<const char*>(buffer), size); -} - -Utf8CodePointIterator CreateUtf8Iterator(const std::vector<std::byte>& buffer) { - return CreateUtf8Iterator(buffer.data(), buffer.size()); -} - -CodePoint Utf8NextCodePoint(std::string_view str, Index current, - Index* next_position) { - NotImplemented(); -} - -CodePoint Utf8PreviousCodePoint(std::string_view str, Index current, - Index* next_position) { - NotImplemented(); -} - -// Return position after the character making predicate returns true or 0 if no -// character doing so. 
-Index Utf8BackwardUntil(std::string_view str, Index position, - const std::function<bool(CodePoint)>& predicate) { - NotImplemented(); -} - -// Return position before the character making predicate returns true or -// str.size() if no character doing so. -Index Utf8ForwardUntil(std::string_view str, Index position, - const std::function<bool(CodePoint)>& predicate) { - NotImplemented(); -} - -bool Utf8IsValidInsertPosition(std::string_view str, Index position) { - NotImplemented(); -} - -Index Utf8PreviousWord(std::string_view str, Index position, - bool* is_space) { - NotImplemented(); -} - -Index Utf8NextWord(std::string_view str, Index position, - bool* is_space) { - NotImplemented(); + return Word<Utf16CodeUnit, Utf16NextCodePoint, Utf16ForwardUntil>( + ptr, size, position, is_space); } -} // namespace cru +} // namespace cru::string diff --git a/src/ui/controls/TextHostControlService.cpp b/src/ui/controls/TextHostControlService.cpp index bb723e3f..3c38c454 100644 --- a/src/ui/controls/TextHostControlService.cpp +++ b/src/ui/controls/TextHostControlService.cpp @@ -5,8 +5,6 @@ #include "cru/base/Base.h" #include "cru/base/StringUtil.h" #include "cru/base/log/Logger.h" -#include "cru/platform/graphics/Font.h" -#include "cru/platform/gui/Base.h" #include "cru/platform/gui/Clipboard.h" #include "cru/platform/gui/Cursor.h" #include "cru/platform/gui/InputMethod.h" @@ -16,7 +14,6 @@ #include "cru/ui/DebugFlags.h" #include "cru/ui/DeleteLater.h" #include "cru/ui/components/Menu.h" -#include "cru/ui/events/UiEvents.h" #include "cru/ui/helper/ShortcutHub.h" #include "cru/ui/host/WindowHost.h" #include "cru/ui/render/ScrollRenderObject.h" @@ -25,12 +22,15 @@ #include <memory> namespace cru::ui::controls { +using namespace cru::string; + TextControlMovePattern TextControlMovePattern::kLeft( "Left", helper::ShortcutKeyBind(platform::gui::KeyCode::Left), [](TextHostControlService* service, std::string_view text, Index current_position) { CRU_UNUSED(service) - 
Utf8PreviousCodePoint(text, current_position, &current_position); + Utf8PreviousCodePoint(text.data(), text.size(), current_position, + &current_position); return current_position; }); TextControlMovePattern TextControlMovePattern::kRight( @@ -38,7 +38,8 @@ TextControlMovePattern TextControlMovePattern::kRight( [](TextHostControlService* service, std::string_view text, Index current_position) { CRU_UNUSED(service) - Utf8NextCodePoint(text, current_position, &current_position); + Utf8NextCodePoint(text.data(), text.size(), current_position, + &current_position); return current_position; }); TextControlMovePattern TextControlMovePattern::kCtrlLeft( @@ -48,7 +49,7 @@ TextControlMovePattern TextControlMovePattern::kCtrlLeft( [](TextHostControlService* service, std::string_view text, Index current_position) { CRU_UNUSED(service) - return Utf8PreviousWord(text, current_position); + return Utf8PreviousWord(text.data(), text.size(), current_position); }); TextControlMovePattern TextControlMovePattern::kCtrlRight( "Ctrl+Right(Next Word)", @@ -57,7 +58,7 @@ TextControlMovePattern TextControlMovePattern::kCtrlRight( [](TextHostControlService* service, std::string_view text, Index current_position) { CRU_UNUSED(service) - return Utf8NextWord(text, current_position); + return Utf8NextWord(text.data(), text.size(), current_position); }); TextControlMovePattern TextControlMovePattern::kUp( "Up", helper::ShortcutKeyBind(platform::gui::KeyCode::Up), @@ -86,7 +87,7 @@ TextControlMovePattern TextControlMovePattern::kHome( [](TextHostControlService* service, std::string_view text, Index current_position) { CRU_UNUSED(service) - return Utf8BackwardUntil(text, current_position, + return Utf8BackwardUntil(text.data(), text.size(), current_position, [](CodePoint c) { return c == u'\n'; }); }); TextControlMovePattern TextControlMovePattern::kEnd( @@ -94,7 +95,7 @@ TextControlMovePattern TextControlMovePattern::kEnd( [](TextHostControlService* service, std::string_view text, Index current_position) { 
CRU_UNUSED(service) - return Utf8ForwardUntil(text, current_position, + return Utf8ForwardUntil(text.data(), text.size(), current_position, [](CodePoint c) { return c == u'\n'; }); }); TextControlMovePattern TextControlMovePattern::kCtrlHome( @@ -225,7 +226,8 @@ void TextHostControlService::SetText(std::string text, bool stop_composition) { void TextHostControlService::InsertText(Index position, std::string_view text, bool stop_composition) { - if (!Utf8IsValidInsertPosition(this->text_, position)) { + if (!Utf8IsValidInsertPosition(this->text_.data(), this->text_.size(), + position)) { CRU_LOG_TAG_ERROR("Invalid text insert position."); return; } @@ -239,26 +241,29 @@ void TextHostControlService::InsertText(Index position, std::string_view text, } void TextHostControlService::DeleteChar(Index position, bool stop_composition) { - if (!Utf8IsValidInsertPosition(this->text_, position)) { + if (!Utf8IsValidInsertPosition(this->text_.data(), this->text_.size(), + position)) { CRU_LOG_TAG_ERROR("Invalid text delete position."); return; } if (position == static_cast<Index>(this->text_.size())) return; Index next; - Utf8NextCodePoint(this->text_, position, &next); + Utf8NextCodePoint(this->text_.data(), this->text_.size(), position, &next); this->DeleteText(TextRange::FromTwoSides(position, next), stop_composition); } // Return the position of deleted character. 
Index TextHostControlService::DeleteCharPrevious(Index position, bool stop_composition) { - if (!Utf8IsValidInsertPosition(this->text_, position)) { + if (!Utf8IsValidInsertPosition(this->text_.data(), this->text_.size(), + position)) { CRU_LOG_TAG_ERROR("Invalid text delete position."); return 0; } if (position == 0) return 0; Index previous; - Utf8PreviousCodePoint(this->text_, position, &previous); + Utf8PreviousCodePoint(this->text_.data(), this->text_.size(), position, + &previous); this->DeleteText(TextRange::FromTwoSides(previous, position), stop_composition); return previous; @@ -268,11 +273,13 @@ void TextHostControlService::DeleteText(TextRange range, bool stop_composition) { if (range.count == 0) return; range = range.Normalize(); - if (!Utf8IsValidInsertPosition(this->text_, range.GetStart())) { + if (!Utf8IsValidInsertPosition(this->text_.data(), this->text_.size(), + range.GetStart())) { CRU_LOG_TAG_ERROR("Invalid text delete start position."); return; } - if (!Utf8IsValidInsertPosition(this->text_, range.GetStart())) { + if (!Utf8IsValidInsertPosition(this->text_.data(), this->text_.size(), + range.GetStart())) { CRU_LOG_TAG_ERROR("Invalid text delete end position."); return; } diff --git a/test/base/StringUtilTest.cpp b/test/base/StringUtilTest.cpp index 2b12780c..32fd0d88 100644 --- a/test/base/StringUtilTest.cpp +++ b/test/base/StringUtilTest.cpp @@ -3,18 +3,31 @@ #include <catch2/catch_test_macros.hpp> using cru::Index; -using cru::k_invalid_code_point; +using namespace cru::string; TEST_CASE("StringUtil Split", "[string]") { - using cru::string::Split; REQUIRE(Split("abc", "b") == std::vector<std::string>{"a", "c"}); REQUIRE(Split("abcd", "bc") == std::vector<std::string>{"a", "d"}); REQUIRE(Split("abcdbcd", "bc") == std::vector<std::string>{"a", "d", "d"}); REQUIRE(Split("aaa", "a") == std::vector<std::string>{"", "", "", ""}); } +TEST_CASE("StringUtil Utf8ByteType", "[string]") { + REQUIRE(IsUtf8LeadingByte(0b00100000)); + 
REQUIRE(IsUtf8LeadingByte(0b01000000)); + REQUIRE(IsUtf8LeadingByte(0b11000000)); + REQUIRE(IsUtf8LeadingByte(0b11100000)); + REQUIRE(IsUtf8LeadingByte(0b11110000)); + REQUIRE(!IsUtf8LeadingByte(0b10100000)); + REQUIRE(!IsUtf8FollowingByte(0b00100000)); + REQUIRE(!IsUtf8FollowingByte(0b01000000)); + REQUIRE(!IsUtf8FollowingByte(0b11000000)); + REQUIRE(!IsUtf8FollowingByte(0b11100000)); + REQUIRE(!IsUtf8FollowingByte(0b11110000)); + REQUIRE(IsUtf8FollowingByte(0b10100000)); +} + TEST_CASE("StringUtil Utf8NextCodePoint", "[string]") { - using cru::Utf8NextCodePoint; std::string_view text = "aπ你🤣!"; Index current = 0; REQUIRE(Utf8NextCodePoint(text.data(), text.size(), current, &current) == @@ -32,8 +45,25 @@ TEST_CASE("StringUtil Utf8NextCodePoint", "[string]") { REQUIRE(current == static_cast<Index>(text.size())); } +TEST_CASE("StringUtil Utf8PreviousCodePoint", "[string]") { + std::string_view text = "aπ你🤣!"; + Index current = text.size(); + REQUIRE(Utf8PreviousCodePoint(text.data(), text.size(), current, &current) == + 0x0021); + REQUIRE(Utf8PreviousCodePoint(text.data(), text.size(), current, &current) == + 0x1F923); + REQUIRE(Utf8PreviousCodePoint(text.data(), text.size(), current, &current) == + 0x4F60); + REQUIRE(Utf8PreviousCodePoint(text.data(), text.size(), current, &current) == + 0x03C0); + REQUIRE(Utf8PreviousCodePoint(text.data(), text.size(), current, &current) == + 0x0061); + REQUIRE(Utf8PreviousCodePoint(text.data(), text.size(), current, &current) == + k_invalid_code_point); + REQUIRE(current == 0); +} + TEST_CASE("StringUtil Utf16NextCodePoint", "[string]") { - using cru::Utf16NextCodePoint; std::u16string_view text = u"aπ你🤣!"; Index current = 0; REQUIRE(Utf16NextCodePoint(text.data(), text.size(), current, &current) == @@ -52,7 +82,6 @@ TEST_CASE("StringUtil Utf16NextCodePoint", "[string]") { } TEST_CASE("StringUtil Utf16PreviousCodePoint", "[string]") { - using cru::Utf16PreviousCodePoint; std::u16string_view text = u"aπ你🤣!"; Index current = text.size(); 
REQUIRE(Utf16PreviousCodePoint(text.data(), text.size(), current, &current) == @@ -71,38 +100,34 @@ TEST_CASE("StringUtil Utf16PreviousCodePoint", "[string]") { } TEST_CASE("StringUtil Utf8CodePointIterator", "[string]") { - using cru::Utf8CodePointIterator; std::string_view text = "aπ你🤣!"; - std::vector<cru::CodePoint> code_points; + std::vector<CodePoint> code_points; for (auto cp : Utf8CodePointIterator(text.data(), text.size())) { code_points.push_back(cp); } - std::vector<cru::CodePoint> expected_code_points{0x0061, 0x03C0, 0x4F60, - 0x1F923, 0x0021}; + std::vector<CodePoint> expected_code_points{0x0061, 0x03C0, 0x4F60, 0x1F923, + 0x0021}; REQUIRE(code_points == expected_code_points); } TEST_CASE("StringUtil Utf16CodePointIterator", "[string]") { - using cru::Utf16CodePointIterator; std::u16string_view text = u"aπ你🤣!"; - std::vector<cru::CodePoint> code_points; + std::vector<CodePoint> code_points; for (auto cp : Utf16CodePointIterator(text.data(), text.size())) { code_points.push_back(cp); } - std::vector<cru::CodePoint> expected_code_points{0x0061, 0x03C0, 0x4F60, - 0x1F923, 0x0021}; + std::vector<CodePoint> expected_code_points{0x0061, 0x03C0, 0x4F60, 0x1F923, + 0x0021}; REQUIRE(code_points == expected_code_points); } TEST_CASE("ParseToNumber Work", "[string]") { - using namespace cru::string; - auto r1 = ParseToNumber<int>("123"); REQUIRE(r1.valid); REQUIRE(r1.value == 123); @@ -121,8 +146,6 @@ TEST_CASE("ParseToNumber Work", "[string]") { } TEST_CASE("ParseToNumber AllowLeadingZeroFlag", "[string]") { - using namespace cru::string; - auto r1 = ParseToNumber<int>(" 123"); REQUIRE(!r1.valid); @@ -142,8 +165,6 @@ TEST_CASE("ParseToNumber AllowLeadingZeroFlag", "[string]") { } TEST_CASE("StringToIntegerConverterImpl AllowTrailingSpacesFlag", "[string]") { - using namespace cru::string; - auto r1 = ParseToNumber<int>("123 "); REQUIRE(!r1.valid); @@ -164,8 +185,6 @@ TEST_CASE("StringToIntegerConverterImpl AllowTrailingSpacesFlag", "[string]") { } 
TEST_CASE("StringToIntegerConverterImpl AllowTrailingJunk", "[string]") { - using namespace cru::string; - auto r1 = ParseToNumber<int>("123ab"); REQUIRE(!r1.valid); @@ -185,8 +204,6 @@ TEST_CASE("StringToIntegerConverterImpl AllowTrailingJunk", "[string]") { } TEST_CASE("StringToIntegerConverterImpl CompositeFlags", "[string]") { - using namespace cru::string; - auto r1 = ParseToNumber<int>(" 123ab", ParseToNumberFlags::AllowLeadingSpaces | ParseToNumberFlags::AllowTrailingJunk); @@ -203,8 +220,6 @@ TEST_CASE("StringToIntegerConverterImpl CompositeFlags", "[string]") { } TEST_CASE("String ParseToNumberList", "[string]") { - using namespace cru::string; - auto r1 = ParseToNumberList<int>("123 456 789"); REQUIRE(r1 == std::vector<int>{123, 456, 789}); |