diff options
author | crupest <crupest@outlook.com> | 2020-07-07 00:31:11 +0800 |
---|---|---|
committer | crupest <crupest@outlook.com> | 2020-07-07 00:31:11 +0800 |
commit | 3bf5b1fcf2315a1ce180ad69eb6bb1e57be37ca5 (patch) | |
tree | 722b5b5d808ce0ead4bc497dd910e081fa601656 | |
parent | cb241b7289abfc10111d3180def55ca1fbb2edb4 (diff) | |
download | cru-3bf5b1fcf2315a1ce180ad69eb6bb1e57be37ca5.tar.gz cru-3bf5b1fcf2315a1ce180ad69eb6bb1e57be37ca5.tar.bz2 cru-3bf5b1fcf2315a1ce180ad69eb6bb1e57be37ca5.zip |
...
-rw-r--r-- | include/cru/common/StringUtil.hpp | 115 | ||||
-rw-r--r-- | include/cru/platform/native/InputMethod.hpp | 18 | ||||
-rw-r--r-- | src/common/StringUtil.cpp | 252 | ||||
-rw-r--r-- | src/win/native/InputMethod.cpp | 4 | ||||
-rw-r--r-- | test/common/StringUtilTest.cpp | 92 |
5 files changed, 301 insertions, 180 deletions
diff --git a/include/cru/common/StringUtil.hpp b/include/cru/common/StringUtil.hpp index 714f1d49..b8edc302 100644 --- a/include/cru/common/StringUtil.hpp +++ b/include/cru/common/StringUtil.hpp @@ -10,51 +10,114 @@ class TextEncodeException : public std::runtime_error { using runtime_error::runtime_error; }; -inline bool IsSurrogatePair(char16_t c) { return c >= 0xD800 && c <= 0xDFFF; } +inline bool IsUtf16SurrogatePairCodeUnit(char16_t c) { + return c >= 0xD800 && c <= 0xDFFF; +} -inline bool IsSurrogatePairLeading(char16_t c) { +inline bool IsUtf16SurrogatePairLeading(char16_t c) { return c >= 0xD800 && c <= 0xDBFF; } -inline bool IsSurrogatePairTrailing(char16_t c) { +inline bool IsUtf16SurrogatePairTrailing(char16_t c) { return c >= 0xDC00 && c <= 0xDFFF; } -class Utf16Iterator : public Object { +CodePoint Utf8NextCodePoint(std::string_view str, Index current, + Index* next_position); + +CodePoint Utf16NextCodePoint(std::u16string_view str, Index current, + Index* next_position); +CodePoint Utf16PreviousCodePoint(std::u16string_view str, Index current, + Index* previous_position); + +template <typename StringType> +using NextCodePointFunctionType = CodePoint (*)(StringType, Index, Index*); + +template <typename StringType, + NextCodePointFunctionType<StringType> NextCodePointFunction> +class CodePointIterator { + public: + using difference_type = Index; + using value_type = CodePoint; + using pointer = void; + using reference = value_type; + using iterator_category = std::forward_iterator_tag; + public: - explicit Utf16Iterator(std::u16string_view string) - : string_(std::move(string)) {} - Utf16Iterator(std::u16string_view string, Index position) - : string_(std::move(string)), position_(position) {} + struct past_end_tag_t {}; - CRU_DEFAULT_COPY(Utf16Iterator) - CRU_DEFAULT_MOVE(Utf16Iterator) + explicit CodePointIterator(StringType string) + : string_(std::move(string)), position_(0) {} + explicit CodePointIterator(StringType string, past_end_tag_t) + : 
string_(std::move(string)), position_(string_.size()) {} - ~Utf16Iterator() = default; + CRU_DEFAULT_COPY(CodePointIterator) + CRU_DEFAULT_MOVE(CodePointIterator) + + ~CodePointIterator() = default; public: - void SetPositionToHead() { position_ = 0; } - void SetPosition(Index position) { position_ = position; } + StringType GetString() const { return string_; } + Index GetPosition() const { return position_; } + + bool IsPastEnd() const { + return position_ == static_cast<Index>(string_.size()); + } - // Backward current position and get previous code point. Return - // k_invalid_code_point if reach head. Throw TextEncodeException if encounter - // encoding problem. - CodePoint Previous(); + public: + CodePointIterator begin() const { return *this; } + CodePointIterator end() const { + return CodePointIterator{string_, past_end_tag_t{}}; + } - // Advance current position and get next code point. Return - // k_invalid_code_point if reach tail. Throw TextEncodeException if encounter - // encoding problem. - CodePoint Next(); + public: + bool operator==(const CodePointIterator& other) const { + // You should compare iterator that iterate on the same string. 
+ Expects(this->string_.data() == other.string_.data() && + this->string_.size() == other.string_.size()); + return this->position_ == other.position_; + } + bool operator!=(const CodePointIterator& other) const { + return !this->operator==(other); + } + + CodePointIterator& operator++() { + Expects(!IsPastEnd()); + Forward(); + return *this; + } + + CodePointIterator operator++(int) { + Expects(!IsPastEnd()); + CodePointIterator old = *this; + Forward(); + return old; + } + + CodePoint operator*() const { + return NextCodePointFunction(string_, position_, &next_position_cache_); + } - Index CurrentPosition() const { return this->position_; } + private: + void Forward() { + if (next_position_cache_ > position_) { + position_ = next_position_cache_; + } else { + NextCodePointFunction(string_, position_, &position_); + } + } private: - std::u16string_view string_; - Index position_ = 0; + StringType string_; + Index position_; + mutable Index next_position_cache_; }; -Index PreviousIndex(std::u16string_view string, Index current); -Index NextIndex(std::u16string_view string, Index current); +using Utf8CodePointIterator = + CodePointIterator<std::string_view, &Utf8NextCodePoint>; + +using Utf16CodePointIterator = + CodePointIterator<std::u16string_view, &Utf16NextCodePoint>; std::string ToUtf8(const std::u16string& s); inline std::string ToUtf8(std::u16string_view s) { diff --git a/include/cru/platform/native/InputMethod.hpp b/include/cru/platform/native/InputMethod.hpp index 1c5b287e..c975825f 100644 --- a/include/cru/platform/native/InputMethod.hpp +++ b/include/cru/platform/native/InputMethod.hpp @@ -23,6 +23,24 @@ struct CompositionText { TextRange selection; }; +// inline std::basic_ostream<char16_t>& operator<<( +// std::basic_ostream<char16_t>& stream, +// const CompositionText& composition_text) { +// stream << u"text: " << composition_text.text << u"\n" << u"clauses:\n"; +// for (int i = 0; i < static_cast<int>(composition_text.clauses.size()); i++) { +// 
const auto& clause = composition_text.clauses[i]; +// stream << u"\t" << i << u". start:" << clause.start << u" end:" +// << clause.end; +// if (clause.target) { +// stream << u" target"; +// } +// stream << u"\n"; +// } +// stream << u"selection: position:" << composition_text.selection.position +// << u" count:" << composition_text.selection.count; +// return stream; +// } + struct IInputMethodContext : virtual INativeResource { // Return true if you should draw composition text manually. Return false if // system will take care of that for you. diff --git a/src/common/StringUtil.cpp b/src/common/StringUtil.cpp index 0ebfd85e..6b185a82 100644 --- a/src/common/StringUtil.cpp +++ b/src/common/StringUtil.cpp @@ -3,160 +3,158 @@ #include <codecvt> namespace cru { +namespace { + template <typename UInt, int number_of_bit> inline std::enable_if_t<std::is_unsigned_v<UInt>, CodePoint> ExtractBits( UInt n) { return static_cast<CodePoint>(n & ((1u << number_of_bit) - 1)); } +} // namespace -CodePoint Utf16Iterator::Previous() { - if (position_ <= 0) return k_invalid_code_point; - - const auto cu0 = static_cast<std::uint16_t>(string_[--position_]); - - if (cu0 < 0xd800u || cu0 >= 0xe000u) { // 1-length code point - return static_cast<CodePoint>(cu0); - } else if (cu0 >= 0xdc00u || cu0 <= 0xdfffu) { // 2-length code point - if (position_ <= 0) { - throw TextEncodeException( - "Unexpected end when reading first code unit of surrogate pair " - "during backward."); - } - const auto cu1 = static_cast<std::uint16_t>(string_[--position_]); +CodePoint Utf8NextCodePoint(std::string_view str, Index current, + Index* next_position) { + CodePoint result; -#ifdef CRU_DEBUG - if (cu1 < 0xd800u || cu1 > 0xdbffu) { - throw TextEncodeException( - "Unexpected bad-range first code unit of surrogate pair during " - "backward."); + if (current >= static_cast<Index>(str.length())) { + result = k_invalid_code_point; + } else { + const auto cu0 = static_cast<std::uint8_t>(str[current++]); + + auto 
read_next_folowing_code = [&str, &current]() -> CodePoint { + if (current == static_cast<Index>(str.length())) + throw TextEncodeException( + "Unexpected end when read continuing byte of multi-byte code " + "point."); + + const auto u = static_cast<std::uint8_t>(str[current]); + if (!(u & (1u << 7)) || (u & (1u << 6))) { + throw TextEncodeException( + "Unexpected bad-format (not 0b10xxxxxx) continuing byte of " + "multi-byte code point."); + } + + return ExtractBits<std::uint8_t, 6>(str[current++]); + }; + + if ((1u << 7) & cu0) { + if ((1u << 6) & cu0) { // 2~4-length code point + if ((1u << 5) & cu0) { // 3~4-length code point + if ((1u << 4) & cu0) { // 4-length code point + if (cu0 & (1u << 3)) { + throw TextEncodeException( + "Unexpected bad-format begin byte (not 0b11110xxx) of 4-byte" + "code point."); + } + + const CodePoint s0 = ExtractBits<std::uint8_t, 3>(cu0) << (6 * 3); + const CodePoint s1 = read_next_folowing_code() << (6 * 2); + const CodePoint s2 = read_next_folowing_code() << 6; + const CodePoint s3 = read_next_folowing_code(); + result = s0 + s1 + s2 + s3; + } else { // 3-length code point + const CodePoint s0 = ExtractBits<std::uint8_t, 4>(cu0) << (6 * 2); + const CodePoint s1 = read_next_folowing_code() << 6; + const CodePoint s2 = read_next_folowing_code(); + result = s0 + s1 + s2; + } + } else { // 2-length code point + const CodePoint s0 = ExtractBits<std::uint8_t, 5>(cu0) << 6; + const CodePoint s1 = read_next_folowing_code(); + result = s0 + s1; + } + } else { + throw TextEncodeException( + "Unexpected bad-format (0b10xxxxxx) begin byte of a code point."); + } + } else { + result = static_cast<CodePoint>(cu0); + } #endif + } - const auto s0 = ExtractBits<std::uint16_t, 10>(cu1) << 10; - const auto s1 = ExtractBits<std::uint16_t, 10>(cu0); + if (next_position != nullptr) *next_position = current; + return result; +} - return s0 + s1 + 0x10000; +CodePoint Utf16NextCodePoint(std::u16string_view str, Index current, + Index* next_position) { + 
CodePoint result; + if (current >= static_cast<Index>(str.length())) { + result = k_invalid_code_point; } else { - throw TextEncodeException( - "Unexpected bad-range second code unit of surrogate pair during " - "backward."); - } -} + const auto cu0 = str[current++]; -CodePoint Utf16Iterator::Next() { - if (position_ >= static_cast<Index>(string_.length())) - return k_invalid_code_point; + if (!IsUtf16SurrogatePairCodeUnit(cu0)) { // 1-length code point + result = static_cast<CodePoint>(cu0); + } else if (IsUtf16SurrogatePairLeading(cu0)) { // 2-length code point + if (current >= static_cast<Index>(str.length())) { + throw TextEncodeException( + "Unexpected end when reading second code unit of surrogate pair."); + } + const auto cu1 = str[current++]; - const auto cu0 = static_cast<std::uint16_t>(string_[position_++]); + if (!IsUtf16SurrogatePairTrailing(cu1)) { + throw TextEncodeException( + "Unexpected bad-range second code unit of surrogate pair."); + } - if (cu0 < 0xd800u || cu0 >= 0xe000u) { // 1-length code point - return static_cast<CodePoint>(cu0); - } else if (cu0 <= 0xdbffu) { // 2-length code point - if (position_ >= static_cast<Index>(string_.length())) { - throw TextEncodeException( - "Unexpected end when reading second code unit of surrogate pair " - "during forward."); - } - const auto cu1 = static_cast<std::uint16_t>(string_[position_++]); + const auto s0 = ExtractBits<std::uint16_t, 10>(cu0) << 10; + const auto s1 = ExtractBits<std::uint16_t, 10>(cu1); + + result = s0 + s1 + 0x10000; -#ifdef CRU_DEBUG - if (cu1 < 0xdc00u || cu1 > 0xdfffu) { + } else { throw TextEncodeException( - "Unexpected bad-format second code unit of surrogate pair during " - "forward."); + "Unexpected bad-range first code unit of surrogate pair."); } -#endif - - const auto s0 = ExtractBits<std::uint16_t, 10>(cu0) << 10; - const auto s1 = ExtractBits<std::uint16_t, 10>(cu1); + } - return s0 + s1 + 0x10000; + if (next_position != nullptr) *next_position = current; + return 
result; +} +CodePoint Utf16PreviousCodePoint(std::u16string_view str, Index current, + Index* previous_position) { + CodePoint result; + if (current <= 0) { + result = k_invalid_code_point; } else { - throw TextEncodeException( - "Unexpected bad-format first code unit of surrogate pair during " - "forward."); - } -} + const auto cu0 = str[--current]; -Index PreviousIndex(std::u16string_view string, Index current) { - Utf16Iterator iterator{string, current}; - iterator.Previous(); - return iterator.CurrentPosition(); -} + if (!IsUtf16SurrogatePairCodeUnit(cu0)) { // 1-length code point + result = static_cast<CodePoint>(cu0); + } else if (IsUtf16SurrogatePairTrailing(cu0)) { // 2-length code point + if (current <= 0) { + throw TextEncodeException( + "Unexpected end when reading first code unit of surrogate pair."); + } + const auto cu1 = str[--current]; + + if (!IsUtf16SurrogatePairLeading(cu1)) { + throw TextEncodeException( + "Unexpected bad-range first code unit of surrogate pair."); + } + + const auto s0 = ExtractBits<std::uint16_t, 10>(cu1) << 10; + const auto s1 = ExtractBits<std::uint16_t, 10>(cu0); -Index NextIndex(std::u16string_view string, Index current) { - Utf16Iterator iterator{string, current}; - iterator.Next(); - return iterator.CurrentPosition(); + result = s0 + s1 + 0x10000; + + } else { + throw TextEncodeException( + "Unexpected bad-range second code unit of surrogate pair."); + } + } + + if (previous_position != nullptr) *previous_position = current; + return result; } std::string ToUtf8(const std::u16string& s) { - // TODO: Implement this by myself. + // TODO: Implement this by myself. Remember to remove deprecation warning + // suppress macro. 
return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{} .to_bytes(s); } - -// CodePoint Utf8Iterator::Next() { -// if (position_ == static_cast<Index>(string_.length())) -// return k_invalid_code_point; - -// const auto cu0 = static_cast<std::uint8_t>(string_[position_++]); - -// auto read_next_folowing_code = [this]() -> CodePoint { -// if (this->position_ == static_cast<Index>(string_.length())) -// throw TextEncodeException( -// "Unexpected end when read continuing byte of multi-byte code -// point."); - -// #ifdef CRU_DEBUG -// const auto u = static_cast<std::uint8_t>(string_[position_]); -// if (!(u & (1u << 7)) || (u & (1u << 6))) { -// throw TextEncodeException( -// "Unexpected bad-format (not 0b10xxxxxx) continuing byte of " -// "multi-byte code point."); -// } -// #endif - -// return ExtractBits<std::uint8_t, 6>(string_[position_++]); -// }; - -// if ((1u << 7) & cu0) { -// if ((1u << 6) & cu0) { // 2~4-length code point -// if ((1u << 5) & cu0) { // 3~4-length code point -// if ((1u << 4) & cu0) { // 4-length code point -// #ifdef CRU_DEBUG -// if (cu0 & (1u << 3)) { -// throw TextEncodeException( -// "Unexpected bad-format begin byte (not 0b10xxxxxx) of 4-byte -// " "code point."); -// } -// #endif - -// const CodePoint s0 = ExtractBits<std::uint8_t, 3>(cu0) << (6 * 3); -// const CodePoint s1 = read_next_folowing_code() << (6 * 2); -// const CodePoint s2 = read_next_folowing_code() << 6; -// const CodePoint s3 = read_next_folowing_code(); -// return s0 + s1 + s2 + s3; -// } else { // 3-length code point -// const CodePoint s0 = ExtractBits<std::uint8_t, 4>(cu0) << (6 * 2); -// const CodePoint s1 = read_next_folowing_code() << 6; -// const CodePoint s2 = read_next_folowing_code(); -// return s0 + s1 + s2; -// } -// } else { // 2-length code point -// const CodePoint s0 = ExtractBits<std::uint8_t, 5>(cu0) << 6; -// const CodePoint s1 = read_next_folowing_code(); -// return s0 + s1; -// } -// } else { -// throw TextEncodeException( -// 
"Unexpected bad-format (0b10xxxxxx) begin byte of a code point."); -// } -// } else { -// return static_cast<CodePoint>(cu0); -// } -// } - } // namespace cru diff --git a/src/win/native/InputMethod.cpp b/src/win/native/InputMethod.cpp index 5fc6c934..f9a40ab4 100644 --- a/src/win/native/InputMethod.cpp +++ b/src/win/native/InputMethod.cpp @@ -247,7 +247,7 @@ void WinInputMethodContext::OnWindowNativeMessage( switch (message.msg) { case WM_CHAR: { const auto c = static_cast<char16_t>(message.w_param); - if (IsSurrogatePair(c)) { + if (IsUtf16SurrogatePairCodeUnit(c)) { // I don't think this will happen because normal key strike without ime // should only trigger ascci character. If it is a charater from // supplementary planes, it should be handled with ime messages. @@ -264,6 +264,8 @@ void WinInputMethodContext::OnWindowNativeMessage( case WM_IME_COMPOSITION: { composition_event_.Raise(nullptr); auto composition_text = GetCompositionText(); + // log::TagDebug(log_tag, u"WM_IME_COMPOSITION composition text:\n{}", + // composition_text); if (message.l_param & GCS_RESULTSTR) { auto result_string = GetResultString(); text_event_.Raise(result_string); diff --git a/test/common/StringUtilTest.cpp b/test/common/StringUtilTest.cpp index 21a9ea9c..ba5e9321 100644 --- a/test/common/StringUtilTest.cpp +++ b/test/common/StringUtilTest.cpp @@ -4,34 +4,74 @@ using cru::k_invalid_code_point; -// TEST(WinString, Utf8Iterator) { -// using cru::platform::win::Utf8Iterator; -// std::string_view text = "aπ你🤣!"; -// Utf8Iterator i{text}; -// ASSERT_EQ(i.Next(), 0x0061); -// ASSERT_EQ(i.Next(), 0x03C0); -// ASSERT_EQ(i.Next(), 0x4F60); -// ASSERT_EQ(i.Next(), 0x1F923); -// ASSERT_EQ(i.Next(), 0x0021); -// ASSERT_EQ(i.Next(), k_invalid_code_point); -// } +TEST(StringUtil, Utf8NextCodePoint) { + using cru::Utf8NextCodePoint; + std::string_view text = "aπ你🤣!"; + gsl::index current = 0; + ASSERT_EQ(Utf8NextCodePoint(text, current, &current), 0x0061); + ASSERT_EQ(Utf8NextCodePoint(text, current, 
&current), 0x03C0); + ASSERT_EQ(Utf8NextCodePoint(text, current, &current), 0x4F60); + ASSERT_EQ(Utf8NextCodePoint(text, current, &current), 0x1F923); + ASSERT_EQ(Utf8NextCodePoint(text, current, &current), 0x0021); + ASSERT_EQ(Utf8NextCodePoint(text, current, &current), k_invalid_code_point); + ASSERT_EQ(current, static_cast<gsl::index>(text.size())); +} + +TEST(StringUtil, Utf16NextCodePoint) { + using cru::Utf16NextCodePoint; + std::u16string_view text = u"aπ你🤣!"; + gsl::index current = 0; + ASSERT_EQ(Utf16NextCodePoint(text, current, &current), 0x0061); + ASSERT_EQ(Utf16NextCodePoint(text, current, &current), 0x03C0); + ASSERT_EQ(Utf16NextCodePoint(text, current, &current), 0x4F60); + ASSERT_EQ(Utf16NextCodePoint(text, current, &current), 0x1F923); + ASSERT_EQ(Utf16NextCodePoint(text, current, &current), 0x0021); + ASSERT_EQ(Utf16NextCodePoint(text, current, &current), k_invalid_code_point); + ASSERT_EQ(current, static_cast<gsl::index>(text.size())); +} + +TEST(StringUtil, Utf16PreviousCodePoint) { + using cru::Utf16PreviousCodePoint; + std::u16string_view text = u"aπ你🤣!"; + gsl::index current = text.size(); + ASSERT_EQ(Utf16PreviousCodePoint(text, current, &current), 0x0021); + ASSERT_EQ(Utf16PreviousCodePoint(text, current, &current), 0x1F923); + ASSERT_EQ(Utf16PreviousCodePoint(text, current, &current), 0x4F60); + ASSERT_EQ(Utf16PreviousCodePoint(text, current, &current), 0x03C0); + ASSERT_EQ(Utf16PreviousCodePoint(text, current, &current), 0x0061); + ASSERT_EQ(Utf16PreviousCodePoint(text, current, &current), + k_invalid_code_point); + ASSERT_EQ(current, 0u); +} -TEST(WinString, Utf16Iterator) { - using cru::Utf16Iterator; +TEST(StringUtil, Utf8CodePointIterator) { + using cru::Utf8CodePointIterator; + std::string_view text = "aπ你🤣!"; + std::vector<cru::CodePoint> code_points; + + for (auto cp : Utf8CodePointIterator{text}) { + code_points.push_back(cp); + } + + std::vector<cru::CodePoint> expected_code_points{0x0061, 0x03C0, 0x4F60, + 0x1F923, 0x0021}; + + 
ASSERT_EQ(code_points, expected_code_points); +} + +TEST(StringUtil, Utf16CodePointIterator) { + using 
cru::Utf16CodePointIterator; std::u16string_view text = u"aπ你🤣!"; - Utf16Iterator i{text}; - ASSERT_EQ(i.Next(), 0x0061); - ASSERT_EQ(i.Next(), 0x03C0); - ASSERT_EQ(i.Next(), 0x4F60); - ASSERT_EQ(i.Next(), 0x1F923); - ASSERT_EQ(i.Next(), 0x0021); - ASSERT_EQ(i.Next(), k_invalid_code_point); - ASSERT_EQ(i.Previous(), 0x0021); - ASSERT_EQ(i.Previous(), 0x1F923); - ASSERT_EQ(i.Previous(), 0x4F60); - ASSERT_EQ(i.Previous(), 0x03C0); - ASSERT_EQ(i.Previous(), 0x0061); - ASSERT_EQ(i.Previous(), k_invalid_code_point); + std::vector<cru::CodePoint> code_points; + + for (auto cp : Utf16CodePointIterator{text}) { + code_points.push_back(cp); + } + + std::vector<cru::CodePoint> expected_code_points{0x0061, 0x03C0, 0x4F60, + 0x1F923, 0x0021}; + + ASSERT_EQ(code_points, expected_code_points); } // TEST(WinString, IndexUtf8ToUtf16) { |