diff options
author | crupest <crupest@outlook.com> | 2019-12-24 00:16:30 +0800 |
---|---|---|
committer | crupest <crupest@outlook.com> | 2019-12-24 00:16:30 +0800 |
commit | 0a25a6f5e3ece27791999d45e8aa83d83eb796d0 (patch) | |
tree | d1b4cad51424fc9209aa89f956f8eb4547b201f7 /src | |
parent | 6ad6638adf64d958cdae44ce1df6a8a3787fed84 (diff) | |
download | cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.tar.gz cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.tar.bz2 cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.zip |
...
Diffstat (limited to 'src')
-rw-r--r-- | src/win/string.cpp | 100 |
1 files changed, 100 insertions, 0 deletions
diff --git a/src/win/string.cpp b/src/win/string.cpp index 84906f6b..c8b0ca87 100644 --- a/src/win/string.cpp +++ b/src/win/string.cpp @@ -2,6 +2,8 @@ #include "cru/win/exception.hpp" +#include <type_traits> + namespace cru::platform::win { std::string ToUtf8String(const std::wstring_view& string) { if (string.empty()) return std::string{}; @@ -43,4 +45,102 @@ std::wstring ToUtf16String(const std::string_view& string) { "Failed to convert wide string to UTF-16."); return result; } + +template <typename UInt, int number_of_bit> +inline std::enable_if_t<std::is_unsigned_v<UInt>, CodePoint> ExtractBits( + UInt n) { + return static_cast<CodePoint>(n & ((1u << number_of_bit) - 1)); +} + +CodePoint Utf8Iterator::Next() { + if (position_ == static_cast<int>(string_.length())) return k_code_point_end; + + const auto cu0 = static_cast<std::uint8_t>(string_[position_++]); + + auto read_next_folowing_code = [this]() -> CodePoint { + if (this->position_ == static_cast<int>(string_.length())) + throw TextEncodeException( + "Unexpected end when read continuing byte of multi-byte code point."); + +#ifdef CRU_DEBUG + const auto u = static_cast<std::uint8_t>(string_[position_]); + if (!(u & (1u << 7)) || (u & (1u << 6))) { + throw TextEncodeException( + "Unexpected bad-format (not 0b10xxxxxx) continuing byte of " + "multi-byte code point."); + } +#endif + + return ExtractBits<std::uint8_t, 6>(string_[position_++]); + }; + + if ((1u << 7) & cu0) { + if ((1u << 6) & cu0) { // 2~4-length code point + if ((1u << 5) & cu0) { // 3~4-length code point + if ((1u << 4) & cu0) { // 4-length code point +#ifdef CRU_DEBUG + if (cu0 & (1u << 3)) { + throw TextEncodeException( + "Unexpected bad-format begin byte (not 0b10xxxxxx) of 4-byte " + "code point."); + } +#endif + + const CodePoint s0 = ExtractBits<std::uint8_t, 3>(cu0) << (6 * 3); + const CodePoint s1 = read_next_folowing_code() << (6 * 2); + const CodePoint s2 = read_next_folowing_code() << 6; + const CodePoint s3 = read_next_folowing_code(); + return s0 + s1 + s2 + s3; + } else { // 3-length code point + const CodePoint s0 = ExtractBits<std::uint8_t, 4>(cu0) << (6 * 2); + const CodePoint s1 = read_next_folowing_code() << 6; + const CodePoint s2 = read_next_folowing_code(); + return s0 + s1 + s2; + } + } else { // 2-length code point + const CodePoint s0 = ExtractBits<std::uint8_t, 5>(cu0) << 6; + const CodePoint s1 = read_next_folowing_code(); + return s0 + s1; + } + } else { + throw TextEncodeException( + "Unexpected bad-format (0b10xxxxxx) begin byte of a code point."); + } + } else { + return static_cast<CodePoint>(cu0); + } +} + +CodePoint Utf16Iterator::Next() { + if (position_ == static_cast<int>(string_.length())) return k_code_point_end; + + const auto cu0 = static_cast<std::uint16_t>(string_[position_++]); + + if (cu0 < 0xd800u || cu0 >= 0xe000u) { // 1-length code point + return static_cast<CodePoint>(cu0); + } else if (cu0 <= 0xdbffu) { // 2-length code point + if (position_ == static_cast<int>(string_.length())) { + throw TextEncodeException( + "Unexpected end when reading second code unit of surrogate pair."); + } + const auto cu1 = static_cast<std::uint16_t>(string_[position_++]); + +#ifdef CRU_DEBUG + if (cu1 < 0xDC00u || cu1 > 0xdfffu) { + throw TextEncodeException( + "Unexpected bad-format second code unit of surrogate pair."); + } +#endif + + const auto s0 = ExtractBits<std::uint16_t, 10>(cu0) << 10; + const auto s1 = ExtractBits<std::uint16_t, 10>(cu1); + + return s0 + s1 + 0x10000; + + } else { + throw TextEncodeException( + "Unexpected bad-format first code unit of surrogate pair."); + } +} + } // namespace cru::platform::win |