aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorcrupest <crupest@outlook.com>2019-12-24 00:16:30 +0800
committercrupest <crupest@outlook.com>2019-12-24 00:16:30 +0800
commit0a25a6f5e3ece27791999d45e8aa83d83eb796d0 (patch)
treed1b4cad51424fc9209aa89f956f8eb4547b201f7
parent6ad6638adf64d958cdae44ce1df6a8a3787fed84 (diff)
downloadcru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.tar.gz
cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.tar.bz2
cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.zip
...
-rw-r--r--.vscode/settings.json122
-rw-r--r--include/cru/win/string.hpp81
-rw-r--r--src/win/string.cpp100
3 files changed, 241 insertions, 62 deletions
diff --git a/.vscode/settings.json b/.vscode/settings.json
index edf04435..2b5cd528 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,62 +1,62 @@
{
- "files.associations": {
- "algorithm": "cpp",
- "array": "cpp",
- "cctype": "cpp",
- "chrono": "cpp",
- "cmath": "cpp",
- "concepts": "cpp",
- "cstddef": "cpp",
- "cstdint": "cpp",
- "cstdio": "cpp",
- "cstdlib": "cpp",
- "cstring": "cpp",
- "ctime": "cpp",
- "cwchar": "cpp",
- "exception": "cpp",
- "forward_list": "cpp",
- "functional": "cpp",
- "initializer_list": "cpp",
- "ios": "cpp",
- "iosfwd": "cpp",
- "iostream": "cpp",
- "istream": "cpp",
- "iterator": "cpp",
- "limits": "cpp",
- "list": "cpp",
- "map": "cpp",
- "memory": "cpp",
- "new": "cpp",
- "optional": "cpp",
- "ostream": "cpp",
- "ratio": "cpp",
- "stdexcept": "cpp",
- "streambuf": "cpp",
- "string": "cpp",
- "system_error": "cpp",
- "tuple": "cpp",
- "type_traits": "cpp",
- "typeinfo": "cpp",
- "unordered_map": "cpp",
- "utility": "cpp",
- "vector": "cpp",
- "xfacet": "cpp",
- "xhash": "cpp",
- "xiosbase": "cpp",
- "xlocale": "cpp",
- "xlocinfo": "cpp",
- "xlocnum": "cpp",
- "xmemory": "cpp",
- "xstddef": "cpp",
- "xstring": "cpp",
- "xtr1common": "cpp",
- "xtree": "cpp",
- "xutility": "cpp",
- "atomic": "cpp",
- "clocale": "cpp",
- "condition_variable": "cpp",
- "mutex": "cpp",
- "shared_mutex": "cpp",
- "thread": "cpp"
- }
-} \ No newline at end of file
+ "files.associations": {
+ "algorithm": "cpp",
+ "array": "cpp",
+ "cctype": "cpp",
+ "chrono": "cpp",
+ "cmath": "cpp",
+ "concepts": "cpp",
+ "cstddef": "cpp",
+ "cstdint": "cpp",
+ "cstdio": "cpp",
+ "cstdlib": "cpp",
+ "cstring": "cpp",
+ "ctime": "cpp",
+ "cwchar": "cpp",
+ "exception": "cpp",
+ "forward_list": "cpp",
+ "functional": "cpp",
+ "initializer_list": "cpp",
+ "ios": "cpp",
+ "iosfwd": "cpp",
+ "iostream": "cpp",
+ "istream": "cpp",
+ "iterator": "cpp",
+ "limits": "cpp",
+ "list": "cpp",
+ "map": "cpp",
+ "memory": "cpp",
+ "new": "cpp",
+ "optional": "cpp",
+ "ostream": "cpp",
+ "ratio": "cpp",
+ "stdexcept": "cpp",
+ "streambuf": "cpp",
+ "string": "cpp",
+ "system_error": "cpp",
+ "tuple": "cpp",
+ "type_traits": "cpp",
+ "typeinfo": "cpp",
+ "unordered_map": "cpp",
+ "utility": "cpp",
+ "vector": "cpp",
+ "xfacet": "cpp",
+ "xhash": "cpp",
+ "xiosbase": "cpp",
+ "xlocale": "cpp",
+ "xlocinfo": "cpp",
+ "xlocnum": "cpp",
+ "xmemory": "cpp",
+ "xstddef": "cpp",
+ "xstring": "cpp",
+ "xtr1common": "cpp",
+ "xtree": "cpp",
+ "xutility": "cpp",
+ "atomic": "cpp",
+ "clocale": "cpp",
+ "condition_variable": "cpp",
+ "mutex": "cpp",
+ "shared_mutex": "cpp",
+ "thread": "cpp"
+ }
+}
diff --git a/include/cru/win/string.hpp b/include/cru/win/string.hpp
index 7a12e47e..75395052 100644
--- a/include/cru/win/string.hpp
+++ b/include/cru/win/string.hpp
@@ -1,10 +1,89 @@
+/*
+Because of the text-encoding problems on Windows, I wrote some functions
+related to text encoding here. The UTF-8 and UTF-16 conversion functions are
+provided by the Win32 API. However, the Win32 API does not provide any function
+for character iteration or indexing by code point. (At least I haven't found
+one.) I don't use ICU because it is not easy to build on Windows, and the
+version bundled with Windows
+(https://docs.microsoft.com/en-us/windows/win32/intl/international-components-for-unicode--icu-)
+is only available since the Windows 10 Creators Update.
+
+Luckily, both the UTF-8 and UTF-16 encodings are easy to learn and program with
+if we only do simple iteration rather than sophisticated handling of
+complicated error situations. (And I learned the internals of the encodings
+along the way.)
+*/
+
#pragma once
#include "win_pre_config.hpp"
+#include "cru/common/base.hpp"
+
+#include <cstdint>
+#include <stdexcept>
#include <string>
#include <string_view>
namespace cru::platform::win {
std::string ToUtf8String(const std::wstring_view& string);
std::wstring ToUtf16String(const std::string_view& string);
-}
+
+// Type used for decoded code points. It is signed so that the negative
+// sentinel below can share the type with real code point values.
+using CodePoint = std::int32_t;
+// Sentinel returned by the iterators' Next() when there is nothing left to
+// read.
+constexpr CodePoint k_code_point_end = -1;
+
+// Exception type for text decoding failures (see the Next() methods of the
+// iterators declared below).
+class TextEncodeException : public std::runtime_error {
+ public:
+ // Inherit all std::runtime_error constructors (message-taking ctors).
+ using runtime_error::runtime_error;
+};
+
+// Forward iterator over the code points of a UTF-8 encoded string.
+//
+// The iterator stores a std::string_view, i.e. it does not own the text; the
+// viewed buffer must stay alive while the iterator is used.
+class Utf8Iterator : public Object {
+ public:
+ // Creates an iterator positioned at the beginning of `string`.
+ Utf8Iterator(const std::string_view& string) : string_(string) {}
+
+ CRU_DEFAULT_COPY(Utf8Iterator)
+ CRU_DEFAULT_MOVE(Utf8Iterator)
+
+ ~Utf8Iterator() = default;
+
+ public:
+ // Resets the current position to the first code unit.
+ void SetToHead() { position_ = 0; }
+
+ // Advance current position and get next code point. Return k_code_point_end
+ // if there is no next code unit(point). Throw TextEncodeException if decoding
+ // fails.
+ CodePoint Next();
+
+ // Returns the current position as an index into the viewed string, counted
+ // in code units (bytes), not in code points.
+ int CurrentPosition() const { return this->position_; }
+
+ private:
+ std::string_view string_;
+ // Index of the next code unit to read.
+ int position_ = 0;
+};
+
+// Forward iterator over the code points of a UTF-16 encoded wide string.
+//
+// The iterator stores a std::wstring_view, i.e. it does not own the text; the
+// viewed buffer must stay alive while the iterator is used.
+class Utf16Iterator : public Object {
+ // Each wchar_t is treated as one UTF-16 code unit, so it must be 2 bytes.
+ static_assert(sizeof(wchar_t) == 2,
+ "Emmm, according to my knowledge, wchar_t should be 2-length on "
+ "Windows. If not, Utf16 will be broken.");
+
+ public:
+ // Creates an iterator positioned at the beginning of `string`.
+ Utf16Iterator(const std::wstring_view& string) : string_(string) {}
+
+ CRU_DEFAULT_COPY(Utf16Iterator)
+ CRU_DEFAULT_MOVE(Utf16Iterator)
+
+ ~Utf16Iterator() = default;
+
+ public:
+ // Resets the current position to the first code unit.
+ void SetToHead() { position_ = 0; }
+
+ // Advance current position and get next code point. Return k_code_point_end
+ // if there is no next code unit(point). Throw TextEncodeException if decoding
+ // fails.
+ CodePoint Next();
+
+ // Returns the current position as an index into the viewed string, counted
+ // in 16-bit code units, not in code points.
+ int CurrentPosition() const { return this->position_; }
+
+ private:
+ std::wstring_view string_;
+ // Index of the next code unit to read.
+ int position_ = 0;
+};
+} // namespace cru::platform::win
diff --git a/src/win/string.cpp b/src/win/string.cpp
index 84906f6b..c8b0ca87 100644
--- a/src/win/string.cpp
+++ b/src/win/string.cpp
@@ -2,6 +2,8 @@
#include "cru/win/exception.hpp"
+#include <type_traits>
+
namespace cru::platform::win {
std::string ToUtf8String(const std::wstring_view& string) {
if (string.empty()) return std::string{};
@@ -43,4 +45,102 @@ std::wstring ToUtf16String(const std::string_view& string) {
"Failed to convert wide string to UTF-16.");
return result;
}
+
+// Returns the lowest `number_of_bit` bits of `n` as a CodePoint.
+// Only available for unsigned integer types (enforced through enable_if).
+template <typename UInt, int number_of_bit>
+inline std::enable_if_t<std::is_unsigned_v<UInt>, CodePoint> ExtractBits(
+    UInt n) {
+  // Mask with exactly the lowest `number_of_bit` bits set.
+  constexpr unsigned low_bits_mask = (1u << number_of_bit) - 1u;
+  const auto masked = n & low_bits_mask;
+  return static_cast<CodePoint>(masked);
+}
+
+// Decodes the UTF-8 sequence that starts at the current position and advances
+// the position past the bytes it consumed.
+//
+// Returns the decoded code point, or k_code_point_end when the position is
+// already at the end of the string.
+//
+// Throws TextEncodeException when the first byte is a continuation byte
+// (0b10xxxxxx) or when the string ends in the middle of a multi-byte
+// sequence. Only when CRU_DEBUG is defined, it also throws for a continuation
+// byte that is not 0b10xxxxxx and for a 4-byte lead byte that is not
+// 0b11110xxx. Overlong encodings, surrogate values and values above the
+// Unicode range are not rejected. When an exception is thrown the position
+// has already been moved past the bytes read so far.
+CodePoint Utf8Iterator::Next() {
+  if (position_ == static_cast<int>(string_.length())) return k_code_point_end;
+
+  const auto cu0 = static_cast<std::uint8_t>(string_[position_++]);
+
+  // Reads one continuation byte and returns its 6 payload bits.
+  auto read_next_following_code = [this]() -> CodePoint {
+    if (this->position_ == static_cast<int>(string_.length()))
+      throw TextEncodeException(
+          "Unexpected end when read continuing byte of multi-byte code point.");
+
+#ifdef CRU_DEBUG
+    const auto u = static_cast<std::uint8_t>(string_[position_]);
+    if (!(u & (1u << 7)) || (u & (1u << 6))) {
+      throw TextEncodeException(
+          "Unexpected bad-format (not 0b10xxxxxx) continuing byte of "
+          "multi-byte code point.");
+    }
+#endif
+
+    return ExtractBits<std::uint8_t, 6>(string_[position_++]);
+  };
+
+  // 0b0xxxxxxx: single-byte (ASCII) code point.
+  if (!((1u << 7) & cu0)) return static_cast<CodePoint>(cu0);
+
+  // 0b10xxxxxx: a continuation byte can not start a code point.
+  if (!((1u << 6) & cu0)) {
+    throw TextEncodeException(
+        "Unexpected bad-format (0b10xxxxxx) begin byte of a code point.");
+  }
+
+  // 0b110xxxxx: 2-length code point.
+  if (!((1u << 5) & cu0)) {
+    const CodePoint s0 = ExtractBits<std::uint8_t, 5>(cu0) << 6;
+    const CodePoint s1 = read_next_following_code();
+    return s0 + s1;
+  }
+
+  // 0b1110xxxx: 3-length code point.
+  if (!((1u << 4) & cu0)) {
+    const CodePoint s0 = ExtractBits<std::uint8_t, 4>(cu0) << (6 * 2);
+    const CodePoint s1 = read_next_following_code() << 6;
+    const CodePoint s2 = read_next_following_code();
+    return s0 + s1 + s2;
+  }
+
+  // 0b11110xxx: 4-length code point.
+#ifdef CRU_DEBUG
+  if (cu0 & (1u << 3)) {
+    throw TextEncodeException(
+        "Unexpected bad-format begin byte (not 0b11110xxx) of 4-byte "
+        "code point.");
+  }
+#endif
+
+  const CodePoint s0 = ExtractBits<std::uint8_t, 3>(cu0) << (6 * 3);
+  const CodePoint s1 = read_next_following_code() << (6 * 2);
+  const CodePoint s2 = read_next_following_code() << 6;
+  const CodePoint s3 = read_next_following_code();
+  return s0 + s1 + s2 + s3;
+}
+
+// Decodes the UTF-16 code point that starts at the current position and
+// advances the position past the code unit(s) it consumed.
+//
+// Returns the decoded code point, or k_code_point_end when the position is
+// already at the end of the string. Throws TextEncodeException when the first
+// unit is a low surrogate or when the string ends after a high surrogate. Only
+// when CRU_DEBUG is defined, it also throws when the unit following a high
+// surrogate is not a low surrogate.
+CodePoint Utf16Iterator::Next() {
+  const auto unit_count = static_cast<int>(string_.length());
+  if (position_ == unit_count) return k_code_point_end;
+
+  const auto lead = static_cast<std::uint16_t>(string_[position_++]);
+
+  // Anything outside [0xd800, 0xe000) is a complete code point by itself.
+  const bool in_surrogate_range = lead >= 0xd800u && lead < 0xe000u;
+  if (!in_surrogate_range) return static_cast<CodePoint>(lead);
+
+  // A low surrogate (above 0xdbff) is not allowed to open a pair.
+  if (lead > 0xdbffu) {
+    throw TextEncodeException(
+        "Unexpected bad-format first code unit of surrogate pair.");
+  }
+
+  if (position_ == unit_count) {
+    throw TextEncodeException(
+        "Unexpected end when reading second code unit of surrogate pair.");
+  }
+
+  const auto trail = static_cast<std::uint16_t>(string_[position_++]);
+
+#ifdef CRU_DEBUG
+  const bool is_low_surrogate = trail >= 0xDC00u && trail <= 0xdfffu;
+  if (!is_low_surrogate) {
+    throw TextEncodeException(
+        "Unexpected bad-format second code unit of surrogate pair.");
+  }
+#endif
+
+  // Each surrogate carries 10 payload bits; the pair encodes value - 0x10000.
+  const auto high_part = ExtractBits<std::uint16_t, 10>(lead) << 10;
+  const auto low_part = ExtractBits<std::uint16_t, 10>(trail);
+  return high_part + low_part + 0x10000;
+}
+
} // namespace cru::platform::win