...

author: crupest <crupest@outlook.com> 2019-12-24 00:16:30 +0800
committer: crupest <crupest@outlook.com> 2019-12-24 00:16:30 +0800
commit: 0a25a6f5e3ece27791999d45e8aa83d83eb796d0 (patch)
tree: d1b4cad51424fc9209aa89f956f8eb4547b201f7
parent: 6ad6638adf64d958cdae44ce1df6a8a3787fed84 (diff)
download: cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.tar.gz
cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.tar.bz2
cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.zip
3 files changed, 241 insertions, 62 deletions
diff --git a/.vscode/settings.json b/.vscode/settings.json
index edf04435..2b5cd528 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,62 +1,62 @@
 {
-    "files.associations": {
-        "algorithm": "cpp",
-        "array": "cpp",
-        "cctype": "cpp",
-        "chrono": "cpp",
-        "cmath": "cpp",
-        "concepts": "cpp",
-        "cstddef": "cpp",
-        "cstdint": "cpp",
-        "cstdio": "cpp",
-        "cstdlib": "cpp",
-        "cstring": "cpp",
-        "ctime": "cpp",
-        "cwchar": "cpp",
-        "exception": "cpp",
-        "forward_list": "cpp",
-        "functional": "cpp",
-        "initializer_list": "cpp",
-        "ios": "cpp",
-        "iosfwd": "cpp",
-        "iostream": "cpp",
-        "istream": "cpp",
-        "iterator": "cpp",
-        "limits": "cpp",
-        "list": "cpp",
-        "map": "cpp",
-        "memory": "cpp",
-        "new": "cpp",
-        "optional": "cpp",
-        "ostream": "cpp",
-        "ratio": "cpp",
-        "stdexcept": "cpp",
-        "streambuf": "cpp",
-        "string": "cpp",
-        "system_error": "cpp",
-        "tuple": "cpp",
-        "type_traits": "cpp",
-        "typeinfo": "cpp",
-        "unordered_map": "cpp",
-        "utility": "cpp",
-        "vector": "cpp",
-        "xfacet": "cpp",
-        "xhash": "cpp",
-        "xiosbase": "cpp",
-        "xlocale": "cpp",
-        "xlocinfo": "cpp",
-        "xlocnum": "cpp",
-        "xmemory": "cpp",
-        "xstddef": "cpp",
-        "xstring": "cpp",
-        "xtr1common": "cpp",
-        "xtree": "cpp",
-        "xutility": "cpp",
-        "atomic": "cpp",
-        "clocale": "cpp",
-        "condition_variable": "cpp",
-        "mutex": "cpp",
-        "shared_mutex": "cpp",
-        "thread": "cpp"
-    }
-}
-\ No newline at end of file
+  "files.associations": {
+    "algorithm": "cpp",
+    "array": "cpp",
+    "cctype": "cpp",
+    "chrono": "cpp",
+    "cmath": "cpp",
+    "concepts": "cpp",
+    "cstddef": "cpp",
+    "cstdint": "cpp",
+    "cstdio": "cpp",
+    "cstdlib": "cpp",
+    "cstring": "cpp",
+    "ctime": "cpp",
+    "cwchar": "cpp",
+    "exception": "cpp",
+    "forward_list": "cpp",
+    "functional": "cpp",
+    "initializer_list": "cpp",
+    "ios": "cpp",
+    "iosfwd": "cpp",
+    "iostream": "cpp",
+    "istream": "cpp",
+    "iterator": "cpp",
+    "limits": "cpp",
+    "list": "cpp",
+    "map": "cpp",
+    "memory": "cpp",
+    "new": "cpp",
+    "optional": "cpp",
+    "ostream": "cpp",
+    "ratio": "cpp",
+    "stdexcept": "cpp",
+    "streambuf": "cpp",
+    "string": "cpp",
+    "system_error": "cpp",
+    "tuple": "cpp",
+    "type_traits": "cpp",
+    "typeinfo": "cpp",
+    "unordered_map": "cpp",
+    "utility": "cpp",
+    "vector": "cpp",
+    "xfacet": "cpp",
+    "xhash": "cpp",
+    "xiosbase": "cpp",
+    "xlocale": "cpp",
+    "xlocinfo": "cpp",
+    "xlocnum": "cpp",
+    "xmemory": "cpp",
+    "xstddef": "cpp",
+    "xstring": "cpp",
+    "xtr1common": "cpp",
+    "xtree": "cpp",
+    "xutility": "cpp",
+    "atomic": "cpp",
+    "clocale": "cpp",
+    "condition_variable": "cpp",
+    "mutex": "cpp",
+    "shared_mutex": "cpp",
+    "thread": "cpp"
+  }
+}
diff --git a/include/cru/win/string.hpp b/include/cru/win/string.hpp
index 7a12e47e..75395052 100644
--- a/include/cru/win/string.hpp
+++ b/include/cru/win/string.hpp
@@ -1,10 +1,89 @@
+/*
+Because of the text encoding problem on Windows, I wrote some functions related
+to text encoding here. The UTF-8 and UTF-16 conversion functions are provided by
+the Win32 API. However, the Win32 API does not provide any function for
+character iteration or indexing by code point (at least I haven't found one). I
+don't use ICU because it is not easy to build on Windows and the version bundled
+with Windows (https://docs.microsoft.com/en-us/windows/win32/intl/international-components-for-unicode--icu-)
+is only available since the Windows 10 Creators Update.
+
+Luckily, both the UTF-8 and UTF-16 encodings are easy to learn and program with
+if we only do simple iteration rather than sophisticated handling of
+complicated error situations. (And I learned the internals of the encodings
+along the way.)
+*/
+
 #pragma once
 #include "win_pre_config.hpp"
 
+#include "cru/common/base.hpp"
+
+#include <cstdint>
+#include <stdexcept>
 #include <string>
 #include <string_view>
 
 namespace cru::platform::win {
 std::string ToUtf8String(const std::wstring_view& string);
 std::wstring ToUtf16String(const std::string_view& string);
-}
+
+using CodePoint = std::int32_t;  // Unicode code point, or k_code_point_end.
+constexpr CodePoint k_code_point_end = -1;  // Next() result at end of text.
+
+class TextEncodeException : public std::runtime_error {  // Decoding failure.
+ public:
+  using runtime_error::runtime_error;  // Inherit all constructors.
+};
+
+class Utf8Iterator : public Object {  // Iterates UTF-8 text by code point.
+ public:
+  Utf8Iterator(const std::string_view& string) : string_(string) {}
+
+  CRU_DEFAULT_COPY(Utf8Iterator)
+  CRU_DEFAULT_MOVE(Utf8Iterator)
+
+  ~Utf8Iterator() = default;
+
+ public:
+  void SetToHead() { position_ = 0; }  // Restart from the first code unit.
+
+  // Decode the code point that starts at the current position and advance
+  // past it. Return k_code_point_end if the end of the string was already
+  // reached. Throw TextEncodeException if decoding fails.
+  CodePoint Next();
+
+  int CurrentPosition() const { return this->position_; }
+
+ private:
+  std::string_view string_;  // Non-owning view of the text being decoded.
+  int position_ = 0;         // Index of the next code unit (byte) to read.
+};
+
+class Utf16Iterator : public Object {  // Iterates UTF-16 text by code point.
+  static_assert(sizeof(wchar_t) == 2,
+                "Emmm, according to my knowledge, wchar_t should be 2-length on "
+                "Windows. If not, Utf16 will be broken.");
+
+ public:
+  Utf16Iterator(const std::wstring_view& string) : string_(string) {}
+
+  CRU_DEFAULT_COPY(Utf16Iterator)
+  CRU_DEFAULT_MOVE(Utf16Iterator)
+
+  ~Utf16Iterator() = default;
+
+ public:
+  void SetToHead() { position_ = 0; }  // Restart from the first code unit.
+
+  // Decode the code point that starts at the current position and advance
+  // past it. Return k_code_point_end if the end of the string was already
+  // reached. Throw TextEncodeException if decoding fails.
+  CodePoint Next();
+
+  int CurrentPosition() const { return this->position_; }
+
+ private:
+  std::wstring_view string_;  // Non-owning view of the text being decoded.
+  int position_ = 0;          // Index of the next 16-bit code unit to read.
+};
+}  // namespace cru::platform::win
diff --git a/src/win/string.cpp b/src/win/string.cpp
index 84906f6b..c8b0ca87 100644
--- a/src/win/string.cpp
+++ b/src/win/string.cpp
@@ -2,6 +2,8 @@
 
 #include "cru/win/exception.hpp"
 
+#include <type_traits>
+
 namespace cru::platform::win {
 std::string ToUtf8String(const std::wstring_view& string) {
   if (string.empty()) return std::string{};
@@ -43,4 +45,102 @@ std::wstring ToUtf16String(const std::string_view& string) {
                           "Failed to convert wide string to UTF-16.");
   return result;
 }
+
+template <typename UInt, int number_of_bit>
+inline std::enable_if_t<std::is_unsigned_v<UInt>, CodePoint> ExtractBits(
+    UInt n) {  // Keeps only the lowest number_of_bit bits of n.
+  return static_cast<CodePoint>(n & ((1u << number_of_bit) - 1));
+}
+
+CodePoint Utf8Iterator::Next() {
+  if (position_ == static_cast<int>(string_.length())) return k_code_point_end;
+
+  const auto cu0 = static_cast<std::uint8_t>(string_[position_++]);
+  // Consumes one continuation byte and returns its low 6 (payload) bits.
+  auto read_next_following_code = [this]() -> CodePoint {
+    if (this->position_ == static_cast<int>(string_.length()))
+      throw TextEncodeException(
+          "Unexpected end when read continuing byte of multi-byte code point.");
+    // The 0b10xxxxxx format check is only compiled when CRU_DEBUG is defined.
+#ifdef CRU_DEBUG
+    const auto u = static_cast<std::uint8_t>(string_[position_]);
+    if (!(u & (1u << 7)) || (u & (1u << 6))) {
+      throw TextEncodeException(
+          "Unexpected bad-format (not 0b10xxxxxx) continuing byte of "
+          "multi-byte code point.");
+    }
+#endif
+
+    return ExtractBits<std::uint8_t, 6>(string_[position_++]);
+  };
+  // The number of leading 1 bits in cu0 selects the length of the sequence.
+  if ((1u << 7) & cu0) {
+    if ((1u << 6) & cu0) {      // 2- to 4-byte sequence
+      if ((1u << 5) & cu0) {    // 3- or 4-byte sequence
+        if ((1u << 4) & cu0) {  // 4-byte sequence, lead byte 0b11110xxx
+#ifdef CRU_DEBUG
+          if (cu0 & (1u << 3)) {
+            throw TextEncodeException(
+                "Unexpected bad-format begin byte (not 0b11110xxx) of 4-byte "
+                "code point.");
+          }
+#endif
+
+          const CodePoint s0 = ExtractBits<std::uint8_t, 3>(cu0) << (6 * 3);
+          const CodePoint s1 = read_next_following_code() << (6 * 2);
+          const CodePoint s2 = read_next_following_code() << 6;
+          const CodePoint s3 = read_next_following_code();
+          return s0 + s1 + s2 + s3;
+        } else {  // 3-byte sequence, lead byte 0b1110xxxx
+          const CodePoint s0 = ExtractBits<std::uint8_t, 4>(cu0) << (6 * 2);
+          const CodePoint s1 = read_next_following_code() << 6;
+          const CodePoint s2 = read_next_following_code();
+          return s0 + s1 + s2;
+        }
+      } else {  // 2-byte sequence, lead byte 0b110xxxxx
+        const CodePoint s0 = ExtractBits<std::uint8_t, 5>(cu0) << 6;
+        const CodePoint s1 = read_next_following_code();
+        return s0 + s1;
+      }
+    } else {  // 0b10xxxxxx: a continuation byte cannot start a sequence
+      throw TextEncodeException(
+          "Unexpected bad-format (0b10xxxxxx) begin byte of a code point.");
+    }
+  } else {  // 0b0xxxxxxx: the byte itself is the code point
+    return static_cast<CodePoint>(cu0);
+  }
+}
+
+CodePoint Utf16Iterator::Next() {
+  if (position_ == static_cast<int>(string_.length())) return k_code_point_end;
+
+  const auto cu0 = static_cast<std::uint16_t>(string_[position_++]);
+  // Surrogates occupy 0xd800..0xdfff; any other unit is a whole code point.
+  if (cu0 < 0xd800u || cu0 >= 0xe000u) {  // not a surrogate: 1 code unit
+    return static_cast<CodePoint>(cu0);
+  } else if (cu0 <= 0xdbffu) {  // first half of a pair: 2 code units
+    if (position_ == static_cast<int>(string_.length())) {
+      throw TextEncodeException(
+          "Unexpected end when reading second code unit of surrogate pair.");
+    }
+    const auto cu1 = static_cast<std::uint16_t>(string_[position_++]);
+    // The second-unit range check is only compiled when CRU_DEBUG is defined.
+#ifdef CRU_DEBUG
+    if (cu1 < 0xDC00u || cu1 > 0xdfffu) {
+      throw TextEncodeException(
+          "Unexpected bad-format second code unit of surrogate pair.");
+    }
+#endif
+    // Combine the low 10 bits of each unit, then add the 0x10000 offset.
+    const auto s0 = ExtractBits<std::uint16_t, 10>(cu0) << 10;
+    const auto s1 = ExtractBits<std::uint16_t, 10>(cu1);
+
+    return s0 + s1 + 0x10000;
+
+  } else {  // 0xdc00..0xdfff: second-half surrogate seen first
+    throw TextEncodeException(
+        "Unexpected bad-format first code unit of surrogate pair.");
+  }
+}
+
 }  // namespace cru::platform::win
author	crupest <crupest@outlook.com>	2019-12-24 00:16:30 +0800
committer	crupest <crupest@outlook.com>	2019-12-24 00:16:30 +0800
commit	0a25a6f5e3ece27791999d45e8aa83d83eb796d0 (patch)
tree	d1b4cad51424fc9209aa89f956f8eb4547b201f7
parent	6ad6638adf64d958cdae44ce1df6a8a3787fed84 (diff)
download	cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.tar.gz cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.tar.bz2 cru-0a25a6f5e3ece27791999d45e8aa83d83eb796d0.zip