aboutsummaryrefslogtreecommitdiff
path: root/include/cru/win/string.hpp
blob: 3d68cff7b688a8492228f64a169864f55525aee5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
Because of the text encoding problems on Windows, I wrote some functions
related to text encoding here. The utf-8 and utf-16 conversion functions are
provided by the win32 api. However, the win32 api does not provide any function
for character iteration or indexing by code point. (At least I haven't found
one.) I don't use icu because it is not easy to build on Windows and the
bundled version in Windows
(https://docs.microsoft.com/en-us/windows/win32/intl/international-components-for-unicode--icu-)
is only available after Windows 10 Creators Update.

Luckily, both the utf-8 and utf-16 encodings are easy to learn and program with
if we only do simple iteration rather than sophisticated handling of
complicated error situations. (And I learned the internals of the encodings
along the way.)
*/

#pragma once
#include "WinPreConfig.hpp"

#include "cru/common/Base.hpp"

#include <cstdint>
#include <stdexcept>
#include <string>
#include <string_view>

namespace cru::platform::win {
// Convert the wide-character (UTF-16) text in `string` into a UTF-8 encoded
// std::string. Only the declaration is present in this header; per the note at
// the top of this file, the conversion itself is provided by the Win32 API.
// NOTE(review): behavior on malformed input is decided by the implementation,
// which is not visible here — confirm before relying on it.
std::string ToUtf8String(const std::wstring_view& string);
// Convert the UTF-8 text in `string` into a wide-character (UTF-16)
// std::wstring. Only the declaration is present in this header; per the note at
// the top of this file, the conversion itself is provided by the Win32 API.
// NOTE(review): behavior on malformed input is decided by the implementation,
// which is not visible here — confirm before relying on it.
std::wstring ToUtf16String(const std::string_view& string);

// Report whether the code unit `c` falls anywhere inside the UTF-16 surrogate
// range [0xD800, 0xDFFF], i.e. it is either half of a surrogate pair.
inline bool IsSurrogatePair(wchar_t c) {
  const bool below_range = c < 0xD800;
  const bool above_range = c > 0xDFFF;
  return !(below_range || above_range);
}

// Report whether the code unit `c` is a leading (high) surrogate, i.e. lies in
// [0xD800, 0xDBFF].
inline bool IsSurrogatePairLeading(wchar_t c) {
  if (c < 0xD800) {
    return false;
  }
  return c <= 0xDBFF;
}

// Report whether the code unit `c` is a trailing (low) surrogate, i.e. lies in
// [0xDC00, 0xDFFF].
inline bool IsSurrogatePairTrailing(wchar_t c) {
  return !(c < 0xDC00) && !(c > 0xDFFF);
}

// A single Unicode code point. The type is signed, which leaves negative
// values free to act as sentinels (see k_code_point_end).
using CodePoint = std::int32_t;

// Sentinel value the iterators' Next() functions are documented to return when
// there is no further code point. Declared `inline` so every translation unit
// that includes this header shares one definition instead of getting its own
// internal-linkage copy.
inline constexpr CodePoint k_code_point_end = -1;

// Exception type used to report text decoding failures (the iterators' Next()
// functions are documented to throw it). It carries no state of its own: all
// std::runtime_error constructors are inherited, so it is built from a message
// exactly like std::runtime_error.
class TextEncodeException : public std::runtime_error {
 public:
  using std::runtime_error::runtime_error;
};

// Iterates over a UTF-8 encoded string view, yielding code points.
//
// Only a std::string_view is stored, so the iterator does not own the text;
// the viewed buffer must remain valid while the iterator is in use.
class Utf8Iterator : public Object {
 public:
  // Create an iterator over `string`, starting at position 0.
  Utf8Iterator(const std::string_view& string) : string_{string} {}

  CRU_DEFAULT_COPY(Utf8Iterator)
  CRU_DEFAULT_MOVE(Utf8Iterator)

  ~Utf8Iterator() = default;

 public:
  // Reset the current position back to the start of the string.
  void SetToHead() { position_ = 0; }

  // Move the current position forward and return the next code point.
  // Returns k_code_point_end when no further code unit (point) is available.
  // Throws TextEncodeException when decoding fails.
  CodePoint Next();

  // Current position within the string, as last set by the constructor,
  // SetToHead() or Next().
  int CurrentPosition() const { return position_; }

 private:
  std::string_view string_;  // Non-owning view of the text being iterated.
  int position_{0};          // Current position; 0 means the head.
};

// Iterates over a UTF-16 encoded wide string view, yielding code points.
//
// Only a std::wstring_view is stored, so the iterator does not own the text;
// the viewed buffer must remain valid while the iterator is in use.
class Utf16Iterator : public Object {
  // wchar_t is used as the UTF-16 code unit, so it has to be exactly 2 bytes.
  static_assert(
      sizeof(wchar_t) == 2,
      "Emmm, according to my knowledge, wchar_t should be 2-length on "
      "Windows. If not, Utf16 will be broken.");

 public:
  // Create an iterator over `string`, starting at position 0.
  Utf16Iterator(const std::wstring_view& string) : string_{string} {}

  CRU_DEFAULT_COPY(Utf16Iterator)
  CRU_DEFAULT_MOVE(Utf16Iterator)

  ~Utf16Iterator() = default;

 public:
  // Reset the current position back to the start of the string.
  void SetToHead() { position_ = 0; }

  // Move the current position forward and return the next code point.
  // Returns k_code_point_end when no further code unit (point) is available.
  // Throws TextEncodeException when decoding fails.
  CodePoint Next();

  // Current position within the string, as last set by the constructor,
  // SetToHead() or Next().
  int CurrentPosition() const { return position_; }

 private:
  std::wstring_view string_;  // Non-owning view of the text being iterated.
  int position_{0};           // Current position; 0 means the head.
};

// Judging by the name and parameters, maps the index `utf8_index` into
// `utf8_string` to the corresponding index into `utf16_string`. Only the
// declaration is present in this header.
// NOTE(review): presumably both views hold the same text and indices count
// code units (char / wchar_t) — confirm against the implementation, including
// what is returned for an out-of-range or mid-character index.
int IndexUtf8ToUtf16(const std::string_view& utf8_string, int utf8_index,
                     const std::wstring_view& utf16_string);

// Judging by the name and parameters, maps the index `utf16_index` into
// `utf16_string` to the corresponding index into `utf8_string`. Only the
// declaration is present in this header.
// NOTE(review): presumably both views hold the same text and indices count
// code units (wchar_t / char) — confirm against the implementation, including
// what is returned for an out-of-range or mid-surrogate index.
int IndexUtf16ToUtf8(const std::wstring_view& utf16_string, int utf16_index,
                     const std::string_view& utf8_string);

}  // namespace cru::platform::win