1 files changed, 130 insertions, 128 deletions
diff --git a/src/base/StringUtil.cpp b/src/base/StringUtil.cpp
index 4e622dab..581ebcab 100644
--- a/src/base/StringUtil.cpp
+++ b/src/base/StringUtil.cpp
@@ -7,8 +7,7 @@
 #include <compare>
 #include <string_view>
 
-namespace cru {
-namespace string {
+namespace cru::string {
 
 std::weak_ordering CaseInsensitiveCompare(std::string_view left,
                                           std::string_view right) {
@@ -78,18 +77,58 @@ std::vector<std::string> Split(std::string_view str, std::string_view sep,
 
   return result;
 }
-}  // namespace string
+
+namespace {
+
+template <typename CharType,
+          NextCodePointFunctionType<CharType> NextCodePointFunction>
+Index Until(const CharType* ptr, Index size, Index position,
+            const std::function<bool(CodePoint)>& predicate) {
+  if (position <= 0) return position;
+  while (true) {
+    Index p = position;
+    auto c = NextCodePointFunction(ptr, size, p, &position);
+    if (predicate(c)) return p;
+    if (c == k_invalid_code_point) return p;
+  }
+  UnreachableCode();
+}
+
+static bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; }
+
+template <typename CharType>
+using UntilFunctionType = Index (*)(const CharType*, Index, Index,
+                                    const std::function<bool(CodePoint)>&);
+
+template <typename CharType,
+          NextCodePointFunctionType<CharType> NextCodePointFunction,
+          UntilFunctionType<CharType> UntilFunction>
+Index Word(const CharType* ptr, Index size, Index position, bool* is_space) {
+  if (position <= 0) return position;
+  auto c = NextCodePointFunction(ptr, size, position, nullptr);
+  if (IsSpace(c)) {  // TODO: Currently only test against 0x20(space).
+    if (is_space) *is_space = true;
+    return UntilFunction(ptr, size, position,
+                         [](CodePoint c) { return !IsSpace(c); });
+  } else {
+    if (is_space) *is_space = false;
+    return UntilFunction(ptr, size, position,
+                         [](CodePoint c) { return IsSpace(c); });
+  }
+}
+
+}  // namespace
 
 using details::ExtractBits;
 
-CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current,
+CodePoint Utf8NextCodePoint(const Utf8CodeUnit* ptr, Index size, Index current,
                             Index* next_position) {
   CodePoint result;
 
   if (current >= size) {
     result = k_invalid_code_point;
   } else {
-    const auto cu0 = static_cast<std::uint8_t>(ptr[current++]);
+    const auto cu0 = static_cast<Utf8CodeUnit>(ptr[current++]);
 
     auto read_next_folowing_code = [ptr, size, &current]() -> CodePoint {
       if (current == size)
@@ -97,14 +136,14 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current,
             "Unexpected end when read continuing byte of multi-byte code "
             "point.");
 
-      const auto u = static_cast<std::uint8_t>(ptr[current]);
+      const auto u = static_cast<Utf8CodeUnit>(ptr[current]);
       if (!(u & (1u << 7)) || (u & (1u << 6))) {
         throw TextEncodeException(
             "Unexpected bad-format (not 0b10xxxxxx) continuing byte of "
             "multi-byte code point.");
       }
 
-      return ExtractBits<std::uint8_t, 6, CodePoint>(ptr[current++]);
+      return ExtractBits<Utf8CodeUnit, 6, CodePoint>(ptr[current++]);
     };
 
     if ((1u << 7) & cu0) {
@@ -117,21 +156,21 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current,
                   "code point.");
             }
 
-            const CodePoint s0 = ExtractBits<std::uint8_t, 3, CodePoint>(cu0)
+            const CodePoint s0 = ExtractBits<Utf8CodeUnit, 3, CodePoint>(cu0)
                                  << (6 * 3);
             const CodePoint s1 = read_next_folowing_code() << (6 * 2);
             const CodePoint s2 = read_next_folowing_code() << 6;
             const CodePoint s3 = read_next_folowing_code();
             result = s0 + s1 + s2 + s3;
           } else {  // 3-length code point
-            const CodePoint s0 = ExtractBits<std::uint8_t, 4, CodePoint>(cu0)
+            const CodePoint s0 = ExtractBits<Utf8CodeUnit, 4, CodePoint>(cu0)
                                  << (6 * 2);
             const CodePoint s1 = read_next_folowing_code() << 6;
             const CodePoint s2 = read_next_folowing_code();
             result = s0 + s1 + s2;
           }
         } else {  // 2-length code point
-          const CodePoint s0 = ExtractBits<std::uint8_t, 5, CodePoint>(cu0)
+          const CodePoint s0 = ExtractBits<Utf8CodeUnit, 5, CodePoint>(cu0)
                                << 6;
           const CodePoint s1 = read_next_folowing_code();
           result = s0 + s1;
@@ -149,8 +188,67 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current,
   return result;
 }
 
-CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current,
-                             Index* next_position) {
+CodePoint Utf8PreviousCodePoint(const Utf8CodeUnit* ptr, Index size,
+                                Index current, Index* previous_position) {
+  CRU_UNUSED(size)
+
+  CodePoint result;
+  if (current <= 0) {
+    result = k_invalid_code_point;
+  } else {
+    current--;
+    int i;
+    for (i = 0; i < 4; i++) {
+      if (IsUtf8LeadingByte(ptr[current])) {
+        break;
+      }
+      current--;
+    }
+    if (i == 4) {
+      throw TextEncodeException(
+          "Failed to find UTF-8 leading byte in 4 previous bytes.");
+    }
+
+    result = Utf8NextCodePoint(ptr, size, current, nullptr);
+  }
+
+  if (previous_position != nullptr) *previous_position = current;
+  return result;
+}
+
+bool Utf8IsValidInsertPosition(const Utf8CodeUnit* ptr, Index size,
+                               Index position) {
+  return position == 0 || position == size ||
+         (position > 0 && position < size && IsUtf8LeadingByte(ptr[position]));
+}
+
+Index Utf8BackwardUntil(const Utf8CodeUnit* ptr, Index size, Index position,
+                        const std::function<bool(CodePoint)>& predicate) {
+  return Until<Utf8CodeUnit, Utf8PreviousCodePoint>(ptr, size, position,
+                                                    predicate);
+}
+
+Index Utf8ForwardUntil(const Utf8CodeUnit* ptr, Index size, Index position,
+                       const std::function<bool(CodePoint)>& predicate) {
+  return Until<Utf8CodeUnit, Utf8NextCodePoint>(ptr, size, position, predicate);
+}
+
+static bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; }
+
+Index Utf8PreviousWord(const Utf8CodeUnit* ptr, Index size, Index position,
+                       bool* is_space) {
+  return Word<Utf8CodeUnit, Utf8PreviousCodePoint, Utf8BackwardUntil>(
+      ptr, size, position, is_space);
+}
+
+Index Utf8NextWord(const Utf8CodeUnit* ptr, Index size, Index position,
+                   bool* is_space) {
+  return Word<Utf8CodeUnit, Utf8NextCodePoint, Utf8ForwardUntil>(
+      ptr, size, position, is_space);
+}
+
+CodePoint Utf16NextCodePoint(const Utf16CodeUnit* ptr, Index size,
+                             Index current, Index* next_position) {
   CodePoint result;
 
   if (current >= size) {
@@ -172,8 +270,8 @@ CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current,
             "Unexpected bad-range second code unit of surrogate pair.");
       }
 
-      const auto s0 = ExtractBits<std::uint16_t, 10, CodePoint>(cu0) << 10;
-      const auto s1 = ExtractBits<std::uint16_t, 10, CodePoint>(cu1);
+      const auto s0 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu0) << 10;
+      const auto s1 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu1);
 
       result = s0 + s1 + 0x10000;
 
@@ -187,8 +285,8 @@ CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current,
   return result;
 }
 
-CodePoint Utf16PreviousCodePoint(const char16_t* ptr, Index size, Index current,
-                                 Index* previous_position) {
+CodePoint Utf16PreviousCodePoint(const Utf16CodeUnit* ptr, Index size,
+                                 Index current, Index* previous_position) {
   CRU_UNUSED(size)
 
   CodePoint result;
@@ -211,8 +309,8 @@ CodePoint Utf16PreviousCodePoint(const char16_t* ptr, Index size, Index current,
             "Unexpected bad-range first code unit of surrogate pair.");
       }
 
-      const auto s0 = ExtractBits<std::uint16_t, 10, CodePoint>(cu1) << 10;
-      const auto s1 = ExtractBits<std::uint16_t, 10, CodePoint>(cu0);
+      const auto s0 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu1) << 10;
+      const auto s1 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu0);
 
       result = s0 + s1 + 0x10000;
 
@@ -226,7 +324,7 @@ CodePoint Utf16PreviousCodePoint(const char16_t* ptr, Index size, Index current,
   return result;
 }
 
-bool Utf16IsValidInsertPosition(const char16_t* ptr, Index size,
+bool Utf16IsValidInsertPosition(const Utf16CodeUnit* ptr, Index size,
                                 Index position) {
   if (position < 0) return false;
   if (position > size) return false;
@@ -235,124 +333,28 @@ bool Utf16IsValidInsertPosition(const char16_t* ptr, Index size,
   return !IsUtf16SurrogatePairTrailing(ptr[position]);
 }
 
-Index Utf16BackwardUntil(const char16_t* ptr, Index size, Index position,
+Index Utf16BackwardUntil(const Utf16CodeUnit* ptr, Index size, Index position,
                          const std::function<bool(CodePoint)>& predicate) {
-  if (position <= 0) return position;
-  while (true) {
-    Index p = position;
-    auto c = Utf16PreviousCodePoint(ptr, size, p, &position);
-    if (predicate(c)) return p;
-    if (c == k_invalid_code_point) return p;
-  }
-  UnreachableCode();
+  return Until<Utf16CodeUnit, Utf16PreviousCodePoint>(ptr, size, position,
+                                                      predicate);
 }
 
-Index Utf16ForwardUntil(const char16_t* ptr, Index size, Index position,
+Index Utf16ForwardUntil(const Utf16CodeUnit* ptr, Index size, Index position,
                         const std::function<bool(CodePoint)>& predicate) {
-  if (position >= size) return position;
-  while (true) {
-    Index p = position;
-    auto c = Utf16NextCodePoint(ptr, size, p, &position);
-    if (predicate(c)) return p;
-    if (c == k_invalid_code_point) return p;
-  }
-  UnreachableCode();
+  return Until<Utf16CodeUnit, Utf16NextCodePoint>(ptr, size, position,
+                                                  predicate);
 }
 
-inline bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; }
-
-Index Utf16PreviousWord(const char16_t* ptr, Index size, Index position,
+Index Utf16PreviousWord(const Utf16CodeUnit* ptr, Index size, Index position,
                         bool* is_space) {
-  if (position <= 0) return position;
-  auto c = Utf16PreviousCodePoint(ptr, size, position, nullptr);
-  if (IsSpace(c)) {  // TODO: Currently only test against 0x20(space).
-    if (is_space) *is_space = true;
-    return Utf16BackwardUntil(ptr, size, position,
-                              [](CodePoint c) { return !IsSpace(c); });
-  } else {
-    if (is_space) *is_space = false;
-    return Utf16BackwardUntil(ptr, size, position, IsSpace);
-  }
+  return Word<Utf16CodeUnit, Utf16PreviousCodePoint, Utf16BackwardUntil>(
+      ptr, size, position, is_space);
 }
 
-Index Utf16NextWord(const char16_t* ptr, Index size, Index position,
+Index Utf16NextWord(const Utf16CodeUnit* ptr, Index size, Index position,
                     bool* is_space) {
-  if (position >= size) return position;
-  auto c = Utf16NextCodePoint(ptr, size, position, nullptr);
-  if (IsSpace(c)) {  // TODO: Currently only test against 0x20(space).
-    if (is_space) *is_space = true;
-    return Utf16ForwardUntil(ptr, size, position,
-                             [](CodePoint c) { return !IsSpace(c); });
-  } else {
-    if (is_space) *is_space = false;
-    return Utf16ForwardUntil(ptr, size, position, IsSpace);
-  }
-}
-
-char16_t ToLower(char16_t c) {
-  if (c >= u'A' && c <= u'Z') {
-    return c - u'A' + u'a';
-  }
-  return c;
-}
-
-char16_t ToUpper(char16_t c) {
-  if (c >= u'a' && c <= u'z') {
-    return c - u'a' + u'A';
-  }
-  return c;
-}
-
-bool IsWhitespace(char16_t c) {
-  return c == u' ' || c == u'\t' || c == u'\n' || c == u'\r';
-}
-
-bool IsDigit(char16_t c) { return c >= u'0' && c <= u'9'; }
-
-Utf8CodePointIterator CreateUtf8Iterator(const std::byte* buffer, Index size) {
-  return Utf8CodePointIterator(reinterpret_cast<const char*>(buffer), size);
-}
-
-Utf8CodePointIterator CreateUtf8Iterator(const std::vector<std::byte>& buffer) {
-  return CreateUtf8Iterator(buffer.data(), buffer.size());
-}
-
-CodePoint Utf8NextCodePoint(std::string_view str, Index current,
-                            Index* next_position) {
-  NotImplemented();
-}
-
-CodePoint Utf8PreviousCodePoint(std::string_view str, Index current,
-                                Index* next_position) {
-  NotImplemented();
-}
-
-// Return position after the character making predicate returns true or 0 if no
-// character doing so.
-Index Utf8BackwardUntil(std::string_view str, Index position,
-                        const std::function<bool(CodePoint)>& predicate) {
-  NotImplemented();
-}
-
-// Return position before the character making predicate returns true or
-// str.size() if no character doing so.
-Index Utf8ForwardUntil(std::string_view str, Index position,
-                       const std::function<bool(CodePoint)>& predicate) {
-  NotImplemented();
-}
-
-bool Utf8IsValidInsertPosition(std::string_view str, Index position) {
-  NotImplemented();
-}
-
-Index Utf8PreviousWord(std::string_view str, Index position,
-                       bool* is_space) {
-  NotImplemented();
-}
-
-Index Utf8NextWord(std::string_view str, Index position,
-                   bool* is_space) {
-  NotImplemented();
+  return Word<Utf16CodeUnit, Utf16NextCodePoint, Utf16ForwardUntil>(
+      ptr, size, position, is_space);
 }
 
-}  // namespace cru
+}  // namespace cru::string