diff options
Diffstat (limited to 'src/base/StringUtil.cpp')
-rw-r--r-- | src/base/StringUtil.cpp | 258 |
1 files changed, 130 insertions, 128 deletions
diff --git a/src/base/StringUtil.cpp b/src/base/StringUtil.cpp index 4e622dab..581ebcab 100644 --- a/src/base/StringUtil.cpp +++ b/src/base/StringUtil.cpp @@ -7,8 +7,7 @@ #include <compare> #include <string_view> -namespace cru { -namespace string { +namespace cru::string { std::weak_ordering CaseInsensitiveCompare(std::string_view left, std::string_view right) { @@ -78,18 +77,58 @@ std::vector<std::string> Split(std::string_view str, std::string_view sep, return result; } -} // namespace string + +namespace { + +template <typename CharType, + NextCodePointFunctionType<CharType> NextCodePointFunction> +Index Until(const CharType* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate) { + if (position <= 0) return position; + while (true) { + Index p = position; + auto c = NextCodePointFunction(ptr, size, p, &position); + if (predicate(c)) return p; + if (c == k_invalid_code_point) return p; + } + UnreachableCode(); +} + +static bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; } + +template <typename CharType> +using UntilFunctionType = Index (*)(const CharType*, Index, Index, + const std::function<bool(CodePoint)>&); + +template <typename CharType, + NextCodePointFunctionType<CharType> NextCodePointFunction, + UntilFunctionType<CharType> UntilFunction> +Index Word(const CharType* ptr, Index size, Index position, bool* is_space) { + if (position <= 0) return position; + auto c = NextCodePointFunction(ptr, size, position, nullptr); + if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). 
+ if (is_space) *is_space = true; + return UntilFunction(ptr, size, position, + [](CodePoint c) { return !IsSpace(c); }); + } else { + if (is_space) *is_space = false; + return UntilFunction(ptr, size, position, + [](CodePoint c) { return IsSpace(c); }); + } +} + +} // namespace using details::ExtractBits; -CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current, +CodePoint Utf8NextCodePoint(const Utf8CodeUnit* ptr, Index size, Index current, Index* next_position) { CodePoint result; if (current >= size) { result = k_invalid_code_point; } else { - const auto cu0 = static_cast<std::uint8_t>(ptr[current++]); + const auto cu0 = static_cast<Utf8CodeUnit>(ptr[current++]); auto read_next_folowing_code = [ptr, size, &current]() -> CodePoint { if (current == size) @@ -97,14 +136,14 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current, "Unexpected end when read continuing byte of multi-byte code " "point."); - const auto u = static_cast<std::uint8_t>(ptr[current]); + const auto u = static_cast<Utf8CodeUnit>(ptr[current]); if (!(u & (1u << 7)) || (u & (1u << 6))) { throw TextEncodeException( "Unexpected bad-format (not 0b10xxxxxx) continuing byte of " "multi-byte code point."); } - return ExtractBits<std::uint8_t, 6, CodePoint>(ptr[current++]); + return ExtractBits<Utf8CodeUnit, 6, CodePoint>(ptr[current++]); }; if ((1u << 7) & cu0) { @@ -117,21 +156,21 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current, "code point."); } - const CodePoint s0 = ExtractBits<std::uint8_t, 3, CodePoint>(cu0) + const CodePoint s0 = ExtractBits<Utf8CodeUnit, 3, CodePoint>(cu0) << (6 * 3); const CodePoint s1 = read_next_folowing_code() << (6 * 2); const CodePoint s2 = read_next_folowing_code() << 6; const CodePoint s3 = read_next_folowing_code(); result = s0 + s1 + s2 + s3; } else { // 3-length code point - const CodePoint s0 = ExtractBits<std::uint8_t, 4, CodePoint>(cu0) + const CodePoint s0 = ExtractBits<Utf8CodeUnit, 4, CodePoint>(cu0) << (6 
* 2); const CodePoint s1 = read_next_folowing_code() << 6; const CodePoint s2 = read_next_folowing_code(); result = s0 + s1 + s2; } } else { // 2-length code point - const CodePoint s0 = ExtractBits<std::uint8_t, 5, CodePoint>(cu0) + const CodePoint s0 = ExtractBits<Utf8CodeUnit, 5, CodePoint>(cu0) << 6; const CodePoint s1 = read_next_folowing_code(); result = s0 + s1; @@ -149,8 +188,67 @@ CodePoint Utf8NextCodePoint(const char* ptr, Index size, Index current, return result; } -CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current, - Index* next_position) { +CodePoint Utf8PreviousCodePoint(const Utf8CodeUnit* ptr, Index size, + Index current, Index* previous_position) { + CRU_UNUSED(size) + + CodePoint result; + if (current <= 0) { + result = k_invalid_code_point; + } else { + current--; + int i; + for (i = 0; i < 4; i++) { + if (IsUtf8LeadingByte(ptr[current])) { + break; + } + current--; + } + if (i == 4) { + throw TextEncodeException( + "Failed to find UTF-8 leading byte in 4 previous bytes."); + } + + result = Utf8NextCodePoint(ptr, size, current, nullptr); + } + + if (previous_position != nullptr) *previous_position = current; + return result; +} + +bool Utf8IsValidInsertPosition(const Utf8CodeUnit* ptr, Index size, + Index position) { + return position == 0 || position == size || + (position > 0 && position < size && IsUtf8LeadingByte(ptr[position])); +} + +Index Utf8BackwardUntil(const Utf8CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate) { + return Until<Utf8CodeUnit, Utf8PreviousCodePoint>(ptr, size, position, + predicate); +} + +Index Utf8ForwardUntil(const Utf8CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate) { + return Until<Utf8CodeUnit, Utf8NextCodePoint>(ptr, size, position, predicate); +} + +static bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; } + +Index Utf8PreviousWord(const Utf8CodeUnit* ptr, Index size, Index position, + 
bool* is_space) { + return Word<Utf8CodeUnit, Utf8PreviousCodePoint, Utf8BackwardUntil>( + ptr, size, position, is_space); +} + +Index Utf8NextWord(const Utf8CodeUnit* ptr, Index size, Index position, + bool* is_space) { + return Word<Utf8CodeUnit, Utf8NextCodePoint, Utf8ForwardUntil>( + ptr, size, position, is_space); +} + +CodePoint Utf16NextCodePoint(const Utf16CodeUnit* ptr, Index size, + Index current, Index* next_position) { CodePoint result; if (current >= size) { @@ -172,8 +270,8 @@ CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current, "Unexpected bad-range second code unit of surrogate pair."); } - const auto s0 = ExtractBits<std::uint16_t, 10, CodePoint>(cu0) << 10; - const auto s1 = ExtractBits<std::uint16_t, 10, CodePoint>(cu1); + const auto s0 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu0) << 10; + const auto s1 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu1); result = s0 + s1 + 0x10000; @@ -187,8 +285,8 @@ CodePoint Utf16NextCodePoint(const char16_t* ptr, Index size, Index current, return result; } -CodePoint Utf16PreviousCodePoint(const char16_t* ptr, Index size, Index current, - Index* previous_position) { +CodePoint Utf16PreviousCodePoint(const Utf16CodeUnit* ptr, Index size, + Index current, Index* previous_position) { CRU_UNUSED(size) CodePoint result; @@ -211,8 +309,8 @@ CodePoint Utf16PreviousCodePoint(const char16_t* ptr, Index size, Index current, "Unexpected bad-range first code unit of surrogate pair."); } - const auto s0 = ExtractBits<std::uint16_t, 10, CodePoint>(cu1) << 10; - const auto s1 = ExtractBits<std::uint16_t, 10, CodePoint>(cu0); + const auto s0 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu1) << 10; + const auto s1 = ExtractBits<Utf16CodeUnit, 10, CodePoint>(cu0); result = s0 + s1 + 0x10000; @@ -226,7 +324,7 @@ CodePoint Utf16PreviousCodePoint(const char16_t* ptr, Index size, Index current, return result; } -bool Utf16IsValidInsertPosition(const char16_t* ptr, Index size, +bool 
Utf16IsValidInsertPosition(const Utf16CodeUnit* ptr, Index size, Index position) { if (position < 0) return false; if (position > size) return false; @@ -235,124 +333,28 @@ bool Utf16IsValidInsertPosition(const char16_t* ptr, Index size, return !IsUtf16SurrogatePairTrailing(ptr[position]); } -Index Utf16BackwardUntil(const char16_t* ptr, Index size, Index position, +Index Utf16BackwardUntil(const Utf16CodeUnit* ptr, Index size, Index position, const std::function<bool(CodePoint)>& predicate) { - if (position <= 0) return position; - while (true) { - Index p = position; - auto c = Utf16PreviousCodePoint(ptr, size, p, &position); - if (predicate(c)) return p; - if (c == k_invalid_code_point) return p; - } - UnreachableCode(); + return Until<Utf16CodeUnit, Utf16PreviousCodePoint>(ptr, size, position, + predicate); } -Index Utf16ForwardUntil(const char16_t* ptr, Index size, Index position, +Index Utf16ForwardUntil(const Utf16CodeUnit* ptr, Index size, Index position, const std::function<bool(CodePoint)>& predicate) { - if (position >= size) return position; - while (true) { - Index p = position; - auto c = Utf16NextCodePoint(ptr, size, p, &position); - if (predicate(c)) return p; - if (c == k_invalid_code_point) return p; - } - UnreachableCode(); + return Until<Utf16CodeUnit, Utf16NextCodePoint>(ptr, size, position, + predicate); } -inline bool IsSpace(CodePoint c) { return c == 0x20 || c == 0xA; } - -Index Utf16PreviousWord(const char16_t* ptr, Index size, Index position, +Index Utf16PreviousWord(const Utf16CodeUnit* ptr, Index size, Index position, bool* is_space) { - if (position <= 0) return position; - auto c = Utf16PreviousCodePoint(ptr, size, position, nullptr); - if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). 
- if (is_space) *is_space = true; - return Utf16BackwardUntil(ptr, size, position, - [](CodePoint c) { return !IsSpace(c); }); - } else { - if (is_space) *is_space = false; - return Utf16BackwardUntil(ptr, size, position, IsSpace); - } + return Word<Utf16CodeUnit, Utf16PreviousCodePoint, Utf16BackwardUntil>( + ptr, size, position, is_space); } -Index Utf16NextWord(const char16_t* ptr, Index size, Index position, +Index Utf16NextWord(const Utf16CodeUnit* ptr, Index size, Index position, bool* is_space) { - if (position >= size) return position; - auto c = Utf16NextCodePoint(ptr, size, position, nullptr); - if (IsSpace(c)) { // TODO: Currently only test against 0x20(space). - if (is_space) *is_space = true; - return Utf16ForwardUntil(ptr, size, position, - [](CodePoint c) { return !IsSpace(c); }); - } else { - if (is_space) *is_space = false; - return Utf16ForwardUntil(ptr, size, position, IsSpace); - } -} - -char16_t ToLower(char16_t c) { - if (c >= u'A' && c <= u'Z') { - return c - u'A' + u'a'; - } - return c; -} - -char16_t ToUpper(char16_t c) { - if (c >= u'a' && c <= u'z') { - return c - u'a' + u'A'; - } - return c; -} - -bool IsWhitespace(char16_t c) { - return c == u' ' || c == u'\t' || c == u'\n' || c == u'\r'; -} - -bool IsDigit(char16_t c) { return c >= u'0' && c <= u'9'; } - -Utf8CodePointIterator CreateUtf8Iterator(const std::byte* buffer, Index size) { - return Utf8CodePointIterator(reinterpret_cast<const char*>(buffer), size); -} - -Utf8CodePointIterator CreateUtf8Iterator(const std::vector<std::byte>& buffer) { - return CreateUtf8Iterator(buffer.data(), buffer.size()); -} - -CodePoint Utf8NextCodePoint(std::string_view str, Index current, - Index* next_position) { - NotImplemented(); -} - -CodePoint Utf8PreviousCodePoint(std::string_view str, Index current, - Index* next_position) { - NotImplemented(); -} - -// Return position after the character making predicate returns true or 0 if no -// character doing so. 
-Index Utf8BackwardUntil(std::string_view str, Index position, - const std::function<bool(CodePoint)>& predicate) { - NotImplemented(); -} - -// Return position before the character making predicate returns true or -// str.size() if no character doing so. -Index Utf8ForwardUntil(std::string_view str, Index position, - const std::function<bool(CodePoint)>& predicate) { - NotImplemented(); -} - -bool Utf8IsValidInsertPosition(std::string_view str, Index position) { - NotImplemented(); -} - -Index Utf8PreviousWord(std::string_view str, Index position, - bool* is_space) { - NotImplemented(); -} - -Index Utf8NextWord(std::string_view str, Index position, - bool* is_space) { - NotImplemented(); + return Word<Utf16CodeUnit, Utf16NextCodePoint, Utf16ForwardUntil>( + ptr, size, position, is_space); } -} // namespace cru +} // namespace cru::string |