diff options
Diffstat (limited to 'include/cru')
-rw-r--r-- | include/cru/base/StringUtil.h | 284 |
1 files changed, 139 insertions, 145 deletions
diff --git a/include/cru/base/StringUtil.h b/include/cru/base/StringUtil.h index 7a88f7e8..54e7b6e6 100644 --- a/include/cru/base/StringUtil.h +++ b/include/cru/base/StringUtil.h @@ -6,6 +6,7 @@ #include <cctype> #include <charconv> #include <compare> +#include <cstdint> #include <format> #include <functional> #include <string> @@ -14,8 +15,7 @@ #include <type_traits> #include <vector> -namespace cru { -namespace string { +namespace cru::string { std::weak_ordering CaseInsensitiveCompare(std::string_view left, std::string_view right); std::string TrimBegin(std::string_view str); @@ -137,32 +137,158 @@ struct ImplementFormatterByToString { } }; -} // namespace string - using CodePoint = std::int32_t; +using Utf8CodeUnit = char; +using Utf16CodeUnit = char16_t; constexpr CodePoint k_invalid_code_point = -1; -inline bool IsUtf16SurrogatePairCodeUnit(char16_t c) { +inline bool IsUtf8LeadingByte(Utf8CodeUnit c) { + return !(c & 0b10000000) || c & 0b01000000; +} + +inline bool IsUtf8FollowingByte(Utf8CodeUnit c) { + return !IsUtf8LeadingByte(c); +} + +inline bool IsUtf16SurrogatePairCodeUnit(Utf16CodeUnit c) { return c >= 0xD800 && c <= 0xDFFF; } -inline bool IsUtf16SurrogatePairLeading(char16_t c) { +inline bool IsUtf16SurrogatePairLeading(Utf16CodeUnit c) { return c >= 0xD800 && c <= 0xDBFF; } -inline bool IsUtf16SurrogatePairTrailing(char16_t c) { +inline bool IsUtf16SurrogatePairTrailing(Utf16CodeUnit c) { return c >= 0xDC00 && c <= 0xDFFF; } CodePoint CRU_BASE_API Utf8NextCodePoint(const char* ptr, Index size, Index current, Index* next_position); -CodePoint CRU_BASE_API Utf16NextCodePoint(const char16_t* ptr, Index size, +CodePoint CRU_BASE_API Utf8PreviousCodePoint(const char* ptr, Index size, + Index current, + Index* previous_position); + +namespace details { +template <typename Integer, int number_of_bit, typename ReturnType> +inline ReturnType ExtractBits(Integer n) { + return static_cast<ReturnType>(n & ((1u << number_of_bit) - 1)); +} +} // namespace details + +template <typename CharWriter> +std::enable_if_t<std::is_invocable_v<CharWriter, Utf8CodeUnit>, bool> +Utf8EncodeCodePointAppend(CodePoint code_point, CharWriter&& writer) { + auto write_continue_byte = [&writer](Utf8CodeUnit byte6) { + writer((1u << 7) + (((1u << 6) - 1) & byte6)); + }; + + if (code_point >= 0 && code_point <= 0x007F) { + writer(static_cast<Utf8CodeUnit>(code_point)); + return true; + } else if (code_point >= 0x0080 && code_point <= 0x07FF) { + std::uint32_t unsigned_code_point = code_point; + writer(static_cast<Utf8CodeUnit>( + details::ExtractBits<std::uint32_t, 5, Utf8CodeUnit>( + (unsigned_code_point >> 6)) + + 0b11000000)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point)); + return true; + } else if (code_point >= 0x0800 && code_point <= 0xFFFF) { + std::uint32_t unsigned_code_point = code_point; + writer(static_cast<Utf8CodeUnit>( + details::ExtractBits<std::uint32_t, 4, Utf8CodeUnit>( + (unsigned_code_point >> (6 * 2))) + + 0b11100000)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point >> 6)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point)); + return true; + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { + std::uint32_t unsigned_code_point = code_point; + writer(static_cast<Utf8CodeUnit>( + details::ExtractBits<std::uint32_t, 3, Utf8CodeUnit>( + (unsigned_code_point >> (6 * 3))) + + 0b11110000)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point >> (6 * 2))); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point >> 6)); + write_continue_byte(details::ExtractBits<std::uint32_t, 6, Utf8CodeUnit>( + unsigned_code_point)); + return true; + } else { + return false; + } +} + +bool CRU_BASE_API Utf8IsValidInsertPosition(const Utf8CodeUnit* ptr, Index size, + Index position); + +// Return position after the character making predicate returns true or 0 if no +// character doing so. +Index CRU_BASE_API +Utf8BackwardUntil(const Utf8CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate); +// Return position before the character making predicate returns true or +// str.size() if no character doing so. +Index CRU_BASE_API +Utf8ForwardUntil(const Utf8CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate); + +Index CRU_BASE_API Utf8PreviousWord(const Utf8CodeUnit* ptr, Index size, + Index position, bool* is_space = nullptr); +Index CRU_BASE_API Utf8NextWord(const Utf8CodeUnit* ptr, Index size, + Index position, bool* is_space = nullptr); + +CodePoint CRU_BASE_API Utf16NextCodePoint(const Utf16CodeUnit* ptr, Index size, Index current, Index* next_position); -CodePoint CRU_BASE_API Utf16PreviousCodePoint(const char16_t* ptr, Index size, - Index current, +CodePoint CRU_BASE_API Utf16PreviousCodePoint(const Utf16CodeUnit* ptr, + Index size, Index current, Index* previous_position); +template <typename CharWriter> +std::enable_if_t<std::is_invocable_v<CharWriter, Utf16CodeUnit>, bool> +Utf16EncodeCodePointAppend(CodePoint code_point, CharWriter&& writer) { + if ((code_point >= 0 && code_point <= 0xD7FF) || + (code_point >= 0xE000 && code_point <= 0xFFFF)) { + writer(static_cast<Utf16CodeUnit>(code_point)); + return true; + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { + std::uint32_t u = code_point - 0x10000; + writer(static_cast<Utf16CodeUnit>( + details::ExtractBits<std::uint32_t, 10, std::uint32_t>(u >> 10) + + 0xD800u)); + writer(static_cast<Utf16CodeUnit>( + details::ExtractBits<std::uint32_t, 10, std::uint32_t>(u) + 0xDC00u)); + return true; + } else { + return false; + } +} + +// If given s is not a valid utf16 string, return value is UD. +bool CRU_BASE_API Utf16IsValidInsertPosition(const Utf16CodeUnit* ptr, + Index size, Index position); + +// Return position after the character making predicate returns true or 0 if no +// character doing so. +Index CRU_BASE_API +Utf16BackwardUntil(const Utf16CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate); +// Return position before the character making predicate returns true or +// str.size() if no character doing so. +Index CRU_BASE_API +Utf16ForwardUntil(const Utf16CodeUnit* ptr, Index size, Index position, + const std::function<bool(CodePoint)>& predicate); + +Index CRU_BASE_API Utf16PreviousWord(const Utf16CodeUnit* ptr, Index size, + Index position, bool* is_space = nullptr); +Index CRU_BASE_API Utf16NextWord(const Utf16CodeUnit* ptr, Index size, + Index position, bool* is_space = nullptr); + template <typename CharType> using NextCodePointFunctionType = CodePoint (*)(const CharType*, Index, Index, Index*); @@ -247,138 +373,6 @@ class CodePointIterator { }; using Utf8CodePointIterator = CodePointIterator<char, &Utf8NextCodePoint>; - -using Utf16CodePointIterator = CodePointIterator<char16_t, &Utf16NextCodePoint>; - -namespace details { -template <typename UInt, int number_of_bit, typename ReturnType> -inline std::enable_if_t<std::is_unsigned_v<UInt>, ReturnType> ExtractBits( - UInt n) { - return static_cast<ReturnType>(n & ((1u << number_of_bit) - 1)); -} -} // namespace details - -template <typename CharWriter> -std::enable_if_t<std::is_invocable_v<CharWriter, char>, bool> -Utf8EncodeCodePointAppend(CodePoint code_point, CharWriter&& writer) { - auto write_continue_byte = [&writer](std::uint8_t byte6) { - writer((1u << 7) + (((1u << 6) - 1) & byte6)); - }; - - if (code_point >= 0 && code_point <= 0x007F) { - writer(static_cast<char>(code_point)); - return true; - } else if (code_point >= 0x0080 && code_point <= 0x07FF) { - std::uint32_t unsigned_code_point = code_point; - writer( - static_cast<char>(details::ExtractBits<std::uint32_t, 5, std::uint8_t>( - (unsigned_code_point >> 6)) + - 0b11000000)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point)); - return true; - } else if (code_point >= 0x0800 && code_point <= 0xFFFF) { - std::uint32_t unsigned_code_point = code_point; - writer( - static_cast<char>(details::ExtractBits<std::uint32_t, 4, std::uint8_t>( - (unsigned_code_point >> (6 * 2))) + - 0b11100000)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point >> 6)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point)); - return true; - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - std::uint32_t unsigned_code_point = code_point; - writer( - static_cast<char>(details::ExtractBits<std::uint32_t, 3, std::uint8_t>( - (unsigned_code_point >> (6 * 3))) + - 0b11110000)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point >> (6 * 2))); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point >> 6)); - write_continue_byte(details::ExtractBits<std::uint32_t, 6, std::uint8_t>( - unsigned_code_point)); - return true; - } else { - return false; - } -} - -template <typename CharWriter> -std::enable_if_t<std::is_invocable_v<CharWriter, char16_t>, bool> -Utf16EncodeCodePointAppend(CodePoint code_point, CharWriter&& writer) { - if ((code_point >= 0 && code_point <= 0xD7FF) || - (code_point >= 0xE000 && code_point <= 0xFFFF)) { - writer(static_cast<char16_t>(code_point)); - return true; - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - std::uint32_t u = code_point - 0x10000; - writer(static_cast<char16_t>( - details::ExtractBits<std::uint32_t, 10, std::uint32_t>(u >> 10) + - 0xD800u)); - writer(static_cast<char16_t>( - details::ExtractBits<std::uint32_t, 10, std::uint32_t>(u) + 0xDC00u)); - return true; - } else { - return false; - } -} - -// If given s is not a valid utf16 string, return value is UD. -bool CRU_BASE_API Utf16IsValidInsertPosition(const char16_t* ptr, Index size, - Index position); - -// Return position after the character making predicate returns true or 0 if no -// character doing so. -Index CRU_BASE_API -Utf16BackwardUntil(const char16_t* ptr, Index size, Index position, - const std::function<bool(CodePoint)>& predicate); -// Return position before the character making predicate returns true or -// str.size() if no character doing so. -Index CRU_BASE_API -Utf16ForwardUntil(const char16_t* ptr, Index size, Index position, - const std::function<bool(CodePoint)>& predicate); - -Index CRU_BASE_API Utf16PreviousWord(const char16_t* ptr, Index size, - Index position, bool* is_space = nullptr); -Index CRU_BASE_API Utf16NextWord(const char16_t* ptr, Index size, - Index position, bool* is_space = nullptr); - -char16_t CRU_BASE_API ToLower(char16_t c); -char16_t CRU_BASE_API ToUpper(char16_t c); - -bool CRU_BASE_API IsWhitespace(char16_t c); -bool CRU_BASE_API IsDigit(char16_t c); - -Utf8CodePointIterator CRU_BASE_API CreateUtf8Iterator(const std::byte* buffer, - Index size); -Utf8CodePointIterator CRU_BASE_API -CreateUtf8Iterator(const std::vector<std::byte>& buffer); - -CodePoint CRU_BASE_API Utf8NextCodePoint(std::string_view str, Index current, - Index* next_position); -CodePoint CRU_BASE_API Utf8PreviousCodePoint(std::string_view str, - Index current, - Index* next_position); -// Return position after the character making predicate returns true or 0 if no -// character doing so. -Index CRU_BASE_API -Utf8BackwardUntil(std::string_view str, Index position, - const std::function<bool(CodePoint)>& predicate); -// Return position before the character making predicate returns true or -// str.size() if no character doing so. -Index CRU_BASE_API -Utf8ForwardUntil(std::string_view str, Index position, - const std::function<bool(CodePoint)>& predicate); - -bool CRU_BASE_API Utf8IsValidInsertPosition(std::string_view str, - Index position); - -Index CRU_BASE_API Utf8PreviousWord(std::string_view str, Index position, - bool* is_space = nullptr); -Index CRU_BASE_API Utf8NextWord(std::string_view str, Index position, - bool* is_space = nullptr); - -} // namespace cru +using Utf16CodePointIterator = + CodePointIterator<Utf16CodeUnit, &Utf16NextCodePoint>; +} // namespace cru::string |