From e4c00cc67a48ef85336d3f66c6a28e32510d8fe6 Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Wed, 20 Mar 2024 13:43:01 -0700 Subject: Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower() PiperOrigin-RevId: 617613544 Change-Id: I526b5bc087edf54046c77795dddf5412478ac6a8 --- absl/strings/ascii.cc | 103 ++++++++++++++++---------------------------------- 1 file changed, 32 insertions(+), 71 deletions(-) (limited to 'absl/strings/ascii.cc') diff --git a/absl/strings/ascii.cc b/absl/strings/ascii.cc index 5460b2c6..20a696a1 100644 --- a/absl/strings/ascii.cc +++ b/absl/strings/ascii.cc @@ -15,13 +15,14 @@ #include "absl/strings/ascii.h" #include -#include +#include #include #include -#include +#include "absl/base/attributes.h" #include "absl/base/config.h" #include "absl/base/nullability.h" +#include "absl/base/optimization.h" namespace absl { ABSL_NAMESPACE_BEGIN @@ -162,19 +163,6 @@ ABSL_DLL const char kToUpper[256] = { }; // clang-format on -template -static constexpr T BroadcastByte(unsigned char value) { - static_assert(std::is_integral::value && sizeof(T) <= sizeof(uint64_t) && - std::is_unsigned::value, - "only unsigned integers up to 64-bit allowed"); - T result = value; - constexpr size_t result_bit_width = sizeof(result) * CHAR_BIT; - result |= result << ((CHAR_BIT << 0) & (result_bit_width - 1)); - result |= result << ((CHAR_BIT << 1) & (result_bit_width - 1)); - result |= result << ((CHAR_BIT << 2) & (result_bit_width - 1)); - return result; -} - // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`). // Implemented by: // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26). @@ -189,47 +177,10 @@ constexpr bool AsciiInAZRange(unsigned char c) { return static_cast(u) < threshold; } +// Force-inline so the compiler won't merge the short and long implementations. template -static constexpr char* PartialAsciiStrCaseFold(absl::Nonnull p, - absl::Nonnull end) { - using vec_t = size_t; - const size_t n = static_cast(end - p); - - // SWAR algorithm: http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - constexpr char ch_a = ToUpper ? 'a' : 'A', ch_z = ToUpper ? 'z' : 'Z'; - char* const swar_end = p + (n / sizeof(vec_t)) * sizeof(vec_t); - while (p < swar_end) { - vec_t v = vec_t(); - - // memcpy the vector, but constexpr - for (size_t i = 0; i < sizeof(vec_t); ++i) { - v |= static_cast(static_cast(p[i])) - << (i * CHAR_BIT); - } - - constexpr unsigned int msb = 1u << (CHAR_BIT - 1); - const vec_t v_msb = v & BroadcastByte(msb); - const vec_t v_nonascii_mask = (v_msb << 1) - (v_msb >> (CHAR_BIT - 1)); - const vec_t v_nonascii = v & v_nonascii_mask; - const vec_t v_ascii = v & ~v_nonascii_mask; - const vec_t a = v_ascii + BroadcastByte(msb - ch_a - 0), - z = v_ascii + BroadcastByte(msb - ch_z - 1); - v = v_nonascii | (v_ascii ^ ((a ^ z) & BroadcastByte(msb)) >> 2); - - // memcpy the vector, but constexpr - for (size_t i = 0; i < sizeof(vec_t); ++i) { - p[i] = static_cast(v >> (i * CHAR_BIT)); - } - - p += sizeof(v); - } - - return p; -} - -template -static constexpr void AsciiStrCaseFold(absl::Nonnull p, - absl::Nonnull end) { +ABSL_ATTRIBUTE_ALWAYS_INLINE inline constexpr void AsciiStrCaseFoldImpl( + absl::Nonnull p, size_t size) { // The upper- and lowercase versions of ASCII characters differ by only 1 bit. // When we need to flip the case, we can xor with this bit to achieve the // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We @@ -237,20 +188,32 @@ static constexpr void AsciiStrCaseFold(absl::Nonnull p, // have the same single bit difference. constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A'; - using vec_t = size_t; - // TODO(b/316380338): When FDO becomes able to vectorize these, - // revert this manual optimization and just leave the naive loop. - if (static_cast(end - p) >= sizeof(vec_t)) { - p = ascii_internal::PartialAsciiStrCaseFold(p, end); - } - while (p < end) { - unsigned char v = static_cast(*p); + for (size_t i = 0; i < size; ++i) { + unsigned char v = static_cast(p[i]); v ^= AsciiInAZRange(v) ? kAsciiCaseBitFlip : 0; - *p = static_cast(v); - ++p; + p[i] = static_cast(v); } } +// The string size threshold for starting using the long string version. +constexpr size_t kCaseFoldThreshold = 16; + +// No-inline so the compiler won't merge the short and long implementations. +template +ABSL_ATTRIBUTE_NOINLINE constexpr void AsciiStrCaseFoldLong( + absl::Nonnull p, size_t size) { + ABSL_ASSUME(size >= kCaseFoldThreshold); + AsciiStrCaseFoldImpl(p, size); +} + +// Splitting to short and long strings to allow vectorization decisions +// to be made separately in the long and short cases. +template +constexpr void AsciiStrCaseFold(absl::Nonnull p, size_t size) { + size < kCaseFoldThreshold ? AsciiStrCaseFoldImpl(p, size) + : AsciiStrCaseFoldLong(p, size); +} + static constexpr size_t ValidateAsciiCasefold() { constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN; size_t incorrect_index = 0; @@ -259,8 +222,8 @@ static constexpr size_t ValidateAsciiCasefold() { for (unsigned int i = 0; i < num_chars; ++i) { uppered[i] = lowered[i] = static_cast(i); } - AsciiStrCaseFold(&lowered[0], &lowered[num_chars]); - AsciiStrCaseFold(&uppered[0], &uppered[num_chars]); + AsciiStrCaseFold(&lowered[0], num_chars); + AsciiStrCaseFold(&uppered[0], num_chars); for (size_t i = 0; i < num_chars; ++i) { const char ch = static_cast(i), ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch), @@ -278,13 +241,11 @@ static_assert(ValidateAsciiCasefold() == 0, "error in case conversion"); } // namespace ascii_internal void AsciiStrToLower(absl::Nonnull s) { - char* p = &(*s)[0]; // Guaranteed to be valid for empty strings - return ascii_internal::AsciiStrCaseFold(p, p + s->size()); + return ascii_internal::AsciiStrCaseFold(&(*s)[0], s->size()); } void AsciiStrToUpper(absl::Nonnull s) { - char* p = &(*s)[0]; // Guaranteed to be valid for empty strings - return ascii_internal::AsciiStrCaseFold(p, p + s->size()); + return ascii_internal::AsciiStrCaseFold(&(*s)[0], s->size()); } void RemoveExtraAsciiWhitespace(absl::Nonnull str) { -- cgit v1.2.3