From d073d80e90b747ef9ef050f1784bc1c02c0a5bf3 Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Tue, 20 Feb 2024 10:45:55 -0800 Subject: Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower() PiperOrigin-RevId: 608661989 Change-Id: Ibfd94f8b2d23fd232bf93904ed68e11a400b3644 --- absl/strings/ascii.cc | 71 ++++++--------------------------------------------- 1 file changed, 8 insertions(+), 63 deletions(-) (limited to 'absl/strings/ascii.cc') diff --git a/absl/strings/ascii.cc b/absl/strings/ascii.cc index 5460b2c6..8f778a45 100644 --- a/absl/strings/ascii.cc +++ b/absl/strings/ascii.cc @@ -15,10 +15,8 @@ #include "absl/strings/ascii.h" #include -#include #include #include -#include #include "absl/base/config.h" #include "absl/base/nullability.h" @@ -162,19 +160,6 @@ ABSL_DLL const char kToUpper[256] = { }; // clang-format on -template -static constexpr T BroadcastByte(unsigned char value) { - static_assert(std::is_integral::value && sizeof(T) <= sizeof(uint64_t) && - std::is_unsigned::value, - "only unsigned integers up to 64-bit allowed"); - T result = value; - constexpr size_t result_bit_width = sizeof(result) * CHAR_BIT; - result |= result << ((CHAR_BIT << 0) & (result_bit_width - 1)); - result |= result << ((CHAR_BIT << 1) & (result_bit_width - 1)); - result |= result << ((CHAR_BIT << 2) & (result_bit_width - 1)); - return result; -} - // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`). // Implemented by: // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26). @@ -190,46 +175,8 @@ constexpr bool AsciiInAZRange(unsigned char c) { } template -static constexpr char* PartialAsciiStrCaseFold(absl::Nonnull p, - absl::Nonnull end) { - using vec_t = size_t; - const size_t n = static_cast(end - p); - - // SWAR algorithm: http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - constexpr char ch_a = ToUpper ? 'a' : 'A', ch_z = ToUpper ? 'z' : 'Z'; - char* const swar_end = p + (n / sizeof(vec_t)) * sizeof(vec_t); - while (p < swar_end) { - vec_t v = vec_t(); - - // memcpy the vector, but constexpr - for (size_t i = 0; i < sizeof(vec_t); ++i) { - v |= static_cast(static_cast(p[i])) - << (i * CHAR_BIT); - } - - constexpr unsigned int msb = 1u << (CHAR_BIT - 1); - const vec_t v_msb = v & BroadcastByte(msb); - const vec_t v_nonascii_mask = (v_msb << 1) - (v_msb >> (CHAR_BIT - 1)); - const vec_t v_nonascii = v & v_nonascii_mask; - const vec_t v_ascii = v & ~v_nonascii_mask; - const vec_t a = v_ascii + BroadcastByte(msb - ch_a - 0), - z = v_ascii + BroadcastByte(msb - ch_z - 1); - v = v_nonascii | (v_ascii ^ ((a ^ z) & BroadcastByte(msb)) >> 2); - - // memcpy the vector, but constexpr - for (size_t i = 0; i < sizeof(vec_t); ++i) { - p[i] = static_cast(v >> (i * CHAR_BIT)); - } - - p += sizeof(v); - } - - return p; -} - -template -static constexpr void AsciiStrCaseFold(absl::Nonnull p, - absl::Nonnull end) { +constexpr void AsciiStrCaseFold(absl::Nonnull p, + absl::Nonnull end) { // The upper- and lowercase versions of ASCII characters differ by only 1 bit. // When we need to flip the case, we can xor with this bit to achieve the // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We @@ -237,17 +184,15 @@ static constexpr void AsciiStrCaseFold(absl::Nonnull p, // have the same single bit difference. constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A'; - using vec_t = size_t; - // TODO(b/316380338): When FDO becomes able to vectorize these, - // revert this manual optimization and just leave the naive loop. - if (static_cast(end - p) >= sizeof(vec_t)) { - p = ascii_internal::PartialAsciiStrCaseFold(p, end); - } - while (p < end) { +#ifdef __clang__ +// Temporary workaround until the mentioned bug is fixed. +// NOLINTNEXTLINE(whitespace/line_length) +#pragma clang loop vectorize(enable) +#endif + for (; p < end; ++p) { unsigned char v = static_cast(*p); v ^= AsciiInAZRange(v) ? kAsciiCaseBitFlip : 0; *p = static_cast(v); - ++p; } } -- cgit v1.2.3 From f576ea0ed7eaa1e9f2a3cf82160af8ef7c906bb7 Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Tue, 20 Feb 2024 15:55:18 -0800 Subject: Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower() PiperOrigin-RevId: 608770171 Change-Id: Icca54086037e42826c272f04374aeb33d060ace5 --- absl/strings/ascii.cc | 71 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 8 deletions(-) (limited to 'absl/strings/ascii.cc') diff --git a/absl/strings/ascii.cc b/absl/strings/ascii.cc index 8f778a45..5460b2c6 100644 --- a/absl/strings/ascii.cc +++ b/absl/strings/ascii.cc @@ -15,8 +15,10 @@ #include "absl/strings/ascii.h" #include +#include #include #include +#include #include "absl/base/config.h" #include "absl/base/nullability.h" @@ -160,6 +162,19 @@ ABSL_DLL const char kToUpper[256] = { }; // clang-format on +template +static constexpr T BroadcastByte(unsigned char value) { + static_assert(std::is_integral::value && sizeof(T) <= sizeof(uint64_t) && + std::is_unsigned::value, + "only unsigned integers up to 64-bit allowed"); + T result = value; + constexpr size_t result_bit_width = sizeof(result) * CHAR_BIT; + result |= result << ((CHAR_BIT << 0) & (result_bit_width - 1)); + result |= result << ((CHAR_BIT << 1) & (result_bit_width - 1)); + result |= result << ((CHAR_BIT << 2) & (result_bit_width - 1)); + return result; +} + // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`). // Implemented by: // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26). @@ -175,8 +190,46 @@ constexpr bool AsciiInAZRange(unsigned char c) { } template -constexpr void AsciiStrCaseFold(absl::Nonnull p, - absl::Nonnull end) { +static constexpr char* PartialAsciiStrCaseFold(absl::Nonnull p, + absl::Nonnull end) { + using vec_t = size_t; + const size_t n = static_cast(end - p); + + // SWAR algorithm: http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + constexpr char ch_a = ToUpper ? 'a' : 'A', ch_z = ToUpper ? 'z' : 'Z'; + char* const swar_end = p + (n / sizeof(vec_t)) * sizeof(vec_t); + while (p < swar_end) { + vec_t v = vec_t(); + + // memcpy the vector, but constexpr + for (size_t i = 0; i < sizeof(vec_t); ++i) { + v |= static_cast(static_cast(p[i])) + << (i * CHAR_BIT); + } + + constexpr unsigned int msb = 1u << (CHAR_BIT - 1); + const vec_t v_msb = v & BroadcastByte(msb); + const vec_t v_nonascii_mask = (v_msb << 1) - (v_msb >> (CHAR_BIT - 1)); + const vec_t v_nonascii = v & v_nonascii_mask; + const vec_t v_ascii = v & ~v_nonascii_mask; + const vec_t a = v_ascii + BroadcastByte(msb - ch_a - 0), + z = v_ascii + BroadcastByte(msb - ch_z - 1); + v = v_nonascii | (v_ascii ^ ((a ^ z) & BroadcastByte(msb)) >> 2); + + // memcpy the vector, but constexpr + for (size_t i = 0; i < sizeof(vec_t); ++i) { + p[i] = static_cast(v >> (i * CHAR_BIT)); + } + + p += sizeof(v); + } + + return p; +} + +template +static constexpr void AsciiStrCaseFold(absl::Nonnull p, + absl::Nonnull end) { // The upper- and lowercase versions of ASCII characters differ by only 1 bit. // When we need to flip the case, we can xor with this bit to achieve the // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We @@ -184,15 +237,17 @@ constexpr void AsciiStrCaseFold(absl::Nonnull p, // have the same single bit difference. constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A'; -#ifdef __clang__ -// Temporary workaround until the mentioned bug is fixed. -// NOLINTNEXTLINE(whitespace/line_length) -#pragma clang loop vectorize(enable) -#endif - for (; p < end; ++p) { + using vec_t = size_t; + // TODO(b/316380338): When FDO becomes able to vectorize these, + // revert this manual optimization and just leave the naive loop. + if (static_cast(end - p) >= sizeof(vec_t)) { + p = ascii_internal::PartialAsciiStrCaseFold(p, end); + } + while (p < end) { unsigned char v = static_cast(*p); v ^= AsciiInAZRange(v) ? kAsciiCaseBitFlip : 0; *p = static_cast(v); + ++p; } } -- cgit v1.2.3 From e4c00cc67a48ef85336d3f66c6a28e32510d8fe6 Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Wed, 20 Mar 2024 13:43:01 -0700 Subject: Performance improvement for absl::AsciiStrToUpper() and absl::AsciiStrToLower() PiperOrigin-RevId: 617613544 Change-Id: I526b5bc087edf54046c77795dddf5412478ac6a8 --- absl/strings/ascii.cc | 103 ++++++++++++++------------------------------- absl/strings/ascii_test.cc | 4 ++ 2 files changed, 36 insertions(+), 71 deletions(-) (limited to 'absl/strings/ascii.cc') diff --git a/absl/strings/ascii.cc b/absl/strings/ascii.cc index 5460b2c6..20a696a1 100644 --- a/absl/strings/ascii.cc +++ b/absl/strings/ascii.cc @@ -15,13 +15,14 @@ #include "absl/strings/ascii.h" #include -#include +#include #include #include -#include +#include "absl/base/attributes.h" #include "absl/base/config.h" #include "absl/base/nullability.h" +#include "absl/base/optimization.h" namespace absl { ABSL_NAMESPACE_BEGIN @@ -162,19 +163,6 @@ ABSL_DLL const char kToUpper[256] = { }; // clang-format on -template -static constexpr T BroadcastByte(unsigned char value) { - static_assert(std::is_integral::value && sizeof(T) <= sizeof(uint64_t) && - std::is_unsigned::value, - "only unsigned integers up to 64-bit allowed"); - T result = value; - constexpr size_t result_bit_width = sizeof(result) * CHAR_BIT; - result |= result << ((CHAR_BIT << 0) & (result_bit_width - 1)); - result |= result << ((CHAR_BIT << 1) & (result_bit_width - 1)); - result |= result << ((CHAR_BIT << 2) & (result_bit_width - 1)); - return result; -} - // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`). // Implemented by: // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26). @@ -189,47 +177,10 @@ constexpr bool AsciiInAZRange(unsigned char c) { return static_cast(u) < threshold; } +// Force-inline so the compiler won't merge the short and long implementations. template -static constexpr char* PartialAsciiStrCaseFold(absl::Nonnull p, - absl::Nonnull end) { - using vec_t = size_t; - const size_t n = static_cast(end - p); - - // SWAR algorithm: http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - constexpr char ch_a = ToUpper ? 'a' : 'A', ch_z = ToUpper ? 'z' : 'Z'; - char* const swar_end = p + (n / sizeof(vec_t)) * sizeof(vec_t); - while (p < swar_end) { - vec_t v = vec_t(); - - // memcpy the vector, but constexpr - for (size_t i = 0; i < sizeof(vec_t); ++i) { - v |= static_cast(static_cast(p[i])) - << (i * CHAR_BIT); - } - - constexpr unsigned int msb = 1u << (CHAR_BIT - 1); - const vec_t v_msb = v & BroadcastByte(msb); - const vec_t v_nonascii_mask = (v_msb << 1) - (v_msb >> (CHAR_BIT - 1)); - const vec_t v_nonascii = v & v_nonascii_mask; - const vec_t v_ascii = v & ~v_nonascii_mask; - const vec_t a = v_ascii + BroadcastByte(msb - ch_a - 0), - z = v_ascii + BroadcastByte(msb - ch_z - 1); - v = v_nonascii | (v_ascii ^ ((a ^ z) & BroadcastByte(msb)) >> 2); - - // memcpy the vector, but constexpr - for (size_t i = 0; i < sizeof(vec_t); ++i) { - p[i] = static_cast(v >> (i * CHAR_BIT)); - } - - p += sizeof(v); - } - - return p; -} - -template -static constexpr void AsciiStrCaseFold(absl::Nonnull p, - absl::Nonnull end) { +ABSL_ATTRIBUTE_ALWAYS_INLINE inline constexpr void AsciiStrCaseFoldImpl( + absl::Nonnull p, size_t size) { // The upper- and lowercase versions of ASCII characters differ by only 1 bit. // When we need to flip the case, we can xor with this bit to achieve the // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We @@ -237,20 +188,32 @@ static constexpr void AsciiStrCaseFold(absl::Nonnull p, // have the same single bit difference. constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A'; - using vec_t = size_t; - // TODO(b/316380338): When FDO becomes able to vectorize these, - // revert this manual optimization and just leave the naive loop. - if (static_cast(end - p) >= sizeof(vec_t)) { - p = ascii_internal::PartialAsciiStrCaseFold(p, end); - } - while (p < end) { - unsigned char v = static_cast(*p); + for (size_t i = 0; i < size; ++i) { + unsigned char v = static_cast(p[i]); v ^= AsciiInAZRange(v) ? kAsciiCaseBitFlip : 0; - *p = static_cast(v); - ++p; + p[i] = static_cast(v); } } +// The string size threshold for starting using the long string version. +constexpr size_t kCaseFoldThreshold = 16; + +// No-inline so the compiler won't merge the short and long implementations. +template +ABSL_ATTRIBUTE_NOINLINE constexpr void AsciiStrCaseFoldLong( + absl::Nonnull p, size_t size) { + ABSL_ASSUME(size >= kCaseFoldThreshold); + AsciiStrCaseFoldImpl(p, size); +} + +// Splitting to short and long strings to allow vectorization decisions +// to be made separately in the long and short cases. +template +constexpr void AsciiStrCaseFold(absl::Nonnull p, size_t size) { + size < kCaseFoldThreshold ? AsciiStrCaseFoldImpl(p, size) + : AsciiStrCaseFoldLong(p, size); +} + static constexpr size_t ValidateAsciiCasefold() { constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN; size_t incorrect_index = 0; @@ -259,8 +222,8 @@ static constexpr size_t ValidateAsciiCasefold() { for (unsigned int i = 0; i < num_chars; ++i) { uppered[i] = lowered[i] = static_cast(i); } - AsciiStrCaseFold(&lowered[0], &lowered[num_chars]); - AsciiStrCaseFold(&uppered[0], &uppered[num_chars]); + AsciiStrCaseFold(&lowered[0], num_chars); + AsciiStrCaseFold(&uppered[0], num_chars); for (size_t i = 0; i < num_chars; ++i) { const char ch = static_cast(i), ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch), @@ -278,13 +241,11 @@ static_assert(ValidateAsciiCasefold() == 0, "error in case conversion"); } // namespace ascii_internal void AsciiStrToLower(absl::Nonnull s) { - char* p = &(*s)[0]; // Guaranteed to be valid for empty strings - return ascii_internal::AsciiStrCaseFold(p, p + s->size()); + return ascii_internal::AsciiStrCaseFold(&(*s)[0], s->size()); } void AsciiStrToUpper(absl::Nonnull s) { - char* p = &(*s)[0]; // Guaranteed to be valid for empty strings - return ascii_internal::AsciiStrCaseFold(p, p + s->size()); + return ascii_internal::AsciiStrCaseFold(&(*s)[0], s->size()); } void RemoveExtraAsciiWhitespace(absl::Nonnull str) { diff --git a/absl/strings/ascii_test.cc b/absl/strings/ascii_test.cc index 117140c1..8885bb15 100644 --- a/absl/strings/ascii_test.cc +++ b/absl/strings/ascii_test.cc @@ -190,11 +190,13 @@ TEST(AsciiStrTo, Lower) { const std::string str("GHIJKL"); const std::string str2("MNOPQR"); const absl::string_view sp(str2); + const std::string long_str("ABCDEFGHIJKLMNOPQRSTUVWXYZ1!a"); std::string mutable_str("_`?@[{AMNOPQRSTUVWXYZ"); EXPECT_EQ("abcdef", absl::AsciiStrToLower(buf)); EXPECT_EQ("ghijkl", absl::AsciiStrToLower(str)); EXPECT_EQ("mnopqr", absl::AsciiStrToLower(sp)); + EXPECT_EQ("abcdefghijklmnopqrstuvwxyz1!a", absl::AsciiStrToLower(long_str)); absl::AsciiStrToLower(&mutable_str); EXPECT_EQ("_`?@[{amnopqrstuvwxyz", mutable_str); @@ -210,10 +212,12 @@ TEST(AsciiStrTo, Upper) { const std::string str("ghijkl"); const std::string str2("_`?@[{amnopqrstuvwxyz"); const absl::string_view sp(str2); + const std::string long_str("abcdefghijklmnopqrstuvwxyz1!A"); EXPECT_EQ("ABCDEF", absl::AsciiStrToUpper(buf)); EXPECT_EQ("GHIJKL", absl::AsciiStrToUpper(str)); EXPECT_EQ("_`?@[{AMNOPQRSTUVWXYZ", absl::AsciiStrToUpper(sp)); + EXPECT_EQ("ABCDEFGHIJKLMNOPQRSTUVWXYZ1!A", absl::AsciiStrToUpper(long_str)); char mutable_buf[] = "Mutable"; std::transform(mutable_buf, mutable_buf + strlen(mutable_buf), -- cgit v1.2.3