author    Abseil Team <absl-team@google.com>             2023-09-05 09:56:46 -0700
committer Copybara-Service <copybara-worker@google.com>  2023-09-05 09:57:30 -0700
commit    461f1e49b395700ff4d7b0bb820df49e0f8ba5cb (patch)
tree      22fce051a60e5e47dd173b9110e69847a6843906 /absl/crc/internal/crc_memcpy_x86_arm_combined.cc
parent    1a882833c0e81309d0d72d46c768820744d053df (diff)
Rollback adding support for ARM intrinsics
In some configurations this change causes compilation errors. We will roll this
forward again after those issues are addressed.
PiperOrigin-RevId: 562810916
Change-Id: I45b2a8d456273e9eff188f36da8f11323c4dfe66
Diffstat (limited to 'absl/crc/internal/crc_memcpy_x86_arm_combined.cc')
-rw-r--r--  absl/crc/internal/crc_memcpy_x86_arm_combined.cc  450
1 file changed, 0 insertions(+), 450 deletions(-)
diff --git a/absl/crc/internal/crc_memcpy_x86_arm_combined.cc b/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
deleted file mode 100644
index a06485a6..00000000
--- a/absl/crc/internal/crc_memcpy_x86_arm_combined.cc
+++ /dev/null
@@ -1,450 +0,0 @@
-// Copyright 2022 The Abseil Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
-// registers because XMM registers do not support the CRC instruction (yet).
-// While copying, compute the running CRC of the data being copied.
-//
-// It is assumed that any CPU running this code has SSE4.2 instructions
-// available (for CRC32C). This file will do nothing if that is not true.
-//
-// The CRC instruction has a 3-byte latency, and we are stressing the ALU ports
-// here (unlike a traditional memcopy, which has almost no ALU use), so we will
-// need to copy in such a way that the CRC unit is used efficiently. We have two
-// regimes in this code:
-//  1. For operations of size < kCrcSmallSize, do the CRC then the memcpy
-//  2. For operations of size > kCrcSmallSize:
-//     a) compute an initial CRC + copy on a small amount of data to align the
-//        destination pointer on a 16-byte boundary.
-//     b) Split the data into 3 main regions and a tail (smaller than 48 bytes)
-//     c) Do the copy and CRC of the 3 main regions, interleaving (start with
-//        full cache line copies for each region, then move to single 16 byte
-//        pieces per region).
-//     d) Combine the CRCs with CRC32C::Concat.
-//     e) Copy the tail and extend the CRC with the CRC of the tail.
-// This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat
-// takes a significant amount of time. A medium-sized approach could be added
-// using 3 CRCs over fixed-size blocks where the zero-extensions required for
-// CRC32C::Concat can be precomputed.
-
-#ifdef __SSE4_2__
-#include <immintrin.h>
-#endif
-
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-
-#include <array>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-
-#include "absl/base/config.h"
-#include "absl/base/optimization.h"
-#include "absl/base/prefetch.h"
-#include "absl/crc/crc32c.h"
-#include "absl/crc/internal/cpu_detect.h"
-#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
-#include "absl/crc/internal/crc_memcpy.h"
-#include "absl/strings/string_view.h"
-
-#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
-    defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
-
-namespace absl {
-ABSL_NAMESPACE_BEGIN
-namespace crc_internal {
-
-namespace {
-
-inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
-                             crc32c_t crc) {
-  // Small copy: just go 1 byte at a time: being nice to the branch predictor
-  // is more important here than anything else
-  uint32_t crc_uint32 = static_cast<uint32_t>(crc);
-  for (std::size_t i = 0; i < length; i++) {
-    uint8_t data = *reinterpret_cast<const uint8_t*>(src);
-    crc_uint32 = CRC32_u8(crc_uint32, data);
-    *reinterpret_cast<uint8_t*>(dst) = data;
-    ++src;
-    ++dst;
-  }
-  return crc32c_t{crc_uint32};
-}
-
-constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
-
-// Common function for copying the tails of multiple large regions.
-template <size_t vec_regions, size_t int_regions>
-inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
-                          size_t region_size, size_t copy_rounds) {
-  std::array<V128, vec_regions> data;
-  std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;
-
-  while (copy_rounds > 0) {
-    for (size_t i = 0; i < vec_regions; i++) {
-      size_t region = i;
-
-      auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region);
-      auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
-
-      // Load the blocks, unaligned
-      data[i] = V128_LoadU(vsrc);
-
-      // Store the blocks, aligned
-      V128_Store(vdst, data[i]);
-
-      // Compute the running CRC
-      crcs[region] = crc32c_t{static_cast<uint32_t>(
-          CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                    static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
-      crcs[region] = crc32c_t{static_cast<uint32_t>(
-          CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                    static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
-    }
-
-    for (size_t i = 0; i < int_regions; i++) {
-      size_t region = vec_regions + i;
-
-      auto* usrc =
-          reinterpret_cast<const uint64_t*>(*src + region_size * region);
-      auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);
-
-      for (size_t j = 0; j < kIntLoadsPerVec; j++) {
-        size_t data_index = i * kIntLoadsPerVec + j;
-
-        int_data[data_index] = *(usrc + j);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
-            static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
-
-        *(udst + j) = int_data[data_index];
-      }
-    }
-
-    // Increment pointers
-    *src += sizeof(V128);
-    *dst += sizeof(V128);
-    --copy_rounds;
-  }
-}
-
-}  // namespace
-
-template <size_t vec_regions, size_t int_regions>
-class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
- public:
-  AcceleratedCrcMemcpyEngine() = default;
-  AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
-  AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
-      delete;
-
-  crc32c_t Compute(void* __restrict dst, const void* __restrict src,
-                   std::size_t length, crc32c_t initial_crc) const override;
-};
-
-template <size_t vec_regions, size_t int_regions>
-crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
-    void* __restrict dst, const void* __restrict src, std::size_t length,
-    crc32c_t initial_crc) const {
-  constexpr std::size_t kRegions = vec_regions + int_regions;
-  static_assert(kRegions > 0, "Must specify at least one region.");
-  constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
-  constexpr std::size_t kBlockSize = sizeof(V128);
-  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;
-
-  // Number of blocks per cacheline.
-  constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;
-
-  char* dst_bytes = static_cast<char*>(dst);
-  const char* src_bytes = static_cast<const char*>(src);
-
-  // Make sure that one prefetch per big block is enough to cover the whole
-  // dataset, and we don't prefetch too much.
-  static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
-                "Cache lines are not divided evenly into blocks, may have "
-                "unintended behavior!");
-
-  // Experimentally-determined boundary between a small and large copy.
-  // Below this number, spin-up and concatenation of CRCs takes enough time that
-  // it kills the throughput gains of using 3 regions and wide vectors.
-  constexpr size_t kCrcSmallSize = 256;
-
-  // Experimentally-determined prefetch distance. Main loop copies will
-  // prefeth data 2 cache lines ahead.
-  constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;
-
-  // Small-size CRC-memcpy : just do CRC + memcpy
-  if (length < kCrcSmallSize) {
-    crc32c_t crc =
-        ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
-    memcpy(dst, src, length);
-    return crc;
-  }
-
-  // Start work on the CRC: undo the XOR from the previous calculation or set up
-  // the initial value of the CRC.
-  // initial_crc ^= kCrcDataXor;
-  initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};
-
-  // Do an initial alignment copy, so we can use aligned store instructions to
-  // the destination pointer. We align the destination pointer because the
-  // penalty for an unaligned load is small compared to the penalty of an
-  // unaligned store on modern CPUs.
-  std::size_t bytes_from_last_aligned =
-      reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
-  if (bytes_from_last_aligned != 0) {
-    std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;
-
-    // Do the short-sized copy and CRC.
-    initial_crc =
-        ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
-    src_bytes += bytes_for_alignment;
-    dst_bytes += bytes_for_alignment;
-    length -= bytes_for_alignment;
-  }
-
-  // We are going to do the copy and CRC in kRegions regions to make sure that
-  // we can saturate the CRC unit. The CRCs will be combined at the end of the
-  // run. Copying will use the SSE registers, and we will extract words from
-  // the SSE registers to add to the CRC. Initially, we run the loop one full
-  // cache line per region at a time, in order to insert prefetches.
-
-  // Initialize CRCs for kRegions regions.
-  crc32c_t crcs[kRegions];
-  crcs[0] = initial_crc;
-  for (size_t i = 1; i < kRegions; i++) {
-    crcs[i] = crc32c_t{kCrcDataXor};
-  }
-
-  // Find the number of rounds to copy and the region size. Also compute the
-  // tail size here.
-  size_t copy_rounds = length / kCopyRoundSize;
-
-  // Find the size of each region and the size of the tail.
-  const std::size_t region_size = copy_rounds * kBlockSize;
-  const std::size_t tail_size = length - (kRegions * region_size);
-
-  // Holding registers for data in each region.
-  std::array<V128, vec_regions> vec_data;
-  std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;
-
-  // Main loop.
-  while (copy_rounds > kBlocksPerCacheLine) {
-    // Prefetch kPrefetchAhead bytes ahead of each pointer.
-    for (size_t i = 0; i < kRegions; i++) {
-      absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
-#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
-      // TODO(b/297082454): investigate dropping prefetch on x86.
-      absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
-#endif
-    }
-
-    // Load and store data, computing CRC on the way.
-    for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
-      // Copy and CRC the data for the CRC regions.
-      for (size_t j = 0; j < vec_regions; j++) {
-        // Cycle which regions get vector load/store and integer load/store, to
-        // engage prefetching logic around vector load/stores and save issue
-        // slots by using the integer registers.
-        size_t region = (j + i) % kRegions;
-
-        auto* vsrc =
-            reinterpret_cast<const V128u*>(src_bytes + region_size * region);
-        auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
-
-        // Load and CRC data.
-        vec_data[j] = V128_LoadU(vsrc + i);
-        crcs[region] = crc32c_t{static_cast<uint32_t>(
-            CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                      static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
-        crcs[region] = crc32c_t{static_cast<uint32_t>(
-            CRC32_u64(static_cast<uint32_t>(crcs[region]),
-                      static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
-
-        // Store the data.
-        V128_Store(vdst + i, vec_data[j]);
-      }
-
-      // Preload the partial CRCs for the CLMUL subregions.
-      for (size_t j = 0; j < int_regions; j++) {
-        // Cycle which regions get vector load/store and integer load/store, to
-        // engage prefetching logic around vector load/stores and save issue
-        // slots by using the integer registers.
-        size_t region = (j + vec_regions + i) % kRegions;
-
-        auto* usrc =
-            reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
-        auto* udst =
-            reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);
-
-        for (size_t k = 0; k < kIntLoadsPerVec; k++) {
-          size_t data_index = j * kIntLoadsPerVec + k;
-
-          // Load and CRC the data.
-          int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
-          crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
-              static_cast<uint32_t>(crcs[region]), int_data[data_index]))};
-
-          // Store the data.
-          *(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
-        }
-      }
-    }
-
-    // Increment pointers
-    src_bytes += kBlockSize * kBlocksPerCacheLine;
-    dst_bytes += kBlockSize * kBlocksPerCacheLine;
-    copy_rounds -= kBlocksPerCacheLine;
-  }
-
-  // Copy and CRC the tails of each region.
-  LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
-                                          region_size, copy_rounds);
-
-  // Move the source and destination pointers to the end of the region
-  src_bytes += region_size * (kRegions - 1);
-  dst_bytes += region_size * (kRegions - 1);
-
-  // Copy and CRC the tail through the XMM registers.
-  std::size_t tail_blocks = tail_size / kBlockSize;
-  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
-                      tail_blocks);
-
-  // Final tail copy for under 16 bytes.
-  crcs[kRegions - 1] =
-      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
-                   crcs[kRegions - 1]);
-
-  if (kRegions == 1) {
-    // If there is only one region, finalize and return its CRC.
-    return crc32c_t{static_cast<uint32_t>(crcs[0]) ^ kCrcDataXor};
-  }
-
-  // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
-  // XOR done before doing block copy + CRCs.
-  for (size_t i = 0; i + 1 < kRegions; i++) {
-    crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
-  }
-
-  // Build a CRC of the first kRegions - 1 regions.
-  crc32c_t full_crc = crcs[0];
-  for (size_t i = 1; i + 1 < kRegions; i++) {
-    full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
-  }
-
-  // Finalize and concatenate the final CRC, then return.
-  crcs[kRegions - 1] =
-      crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
-  return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
-}
-
-CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
-#ifdef UNDEFINED_BEHAVIOR_SANITIZER
-  // UBSAN does not play nicely with unaligned loads (which we use a lot).
-  // Get the underlying architecture.
-  CpuType cpu_type = GetCpuType();
-  switch (cpu_type) {
-    case CpuType::kAmdRome:
-    case CpuType::kAmdNaples:
-    case CpuType::kAmdMilan:
-    case CpuType::kAmdGenoa:
-    case CpuType::kAmdRyzenV3000:
-    case CpuType::kIntelCascadelakeXeon:
-    case CpuType::kIntelSkylakeXeon:
-    case CpuType::kIntelSkylake:
-    case CpuType::kIntelBroadwell:
-    case CpuType::kIntelHaswell:
-    case CpuType::kIntelIvybridge:
-      return {
-          /*.temporal=*/new FallbackCrcMemcpyEngine(),
-          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
-      };
-    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
-    case CpuType::kIntelSandybridge:
-      return {
-          /*.temporal=*/new FallbackCrcMemcpyEngine(),
-          /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(),
-      };
-    default:
-      return {/*.temporal=*/new FallbackCrcMemcpyEngine(),
-              /*.non_temporal=*/new FallbackCrcMemcpyEngine()};
-  }
-#else
-  // Get the underlying architecture.
-  CpuType cpu_type = GetCpuType();
-  switch (cpu_type) {
-    // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store port
-    // which data movement from the vector registers to the integer registers
-    // (where CRC32C happens) to crowd the same units as vector stores. As a
-    // result, using that path exclusively causes bottlenecking on this port.
-    // We can avoid this bottleneck by using the integer side of the CPU for
-    // most operations rather than the vector side. We keep a vector region to
-    // engage some of the prefetching logic in the cache hierarchy which seems
-    // to give vector instructions special treatment. These prefetch units see
-    // strided access to each region, and do the right thing.
-    case CpuType::kAmdRome:
-    case CpuType::kAmdNaples:
-    case CpuType::kAmdMilan:
-    case CpuType::kAmdGenoa:
-    case CpuType::kAmdRyzenV3000:
-      return {
-          /*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(),
-          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
-      };
-    // PCLMULQDQ is slow and we don't have wide enough issue width to take
-    // advantage of it. For an unknown architecture, don't risk using CLMULs.
-    case CpuType::kIntelCascadelakeXeon:
-    case CpuType::kIntelSkylakeXeon:
-    case CpuType::kIntelSkylake:
-    case CpuType::kIntelBroadwell:
-    case CpuType::kIntelHaswell:
-    case CpuType::kIntelIvybridge:
-      return {
-          /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(),
-          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
-      };
-    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
-    case CpuType::kIntelSandybridge:
-      return {
-          /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(),
-          /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(),
-      };
-    default:
-      return {/*.temporal=*/new FallbackCrcMemcpyEngine(),
-              /*.non_temporal=*/new FallbackCrcMemcpyEngine()};
-  }
-#endif  // UNDEFINED_BEHAVIOR_SANITIZER
-}
-
-// For testing, allow the user to specify which engine they want.
-std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
-                                                          int integer) {
-  if (vector == 3 && integer == 0) {
-    return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
-  } else if (vector == 1 && integer == 2) {
-    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
-  } else if (vector == 1 && integer == 0) {
-    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 0>>();
-  }
-  return nullptr;
-}
-
-}  // namespace crc_internal
-ABSL_NAMESPACE_END
-}  // namespace absl
-
-#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
-        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
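Editor's note (not part of the commit): the deleted file's header comment describes computing independent CRCs over several regions and then stitching them together with CRC concatenation. A minimal, hedged sketch of that combining step, using only the public Abseil CRC API, is shown below; the helper name CrcInThreeRegions is hypothetical, and the real engine additionally fuses the copy with the CRC, XOR-finalizes intermediate values, and chooses region counts per CPU.

#include <cstddef>

#include "absl/crc/crc32c.h"
#include "absl/strings/string_view.h"

// Illustrative only: CRC three roughly equal regions independently, then
// combine them with absl::ConcatCrc32c, analogous to the post-processing
// step the rolled-back AcceleratedCrcMemcpyEngine performs after its
// interleaved copy/CRC loop.
absl::crc32c_t CrcInThreeRegions(absl::string_view data) {
  const size_t region = data.size() / 3;
  const absl::crc32c_t c0 = absl::ComputeCrc32c(data.substr(0, region));
  const absl::crc32c_t c1 = absl::ComputeCrc32c(data.substr(region, region));
  // The last region also absorbs the tail bytes.
  const absl::crc32c_t c2 = absl::ComputeCrc32c(data.substr(2 * region));
  absl::crc32c_t crc = absl::ConcatCrc32c(c0, c1, region);
  return absl::ConcatCrc32c(crc, c2, data.size() - 2 * region);
}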