Diffstat (limited to 'src/dsp/x86/common_avx2.h')
-rw-r--r--  src/dsp/x86/common_avx2.h  151
1 file changed, 51 insertions(+), 100 deletions(-)
diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h
index 4ce7de2..373116a 100644
--- a/src/dsp/x86/common_avx2.h
+++ b/src/dsp/x86/common_avx2.h
@@ -27,109 +27,60 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
+#include <cstring>
namespace libgav1 {
namespace dsp {
-
-//------------------------------------------------------------------------------
-// Compatibility functions.
-
-inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
- // For compatibility with older gcc toolchains (< 8) use
- // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations
- // are implemented similarly to the following, clang uses a different method
- // but no differences in assembly have been observed.
- return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
-}
-
-//------------------------------------------------------------------------------
-// Load functions.
-
-inline __m256i LoadAligned32(const void* a) {
- assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
- return _mm256_load_si256(static_cast<const __m256i*>(a));
-}
-
-inline void LoadAligned64(const void* a, __m256i dst[2]) {
- assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
- dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
- dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
-}
-
-inline __m256i LoadUnaligned32(const void* a) {
- return _mm256_loadu_si256(static_cast<const __m256i*>(a));
-}
-
-//------------------------------------------------------------------------------
-// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
-
-inline __m256i MaskOverreads(const __m256i source,
- const ptrdiff_t over_read_in_bytes) {
- __m256i dst = source;
-#if LIBGAV1_MSAN
- if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
- if (over_read_in_bytes > 0) {
- __m128i m = _mm_set1_epi8(-1);
- for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
- m = _mm_srli_si128(m, 1);
- }
- const __m256i mask = (over_read_in_bytes < 16)
- ? SetrM128i(_mm_set1_epi8(-1), m)
- : SetrM128i(m, _mm_setzero_si128());
- dst = _mm256_and_si256(dst, mask);
- }
-#else
- static_cast<void>(over_read_in_bytes);
-#endif
- return dst;
-}
-
-inline __m256i LoadAligned32Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes) {
- return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
-}
-
-inline void LoadAligned64Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes,
- __m256i dst[2]) {
- dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
- dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
- over_read_in_bytes);
-}
-
-inline __m256i LoadUnaligned32Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes) {
- return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
-}
-
-//------------------------------------------------------------------------------
-// Store functions.
-
-inline void StoreAligned32(void* a, const __m256i v) {
- assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
- _mm256_store_si256(static_cast<__m256i*>(a), v);
-}
-
-inline void StoreAligned64(void* a, const __m256i v[2]) {
- assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
- _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
- _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
-}
-
-inline void StoreUnaligned32(void* a, const __m256i v) {
- _mm256_storeu_si256(static_cast<__m256i*>(a), v);
-}
-
-//------------------------------------------------------------------------------
-// Arithmetic utilities.
-
-inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
- assert(bits <= 16);
- const __m256i v_bias_d =
- _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
- const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
- return _mm256_srai_epi16(v_tmp_d, bits);
-}
+namespace avx2 {
+
+#include "src/dsp/x86/common_avx2.inc"
+#include "src/dsp/x86/common_sse4.inc"
+
+} // namespace avx2
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+
+// common_sse4.inc
+using avx2::Load2;
+using avx2::Load2x2;
+using avx2::Load4;
+using avx2::Load4x2;
+using avx2::LoadAligned16;
+using avx2::LoadAligned16Msan;
+using avx2::LoadHi8;
+using avx2::LoadHi8Msan;
+using avx2::LoadLo8;
+using avx2::LoadLo8Msan;
+using avx2::LoadUnaligned16;
+using avx2::LoadUnaligned16Msan;
+using avx2::MaskHighNBytes;
+using avx2::RightShiftWithRounding_S16;
+using avx2::RightShiftWithRounding_S32;
+using avx2::RightShiftWithRounding_U16;
+using avx2::RightShiftWithRounding_U32;
+using avx2::Store2;
+using avx2::Store4;
+using avx2::StoreAligned16;
+using avx2::StoreHi8;
+using avx2::StoreLo8;
+using avx2::StoreUnaligned16;
+
+// common_avx2.inc
+using avx2::LoadAligned32;
+using avx2::LoadAligned32Msan;
+using avx2::LoadAligned64;
+using avx2::LoadAligned64Msan;
+using avx2::LoadUnaligned32;
+using avx2::LoadUnaligned32Msan;
+using avx2::SetrM128i;
+using avx2::StoreAligned32;
+using avx2::StoreAligned64;
+using avx2::StoreUnaligned32;
+// NOLINTEND
} // namespace dsp
} // namespace libgav1
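
Illustrative note (not part of the commit): a minimal sketch of how an x86/*_avx2.cc translation unit would pick up the aliases introduced above. Because the using-declarations live at namespace dsp scope, code inside namespace dsp can call the helpers unqualified and resolve to the avx2:: copies that were compiled with AVX2 code generation enabled. The file contents and the function name CopyRow32 are hypothetical; only the include path and the aliased helper names come from the header in this diff.

#include <immintrin.h>

#include <cstdint>

#include "src/dsp/x86/common_avx2.h"

namespace libgav1 {
namespace dsp {
namespace {

// Unqualified calls resolve through the using-declarations in common_avx2.h,
// so this translation unit gets the AVX2-built versions of the shared helpers.
void CopyRow32(const uint8_t* src, uint8_t* dst) {
  const __m256i v = LoadUnaligned32(src);  // avx2::LoadUnaligned32
  StoreUnaligned32(dst, v);                // avx2::StoreUnaligned32
}

}  // namespace
}  // namespace dsp
}  // namespace libgav1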