Diffstat (limited to 'src/dsp/x86/common_avx2.h')
-rw-r--r-- | src/dsp/x86/common_avx2.h | 151
1 file changed, 51 insertions(+), 100 deletions(-)
diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h
index 4ce7de2..373116a 100644
--- a/src/dsp/x86/common_avx2.h
+++ b/src/dsp/x86/common_avx2.h
@@ -27,109 +27,60 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <cstring>

 namespace libgav1 {
 namespace dsp {
-
-//------------------------------------------------------------------------------
-// Compatibility functions.
-
-inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
-  // For compatibility with older gcc toolchains (< 8) use
-  // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations
-  // are implemented similarly to the following, clang uses a different method
-  // but no differences in assembly have been observed.
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
-}
-
-//------------------------------------------------------------------------------
-// Load functions.
-
-inline __m256i LoadAligned32(const void* a) {
-  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
-  return _mm256_load_si256(static_cast<const __m256i*>(a));
-}
-
-inline void LoadAligned64(const void* a, __m256i dst[2]) {
-  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
-  dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
-  dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
-}
-
-inline __m256i LoadUnaligned32(const void* a) {
-  return _mm256_loadu_si256(static_cast<const __m256i*>(a));
-}
-
-//------------------------------------------------------------------------------
-// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
-
-inline __m256i MaskOverreads(const __m256i source,
-                             const ptrdiff_t over_read_in_bytes) {
-  __m256i dst = source;
-#if LIBGAV1_MSAN
-  if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
-  if (over_read_in_bytes > 0) {
-    __m128i m = _mm_set1_epi8(-1);
-    for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
-      m = _mm_srli_si128(m, 1);
-    }
-    const __m256i mask = (over_read_in_bytes < 16)
-                             ? SetrM128i(_mm_set1_epi8(-1), m)
-                             : SetrM128i(m, _mm_setzero_si128());
-    dst = _mm256_and_si256(dst, mask);
-  }
-#else
-  static_cast<void>(over_read_in_bytes);
-#endif
-  return dst;
-}
-
-inline __m256i LoadAligned32Msan(const void* const source,
-                                 const ptrdiff_t over_read_in_bytes) {
-  return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
-}
-
-inline void LoadAligned64Msan(const void* const source,
-                              const ptrdiff_t over_read_in_bytes,
-                              __m256i dst[2]) {
-  dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
-  dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
-                         over_read_in_bytes);
-}
-
-inline __m256i LoadUnaligned32Msan(const void* const source,
-                                   const ptrdiff_t over_read_in_bytes) {
-  return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
-}
-
-//------------------------------------------------------------------------------
-// Store functions.
-
-inline void StoreAligned32(void* a, const __m256i v) {
-  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
-  _mm256_store_si256(static_cast<__m256i*>(a), v);
-}
-
-inline void StoreAligned64(void* a, const __m256i v[2]) {
-  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
-  _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
-  _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
-}
-
-inline void StoreUnaligned32(void* a, const __m256i v) {
-  _mm256_storeu_si256(static_cast<__m256i*>(a), v);
-}
-
-//------------------------------------------------------------------------------
-// Arithmetic utilities.
-
-inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
-  assert(bits <= 16);
-  const __m256i v_bias_d =
-      _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
-  const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
-  return _mm256_srai_epi16(v_tmp_d, bits);
-}
+namespace avx2 {
+
+#include "src/dsp/x86/common_avx2.inc"
+#include "src/dsp/x86/common_sse4.inc"
+
+}  // namespace avx2
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+
+// common_sse4.inc
+using avx2::Load2;
+using avx2::Load2x2;
+using avx2::Load4;
+using avx2::Load4x2;
+using avx2::LoadAligned16;
+using avx2::LoadAligned16Msan;
+using avx2::LoadHi8;
+using avx2::LoadHi8Msan;
+using avx2::LoadLo8;
+using avx2::LoadLo8Msan;
+using avx2::LoadUnaligned16;
+using avx2::LoadUnaligned16Msan;
+using avx2::MaskHighNBytes;
+using avx2::RightShiftWithRounding_S16;
+using avx2::RightShiftWithRounding_S32;
+using avx2::RightShiftWithRounding_U16;
+using avx2::RightShiftWithRounding_U32;
+using avx2::Store2;
+using avx2::Store4;
+using avx2::StoreAligned16;
+using avx2::StoreHi8;
+using avx2::StoreLo8;
+using avx2::StoreUnaligned16;
+
+// common_avx2.inc
+using avx2::LoadAligned32;
+using avx2::LoadAligned32Msan;
+using avx2::LoadAligned64;
+using avx2::LoadAligned64Msan;
+using avx2::LoadUnaligned32;
+using avx2::LoadUnaligned32Msan;
+using avx2::SetrM128i;
+using avx2::StoreAligned32;
+using avx2::StoreAligned64;
+using avx2::StoreUnaligned32;
+// NOLINTEND

 }  // namespace dsp
 }  // namespace libgav1
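For context, a rough usage sketch (not part of the change above): after this refactor a x86/*_avx2.cc translation unit can keep calling the helpers unqualified, because the using-declarations re-export the avx2-namespaced definitions into libgav1::dsp. The function RoundRow_AVX2 and its parameters below are invented for illustration and assume an AVX2-enabled build; only the header path and the aliased names (LoadUnaligned32, RightShiftWithRounding_S16, StoreUnaligned32) come from the patch.

// Illustrative sketch only; RoundRow_AVX2 is a hypothetical helper.
#include "src/dsp/x86/common_avx2.h"

#include <immintrin.h>

#include <cstdint>

namespace libgav1 {
namespace dsp {

// Rounds 16 int16_t lanes right by |bits| and stores the result. The calls are
// unqualified; the using-declarations in common_avx2.h resolve them to the
// avx2:: copies compiled from the .inc files.
void RoundRow_AVX2(const int16_t* src, int16_t* dst, int bits) {
  const __m256i v = LoadUnaligned32(src);
  const __m256i rounded = RightShiftWithRounding_S16(v, bits);
  StoreUnaligned32(dst, rounded);
}

}  // namespace dsp
}  // namespace libgav1

As the comment in the patch notes, compiling the .inc bodies inside namespace avx2 keeps these copies of the common helpers distinct from the ones built without AVX2 enabled, which presumably avoids mismatched inline definitions of the same functions across translation units.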