diff options
Diffstat (limited to 'src/dsp/x86/common_sse4.h')
-rw-r--r-- | src/dsp/x86/common_sse4.h | 225 |
1 files changed, 45 insertions, 180 deletions
diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h index c510f8c..41a3a68 100644 --- a/src/dsp/x86/common_sse4.h +++ b/src/dsp/x86/common_sse4.h @@ -28,7 +28,6 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> #include <cstring> #if 0 @@ -71,192 +70,58 @@ inline void PrintRegX(const int r, const char* const name) { #define PR(var, N) PrintReg(var, #var, N) #define PD(var) PrintReg(var, #var); #define PX(var) PrintRegX(var, #var); -#endif // 0 - -namespace libgav1 { -namespace dsp { - -//------------------------------------------------------------------------------ -// Load functions. - -inline __m128i Load2(const void* src) { - int16_t val; - memcpy(&val, src, sizeof(val)); - return _mm_cvtsi32_si128(val); -} - -inline __m128i Load2x2(const void* src1, const void* src2) { - uint16_t val1; - uint16_t val2; - memcpy(&val1, src1, sizeof(val1)); - memcpy(&val2, src2, sizeof(val2)); - return _mm_cvtsi32_si128(val1 | (val2 << 16)); -} - -// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1. -template <int lane> -inline __m128i Load2(const void* const buf, __m128i val) { - uint16_t temp; - memcpy(&temp, buf, 2); - return _mm_insert_epi16(val, temp, lane); -} - -inline __m128i Load4(const void* src) { - // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 - // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a - // movss instruction. - // - // Until compiler support of _mm_loadu_si32 is widespread, use of - // _mm_loadu_si32 is banned. - int val; - memcpy(&val, src, sizeof(val)); - return _mm_cvtsi32_si128(val); -} - -inline __m128i Load4x2(const void* src1, const void* src2) { - // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 - // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a - // movss instruction. - // - // Until compiler support of _mm_loadu_si32 is widespread, use of - // _mm_loadu_si32 is banned. - int val1, val2; - memcpy(&val1, src1, sizeof(val1)); - memcpy(&val2, src2, sizeof(val2)); - return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1); -} -inline __m128i LoadLo8(const void* a) { - return _mm_loadl_epi64(static_cast<const __m128i*>(a)); -} - -inline __m128i LoadHi8(const __m128i v, const void* a) { - const __m128 x = - _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a)); - return _mm_castps_si128(x); -} - -inline __m128i LoadUnaligned16(const void* a) { - return _mm_loadu_si128(static_cast<const __m128i*>(a)); -} - -inline __m128i LoadAligned16(const void* a) { - assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); - return _mm_load_si128(static_cast<const __m128i*>(a)); -} - -//------------------------------------------------------------------------------ -// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. - -inline __m128i MaskOverreads(const __m128i source, - const ptrdiff_t over_read_in_bytes) { - __m128i dst = source; #if LIBGAV1_MSAN - if (over_read_in_bytes > 0) { - __m128i mask = _mm_set1_epi8(-1); - for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) { - mask = _mm_srli_si128(mask, 1); - } - dst = _mm_and_si128(dst, mask); - } -#else - static_cast<void>(over_read_in_bytes); -#endif - return dst; -} +#include <sanitizer/msan_interface.h> -inline __m128i LoadLo8Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8); +inline void PrintShadow(const void* r, const char* const name, + const size_t size) { + fprintf(stderr, "Shadow for %s:\n", name); + __msan_print_shadow(r, size); } +#define PS(var, N) PrintShadow(var, #var, N) -inline __m128i LoadHi8Msan(const __m128i v, const void* source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadHi8(v, source), over_read_in_bytes); -} - -inline __m128i LoadAligned16Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadAligned16(source), over_read_in_bytes); -} +#endif // LIBGAV1_MSAN -inline __m128i LoadUnaligned16Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes); -} - -//------------------------------------------------------------------------------ -// Store functions. - -inline void Store2(void* dst, const __m128i x) { - const int val = _mm_cvtsi128_si32(x); - memcpy(dst, &val, 2); -} - -inline void Store4(void* dst, const __m128i x) { - const int val = _mm_cvtsi128_si32(x); - memcpy(dst, &val, sizeof(val)); -} - -inline void StoreLo8(void* a, const __m128i v) { - _mm_storel_epi64(static_cast<__m128i*>(a), v); -} - -inline void StoreHi8(void* a, const __m128i v) { - _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v)); -} - -inline void StoreAligned16(void* a, const __m128i v) { - assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); - _mm_store_si128(static_cast<__m128i*>(a), v); -} - -inline void StoreUnaligned16(void* a, const __m128i v) { - _mm_storeu_si128(static_cast<__m128i*>(a), v); -} - -//------------------------------------------------------------------------------ -// Arithmetic utilities. - -inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) { - assert(bits <= 16); - // Shift out all but the last bit. - const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); - // Avg with zero will shift by 1 and round. - return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128()); -} - -inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) { - assert(bits <= 16); - const __m128i v_bias_d = - _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); - const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d); - return _mm_srai_epi16(v_tmp_d, bits); -} - -inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); - return _mm_srli_epi32(v_tmp_d, bits); -} - -inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); - return _mm_srai_epi32(v_tmp_d, bits); -} - -//------------------------------------------------------------------------------ -// Masking utilities -inline __m128i MaskHighNBytes(int n) { - static constexpr uint8_t kMask[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - }; +#endif // 0 - return LoadUnaligned16(kMask + n); -} +namespace libgav1 { +namespace dsp { +namespace sse4 { + +#include "src/dsp/x86/common_sse4.inc" + +} // namespace sse4 + +// NOLINTBEGIN(misc-unused-using-decls) +// These function aliases shall not be visible to external code. They are +// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish two +// possible implementations of common functions, which may differ based on +// whether the compiler is permitted to use avx2 instructions. +using sse4::Load2; +using sse4::Load2x2; +using sse4::Load4; +using sse4::Load4x2; +using sse4::LoadAligned16; +using sse4::LoadAligned16Msan; +using sse4::LoadHi8; +using sse4::LoadHi8Msan; +using sse4::LoadLo8; +using sse4::LoadLo8Msan; +using sse4::LoadUnaligned16; +using sse4::LoadUnaligned16Msan; +using sse4::MaskHighNBytes; +using sse4::RightShiftWithRounding_S16; +using sse4::RightShiftWithRounding_S32; +using sse4::RightShiftWithRounding_U16; +using sse4::RightShiftWithRounding_U32; +using sse4::Store2; +using sse4::Store4; +using sse4::StoreAligned16; +using sse4::StoreHi8; +using sse4::StoreLo8; +using sse4::StoreUnaligned16; +// NOLINTEND } // namespace dsp } // namespace libgav1 |