diff options
Diffstat (limited to 'src/dsp/x86/inverse_transform_sse4.cc')
-rw-r--r-- | src/dsp/x86/inverse_transform_sse4.cc | 104 |
1 files changed, 25 insertions, 79 deletions
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc index 787d706..12c008f 100644 --- a/src/dsp/x86/inverse_transform_sse4.cc +++ b/src/dsp/x86/inverse_transform_sse4.cc @@ -94,8 +94,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b, static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16)); const __m128i ba = _mm_unpacklo_epi16(*a, *b); const __m128i ab = _mm_unpacklo_epi16(*b, *a); - const __m128i sign = - _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001); + const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001)); // -sin cos, -sin cos, -sin cos, -sin cos const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign); const __m128i x0 = _mm_madd_epi16(ba, msin_pcos); @@ -121,8 +120,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b, const int16_t sin128 = Sin128(angle); const __m128i psin_pcos = _mm_set1_epi32( static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16)); - const __m128i sign = - _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001); + const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001)); // -sin cos, -sin cos, -sin cos, -sin cos const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign); const __m128i ba = _mm_unpacklo_epi16(*a, *b); @@ -229,7 +227,8 @@ LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height, const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); const __m128i v_src = (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1039,7 +1038,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src = _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1194,7 +1194,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, __m128i s[8]; const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1519,7 +1520,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, __m128i x[16]; const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1615,7 +1617,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1767,7 +1770,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1859,7 +1863,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round0 = @@ -2918,75 +2923,11 @@ void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type, //------------------------------------------------------------------------------ -template <typename Residual, typename Pixel> -void InitAll(Dsp* const dsp) { - // Maximum transform size for Dct is 64. - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = - Dct4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = - Dct4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = - Dct8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = - Dct8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = - Dct16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = - Dct16TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = - Dct32TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = - Dct32TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = - Dct64TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = - Dct64TransformLoopColumn_SSE4_1; - - // Maximum transform size for Adst is 16. - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = - Adst4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = - Adst4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = - Adst8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = - Adst8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = - Adst16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = - Adst16TransformLoopColumn_SSE4_1; - - // Maximum transform size for Identity transform is 32. - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = - Identity4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = - Identity4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = - Identity8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = - Identity8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = - Identity16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = - Identity16TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] = - Identity32TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = - Identity32TransformLoopColumn_SSE4_1; - - // Maximum transform size for Wht is 4. - dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = - Wht4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = - Wht4TransformLoopColumn_SSE4_1; -} - void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); -#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS - InitAll<int16_t, uint8_t>(dsp); -#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + + // Maximum transform size for Dct is 64. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct) dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = Dct4TransformLoopRow_SSE4_1; @@ -3017,6 +2958,8 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = Dct64TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Adst is 16. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst) dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = Adst4TransformLoopRow_SSE4_1; @@ -3035,6 +2978,8 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = Adst16TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Identity transform is 32. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity) dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = Identity4TransformLoopRow_SSE4_1; @@ -3059,13 +3004,14 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = Identity32TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Wht is 4. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht) dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = Wht4TransformLoopRow_SSE4_1; dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = Wht4TransformLoopColumn_SSE4_1; #endif -#endif } } // namespace @@ -3075,7 +3021,7 @@ void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { |