about | summary | refs | log | tree | commit | diff
path: root/src/dsp/x86/inverse_transform_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/inverse_transform_sse4.cc')
-rw-r--r-- src/dsp/x86/inverse_transform_sse4.cc 104
1 file changed, 25 insertions, 79 deletions
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc
index 787d706..12c008f 100644
--- a/src/dsp/x86/inverse_transform_sse4.cc
+++ b/src/dsp/x86/inverse_transform_sse4.cc
@@ -94,8 +94,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
const __m128i ba = _mm_unpacklo_epi16(*a, *b);
const __m128i ab = _mm_unpacklo_epi16(*b, *a);
- const __m128i sign =
- _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
// -sin cos, -sin cos, -sin cos, -sin cos
const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
@@ -121,8 +120,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
const int16_t sin128 = Sin128(angle);
const __m128i psin_pcos = _mm_set1_epi32(
static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
- const __m128i sign =
- _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
// -sin cos, -sin cos, -sin cos, -sin cos
const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
const __m128i ba = _mm_unpacklo_epi16(*a, *b);
@@ -229,7 +227,8 @@ LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
const __m128i v_src =
(width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1039,7 +1038,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
auto* dst = static_cast<int16_t*>(dest);
const __m128i v_src =
_mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1194,7 +1194,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
__m128i s[8];
const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1519,7 +1520,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
__m128i x[16];
const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1615,7 +1617,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
auto* dst = static_cast<int16_t*>(dest);
const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1767,7 +1770,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
auto* dst = static_cast<int16_t*>(dest);
const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1859,7 +1863,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
auto* dst = static_cast<int16_t*>(dest);
const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round0 =
@@ -2918,75 +2923,11 @@ void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
//------------------------------------------------------------------------------
-template <typename Residual, typename Pixel>
-void InitAll(Dsp* const dsp) {
- // Maximum transform size for Dct is 64.
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
- Dct4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
- Dct4TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
- Dct8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
- Dct8TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
- Dct16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
- Dct16TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
- Dct32TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
- Dct32TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
- Dct64TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
- Dct64TransformLoopColumn_SSE4_1;
-
- // Maximum transform size for Adst is 16.
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
- Adst4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
- Adst4TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
- Adst8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
- Adst8TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
- Adst16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
- Adst16TransformLoopColumn_SSE4_1;
-
- // Maximum transform size for Identity transform is 32.
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
- Identity4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
- Identity4TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
- Identity8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
- Identity8TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
- Identity16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
- Identity16TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
- Identity32TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
- Identity32TransformLoopColumn_SSE4_1;
-
- // Maximum transform size for Wht is 4.
- dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
- Wht4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
- Wht4TransformLoopColumn_SSE4_1;
-}
-
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- InitAll<int16_t, uint8_t>(dsp);
-#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+ // Maximum transform size for Dct is 64.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
Dct4TransformLoopRow_SSE4_1;
@@ -3017,6 +2958,8 @@ void Init8bpp() {
dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
Dct64TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Adst is 16.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
Adst4TransformLoopRow_SSE4_1;
@@ -3035,6 +2978,8 @@ void Init8bpp() {
dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
Adst16TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Identity transform is 32.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
Identity4TransformLoopRow_SSE4_1;
@@ -3059,13 +3004,14 @@ void Init8bpp() {
dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
Identity32TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Wht is 4.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
Wht4TransformLoopRow_SSE4_1;
dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
Wht4TransformLoopColumn_SSE4_1;
#endif
-#endif
}
} // namespace
@@ -3075,7 +3021,7 @@ void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {