diff options
Diffstat (limited to 'src/dsp/x86/average_blend_sse4.cc')
-rw-r--r-- | src/dsp/x86/average_blend_sse4.cc | 224 |
1 files changed, 222 insertions, 2 deletions
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc index 8e008d1..ec9f589 100644 --- a/src/dsp/x86/average_blend_sse4.cc +++ b/src/dsp/x86/average_blend_sse4.cc @@ -30,6 +30,7 @@ namespace libgav1 { namespace dsp { +namespace low_bitdepth { namespace { constexpr int kInterPostRoundBit = 4; @@ -138,13 +139,232 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth -void AverageBlendInit_SSE4_1() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kInterPostRoundBitPlusOne = 5; + +template <const int width, const int offset> +inline void AverageBlendRow(const uint16_t* prediction_0, + const uint16_t* prediction_1, + const __m128i& compound_offset, + const __m128i& round_offset, const __m128i& max, + const __m128i& zero, uint16_t* dst, + const ptrdiff_t dest_stride) { + // pred_0/1 max range is 16b; each half is widened to 32b lanes before the two predictions are summed. + const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset); + const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset); + const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0); + const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero); + const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1); + const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero); + + const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10); + const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11); + const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset); + const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset); + // RightShiftWithRounding and Clip3: add round_offset, shift right by kInterPostRoundBitPlusOne, pack with unsigned saturation, then clamp to max. 
+ const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset); + const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset); + const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne); + const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne); + const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max); + if (width != 4) { + // Store width=8/16/32/64/128. + StoreUnaligned16(dst + offset, result); + return; + } + assert(width == 4); + StoreLo8(dst, result); + StoreHi8(dst + dest_stride, result); +} + +void AverageBlend10bpp_SSE4_1(const void* prediction_0, + const void* prediction_1, const int width, + const int height, void* const dest, + const ptrdiff_t dst_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const __m128i compound_offset = + _mm_set1_epi32(kCompoundOffset + kCompoundOffset); + const __m128i round_offset = + _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1); + const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1); + const __m128i zero = _mm_setzero_si128(); + int y = height; + + if (width == 4) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0,1: a single call writes two rows via StoreLo8/StoreHi8, so y steps by 2. + AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 8) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // row1. 
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 16) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // row1. + AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 32) { + do { + // pred [0 - 15]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [16 - 31]. + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + return; + } + if (width == 64) { + do { + // pred [0 - 31]. 
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [32 - 63]. + AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + return; + } + assert(width == 128); + do { + // pred [0 - 31]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [32 - 63]. + AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + + // pred [64 - 95]. 
+ AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [96 - 127]. + AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend) + dsp->average_blend = AverageBlend10bpp_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void AverageBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { |