aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/x86/average_blend_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/average_blend_sse4.cc')
-rw-r--r--src/dsp/x86/average_blend_sse4.cc224
1 files changed, 222 insertions, 2 deletions
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index 8e008d1..ec9f589 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -30,6 +30,7 @@
namespace libgav1 {
namespace dsp {
+namespace low_bitdepth {
namespace {
constexpr int kInterPostRoundBit = 4;
@@ -138,13 +139,232 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
-void AverageBlendInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kInterPostRoundBitPlusOne = 5;
+
+template <const int width, const int offset>
+inline void AverageBlendRow(const uint16_t* prediction_0,
+ const uint16_t* prediction_1,
+ const __m128i& compound_offset,
+ const __m128i& round_offset, const __m128i& max,
+ const __m128i& zero, uint16_t* dst,
+ const ptrdiff_t dest_stride) {
+ // pred_0/1 max range is 16b.
+ const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
+ const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset);
+ const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0);
+ const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero);
+ const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1);
+ const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero);
+
+ const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10);
+ const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11);
+ const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset);
+ const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset);
+ // RightShiftWithRounding and Clip3.
+ const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset);
+ const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset);
+ const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne);
+ const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne);
+ const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max);
+ if (width != 4) {
+ // Store width=8/16/32/64/128.
+ StoreUnaligned16(dst + offset, result);
+ return;
+ }
+ assert(width == 4);
+ StoreLo8(dst, result);
+ StoreHi8(dst + dest_stride, result);
+}
+
+void AverageBlend10bpp_SSE4_1(const void* prediction_0,
+ const void* prediction_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const __m128i compound_offset =
+ _mm_set1_epi32(kCompoundOffset + kCompoundOffset);
+ const __m128i round_offset =
+ _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1);
+ const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ int y = height;
+
+ if (width == 4) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0,1
+ AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0.
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // row1.
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 16) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0.
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // row1.
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 32) {
+ do {
+ // pred [0 - 15].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [16 - 31].
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ return;
+ }
+ if (width == 64) {
+ do {
+ // pred [0 - 31].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+  // pred [32 - 63].
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ return;
+ }
+ assert(width == 128);
+ do {
+ // pred [0 - 31].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+  // pred [32 - 63].
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+
+ // pred [64 - 95].
+ AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [96 - 127].
+ AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend)
+ dsp->average_blend = AverageBlend10bpp_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {