aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/x86/average_blend_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/average_blend_sse4.cc')
-rw-r--r-- src/dsp/x86/average_blend_sse4.cc 156
1 files changed, 156 insertions, 0 deletions
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
new file mode 100644
index 0000000..8e008d1
--- /dev/null
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -0,0 +1,156 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Base shift amount for rounding the blended result. The row helpers in this
+// file add the two predictions and then shift right (with rounding) by
+// kInterPostRoundBit + 1.
+constexpr int kInterPostRoundBit = 4;
+
+// Blends a single narrow row: the 16-bit values loaded from |prediction_0| and
+// |prediction_1| are added lane by lane, shifted right with rounding by
+// kInterPostRoundBit + 1, packed to bytes with unsigned saturation and handed
+// to Store4() for writing to |dest|.
+inline void AverageBlend4Row(const int16_t* prediction_0,
+                             const int16_t* prediction_1, uint8_t* dest) {
+  const __m128i sum =
+      _mm_add_epi16(LoadLo8(prediction_0), LoadLo8(prediction_1));
+  const __m128i rounded =
+      RightShiftWithRounding_S16(sum, kInterPostRoundBit + 1);
+  // Both halves of the pack hold the same data; Store4() consumes the result.
+  Store4(dest, _mm_packus_epi16(rounded, rounded));
+}
+
+// Blends a single 8-wide row: one 16-byte aligned load from each prediction
+// buffer, lane-wise 16-bit add, rounding right shift by
+// kInterPostRoundBit + 1, then an unsigned-saturating pack to bytes that is
+// written to |dest| through StoreLo8().
+inline void AverageBlend8Row(const int16_t* prediction_0,
+                             const int16_t* prediction_1, uint8_t* dest) {
+  const __m128i sum = _mm_add_epi16(LoadAligned16(prediction_0),
+                                    LoadAligned16(prediction_1));
+  const __m128i rounded =
+      RightShiftWithRounding_S16(sum, kInterPostRoundBit + 1);
+  // Both halves of the pack hold the same data; StoreLo8() consumes the result.
+  StoreLo8(dest, _mm_packus_epi16(rounded, rounded));
+}
+
+// Blends one row in groups of 16 output bytes. The loop body always executes
+// at least once and repeats while the column offset is still below |width|.
+// Each iteration reads two aligned 8-lane vectors from each prediction buffer
+// and issues one unaligned 16-byte store to |dest|.
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+                                 const int16_t* prediction_1, const int width,
+                                 uint8_t* dest) {
+  constexpr int kShift = kInterPostRoundBit + 1;
+  int col = 0;
+  do {
+    const int16_t* const src_0 = prediction_0 + col;
+    const int16_t* const src_1 = prediction_1 + col;
+
+    // First 8 lanes of this group.
+    const __m128i sum_lo =
+        _mm_add_epi16(LoadAligned16(src_0), LoadAligned16(src_1));
+    const __m128i out_lo = RightShiftWithRounding_S16(sum_lo, kShift);
+
+    // Next 8 lanes of this group.
+    const __m128i sum_hi =
+        _mm_add_epi16(LoadAligned16(src_0 + 8), LoadAligned16(src_1 + 8));
+    const __m128i out_hi = RightShiftWithRounding_S16(sum_hi, kShift);
+
+    // Saturate both halves to unsigned bytes and write all 16 at once.
+    StoreUnaligned16(dest + col, _mm_packus_epi16(out_lo, out_hi));
+    col += 16;
+  } while (col < width);
+}
+
+// Averages two 16-bit prediction buffers into the 8-bit destination.
+//
+// |prediction_0| and |prediction_1| are read as packed int16_t rows of |width|
+// values (each row starts |width| elements after the previous one). |dest| is
+// written as uint8_t rows that are |dest_stride| bytes apart. Rows are handled
+// two per loop pass and the loops stop only when the remaining row count hits
+// exactly zero, so |height| is expected to be a positive even number.
+void AverageBlend_SSE4_1(const void* prediction_0, const void* prediction_1,
+                         const int width, const int height, void* const dest,
+                         const ptrdiff_t dest_stride) {
+  const auto* src_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* src_1 = static_cast<const int16_t*>(prediction_1);
+  auto* out = static_cast<uint8_t*>(dest);
+
+  // Every pass through a loop below emits two rows, so advance by two rows.
+  const int src_step = 2 * width;
+  const ptrdiff_t out_step = 2 * dest_stride;
+  int rows_left = height;
+
+  if (width == 4) {
+    do {
+      // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
+      // to load 8 values at a time.
+      AverageBlend4Row(src_0, src_1, out);
+      AverageBlend4Row(src_0 + width, src_1 + width, out + dest_stride);
+      src_0 += src_step;
+      src_1 += src_step;
+      out += out_step;
+      rows_left -= 2;
+    } while (rows_left != 0);
+  } else if (width == 8) {
+    do {
+      AverageBlend8Row(src_0, src_1, out);
+      AverageBlend8Row(src_0 + width, src_1 + width, out + dest_stride);
+      src_0 += src_step;
+      src_1 += src_step;
+      out += out_step;
+      rows_left -= 2;
+    } while (rows_left != 0);
+  } else {
+    // All remaining widths go through the 16-at-a-time row kernel.
+    do {
+      AverageBlendLargeRow(src_0, src_1, width, out);
+      AverageBlendLargeRow(src_0 + width, src_1 + width, width,
+                           out + dest_stride);
+      src_0 += src_step;
+      src_1 += src_step;
+      out += out_step;
+      rows_left -= 2;
+    } while (rows_left != 0);
+  }
+}
+
+// Registers the SSE4.1 average blend implementation in the writable 8bpp dsp
+// table. The assignment is compiled in only when
+// DSP_ENABLED_8BPP_SSE4_1(AverageBlend) evaluates to true.
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  // With NDEBUG defined and the assignment below compiled out, |dsp| would
+  // otherwise be unreferenced and trigger an unused-variable warning.
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+  dsp->average_blend = AverageBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+
+// Externally visible initializer for this file; forwards to the 8bpp setup in
+// the anonymous namespace.
+void AverageBlendInit_SSE4_1() {
+  Init8bpp();
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+// Stub compiled when LIBGAV1_TARGETING_SSE4_1 is not set: keeps the symbol
+// available to callers but installs nothing.
+void AverageBlendInit_SSE4_1() {
+}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1