about summary refs log tree commit diff
path: root/src/dsp/x86/average_blend_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/average_blend_sse4.cc')
-rw-r--r-- src/dsp/x86/average_blend_sse4.cc 84
1 files changed, 49 insertions, 35 deletions
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index 911c5a9..c08b3d6 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -35,24 +35,46 @@ namespace {
constexpr int kInterPostRoundBit = 4;
-inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT dest) {
- const __m128i pred_0 = LoadLo8(prediction_0);
- const __m128i pred_1 = LoadLo8(prediction_1);
- __m128i res = _mm_add_epi16(pred_0, pred_1);
- res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
- Store4(dest, _mm_packus_epi16(res, res));
+inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ Store4(dest, result_pixels);
+ dest += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dest, &result_1, sizeof(result_1));
+ dest += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dest, &result_2, sizeof(result_2));
+ dest += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dest, &result_3, sizeof(result_3));
}
inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT dest) {
- const __m128i pred_0 = LoadAligned16(prediction_0);
- const __m128i pred_1 = LoadAligned16(prediction_1);
- __m128i res = _mm_add_epi16(pred_0, pred_1);
- res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
- StoreLo8(dest, _mm_packus_epi16(res, res));
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ StoreLo8(dest, result_pixels);
+ StoreHi8(dest + dest_stride, result_pixels);
}
inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
@@ -85,35 +107,27 @@ void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
int y = height;
if (width == 4) {
+ const ptrdiff_t dest_stride4 = dest_stride << 2;
+ constexpr ptrdiff_t width4 = 4 << 2;
do {
- // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
- // to load 8 values at a time.
- AverageBlend4Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
-
- AverageBlend4Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
+ AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride4;
+ pred_0 += width4;
+ pred_1 += width4;
- y -= 2;
+ y -= 4;
} while (y != 0);
return;
}
if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ constexpr ptrdiff_t width2 = 8 << 1;
do {
- AverageBlend8Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
-
- AverageBlend8Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
+ AverageBlend8Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
y -= 2;
} while (y != 0);