Diffstat (limited to 'src/dsp/x86')
-rw-r--r--  src/dsp/x86/average_blend_sse4.cc           |  84
-rw-r--r--  src/dsp/x86/common_sse4_test.cc             |   4
-rw-r--r--  src/dsp/x86/convolve_avx2.cc                | 322
-rw-r--r--  src/dsp/x86/convolve_sse4.cc                | 187
-rw-r--r--  src/dsp/x86/convolve_sse4.inc               |  98
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.cc | 152
-rw-r--r--  src/dsp/x86/film_grain_sse4.cc              |  14
-rw-r--r--  src/dsp/x86/intrapred_directional_sse4.cc   | 239
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.cc        |   1
-rw-r--r--  src/dsp/x86/mask_blend_sse4.cc              | 336
-rw-r--r--  src/dsp/x86/obmc_sse4.cc                    | 144
-rw-r--r--  src/dsp/x86/warp_sse4.cc                    |  58
-rw-r--r--  src/dsp/x86/weight_mask_sse4.cc             | 360
13 files changed, 976 insertions(+), 1023 deletions(-)
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index 911c5a9..c08b3d6 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -35,24 +35,46 @@ namespace {
constexpr int kInterPostRoundBit = 4;
-inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT dest) {
- const __m128i pred_0 = LoadLo8(prediction_0);
- const __m128i pred_1 = LoadLo8(prediction_1);
- __m128i res = _mm_add_epi16(pred_0, pred_1);
- res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
- Store4(dest, _mm_packus_epi16(res, res));
+inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ Store4(dest, result_pixels);
+ dest += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dest, &result_1, sizeof(result_1));
+ dest += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dest, &result_2, sizeof(result_2));
+ dest += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dest, &result_3, sizeof(result_3));
}
inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT dest) {
- const __m128i pred_0 = LoadAligned16(prediction_0);
- const __m128i pred_1 = LoadAligned16(prediction_1);
- __m128i res = _mm_add_epi16(pred_0, pred_1);
- res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
- StoreLo8(dest, _mm_packus_epi16(res, res));
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ StoreLo8(dest, result_pixels);
+ StoreHi8(dest + dest_stride, result_pixels);
}
inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
@@ -85,35 +107,27 @@ void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
int y = height;
if (width == 4) {
+ const ptrdiff_t dest_stride4 = dest_stride << 2;
+ constexpr ptrdiff_t width4 = 4 << 2;
do {
- // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
- // to load 8 values at a time.
- AverageBlend4Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
-
- AverageBlend4Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
+ AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride4;
+ pred_0 += width4;
+ pred_1 += width4;
- y -= 2;
+ y -= 4;
} while (y != 0);
return;
}
if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ constexpr ptrdiff_t width2 = 8 << 1;
do {
- AverageBlend8Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
-
- AverageBlend8Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
+ AverageBlend8Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
y -= 2;
} while (y != 0);
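For reference, a minimal scalar sketch of the arithmetic both blend helpers vectorize, assuming the patch's kInterPostRoundBit = 4 (hypothetical standalone code, not part of the change):

#include <cstdint>

// Average two 16-bit predictions and round-shift down to one 8-bit pixel.
// The shift is kInterPostRoundBit + 1 = 5; the clamp mirrors _mm_packus_epi16.
inline uint8_t AverageBlendPixel(int16_t pred_0, int16_t pred_1) {
  const int sum = pred_0 + pred_1;
  const int rounded = (sum + 16) >> 5;  // RightShiftWithRounding(sum, 5)
  if (rounded < 0) return 0;
  if (rounded > 255) return 255;
  return static_cast<uint8_t>(rounded);
}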
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
index 4ea811a..3288cfc 100644
--- a/src/dsp/x86/common_sse4_test.cc
+++ b/src/dsp/x86/common_sse4_test.cc
@@ -31,7 +31,7 @@ namespace {
// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
// negative values.
-TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
+TEST(CommonDspTest, SSE41RightShiftWithRoundingS16) {
for (int bits = 0; bits < 16; ++bits) {
const int bias = (1 << bits) >> 1;
for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
@@ -56,7 +56,7 @@ TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
#else // !LIBGAV1_TARGETING_SSE4_1
-TEST(CommonDspTest, SSE4) {
+TEST(CommonDspTest, SSE41) {
GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
"the tests.";
}
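The renamed tests exercise the rounding identity described in the hunk context above; a scalar sketch, assuming the bias convention from that comment:

#include <cstdint>

// Scalar reference: add half the shift range as bias, then shift.
inline int32_t RightShiftWithRounding(int32_t value, int bits) {
  return (value + ((1 << bits) >> 1)) >> bits;
}

// The packed 16-bit variant forms the biased sum in int16_t, so it matches
// the scalar version only while value + bias fits, i.e. for
// value <= INT16_MAX - ((1 << bits) >> 1); larger inputs saturate first.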
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 4126ca9..6e94347 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -39,17 +39,17 @@ namespace {
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
__m256i sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1
const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3
const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5
sum = _mm256_add_epi16(v_madd_21, v_madd_43);
sum = _mm256_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0
const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2
@@ -58,7 +58,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3
} else {
@@ -70,7 +70,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
return sum;
}
-template <int filter_index>
+template <int num_taps>
__m256i SumHorizontalTaps(const __m256i* const src,
const __m256i* const v_tap) {
__m256i v_src[4];
@@ -78,32 +78,32 @@ __m256i SumHorizontalTaps(const __m256i* const src,
const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
}
- return SumOnePassTaps<filter_index>(v_src, v_tap);
+ return SumOnePassTaps<num_taps>(v_src, v_tap);
}
-template <int filter_index>
+template <int num_taps>
__m256i SimpleHorizontalTaps(const __m256i* const src,
const __m256i* const v_tap) {
- __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -116,17 +116,16 @@ __m256i SimpleHorizontalTaps(const __m256i* const src,
return _mm256_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m256i HorizontalTaps8To16(const __m256i* const src,
const __m256i* const v_tap) {
- const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
// Filter 2xh sizes.
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -145,14 +144,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
do {
if (is_2d) {
const __m128i sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
Store4(&dest16[0], sum);
dest16 += pred_stride;
Store4(&dest16[0], _mm_srli_si128(sum, 8));
dest16 += pred_stride;
} else {
const __m128i sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
Store2(dest8, sum);
dest8 += pred_stride;
Store2(dest8, _mm_srli_si128(sum, 4));
@@ -169,7 +168,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
__m128i sum;
const __m128i input = LoadLo8(&src[2]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 ....
const __m128i v_src_43 =
_mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -194,8 +193,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
}
// Filter widths >= 4.
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -214,11 +212,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long, v_tap);
const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
LoadUnaligned16(&src[x + 24]));
const __m256i result2 =
- HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
if (is_2d) {
StoreAligned32(&dest16[x], result);
StoreAligned32(&dest16[x + 16], result2);
@@ -230,11 +228,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load src used to calculate dest8[7:0] and dest8[23:16].
const __m256i src_long = LoadUnaligned32(&src[x]);
const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
// Load src used to calculate dest8[15:8] and dest8[31:24].
const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
const __m256i result2 =
- SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
// Combine results and store.
StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
}
@@ -252,13 +250,12 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load into 2 128 bit lanes.
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
const __m256i src_long2 =
SetrM128i(LoadUnaligned16(&src[src_stride]),
LoadUnaligned16(&src[8 + src_stride]));
const __m256i result2 =
- HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
if (is_2d) {
StoreAligned32(&dest16[0], result);
StoreAligned32(&dest16[pred_stride], result2);
@@ -270,12 +267,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
LoadUnaligned16(&src[src_stride]));
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
const __m256i src_long2 = SetrM128i(
LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
const __m256i result2 =
- SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
StoreUnaligned16(&dest8[pred_stride],
@@ -292,8 +288,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
if (is_2d) {
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreAligned32(&dest16[0], result);
}
@@ -306,8 +301,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
const __m256i src_long = SetrM128i(this_row, next_row);
if (is_2d || is_compound) {
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
if (is_2d) {
StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
StoreAligned16(&dest16[pred_stride],
@@ -322,8 +316,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(this_row, next_row);
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
}
@@ -337,8 +330,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// filter the remaining row.
if (is_2d) {
const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
}
@@ -351,8 +343,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
const __m256i src_long = SetrM128i(this_row, next_row);
if (is_2d || is_compound) {
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
} else {
@@ -360,8 +351,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(this_row, next_row);
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
Store4(&dest8[0], _mm256_castsi256_si128(result));
Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
}
@@ -375,8 +365,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// filter the remaining row.
if (is_2d) {
const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
}
}
@@ -554,18 +543,15 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
const __m128i v_horizontal_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
- if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
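The merged 4-tap branch relies on the filter indices lying in [0, 5]: only 4 (0b100) and 5 (0b101) have bit 2 set, so the mask test is equivalent to the pair of equality checks it replaces. An illustrative check (not part of the patch):

#include <cassert>

inline void CheckFourTapMask() {
  for (int filter_index = 0; filter_index <= 5; ++filter_index) {
    const bool is_four_tap = (filter_index == 4) || (filter_index == 5);
    assert(((filter_index & 0x4) != 0) == is_four_tap);
  }
}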
@@ -582,28 +568,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -617,7 +600,8 @@ void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(32) uint16_t
@@ -730,61 +714,60 @@ __m256i Compound1DShift(const __m256i sum) {
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index, bool unpack_high = false>
+template <int num_taps, bool unpack_high = false>
__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
__m256i v_src[4];
if (!unpack_high) {
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
}
} else {
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
}
}
- return SumOnePassTaps<filter_index>(v_src, v_tap);
+ return SumOnePassTaps<num_taps>(v_src, v_tap);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -821,9 +804,9 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadUnaligned32(src_x);
src_x += src_stride;
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m256i sums_hi =
- SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
if (is_compound) {
const __m256i results =
Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -861,13 +844,12 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (x < width);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -922,9 +904,9 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row - 1] = _mm256_inserti128_si256(
srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m256i sums_hi =
- SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
if (is_compound) {
const __m256i results =
Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -964,13 +946,12 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (y != 0);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1025,7 +1006,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row - 1] = _mm256_inserti128_si256(
srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m256i results = Compound1DShift(sums);
const __m128i this_dst = _mm256_castsi256_si128(results);
@@ -1062,13 +1043,12 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (y != 0);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1101,7 +1081,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -1137,7 +1117,8 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -1151,43 +1132,43 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
// Use 256 bits for width > 4.
if (width > 4) {
__m256i taps_256[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps_256);
if (width == 8) {
FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
@@ -1199,67 +1180,38 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else {
- SetupTaps<4>(&v_filter, taps_256);
- if (width == 8) {
- FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- } else if (width == 16) {
- FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- } else {
- FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- }
}
} else { // width <= 8
// Use 128 bit code.
__m128i taps[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
- taps);
- } else {
- FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
- taps);
- }
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_filter, taps);
- if (width == 2) {
- FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
}
- } else {
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
}
}
}
@@ -1272,7 +1224,8 @@ void ConvolveCompoundVertical_AVX2(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -1286,43 +1239,43 @@ void ConvolveCompoundVertical_AVX2(
// Use 256 bits for width > 4.
if (width > 4) {
__m256i taps_256[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<0, /*is_compound=*/true>(
+ FilterVertical8xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<0, /*is_compound=*/true>(
+ FilterVertical16xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<0, /*is_compound=*/true>(
+ FilterVertical32xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<2, /*is_compound=*/true>(
+ FilterVertical8xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<2, /*is_compound=*/true>(
+ FilterVertical16xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<2, /*is_compound=*/true>(
+ FilterVertical32xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<3, /*is_compound=*/true>(
+ FilterVertical8xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<3, /*is_compound=*/true>(
+ FilterVertical16xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<3, /*is_compound=*/true>(
+ FilterVertical32xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps_256);
if (width == 8) {
FilterVertical8xH<4, /*is_compound=*/true>(
@@ -1334,43 +1287,27 @@ void ConvolveCompoundVertical_AVX2(
FilterVertical32xH<4, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else {
- SetupTaps<4>(&v_filter, taps_256);
- if (width == 8) {
- FilterVertical8xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- } else if (width == 16) {
- FilterVertical16xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- } else {
- FilterVertical32xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- }
}
} else { // width <= 4
// Use 128 bit code.
__m128i taps[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
- FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 2) { // 8 tap.
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
- FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 3) { // 2 tap.
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
- FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_filter, taps);
- FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps);
- FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
}
}
}
@@ -1430,7 +1367,8 @@ void ConvolveCompound2D_AVX2(
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(32) uint16_t
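On the "pre-shifted by 1" note in SumOnePassTaps: AV1 interpolation taps sum to 1 << kFilterBits = 128, so halving them keeps the worst-case accumulation inside int16_t, and the lost bit is recovered by shifting one bit less later (kInterRoundBitsHorizontal - 1). A rough compile-time check under those simplifying assumptions (it ignores the sign structure of individual taps):

#include <cstdint>

constexpr int kFilterBits = 7;                          // taps sum to 128.
constexpr int kMaxPixel = 255;                          // 8-bit source.
constexpr int kHalvedTapSum = (1 << kFilterBits) >> 1;  // 64 after pre-shift.
static_assert(kMaxPixel * kHalvedTapSum <= INT16_MAX,
              "halved taps keep the sum within int16_t");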
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index f7e5a71..f427c4c 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -36,7 +36,7 @@ namespace {
#include "src/dsp/x86/convolve_sse4.inc"
-template <int filter_index>
+template <int num_taps>
__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
__m128i v_src[4];
@@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
}
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
return sum;
}
-template <int filter_index>
+template <int num_taps>
__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
return _mm_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
- const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -108,16 +107,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
int x = 0;
do {
if (is_2d || is_compound) {
- const __m128i v_sum =
- HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
if (is_2d) {
StoreAligned16(&dest16[x], v_sum);
} else {
StoreUnaligned16(&dest16[x], v_sum);
}
} else {
- const __m128i result =
- SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+ const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
StoreLo8(&dest8[x], result);
}
x += 8;
@@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
int y = height;
do {
if (is_2d || is_compound) {
- const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
StoreLo8(dest16, v_sum);
} else {
- const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+ const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
Store4(&dest8[0], result);
}
src += src_stride;
@@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
do {
if (is_2d) {
const __m128i sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
Store4(&dest16[0], sum);
dest16 += pred_stride;
Store4(&dest16[0], _mm_srli_si128(sum, 8));
dest16 += pred_stride;
} else {
const __m128i sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
Store2(dest8, sum);
dest8 += pred_stride;
Store2(dest8, _mm_srli_si128(sum, 4));
@@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
__m128i sum;
const __m128i input = LoadLo8(&src[2]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 ....
const __m128i v_src_43 =
_mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(16) uint16_t
@@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
}
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16_x, results);
@@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1(
const __m128i v_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else {
- // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
- // See convolve_neon.cc
- SetupTaps<4>(&v_filter, taps);
-
- if (width == 2) {
- FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
- } else if (width == 4) {
- FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
- } else {
- FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
- taps);
- }
}
}
-void ConvolveCompoundCopy_SSE4(
+void ConvolveCompoundCopy_SSE4_1(
const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
@@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4(
_mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
const __m128i v_dest_hi =
_mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
- // TODO(slavarnway): Investigate using aligned stores.
StoreUnaligned16(&dest[x], v_dest_lo);
StoreUnaligned16(&dest[x + 8], v_dest_hi);
x += 16;
@@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1(
const __m128i v_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap
SetupTaps<4>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else {
- SetupTaps<4>(&v_filter, taps);
-
- if (width == 4) {
- FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
- } else {
- FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
- width, height, taps);
- }
}
}
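The substance of these dispatch changes is that vertical_taps now comes from the two-argument GetNumTapsInFilter added in convolve_sse4.inc (see below): for filter_index == 1, most |filter_id| values have zero-valued outer taps and can take the cheaper 4-tap kernels. A sketch mirroring that rule (names follow the patch; not part of this file):

// Only ids 1, 7, 8, 9 and 15 of filter_index == 1 need all 6 taps; the
// remaining ids reduce to 4-tap filters.
inline int VerticalTapsForIndex1(const int filter_id) {
  const bool needs_six = filter_id == 1 || filter_id == 7 || filter_id == 8 ||
                         filter_id == 9 || filter_id == 15;
  return needs_six ? 6 : 4;
}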
@@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1(
// Similarly for height.
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
const auto* const src = static_cast<const uint8_t*>(reference) -
@@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
source);
StoreLo8(intermediate, RightShiftWithRounding_S16(
- SumOnePassTaps<filter_index>(source, taps),
+ SumOnePassTaps<num_taps>(source, taps),
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
@@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
// Shift by one less because the taps are halved.
- StoreAligned16(
- intermediate_x,
- RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
- kInterRoundBitsHorizontal - 1));
+ StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
+ SumOnePassTaps<num_taps>(source, taps),
+ kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
} while (--y != 0);
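In the scale hunks below, the one-argument calls are qualified as dsp::GetNumTapsInFilter, presumably because the new file-local GetNumTapsInFilter(int, int) hides the namespace-level function: C++ name hiding applies even when arities differ. A minimal illustration with made-up namespaces:

namespace outer {
inline int F(int x) { return x; }
namespace inner {
inline int F(int x, int y) { return x + y; }
// An unqualified F(x) here would not find outer::F(int); the local
// F(int, int) hides it, so the one-argument overload must be reached
// by qualification.
inline int Call(int x) { return outer::F(x); }
}  // namespace inner
}  // namespace outer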
@@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
alignas(16) int16_t
intermediate_result[kIntermediateAllocWidth *
(2 * kIntermediateAllocWidth + kSubPixelTaps)];
- const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
kScaleSubPixelBits) +
@@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
// inputs in each iteration on large blocks. When step_x is large, we need a
// second register and alignr in order to gather all filter inputs.
// |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
- const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
const int kernel_start_ceiling = 16 - num_horiz_taps;
// This truncated quotient |grade_x_threshold| selects |step_x| such that:
// (step_x * 7) >> kScaleSubPixelBits < single load limit
@@ -1891,7 +1860,7 @@ void Init8bpp() {
dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
- dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
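The renamed ConvolveCompoundCopy_SSE4_1 only widens and pre-scales: each 8-bit source pixel is shifted left into the 16-bit compound domain. A scalar model of the stores in its hunk above, with the shift taken from the patch's kRoundBitsVertical context:

#include <cstdint>

// Widen u8 -> u16 and pre-scale so later compound blending can
// round-shift back down; mirrors the v_dest_lo/v_dest_hi stores.
inline void CompoundCopyRow(const uint8_t* src, uint16_t* dest,
                            const int width, const int round_bits) {
  for (int x = 0; x < width; ++x) {
    dest[x] = static_cast<uint16_t>(src[x] << round_bits);
  }
}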
diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc
index 550d6a4..5548c5b 100644
--- a/src/dsp/x86/convolve_sse4.inc
+++ b/src/dsp/x86/convolve_sse4.inc
@@ -18,20 +18,63 @@
#include "src/dsp/convolve.inc"
+// This version checks for the special cases when filter_index == 1.
+int GetNumTapsInFilter(const int filter_index, const int filter_id) {
+ if (filter_index == 0) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 1) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
+ (filter_id == 8) | (filter_id == 9)) != 0) {
+ return 6;
+ }
+ // When |filter_index| == 1, the |filter_id| values not listed above map to
+ // 4 tap filters.
+ return 4;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
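Because the id-dependent mapping is easy to get wrong, here is a standalone mirror of it (using || instead of the branch-free |), convenient as a quick unit check. The function name is local to this sketch, not part of the patch:

#include <cassert>

int NumTapsForFilter(int filter_index, int filter_id) {
  if (filter_index == 0) return 6;
  if (filter_index == 1) {
    return (filter_id == 1 || filter_id == 7 || filter_id == 8 ||
            filter_id == 9 || filter_id == 15)
               ? 6
               : 4;
  }
  if (filter_index == 2) return 8;  // kInterpolationFilterEightTapSharp
  if (filter_index == 3) return 2;  // kInterpolationFilterBilinear
  return 4;                         // small-size 4 tap replacements
}

int main() {
  assert(NumTapsForFilter(1, 7) == 6);   // id on the 6-tap list
  assert(NumTapsForFilter(1, 3) == 4);   // other ids collapse to 4 taps
  assert(NumTapsForFilter(3, 12) == 2);  // bilinear
  return 0;
}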
+
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
__m128i sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
sum = _mm_add_epi16(v_madd_21, v_madd_43);
sum = _mm_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
@@ -40,7 +83,7 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
} else {
@@ -52,13 +95,13 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
return sum;
}
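For clarity, this is what SumOnePassTaps<num_taps> produces per 16-bit lane, written as a scalar sketch (name local to the sketch). The pre-shift of the taps is what keeps the accumulated sum within int16_t:

#include <cstdint>

// Illustrative scalar equivalent of SumOnePassTaps for one lane. |taps|
// holds the AV1 coefficients pre-shifted right by 1.
int16_t OnePassTapSum(const uint8_t* src, const int8_t* taps, int num_taps) {
  int32_t sum = 0;
  for (int k = 0; k < num_taps; ++k) sum += src[k] * taps[k];
  return static_cast<int16_t>(sum);
}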
-template <int filter_index>
+template <int num_taps>
__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
// 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
const __m128i v_src_43 = _mm_shuffle_epi8(
v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
@@ -79,10 +122,10 @@ __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return v_sum_5432;
}
-template <int filter_index>
+template <int num_taps>
__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -95,11 +138,10 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return _mm_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
- const __m128i sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
@@ -411,36 +453,34 @@ __m128i Compound1DShift(const __m128i sum) {
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index>
+template <int num_taps>
__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
__m128i v_src[4];
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
}
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
return sum;
}
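The unpacklo_epi8 interleaving pairs two vertically adjacent rows per register so that each _mm_maddubs_epi16 lane multiplies both rows by the matching pair of halved taps. An illustrative scalar model of one output lane (names local to this sketch):

#include <cstdint>

// num_taps is always even here (2, 4, 6, or 8).
int16_t VerticalLane(const uint8_t* const* rows, const int8_t* taps,
                     int num_taps, int x) {
  int32_t sum = 0;
  for (int k = 0; k < num_taps; k += 2) {
    // One _mm_maddubs_epi16 lane: two interleaved rows times a tap pair.
    sum += rows[k][x] * taps[k] + rows[k + 1][x] * taps[k + 1];
  }
  return static_cast<int16_t>(sum);
}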
-// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
-// 2D version.
-template <int num_taps, int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const __m128i* const v_tap) {
@@ -468,7 +508,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 10 11 12 13 20 21 22 23
srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -515,7 +555,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 30 31 32 33 40 41 42 43
srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -574,7 +614,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 50 51 52 53 60 61 62 63
srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -645,7 +685,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 70 71 72 73 80 81 82 83
srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -672,7 +712,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
}
}
-template <int num_taps, int filter_index, bool negative_outside_taps = false>
+template <int num_taps, bool negative_outside_taps = false>
void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const __m128i* const v_tap) {
@@ -705,7 +745,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
// 10 11 20 21 30 31 40 41
srcs[1] = _mm_srli_si128(srcs_0_2, 2);
// This uses srcs[0]..srcs[1].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -760,7 +800,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[3] = _mm_srli_si128(srcs_0_4, 6);
// This uses srcs[0]..srcs[3].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -829,7 +869,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[5] = _mm_srli_si128(srcs_4_8, 2);
// This uses srcs[0]..srcs[5].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -909,7 +949,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[7] = _mm_srli_si128(srcs_4_8, 6);
// This uses srcs[0]..srcs[7].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
index c813df4..8c32117 100644
--- a/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -34,54 +34,50 @@ namespace low_bitdepth {
namespace {
constexpr int kInterPostRoundBit = 4;
+constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1);
inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
const __m128i& pred1,
- const __m128i& weights) {
- // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
- const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1);
- const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights);
- const __m128i result_lo =
- RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4);
-
- const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1);
- const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights);
- const __m128i result_hi =
- RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4);
-
- return _mm_packs_epi32(result_lo, result_hi);
+ const __m128i& weight) {
+ // Given: p0 and p1 in the range [-5132, 9212], with weights w0 + w1 == 16.
+ // Output: (p0 * w0 + p1 * w1 + 128) >> 8, where 128 is the rounding bit
+ // and 8 == kInterPostRoundBit + 4.
+ // The formula is rearranged to avoid widening to 32 bits:
+ // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+ //                   = (p0 - p1) * w0 + 16 * p1
+ // The maximum absolute value of p0 - p1 is 9212 + 5132 = 0x3808.
+ const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1);
+ // (((p0 - p1) << 1) * (w0 << 11)) >> 16 == ((p0 - p1) * w0) >> 4
+ const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight);
+ // ((p0 - p1) * w0 >> 4) + p1
+ const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1);
+ // (x << 11) >> 15 == x >> 4
+ const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust);
+ // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+ return _mm_mulhrs_epi16(upscaled_average, right_shift_prep);
}
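The rearrangement maps onto the intrinsics as follows. A scalar walk-through of one lane, assuming w0 < 16 so that (w0 << 11) fits in int16_t (the function name is local to this sketch):

#include <cstdint>

int16_t WeightedAverageLane(int16_t p0, int16_t p1, int w0) {
  // _mm_slli_epi16(sub, 1): double the difference so mulhi lands on >> 4.
  const int32_t diff = (p0 - p1) * 2;
  // _mm_mulhi_epi16 with (w0 << 11): the high 16 bits of the product, i.e.
  // ((p0 - p1) * 2 * (w0 << 11)) >> 16 == ((p0 - p1) * w0) >> 4.
  const int32_t weighted_diff = (diff * (w0 << 11)) >> 16;
  // (((p0 - p1) * w0) >> 4) + p1 == (p0 * w0 + (16 - w0) * p1) >> 4.
  const int32_t upscaled = weighted_diff + p1;
  // _mm_mulhrs_epi16 with (1 << 11): (x * (1 << 12) + (1 << 15)) >> 16,
  // i.e. (x + 8) >> 4, supplying the final shift and the rounding bit.
  return static_cast<int16_t>((upscaled * (1 << 12) + (1 << 15)) >> 16);
}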
template <int height>
inline void DistanceWeightedBlend4xH_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
- const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
for (int y = 0; y < height; y += 4) {
- // TODO(b/150326556): Use larger loads.
- const __m128i src_00 = LoadLo8(pred_0);
- const __m128i src_10 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- __m128i src_0 = LoadHi8(src_00, pred_0);
- __m128i src_1 = LoadHi8(src_10, pred_1);
- pred_0 += 4;
- pred_1 += 4;
- const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights);
-
- const __m128i src_01 = LoadLo8(pred_0);
- const __m128i src_11 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- src_0 = LoadHi8(src_01, pred_0);
- src_1 = LoadHi8(src_11, pred_1);
- pred_0 += 4;
- pred_1 += 4;
- const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights);
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
const __m128i result_pixels = _mm_packus_epi16(res0, res1);
Store4(dst, result_pixels);
@@ -101,11 +97,11 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
template <int height>
inline void DistanceWeightedBlend8xH_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
- const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
for (int y = 0; y < height; y += 2) {
const __m128i src_00 = LoadAligned16(pred_0);
@@ -130,11 +126,12 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
inline void DistanceWeightedBlendLarge_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, const int width, const int height,
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
int y = height;
do {
@@ -162,23 +159,24 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
const void* LIBGAV1_RESTRICT prediction_1,
const uint8_t weight_0,
- const uint8_t weight_1, const int width,
+ const uint8_t /*weight_1*/, const int width,
const int height,
void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dest_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ const uint8_t weight = weight_0;
if (width == 4) {
if (height == 4) {
- DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+ dest_stride);
} else if (height == 8) {
- DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+ dest_stride);
} else {
assert(height == 16);
- DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+ dest_stride);
}
return;
}
@@ -186,28 +184,28 @@ void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
if (width == 8) {
switch (height) {
case 4:
- DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
case 8:
- DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
case 16:
- DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
default:
assert(height == 32);
- DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
}
}
- DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
- height, dest, dest_stride);
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest,
+ dest_stride);
}
void Init8bpp() {
@@ -273,27 +271,19 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
int y = height;
do {
- const __m128i src_00 = LoadLo8(pred_0);
- const __m128i src_10 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- __m128i src_0 = LoadHi8(src_00, pred_0);
- __m128i src_1 = LoadHi8(src_10, pred_1);
- pred_0 += 4;
- pred_1 += 4;
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
const __m128i res0 =
- ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
-
- const __m128i src_01 = LoadLo8(pred_0);
- const __m128i src_11 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- src_0 = LoadHi8(src_01, pred_0);
- src_1 = LoadHi8(src_11, pred_1);
- pred_0 += 4;
- pred_1 += 4;
+ ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
const __m128i res1 =
- ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+ ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
StoreLo8(dst, res0);
dst += dest_stride;
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc
index 9ece947..59d18a6 100644
--- a/src/dsp/x86/film_grain_sse4.cc
+++ b/src/dsp/x86/film_grain_sse4.cc
@@ -23,14 +23,15 @@
#include <cstdint>
#include <cstring>
-#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/film_grain_common.h"
#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
namespace libgav1 {
namespace dsp {
@@ -165,7 +166,7 @@ void BlendNoiseWithImageLuma_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_width; x += 8) {
+ for (; x + 8 <= safe_width; x += 8) {
const __m128i orig = LoadSource(&in_y_row[x]);
const __m128i scaling =
GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
@@ -181,6 +182,7 @@ void BlendNoiseWithImageLuma_SSE4_1(
// Prevent arbitrary indices from entering GetScalingFactors.
memset(luma_buffer, 0, sizeof(luma_buffer));
const int valid_range = width - x;
+ assert(valid_range < 8);
memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
luma_buffer[valid_range] = in_y_row[width - 1];
const __m128i orig = LoadSource(&in_y_row[x]);
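A small standalone illustration of the tightened bound, assuming a safe_width that is not a multiple of 8: the old condition allowed a vector iteration that read past safe_width, while the new one leaves the remainder to the zero-padded tail path above:

#include <cassert>

int main() {
  // With safe_width == 12, the old bound (x < safe_width) would run the
  // vector loop at x = 8 and touch pixels [8, 15], four past safe_width.
  // The tightened bound stops after x = 0 and leaves [8, 11] to the tail.
  const int safe_width = 12;
  int last_vector_x = -1;
  for (int x = 0; x + 8 <= safe_width; x += 8) last_vector_x = x;
  assert(last_vector_x == 0);
  return 0;
}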
@@ -239,7 +241,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_chroma_width; x += 8) {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const __m128i average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
@@ -252,8 +254,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
}
- // This section only runs if width % (8 << sub_x) != 0. It should never run
- // on 720p and above.
if (x < chroma_width) {
// Prevent huge indices from entering GetScalingFactors due to
// uninitialized values. This is not a problem in 8bpp because the table
@@ -365,7 +365,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_chroma_width; x += 8) {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const __m128i average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc
index e642aee..bc61745 100644
--- a/src/dsp/x86/intrapred_directional_sse4.cc
+++ b/src/dsp/x86/intrapred_directional_sse4.cc
@@ -624,14 +624,6 @@ inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
}
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
template <bool upsampled>
inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
@@ -729,6 +721,103 @@ inline void DirectionalZone1Blend_8xH(
}
}
+template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base,
+ const __m128i& left_y) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Loop increments for moving by 8x8 blocks. This function also handles
+ // blocks of height 4, which are computed in a single pass, so in that
+ // case these variables go unused.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+
+ // Cover 8x4 case.
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ if (max_top_only_y == height) return;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // For this set of columns, all rows from |min_left_only_y| down need only
+ // |left_column| to compute.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ // Pick up from the last y-value. The else branch uses the roughly 10%
+ // slower but overread-safe method for left prediction.
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ } else {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift),
+ base_left_y, -ystep);
+ }
+}
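A sketch of the three y-stage boundaries this function computes for one 8-wide strip at column x (names local to this sketch, not the library's). Rows [0, top_only_end) read only top_row; rows [top_only_end, left_only_begin) blend top and left; rows from left_only_begin on read only left_column:

#include <algorithm>

void Zone2StageBounds(int x, int xstep, int height, int min_height,
                      int* top_only_end, int* left_only_begin) {
  // Round down to a multiple of min_height (8, or 4 for 8x4 blocks).
  *top_only_end =
      std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
  // Align(v, 8) in the source rounds up to a multiple of 8.
  *left_only_begin =
      (std::min(((x + 8) << 6) / xstep, height) + 7) & ~7;
}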
+
// 7.11.2.4 (8) 90 < angle < 180
// The strategy for this function is to know how many blocks can be processed
// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
@@ -742,29 +831,11 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
const int width, const int height,
const int xstep, const int ystep) {
auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride8 = stride << 3;
- const __m128i dest_index_x =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute. This assumes minimum |xstep| is 3.
+ // All columns from |min_top_only_x| to the right need only |top_row|
+ // to compute. This assumes the minimum |xstep| is 3.
const int min_top_only_x = std::min((height * xstep) >> 6, width);
- // For steep angles, the source pixels from left_column may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
-
- const int xstep8 = xstep << 3;
- const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
// Accumulate xstep across 8 rows.
const __m128i xstep_dup = _mm_set1_epi16(-xstep);
const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
@@ -787,105 +858,39 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
// offset. Following values need the full ystep as a relative offset.
const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
__m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
left_y = _mm_add_epi16(ystep_init, left_y);
+ // Analysis shows that, for most angles (ystep < 132), all segments that
+ // use both top_row and left_column can be computed from left_column with
+ // byte shuffles of a single vector. For steeper angles, the shuffle is
+ // also fully reliable once x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
int x = 0;
- // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
+ for (int left_offset = -left_base_increment; x < min_shuffle_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Watch left_y because it can still get big.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false, upsampled_left, upsampled_top>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_for_shift, xstep_bounds_base, left_y);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x;
x += 8,
xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
// Watch left_y because it can still get big.
left_y = _mm_add_epi16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- DirectionalZone1_4xH(dst_x + 4, stride,
- top_row + ((x + 4) << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Pick up from the last y-value, using the 10% slower but secure method for
- // left prediction.
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
- }
+ DirectionalZone2_8xH<true, upsampled_left, upsampled_top>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_for_shift, xstep_bounds_base, left_y);
}
for (; x < width; x += 4) {
DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
@@ -952,8 +957,8 @@ inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
left_offset -= left_base_increment4) {
uint8_t* dst_x = dst + x;
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
max_top_only_y, -xstep, upsampled_top);
int y = max_top_only_y;
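The replaced mask deserves a second look: 0xFFFFFFF4 clears bit 3 in addition to bits 0 and 1, so it was not a round-down mask at all. A quick standalone check (assuming the usual 32-bit int):

#include <cassert>

int main() {
  // 12 is already a multiple of 4, but the old mask zeroed bit 3.
  assert((12 & 0xFFFFFFF4) == 4);   // old mask: wrong result
  // ~3 rounds down to the nearest multiple of 4, as the new comment states.
  assert((12 & ~3) == 12);
  assert((13 & ~3) == 12);
  return 0;
}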
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
index 3363f0e..b4df072 100644
--- a/src/dsp/x86/loop_restoration_sse4.cc
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -2088,6 +2088,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
__m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ ma5[1] = _mm_setzero_si128(); // Quiets -Wmaybe-uninitialized with gcc.
s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
sq[0][0] = SquareLo8(s[0][0]);
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
index a18444b..833814c 100644
--- a/src/dsp/x86/mask_blend_sse4.cc
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -30,35 +30,81 @@
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
namespace {
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_val_0 = LoadUnaligned16(mask);
+ const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRounding_U16(mask_0, 2);
+ }
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRounding_U16(subsampled_mask, 1);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
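A scalar reference for one lane of the 4:2:0 path of GetMask8 above: the rounded average of a 2x2 block of mask bytes. Mask values are at most 64, so the saturating byte add in the vector path never saturates (name local to this sketch):

#include <cstddef>
#include <cstdint>

uint16_t SubsampledMaskLane(const uint8_t* mask, ptrdiff_t stride, int x) {
  const int sum = mask[2 * x] + mask[2 * x + 1] +
                  mask[2 * x + stride] + mask[2 * x + stride + 1];
  return static_cast<uint16_t>((sum + 2) >> 2);  // RightShiftWithRounding
}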
+
+// Imitate behavior of ARM vtrn1q_u64.
+inline __m128i Transpose1_U64(const __m128i a, const __m128i b) {
+ return _mm_castps_si128(
+ _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
+// Imitate behavior of ARM vtrn2q_u64.
+inline __m128i Transpose2_U64(const __m128i a, const __m128i b) {
+ return _mm_castps_si128(
+ _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
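A small demonstration of the two helpers, treating each register as a pair of 64-bit halves. Note that Transpose2_U64(a, b) yields [b.hi | a.hi], i.e. vtrn2q_u64 with swapped operands, which the caller accounts for (sketch only, not part of the patch):

#include <smmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const __m128i a = _mm_set_epi64x(/*hi=*/0x11, /*lo=*/0x10);
  const __m128i b = _mm_set_epi64x(/*hi=*/0x21, /*lo=*/0x20);
  // Transpose1_U64(a, b) == [a.lo | b.lo], like vtrn1q_u64(a, b).
  const __m128i lo = _mm_castps_si128(
      _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  // Transpose2_U64(a, b) == [b.hi | a.hi].
  const __m128i hi = _mm_castps_si128(
      _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  printf("%llx %llx\n", (unsigned long long)_mm_extract_epi64(lo, 0),
         (unsigned long long)_mm_extract_epi64(lo, 1));  // prints: 10 20
  printf("%llx %llx\n", (unsigned long long)_mm_extract_epi64(hi, 0),
         (unsigned long long)_mm_extract_epi64(hi, 1));  // prints: 21 11
  return 0;
}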
+
// Width can only be 4 when it is subsampled from a block of width 8, hence
// subsampling_x is always 1 when this function is called.
template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+inline __m128i GetMask4x2(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i mask_val_01 = LoadUnaligned16(mask);
+ // Stride is fixed because this is the smallest block size.
+ const __m128i mask_val_23 = LoadUnaligned16(mask + 16);
+ // Transpose rows to add row 0 to row 1, and row 2 to row 3.
+ const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23);
+ const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13);
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRounding_U16(mask_0, 2);
+ }
+ return GetMask8<subsampling_x, 0>(mask, 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask4x2(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
if (subsampling_x == 1) {
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
- const __m128i mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- if (subsampling_y == 1) {
- const __m128i next_mask_val_0 =
- _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
- const __m128i next_mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
- subsampled_mask = _mm_add_epi16(
- subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
- }
- return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ return GetMask4x2<subsampling_x, subsampling_y>(mask);
}
+ // When using intra or difference weighted masks, the function doesn't use
+ // subsampling, so |mask_stride| may be 4 or 8.
+ assert(subsampling_y == 0 && subsampling_x == 0);
const __m128i mask_val_0 = Load4(mask);
const __m128i mask_val_1 = Load4(mask + mask_stride);
return _mm_cvtepu8_epi16(
_mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
}
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
// 16-bit is also the lowest packing for hadd, but without subsampling there is
// an unfortunate conversion required.
@@ -87,38 +133,6 @@ inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask,
return _mm_cvtepu8_epi16(mask_val);
}
-// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
-// when is_inter_intra is true, the prediction values are brought to 8-bit
-// packing as well.
-template <int subsampling_x, int subsampling_y>
-inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t stride) {
- if (subsampling_x == 1) {
- const __m128i row_vals = LoadUnaligned16(mask);
-
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
- const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
-
- if (subsampling_y == 1) {
- const __m128i next_row_vals = LoadUnaligned16(mask + stride);
- const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
- const __m128i next_mask_val_1 =
- _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
- subsampled_mask = _mm_add_epi16(
- subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
- }
- const __m128i ret =
- RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
- return _mm_packus_epi16(ret, ret);
- }
- assert(subsampling_y == 0 && subsampling_x == 0);
- // Unfortunately there is no shift operation for 8-bit packing, or else we
- // could return everything with 8-bit packing.
- const __m128i mask_val = LoadLo8(mask);
- return mask_val;
-}
-
inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
const int16_t* LIBGAV1_RESTRICT const pred_1,
const __m128i pred_mask_0,
@@ -149,15 +163,14 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1,
- const uint8_t* LIBGAV1_RESTRICT mask,
- const ptrdiff_t mask_stride,
- uint8_t* LIBGAV1_RESTRICT dst,
- const ptrdiff_t dst_stride) {
+inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
const __m128i mask_inverter = _mm_set1_epi16(64);
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -166,30 +179,30 @@ inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1,
- const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
- const ptrdiff_t mask_stride, const int height,
- uint8_t* LIBGAV1_RESTRICT dst,
- const ptrdiff_t dst_stride) {
+inline void MaskBlending4xH_SSE4_1(
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+ assert(subsampling_x == 1);
const uint8_t* mask = mask_ptr;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
if (height == 4) {
- MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ MaskBlending4x4_SSE4_1<subsampling_x, subsampling_y>(pred_0, pred_1, mask,
+ dst, dst_stride);
return;
}
const __m128i mask_inverter = _mm_set1_epi16(64);
int y = 0;
do {
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
@@ -199,7 +212,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -208,7 +221,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -217,7 +230,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -230,21 +243,21 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- const ptrdiff_t /*prediction_stride_1*/,
- const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
- const ptrdiff_t mask_stride, const int width,
- const int height, void* LIBGAV1_RESTRICT dest,
- const ptrdiff_t dst_stride) {
+inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dst_stride) {
auto* dst = static_cast<uint8_t*>(dest);
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
const ptrdiff_t pred_stride_0 = width;
const ptrdiff_t pred_stride_1 = width;
if (width == 4) {
- MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ MaskBlending4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, height, dst, dst_stride);
return;
}
const uint8_t* mask = mask_ptr;
@@ -293,7 +306,6 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(
const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
const __m128i pred_val_0 = LoadLo8(pred_0);
- // TODO(b/150326556): One load.
__m128i pred_val_1 = Load4(pred_1);
pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
pred_val_1);
@@ -309,16 +321,16 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4x4_SSE4(
+inline void InterIntraMaskBlending8bpp4x4_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
const ptrdiff_t mask_stride) {
const __m128i mask_inverter = _mm_set1_epi8(64);
const __m128i pred_mask_u16_first =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
mask += mask_stride << (1 + subsampling_y);
const __m128i pred_mask_u16_second =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
mask += mask_stride << (1 + subsampling_y);
__m128i pred_mask_1 =
_mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
@@ -335,26 +347,26 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4(
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4xH_SSE4(
+inline void InterIntraMaskBlending8bpp4xH_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
const ptrdiff_t pred_stride_1,
const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
const int height) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
return;
}
int y = 0;
do {
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
pred_0 += 4 << 2;
pred_1 += pred_stride_1 << 2;
mask += mask_stride << (2 + subsampling_y);
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
pred_0 += 4 << 2;
pred_1 += pred_stride_1 << 2;
@@ -363,14 +375,31 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4(
} while (y < height);
}
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i ret = GetMask8<subsampling_x, subsampling_y>(mask, stride);
+ return _mm_packus_epi16(ret, ret);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ // Unfortunately there is no shift operation for 8-bit packing, or else we
+ // could return everything with 8-bit packing.
+ const __m128i mask_val = LoadLo8(mask);
+ return mask_val;
+}
+
template <int subsampling_x, int subsampling_y>
-void InterIntraMaskBlend8bpp_SSE4(
+void InterIntraMaskBlend8bpp_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT prediction_0,
uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
const int width, const int height) {
if (width == 4) {
- InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
height);
return;
@@ -382,7 +411,7 @@ void InterIntraMaskBlend8bpp_SSE4(
int x = 0;
do {
const __m128i pred_mask_1 =
- GetInterIntraMask8<subsampling_x, subsampling_y>(
+ GetInterIntraMask8bpp8<subsampling_x, subsampling_y>(
mask + (x << subsampling_x), mask_stride);
// 64 - mask
const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
@@ -411,24 +440,24 @@ void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
- dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
+ dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
- dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
- dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
+ dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>;
#endif
// The is_inter_intra index of mask_blend[][] is replaced by
// inter_intra_mask_blend_8bpp[] in 8-bit.
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
- dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
- dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
- dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>;
#endif
}
@@ -443,14 +472,6 @@ constexpr int kMax10bppSample = (1 << 10) - 1;
constexpr int kMaskInverse = 64;
constexpr int kRoundBitsMaskBlend = 4;
-inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits,
- const __m128i zero) {
- // Shift out all but the last bit.
- const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
- // Avg with zero will shift by 1 and round.
- return _mm_avg_epu16(v_tmp_d, zero);
-}
-
inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
const __m128i shift) {
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
@@ -458,53 +479,31 @@ inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
}
template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
- const __m128i zero) {
- if (subsampling_x == 1) {
- if (subsampling_y == 0) {
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
- const __m128i mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
- }
- const __m128i one = _mm_set1_epi8(1);
- const __m128i mask_val_0 =
- LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
- const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
- mask + (mask_stride << 1) + mask_stride);
- const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
- const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+inline __m128i GetMask4x2(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i mask_row_01 = LoadUnaligned16(mask);
+ const __m128i mask_row_23 = LoadUnaligned16(mask + 16);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+ const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23);
+ const __m128i mask_val_3 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8));
+ const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2);
+ const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3);
+ const __m128i subsampled_mask =
+ _mm_add_epi16(subsampled_mask_02, subsampled_mask_13);
+ return RightShiftWithRounding_U16(subsampled_mask, 2);
}
- assert(subsampling_y == 0 && subsampling_x == 0);
- const __m128i mask_val_0 = Load4(mask);
- const __m128i mask_val_1 = Load4(mask + mask_stride);
- return _mm_cvtepu8_epi16(
- _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
-}
-
-template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
- const __m128i zero) {
if (subsampling_x == 1) {
- if (subsampling_y == 0) {
- const __m128i row_vals = LoadUnaligned16(mask);
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
- const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
- }
- const __m128i one = _mm_set1_epi8(1);
- const __m128i mask_val_0 = LoadUnaligned16(mask);
- const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
- const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
- const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
- return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+ const __m128i mask_row_01 = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+ const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRounding_U16(subsampled_mask, 1);
}
- assert(subsampling_y == 0 && subsampling_x == 0);
- const __m128i mask_val = LoadLo8(mask);
- return _mm_cvtepu8_epi16(mask_val);
+ return _mm_cvtepu8_epi16(LoadLo8(mask));
}
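A scalar reference for the 4:2:0 branch of this 10bpp GetMask4x2: with subsampling, mask rows are 8 bytes wide, so output row r, column c is the rounded average of the 2x2 block at rows (2r, 2r+1) and columns (2c, 2c+1). The name is local to this sketch:

#include <cstdint>

uint16_t Mask4x2Lane(const uint8_t mask[32], int r, int c) {
  const uint8_t* row0 = mask + 2 * r * 8;  // row 2r; row 2r+1 is 8 bytes on
  const int sum = row0[2 * c] + row0[2 * c + 1] +
                  row0[8 + 2 * c] + row0[8 + 2 * c + 1];
  return static_cast<uint16_t>((sum + 2) >> 2);  // rounded >> 2
}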
inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
@@ -558,12 +557,10 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
uint16_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
const __m128i offset = _mm_set1_epi32(kCompoundOffset);
const __m128i max = _mm_set1_epi16(kMax10bppSample);
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
pred_mask_1, offset, max, shift4, dst,
@@ -573,8 +570,7 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
pred_mask_1, offset, max, shift4, dst,
@@ -595,7 +591,6 @@ inline void MaskBlend10bpp4xH_SSE4_1(
return;
}
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const uint8_t pred0_stride2 = 4 << 1;
const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
@@ -605,8 +600,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
int y = height;
do {
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
@@ -617,8 +611,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -628,8 +621,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -639,8 +631,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -675,7 +666,6 @@ inline void MaskBlend10bpp_SSE4_1(
}
const uint8_t* mask = mask_ptr;
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
const __m128i offset = _mm_set1_epi32(kCompoundOffset);
const __m128i max = _mm_set1_epi16(kMax10bppSample);
@@ -685,7 +675,7 @@ inline void MaskBlend10bpp_SSE4_1(
int x = 0;
do {
const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
- mask + (x << subsampling_x), mask_stride, zero);
+ mask + (x << subsampling_x), mask_stride);
const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
// 64 - mask
@@ -729,7 +719,6 @@ inline void MaskBlend10bpp_SSE4_1(
mask += mask_stride_ss;
} while (--y != 0);
}
-
inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
const uint16_t* LIBGAV1_RESTRICT prediction_0,
const uint16_t* LIBGAV1_RESTRICT prediction_1,
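Each of the 10bpp compound paths above feeds WriteMaskBlendLine10bpp4x2_SSE4_1, whose rounding constants (offset, max, shift4) are set up at the call sites. A hedged scalar model of one output pixel, consistent with those constants but not the library's literal helper:

#include <algorithm>
#include <cstdint>

// mask_value is in [0, 64] (kMaskInverse == 64). compound_offset,
// round_bits and max_sample stand in for kCompoundOffset,
// kRoundBitsMaskBlend and kMax10bppSample.
inline uint16_t MaskBlendPixel10bpp(int pred_0, int pred_1, int mask_value,
                                    int compound_offset, int round_bits,
                                    int max_sample) {
  // 64-point interpolation between the two compound predictions, then
  // removal of the compound offset baked into both inputs.
  const int res =
      ((mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6) -
      compound_offset;
  // shift4 above is the rounding term (1 << round_bits) >> 1.
  const int rounded = (res + ((1 << round_bits) >> 1)) >> round_bits;
  return static_cast<uint16_t>(std::clamp(rounded, 0, max_sample));
}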
@@ -764,9 +753,8 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
- const __m128i zero = _mm_setzero_si128();
__m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, shift6,
@@ -777,7 +765,7 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
dst += dst_stride << 1;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, shift6,
@@ -798,7 +786,6 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
return;
}
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
const uint8_t pred0_stride2 = 4 << 1;
const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
@@ -807,7 +794,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
int y = height;
do {
__m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -818,7 +805,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -829,7 +816,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -840,7 +827,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -876,14 +863,13 @@ inline void InterIntraMaskBlend10bpp_SSE4_1(
const uint8_t* mask = mask_ptr;
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
- const __m128i zero = _mm_setzero_si128();
const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
int y = height;
do {
int x = 0;
do {
const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
- mask + (x << subsampling_x), mask_stride, zero);
+ mask + (x << subsampling_x), mask_stride);
const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
// 64 - mask
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
index 8ce23b4..f068ff3 100644
--- a/src/dsp/x86/obmc_sse4.cc
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -39,8 +39,8 @@ namespace {
inline void OverlapBlendFromLeft2xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 2;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
@@ -51,8 +51,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
- const __m128i obmc_pred_val =
- Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+ const __m128i obmc_pred_val = Load4(obmc_pred);
const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
const __m128i result =
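The dropped obmc_prediction_stride parameter here (and in the wider variants below) encodes a buffer-layout fact: the OBMC prediction is written tightly packed, so its stride equals the block width. For the 2-wide case, two consecutive rows are therefore four contiguous bytes, and one Load4 replaces the old two-row Load2x2 gather; the 4- and 8-wide paths apply the same idea with LoadLo8 and LoadUnaligned16. A tiny sketch of the invariant (names are illustrative):

#include <cstdint>
#include <cstring>

// With stride == width == 2, rows y and y+1 of the OBMC buffer form one
// contiguous 4-byte run, so a single load fetches both.
inline uint32_t LoadTwoObmcRows2w(const uint8_t* obmc_pred) {
  uint32_t two_rows;  // row y in the low 16 bits, row y+1 in the high bits.
  std::memcpy(&two_rows, obmc_pred, sizeof(two_rows));
  return two_rows;
}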
@@ -71,8 +70,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
inline void OverlapBlendFromLeft4xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 4;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
@@ -85,15 +84,12 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
int y = height;
do {
const __m128i pred_val0 = Load4(pred);
- const __m128i obmc_pred_val0 = Load4(obmc_pred);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
// Place the second row of each source in the second four bytes.
const __m128i pred_val =
_mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
- const __m128i obmc_pred_val = _mm_alignr_epi8(
- Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
const __m128i result =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
@@ -102,7 +98,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
const int second_row_result = _mm_extract_epi32(packed_result, 1);
memcpy(pred, &second_row_result, sizeof(second_row_result));
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
y -= 2;
} while (y != 0);
}
@@ -110,8 +106,8 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
inline void OverlapBlendFromLeft8xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 8;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -121,16 +117,25 @@ inline void OverlapBlendFromLeft8xH_SSE4_1(
const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
int y = height;
do {
- const __m128i pred_val = LoadLo8(pred);
- const __m128i obmc_pred_val = LoadLo8(obmc_pred);
- const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
- const __m128i result =
- RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
-    StoreLo8(pred, _mm_packus_epi16(result, result));
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+    const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result_lo =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+
+    const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+    const __m128i result_hi =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+ const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+ StoreLo8(pred, result);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (--y != 0);
+ StoreHi8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
}
void OverlapBlendFromLeft_SSE4_1(
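All the from-left widths compute the same per-pixel blend: _mm_maddubs_epi16 on the interleaved (pred, obmc) bytes and (mask, 64 - mask) weights produces the weighted sum, and RightShiftWithRounding_U16(..., 6) normalizes it. A scalar equivalent (a hedged sketch, not the reference implementation):

#include <cstdint>

// One blended pixel: matches maddubs(terms, masks) followed by
// RightShiftWithRounding_U16(sum, 6) == (sum + 32) >> 6.
inline uint8_t ObmcBlendPixel(uint8_t pred, uint8_t obmc_pred, int mask) {
  return static_cast<uint8_t>(
      (mask * pred + (64 - mask) * obmc_pred + 32) >> 6);
}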
@@ -144,18 +149,15 @@ void OverlapBlendFromLeft_SSE4_1(
assert(height >= 4);
if (width == 2) {
- OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 4) {
- OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 8) {
- OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -192,8 +194,8 @@ void OverlapBlendFromLeft_SSE4_1(
inline void OverlapBlendFromTop4xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 4;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -212,13 +214,10 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
_mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
const __m128i pred_val0 = Load4(pred);
- const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
const __m128i pred_val =
_mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
- const __m128i obmc_pred_val = _mm_alignr_epi8(
- Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
const __m128i result =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
@@ -227,7 +226,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
Store4(pred - prediction_stride, packed_result);
Store4(pred, _mm_srli_si128(packed_result, 4));
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
y += 2;
} while (y < compute_height);
}
@@ -235,8 +234,8 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
inline void OverlapBlendFromTop8xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 8;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const uint8_t* mask = kObmcMask + height - 2;
@@ -244,20 +243,35 @@ inline void OverlapBlendFromTop8xH_SSE4_1(
const int compute_height = height - (height >> 2);
int y = compute_height;
do {
- const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+ const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]);
// 64 - mask
- const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
- const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
- const __m128i pred_val = LoadLo8(pred);
- const __m128i obmc_pred_val = LoadLo8(obmc_pred);
- const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
- const __m128i result =
- RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
-    StoreLo8(pred, _mm_packus_epi16(result, result));
+    const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0);
+    const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0);
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6);
+
+ --y;
+ const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]);
+ // 64 - mask
+ const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1);
+ const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1);
+
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6);
+
+ const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+ StoreLo8(pred, result);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (--y != 0);
+ StoreHi8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ } while (--y > 0);
}
void OverlapBlendFromTop_SSE4_1(
@@ -271,13 +285,11 @@ void OverlapBlendFromTop_SSE4_1(
assert(height >= 2);
if (width == 4) {
- OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 8) {
- OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
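The from-top kernels blend only the rows nearest the boundary: compute_height = height - (height >> 2), i.e. the top three quarters of the block, with one kObmcMask weight per row (mask = kObmcMask + height - 2 above). A hedged scalar outline of that loop structure, reusing the per-pixel blend sketched earlier:

#include <cstddef>
#include <cstdint>

// Assumes ObmcBlendPixel from the sketch above; mask points into kObmcMask
// as in the vector code.
inline void ObmcBlendFromTop(uint8_t* pred, ptrdiff_t pred_stride, int width,
                             int height, const uint8_t* obmc_pred,
                             ptrdiff_t obmc_stride, const uint8_t* mask) {
  const int compute_height = height - (height >> 2);  // top 3/4 of the rows
  for (int y = 0; y < compute_height; ++y) {
    const int m = mask[y];  // one weight per row when blending from the top
    for (int x = 0; x < width; ++x) {
      pred[x] = ObmcBlendPixel(pred[x], obmc_pred[x], m);
    }
    pred += pred_stride;
    obmc_pred += obmc_stride;
  }
}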
@@ -333,8 +345,8 @@ constexpr int kRoundBitsObmcBlend = 6;
inline void OverlapBlendFromLeft2xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 2;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -348,8 +360,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = Load4x2(pred, pred + pred_stride);
- const __m128i obmc_pred_val =
- Load4x2(obmc_pred, obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
const __m128i result = RightShiftWithRounding_U32(
_mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
@@ -364,8 +375,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
inline void OverlapBlendFromLeft4xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 4;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -379,8 +390,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
- const __m128i obmc_pred_val =
- LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
const __m128i result_lo = RightShiftWithRounding_U32(
@@ -410,13 +420,11 @@ void OverlapBlendFromLeft10bpp_SSE4_1(
assert(height >= 4);
if (width == 2) {
- OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
if (width == 4) {
- OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -452,8 +460,8 @@ void OverlapBlendFromLeft10bpp_SSE4_1(
inline void OverlapBlendFromTop4xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 4;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -473,8 +481,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
- const __m128i obmc_pred_val =
- LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
const __m128i result_lo = RightShiftWithRounding_U32(
@@ -505,8 +512,7 @@ void OverlapBlendFromTop10bpp_SSE4_1(
assert(height >= 2);
if (width == 4) {
- OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
index 5830894..5498052 100644
--- a/src/dsp/x86/warp_sse4.cc
+++ b/src/dsp/x86/warp_sse4.cc
@@ -167,7 +167,7 @@ inline void WriteVerticalFilter(const __m128i filter[8],
}
template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
int delta, DestType* LIBGAV1_RESTRICT dest_row,
ptrdiff_t dest_stride) {
int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -188,8 +188,8 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
}
template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4,
- int gamma, int delta,
+inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
+ int64_t y4, int gamma, int delta,
DestType* LIBGAV1_RESTRICT dest_row,
ptrdiff_t dest_stride) {
int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -249,7 +249,7 @@ inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
- ptrdiff_t source_stride, int source_width, int y4,
+ ptrdiff_t source_stride, int source_width, int64_t y4,
int ix4, int iy4, int gamma, int delta,
int16_t intermediate_result_column[15],
DestType* LIBGAV1_RESTRICT dst_row,
@@ -291,7 +291,7 @@ inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
ptrdiff_t source_stride, int source_height, int alpha,
- int beta, int x4, int ix4, int iy4,
+ int beta, int64_t x4, int ix4, int iy4,
int16_t intermediate_result[15][8]) {
// Region 3
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
@@ -323,8 +323,9 @@ inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
- ptrdiff_t source_stride, int alpha, int beta, int x4,
- int ix4, int iy4, int16_t intermediate_result[15][8]) {
+ ptrdiff_t source_stride, int alpha, int beta,
+ int64_t x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
// Region 4.
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
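HandleWarpBlock below dispatches on where the block's 15x15 source support lands relative to the frame, which is what the four WarpRegion kernels correspond to. A compact sketch of that classification (the enum and helper names are invented for illustration; the comparisons mirror the branches below):

enum class WarpRegionKind {
  kOutsideBoth,        // Region 1: one repeated source value.
  kOutsideHorizontal,  // Region 2: rows repeated.
  kOutsideVertical,    // Region 3: columns repeated.
  kInside              // Region 4: full filtering.
};

inline WarpRegionKind ClassifyWarpRegion(int ix4, int iy4, int source_width,
                                         int source_height) {
  const bool outside_x = ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0;
  const bool outside_y = iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0;
  if (outside_x) {
    return outside_y ? WarpRegionKind::kOutsideBoth
                     : WarpRegionKind::kOutsideHorizontal;
  }
  return outside_y ? WarpRegionKind::kOutsideVertical
                   : WarpRegionKind::kInside;
}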
@@ -379,14 +380,8 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
int16_t intermediate_result_column[15];
};
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
// boundary, the filtering can be simplified. We can divide the plane
@@ -439,33 +434,38 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
- if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
+ if ((filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0)) {
// Outside the frame in both directions. One repeated value.
- WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
- source_height, ix4, iy4, dst_row,
- dest_stride);
+ WarpRegion1<is_compound, DestType>(
+ src, source_stride, source_width, source_height, filter_params.ix4,
+ filter_params.iy4, dst_row, dest_stride);
return;
}
// Outside the frame horizontally. Rows repeated.
WarpRegion2<is_compound, DestType>(
- src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
- intermediate_result_column, dst_row, dest_stride);
+ src, source_stride, source_width, filter_params.y4, filter_params.ix4,
+ filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
+ dest_stride);
return;
}
- if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ if ((filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0)) {
// Outside the frame vertically.
- WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
- beta, x4, ix4, iy4, intermediate_result);
+ WarpRegion3<is_compound, DestType>(
+ src, source_stride, source_height, alpha, beta, filter_params.x4,
+ filter_params.ix4, filter_params.iy4, intermediate_result);
} else {
// Inside the frame.
- WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
- iy4, intermediate_result);
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
+ filter_params.x4, filter_params.ix4,
+ filter_params.iy4, intermediate_result);
}
// Region 3 and 4 vertical filter.
- VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
- dst_row, dest_stride);
+ VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
+ gamma, delta, dst_row, dest_stride);
}
template <bool is_compound>
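The int to int64_t churn in this file has a single cause: dst_x and dst_y are sums of warp_params products with block coordinates and can overflow 32 bits before the kWarpedModelPrecisionBits shift, so x4/y4 are carried as int64_t. A sketch of the shared helper now called from HandleWarpBlock, reconstructed from the deleted lines (the struct and field names follow the call sites; treat the exact definition as an assumption):

#include <cstdint>

constexpr int kWarpedModelPrecisionBits = 16;  // AV1 warped-model precision.

struct WarpFilterParams {
  int64_t x4;
  int64_t y4;
  int ix4;
  int iy4;
};

inline WarpFilterParams GetWarpFilterParams(int src_x, int src_y,
                                            int subsampling_x,
                                            int subsampling_y,
                                            const int* warp_params) {
  // Same arithmetic as the removed dst_x/dst_y block, widened to 64 bits
  // so the products cannot overflow before the precision shifts.
  const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] +
                        static_cast<int64_t>(src_y) * warp_params[3] +
                        warp_params[0];
  const int64_t dst_y = static_cast<int64_t>(src_x) * warp_params[4] +
                        static_cast<int64_t>(src_y) * warp_params[5] +
                        warp_params[1];
  WarpFilterParams params;
  params.x4 = dst_x >> subsampling_x;
  params.y4 = dst_y >> subsampling_y;
  params.ix4 = static_cast<int>(params.x4 >> kWarpedModelPrecisionBits);
  params.iy4 = static_cast<int>(params.y4 >> kWarpedModelPrecisionBits);
  return params;
}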
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
index 69cb784..53a374d 100644
--- a/src/dsp/x86/weight_mask_sse4.cc
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -37,10 +37,10 @@ namespace {
constexpr int kRoundingBits8bpp = 4;
template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const __m128i pred_00 = LoadAligned16(prediction_0);
const __m128i pred_10 = LoadAligned16(prediction_1);
const __m128i difference_0 = RightShiftWithRounding_U16(
@@ -78,7 +78,7 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
}
#define WEIGHT8_PAIR_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
+ WeightMask16_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
#define WEIGHT8_PAIR_AND_STRIDE \
WEIGHT8_PAIR_WITHOUT_STRIDE; \
@@ -87,9 +87,10 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride << 1
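WeightMask16_SSE4_1 maps the absolute difference of the two compound predictions to a 64-weight mask. A scalar model of one 8bpp mask byte (the 38/16/64 values come from the AV1 DIFFWTD_38 mask definition, not from this file, so treat them as an assumption):

#include <algorithm>
#include <cstdint>
#include <cstdlib>

// kRoundingBits8bpp == 4 above; RightShiftWithRounding(d, 4) == (d + 8) >> 4.
inline uint8_t WeightMaskValue8bpp(int pred_0, int pred_1,
                                   bool mask_is_inverse) {
  const int difference = (std::abs(pred_0 - pred_1) + 8) >> 4;
  const int mask_value = std::min(38 + (difference >> 4), 64);
  return static_cast<uint8_t>(mask_is_inverse ? 64 - mask_value : mask_value);
}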
template <bool mask_is_inverse>
-void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
@@ -100,10 +101,10 @@ void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 3;
@@ -116,10 +117,10 @@ void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 5;
@@ -132,7 +133,7 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
#define WEIGHT16_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
#define WEIGHT16_AND_STRIDE \
WEIGHT16_WITHOUT_STRIDE; \
@@ -141,10 +142,10 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y = 7;
@@ -155,10 +156,10 @@ void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 5;
@@ -171,10 +172,10 @@ void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 6;
@@ -190,10 +191,10 @@ void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 21;
@@ -205,10 +206,11 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT16_WITHOUT_STRIDE;
}
-#define WEIGHT32_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
#define WEIGHT32_AND_STRIDE \
WEIGHT32_WITHOUT_STRIDE; \
@@ -217,10 +219,10 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
WEIGHT32_AND_STRIDE;
@@ -234,10 +236,10 @@ void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 5;
@@ -250,10 +252,10 @@ void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 6;
@@ -269,10 +271,10 @@ void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 21;
@@ -284,14 +286,15 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT32_WITHOUT_STRIDE;
}
-#define WEIGHT64_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
- mask + 32, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
- mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
#define WEIGHT64_AND_STRIDE \
WEIGHT64_WITHOUT_STRIDE; \
@@ -300,10 +303,10 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -316,10 +319,10 @@ void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 0;
@@ -335,10 +338,10 @@ void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -351,10 +354,10 @@ void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -368,10 +371,10 @@ void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -412,10 +415,10 @@ void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -466,8 +469,9 @@ void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
dsp->weight_mask[w_index][h_index][0] = \
- WeightMask##width##x##height##_SSE4<0>; \
- dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
+ WeightMask##width##x##height##_SSE4_1<0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_SSE4_1<1>
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
@@ -501,7 +505,7 @@ constexpr int kRoundingBits10bpp = 6;
constexpr int kScaledDiffShift = 4;
template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_10bpp_SSE4(
+inline void WeightMask16_10bpp_SSE4_1(
const uint16_t* LIBGAV1_RESTRICT prediction_0,
const uint16_t* LIBGAV1_RESTRICT prediction_1,
uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
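The 10bpp variant follows the same mapping with wider intermediates: the prediction difference is rounded by kRoundingBits10bpp (6) and scaled by kScaledDiffShift (4) before the bias and cap. A hedged sketch, parallel to the 8bpp model above:

#include <algorithm>
#include <cstdint>
#include <cstdlib>

inline uint8_t WeightMaskValue10bpp(int pred_0, int pred_1,
                                    bool mask_is_inverse) {
  // RightShiftWithRounding(d, kRoundingBits10bpp) == (d + 32) >> 6, then
  // scale by kScaledDiffShift before the DIFFWTD-style 38/64 mapping.
  const int difference = (std::abs(pred_0 - pred_1) + 32) >> 6;
  const int mask_value = std::min(38 + (difference >> 4), 64);
  return static_cast<uint8_t>(mask_is_inverse ? 64 - mask_value : mask_value);
}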
@@ -562,9 +566,9 @@ inline void WeightMask16_10bpp_SSE4(
}
}
-#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \
- mask_stride)
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, \
+ mask_stride)
#define WEIGHT8_PAIR_AND_STRIDE_10BPP \
WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \
@@ -573,10 +577,10 @@ inline void WeightMask16_10bpp_SSE4(
mask += mask_stride << 1
template <bool mask_is_inverse>
-void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -587,10 +591,10 @@ void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 3;
@@ -603,10 +607,10 @@ void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 5;
@@ -618,9 +622,9 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT16_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride)
+#define WEIGHT16_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride)
#define WEIGHT16_AND_STRIDE_10BPP \
WEIGHT16_WITHOUT_STRIDE_10BPP; \
@@ -629,10 +633,10 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y = 7;
@@ -643,10 +647,10 @@ void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -659,10 +663,10 @@ void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -678,10 +682,10 @@ void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -693,11 +697,11 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT16_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT32_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
#define WEIGHT32_AND_STRIDE_10BPP \
WEIGHT32_WITHOUT_STRIDE_10BPP; \
@@ -706,10 +710,10 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
WEIGHT32_AND_STRIDE_10BPP;
@@ -723,10 +727,10 @@ void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -739,10 +743,10 @@ void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -758,10 +762,10 @@ void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -773,15 +777,15 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT32_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT64_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
- mask + 32, mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
- mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
#define WEIGHT64_AND_STRIDE_10BPP \
WEIGHT64_WITHOUT_STRIDE_10BPP; \
@@ -790,10 +794,10 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -806,10 +810,10 @@ void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -825,10 +829,10 @@ void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -841,10 +845,10 @@ void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 42;
@@ -858,10 +862,10 @@ void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -902,10 +906,10 @@ void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 42;
@@ -956,9 +960,9 @@ void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
dsp->weight_mask[w_index][h_index][0] = \
- WeightMask##width##x##height##_10bpp_SSE4<0>; \
+ WeightMask##width##x##height##_10bpp_SSE4_1<0>; \
dsp->weight_mask[w_index][h_index][1] = \
- WeightMask##width##x##height##_10bpp_SSE4<1>
+ WeightMask##width##x##height##_10bpp_SSE4_1<1>
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);