From d4dbf19f6b0181ee78034bfe4caf189d1c016998 Mon Sep 17 00:00:00 2001
From: Boyuan Yang
Date: Thu, 14 Jul 2022 15:56:57 -0400
Subject: New upstream version 0.18.0

---
 src/dsp/x86/average_blend_sse4.cc           |  84 ++++---
 src/dsp/x86/common_sse4_test.cc             |   4 +-
 src/dsp/x86/convolve_avx2.cc                | 322 ++++++++++---------------
 src/dsp/x86/convolve_sse4.cc                | 187 ++++++---------
 src/dsp/x86/convolve_sse4.inc               |  98 +++++---
 src/dsp/x86/distance_weighted_blend_sse4.cc | 152 ++++++------
 src/dsp/x86/film_grain_sse4.cc              |  14 +-
 src/dsp/x86/intrapred_directional_sse4.cc   | 239 +++++++++---------
 src/dsp/x86/loop_restoration_sse4.cc        |   1 +
 src/dsp/x86/mask_blend_sse4.cc              | 336 +++++++++++++-------------
 src/dsp/x86/obmc_sse4.cc                    | 144 +++++------
 src/dsp/x86/warp_sse4.cc                    |  58 ++---
 src/dsp/x86/weight_mask_sse4.cc             | 360 ++++++++++++++--------------
 13 files changed, 976 insertions(+), 1023 deletions(-)

(limited to 'src/dsp/x86')

diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index 911c5a9..c08b3d6 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -35,24 +35,46 @@ namespace {
 
 constexpr int kInterPostRoundBit = 4;
 
-inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
-                             const int16_t* LIBGAV1_RESTRICT prediction_1,
-                             uint8_t* LIBGAV1_RESTRICT dest) {
-  const __m128i pred_0 = LoadLo8(prediction_0);
-  const __m128i pred_1 = LoadLo8(prediction_1);
-  __m128i res = _mm_add_epi16(pred_0, pred_1);
-  res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
-  Store4(dest, _mm_packus_epi16(res, res));
+inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                               const int16_t* LIBGAV1_RESTRICT prediction_1,
+                               uint8_t* LIBGAV1_RESTRICT dest,
+                               const ptrdiff_t dest_stride) {
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+  res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+  res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+  const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+  Store4(dest, result_pixels);
+  dest += dest_stride;
+  const int result_1 = _mm_extract_epi32(result_pixels, 1);
+  memcpy(dest, &result_1, sizeof(result_1));
+  dest += dest_stride;
+  const int result_2 = _mm_extract_epi32(result_pixels, 2);
+  memcpy(dest, &result_2, sizeof(result_2));
+  dest += dest_stride;
+  const int result_3 = _mm_extract_epi32(result_pixels, 3);
+  memcpy(dest, &result_3, sizeof(result_3));
 }
 
 inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
                              const int16_t* LIBGAV1_RESTRICT prediction_1,
-                             uint8_t* LIBGAV1_RESTRICT dest) {
-  const __m128i pred_0 = LoadAligned16(prediction_0);
-  const __m128i pred_1 = LoadAligned16(prediction_1);
-  __m128i res = _mm_add_epi16(pred_0, pred_1);
-  res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
-  StoreLo8(dest, _mm_packus_epi16(res, res));
+                             uint8_t* LIBGAV1_RESTRICT dest,
+                             const ptrdiff_t dest_stride) {
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+  res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
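+  // |prediction_0| and |prediction_1| are packed with a row pitch equal to
+  // the block width, so the loads at offset 8 above already hold the next
+  // row and one pass blends two rows (four for the 4x4 variant above); see
+  // b/150326556. Each output pixel is (p0 + p1 + 16) >> 5, i.e. the average
+  // of the two predictions followed by the kInterPostRoundBit downshift.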
+  __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+  res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+  const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+  StoreLo8(dest, result_pixels);
+  StoreHi8(dest + dest_stride, result_pixels);
 }
 
 inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
@@ -85,35 +107,27 @@ void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
   int y = height;
 
   if (width == 4) {
+    const ptrdiff_t dest_stride4 = dest_stride << 2;
+    constexpr ptrdiff_t width4 = 4 << 2;
    do {
-      // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
-      // to load 8 values at a time.
-      AverageBlend4Row(pred_0, pred_1, dst);
-      dst += dest_stride;
-      pred_0 += width;
-      pred_1 += width;
-
-      AverageBlend4Row(pred_0, pred_1, dst);
-      dst += dest_stride;
-      pred_0 += width;
-      pred_1 += width;
+      AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride);
+      dst += dest_stride4;
+      pred_0 += width4;
+      pred_1 += width4;
 
-      y -= 2;
+      y -= 4;
     } while (y != 0);
     return;
   }
 
   if (width == 8) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    constexpr ptrdiff_t width2 = 8 << 1;
     do {
-      AverageBlend8Row(pred_0, pred_1, dst);
-      dst += dest_stride;
-      pred_0 += width;
-      pred_1 += width;
-
-      AverageBlend8Row(pred_0, pred_1, dst);
-      dst += dest_stride;
-      pred_0 += width;
-      pred_1 += width;
+      AverageBlend8Row(pred_0, pred_1, dst, dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
 
       y -= 2;
     } while (y != 0);
 
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
index 4ea811a..3288cfc 100644
--- a/src/dsp/x86/common_sse4_test.cc
+++ b/src/dsp/x86/common_sse4_test.cc
@@ -31,7 +31,7 @@ namespace {
 // INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
 // RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
 // negative values.
-TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
+TEST(CommonDspTest, SSE41RightShiftWithRoundingS16) {
   for (int bits = 0; bits < 16; ++bits) {
     const int bias = (1 << bits) >> 1;
     for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
@@ -56,7 +56,7 @@ TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
 
 #else  // !LIBGAV1_TARGETING_SSE4_1
 
-TEST(CommonDspTest, SSE4) {
+TEST(CommonDspTest, SSE41) {
   GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
                   "the tests.";
 }
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 4126ca9..6e94347 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -39,17 +39,17 @@ namespace {
 // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
 // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
 // sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
 __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
   __m256i sum;
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]);  // k2k1
     const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]);  // k4k3
     const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]);  // k6k5
     sum = _mm256_add_epi16(v_madd_21, v_madd_43);
     sum = _mm256_add_epi16(sum, v_madd_65);
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
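     // Each maddubs below multiplies interleaved source pairs by an
     // interleaved tap pair (k1k0 .. k7k6), so four products cover all eight
     // taps; the pre-shift of the taps by 1 keeps each 16-bit sum in range.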
     const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]);  // k1k0
     const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]);  // k3k2
@@ -58,7 +58,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
     const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
     const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
     sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     sum = _mm256_maddubs_epi16(src[0], taps[0]);  // k4k3
   } else {
@@ -70,7 +70,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
   return sum;
 }
 
-template <int filter_index>
+template <int num_taps>
 __m256i SumHorizontalTaps(const __m256i* const src,
                           const __m256i* const v_tap) {
   __m256i v_src[4];
@@ -78,32 +78,32 @@ __m256i SumHorizontalTaps(const __m256i* const src,
   const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
   const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
 
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
     v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
     v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
     v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
     v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
     v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
-  } else if (filter_index > 3) {
+  } else {
     // 4 taps.
     v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
     v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
   }
-  return SumOnePassTaps<filter_index>(v_src, v_tap);
+  return SumOnePassTaps<num_taps>(v_src, v_tap);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m256i SimpleHorizontalTaps(const __m256i* const src,
                              const __m256i* const v_tap) {
-  __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+  __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
 
   // Normally the Horizontal pass does the downshift in two passes:
   // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -116,17 +116,16 @@ __m256i SimpleHorizontalTaps(const __m256i* const src,
   return _mm256_packus_epi16(sum, sum);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m256i HorizontalTaps8To16(const __m256i* const src,
                             const __m256i* const v_tap) {
-  const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+  const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
 
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
 // Filter 2xh sizes.
-template <int filter_index, bool is_2d = false,
-          bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
 void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                       const ptrdiff_t src_stride,
                       void* LIBGAV1_RESTRICT const dest,
@@ -145,14 +144,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     do {
       if (is_2d) {
         const __m128i sum =
-            HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+            HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
         Store4(&dest16[0], sum);
         dest16 += pred_stride;
         Store4(&dest16[0], _mm_srli_si128(sum, 8));
         dest16 += pred_stride;
       } else {
         const __m128i sum =
-            SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+            SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
         Store2(dest8, sum);
         dest8 += pred_stride;
         Store2(dest8, _mm_srli_si128(sum, 4));
@@ -169,7 +168,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       assert(height % 2 == 1);
       __m128i sum;
       const __m128i input = LoadLo8(&src[2]);
-      if (filter_index == 3) {
+      if (num_taps == 2) {
         // 03 04 04 05 05 06 06 07 ....
         const __m128i v_src_43 =
             _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -194,8 +193,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
 }
 
 // Filter widths >= 4.
-template <int filter_index, bool is_2d = false,
-          bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
 void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                       const ptrdiff_t src_stride,
                       void* LIBGAV1_RESTRICT const dest,
@@ -214,11 +212,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
         const __m256i src_long =
            SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
        const __m256i result =
-            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+            HorizontalTaps8To16<num_taps>(&src_long, v_tap);
        const __m256i src_long2 =
            SetrM128i(LoadUnaligned16(&src[x + 16]), LoadUnaligned16(&src[x + 24]));
        const __m256i result2 =
-            HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+            HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
        if (is_2d) {
          StoreAligned32(&dest16[x], result);
          StoreAligned32(&dest16[x + 16], result2);
@@ -230,11 +228,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
         // Load src used to calculate dest8[7:0] and dest8[23:16].
         const __m256i src_long = LoadUnaligned32(&src[x]);
         const __m256i result =
-            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+            SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
         // Load src used to calculate dest8[15:8] and dest8[31:24].
         const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
         const __m256i result2 =
-            SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+            SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
         // Combine results and store.
         StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
       }
@@ -252,13 +250,12 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       // Load into 2 128 bit lanes.
       const __m256i src_long =
           SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
-      const __m256i result =
-          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
       const __m256i src_long2 =
          SetrM128i(LoadUnaligned16(&src[src_stride]),
                    LoadUnaligned16(&src[8 + src_stride]));
       const __m256i result2 =
-          HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+          HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
       if (is_2d) {
         StoreAligned32(&dest16[0], result);
         StoreAligned32(&dest16[pred_stride], result2);
@@ -270,12 +267,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       // Load into 2 128 bit lanes.
       const __m256i src_long =
           SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[src_stride]));
-      const __m256i result =
-          SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+      const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
       const __m256i src_long2 = SetrM128i(
           LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
       const __m256i result2 =
-          SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+          SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
       const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
       StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
       StoreUnaligned16(&dest8[pred_stride],
@@ -292,8 +288,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     if (is_2d) {
       const __m256i src_long =
           SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
-      const __m256i result =
-          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
       StoreAligned32(&dest16[0], result);
     }
 
@@ -306,8 +301,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       const __m128i next_row = LoadUnaligned16(&src[src_stride]);
       const __m256i src_long = SetrM128i(this_row, next_row);
       if (is_2d || is_compound) {
-        const __m256i result =
-            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
         if (is_2d) {
           StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
           StoreAligned16(&dest16[pred_stride],
@@ -322,8 +316,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       const __m128i next_row = LoadUnaligned16(&src[src_stride]);
       // Load into 2 128 bit lanes.
       const __m256i src_long = SetrM128i(this_row, next_row);
-      const __m256i result =
-          SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+      const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
       StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
       StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
     }
@@ -337,8 +330,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     // filter the remaining row.
     if (is_2d) {
       const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
-      const __m256i result =
-          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
       StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
     }
 
@@ -351,8 +343,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       const __m128i next_row = LoadUnaligned16(&src[src_stride]);
       const __m256i src_long = SetrM128i(this_row, next_row);
       if (is_2d || is_compound) {
-        const __m256i result =
-            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
         StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
         StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
       } else {
         const __m128i next_row = LoadUnaligned16(&src[src_stride]);
         // Load into 2 128 bit lanes.
         const __m256i src_long = SetrM128i(this_row, next_row);
-        const __m256i result =
-            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
         Store4(&dest8[0], _mm256_castsi256_si128(result));
         Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
       }
@@ -375,8 +365,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     // filter the remaining row.
     if (is_2d) {
       const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
-      const __m256i result =
-          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
       StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
     }
 }
@@ -554,18 +543,15 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
   const __m128i v_horizontal_filter =
       LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
 
-  if (filter_index == 4) {  // 4 tap.
-    SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 5) {  // 4 tap.
+  if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
     SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else {  // 2 tap.
     SetupTaps<2>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   }
 }
 
@@ -582,28 +568,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
 
   if (filter_index == 2) {  // 8 tap.
     SetupTaps<8>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else if (filter_index == 0) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 4) {  // 4 tap.
-    SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
     SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else {  // 2 tap.
     SetupTaps<2>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   }
 }
 
@@ -617,7 +600,8 @@ void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
                      const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
 
   // The output of the horizontal filter is guaranteed to fit in 16 bits.
   alignas(32) uint16_t
@@ -730,61 +714,60 @@ __m256i Compound1DShift(const __m256i sum) {
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
-template <int filter_index, bool unpack_high = false>
+template <int num_taps, bool unpack_high = false>
 __m256i SumVerticalTaps(const __m256i* const srcs,
                         const __m256i* const v_tap) {
   __m256i v_src[4];
 
   if (!unpack_high) {
-    if (filter_index < 2) {
+    if (num_taps == 6) {
       // 6 taps.
       v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
       v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
-    } else if (filter_index == 2) {
+    } else if (num_taps == 8) {
       // 8 taps.
       v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
       v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
       v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
-    } else if (filter_index == 3) {
+    } else if (num_taps == 2) {
       // 2 taps.
       v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
-    } else if (filter_index > 3) {
+    } else {
       // 4 taps.
       v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
       v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
     }
   } else {
-    if (filter_index < 2) {
+    if (num_taps == 6) {
      // 6 taps.
      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
      v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
-    } else if (filter_index == 2) {
+    } else if (num_taps == 8) {
      // 8 taps.
      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
      v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
      v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
-    } else if (filter_index == 3) {
+    } else if (num_taps == 2) {
      // 2 taps.
      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
-    } else if (filter_index > 3) {
+    } else {
      // 4 taps.
      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
    }
  }
-  return SumOnePassTaps<filter_index>(v_src, v_tap);
+  return SumOnePassTaps<num_taps>(v_src, v_tap);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
                         const ptrdiff_t src_stride,
                         void* LIBGAV1_RESTRICT const dst,
                         const ptrdiff_t dst_stride, const int width,
                         const int height, const __m256i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -821,9 +804,9 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
       srcs[next_row] = LoadUnaligned32(src_x);
       src_x += src_stride;
 
-      const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       const __m256i sums_hi =
-          SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+          SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
       if (is_compound) {
         const __m256i results =
             Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -861,13 +844,12 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
   } while (x < width);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
                         const ptrdiff_t src_stride,
                         void* LIBGAV1_RESTRICT const dst,
                         const ptrdiff_t dst_stride, const int /*width*/,
                         const int height, const __m256i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -922,9 +904,9 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
     srcs[next_row - 1] = _mm256_inserti128_si256(
         srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
 
-    const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
     const __m256i sums_hi =
-        SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+        SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
     if (is_compound) {
       const __m256i results =
           Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -964,13 +946,12 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
   } while (y != 0);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
                        const ptrdiff_t src_stride,
                        void* LIBGAV1_RESTRICT const dst,
                        const ptrdiff_t dst_stride, const int /*width*/,
                        const int height, const __m256i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1025,7 +1006,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
     srcs[next_row - 1] = _mm256_inserti128_si256(
         srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
 
-    const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
     if (is_compound) {
       const __m256i results = Compound1DShift(sums);
       const __m128i this_dst = _mm256_castsi256_si128(results);
@@ -1062,13 +1043,12 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
   } while (y != 0);
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
                        const ptrdiff_t src_stride,
                        void* LIBGAV1_RESTRICT const dst,
                        const ptrdiff_t dst_stride, const int /*width*/,
                        const int height, const __m128i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1101,7 +1081,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
     srcs[next_row] = LoadLo8(src_x);
    src_x += src_stride;
 
-    const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
     if (is_compound) {
       const __m128i results = Compound1DShift(sums);
       StoreUnaligned16(dst16, results);
@@ -1137,7 +1117,8 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
                            const int height, void* LIBGAV1_RESTRICT prediction,
                            const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
@@ -1151,43 +1132,43 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
   // Use 256 bits for width > 4.
   if (width > 4) {
     __m256i taps_256[4];
-    if (filter_index < 2) {  // 6 tap.
+    if (vertical_taps == 6) {  // 6 tap.
       SetupTaps<6>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height,
                              taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       } else {
-        FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       }
-    } else if (filter_index == 2) {  // 8 tap.
+    } else if (vertical_taps == 8) {  // 8 tap.
       SetupTaps<8>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height,
                              taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       } else {
-        FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       }
-    } else if (filter_index == 3) {  // 2 tap.
+    } else if (vertical_taps == 2) {  // 2 tap.
       SetupTaps<2>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
                              taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       } else {
-        FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height,
+        FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       }
-    } else if (filter_index == 4) {  // 4 tap.
+    } else {  // 4 tap.
       SetupTaps<4>(&v_filter, taps_256);
       if (width == 8) {
         FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
                              taps_256);
       } else if (width == 16) {
@@ -1199,67 +1180,38 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
         FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
                               taps_256);
       }
-    } else {
-      SetupTaps<4>(&v_filter, taps_256);
-      if (width == 8) {
-        FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
-                             taps_256);
-      } else if (width == 16) {
-        FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height,
-                              taps_256);
-      } else {
-        FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height,
-                              taps_256);
-      }
     }
   } else {  // width <= 4
     // Use 128 bit code.
     __m128i taps[4];
 
-    if (filter_index < 2) {  // 6 tap.
+    if (vertical_taps == 6) {  // 6 tap.
       SetupTaps<6>(&v_filter, taps);
       if (width == 2) {
-        FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
       } else {
-        FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
       }
-    } else if (filter_index == 2) {  // 8 tap.
+    } else if (vertical_taps == 8) {  // 8 tap.
       SetupTaps<8>(&v_filter, taps);
       if (width == 2) {
-        FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
       } else {
-        FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
       }
-    } else if (filter_index == 3) {  // 2 tap.
+    } else if (vertical_taps == 2) {  // 2 tap.
       SetupTaps<2>(&v_filter, taps);
       if (width == 2) {
-        FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
-                                taps);
-      } else {
-        FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
-                                taps);
-      }
-    } else if (filter_index == 4) {  // 4 tap.
-      SetupTaps<4>(&v_filter, taps);
-      if (width == 2) {
-        FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
       } else {
-        FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
       }
-    } else {
+    } else {  // 4 tap.
       SetupTaps<4>(&v_filter, taps);
       if (width == 2) {
-        FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
       } else {
-        FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
-                                taps);
+        FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
       }
     }
   }
@@ -1272,7 +1224,8 @@ void ConvolveCompoundVertical_AVX2(
     const int vertical_filter_id, const int width, const int height,
     void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
@@ -1286,43 +1239,43 @@ void ConvolveCompoundVertical_AVX2(
   // Use 256 bits for width > 4.
   if (width > 4) {
     __m256i taps_256[4];
-    if (filter_index < 2) {  // 6 tap.
+    if (vertical_taps == 6) {  // 6 tap.
       SetupTaps<6>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<0, /*is_compound=*/true>(
+        FilterVertical8xH<6, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<0, /*is_compound=*/true>(
+        FilterVertical16xH<6, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       } else {
-        FilterVertical32xH<0, /*is_compound=*/true>(
+        FilterVertical32xH<6, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       }
-    } else if (filter_index == 2) {  // 8 tap.
+    } else if (vertical_taps == 8) {  // 8 tap.
       SetupTaps<8>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<2, /*is_compound=*/true>(
+        FilterVertical8xH<8, /*is_compound=*/true>(
            src, src_stride, dest, dest_stride, width, height, taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<2, /*is_compound=*/true>(
+        FilterVertical16xH<8, /*is_compound=*/true>(
            src, src_stride, dest, dest_stride, width, height, taps_256);
       } else {
-        FilterVertical32xH<2, /*is_compound=*/true>(
+        FilterVertical32xH<8, /*is_compound=*/true>(
            src, src_stride, dest, dest_stride, width, height, taps_256);
       }
-    } else if (filter_index == 3) {  // 2 tap.
+    } else if (vertical_taps == 2) {  // 2 tap.
       SetupTaps<2>(&v_filter, taps_256);
       if (width == 8) {
-        FilterVertical8xH<3, /*is_compound=*/true>(
+        FilterVertical8xH<2, /*is_compound=*/true>(
            src, src_stride, dest, dest_stride, width, height, taps_256);
       } else if (width == 16) {
-        FilterVertical16xH<3, /*is_compound=*/true>(
+        FilterVertical16xH<2, /*is_compound=*/true>(
            src, src_stride, dest, dest_stride, width, height, taps_256);
       } else {
-        FilterVertical32xH<3, /*is_compound=*/true>(
+        FilterVertical32xH<2, /*is_compound=*/true>(
            src, src_stride, dest, dest_stride, width, height, taps_256);
       }
-    } else if (filter_index == 4) {  // 4 tap.
+    } else {  // 4 tap.
       SetupTaps<4>(&v_filter, taps_256);
       if (width == 8) {
         FilterVertical8xH<4, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
@@ -1334,43 +1287,27 @@ void ConvolveCompoundVertical_AVX2(
         FilterVertical32xH<4, /*is_compound=*/true>(
             src, src_stride, dest, dest_stride, width, height, taps_256);
       }
-    } else {
-      SetupTaps<4>(&v_filter, taps_256);
-      if (width == 8) {
-        FilterVertical8xH<5, /*is_compound=*/true>(
-            src, src_stride, dest, dest_stride, width, height, taps_256);
-      } else if (width == 16) {
-        FilterVertical16xH<5, /*is_compound=*/true>(
-            src, src_stride, dest, dest_stride, width, height, taps_256);
-      } else {
-        FilterVertical32xH<5, /*is_compound=*/true>(
-            src, src_stride, dest, dest_stride, width, height, taps_256);
-      }
     }
   } else {  // width <= 4
     // Use 128 bit code.
     __m128i taps[4];
 
-    if (filter_index < 2) {  // 6 tap.
+    if (vertical_taps == 6) {  // 6 tap.
       SetupTaps<6>(&v_filter, taps);
-      FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
-    } else if (filter_index == 2) {  // 8 tap.
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else if (vertical_taps == 8) {  // 8 tap.
       SetupTaps<8>(&v_filter, taps);
-      FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
-    } else if (filter_index == 3) {  // 2 tap.
+      FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else if (vertical_taps == 2) {  // 2 tap.
       SetupTaps<2>(&v_filter, taps);
-      FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
-    } else if (filter_index == 4) {  // 4 tap.
-      SetupTaps<4>(&v_filter, taps);
-      FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
-    } else {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
+    } else {  // 4 tap.
       SetupTaps<4>(&v_filter, taps);
-      FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
-                                                    dest_stride, height, taps);
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest,
+                                                 dest_stride, height, taps);
     }
   }
 }
@@ -1430,7 +1367,8 @@ void ConvolveCompound2D_AVX2(
     void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
 
   // The output of the horizontal filter is guaranteed to fit in 16 bits.
   alignas(32) uint16_t
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index f7e5a71..f427c4c 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -36,7 +36,7 @@ namespace {
 
 #include "src/dsp/x86/convolve_sse4.inc"
 
-template <int filter_index>
+template <int num_taps>
 __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
                           const __m128i* const v_tap) {
   __m128i v_src[4];
@@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
   const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
   const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
 
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
     v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
     v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
     v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
     v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
     v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
-  } else if (filter_index > 3) {
+  } else {
     // 4 taps.
     v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
     v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
   }
-  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
   return sum;
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
                              const __m128i* const v_tap) {
-  __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+  __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
 
   // Normally the Horizontal pass does the downshift in two passes:
   // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
   return _mm_packus_epi16(sum, sum);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
                             const __m128i* const v_tap) {
-  const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+  const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
 
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
-template <int filter_index, bool is_2d = false,
-          bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
 void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                       const ptrdiff_t src_stride,
                       void* LIBGAV1_RESTRICT const dest,
@@ -108,16 +107,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     int x = 0;
     do {
       if (is_2d || is_compound) {
-        const __m128i v_sum =
-            HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+        const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
         if (is_2d) {
           StoreAligned16(&dest16[x], v_sum);
         } else {
           StoreUnaligned16(&dest16[x], v_sum);
         }
       } else {
-        const __m128i result =
-            SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+        const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
         StoreLo8(&dest8[x], result);
       }
       x += 8;
@@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     int y = height;
     do {
       if (is_2d || is_compound) {
-        const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+        const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
         StoreLo8(dest16, v_sum);
       } else {
-        const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+        const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
         Store4(&dest8[0], result);
       }
       src += src_stride;
@@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
     do {
       if (is_2d) {
         const __m128i sum =
-            HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+            HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
         Store4(&dest16[0], sum);
         dest16 += pred_stride;
         Store4(&dest16[0], _mm_srli_si128(sum, 8));
         dest16 += pred_stride;
       } else {
         const __m128i sum =
-            SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+            SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
         Store2(dest8, sum);
         dest8 += pred_stride;
         Store2(dest8, _mm_srli_si128(sum, 4));
@@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
       assert(height % 2 == 1);
       __m128i sum;
       const __m128i input = LoadLo8(&src[2]);
-      if (filter_index == 3) {
+      if (num_taps == 2) {
         // 03 04 04 05 05 06 06 07 ....
         const __m128i v_src_43 =
             _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
 
   if (filter_index == 2) {  // 8 tap.
     SetupTaps<8>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else if (filter_index == 0) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 4) {  // 4 tap.
-    SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
-  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if ((filter_index & 0x4) != 0) {  // 4 tap.
+    // ((filter_index == 4) | (filter_index == 5))
     SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   } else {  // 2 tap.
     SetupTaps<2>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
-                                               width, height, v_tap);
+    FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
   }
 }
 
@@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
                        const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
 
   // The output of the horizontal filter is guaranteed to fit in 16 bits.
   alignas(16) uint16_t
@@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
   }
 }
 
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
 void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
                     const ptrdiff_t src_stride,
                     void* LIBGAV1_RESTRICT const dst,
                     const ptrdiff_t dst_stride, const int width,
                     const int height, const __m128i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
@@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
       srcs[next_row] = LoadLo8(src_x);
       src_x += src_stride;
 
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
       if (is_compound) {
         const __m128i results = Compound1DShift(sums);
         StoreUnaligned16(dst16_x, results);
@@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1(
     const int vertical_filter_id, const int width, const int height,
     void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
@@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1(
   const __m128i v_filter =
       LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
 
-  if (filter_index < 2) {  // 6 tap.
+  if (vertical_taps == 6) {  // 6 tap.
     SetupTaps<6>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
     } else {
-      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
-  } else if (filter_index == 2) {  // 8 tap.
+  } else if (vertical_taps == 8) {  // 8 tap.
     SetupTaps<8>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
     } else {
-      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
-  } else if (filter_index == 3) {  // 2 tap.
+  } else if (vertical_taps == 2) {  // 2 tap.
     SetupTaps<2>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
     } else {
-      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
-  } else if (filter_index == 4) {  // 4 tap.
+  } else {  // 4 tap
     SetupTaps<4>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
     } else {
       FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
                         taps);
     }
-  } else {
-    // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
-    // See convolve_neon.cc
-    SetupTaps<4>(&v_filter, taps);
-
-    if (width == 2) {
-      FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
-    } else if (width == 4) {
-      FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
-    } else {
-      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
-                        taps);
-    }
   }
 }
 
-void ConvolveCompoundCopy_SSE4(
+void ConvolveCompoundCopy_SSE4_1(
     const void* LIBGAV1_RESTRICT const reference,
     const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
     const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
@@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4(
           _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
       const __m128i v_dest_hi =
           _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
-      // TODO(slavarnway): Investigate using aligned stores.
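      // The copy is stored in the compound prediction domain: each widened
      // pixel is shifted up by |kRoundBitsVertical| so it matches the
      // fixed-point scale produced by the filtering compound paths.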
      StoreUnaligned16(&dest[x], v_dest_lo);
      StoreUnaligned16(&dest[x + 8], v_dest_hi);
      x += 16;
@@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1(
    const int vertical_filter_id, const int width, const int height,
    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
  const int filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(filter_index, vertical_filter_id);
  const ptrdiff_t src_stride = reference_stride;
  const auto* src = static_cast<const uint8_t*>(reference) -
                    (vertical_taps / 2 - 1) * src_stride;
@@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1(
  const __m128i v_filter =
      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
 
-  if (filter_index < 2) {  // 6 tap.
+  if (vertical_taps == 6) {  // 6 tap.
    SetupTaps<6>(&v_filter, taps);
    if (width == 4) {
-      FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
+      FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
    } else {
-      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps);
    }
-  } else if (filter_index == 2) {  // 8 tap.
+  } else if (vertical_taps == 8) {  // 8 tap.
    SetupTaps<8>(&v_filter, taps);
-
    if (width == 4) {
-      FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
+      FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
    } else {
-      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps);
    }
-  } else if (filter_index == 3) {  // 2 tap.
+  } else if (vertical_taps == 2) {  // 2 tap.
    SetupTaps<2>(&v_filter, taps);
-
    if (width == 4) {
-      FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
    } else {
-      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps);
    }
-  } else if (filter_index == 4) {  // 4 tap.
+  } else {  // 4 tap
    SetupTaps<4>(&v_filter, taps);
-
    if (width == 4) {
-      FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
    } else {
      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps);
    }
-  } else {
-    SetupTaps<4>(&v_filter, taps);
-
-    if (width == 4) {
-      FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                    height, taps);
-    } else {
-      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
-                                              width, height, taps);
-    }
  }
 }
 
@@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1(
  // Similarly for height.
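  // The horizontal pass below produces height + vertical_taps - 1 rows of
  // intermediate data, so the vertical pass has the extra context rows its
  // taps reach above and below the block.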
  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int vertical_taps =
+      GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
  const int intermediate_height = height + vertical_taps - 1;
  const ptrdiff_t src_stride = reference_stride;
  const auto* const src = static_cast<const uint8_t*>(reference) -
@@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
                        source);
      StoreLo8(intermediate,
               RightShiftWithRounding_S16(
-                   SumOnePassTaps<filter_index>(source, taps),
+                   SumOnePassTaps<num_taps>(source, taps),
                   kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate += kIntermediateStride;
@@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
      PrepareSourceVectors(src_x, packed_indices, source);
 
      // Shift by one less because the taps are halved.
-      StoreAligned16(
-          intermediate_x,
-          RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
-                                     kInterRoundBitsHorizontal - 1));
+      StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
+                                         SumOnePassTaps<num_taps>(source, taps),
+                                         kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (--y != 0);
@@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
  alignas(16) int16_t
      intermediate_result[kIntermediateAllocWidth *
                          (2 * kIntermediateAllocWidth + kSubPixelTaps)];
-  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+  const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
  const int intermediate_height =
      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
       kScaleSubPixelBits) +
@@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
  // inputs in each iteration on large blocks. When step_x is large, we need a
  // second register and alignr in order to gather all filter inputs.
  // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
-  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+  const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
  const int kernel_start_ceiling = 16 - num_horiz_taps;
  // This truncated quotient |grade_x_threshold| selects |step_x| such that:
  // (step_x * 7) >> kScaleSubPixelBits < single load limit
@@ -1891,7 +1860,7 @@ void Init8bpp() {
  dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
  dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
 
-  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
  dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc
index 550d6a4..5548c5b 100644
--- a/src/dsp/x86/convolve_sse4.inc
+++ b/src/dsp/x86/convolve_sse4.inc
@@ -18,20 +18,63 @@
 
 #include "src/dsp/convolve.inc"
 
+// This version checks for the special cases when filter_index == 1.
+int GetNumTapsInFilter(const int filter_index, const int filter_id) {
+  if (filter_index == 0) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    return 6;
+  }
+
+  if (filter_index == 1) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
+         (filter_id == 8) | (filter_id == 9)) != 0) {
+      return 6;
+    }
+    // When |filter_index| == 1, the |filter_id| values not listed above map to
+    // 4 tap filters.
+    return 4;
+  }
+
+  if (filter_index == 2) {
+    // kInterpolationFilterEightTapSharp
+    return 8;
+  }
+
+  if (filter_index == 3) {
+    // kInterpolationFilterBilinear
+    return 2;
+  }
+
+  assert(filter_index > 3);
+  // For small sizes (width/height <= 4) the large filters are replaced with 4
+  // tap options.
+  // If the original filters were |kInterpolationFilterEightTap| or
+  // |kInterpolationFilterEightTapSharp| then it becomes
+  // |kInterpolationFilterSwitchable|.
+  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+  // tap filter.
+  return 4;
+}
+
 // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
 // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
 // sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
 __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
   __m128i sum;
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
     const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
     const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
     sum = _mm_add_epi16(v_madd_21, v_madd_43);
     sum = _mm_add_epi16(sum, v_madd_65);
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
     const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
@@ -40,7 +83,7 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
     const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
     const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
     sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
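     // The bilinear filter has only two non-zero taps, stored in the k4k3
     // slot, so a single maddubs over the _43 source pair computes the sum.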
     sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
   } else {
@@ -52,13 +95,13 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
   return sum;
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                              const __m128i* const v_tap) {
   // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
   const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
 
-  if (filter_index == 3) {
+  if (num_taps == 2) {
     // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
     const __m128i v_src_43 = _mm_shuffle_epi8(
         v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
@@ -79,10 +122,10 @@ __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
   return v_sum_5432;
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                                 const __m128i* const v_tap) {
-  __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+  __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
 
   // Normally the Horizontal pass does the downshift in two passes:
   // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -95,11 +138,10 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
   return _mm_packus_epi16(sum, sum);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
                                 const __m128i* const v_tap) {
-  const __m128i sum =
-      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+  const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
 
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
@@ -411,36 +453,34 @@ __m128i Compound1DShift(const __m128i sum) {
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
-template <int filter_index>
+template <int num_taps>
 __m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
   __m128i v_src[4];
 
-  if (filter_index < 2) {
+  if (num_taps == 6) {
     // 6 taps.
     v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
     v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
     v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
-  } else if (filter_index == 2) {
+  } else if (num_taps == 8) {
     // 8 taps.
     v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
     v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
     v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
     v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
-  } else if (filter_index == 3) {
+  } else if (num_taps == 2) {
     // 2 taps.
     v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
-  } else if (filter_index > 3) {
+  } else {
     // 4 taps.
     v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
     v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
   }
-  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
   return sum;
 }
 
-// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
-// 2D version.
-template +template void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int height, const __m128i* const v_tap) { @@ -468,7 +508,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 10 11 12 13 20 21 22 23 srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i sums = SumVerticalTaps(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -515,7 +555,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 30 31 32 33 40 41 42 43 srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i sums = SumVerticalTaps(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -574,7 +614,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 50 51 52 53 60 61 62 63 srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i sums = SumVerticalTaps(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -645,7 +685,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 70 71 72 73 80 81 82 83 srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); - const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i sums = SumVerticalTaps(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -672,7 +712,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, } } -template +template void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int height, const __m128i* const v_tap) { @@ -705,7 +745,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, // 10 11 20 21 30 31 40 41 srcs[1] = _mm_srli_si128(srcs_0_2, 2); // This uses srcs[0]..srcs[1]. - const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i sums = SumVerticalTaps(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -760,7 +800,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[3] = _mm_srli_si128(srcs_0_4, 6); // This uses srcs[0]..srcs[3]. - const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i sums = SumVerticalTaps(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -829,7 +869,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[5] = _mm_srli_si128(srcs_4_8, 2); // This uses srcs[0]..srcs[5]. - const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i sums = SumVerticalTaps(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -909,7 +949,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[7] = _mm_srli_si128(srcs_4_8, 6); // This uses srcs[0]..srcs[7]. 
- const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i sums = SumVerticalTaps(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc index c813df4..8c32117 100644 --- a/src/dsp/x86/distance_weighted_blend_sse4.cc +++ b/src/dsp/x86/distance_weighted_blend_sse4.cc @@ -34,54 +34,50 @@ namespace low_bitdepth { namespace { constexpr int kInterPostRoundBit = 4; +constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1); inline __m128i ComputeWeightedAverage8(const __m128i& pred0, const __m128i& pred1, - const __m128i& weights) { - // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. - const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1); - const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights); - const __m128i result_lo = - RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4); - - const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1); - const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights); - const __m128i result_hi = - RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4); - - return _mm_packs_epi32(result_lo, result_hi); + const __m128i& weight) { + // Given: p0,p1 in range [-5132,9212] and weights w0,w1 in [0,16] with + // w1 = 16 - w0 + // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >> + // 8(=kInterPostRoundBit + 4) + // The formula is manipulated to avoid lengthening to 32 bits. + // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1 + // = (p0 - p1) * w0 + 16 * p1 + // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808. + const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1); + // ((p0 - p1) * (w0 << 12)) >> 16 == ((p0 - p1) * w0) >> 4 + const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight); + // ((p0 - p1) * w0 >> 4) + p1 + const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1); + // (x << 11) >> 15 == x >> 4 + const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust); + // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4 + return _mm_mulhrs_epi16(upscaled_average, right_shift_prep); } template inline void DistanceWeightedBlend4xH_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, - const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. + const __m128i weights = _mm_set1_epi16(weight << 11); for (int y = 0; y < height; y += 4) { - // TODO(b/150326556): Use larger loads.
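The rewritten ComputeWeightedAverage8 above trades the 32-bit madd path for a 16-bit mulhi/mulhrs sequence. Its algebra can be sanity-checked scalar-wise (an illustrative sketch under the stated input ranges, not part of the patch; function names are hypothetical, and arithmetic right shift is assumed for negative values, matching the intrinsics): both forms compute (p0 * w0 + p1 * w1 + 128) >> 8 exactly, because the intermediate floor divisions compose without error.

#include <cstdint>

// Direct 32-bit form of the distance-weighted blend.
int16_t BlendDirect(const int16_t p0, const int16_t p1, const int w0) {
  return static_cast<int16_t>((p0 * w0 + p1 * (16 - w0) + 128) >> 8);
}

// 16-bit form mirroring the intrinsics: the >> 16 is _mm_mulhi_epi16 and
// the final rounded >> 4 is _mm_mulhrs_epi16 against 1 << 11.
int16_t BlendViaMulhi(const int16_t p0, const int16_t p1, const int w0) {
  const int weighted_diff = (((p0 - p1) << 1) * (w0 << 11)) >> 16;
  const int upscaled_average = weighted_diff + p1;
  return static_cast<int16_t>((upscaled_average + 8) >> 4);
}

Looping p0 and p1 over [-5132, 9212] and w0 over [0, 16] shows the two functions agree everywhere in that range, which is the property the vector code depends on.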
- const __m128i src_00 = LoadLo8(pred_0); - const __m128i src_10 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - __m128i src_0 = LoadHi8(src_00, pred_0); - __m128i src_1 = LoadHi8(src_10, pred_1); - pred_0 += 4; - pred_1 += 4; - const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights); - - const __m128i src_01 = LoadLo8(pred_0); - const __m128i src_11 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - src_0 = LoadHi8(src_01, pred_0); - src_1 = LoadHi8(src_11, pred_1); - pred_0 += 4; - pred_1 += 4; - const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights); + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights); const __m128i result_pixels = _mm_packus_epi16(res0, res1); Store4(dst, result_pixels); @@ -101,11 +97,11 @@ inline void DistanceWeightedBlend4xH_SSE4_1( template inline void DistanceWeightedBlend8xH_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, - const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. + const __m128i weights = _mm_set1_epi16(weight << 11); for (int y = 0; y < height; y += 2) { const __m128i src_00 = LoadAligned16(pred_0); @@ -130,11 +126,12 @@ inline void DistanceWeightedBlend8xH_SSE4_1( inline void DistanceWeightedBlendLarge_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, const int height, - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + const int width, const int height, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { auto* dst = static_cast(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. 
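+ // With the doubled difference formed inside ComputeWeightedAverage8, + // _mm_mulhi_epi16 then computes ((p0 - p1) << 1) * (w0 << 11) >> 16, + // which equals ((p0 - p1) * w0) >> 4.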
+ const __m128i weights = _mm_set1_epi16(weight << 11); int y = height; do { @@ -162,23 +159,24 @@ inline void DistanceWeightedBlendLarge_SSE4_1( void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, + const uint8_t /*weight_1*/, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { const auto* pred_0 = static_cast(prediction_0); const auto* pred_1 = static_cast(prediction_1); + const uint8_t weight = weight_0; if (width == 4) { if (height == 4) { - DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest, + dest_stride); } else if (height == 8) { - DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest, + dest_stride); } else { assert(height == 16); - DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest, + dest_stride); } return; } @@ -186,28 +184,28 @@ void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, if (width == 8) { switch (height) { case 4: - DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest, + dest_stride); return; case 8: - DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest, + dest_stride); return; case 16: - DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest, + dest_stride); return; default: assert(height == 32); - DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest, + dest_stride); return; } } - DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width, - height, dest, dest_stride); + DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest, + dest_stride); } void Init8bpp() { @@ -273,27 +271,19 @@ inline void DistanceWeightedBlend4xH_SSE4_1( int y = height; do { - const __m128i src_00 = LoadLo8(pred_0); - const __m128i src_10 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - __m128i src_0 = LoadHi8(src_00, pred_0); - __m128i src_1 = LoadHi8(src_10, pred_1); - pred_0 += 4; - pred_1 += 4; + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; const __m128i res0 = - ComputeWeightedAverage8(src_0, src_1, weight0, weight1); - - const __m128i src_01 = LoadLo8(pred_0); - const __m128i src_11 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - src_0 = LoadHi8(src_01, pred_0); - src_1 = LoadHi8(src_11, pred_1); - pred_0 += 4; - pred_1 += 4; + ComputeWeightedAverage8(src_00, src_10, weight0, weight1); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; const __m128i res1 = - ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + ComputeWeightedAverage8(src_01, src_11, weight0, weight1); StoreLo8(dst, res0); dst += dest_stride; diff --git 
a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc index 9ece947..59d18a6 100644 --- a/src/dsp/x86/film_grain_sse4.cc +++ b/src/dsp/x86/film_grain_sse4.cc @@ -23,14 +23,15 @@ #include #include -#include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/film_grain_common.h" #include "src/dsp/x86/common_sse4.h" +#include "src/utils/array_2d.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" -#include "src/utils/logging.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" namespace libgav1 { namespace dsp { @@ -165,7 +166,7 @@ void BlendNoiseWithImageLuma_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_width; x += 8) { + for (; x + 8 <= safe_width; x += 8) { const __m128i orig = LoadSource(&in_y_row[x]); const __m128i scaling = GetScalingFactors(scaling_lut_y, &in_y_row[x]); @@ -181,6 +182,7 @@ void BlendNoiseWithImageLuma_SSE4_1( // Prevent arbitrary indices from entering GetScalingFactors. memset(luma_buffer, 0, sizeof(luma_buffer)); const int valid_range = width - x; + assert(valid_range < 8); memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0])); luma_buffer[valid_range] = in_y_row[width - 1]; const __m128i orig = LoadSource(&in_y_row[x]); @@ -239,7 +241,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_chroma_width; x += 8) { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const __m128i average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); @@ -252,8 +254,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); } - // This section only runs if width % (8 << sub_x) != 0. It should never run - // on 720p and above. if (x < chroma_width) { // Prevent huge indices from entering GetScalingFactors due to // uninitialized values. This is not a problem in 8bpp because the table @@ -365,7 +365,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_chroma_width; x += 8) { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const __m128i average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc index e642aee..bc61745 100644 --- a/src/dsp/x86/intrapred_directional_sse4.cc +++ b/src/dsp/x86/intrapred_directional_sse4.cc @@ -624,14 +624,6 @@ inline void DirectionalZone2FromLeftCol_4x4_SSE4_1( } } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. 
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - template inline void DirectionalZone2FromLeftCol_8x8_SSE4_1( uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column, @@ -729,6 +721,103 @@ inline void DirectionalZone1Blend_8xH( } } +template +inline void DirectionalZone2_8xH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base, + const __m128i& left_y) { + const int upsample_left_shift = static_cast(upsampled_left); + const int upsample_top_shift = static_cast(upsampled_top); + + // Loop incrementers for moving by block (8x8). This function handles blocks + // with height 4 as well. They are calculated in one pass so these variables + // do not get used. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + const __m128i xstep8_vect = _mm_set1_epi16(xstep8); + + // Cover 8x4 case. + const int min_height = (height == 4) ? 4 : 8; + + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x; + + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1); + DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + DirectionalZone1_4xH(dst_x + 4, stride, + top_row + ((x + 4) << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + if (max_top_only_y == height) return; + + const __m128i max_shift = _mm_set1_epi8(32); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + const __m128i dest_index_x = + _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); + const __m128i sampler_top = + upsampled_top + ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); + // All rows from |min_left_only_y| down for this set of columns, only need + // |left_column| to compute. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + + __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); + __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); + int top_x = -xstep_y; + + const auto base_left_y = static_cast(_mm_extract_epi16(left_y, 0)); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), + top_x -= xstep8) { + // Pick up from the last y-value, using the 10% slower but secure method for + // left prediction. 
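+ // |shuffle_left_column| selects the single-vector byte-shuffle path; + // the DirectionalZone3_8xH branch below is the slower, always-safe + // fallback.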
+ if (shuffle_left_column) { + DirectionalZone2FromLeftCol_8x8_SSE4_1( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y); + } else { + DirectionalZone3_8xH( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep); + } + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + DirectionalZone1Blend_8xH( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8xH( + dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift), + base_left_y, -ystep); + } +} + // 7.11.2.4 (8) 90 < angle < 180 // The strategy for this function is to know how many blocks can be processed // with just pixels from |top_ptr|, then handle mixed blocks, then handle only @@ -742,29 +831,11 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, const int width, const int height, const int xstep, const int ystep) { auto* dst = static_cast(dest); - const int upsample_left_shift = static_cast(upsampled_left); const int upsample_top_shift = static_cast(upsampled_top); - const __m128i max_shift = _mm_set1_epi8(32); - const ptrdiff_t stride8 = stride << 3; - const __m128i dest_index_x = - _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); - const __m128i sampler_top = - upsampled_top - ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute. This assumes minimum |xstep| is 3. + // All columns from |min_top_only_x| to the right will only need |top_row| + // to compute. This assumes minimum |xstep| is 3. const int min_top_only_x = std::min((height * xstep) >> 6, width); - // For steep angles, the source pixels from left_column may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - const int max_shuffle_height = - std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]); - - const int xstep8 = xstep << 3; - const __m128i xstep8_vect = _mm_set1_epi16(xstep8); // Accumulate xstep across 8 rows. const __m128i xstep_dup = _mm_set1_epi16(-xstep); const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); @@ -787,105 +858,39 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, // offset. Following values need the full ystep as a relative offset. const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); const __m128i ystep_dup = _mm_set1_epi16(-ystep); + const __m128i dest_index_x = + _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); left_y = _mm_add_epi16(ystep_init, left_y); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); const __m128i increment_top8 = _mm_set1_epi16(8 << 6); int x = 0; - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. + for (int left_offset = -left_base_increment; x < min_shuffle_x; + x += 8, + xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), + // Watch left_y because it can still get big. + left_y = _mm_add_epi16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_for_shift, xstep_bounds_base, left_y); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), // Watch left_y because it can still get big. left_y = _mm_add_epi16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - DirectionalZone1_4xH(dst_x + 4, stride, - top_row + ((x + 4) << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); - // All rows from |min_left_only_y| down for this set of columns, only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. 
- const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); - __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); - int top_x = -xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - DirectionalZone2FromLeftCol_8x8_SSE4_1( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), left_y); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - DirectionalZone1Blend_8xH( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Pick up from the last y-value, using the 10% slower but secure method for - // left prediction. - const auto base_left_y = static_cast(_mm_extract_epi16(left_y, 0)); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - - DirectionalZone3_8xH( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - DirectionalZone1Blend_8xH( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8xH( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - } + DirectionalZone2_8xH( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_for_shift, xstep_bounds_base, left_y); } for (; x < width; x += 4) { DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), @@ -952,8 +957,8 @@ inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride, left_offset -= left_base_increment4) { uint8_t* dst_x = dst + x; - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4; + // Round down to the nearest multiple of 4. 
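+ // (& ~3 clears only bits 0 and 1; the previous 0xFFFFFFF4 mask also + // cleared bit 3, so a value such as 12 was wrongly reduced to 4.)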
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3; DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), max_top_only_y, -xstep, upsampled_top); int y = max_top_only_y; diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc index 3363f0e..b4df072 100644 --- a/src/dsp/x86/loop_restoration_sse4.cc +++ b/src/dsp/x86/loop_restoration_sse4.cc @@ -2088,6 +2088,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3]; + ma5[1] = _mm_setzero_si128(); // Quiets -Wmaybe-uninitialized with gcc. s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width); sq[0][0] = SquareLo8(s[0][0]); diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc index a18444b..833814c 100644 --- a/src/dsp/x86/mask_blend_sse4.cc +++ b/src/dsp/x86/mask_blend_sse4.cc @@ -30,35 +30,81 @@ namespace libgav1 { namespace dsp { -namespace low_bitdepth { namespace { +template +inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i one = _mm_set1_epi8(1); + const __m128i mask_val_0 = LoadUnaligned16(mask); + const __m128i mask_val_1 = LoadUnaligned16(mask + stride); + const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1); + const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); + return RightShiftWithRounding_U16(mask_0, 2); + } + if (subsampling_x == 1) { + const __m128i row_vals = LoadUnaligned16(mask); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); + const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); + __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRounding_U16(subsampled_mask, 1); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const __m128i mask_val = LoadLo8(mask); + return _mm_cvtepu8_epi16(mask_val); +} + +// Imitate behavior of ARM vtrn1q_u64. +inline __m128i Transpose1_U64(const __m128i a, const __m128i b) { + return _mm_castps_si128( + _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); +} + +// Imitate behavior of ARM vtrn2q_u64. +inline __m128i Transpose2_U64(const __m128i a, const __m128i b) { + return _mm_castps_si128( + _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); +} + // Width can only be 4 when it is subsampled from a block of width 8, hence // subsampling_x is always 1 when this function is called. template -inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +inline __m128i GetMask4x2(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i mask_val_01 = LoadUnaligned16(mask); + // Stride is fixed because this is the smallest block size. + const __m128i mask_val_23 = LoadUnaligned16(mask + 16); + // Transpose rows to add row 0 to row 1, and row 2 to row 3.
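+ // movelh gathers rows 0 and 2 while movehl gathers rows 1 and 3, so + // the saturating add below yields row0+row1 | row2+row3 ahead of the + // pairwise maddubs sum and the rounded shift by 2.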
+ const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23); + const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01); + const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13); + const __m128i one = _mm_set1_epi8(1); + const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); + return RightShiftWithRounding_U16(mask_0, 2); + } + return GetMask8(mask, 0); +} + +template +inline __m128i GetInterIntraMask4x2(const uint8_t* mask, + ptrdiff_t mask_stride) { if (subsampling_x == 1) { - const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); - const __m128i mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y))); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - if (subsampling_y == 1) { - const __m128i next_mask_val_0 = - _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride)); - const __m128i next_mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3)); - subsampled_mask = _mm_add_epi16( - subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1)); - } - return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y); + return GetMask4x2(mask); } + // When using intra or difference weighted masks, the function doesn't use + // subsampling, so |mask_stride| may be 4 or 8. + assert(subsampling_y == 0 && subsampling_x == 0); const __m128i mask_val_0 = Load4(mask); const __m128i mask_val_1 = Load4(mask + mask_stride); return _mm_cvtepu8_epi16( _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4))); } +} // namespace + +namespace low_bitdepth { +namespace { + // This function returns a 16-bit packed mask to fit in _mm_madd_epi16. // 16-bit is also the lowest packing for hadd, but without subsampling there is // an unfortunate conversion required. @@ -87,38 +133,6 @@ inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask, return _mm_cvtepu8_epi16(mask_val); } -// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because, -// when is_inter_intra is true, the prediction values are brought to 8-bit -// packing as well. -template -inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t stride) { - if (subsampling_x == 1) { - const __m128i row_vals = LoadUnaligned16(mask); - - const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); - const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - - if (subsampling_y == 1) { - const __m128i next_row_vals = LoadUnaligned16(mask + stride); - const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals); - const __m128i next_mask_val_1 = - _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8)); - subsampled_mask = _mm_add_epi16( - subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1)); - } - const __m128i ret = - RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y); - return _mm_packus_epi16(ret, ret); - } - assert(subsampling_y == 0 && subsampling_x == 0); - // Unfortunately there is no shift operation for 8-bit packing, or else we - // could return everything with 8-bit packing. 
- const __m128i mask_val = LoadLo8(mask); - return mask_val; -} - inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, const int16_t* LIBGAV1_RESTRICT const pred_1, const __m128i pred_mask_0, @@ -149,15 +163,14 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, } template -inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, - const uint8_t* LIBGAV1_RESTRICT mask, - const ptrdiff_t mask_stride, - uint8_t* LIBGAV1_RESTRICT dst, - const ptrdiff_t dst_stride) { +inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT mask, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const __m128i mask_inverter = _mm_set1_epi16(64); - __m128i pred_mask_0 = - GetMask4x2(mask, mask_stride); + __m128i pred_mask_0 = GetMask4x2(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -166,30 +179,30 @@ inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2(mask, mask_stride); + pred_mask_0 = GetMask4x2(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); } template -inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, - const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int height, - uint8_t* LIBGAV1_RESTRICT dst, - const ptrdiff_t dst_stride) { +inline void MaskBlending4xH_SSE4_1( + const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { + assert(subsampling_x == 1); const uint8_t* mask = mask_ptr; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; if (height == 4) { - MaskBlending4x4_SSE4( - pred_0, pred_1, mask, mask_stride, dst, dst_stride); + MaskBlending4x4_SSE4_1(pred_0, pred_1, mask, + dst, dst_stride); return; } const __m128i mask_inverter = _mm_set1_epi16(64); int y = 0; do { - __m128i pred_mask_0 = - GetMask4x2(mask, mask_stride); + __m128i pred_mask_0 = GetMask4x2(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, @@ -199,7 +212,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2(mask, mask_stride); + pred_mask_0 = GetMask4x2(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -208,7 +221,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2(mask, mask_stride); + pred_mask_0 = GetMask4x2(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -217,7 +230,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << 
(1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2(mask, mask_stride); + pred_mask_0 = GetMask4x2(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -230,21 +243,21 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, } template -inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - const ptrdiff_t /*prediction_stride_1*/, - const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int width, - const int height, void* LIBGAV1_RESTRICT dest, - const ptrdiff_t dst_stride) { +inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t /*prediction_stride_1*/, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, + const ptrdiff_t mask_stride, const int width, + const int height, void* LIBGAV1_RESTRICT dest, + const ptrdiff_t dst_stride) { auto* dst = static_cast(dest); const auto* pred_0 = static_cast(prediction_0); const auto* pred_1 = static_cast(prediction_1); const ptrdiff_t pred_stride_0 = width; const ptrdiff_t pred_stride_1 = width; if (width == 4) { - MaskBlending4xH_SSE4( - pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride); + MaskBlending4xH_SSE4_1( + pred_0, pred_1, mask_ptr, height, dst, dst_stride); return; } const uint8_t* mask = mask_ptr; @@ -293,7 +306,6 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2( const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1); const __m128i pred_val_0 = LoadLo8(pred_0); - // TODO(b/150326556): One load. __m128i pred_val_1 = Load4(pred_1); pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4), pred_val_1); @@ -309,16 +321,16 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2( } template -inline void InterIntraMaskBlending8bpp4x4_SSE4( +inline void InterIntraMaskBlending8bpp4x4_SSE4_1( const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride) { const __m128i mask_inverter = _mm_set1_epi8(64); const __m128i pred_mask_u16_first = - GetMask4x2(mask, mask_stride); + GetInterIntraMask4x2(mask, mask_stride); mask += mask_stride << (1 + subsampling_y); const __m128i pred_mask_u16_second = - GetMask4x2(mask, mask_stride); + GetInterIntraMask4x2(mask, mask_stride); mask += mask_stride << (1 + subsampling_y); __m128i pred_mask_1 = _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second); @@ -335,26 +347,26 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4( } template -inline void InterIntraMaskBlending8bpp4xH_SSE4( +inline void InterIntraMaskBlending8bpp4xH_SSE4_1( const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int height) { const uint8_t* mask = mask_ptr; if (height == 4) { - InterIntraMaskBlending8bpp4x4_SSE4( + InterIntraMaskBlending8bpp4x4_SSE4_1( pred_0, pred_1, pred_stride_1, mask, mask_stride); return; } int y = 0; do { - InterIntraMaskBlending8bpp4x4_SSE4( + InterIntraMaskBlending8bpp4x4_SSE4_1( pred_0, pred_1, pred_stride_1, mask, mask_stride); pred_0 += 4 << 2; pred_1 += pred_stride_1 << 2; mask += mask_stride << (2 + subsampling_y); - InterIntraMaskBlending8bpp4x4_SSE4( + InterIntraMaskBlending8bpp4x4_SSE4_1( pred_0, pred_1, pred_stride_1, mask, 
mask_stride); pred_0 += 4 << 2; pred_1 += pred_stride_1 << 2; @@ -363,14 +375,31 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4( } while (y < height); } +// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because, +// when is_inter_intra is true, the prediction values are brought to 8-bit +// packing as well. +template +inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t stride) { + if (subsampling_x == 1) { + const __m128i ret = GetMask8(mask, stride); + return _mm_packus_epi16(ret, ret); + } + assert(subsampling_y == 0 && subsampling_x == 0); + // Unfortunately there is no shift operation for 8-bit packing, or else we + // could return everything with 8-bit packing. + const __m128i mask_val = LoadLo8(mask); + return mask_val; +} + template -void InterIntraMaskBlend8bpp_SSE4( +void InterIntraMaskBlend8bpp_SSE4_1( const uint8_t* LIBGAV1_RESTRICT prediction_0, uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int width, const int height) { if (width == 4) { - InterIntraMaskBlending8bpp4xH_SSE4( + InterIntraMaskBlending8bpp4xH_SSE4_1( prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, height); return; @@ -382,7 +411,7 @@ void InterIntraMaskBlend8bpp_SSE4( int x = 0; do { const __m128i pred_mask_1 = - GetInterIntraMask8( + GetInterIntraMask8bpp8( mask + (x << subsampling_x), mask_stride); // 64 - mask const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1); @@ -411,24 +440,24 @@ void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444) - dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>; + dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422) - dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>; + dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420) - dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>; + dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>; #endif // The is_inter_intra index of mask_blend[][] is replaced by // inter_intra_mask_blend_8bpp[] in 8-bit. #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444) - dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>; + dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422) - dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>; + dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420) - dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>; + dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>; #endif } @@ -443,14 +472,6 @@ constexpr int kMax10bppSample = (1 << 10) - 1; constexpr int kMaskInverse = 64; constexpr int kRoundBitsMaskBlend = 4; -inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits, - const __m128i zero) { - // Shift out all but the last bit. - const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); - // Avg with zero will shift by 1 and round. 
- return _mm_avg_epu16(v_tmp_d, zero); -} - inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits, const __m128i shift) { const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift); @@ -458,53 +479,31 @@ inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits, } template -inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride, - const __m128i zero) { - if (subsampling_x == 1) { - if (subsampling_y == 0) { - const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); - const __m128i mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y))); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); - } - const __m128i one = _mm_set1_epi8(1); - const __m128i mask_val_0 = - LoadHi8(LoadLo8(mask), mask + (mask_stride << 1)); - const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride), - mask + (mask_stride << 1) + mask_stride); - const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1); - const __m128i subsampled_mask = _mm_maddubs_epi16(add, one); - return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero); +inline __m128i GetMask4x2(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i mask_row_01 = LoadUnaligned16(mask); + const __m128i mask_row_23 = LoadUnaligned16(mask + 16); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01); + const __m128i mask_val_1 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8)); + const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23); + const __m128i mask_val_3 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8)); + const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2); + const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3); + const __m128i subsampled_mask = + _mm_add_epi16(subsampled_mask_02, subsampled_mask_13); + return RightShiftWithRounding_U16(subsampled_mask, 2); } - assert(subsampling_y == 0 && subsampling_x == 0); - const __m128i mask_val_0 = Load4(mask); - const __m128i mask_val_1 = Load4(mask + mask_stride); - return _mm_cvtepu8_epi16( - _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4))); -} - -template -inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride, - const __m128i zero) { if (subsampling_x == 1) { - if (subsampling_y == 0) { - const __m128i row_vals = LoadUnaligned16(mask); - const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); - const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); - } - const __m128i one = _mm_set1_epi8(1); - const __m128i mask_val_0 = LoadUnaligned16(mask); - const __m128i mask_val_1 = LoadUnaligned16(mask + stride); - const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1); - const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); - return RightShiftWithRoundingZero_U16(mask_0, 2, zero); + const __m128i mask_row_01 = LoadUnaligned16(mask); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01); + const __m128i mask_val_1 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8)); + const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRounding_U16(subsampled_mask, 1); } - assert(subsampling_y == 0 && subsampling_x == 0); - const __m128i mask_val = LoadLo8(mask); - return _mm_cvtepu8_epi16(mask_val); + return _mm_cvtepu8_epi16(LoadLo8(mask)); } 
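Both GetMask4x2 flavors above reduce to the same scalar arithmetic (a minimal sketch, assuming AV1 mask values in [0, 64], where the saturating byte add cannot saturate; helper names are hypothetical and not part of the patch): 4:2:0 averages each 2x2 block of mask bytes with rounding, and 4:2:2 averages each horizontal pair.

#include <cstddef>
#include <cstdint>

// One 4:2:0 output value: matches adds_epu8 + maddubs(1) + rounded >> 2.
uint16_t SubsampleMask420(const uint8_t* mask, const ptrdiff_t stride,
                          const int x) {
  const int sum = mask[2 * x] + mask[2 * x + 1] + mask[stride + 2 * x] +
                  mask[stride + 2 * x + 1];
  return static_cast<uint16_t>((sum + 2) >> 2);
}

// One 4:2:2 output value: matches cvtepu8 + hadd_epi16 + rounded >> 1.
uint16_t SubsampleMask422(const uint8_t* mask, const int x) {
  return static_cast<uint16_t>((mask[2 * x] + mask[2 * x + 1] + 1) >> 1);
}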
inline void WriteMaskBlendLine10bpp4x2_SSE4_1( @@ -558,12 +557,10 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); const __m128i offset = _mm_set1_epi32(kCompoundOffset); const __m128i max = _mm_set1_epi16(kMax10bppSample); - __m128i pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + __m128i pred_mask_0 = GetMask4x2(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, shift4, dst, @@ -573,8 +570,7 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, shift4, dst, @@ -595,7 +591,6 @@ inline void MaskBlend10bpp4xH_SSE4_1( return; } const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const uint8_t pred0_stride2 = 4 << 1; const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); @@ -605,8 +600,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); int y = height; do { - __m128i pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + __m128i pred_mask_0 = GetMask4x2(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, @@ -617,8 +611,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -628,8 +621,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -639,8 +631,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -675,7 +666,6 @@ inline void MaskBlend10bpp_SSE4_1( } const uint8_t* mask = mask_ptr; const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; const __m128i offset = _mm_set1_epi32(kCompoundOffset); const __m128i max = _mm_set1_epi16(kMax10bppSample); @@ -685,7 +675,7 @@ inline void MaskBlend10bpp_SSE4_1( int x = 0; do { const __m128i pred_mask_0 = GetMask8( - mask + (x << subsampling_x), mask_stride, zero); + mask + (x << subsampling_x), mask_stride); const 
__m128i pred_val_0 = LoadUnaligned16(pred_0 + x); const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); // 64 - mask @@ -729,7 +719,6 @@ inline void MaskBlend10bpp_SSE4_1( mask += mask_stride_ss; } while (--y != 0); } - inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( const uint16_t* LIBGAV1_RESTRICT prediction_0, const uint16_t* LIBGAV1_RESTRICT prediction_1, @@ -764,9 +753,8 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1( uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); - const __m128i zero = _mm_setzero_si128(); __m128i pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + GetInterIntraMask4x2(mask, mask_stride); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, shift6, @@ -777,7 +765,7 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1( dst += dst_stride << 1; pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + GetInterIntraMask4x2(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, shift6, @@ -798,7 +786,6 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( return; } const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); const uint8_t pred0_stride2 = 4 << 1; const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; @@ -807,7 +794,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( int y = height; do { __m128i pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + GetInterIntraMask4x2(mask, mask_stride); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -818,7 +805,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + GetInterIntraMask4x2(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -829,7 +816,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + GetInterIntraMask4x2(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -840,7 +827,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - GetMask4x2(mask, mask_stride, zero); + GetInterIntraMask4x2(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -876,14 +863,13 @@ inline void InterIntraMaskBlend10bpp_SSE4_1( const uint8_t* mask = mask_ptr; const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); - const __m128i zero = _mm_setzero_si128(); const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; int y = height; do { int x = 0; do { const __m128i pred_mask_0 = GetMask8( - mask + (x << subsampling_x), mask_stride, zero); + mask + (x << subsampling_x), mask_stride); const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); 
const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); // 64 - mask diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc index 8ce23b4..f068ff3 100644 --- a/src/dsp/x86/obmc_sse4.cc +++ b/src/dsp/x86/obmc_sse4.cc @@ -39,8 +39,8 @@ namespace { inline void OverlapBlendFromLeft2xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 2; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); @@ -51,8 +51,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( int y = height; do { const __m128i pred_val = Load2x2(pred, pred + prediction_stride); - const __m128i obmc_pred_val = - Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride); + const __m128i obmc_pred_val = Load4(obmc_pred); const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); const __m128i result = @@ -71,8 +70,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( inline void OverlapBlendFromLeft4xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 4; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); @@ -85,15 +84,12 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( int y = height; do { const __m128i pred_val0 = Load4(pred); - const __m128i obmc_pred_val0 = Load4(obmc_pred); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; // Place the second row of each source in the second four bytes. 
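// (The slli/alignr pair parks row 0 in bytes 12-15 and realigns it to // bytes 0-3 with row 1 landing in bytes 4-7. The OBMC side no longer // needs this dance: its two 4-byte rows are contiguous, so one LoadLo8 // fetches both.)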
const __m128i pred_val = _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); - const __m128i obmc_pred_val = _mm_alignr_epi8( - Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); const __m128i result = RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); @@ -102,7 +98,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( const int second_row_result = _mm_extract_epi32(packed_result, 1); memcpy(pred, &second_row_result, sizeof(second_row_result)); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + obmc_pred += obmc_prediction_stride << 1; y -= 2; } while (y != 0); } @@ -110,8 +106,8 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( inline void OverlapBlendFromLeft8xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 8; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi8(64); @@ -121,16 +117,25 @@ inline void OverlapBlendFromLeft8xH_SSE4_1( const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); int y = height; do { - const __m128i pred_val = LoadLo8(pred); - const __m128i obmc_pred_val = LoadLo8(obmc_pred); - const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); - const __m128i result = - RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6); + + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6); - StoreLo8(pred, _mm_packus_epi16(result, result)); + const __m128i result = _mm_packus_epi16(result_lo, result_hi); + StoreLo8(pred, result); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (--y != 0); + StoreHi8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + y -= 2; + } while (y != 0); } void OverlapBlendFromLeft_SSE4_1( @@ -144,18 +149,15 @@ void OverlapBlendFromLeft_SSE4_1( assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } const __m128i mask_inverter = _mm_set1_epi8(64); @@ -192,8 +194,8 @@ void OverlapBlendFromLeft_SSE4_1( inline void OverlapBlendFromTop4xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - 
const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 4; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi16(64); @@ -212,13 +214,10 @@ inline void OverlapBlendFromTop4xH_SSE4_1( _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); const __m128i pred_val0 = Load4(pred); - const __m128i obmc_pred_val0 = Load4(obmc_pred); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; const __m128i pred_val = _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); - const __m128i obmc_pred_val = _mm_alignr_epi8( - Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val); const __m128i result = RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); @@ -227,7 +226,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1( Store4(pred - prediction_stride, packed_result); Store4(pred, _mm_srli_si128(packed_result, 4)); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + obmc_pred += obmc_prediction_stride << 1; y += 2; } while (y < compute_height); } @@ -235,8 +234,8 @@ inline void OverlapBlendFromTop4xH_SSE4_1( inline void OverlapBlendFromTop8xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 8; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const uint8_t* mask = kObmcMask + height - 2; @@ -244,20 +243,35 @@ inline void OverlapBlendFromTop8xH_SSE4_1( const int compute_height = height - (height >> 2); int y = compute_height; do { - const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]); + const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]); // 64 - mask - const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); - const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); - const __m128i pred_val = LoadLo8(pred); - const __m128i obmc_pred_val = LoadLo8(obmc_pred); - const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); - const __m128i result = - RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0); + const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0); - StoreLo8(pred, _mm_packus_epi16(result, result)); + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6); + + --y; + const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]); + // 64 - mask + const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1); + const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1); + + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6); + + const __m128i result = _mm_packus_epi16(result_lo, result_hi); + StoreLo8(pred, result); pred += prediction_stride; - obmc_pred += 
obmc_prediction_stride; - } while (--y != 0); + StoreHi8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + } while (--y > 0); } void OverlapBlendFromTop_SSE4_1( @@ -271,13 +285,11 @@ void OverlapBlendFromTop_SSE4_1( assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } @@ -333,8 +345,8 @@ constexpr int kRoundBitsObmcBlend = 6; inline void OverlapBlendFromLeft2xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 2; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -348,8 +360,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( int y = height; do { const __m128i pred_val = Load4x2(pred, pred + pred_stride); - const __m128i obmc_pred_val = - Load4x2(obmc_pred, obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val); const __m128i result = RightShiftWithRounding_U32( _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend); @@ -364,8 +375,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( inline void OverlapBlendFromLeft4xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 4; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -379,8 +390,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( int y = height; do { const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); - const __m128i obmc_pred_val = - LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); const __m128i result_lo = RightShiftWithRounding_U32( @@ -410,13 +420,11 @@ void OverlapBlendFromLeft10bpp_SSE4_1( assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } const __m128i mask_inverter = _mm_set1_epi8(64); @@ -452,8 +460,8 @@ void OverlapBlendFromLeft10bpp_SSE4_1( inline void OverlapBlendFromTop4xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* 
LIBGAV1_RESTRICT const obmc_prediction) {
+  constexpr int obmc_pred_stride = 4;
   uint16_t* pred = prediction;
   const uint16_t* obmc_pred = obmc_prediction;
   const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -473,8 +481,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
     const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));

     const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
-    const __m128i obmc_pred_val =
-        LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
     const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
     const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
     const __m128i result_lo = RightShiftWithRounding_U32(
@@ -505,8 +512,7 @@ void OverlapBlendFromTop10bpp_SSE4_1(
   assert(height >= 2);

   if (width == 4) {
-    OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
-                                  obmc_pred_stride);
+    OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
     return;
   }

diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
index 5830894..5498052 100644
--- a/src/dsp/x86/warp_sse4.cc
+++ b/src/dsp/x86/warp_sse4.cc
@@ -167,7 +167,7 @@ inline void WriteVerticalFilter(const __m128i filter[8],
 }

 template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
                            int delta, DestType* LIBGAV1_RESTRICT dest_row,
                            ptrdiff_t dest_stride) {
   int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -188,8 +188,8 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
 }

 template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4,
-                           int gamma, int delta,
+inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
+                           int64_t y4, int gamma, int delta,
                            DestType* LIBGAV1_RESTRICT dest_row,
                            ptrdiff_t dest_stride) {
   int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -249,7 +249,7 @@ inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,

 template <bool is_compound, typename DestType>
 inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
-                        ptrdiff_t source_stride, int source_width, int y4,
+                        ptrdiff_t source_stride, int source_width, int64_t y4,
                         int ix4, int iy4, int gamma, int delta,
                         int16_t intermediate_result_column[15],
                         DestType* LIBGAV1_RESTRICT dst_row,
@@ -291,7 +291,7 @@ inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,

 template <bool is_compound, typename DestType>
 inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
                         ptrdiff_t source_stride, int source_height, int alpha,
-                        int beta, int x4, int ix4, int iy4,
+                        int beta, int64_t x4, int ix4, int iy4,
                         int16_t intermediate_result[15][8]) {
   // Region 3
   // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
@@ -323,8 +323,9 @@ inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,

 template <bool is_compound, typename DestType>
 inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
-                        ptrdiff_t source_stride, int alpha, int beta, int x4,
-                        int ix4, int iy4, int16_t intermediate_result[15][8]) {
+                        ptrdiff_t source_stride, int alpha, int beta,
+                        int64_t x4, int ix4, int iy4,
+                        int16_t intermediate_result[15][8]) {
   // Region 4.
   // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
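The hunk below folds HandleWarpBlock's inline fixed-point position math into GetWarpFilterParams(). For orientation, a minimal sketch of that computation under stated assumptions: the WarpFilterParams layout is inferred from the filter_params.{x4,y4,ix4,iy4} call sites below, kWarpedModelPrecisionBits == 16 matches the library's warp precision, and the helper's real signature may differ.

#include <cstdint>

constexpr int kWarpedModelPrecisionBits = 16;  // assumed value for the sketch

struct WarpFilterParams {  // layout inferred from the call sites below
  int64_t x4;
  int64_t y4;
  int ix4;
  int iy4;
};

inline WarpFilterParams GetWarpFilterParamsSketch(int src_x, int src_y,
                                                  int subsampling_x,
                                                  int subsampling_y,
                                                  const int* warp_params) {
  // These products are why |x4|/|y4| widened to int64_t in the hunks above:
  // src_x * warp_params[2] can exceed 32 bits for large coordinates.
  const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] +
                        static_cast<int64_t>(src_y) * warp_params[3] +
                        warp_params[0];
  const int64_t dst_y = static_cast<int64_t>(src_x) * warp_params[4] +
                        static_cast<int64_t>(src_y) * warp_params[5] +
                        warp_params[1];
  WarpFilterParams params;
  params.x4 = dst_x >> subsampling_x;
  params.y4 = dst_y >> subsampling_y;
  // The integer pixel positions still fit in int.
  params.ix4 = static_cast<int>(params.x4 >> kWarpedModelPrecisionBits);
  params.iy4 = static_cast<int>(params.y4 >> kWarpedModelPrecisionBits);
  return params;
}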
@@ -379,14 +380,8 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
     int16_t intermediate_result_column[15];
   };

-  const int dst_x =
-      src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
-  const int dst_y =
-      src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
-  const int x4 = dst_x >> subsampling_x;
-  const int y4 = dst_y >> subsampling_y;
-  const int ix4 = x4 >> kWarpedModelPrecisionBits;
-  const int iy4 = y4 >> kWarpedModelPrecisionBits;
+  const WarpFilterParams filter_params = GetWarpFilterParams(
+      src_x, src_y, subsampling_x, subsampling_y, warp_params);
   // A prediction block may fall outside the frame's boundaries. If a
   // prediction block is calculated using only samples outside the frame's
   // boundary, the filtering can be simplified. We can divide the plane
@@ -439,33 +434,38 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
   // border index (source_width - 1 or 0, respectively). Then for each x,
   // the inner for loop of the horizontal filter is reduced to multiplying
   // the border pixel by the sum of the filter coefficients.
-  if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
-    if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+  if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
+    if ((filter_params.iy4 - 7 >= source_height - 1 ||
+         filter_params.iy4 + 7 <= 0)) {
       // Outside the frame in both directions. One repeated value.
-      WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
-                                         source_height, ix4, iy4, dst_row,
-                                         dest_stride);
+      WarpRegion1<is_compound, DestType>(
+          src, source_stride, source_width, source_height, filter_params.ix4,
+          filter_params.iy4, dst_row, dest_stride);
       return;
     }
     // Outside the frame horizontally. Rows repeated.
     WarpRegion2<is_compound, DestType>(
-        src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
-        intermediate_result_column, dst_row, dest_stride);
+        src, source_stride, source_width, filter_params.y4, filter_params.ix4,
+        filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
+        dest_stride);
     return;
   }

-  if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+  if ((filter_params.iy4 - 7 >= source_height - 1 ||
+       filter_params.iy4 + 7 <= 0)) {
     // Outside the frame vertically.
-    WarpRegion3<is_compound, DestType>(src, source_stride, source_height,
-                                       alpha, beta, x4, ix4, iy4,
-                                       intermediate_result);
+    WarpRegion3<is_compound, DestType>(
+        src, source_stride, source_height, alpha, beta, filter_params.x4,
+        filter_params.ix4, filter_params.iy4, intermediate_result);
   } else {
     // Inside the frame.
-    WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4,
-                                       ix4, iy4, intermediate_result);
+    WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
+                                       filter_params.x4, filter_params.ix4,
+                                       filter_params.iy4, intermediate_result);
   }

   // Region 3 and 4 vertical filter.
-  VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
-                                        dst_row, dest_stride);
+  VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
+                                        gamma, delta, dst_row, dest_stride);
 }

 template <bool is_compound>
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
index 69cb784..53a374d 100644
--- a/src/dsp/x86/weight_mask_sse4.cc
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -37,10 +37,10 @@ namespace {
 constexpr int kRoundingBits8bpp = 4;

 template <bool mask_is_inverse>
-inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
-                              const int16_t* LIBGAV1_RESTRICT prediction_1,
-                              uint8_t* LIBGAV1_RESTRICT mask,
-                              ptrdiff_t mask_stride) {
+inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                const int16_t* LIBGAV1_RESTRICT prediction_1,
+                                uint8_t* LIBGAV1_RESTRICT mask,
+                                ptrdiff_t mask_stride) {
   const __m128i pred_00 = LoadAligned16(prediction_0);
   const __m128i pred_10 = LoadAligned16(prediction_1);
   const __m128i difference_0 = RightShiftWithRounding_U16(
@@ -78,7 +78,7 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
 }

 #define WEIGHT8_PAIR_WITHOUT_STRIDE \
-  WeightMask16_SSE4<mask_is_inverse>(pred_0, pred_1, mask, mask_stride)
+  WeightMask16_SSE4_1<mask_is_inverse>(pred_0, pred_1, mask, mask_stride)

 #define WEIGHT8_PAIR_AND_STRIDE \
   WEIGHT8_PAIR_WITHOUT_STRIDE;  \
@@ -87,9 +87,10 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride << 1

 template <bool mask_is_inverse>
-void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                        const void* LIBGAV1_RESTRICT prediction_1,
-                        uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                          const void* LIBGAV1_RESTRICT prediction_1,
+                          uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);

@@ -100,10 +101,10 @@ void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                         const void* LIBGAV1_RESTRICT prediction_1,
-                         uint8_t* LIBGAV1_RESTRICT mask,
-                         ptrdiff_t mask_stride) {
+void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 3;
@@ -116,10 +117,10 @@ void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                         const void* LIBGAV1_RESTRICT prediction_1,
-                         uint8_t* LIBGAV1_RESTRICT mask,
-                         ptrdiff_t mask_stride) {
+void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 5;
@@ -132,7 +133,7 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 #define WEIGHT16_WITHOUT_STRIDE \
-  WeightMask16_SSE4<mask_is_inverse>(pred_0, pred_1, mask, mask_stride)
+  WeightMask16_SSE4_1<mask_is_inverse>(pred_0, pred_1, mask, mask_stride)

 #define WEIGHT16_AND_STRIDE \
   WEIGHT16_WITHOUT_STRIDE;  \
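Before the per-size renames continue, a scalar sketch of the rule every WeightMask16 kernel vectorizes. The rounding shift matches kRoundingBits8bpp above (the 10bpp path later in the file uses kRoundingBits10bpp and kScaledDiffShift); the 38 offset and 64 clamp follow the AV1 difference-weighted mask derivation and are an assumption here, not text from this patch.

#include <algorithm>
#include <cstdint>
#include <cstdlib>

// rounding_bits is 4 for 8bpp (kRoundingBits8bpp) and 6 for 10bpp.
template <bool mask_is_inverse>
uint8_t WeightPixel(int16_t pred_0, int16_t pred_1, int rounding_bits) {
  const int rounding = (1 << rounding_bits) >> 1;
  const int difference =
      (std::abs(pred_0 - pred_1) + rounding) >> rounding_bits;
  // Scale down by 16 (kScaledDiffShift == 4) and clamp to at most 64.
  const int weight = std::min(64, 38 + (difference >> 4));
  return static_cast<uint8_t>(mask_is_inverse ? 64 - weight : weight);
}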
@@ -141,10 +142,10 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride

 template <bool mask_is_inverse>
-void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                         const void* LIBGAV1_RESTRICT prediction_1,
-                         uint8_t* LIBGAV1_RESTRICT mask,
-                         ptrdiff_t mask_stride) {
+void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y = 7;
@@ -155,10 +156,10 @@ void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 5;
@@ -171,10 +172,10 @@ void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 6;
@@ -190,10 +191,10 @@ void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 21;
@@ -205,10 +206,11 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT16_WITHOUT_STRIDE;
 }

-#define WEIGHT32_WITHOUT_STRIDE                                          \
-  WeightMask16_SSE4<mask_is_inverse>(pred_0, pred_1, mask, mask_stride); \
-  WeightMask16_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16,           \
-                                     mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE                                  \
+  WeightMask16_SSE4_1<mask_is_inverse>(pred_0, pred_1, mask,     \
+                                       mask_stride);             \
+  WeightMask16_SSE4_1<mask_is_inverse>(pred_0 + 16, pred_1 + 16, \
+                                       mask + 16, mask_stride)

 #define WEIGHT32_AND_STRIDE \
   WEIGHT32_WITHOUT_STRIDE;  \
@@ -217,10 +219,10 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride

 template <bool mask_is_inverse>
-void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                         const void* LIBGAV1_RESTRICT prediction_1,
-                         uint8_t* LIBGAV1_RESTRICT mask,
-                         ptrdiff_t mask_stride) {
+void WeightMask32x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           uint8_t* LIBGAV1_RESTRICT mask,
+                           ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   WEIGHT32_AND_STRIDE;
@@ -234,10 +236,10 @@ void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 5;
@@ -250,10 +252,10 @@ void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 6;
@@ -269,10 +271,10 @@ void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 21;
@@ -284,14 +286,15 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT32_WITHOUT_STRIDE;
 }

-#define WEIGHT64_WITHOUT_STRIDE                                          \
-  WeightMask16_SSE4<mask_is_inverse>(pred_0, pred_1, mask, mask_stride); \
-  WeightMask16_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16,           \
-                                     mask + 16, mask_stride);            \
-  WeightMask16_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32,           \
-                                     mask + 32, mask_stride);            \
-  WeightMask16_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48,           \
-                                     mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE                                  \
+  WeightMask16_SSE4_1<mask_is_inverse>(pred_0, pred_1, mask,     \
+                                       mask_stride);             \
+  WeightMask16_SSE4_1<mask_is_inverse>(pred_0 + 16, pred_1 + 16, \
+                                       mask + 16, mask_stride);  \
+  WeightMask16_SSE4_1<mask_is_inverse>(pred_0 + 32, pred_1 + 32, \
+                                       mask + 32, mask_stride);  \
+  WeightMask16_SSE4_1<mask_is_inverse>(pred_0 + 48, pred_1 + 48, \
+                                       mask + 48, mask_stride)

 #define WEIGHT64_AND_STRIDE \
   WEIGHT64_WITHOUT_STRIDE;  \
@@ -300,10 +303,10 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride

 template <bool mask_is_inverse>
-void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -316,10 +319,10 @@ void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y5 = 0;
@@ -335,10 +338,10 @@ void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                          const void* LIBGAV1_RESTRICT prediction_1,
-                          uint8_t* LIBGAV1_RESTRICT mask,
-                          ptrdiff_t mask_stride) {
+void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                            const void* LIBGAV1_RESTRICT prediction_1,
+                            uint8_t* LIBGAV1_RESTRICT mask,
+                            ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -351,10 +354,10 @@ void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                           const void* LIBGAV1_RESTRICT prediction_1,
-                           uint8_t* LIBGAV1_RESTRICT mask,
-                           ptrdiff_t mask_stride) {
+void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT mask,
+                             ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -368,10 +371,10 @@ void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                           const void* LIBGAV1_RESTRICT prediction_1,
-                           uint8_t* LIBGAV1_RESTRICT mask,
-                           ptrdiff_t mask_stride) {
+void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT mask,
+                             ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -412,10 +415,10 @@ void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                            const void* LIBGAV1_RESTRICT prediction_1,
-                            uint8_t* LIBGAV1_RESTRICT mask,
-                            ptrdiff_t mask_stride) {
+void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                              const void* LIBGAV1_RESTRICT prediction_1,
+                              uint8_t* LIBGAV1_RESTRICT mask,
+                              ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y3 = 0;
@@ -466,8 +469,9 @@ void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,

 #define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
   dsp->weight_mask[w_index][h_index][0] =                      \
-      WeightMask##width##x##height##_SSE4<0>;                  \
-  dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
+      WeightMask##width##x##height##_SSE4_1<0>;                \
+  dsp->weight_mask[w_index][h_index][1] =                      \
+      WeightMask##width##x##height##_SSE4_1<1>
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
@@ -501,7 +505,7 @@ constexpr int kRoundingBits10bpp = 6;
 constexpr int kScaledDiffShift = 4;

 template <bool mask_is_inverse>
-inline void WeightMask16_10bpp_SSE4(
+inline void WeightMask16_10bpp_SSE4_1(
     const uint16_t* LIBGAV1_RESTRICT prediction_0,
     const uint16_t* LIBGAV1_RESTRICT prediction_1,
     uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
@@ -562,9 +566,9 @@ inline void WeightMask16_10bpp_SSE4(
   }
 }

-#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP                        \
-  WeightMask16_10bpp_SSE4<mask_is_inverse>(pred_0, pred_1, mask, \
-                                           mask_stride)
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP                          \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse>(pred_0, pred_1, mask, \
+                                             mask_stride)

 #define WEIGHT8_PAIR_AND_STRIDE_10BPP \
   WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;  \
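The 10bpp path reuses the same WEIGHT8_PAIR trick as the 8bpp code above, which is worth spelling out: the prediction buffers are packed with a pitch equal to the block width, so for 8-wide blocks rows y and y + 1 form 16 contiguous values and a single 16-wide kernel call covers both; only |mask| needs its real stride. A scalar sketch with illustrative names, reusing the WeightPixel sketch shown earlier:

#include <cstddef>
#include <cstdint>

template <bool mask_is_inverse>
uint8_t WeightPixel(int16_t pred_0, int16_t pred_1, int rounding_bits);

template <bool mask_is_inverse>
void WeightMask8xH_Sketch(const int16_t* pred_0, const int16_t* pred_1,
                          uint8_t* mask, ptrdiff_t mask_stride, int height,
                          int rounding_bits) {
  for (int y = 0; y < height; y += 2) {
    for (int i = 0; i < 16; ++i) {
      // Values 0..7 belong to row y, values 8..15 to row y + 1.
      uint8_t* const row = (i < 8) ? mask : mask + mask_stride;
      row[i & 7] =
          WeightPixel<mask_is_inverse>(pred_0[i], pred_1[i], rounding_bits);
    }
    pred_0 += 16;
    pred_1 += 16;
    mask += mask_stride << 1;
  }
}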
@@ -573,10 +577,10 @@ inline void WeightMask16_10bpp_SSE4(
   mask += mask_stride << 1

 template <bool mask_is_inverse>
-void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                              const void* LIBGAV1_RESTRICT prediction_1,
-                              uint8_t* LIBGAV1_RESTRICT mask,
-                              ptrdiff_t mask_stride) {
+void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                const void* LIBGAV1_RESTRICT prediction_1,
+                                uint8_t* LIBGAV1_RESTRICT mask,
+                                ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);

@@ -587,10 +591,10 @@ void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                               const void* LIBGAV1_RESTRICT prediction_1,
-                               uint8_t* LIBGAV1_RESTRICT mask,
-                               ptrdiff_t mask_stride) {
+void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 3;
@@ -603,10 +607,10 @@ void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                               const void* LIBGAV1_RESTRICT prediction_1,
-                               uint8_t* LIBGAV1_RESTRICT mask,
-                               ptrdiff_t mask_stride) {
+void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y5 = 5;
@@ -618,9 +622,9 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
 }

-#define WEIGHT16_WITHOUT_STRIDE_10BPP                            \
-  WeightMask16_10bpp_SSE4<mask_is_inverse>(pred_0, pred_1, mask, \
-                                           mask_stride)
+#define WEIGHT16_WITHOUT_STRIDE_10BPP                              \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse>(pred_0, pred_1, mask, \
+                                             mask_stride)

 #define WEIGHT16_AND_STRIDE_10BPP \
   WEIGHT16_WITHOUT_STRIDE_10BPP;  \
@@ -629,10 +633,10 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride

 template <bool mask_is_inverse>
-void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                               const void* LIBGAV1_RESTRICT prediction_1,
-                               uint8_t* LIBGAV1_RESTRICT mask,
-                               ptrdiff_t mask_stride) {
+void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y = 7;
@@ -643,10 +647,10 @@ void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 5;
@@ -659,10 +663,10 @@ void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y5 = 6;
@@ -678,10 +682,10 @@ void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 21;
@@ -693,11 +697,11 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT16_WITHOUT_STRIDE_10BPP;
 }

-#define WEIGHT32_WITHOUT_STRIDE_10BPP                            \
-  WeightMask16_10bpp_SSE4<mask_is_inverse>(pred_0, pred_1, mask, \
-                                           mask_stride);         \
-  WeightMask16_10bpp_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, \
-                                           mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE_10BPP                                  \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse>(pred_0, pred_1, mask,     \
+                                             mask_stride);             \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse>(pred_0 + 16, pred_1 + 16, \
+                                             mask + 16, mask_stride)

 #define WEIGHT32_AND_STRIDE_10BPP \
   WEIGHT32_WITHOUT_STRIDE_10BPP;  \
@@ -706,10 +710,10 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride

 template <bool mask_is_inverse>
-void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                               const void* LIBGAV1_RESTRICT prediction_1,
-                               uint8_t* LIBGAV1_RESTRICT mask,
-                               ptrdiff_t mask_stride) {
+void WeightMask32x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                 const void* LIBGAV1_RESTRICT prediction_1,
+                                 uint8_t* LIBGAV1_RESTRICT mask,
+                                 ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   WEIGHT32_AND_STRIDE_10BPP;
@@ -723,10 +727,10 @@ void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 5;
@@ -739,10 +743,10 @@ void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y5 = 6;
@@ -758,10 +762,10 @@ void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 21;
@@ -773,15 +777,15 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   WEIGHT32_WITHOUT_STRIDE_10BPP;
 }

-#define WEIGHT64_WITHOUT_STRIDE_10BPP                                \
-  WeightMask16_10bpp_SSE4<mask_is_inverse>(pred_0, pred_1, mask,     \
-                                           mask_stride);             \
-  WeightMask16_10bpp_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, \
-                                           mask + 16, mask_stride);  \
-  WeightMask16_10bpp_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, \
-                                           mask + 32, mask_stride);  \
-  WeightMask16_10bpp_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, \
-                                           mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE_10BPP                                  \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse>(pred_0, pred_1, mask,     \
+                                             mask_stride);             \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse>(pred_0 + 16, pred_1 + 16, \
+                                             mask + 16, mask_stride);  \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse>(pred_0 + 32, pred_1 + 32, \
+                                             mask + 32, mask_stride);  \
+  WeightMask16_10bpp_SSE4_1<mask_is_inverse>(pred_0 + 48, pred_1 + 48, \
+                                             mask + 48, mask_stride)

 #define WEIGHT64_AND_STRIDE_10BPP \
   WEIGHT64_WITHOUT_STRIDE_10BPP;  \
@@ -790,10 +794,10 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
   mask += mask_stride

 template <bool mask_is_inverse>
-void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 5;
@@ -806,10 +810,10 @@ void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y5 = 6;
@@ -825,10 +829,10 @@ void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                const void* LIBGAV1_RESTRICT prediction_1,
-                                uint8_t* LIBGAV1_RESTRICT mask,
-                                ptrdiff_t mask_stride) {
+void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 21;
@@ -841,10 +845,10 @@ void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                 const void* LIBGAV1_RESTRICT prediction_1,
-                                 uint8_t* LIBGAV1_RESTRICT mask,
-                                 ptrdiff_t mask_stride) {
+void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                   const void* LIBGAV1_RESTRICT prediction_1,
+                                   uint8_t* LIBGAV1_RESTRICT mask,
+                                   ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 42;
@@ -858,10 +862,10 @@ void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                 const void* LIBGAV1_RESTRICT prediction_1,
-                                 uint8_t* LIBGAV1_RESTRICT mask,
-                                 ptrdiff_t mask_stride) {
+void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                   const void* LIBGAV1_RESTRICT prediction_1,
+                                   uint8_t* LIBGAV1_RESTRICT mask,
+                                   ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 21;
@@ -902,10 +906,10 @@ void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
 }

 template <bool mask_is_inverse>
-void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
-                                  const void* LIBGAV1_RESTRICT prediction_1,
-                                  uint8_t* LIBGAV1_RESTRICT mask,
-                                  ptrdiff_t mask_stride) {
+void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                    const void* LIBGAV1_RESTRICT prediction_1,
+                                    uint8_t* LIBGAV1_RESTRICT mask,
+                                    ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
   int y3 = 42;
@@ -956,9 +960,9 @@ void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,

 #define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
   dsp->weight_mask[w_index][h_index][0] =                       \
-      WeightMask##width##x##height##_10bpp_SSE4<0>;             \
+      WeightMask##width##x##height##_10bpp_SSE4_1<0>;           \
   dsp->weight_mask[w_index][h_index][1] =                       \
-      WeightMask##width##x##height##_10bpp_SSE4<1>
+      WeightMask##width##x##height##_10bpp_SSE4_1<1>
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
   assert(dsp != nullptr);
-- 
cgit v1.2.3
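A closing note on the obmc_sse4.cc changes at the top of this patch: the OBMC prediction buffer is exactly as wide as the blend region, which is why the stride parameters became constexpr constants (2, 4, or 8) and two rows can now be fetched with a single 16-byte load. The blend itself, as a scalar sketch (a reading of the interleave + _mm_maddubs_epi16 + RightShiftWithRounding_U16(..., 6) sequence, not the library's reference code):

#include <cstdint>

// |pred| is weighted by |mask| and |obmc_pred| by |64 - mask|; the sum is
// rounded and shifted by 6 bits. Both the FromLeft and FromTop paths reduce
// to this per-pixel form.
inline uint8_t ObmcBlendPixel(uint8_t pred, uint8_t obmc_pred, uint8_t mask) {
  return static_cast<uint8_t>(
      (pred * mask + obmc_pred * (64 - mask) + 32) >> 6);
}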