diff options
Diffstat (limited to 'src/dsp/x86/convolve_sse4.cc')
-rw-r--r-- | src/dsp/x86/convolve_sse4.cc | 1039 |
1 files changed, 62 insertions, 977 deletions
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc index 3a0fff5..9b72fe4 100644 --- a/src/dsp/x86/convolve_sse4.cc +++ b/src/dsp/x86/convolve_sse4.cc @@ -34,41 +34,7 @@ namespace dsp { namespace low_bitdepth { namespace { -#include "src/dsp/convolve.inc" - -// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and -// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final -// sum from outranging int16_t. -template <int filter_index> -__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { - __m128i sum; - if (filter_index < 2) { - // 6 taps. - const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1 - const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3 - const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5 - sum = _mm_add_epi16(v_madd_21, v_madd_43); - sum = _mm_add_epi16(sum, v_madd_65); - } else if (filter_index == 2) { - // 8 taps. - const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0 - const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4 - const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6 - const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32); - const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76); - sum = _mm_add_epi16(v_sum_7654, v_sum_3210); - } else if (filter_index == 3) { - // 2 taps. - sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3 - } else { - // 4 taps. - const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4 - sum = _mm_add_epi16(v_madd_32, v_madd_54); - } - return sum; -} +#include "src/dsp/x86/convolve_sse4.inc" template <int filter_index> __m128i SumHorizontalTaps(const uint8_t* const src, @@ -125,68 +91,7 @@ __m128i HorizontalTaps8To16(const uint8_t* const src, return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int filter_index> -__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - const __m128i input0 = LoadLo8(&src[2]); - const __m128i input1 = LoadLo8(&src[2 + src_stride]); - - if (filter_index == 3) { - // 03 04 04 05 05 06 06 07 .... - const __m128i input0_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3); - // 13 14 14 15 15 16 16 17 .... - const __m128i input1_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3); - const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup); - const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 - return v_sum_43; - } - - // 02 03 03 04 04 05 05 06 06 07 .... - const __m128i input0_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1); - // 12 13 13 14 14 15 15 16 16 17 .... - const __m128i input1_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1); - // 04 05 05 06 06 07 07 08 ... - const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4); - // 14 15 15 16 16 17 17 18 ... - const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4); - const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup); - const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54); - const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 - const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); - return v_sum_5432; -} - -template <int filter_index> -__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. - constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); - sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); - return _mm_packus_epi16(sum, sum); -} - -template <int filter_index> -__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - const __m128i sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); -} - -template <int num_taps, int step, int filter_index, bool is_2d = false, +template <int num_taps, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, void* const dest, const ptrdiff_t pred_stride, @@ -197,7 +102,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, // 4 tap filters are never used when width > 4. if (num_taps != 4 && width > 4) { - int y = 0; + int y = height; do { int x = 0; do { @@ -214,12 +119,12 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, SimpleHorizontalTaps<filter_index>(&src[x], v_tap); StoreLo8(&dest8[x], result); } - x += step; + x += 8; } while (x < width); src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); return; } @@ -229,7 +134,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, assert(num_taps <= 4); if (num_taps <= 4) { if (width == 4) { - int y = 0; + int y = height; do { if (is_2d || is_compound) { const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap); @@ -241,12 +146,13 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); return; } if (!is_compound) { - int y = 0; + int y = height; + if (is_2d) y -= 1; do { if (is_2d) { const __m128i sum = @@ -265,8 +171,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } src += src_stride << 1; - y += 2; - } while (y < height - 1); + y -= 2; + } while (y != 0); // The 2d filters have an odd |height| because the horizontal pass // generates context for the vertical pass. @@ -298,303 +204,6 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } } -template <int num_taps, bool is_2d_vertical = false> -LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, - __m128i* v_tap) { - if (num_taps == 8) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); - v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); - } - } else if (num_taps == 6) { - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 - v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - } - } else if (num_taps == 4) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - } - } else { // num_taps == 2 - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - } - } -} - -template <int num_taps, bool is_compound> -__m128i SimpleSum2DVerticalTaps(const __m128i* const src, - const __m128i* const taps) { - __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); - __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); - if (num_taps >= 4) { - __m128i madd_lo = - _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); - __m128i madd_hi = - _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - if (num_taps >= 6) { - madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); - madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - if (num_taps == 8) { - madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); - madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - } - } - } - - if (is_compound) { - return _mm_packs_epi32( - RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), - RightShiftWithRounding_S32(sum_hi, - kInterRoundBitsCompoundVertical - 1)); - } - - return _mm_packs_epi32( - RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), - RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); -} - -template <int num_taps, bool is_compound = false> -void Filter2DVertical(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int width, - const int height, const __m128i* const taps) { - assert(width >= 8); - constexpr int next_row = num_taps - 1; - // The Horizontal pass uses |width| as |stride| for the intermediate buffer. - const ptrdiff_t src_stride = width; - - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - int x = 0; - do { - __m128i srcs[8]; - const uint16_t* src_x = src + x; - srcs[0] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps >= 4) { - srcs[1] = LoadAligned16(src_x); - src_x += src_stride; - srcs[2] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps >= 6) { - srcs[3] = LoadAligned16(src_x); - src_x += src_stride; - srcs[4] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps == 8) { - srcs[5] = LoadAligned16(src_x); - src_x += src_stride; - srcs[6] = LoadAligned16(src_x); - src_x += src_stride; - } - } - } - - int y = 0; - do { - srcs[next_row] = LoadAligned16(src_x); - src_x += src_stride; - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); - if (is_compound) { - StoreUnaligned16(dst16 + x + y * dst_stride, sum); - } else { - StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum)); - } - - srcs[0] = srcs[1]; - if (num_taps >= 4) { - srcs[1] = srcs[2]; - srcs[2] = srcs[3]; - if (num_taps >= 6) { - srcs[3] = srcs[4]; - srcs[4] = srcs[5]; - if (num_taps == 8) { - srcs[5] = srcs[6]; - srcs[6] = srcs[7]; - } - } - } - } while (++y < height); - x += 8; - } while (x < width); -} - -// Take advantage of |src_stride| == |width| to process two rows at a time. -template <int num_taps, bool is_compound = false> -void Filter2DVertical4xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const __m128i* const taps) { - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - __m128i srcs[9]; - srcs[0] = LoadAligned16(src); - src += 8; - if (num_taps >= 4) { - srcs[2] = LoadAligned16(src); - src += 8; - srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); - if (num_taps >= 6) { - srcs[4] = LoadAligned16(src); - src += 8; - srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); - if (num_taps == 8) { - srcs[6] = LoadAligned16(src); - src += 8; - srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); - } - } - } - - int y = 0; - do { - srcs[num_taps] = LoadAligned16(src); - src += 8; - srcs[num_taps - 1] = _mm_unpacklo_epi64( - _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); - if (is_compound) { - StoreUnaligned16(dst16, sum); - dst16 += 4 << 1; - } else { - const __m128i results = _mm_packus_epi16(sum, sum); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - if (num_taps >= 4) { - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - if (num_taps >= 6) { - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - if (num_taps == 8) { - srcs[5] = srcs[7]; - srcs[6] = srcs[8]; - } - } - } - y += 2; - } while (y < height); -} - -// Take advantage of |src_stride| == |width| to process four rows at a time. -template <int num_taps> -void Filter2DVertical2xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const __m128i* const taps) { - constexpr int next_row = (num_taps < 6) ? 4 : 8; - - auto* dst8 = static_cast<uint8_t*>(dst); - - __m128i srcs[9]; - srcs[0] = LoadAligned16(src); - src += 8; - if (num_taps >= 6) { - srcs[4] = LoadAligned16(src); - src += 8; - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - if (num_taps == 8) { - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - } - } - - int y = 0; - do { - srcs[next_row] = LoadAligned16(src); - src += 8; - if (num_taps == 2) { - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - } else if (num_taps == 4) { - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - } else if (num_taps == 6) { - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); - } else if (num_taps == 8) { - srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); - srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); - srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); - } - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); - const __m128i results = _mm_packus_epi16(sum, sum); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. - // Therefore we don't need to check this condition when |height| > 4. - if (num_taps <= 4 && height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - if (num_taps == 6) { - srcs[1] = srcs[5]; - srcs[4] = srcs[8]; - } else if (num_taps == 8) { - srcs[1] = srcs[5]; - srcs[2] = srcs[6]; - srcs[3] = srcs[7]; - srcs[4] = srcs[8]; - } - - y += 4; - } while (y < height); -} - template <bool is_2d = false, bool is_compound = false> LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, @@ -607,28 +216,28 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 8, 2, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 1, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 0, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 4) { // 4 tap. SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 4, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 5) { // 4 tap. SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 5, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 8, 3, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -718,39 +327,6 @@ void Convolve2D_SSE4_1(const void* const reference, } } -// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D -// Vertical calculations. -__m128i Compound1DShift(const __m128i sum) { - return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); -} - -template <int filter_index> -__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { - __m128i v_src[4]; - - if (filter_index < 2) { - // 6 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { - // 8 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); - v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { - // 2 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { - // 4 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - } - const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); - return sum; -} - template <int filter_index, bool is_compound = false> void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, @@ -787,7 +363,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } - int y = 0; + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; do { srcs[next_row] = LoadLo8(src_x); src_x += src_stride; @@ -795,11 +373,13 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16 + x + y * dst_stride, results); + StoreUnaligned16(dst16_x, results); + dst16_x += dst_stride; } else { const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); - StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results)); + StoreLo8(dst8_x, _mm_packus_epi16(results, results)); + dst8_x += dst_stride; } srcs[0] = srcs[1]; @@ -815,506 +395,11 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } } - } while (++y < height); + } while (--y != 0); x += 8; } while (x < width); } -template <int filter_index, bool is_compound = false> -void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - __m128i srcs[9]; - - if (num_taps == 2) { - srcs[2] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - - int y = 0; - do { - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - y += 2; - } while (y < height); - } else if (num_taps == 4) { - srcs[4] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - - int y = 0; - do { - // 30 31 32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - y += 2; - } while (y < height); - } else if (num_taps == 6) { - srcs[6] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - // 30 31 32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - - int y = 0; - do { - // 50 51 52 53 - const __m128i c = Load4(src); - // 40 41 42 43 50 51 52 53 - srcs[4] = _mm_unpacklo_epi32(srcs[4], c); - src += src_stride; - // 60 61 62 63 - srcs[6] = Load4(src); - src += src_stride; - // 50 51 52 53 60 61 62 63 - srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - y += 2; - } while (y < height); - } else if (num_taps == 8) { - srcs[8] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - // 30 31 32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - // 50 51 52 53 - const __m128i c = Load4(src); - // 40 41 42 43 50 51 52 53 - srcs[4] = _mm_unpacklo_epi32(srcs[4], c); - src += src_stride; - // 60 61 62 63 - srcs[6] = Load4(src); - src += src_stride; - // 50 51 52 53 60 61 62 63 - srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - - int y = 0; - do { - // 70 71 72 73 - const __m128i d = Load4(src); - // 60 61 62 63 70 71 72 73 - srcs[6] = _mm_unpacklo_epi32(srcs[6], d); - src += src_stride; - // 80 81 82 83 - srcs[8] = Load4(src); - src += src_stride; - // 70 71 72 73 80 81 82 83 - srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - srcs[5] = srcs[7]; - srcs[6] = srcs[8]; - y += 2; - } while (y < height); - } -} - -template <int filter_index, bool negative_outside_taps = false> -void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); - auto* dst8 = static_cast<uint8_t*>(dst); - - __m128i srcs[9]; - - if (num_taps == 2) { - srcs[2] = _mm_setzero_si128(); - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - - int y = 0; - do { - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[2] = Load2<0>(src, srcs[2]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 - const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_2, 2); - // This uses srcs[0]..srcs[1]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - if (height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[2]; - y += 4; - } while (y < height); - } else if (num_taps == 4) { - srcs[4] = _mm_setzero_si128(); - - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - - int y = 0; - do { - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2<0>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4, 2); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - - // This uses srcs[0]..srcs[3]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - if (height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - y += 4; - } while (y < height); - } else if (num_taps == 6) { - // During the vertical pass the number of taps is restricted when - // |height| <= 4. - assert(height > 4); - srcs[8] = _mm_setzero_si128(); - - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2(src); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4x, 2); - - int y = 0; - do { - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 70 71 - srcs[4] = Load2<3>(src, srcs[4]); - src += src_stride; - // 80 81 - srcs[8] = Load2<0>(src, srcs[8]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); - // 50 51 60 61 70 71 80 81 - srcs[5] = _mm_srli_si128(srcs_4_8, 2); - - // This uses srcs[0]..srcs[5]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - srcs[1] = srcs[5]; - srcs[4] = srcs[8]; - y += 4; - } while (y < height); - } else if (num_taps == 8) { - // During the vertical pass the number of taps is restricted when - // |height| <= 4. - assert(height > 4); - srcs[8] = _mm_setzero_si128(); - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2(src); - src += src_stride; - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4, 2); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - - int y = 0; - do { - // 40 41 50 51 60 61 70 71 - srcs[4] = Load2<3>(src, srcs[4]); - src += src_stride; - // 80 81 - srcs[8] = Load2<0>(src, srcs[8]); - src += src_stride; - // 80 81 90 91 - srcs[8] = Load2<1>(src, srcs[8]); - src += src_stride; - // 80 81 90 91 a0 a1 - srcs[8] = Load2<2>(src, srcs[8]); - src += src_stride; - - // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 - const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); - // 50 51 60 61 70 71 80 81 - srcs[5] = _mm_srli_si128(srcs_4_8, 2); - // 60 61 70 71 80 81 90 91 - srcs[6] = _mm_srli_si128(srcs_4_8, 4); - // 70 71 80 81 90 91 a0 a1 - srcs[7] = _mm_srli_si128(srcs_4_8, 6); - - // This uses srcs[0]..srcs[7]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - srcs[1] = srcs[5]; - srcs[2] = srcs[6]; - srcs[3] = srcs[7]; - srcs[4] = srcs[8]; - y += 4; - } while (y < height); - } -} - void ConvolveVertical_SSE4_1(const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, @@ -1339,9 +424,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, if (filter_index < 2) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1349,9 +434,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1359,9 +444,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 3) { // 2 tap. SetupTaps<2>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1369,9 +454,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 4) { // 4 tap. SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1382,9 +467,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1474,8 +559,8 @@ void ConvolveCompoundVertical_SSE4_1( if (filter_index < 2) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1484,8 +569,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<8>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1494,8 +579,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<2>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1504,8 +589,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<4>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1514,8 +599,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<4>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1752,7 +837,11 @@ inline void GetHalfSubPixelFilter(__m128i* output) { template <int num_taps, int grade_x> inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices, __m128i* const source /*[num_taps >> 1]*/) { - const __m128i src_vals = LoadUnaligned16(src); + // |used_bytes| is only computed in msan builds. Mask away unused bytes for + // msan because it incorrectly models the outcome of the shuffles in some + // cases. This has not been reproduced out of context. + const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2; + const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes); source[0] = _mm_shuffle_epi8(src_vals, src_indices); if (grade_x == 1) { if (num_taps > 2) { @@ -1768,7 +857,7 @@ inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices, assert(grade_x > 1); assert(num_taps != 4); // grade_x > 1 also means width >= 8 && num_taps != 4 - const __m128i src_vals_ext = LoadLo8(src + 16); + const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes); if (num_taps > 2) { source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2), src_indices); @@ -1983,14 +1072,10 @@ __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo, // |width_class| is 2, 4, or 8, according to the Store function that should be // used. template <int num_taps, int width_class, bool is_compound> -#if LIBGAV1_MSAN -__attribute__((no_sanitize_memory)) void ConvolveVerticalScale( -#else -inline void ConvolveVerticalScale( -#endif - const int16_t* src, const int width, const int subpixel_y, - const int filter_index, const int step_y, const int height, void* dest, - const ptrdiff_t dest_stride) { +inline void ConvolveVerticalScale(const int16_t* src, const int width, + const int subpixel_y, const int filter_index, + const int step_y, const int height, + void* dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; constexpr int kernel_offset = (8 - num_taps) / 2; const int16_t* src_y = src; @@ -2819,7 +1904,7 @@ void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { |