diff options
Diffstat (limited to 'src/dsp/x86/convolve_sse4.cc')
-rw-r--r-- | src/dsp/x86/convolve_sse4.cc | 187 |
1 files changed, 78 insertions, 109 deletions
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc index f7e5a71..f427c4c 100644 --- a/src/dsp/x86/convolve_sse4.cc +++ b/src/dsp/x86/convolve_sse4.cc @@ -36,7 +36,7 @@ namespace { #include "src/dsp/x86/convolve_sse4.inc" -template <int filter_index> +template <int num_taps> __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { __m128i v_src[4]; @@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long); const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long); - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65 - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76 - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 - } else if (filter_index > 3) { + } else { // 4 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 } - const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap); return sum; } -template <int filter_index> +template <int num_taps> __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap); + __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap); // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - @@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, return _mm_packus_epi16(sum, sum); } -template <int filter_index> +template <int num_taps> __m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { - const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap); + const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int num_taps, int filter_index, bool is_2d = false, - bool is_compound = false> +template <int num_taps, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -108,16 +107,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, int x = 0; do { if (is_2d || is_compound) { - const __m128i v_sum = - HorizontalTaps8To16<filter_index>(&src[x], v_tap); + const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap); if (is_2d) { StoreAligned16(&dest16[x], v_sum); } else { StoreUnaligned16(&dest16[x], v_sum); } } else { - const __m128i result = - SimpleHorizontalTaps<filter_index>(&src[x], v_tap); + const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap); StoreLo8(&dest8[x], result); } x += 8; @@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, int y = height; do { if (is_2d || is_compound) { - const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap); + const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap); StoreLo8(dest16, v_sum); } else { - const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap); + const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap); Store4(&dest8[0], result); } src += src_stride; @@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, do { if (is_2d) { const __m128i sum = - HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); + HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap); Store4(&dest16[0], sum); dest16 += pred_stride; Store4(&dest16[0], _mm_srli_si128(sum, 8)); dest16 += pred_stride; } else { const __m128i sum = - SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); Store2(dest8, sum); dest8 += pred_stride; Store2(dest8, _mm_srli_si128(sum, 4)); @@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, assert(height % 2 == 1); __m128i sum; const __m128i input = LoadLo8(&src[2]); - if (filter_index == 3) { + if (num_taps == 2) { // 03 04 04 05 05 06 06 07 .... const __m128i v_src_43 = _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3); @@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); + } else if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); // The output of the horizontal filter is guaranteed to fit in 16 bits. alignas(16) uint16_t @@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, } } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row] = LoadLo8(src_x); src_x += src_stride; - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16_x, results); @@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1( const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1( const __m128i v_filter = LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<8>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps); } - } else { - // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases. - // See convolve_neon.cc - SetupTaps<4>(&v_filter, taps); - - if (width == 2) { - FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); - } else if (width == 4) { - FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); - } else { - FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, - taps); - } } } -void ConvolveCompoundCopy_SSE4( +void ConvolveCompoundCopy_SSE4_1( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, @@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4( _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical); const __m128i v_dest_hi = _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical); - // TODO(slavarnway): Investigate using aligned stores. StoreUnaligned16(&dest[x], v_dest_lo); StoreUnaligned16(&dest[x + 8], v_dest_hi); x += 16; @@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1( const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1( const __m128i v_filter = LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap SetupTaps<4>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else { - SetupTaps<4>(&v_filter, taps); - - if (width == 4) { - FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); - } else { - FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, - width, height, taps); - } } } @@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1( // Similarly for height. const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); const int intermediate_height = height + vertical_taps - 1; const ptrdiff_t src_stride = reference_stride; const auto* const src = static_cast<const uint8_t*>(reference) - @@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src, source); StoreLo8(intermediate, RightShiftWithRounding_S16( - SumOnePassTaps<filter_index>(source, taps), + SumOnePassTaps<num_taps>(source, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; @@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src, PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source); // Shift by one less because the taps are halved. - StoreAligned16( - intermediate_x, - RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps), - kInterRoundBitsHorizontal - 1)); + StoreAligned16(intermediate_x, RightShiftWithRounding_S16( + SumOnePassTaps<num_taps>(source, taps), + kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; } while (--y != 0); @@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, alignas(16) int16_t intermediate_result[kIntermediateAllocWidth * (2 * kIntermediateAllocWidth + kSubPixelTaps)]; - const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); + const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index); const int intermediate_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> kScaleSubPixelBits) + @@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, // inputs in each iteration on large blocks. When step_x is large, we need a // second register and alignr in order to gather all filter inputs. // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap. - const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index); const int kernel_start_ceiling = 16 - num_horiz_taps; // This truncated quotient |grade_x_threshold| selects |step_x| such that: // (step_x * 7) >> kScaleSubPixelBits < single load limit @@ -1891,7 +1860,7 @@ void Init8bpp() { dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1; dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1; - dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4; + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1; dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1; dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1; dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1; |