aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/x86/convolve_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/convolve_sse4.cc')
-rw-r--r--src/dsp/x86/convolve_sse4.cc187
1 files changed, 78 insertions, 109 deletions
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index f7e5a71..f427c4c 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -36,7 +36,7 @@ namespace {
#include "src/dsp/x86/convolve_sse4.inc"
-template <int filter_index>
+template <int num_taps>
__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
__m128i v_src[4];
@@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
}
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
return sum;
}
-template <int filter_index>
+template <int num_taps>
__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
return _mm_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
- const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -108,16 +107,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
int x = 0;
do {
if (is_2d || is_compound) {
- const __m128i v_sum =
- HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
if (is_2d) {
StoreAligned16(&dest16[x], v_sum);
} else {
StoreUnaligned16(&dest16[x], v_sum);
}
} else {
- const __m128i result =
- SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+ const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
StoreLo8(&dest8[x], result);
}
x += 8;
@@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
int y = height;
do {
if (is_2d || is_compound) {
- const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
StoreLo8(dest16, v_sum);
} else {
- const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+ const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
Store4(&dest8[0], result);
}
src += src_stride;
@@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
do {
if (is_2d) {
const __m128i sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
Store4(&dest16[0], sum);
dest16 += pred_stride;
Store4(&dest16[0], _mm_srli_si128(sum, 8));
dest16 += pred_stride;
} else {
const __m128i sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
Store2(dest8, sum);
dest8 += pred_stride;
Store2(dest8, _mm_srli_si128(sum, 4));
@@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
__m128i sum;
const __m128i input = LoadLo8(&src[2]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 ....
const __m128i v_src_43 =
_mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(16) uint16_t
@@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
}
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16_x, results);
@@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1(
const __m128i v_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else {
- // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
- // See convolve_neon.cc
- SetupTaps<4>(&v_filter, taps);
-
- if (width == 2) {
- FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
- } else if (width == 4) {
- FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
- } else {
- FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
- taps);
- }
}
}
-void ConvolveCompoundCopy_SSE4(
+void ConvolveCompoundCopy_SSE4_1(
const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
@@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4(
_mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
const __m128i v_dest_hi =
_mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
- // TODO(slavarnway): Investigate using aligned stores.
StoreUnaligned16(&dest[x], v_dest_lo);
StoreUnaligned16(&dest[x + 8], v_dest_hi);
x += 16;
@@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1(
const __m128i v_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap
SetupTaps<4>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else {
- SetupTaps<4>(&v_filter, taps);
-
- if (width == 4) {
- FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
- } else {
- FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
- width, height, taps);
- }
}
}
@@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1(
// Similarly for height.
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
const auto* const src = static_cast<const uint8_t*>(reference) -
@@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
source);
StoreLo8(intermediate, RightShiftWithRounding_S16(
- SumOnePassTaps<filter_index>(source, taps),
+ SumOnePassTaps<num_taps>(source, taps),
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
@@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
// Shift by one less because the taps are halved.
- StoreAligned16(
- intermediate_x,
- RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
- kInterRoundBitsHorizontal - 1));
+ StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
+ SumOnePassTaps<num_taps>(source, taps),
+ kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
} while (--y != 0);
@@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
alignas(16) int16_t
intermediate_result[kIntermediateAllocWidth *
(2 * kIntermediateAllocWidth + kSubPixelTaps)];
- const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
kScaleSubPixelBits) +
@@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
// inputs in each iteration on large blocks. When step_x is large, we need a
// second register and alignr in order to gather all filter inputs.
// |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
- const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
const int kernel_start_ceiling = 16 - num_horiz_taps;
// This truncated quotient |grade_x_threshold| selects |step_x| such that:
// (step_x * 7) >> kScaleSubPixelBits < single load limit
@@ -1891,7 +1860,7 @@ void Init8bpp() {
dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
- dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;