about summary refs log tree commit diff
path: root/src/dsp/x86/convolve_sse4.inc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/convolve_sse4.inc')
-rw-r--r-- src/dsp/x86/convolve_sse4.inc 98
1 files changed, 69 insertions, 29 deletions
diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc
index 550d6a4..5548c5b 100644
--- a/src/dsp/x86/convolve_sse4.inc
+++ b/src/dsp/x86/convolve_sse4.inc
@@ -18,20 +18,63 @@
#include "src/dsp/convolve.inc"
+// This version uses |filter_id| to pick 6 or 4 taps when filter_index == 1.
+int GetNumTapsInFilter(const int filter_index, const int filter_id) {
+ if (filter_index == 0) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 1) {
+ // No |filter_id| of this filter needs more than 6 taps, and only the ids
+ // tested below need all 6.
+ // NOTE: presumably kInterpolationFilterEightTapSmooth -- confirm vs. enum.
+ if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
+ (filter_id == 8) | (filter_id == 9)) != 0) {
+ return 6;
+ }
+ // When |filter_index| == 1, the |filter_id| values not listed above map to
+ // 4 tap filters.
+ return 4;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filter was |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
+
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
__m128i sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
sum = _mm_add_epi16(v_madd_21, v_madd_43);
sum = _mm_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
@@ -40,7 +83,7 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
} else {
@@ -52,13 +95,13 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
return sum;
}
-template <int filter_index>
+template <int num_taps>
__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
// 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
const __m128i v_src_43 = _mm_shuffle_epi8(
v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
@@ -79,10 +122,10 @@ __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return v_sum_5432;
}
-template <int filter_index>
+template <int num_taps>
__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -95,11 +138,10 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return _mm_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
- const __m128i sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
@@ -411,36 +453,34 @@ __m128i Compound1DShift(const __m128i sum) {
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index>
+template <int num_taps>
__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
__m128i v_src[4];
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
}
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
return sum;
}
-// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
-// 2D version.
-template <int num_taps, int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const __m128i* const v_tap) {
@@ -468,7 +508,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 10 11 12 13 20 21 22 23
srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -515,7 +555,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 30 31 32 33 40 41 42 43
srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -574,7 +614,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 50 51 52 53 60 61 62 63
srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -645,7 +685,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 70 71 72 73 80 81 82 83
srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -672,7 +712,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
}
}
-template <int num_taps, int filter_index, bool negative_outside_taps = false>
+template <int num_taps, bool negative_outside_taps = false>
void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const __m128i* const v_tap) {
@@ -705,7 +745,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
// 10 11 20 21 30 31 40 41
srcs[1] = _mm_srli_si128(srcs_0_2, 2);
// This uses srcs[0]..srcs[1].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -760,7 +800,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[3] = _mm_srli_si128(srcs_0_4, 6);
// This uses srcs[0]..srcs[3].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -829,7 +869,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[5] = _mm_srli_si128(srcs_4_8, 2);
// This uses srcs[0]..srcs[5].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -909,7 +949,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[7] = _mm_srli_si128(srcs_4_8, 6);
// This uses srcs[0]..srcs[7].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);