diff options
Diffstat (limited to 'src/dsp/arm/convolve_10bit_neon.cc')
-rw-r--r-- | src/dsp/arm/convolve_10bit_neon.cc | 224 |
1 files changed, 106 insertions, 118 deletions
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc index b7205df..389f029 100644 --- a/src/dsp/arm/convolve_10bit_neon.cc +++ b/src/dsp/arm/convolve_10bit_neon.cc @@ -45,12 +45,12 @@ namespace { // Pixel output range: [ 0, 1023] // Compound output range: [ 3988, 61532] -template <int filter_index> +template <int num_taps> int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, const int16x4_t* const taps) { const auto* ssrc = reinterpret_cast<const int16x8_t*>(src); int32x4x2_t sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -65,7 +65,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -84,7 +84,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -106,12 +106,12 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, return sum; } -template <int filter_index> +template <int num_taps> int32x4_t SumOnePassTaps(const uint16x4_t* const src, const int16x4_t* const taps) { const auto* ssrc = reinterpret_cast<const int16x4_t*>(src); int32x4_t sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -119,7 +119,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, sum = vmlal_s16(sum, ssrc[3], taps[3]); sum = vmlal_s16(sum, ssrc[4], taps[4]); sum = vmlal_s16(sum, ssrc[5], taps[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -129,7 +129,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, sum = vmlal_s16(sum, ssrc[5], taps[5]); sum = vmlal_s16(sum, ssrc[6], taps[6]); sum = vmlal_s16(sum, ssrc[7], taps[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -143,7 +143,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, return sum; } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -162,15 +162,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const uint16x8_t src_long_hi = vld1q_u16(s + 8); uint16x8_t v_src[8]; int32x4x2_t v_sum; - if (filter_index < 2) { + if (num_taps == 6) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); v_src[4] = vextq_u16(src_long, src_long_hi, 4); v_src[5] = vextq_u16(src_long, src_long_hi, 5); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); - } else if (filter_index == 2) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1); + } else if (num_taps == 8) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); @@ -179,17 +179,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, v_src[5] = vextq_u16(src_long, src_long_hi, 5); v_src[6] = vextq_u16(src_long, src_long_hi, 6); v_src[7] = vextq_u16(src_long, src_long_hi, 7); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); - } else if (filter_index == 3) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap); + } else if (num_taps == 2) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); - } else { // filter_index > 3 + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); + } else { // 4 taps v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } const int16x4_t d0 = @@ -213,15 +213,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const uint16x8_t src_long_hi = vld1q_u16(src + x + 8); uint16x8_t v_src[8]; int32x4x2_t v_sum; - if (filter_index < 2) { + if (num_taps == 6) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); v_src[4] = vextq_u16(src_long, src_long_hi, 4); v_src[5] = vextq_u16(src_long, src_long_hi, 5); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); - } else if (filter_index == 2) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1); + } else if (num_taps == 8) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); @@ -230,17 +230,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, v_src[5] = vextq_u16(src_long, src_long_hi, 5); v_src[6] = vextq_u16(src_long, src_long_hi, 6); v_src[7] = vextq_u16(src_long, src_long_hi, 7); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); - } else if (filter_index == 3) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap); + } else if (num_taps == 2) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); - } else { // filter_index > 3 + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); + } else { // 4 taps v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } if (is_compound) { const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); @@ -276,7 +276,7 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, } while (--y != 0); } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -291,14 +291,14 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, int32x4_t v_sum; const uint16x8_t src_long = vld1q_u16(src); v_src[0] = vget_low_u16(src_long); - if (filter_index == 3) { + if (num_taps == 2) { v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); } else { v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2)); v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3)); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } if (is_compound || is_2d) { const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); @@ -321,7 +321,7 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, } while (--y != 0); } -template <int filter_index, bool is_2d> +template <int num_taps, bool is_2d> void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -336,7 +336,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride)); const int16x8x2_t input = vzipq_s16(input0, input1); int32x4_t v_sum; - if (filter_index == 3) { + if (num_taps == 2) { v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]); v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)), @@ -387,7 +387,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, assert(height % 2 == 1); const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src)); int32x4_t v_sum; - if (filter_index == 3) { + if (num_taps == 2) { v_sum = vmull_s16(vget_low_s16(input), v_tap[3]); v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]); @@ -406,17 +406,17 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, } } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int width, const int height, const int16x4_t* const v_tap) { - assert(width < 8 || filter_index <= 3); + assert(width < 8 || num_taps != 4); // Don't simplify the redundant if conditions with the template parameters, // which helps the compiler generate compact code. - if (width >= 8 && filter_index <= 3) { - FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>( + if (width >= 8 && num_taps != 4) { + FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>( src, src_stride, dest, pred_stride, width, height, v_tap); return; } @@ -424,17 +424,17 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. assert(width <= 4); - assert(filter_index >= 3 && filter_index <= 5); - if (filter_index >= 3 && filter_index <= 5) { + assert(num_taps == 2 || num_taps == 4); + if (num_taps == 2 || num_taps == 4) { if (width == 4) { - FilterHorizontalWidth4<filter_index, is_compound, is_2d>( + FilterHorizontalWidth4<num_taps, is_compound, is_2d>( src, src_stride, dest, pred_stride, height, v_tap); return; } assert(width == 2); if (!is_compound) { - FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest, - pred_stride, height, v_tap); + FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest, + pred_stride, height, v_tap); } } } @@ -455,22 +455,17 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } if (filter_index == 2) { // 8 tap. - FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride, + FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 1) { // 6 tap. - FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst, + } else if (filter_index < 2) { // 6 tap. + FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 0) { // 6 tap. - FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst, - dst_stride, width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. + } else if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. - FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst, - dst_stride, width, height, v_tap); } else { // 2 tap. - FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst, + FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst, dst_stride, width, height, v_tap); } } @@ -510,13 +505,12 @@ void ConvolveCompoundHorizontal_NEON( filter_index); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* const dst16 = static_cast<uint16_t*>(dst); @@ -555,7 +549,7 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, srcs[next_row] = vld1q_u16(src_x); src_x += src_stride; - const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); + const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); if (is_compound) { const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); const int16x4_t d0 = @@ -593,13 +587,12 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, } while (x < width); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* dst16 = static_cast<uint16_t*>(dst); @@ -633,8 +626,8 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, srcs[num_taps] = vld1_u16(src); src += src_stride; - const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); - const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps); + const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); + const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps); if (is_compound) { const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); const int16x4_t d1 = @@ -673,13 +666,12 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, } while (y != 0); } -template <int filter_index> +template <int num_taps> void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* dst16 = static_cast<uint16_t*>(dst); @@ -718,7 +710,7 @@ void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, src += src_stride; srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2); - const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); + const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); const uint16x4_t d0 = vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); Store2<0>(dst16, d0); @@ -1180,13 +1172,13 @@ void ConvolveVertical_NEON( if (filter_index == 0) { // 6 tap. if (width == 2) { - FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { - FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else { - FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if ((static_cast<int>(filter_index == 1) & @@ -1196,33 +1188,33 @@ void ConvolveVertical_NEON( static_cast<int>(vertical_filter_id == 9) | static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. if (width == 2) { - FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { - FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else { - FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 2) { - FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<8>(src, src_stride, dest, dest_stride, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 2) { - FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps + 3); } else if (width == 4) { - FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps + 3); } else { - FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps + 3); } } else { @@ -1240,13 +1232,13 @@ void ConvolveVertical_NEON( // treating it as though it has 4. if (filter_index == 1) src += src_stride; if (width == 2) { - FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else if (width == 4) { - FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else { - FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps + 2); } } @@ -1274,10 +1266,10 @@ void ConvolveCompoundVertical_NEON( if (filter_index == 0) { // 6 tap. if (width == 4) { - FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { - FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if ((static_cast<int>(filter_index == 1) & @@ -1287,26 +1279,26 @@ void ConvolveCompoundVertical_NEON( static_cast<int>(vertical_filter_id == 9) | static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. if (width == 4) { - FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { - FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 4) { - FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps); } else { - FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 4) { - FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 3); } else { - FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 3); } } else { @@ -1323,10 +1315,10 @@ void ConvolveCompoundVertical_NEON( // treating it as though it has 4. if (filter_index == 1) src += src_stride; if (width == 4) { - FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 2); } else { - FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 2); } } @@ -1980,7 +1972,7 @@ inline void ConvolveKernelHorizontal2Tap( PermuteSrcVals(src_bytes, src_lookup[1])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2034,13 +2026,12 @@ inline void ConvolveKernelHorizontal2Tap( const uint16x4_t src_high[2] = {vget_high_u16(src[0]), vget_high_u16(src[1])}; - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2123,7 +2114,7 @@ inline void ConvolveKernelHorizontalPositive4Tap( PermuteSrcVals(src_bytes, src_lookup[3])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/5>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2202,7 +2193,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( PermuteSrcVals(src_bytes, src_lookup[3])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2297,13 +2288,12 @@ inline void ConvolveKernelHorizontalSigned6Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2401,13 +2391,12 @@ inline void ConvolveKernelHorizontalMixed6Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2505,13 +2494,12 @@ inline void ConvolveKernelHorizontalSigned8Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; |