diff options
Diffstat (limited to 'src/dsp/arm/convolve_neon.cc')
-rw-r--r-- | src/dsp/arm/convolve_neon.cc | 451 |
1 files changed, 217 insertions, 234 deletions
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc index 331bfe2..5b80da2 100644 --- a/src/dsp/arm/convolve_neon.cc +++ b/src/dsp/arm/convolve_neon.cc @@ -103,9 +103,11 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src, template <int filter_index, bool negative_outside_taps, bool is_2d, bool is_compound> -void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride, - void* const dest, const ptrdiff_t pred_stride, - const int width, const int height, +void FilterHorizontalWidth8AndUp(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t pred_stride, const int width, + const int height, const uint8x8_t* const v_tap) { auto* dest8 = static_cast<uint8_t*>(dest); auto* dest16 = static_cast<uint16_t*>(dest); @@ -220,9 +222,11 @@ void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride, } template <int filter_index, bool is_2d, bool is_compound> -void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride, - void* const dest, const ptrdiff_t pred_stride, - const int height, const uint8x8_t* const v_tap) { +void FilterHorizontalWidth4(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t pred_stride, const int height, + const uint8x8_t* const v_tap) { auto* dest8 = static_cast<uint8_t*>(dest); auto* dest16 = static_cast<uint16_t*>(dest); int y = height; @@ -257,9 +261,11 @@ void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride, } template <int filter_index, bool is_2d> -void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride, - void* const dest, const ptrdiff_t pred_stride, - const int height, const uint8x8_t* const v_tap) { +void FilterHorizontalWidth2(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t pred_stride, const int height, + const uint8x8_t* const v_tap) { auto* dest8 = static_cast<uint8_t*>(dest); auto* dest16 = static_cast<uint16_t*>(dest); int y = height >> 1; @@ -345,10 +351,11 @@ void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride, template <int filter_index, bool negative_outside_taps, bool is_2d, bool is_compound> -void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride, - void* const dest, const ptrdiff_t pred_stride, - const int width, const int height, - const uint8x8_t* const v_tap) { +void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t pred_stride, const int width, + const int height, const uint8x8_t* const v_tap) { assert(width < 8 || filter_index <= 3); // Don't simplify the redundant if conditions with the template parameters, // which helps the compiler generate compact code. @@ -484,7 +491,8 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src, } template <int num_taps, bool is_compound = false> -void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst, +void Filter2DVerticalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, + void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const int16x8_t taps) { assert(width >= 8); @@ -560,7 +568,8 @@ void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst, // Take advantage of |src_stride| == |width| to process two rows at a time. template <int num_taps, bool is_compound = false> -void Filter2DVerticalWidth4(const uint16_t* src, void* const dst, +void Filter2DVerticalWidth4(const uint16_t* LIBGAV1_RESTRICT src, + void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x8_t taps) { auto* dst8 = static_cast<uint8_t*>(dst); @@ -626,7 +635,8 @@ void Filter2DVerticalWidth4(const uint16_t* src, void* const dst, // Take advantage of |src_stride| == |width| to process four rows at a time. template <int num_taps> -void Filter2DVerticalWidth2(const uint16_t* src, void* const dst, +void Filter2DVerticalWidth2(const uint16_t* LIBGAV1_RESTRICT src, + void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x8_t taps) { constexpr int next_row = (num_taps < 6) ? 4 : 8; @@ -699,9 +709,10 @@ void Filter2DVerticalWidth2(const uint16_t* src, void* const dst, template <bool is_2d = false, bool is_compound = false> LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( - const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, - const ptrdiff_t dst_stride, const int width, const int height, - const int filter_id, const int filter_index) { + const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, + const int width, const int height, const int filter_id, + const int filter_index) { // Duplicate the absolute value for each tap. Negative taps are corrected // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8. uint8x8_t v_tap[kSubPixelTaps]; @@ -739,9 +750,10 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } template <int vertical_taps> -void Filter2DVertical(const uint16_t* const intermediate_result, - const int width, const int height, const int16x8_t taps, - void* const prediction, const ptrdiff_t pred_stride) { +void Filter2DVertical( + const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width, + const int height, const int16x8_t taps, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { auto* const dest = static_cast<uint8_t*>(prediction); if (width >= 8) { Filter2DVerticalWidth8AndUp<vertical_taps>( @@ -756,13 +768,13 @@ void Filter2DVertical(const uint16_t* const intermediate_result, } } -void Convolve2D_NEON(const void* const reference, +void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int vertical_filter_index, const int horizontal_filter_id, const int vertical_filter_id, const int width, - const int height, void* const prediction, + const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); @@ -772,6 +784,10 @@ void Convolve2D_NEON(const void* const reference, uint16_t intermediate_result[kMaxSuperBlockSizeInPixels * (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; +#if LIBGAV1_MSAN + // Quiet msan warnings. Set with random non-zero value to aid in debugging. + memset(intermediate_result, 0x33, sizeof(intermediate_result)); +#endif const int intermediate_height = height + vertical_taps - 1; const ptrdiff_t src_stride = reference_stride; const auto* const src = static_cast<const uint8_t*>(reference) - @@ -815,6 +831,10 @@ inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) { const uint8x16_t src_val = vld1q_u8(src_x); ret.val[0] = vget_low_u8(src_val); ret.val[1] = vget_high_u8(src_val); +#if LIBGAV1_MSAN + // Initialize to quiet msan warnings when grade_x <= 1. + ret.val[2] = vdup_n_u8(0); +#endif if (grade_x > 1) { ret.val[2] = vld1_u8(src_x + 16); } @@ -833,12 +853,10 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) { } template <int grade_x> -inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src, - const ptrdiff_t src_stride, - const int width, const int subpixel_x, - const int step_x, - const int intermediate_height, - int16_t* intermediate) { +inline void ConvolveKernelHorizontal2Tap( + const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, + const int width, const int subpixel_x, const int step_x, + const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) { // Account for the 0-taps that precede the 2 nonzero taps. const int kernel_offset = 3; const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -891,7 +909,6 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src, do { const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; - int16_t* intermediate_x = intermediate + x; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); @@ -917,11 +934,11 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src, vtbl3_u8(src_vals, src_indices), vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))}; - vst1q_s16(intermediate_x, + vst1q_s16(intermediate, vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; - intermediate_x += kIntermediateStride; + intermediate += kIntermediateStride; } while (--y != 0); x += 8; p += step_x8; @@ -943,8 +960,9 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) { // This filter is only possible when width <= 4. void ConvolveKernelHorizontalPositive4Tap( - const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x, - const int step_x, const int intermediate_height, int16_t* intermediate) { + const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, + const int subpixel_x, const int step_x, const int intermediate_height, + int16_t* LIBGAV1_RESTRICT intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1010,8 +1028,9 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) { // This filter is only possible when width <= 4. inline void ConvolveKernelHorizontalSigned4Tap( - const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x, - const int step_x, const int intermediate_height, int16_t* intermediate) { + const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, + const int subpixel_x, const int step_x, const int intermediate_height, + int16_t* LIBGAV1_RESTRICT intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1085,9 +1104,10 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template <int grade_x> inline void ConvolveKernelHorizontalSigned6Tap( - const uint8_t* const src, const ptrdiff_t src_stride, const int width, - const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* const intermediate) { + const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, + const int width, const int subpixel_x, const int step_x, + const int intermediate_height, + int16_t* LIBGAV1_RESTRICT const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1100,6 +1120,7 @@ inline void ConvolveKernelHorizontalSigned6Tap( const uint16x8_t index_steps = vmulq_n_u16( vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); + int16_t* intermediate_x = intermediate; int x = 0; int p = subpixel_x; do { @@ -1107,7 +1128,6 @@ inline void ConvolveKernelHorizontalSigned6Tap( // |trailing_width| can be up to 24. const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; - int16_t* intermediate_x = intermediate + x; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); @@ -1178,9 +1198,10 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template <int grade_x> inline void ConvolveKernelHorizontalMixed6Tap( - const uint8_t* const src, const ptrdiff_t src_stride, const int width, - const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* const intermediate) { + const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, + const int width, const int subpixel_x, const int step_x, + const int intermediate_height, + int16_t* LIBGAV1_RESTRICT const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1198,12 +1219,12 @@ inline void ConvolveKernelHorizontalMixed6Tap( const uint16x8_t index_steps = vmulq_n_u16( vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); + int16_t* intermediate_x = intermediate; int x = 0; int p = subpixel_x; do { const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; - int16_t* intermediate_x = intermediate + x; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); @@ -1272,9 +1293,10 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template <int grade_x> inline void ConvolveKernelHorizontalSigned8Tap( - const uint8_t* const src, const ptrdiff_t src_stride, const int width, - const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* const intermediate) { + const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, + const int width, const int subpixel_x, const int step_x, + const int intermediate_height, + int16_t* LIBGAV1_RESTRICT const intermediate) { const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -1286,11 +1308,12 @@ inline void ConvolveKernelHorizontalSigned8Tap( } const uint16x8_t index_steps = vmulq_n_u16( vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x)); + + int16_t* intermediate_x = intermediate; int x = 0; int p = subpixel_x; do { const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x]; - int16_t* intermediate_x = intermediate + x; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); @@ -1336,15 +1359,16 @@ inline void ConvolveKernelHorizontalSigned8Tap( // This function handles blocks of width 2 or 4. template <int num_taps, int grade_y, int width, bool is_compound> -void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y, - const int filter_index, const int step_y, - const int height, void* const dest, +void ConvolveVerticalScale4xH(const int16_t* LIBGAV1_RESTRICT const src, + const int subpixel_y, const int filter_index, + const int step_y, const int height, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; const int16_t* src_y = src; // |dest| is 16-bit in compound mode, Pixel otherwise. - uint16_t* dest16_y = static_cast<uint16_t*>(dest); - uint8_t* dest_y = static_cast<uint8_t*>(dest); + auto* dest16_y = static_cast<uint16_t*>(dest); + auto* dest_y = static_cast<uint8_t*>(dest); int16x4_t s[num_taps + grade_y]; int p = subpixel_y & 1023; @@ -1408,10 +1432,12 @@ void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y, } template <int num_taps, int grade_y, bool is_compound> -inline void ConvolveVerticalScale(const int16_t* const src, const int width, - const int subpixel_y, const int filter_index, - const int step_y, const int height, - void* const dest, +inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source, + const int intermediate_height, + const int width, const int subpixel_y, + const int filter_index, const int step_y, + const int height, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; // A possible improvement is to use arithmetic to decide how many times to @@ -1421,11 +1447,11 @@ inline void ConvolveVerticalScale(const int16_t* const src, const int width, // |dest| is 16-bit in compound mode, Pixel otherwise. uint16_t* dest16_y; uint8_t* dest_y; + const int16_t* src = source; int x = 0; do { - const int16_t* const src_x = src + x; - const int16_t* src_y = src_x; + const int16_t* src_y = src; dest16_y = static_cast<uint16_t*>(dest) + x; dest_y = static_cast<uint8_t*>(dest) + x; int p = subpixel_y & 1023; @@ -1466,38 +1492,43 @@ inline void ConvolveVerticalScale(const int16_t* const src, const int width, vst1_u8(dest_y, vqmovun_s16(sum)); } p += step_y; - src_y = src_x + (p >> kScaleSubPixelBits) * src_stride; + src_y = src + (p >> kScaleSubPixelBits) * src_stride; prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; y -= 2; } while (y != 0); + src += kIntermediateStride * intermediate_height; x += 8; } while (x < width); } template <bool is_compound> -void ConvolveScale2D_NEON(const void* const reference, +void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int vertical_filter_index, const int subpixel_x, const int subpixel_y, const int step_x, const int step_y, const int width, const int height, - void* const prediction, const ptrdiff_t pred_stride) { + void* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); assert(step_x <= 2048); + assert(step_y <= 2048); const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); const int intermediate_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> kScaleSubPixelBits) + num_vert_taps; - assert(step_x <= 2048); // The output of the horizontal filter, i.e. the intermediate_result, is // guaranteed to fit in int16_t. - int16_t intermediate_result[kMaxSuperBlockSizeInPixels * - (2 * kMaxSuperBlockSizeInPixels + 8)]; - + int16_t intermediate_result[kIntermediateAllocWidth * + (2 * kIntermediateAllocWidth + 8)]; +#if LIBGAV1_MSAN + // Quiet msan warnings. Set with random non-zero value to aid in debugging. + memset(intermediate_result, 0x44, sizeof(intermediate_result)); +#endif // Horizontal filter. // Filter types used for width <= 4 are different from those for width > 4. // When width > 4, the valid filter index range is always [0, 3]. @@ -1597,8 +1628,8 @@ void ConvolveScale2D_NEON(const void* const reference, prediction, pred_stride); } else { ConvolveVerticalScale<6, 1, is_compound>( - intermediate, width, subpixel_y, filter_index, step_y, height, - prediction, pred_stride); + intermediate, intermediate_height, width, subpixel_y, + filter_index, step_y, height, prediction, pred_stride); } } else { if (!is_compound && width == 2) { @@ -1611,8 +1642,8 @@ void ConvolveScale2D_NEON(const void* const reference, prediction, pred_stride); } else { ConvolveVerticalScale<6, 2, is_compound>( - intermediate, width, subpixel_y, filter_index, step_y, height, - prediction, pred_stride); + intermediate, intermediate_height, width, subpixel_y, + filter_index, step_y, height, prediction, pred_stride); } } break; @@ -1628,8 +1659,8 @@ void ConvolveScale2D_NEON(const void* const reference, prediction, pred_stride); } else { ConvolveVerticalScale<8, 1, is_compound>( - intermediate, width, subpixel_y, filter_index, step_y, height, - prediction, pred_stride); + intermediate, intermediate_height, width, subpixel_y, + filter_index, step_y, height, prediction, pred_stride); } } else { if (!is_compound && width == 2) { @@ -1642,8 +1673,8 @@ void ConvolveScale2D_NEON(const void* const reference, prediction, pred_stride); } else { ConvolveVerticalScale<8, 2, is_compound>( - intermediate, width, subpixel_y, filter_index, step_y, height, - prediction, pred_stride); + intermediate, intermediate_height, width, subpixel_y, + filter_index, step_y, height, prediction, pred_stride); } } break; @@ -1659,8 +1690,8 @@ void ConvolveScale2D_NEON(const void* const reference, prediction, pred_stride); } else { ConvolveVerticalScale<2, 1, is_compound>( - intermediate, width, subpixel_y, filter_index, step_y, height, - prediction, pred_stride); + intermediate, intermediate_height, width, subpixel_y, + filter_index, step_y, height, prediction, pred_stride); } } else { if (!is_compound && width == 2) { @@ -1673,8 +1704,8 @@ void ConvolveScale2D_NEON(const void* const reference, prediction, pred_stride); } else { ConvolveVerticalScale<2, 2, is_compound>( - intermediate, width, subpixel_y, filter_index, step_y, height, - prediction, pred_stride); + intermediate, intermediate_height, width, subpixel_y, + filter_index, step_y, height, prediction, pred_stride); } } break; @@ -1693,8 +1724,8 @@ void ConvolveScale2D_NEON(const void* const reference, prediction, pred_stride); } else { ConvolveVerticalScale<4, 1, is_compound>( - intermediate, width, subpixel_y, filter_index, step_y, height, - prediction, pred_stride); + intermediate, intermediate_height, width, subpixel_y, + filter_index, step_y, height, prediction, pred_stride); } } else { if (!is_compound && width == 2) { @@ -1707,21 +1738,19 @@ void ConvolveScale2D_NEON(const void* const reference, prediction, pred_stride); } else { ConvolveVerticalScale<4, 2, is_compound>( - intermediate, width, subpixel_y, filter_index, step_y, height, - prediction, pred_stride); + intermediate, intermediate_height, width, subpixel_y, + filter_index, step_y, height, prediction, pred_stride); } } } } -void ConvolveHorizontal_NEON(const void* const reference, - const ptrdiff_t reference_stride, - const int horizontal_filter_index, - const int /*vertical_filter_index*/, - const int horizontal_filter_id, - const int /*vertical_filter_id*/, const int width, - const int height, void* const prediction, - const ptrdiff_t pred_stride) { +void ConvolveHorizontal_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int horizontal_filter_index, + const int /*vertical_filter_index*/, const int horizontal_filter_id, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(horizontal_filter_index, width); // Set |src| to the outermost tap. const auto* const src = @@ -1741,10 +1770,11 @@ uint16x8_t Compound1DShift(const int16x8_t sum) { template <int filter_index, bool is_compound = false, bool negative_outside_taps = false> -void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int width, const int height, - const uint8x8_t* const taps) { +void FilterVertical(const uint8_t* LIBGAV1_RESTRICT const src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const uint8x8_t* const taps) { const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; auto* const dst8 = static_cast<uint8_t*>(dst); @@ -1814,9 +1844,11 @@ void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride, template <int filter_index, bool is_compound = false, bool negative_outside_taps = false> -void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int height, const uint8x8_t* const taps) { +void FilterVertical4xH(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dst, + const ptrdiff_t dst_stride, const int height, + const uint8x8_t* const taps) { const int num_taps = GetNumTapsInFilter(filter_index); auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -2001,9 +2033,11 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, } template <int filter_index, bool negative_outside_taps = false> -void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int height, const uint8x8_t* const taps) { +void FilterVertical2xH(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dst, + const ptrdiff_t dst_stride, const int height, + const uint8x8_t* const taps) { const int num_taps = GetNumTapsInFilter(filter_index); auto* dst8 = static_cast<uint8_t*>(dst); @@ -2205,14 +2239,12 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, // filtering is required. // The output is the single prediction of the block, clipped to valid pixel // range. -void ConvolveVertical_NEON(const void* const reference, - const ptrdiff_t reference_stride, - const int /*horizontal_filter_index*/, - const int vertical_filter_index, - const int /*horizontal_filter_id*/, - const int vertical_filter_id, const int width, - const int height, void* const prediction, - const ptrdiff_t pred_stride) { +void ConvolveVertical_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int vertical_filter_index, const int /*horizontal_filter_id*/, + const int vertical_filter_id, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; @@ -2239,8 +2271,9 @@ void ConvolveVertical_NEON(const void* const reference, FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, taps + 1); } - } else if ((filter_index == 1) & ((vertical_filter_id == 1) | - (vertical_filter_id == 15))) { // 5 tap. + } else if ((static_cast<int>(filter_index == 1) & + (static_cast<int>(vertical_filter_id == 1) | + static_cast<int>(vertical_filter_id == 15))) != 0) { // 5 tap. if (width == 2) { FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, taps + 1); @@ -2251,9 +2284,11 @@ void ConvolveVertical_NEON(const void* const reference, FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, taps + 1); } - } else if ((filter_index == 1) & - ((vertical_filter_id == 7) | (vertical_filter_id == 8) | - (vertical_filter_id == 9))) { // 6 tap with weird negative taps. + } else if ((static_cast<int>(filter_index == 1) & + (static_cast<int>(vertical_filter_id == 7) | + static_cast<int>(vertical_filter_id == 8) | + static_cast<int>(vertical_filter_id == 9))) != + 0) { // 6 tap with weird negative taps. if (width == 2) { FilterVertical2xH<1, /*negative_outside_taps=*/true>( @@ -2325,11 +2360,11 @@ void ConvolveVertical_NEON(const void* const reference, } void ConvolveCompoundCopy_NEON( - const void* const reference, const ptrdiff_t reference_stride, - const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, - const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, - const int width, const int height, void* const prediction, - const ptrdiff_t /*pred_stride*/) { + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { const auto* src = static_cast<const uint8_t*>(reference); const ptrdiff_t src_stride = reference_stride; auto* dest = static_cast<uint16_t*>(prediction); @@ -2381,11 +2416,11 @@ void ConvolveCompoundCopy_NEON( } void ConvolveCompoundVertical_NEON( - const void* const reference, const ptrdiff_t reference_stride, - const int /*horizontal_filter_index*/, const int vertical_filter_index, - const int /*horizontal_filter_id*/, const int vertical_filter_id, - const int width, const int height, void* const prediction, - const ptrdiff_t /*pred_stride*/) { + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int vertical_filter_index, const int /*horizontal_filter_id*/, + const int vertical_filter_id, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; @@ -2408,8 +2443,9 @@ void ConvolveCompoundVertical_NEON( FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } - } else if ((filter_index == 1) & ((vertical_filter_id == 1) | - (vertical_filter_id == 15))) { // 5 tap. + } else if ((static_cast<int>(filter_index == 1) & + (static_cast<int>(vertical_filter_id == 1) | + static_cast<int>(vertical_filter_id == 15))) != 0) { // 5 tap. if (width == 4) { FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); @@ -2417,9 +2453,11 @@ void ConvolveCompoundVertical_NEON( FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } - } else if ((filter_index == 1) & - ((vertical_filter_id == 7) | (vertical_filter_id == 8) | - (vertical_filter_id == 9))) { // 6 tap with weird negative taps. + } else if ((static_cast<int>(filter_index == 1) & + (static_cast<int>(vertical_filter_id == 7) | + static_cast<int>(vertical_filter_id == 8) | + static_cast<int>(vertical_filter_id == 9))) != + 0) { // 6 tap with weird negative taps. if (width == 4) { FilterVertical4xH<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(src, src_stride, dest, @@ -2476,11 +2514,11 @@ void ConvolveCompoundVertical_NEON( } void ConvolveCompoundHorizontal_NEON( - const void* const reference, const ptrdiff_t reference_stride, - const int horizontal_filter_index, const int /*vertical_filter_index*/, - const int horizontal_filter_id, const int /*vertical_filter_id*/, - const int width, const int height, void* const prediction, - const ptrdiff_t /*pred_stride*/) { + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int horizontal_filter_index, + const int /*vertical_filter_index*/, const int horizontal_filter_id, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(horizontal_filter_index, width); const auto* const src = static_cast<const uint8_t*>(reference) - kHorizontalOffset; @@ -2492,9 +2530,10 @@ void ConvolveCompoundHorizontal_NEON( } template <int vertical_taps> -void Compound2DVertical(const uint16_t* const intermediate_result, - const int width, const int height, const int16x8_t taps, - void* const prediction) { +void Compound2DVertical( + const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width, + const int height, const int16x8_t taps, + void* LIBGAV1_RESTRICT const prediction) { auto* const dest = static_cast<uint16_t*>(prediction); if (width == 4) { Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>( @@ -2505,14 +2544,12 @@ void Compound2DVertical(const uint16_t* const intermediate_result, } } -void ConvolveCompound2D_NEON(const void* const reference, - const ptrdiff_t reference_stride, - const int horizontal_filter_index, - const int vertical_filter_index, - const int horizontal_filter_id, - const int vertical_filter_id, const int width, - const int height, void* const prediction, - const ptrdiff_t /*pred_stride*/) { +void ConvolveCompound2D_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int horizontal_filter_index, + const int vertical_filter_index, const int horizontal_filter_id, + const int vertical_filter_id, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { // The output of the horizontal filter, i.e. the intermediate_result, is // guaranteed to fit in int16_t. uint16_t @@ -2551,16 +2588,18 @@ void ConvolveCompound2D_NEON(const void* const reference, } } -inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) { +inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT const src, + uint8_t* LIBGAV1_RESTRICT const dst) { const uint8x16_t left = vld1q_u8(src); const uint8x16_t right = vld1q_u8(src + 1); vst1q_u8(dst, vrhaddq_u8(left, right)); } template <int width> -inline void IntraBlockCopyHorizontal(const uint8_t* src, +inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, - const int height, uint8_t* dst, + const int height, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const ptrdiff_t src_remainder_stride = src_stride - (width - 16); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); @@ -2601,10 +2640,13 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src, } void ConvolveIntraBlockCopyHorizontal_NEON( - const void* const reference, const ptrdiff_t reference_stride, - const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, - const int /*subpixel_x*/, const int /*subpixel_y*/, const int width, - const int height, void* const prediction, const ptrdiff_t pred_stride) { + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*subpixel_x*/, + const int /*subpixel_y*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast<const uint8_t*>(reference); auto* dest = static_cast<uint8_t*>(prediction); @@ -2630,7 +2672,7 @@ void ConvolveIntraBlockCopyHorizontal_NEON( src += reference_stride; dest += pred_stride; } while (--y != 0); - } else if (width == 4) { + } else { // width == 4 uint8x8_t left = vdup_n_u8(0); uint8x8_t right = vdup_n_u8(0); int y = height; @@ -2650,34 +2692,14 @@ void ConvolveIntraBlockCopyHorizontal_NEON( dest += pred_stride; y -= 2; } while (y != 0); - } else { - assert(width == 2); - uint8x8_t left = vdup_n_u8(0); - uint8x8_t right = vdup_n_u8(0); - int y = height; - do { - left = Load2<0>(src, left); - right = Load2<0>(src + 1, right); - src += reference_stride; - left = Load2<1>(src, left); - right = Load2<1>(src + 1, right); - src += reference_stride; - - const uint8x8_t result = vrhadd_u8(left, right); - - Store2<0>(dest, result); - dest += pred_stride; - Store2<1>(dest, result); - dest += pred_stride; - y -= 2; - } while (y != 0); } } template <int width> -inline void IntraBlockCopyVertical(const uint8_t* src, +inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, const int height, - uint8_t* dst, const ptrdiff_t dst_stride) { + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { const ptrdiff_t src_remainder_stride = src_stride - (width - 16); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); uint8x16_t row[8], below[8]; @@ -2764,11 +2786,13 @@ inline void IntraBlockCopyVertical(const uint8_t* src, } void ConvolveIntraBlockCopyVertical_NEON( - const void* const reference, const ptrdiff_t reference_stride, - const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, - const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, - const int width, const int height, void* const prediction, - const ptrdiff_t pred_stride) { + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast<const uint8_t*>(reference); auto* dest = static_cast<uint8_t*>(prediction); @@ -2799,7 +2823,7 @@ void ConvolveIntraBlockCopyVertical_NEON( row = below; } while (--y != 0); - } else if (width == 4) { + } else { // width == 4 uint8x8_t row = Load4(src); uint8x8_t below = vdup_n_u8(0); src += reference_stride; @@ -2814,28 +2838,13 @@ void ConvolveIntraBlockCopyVertical_NEON( row = below; } while (--y != 0); - } else { - assert(width == 2); - uint8x8_t row = Load2(src); - uint8x8_t below = vdup_n_u8(0); - src += reference_stride; - - int y = height; - do { - below = Load2<0>(src, below); - src += reference_stride; - - Store2<0>(dest, vrhadd_u8(row, below)); - dest += pred_stride; - - row = below; - } while (--y != 0); } } template <int width> -inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, - const int height, uint8_t* dst, +inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, const int height, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const ptrdiff_t src_remainder_stride = src_stride - (width - 8); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); @@ -2996,11 +3005,13 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, } void ConvolveIntraBlockCopy2D_NEON( - const void* const reference, const ptrdiff_t reference_stride, - const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, - const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, - const int width, const int height, void* const prediction, - const ptrdiff_t pred_stride) { + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast<const uint8_t*>(reference); auto* dest = static_cast<uint8_t*>(prediction); // Note: allow vertical access to height + 1. Because this function is only @@ -3017,7 +3028,7 @@ void ConvolveIntraBlockCopy2D_NEON( IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride); } else if (width == 8) { IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride); - } else if (width == 4) { + } else { // width == 4 uint8x8_t left = Load4(src); uint8x8_t right = Load4(src + 1); src += reference_stride; @@ -3045,34 +3056,6 @@ void ConvolveIntraBlockCopy2D_NEON( row = vget_high_u16(below); y -= 2; } while (y != 0); - } else { - uint8x8_t left = Load2(src); - uint8x8_t right = Load2(src + 1); - src += reference_stride; - - uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); - - int y = height; - do { - left = Load2<0>(src, left); - right = Load2<0>(src + 1, right); - src += reference_stride; - left = Load2<2>(src, left); - right = Load2<2>(src + 1, right); - src += reference_stride; - - const uint16x8_t below = vaddl_u8(left, right); - - const uint8x8_t result = vrshrn_n_u16( - vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2); - Store2<0>(dest, result); - dest += pred_stride; - Store2<2>(dest, result); - dest += pred_stride; - - row = vget_high_u16(below); - y -= 2; - } while (y != 0); } } |