diff options
Diffstat (limited to 'src/dsp/arm/intrapred_smooth_neon.cc')
-rw-r--r-- | src/dsp/arm/intrapred_smooth_neon.cc | 339 |
1 files changed, 156 insertions, 183 deletions
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc index bcda131..d6c1450 100644 --- a/src/dsp/arm/intrapred_smooth_neon.cc +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -31,7 +31,6 @@ namespace libgav1 { namespace dsp { - namespace low_bitdepth { namespace { @@ -42,20 +41,15 @@ constexpr uint8_t kSmoothWeights[] = { #include "src/dsp/smooth_weights.inc" }; -inline uint16x4_t CalculatePred(const uint16x4_t weighted_top, - const uint16x4_t weighted_left, - const uint16x4_t weighted_bl, - const uint16x4_t weighted_tr) { - const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left); - const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr); - const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1); - return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1); +// 256 - v = vneg_s8(v) +inline uint8x8_t NegateS8(const uint8x8_t v) { + return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v))); } template <int height> -inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { constexpr int width = 4; const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); @@ -68,47 +62,49 @@ inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v))); - const uint16x4_t weighted_bl = - vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v)); - - const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v)); - const uint16x4_t weighted_left = - vget_low_u16(vmull_u8(weights_x_v, left_v)); - const uint16x4_t weighted_tr = - vget_low_u16(vmull_u8(scaled_weights_x, top_right_v)); - const uint16x4_t result = - CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); - - StoreLo4(dst, vmovn_u16(vcombine_u16(result, result))); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_bl, weights_y_v, top_v); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x_v, left_v); + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale); + + StoreLo4(dst, result); dst += stride; } } -inline uint8x8_t CalculatePred(const uint16x8_t weighted_top, - const uint16x8_t weighted_left, - const uint16x8_t weighted_bl, - const uint16x8_t weighted_tr) { - // Maximum value: 0xFF00 - const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl); - // Maximum value: 0xFF00 - const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr); - const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1); - return vrshrn_n_u16(pred_2, kSmoothWeightScale); +inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl, + const uint16x8_t weighted_left_tr) { + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + return vrshrn_n_u16(avg, kSmoothWeightScale); +} + +inline uint8x8_t CalculateWeightsAndPred( + const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr, + const uint8x8_t bottom_left, const uint8x8_t weights_x, + const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) { + const uint16x8_t weighted_top = vmull_u8(weights_y, top); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left); + const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left); + return CalculatePred(weighted_top_bl, weighted_left_tr); } template <int height> -inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { constexpr int width = 8; const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); @@ -121,21 +117,16 @@ inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); - const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); - - const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); - const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v); - const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint8x8_t result = - CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); + CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v, + weights_x_v, scaled_weights_y, weights_y_v); vst1_u8(dst, result); dst += stride; @@ -146,28 +137,34 @@ inline uint8x16_t CalculateWeightsAndPred( const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right, const uint8x8_t weights_y, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) { - const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); + const uint16x8_t weighted_top_bl_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); - const uint16x8_t weighted_tr_low = - vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint8x8_t result_low = CalculatePred( - weighted_top_low, weighted_left_low, weighted_bl, weighted_tr_low); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t result_low = + CalculatePred(weighted_top_bl_low, weighted_left_tr_low); - const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); + const uint16x8_t weighted_top_bl_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); - const uint16x8_t weighted_tr_high = - vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint8x8_t result_high = CalculatePred( - weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); + const uint8x8_t result_high = + CalculatePred(weighted_top_bl_high, weighted_left_tr_high); return vcombine_u8(result_low, result_high); } +// 256 - v = vneg_s8(v) +inline uint8x16_t NegateS8(const uint8x16_t v) { + return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v))); +} + template <int width, int height> -inline void Smooth16PlusxN_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t top_right = top[width - 1]; @@ -188,9 +185,6 @@ inline void Smooth16PlusxN_NEON( const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); - // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop. - // This currently has a performance slope similar to Paeth so it does not - // appear to be register bound for arm64. uint8x16_t weights_x_v[4]; weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4); if (width > 16) { @@ -202,23 +196,19 @@ inline void Smooth16PlusxN_NEON( } uint8x16_t scaled_weights_x[4]; - scaled_weights_x[0] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0]))); + scaled_weights_x[0] = NegateS8(weights_x_v[0]); if (width > 16) { - scaled_weights_x[1] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1]))); + scaled_weights_x[1] = NegateS8(weights_x_v[1]); if (width == 64) { - scaled_weights_x[2] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2]))); - scaled_weights_x[3] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3]))); + scaled_weights_x[2] = NegateS8(weights_x_v[2]); + scaled_weights_x[3] = NegateS8(weights_x_v[3]); } } for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v, @@ -246,10 +236,10 @@ inline void Smooth16PlusxN_NEON( } template <int width, int height> -inline void SmoothVertical4Or8xN_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t bottom_left = left[height - 1]; @@ -267,17 +257,17 @@ inline void SmoothVertical4Or8xN_NEON( for (int y = 0; y < height; ++y) { const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); - const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); - const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl); - const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); + const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale); if (width == 4) { - StoreLo4(dst, pred_scaled); + StoreLo4(dst, pred); } else { // width == 8 - vst1_u8(dst, pred_scaled); + vst1_u8(dst, pred); } dst += stride; } @@ -286,10 +276,10 @@ inline void SmoothVertical4Or8xN_NEON( inline uint8x16_t CalculateVerticalWeightsAndPred( const uint8x16_t top, const uint8x8_t weights_y, const uint16x8_t weighted_bl) { - const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); - const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); - const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl); - const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl); + const uint16x8_t pred_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); + const uint16x8_t pred_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); const uint8x8_t pred_scaled_high = vrshrn_n_u16(pred_high, kSmoothWeightScale); @@ -297,7 +287,7 @@ inline uint8x16_t CalculateVerticalWeightsAndPred( } template <int width, int height> -inline void SmoothVertical16PlusxN_NEON( +void SmoothVertical16PlusxN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -321,7 +311,7 @@ inline void SmoothVertical16PlusxN_NEON( for (int y = 0; y < height; ++y) { const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); const uint8x16_t pred_0 = @@ -349,7 +339,7 @@ inline void SmoothVertical16PlusxN_NEON( } template <int width, int height> -inline void SmoothHorizontal4Or8xN_NEON( +void SmoothHorizontal4Or8xN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -361,22 +351,19 @@ inline void SmoothHorizontal4Or8xN_NEON( const uint8x8_t top_right_v = vdup_n_u8(top_right); // Over-reads for 4xN but still within the array. const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); - - const uint16x8_t weighted_left = vmull_u8(weights_x, left_v); - const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); - const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr); - const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x, left_v); + const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale); if (width == 4) { - StoreLo4(dst, pred_scaled); + StoreLo4(dst, pred); } else { // width == 8 - vst1_u8(dst, pred_scaled); + vst1_u8(dst, pred); } dst += stride; } @@ -386,23 +373,22 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred( const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x) { const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); - const uint16x8_t weighted_tr_low = - vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low); - const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t pred_scaled_low = + vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); - const uint16x8_t weighted_tr_high = - vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); const uint8x8_t pred_scaled_high = - vrshrn_n_u16(pred_high, kSmoothWeightScale); + vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale); return vcombine_u8(pred_scaled_low, pred_scaled_high); } template <int width, int height> -inline void SmoothHorizontal16PlusxN_NEON( +void SmoothHorizontal16PlusxN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -424,16 +410,12 @@ inline void SmoothHorizontal16PlusxN_NEON( } uint8x16_t scaled_weights_x[4]; - scaled_weights_x[0] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0]))); + scaled_weights_x[0] = NegateS8(weights_x[0]); if (width > 16) { - scaled_weights_x[1] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1]))); + scaled_weights_x[1] = NegateS8(weights_x[1]); if (width == 64) { - scaled_weights_x[2] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2]))); - scaled_weights_x[3] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3]))); + scaled_weights_x[2] = NegateS8(weights_x[2]); + scaled_weights_x[3] = NegateS8(weights_x[3]); } } @@ -633,10 +615,15 @@ constexpr uint16_t kSmoothWeights[] = { #include "src/dsp/smooth_weights.inc" }; +// 256 - v = vneg_s8(v) +inline uint16x4_t NegateS8(const uint16x4_t v) { + return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v))); +} + template <int height> -inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[3]; @@ -647,9 +634,7 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint16x4_t top_v = vld1_u16(top); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights); - const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v); - - // Weighted top right doesn't change with each row. + const uint16x4_t scaled_weights_x = NegateS8(weights_x_v); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { @@ -670,10 +655,10 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, // Common code between 8xH and [16|32|64]xH. inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, - const uint32x4_t& weighted_corners_low, - const uint32x4_t& weighted_corners_high, - const uint16x4x2_t& top_vals, - const uint16x4x2_t& weights_x, const uint16_t left_y, + const uint32x4_t weighted_corners_low, + const uint32x4_t weighted_corners_high, + const uint16x4x2_t top_vals, + const uint16x4x2_t weights_x, const uint16_t left_y, const uint16_t weight_y) { // Each variable in the running summation is named for the last item to be // accumulated. @@ -697,9 +682,9 @@ inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, } template <int height> -inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[7]; @@ -712,14 +697,12 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4), vld1_u16(kSmoothWeights + 8)}; - // Weighted top right doesn't change with each row. const uint32x4_t weighted_tr_low = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + vmull_n_u16(NegateS8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + vmull_n_u16(NegateS8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_corners_low = @@ -735,9 +718,9 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, // For width 16 and above. template <int width, int height> -inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[width - 1]; @@ -746,23 +729,19 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, auto* dst = static_cast<uint8_t*>(dest); - const uint16x4_t weight_scaling = vdup_n_u16(256); // Precompute weighted values that don't vary with |y|. uint32x4_t weighted_tr_low[width >> 3]; uint32x4_t weighted_tr_high[width >> 3]; for (int i = 0; i < width >> 3; ++i) { const int x = i << 3; const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x); - weighted_tr_low[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right); + weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right); const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x); - weighted_tr_high[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right); + weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right); } const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); auto* dst_x = reinterpret_cast<uint16_t*>(dst); @@ -785,10 +764,9 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, } template <int height> -inline void SmoothVertical4xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -812,10 +790,10 @@ inline void SmoothVertical4xH_NEON( } template <int height> -inline void SmoothVertical8xH_NEON( - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -829,7 +807,6 @@ inline void SmoothVertical8xH_NEON( for (int y = 0; y < height; ++y) { auto* dst16 = reinterpret_cast<uint16_t*>(dst); - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); @@ -846,10 +823,10 @@ inline void SmoothVertical8xH_NEON( // For width 16 and above. template <int width, int height> -inline void SmoothVerticalWxH_NEON( - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -865,7 +842,6 @@ inline void SmoothVerticalWxH_NEON( const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); @@ -885,10 +861,10 @@ inline void SmoothVerticalWxH_NEON( } template <int height> -inline void SmoothHorizontal4xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[3]; @@ -896,7 +872,7 @@ inline void SmoothHorizontal4xH_NEON( auto* dst = static_cast<uint8_t*>(dest); const uint16x4_t weights_x = vld1_u16(kSmoothWeights); - const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x); + const uint16x4_t scaled_weights_x = NegateS8(weights_x); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { @@ -909,10 +885,10 @@ inline void SmoothHorizontal4xH_NEON( } template <int height> -inline void SmoothHorizontal8xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[7]; @@ -923,9 +899,9 @@ inline void SmoothHorizontal8xH_NEON( vld1_u16(kSmoothWeights + 8)}; const uint32x4_t weighted_tr_low = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + vmull_n_u16(NegateS8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + vmull_n_u16(NegateS8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { auto* dst16 = reinterpret_cast<uint16_t*>(dst); @@ -943,18 +919,16 @@ inline void SmoothHorizontal8xH_NEON( // For width 16 and above. template <int width, int height> -inline void SmoothHorizontalWxH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[width - 1]; auto* dst = static_cast<uint8_t*>(dest); - const uint16x4_t weight_scaling = vdup_n_u16(256); - uint16x4_t weights_x_low[width >> 3]; uint16x4_t weights_x_high[width >> 3]; uint32x4_t weighted_tr_low[width >> 3]; @@ -962,11 +936,9 @@ inline void SmoothHorizontalWxH_NEON( for (int i = 0; i < width >> 3; ++i) { const int x = i << 3; weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x); - weighted_tr_low[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right); + weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right); weights_x_high[i] = vld1_u16(kSmoothWeights + width + x); - weighted_tr_high[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right); + weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right); } for (int y = 0; y < height; ++y) { @@ -1141,6 +1113,7 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = SmoothHorizontalWxH_NEON<64, 64>; } + } // namespace } // namespace high_bitdepth #endif // LIBGAV1_MAX_BITDEPTH >= 10 |