Diffstat (limited to 'src/dsp/arm/intrapred_smooth_neon.cc')
-rw-r--r-- | src/dsp/arm/intrapred_smooth_neon.cc | 741
1 file changed, 645 insertions, 96 deletions
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc index c33f333..bcda131 100644 --- a/src/dsp/arm/intrapred_smooth_neon.cc +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -26,6 +26,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" +#include "src/utils/common.h" #include "src/utils/constants.h" namespace libgav1 { @@ -38,24 +39,9 @@ namespace { // to have visibility of the values. This helps reduce loads and in the // creation of the inverse weights. constexpr uint8_t kSmoothWeights[] = { - // block dimension = 4 - 255, 149, 85, 64, - // block dimension = 8 - 255, 197, 146, 105, 73, 50, 37, 32, - // block dimension = 16 - 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, - // block dimension = 32 - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, - 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, - // block dimension = 64 - 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, - 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, - 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, - 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4}; - -// TODO(b/150459137): Keeping the intermediate values in uint16_t would allow -// processing more values at once. At the high end, it could do 4x4 or 8x2 at a -// time. +#include "src/dsp/smooth_weights.inc" +}; + inline uint16x4_t CalculatePred(const uint16x4_t weighted_top, const uint16x4_t weighted_left, const uint16x4_t weighted_bl, @@ -66,26 +52,74 @@ inline uint16x4_t CalculatePred(const uint16x4_t weighted_top, return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1); } -template <int width, int height> -inline void Smooth4Or8xN_NEON(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { - const uint8_t* const top = static_cast<const uint8_t*>(top_row); - const uint8_t* const left = static_cast<const uint8_t*>(left_column); +template <int height> +inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + constexpr int width = 4; + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t top_right = top[width - 1]; const uint8_t bottom_left = left[height - 1]; const uint8_t* const weights_y = kSmoothWeights + height - 4; - uint8_t* dst = static_cast<uint8_t*>(dest); + auto* dst = static_cast<uint8_t*>(dest); - uint8x8_t top_v; - if (width == 4) { - top_v = Load4(top); - } else { // width == 8 - top_v = vld1_u8(top); + const uint8x8_t top_v = Load4(top); + const uint8x8_t top_right_v = vdup_n_u8(top_right); + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); + const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4); + // 256 - weights = vneg_s8(weights) + const uint8x8_t scaled_weights_x = + vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + + for (int y = 0; y < height; ++y) { + const uint8x8_t left_v = vdup_n_u8(left[y]); + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); + const uint8x8_t scaled_weights_y = + vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v))); + const uint16x4_t weighted_bl = + vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v)); + + const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v)); + const uint16x4_t 
weighted_left = + vget_low_u16(vmull_u8(weights_x_v, left_v)); + const uint16x4_t weighted_tr = + vget_low_u16(vmull_u8(scaled_weights_x, top_right_v)); + const uint16x4_t result = + CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); + + StoreLo4(dst, vmovn_u16(vcombine_u16(result, result))); + dst += stride; } +} + +inline uint8x8_t CalculatePred(const uint16x8_t weighted_top, + const uint16x8_t weighted_left, + const uint16x8_t weighted_bl, + const uint16x8_t weighted_tr) { + // Maximum value: 0xFF00 + const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl); + // Maximum value: 0xFF00 + const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr); + const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1); + return vrshrn_n_u16(pred_2, kSmoothWeightScale); +} + +template <int height> +inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + constexpr int width = 8; + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); + const uint8_t top_right = top[width - 1]; + const uint8_t bottom_left = left[height - 1]; + const uint8_t* const weights_y = kSmoothWeights + height - 4; + auto* dst = static_cast<uint8_t*>(dest); + + const uint8x8_t top_v = vld1_u8(top); const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); - // Over-reads for 4xN but still within the array. const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4); // 256 - weights = vneg_s8(weights) const uint8x8_t scaled_weights_x = @@ -100,18 +134,10 @@ inline void Smooth4Or8xN_NEON(void* const dest, ptrdiff_t stride, const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v); const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); - const uint16x4_t dest_0 = - CalculatePred(vget_low_u16(weighted_top), vget_low_u16(weighted_left), - vget_low_u16(weighted_tr), vget_low_u16(weighted_bl)); + const uint8x8_t result = + CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); - if (width == 4) { - StoreLo4(dst, vmovn_u16(vcombine_u16(dest_0, dest_0))); - } else { // width == 8 - const uint16x4_t dest_1 = CalculatePred( - vget_high_u16(weighted_top), vget_high_u16(weighted_left), - vget_high_u16(weighted_tr), vget_high_u16(weighted_bl)); - vst1_u8(dst, vmovn_u16(vcombine_u16(dest_0, dest_1))); - } + vst1_u8(dst, result); dst += stride; } } @@ -124,39 +150,30 @@ inline uint8x16_t CalculateWeightsAndPred( const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); const uint16x8_t weighted_tr_low = vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint16x4_t dest_0 = CalculatePred( - vget_low_u16(weighted_top_low), vget_low_u16(weighted_left_low), - vget_low_u16(weighted_tr_low), vget_low_u16(weighted_bl)); - const uint16x4_t dest_1 = CalculatePred( - vget_high_u16(weighted_top_low), vget_high_u16(weighted_left_low), - vget_high_u16(weighted_tr_low), vget_high_u16(weighted_bl)); - const uint8x8_t dest_0_u8 = vmovn_u16(vcombine_u16(dest_0, dest_1)); + const uint8x8_t result_low = CalculatePred( + weighted_top_low, weighted_left_low, weighted_bl, weighted_tr_low); const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); const uint16x8_t 
weighted_tr_high = vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint16x4_t dest_2 = CalculatePred( - vget_low_u16(weighted_top_high), vget_low_u16(weighted_left_high), - vget_low_u16(weighted_tr_high), vget_low_u16(weighted_bl)); - const uint16x4_t dest_3 = CalculatePred( - vget_high_u16(weighted_top_high), vget_high_u16(weighted_left_high), - vget_high_u16(weighted_tr_high), vget_high_u16(weighted_bl)); - const uint8x8_t dest_1_u8 = vmovn_u16(vcombine_u16(dest_2, dest_3)); - - return vcombine_u8(dest_0_u8, dest_1_u8); + const uint8x8_t result_high = CalculatePred( + weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high); + + return vcombine_u8(result_low, result_high); } template <int width, int height> -inline void Smooth16PlusxN_NEON(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { - const uint8_t* const top = static_cast<const uint8_t*>(top_row); - const uint8_t* const left = static_cast<const uint8_t*>(left_column); +inline void Smooth16PlusxN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t top_right = top[width - 1]; const uint8_t bottom_left = left[height - 1]; const uint8_t* const weights_y = kSmoothWeights + height - 4; - uint8_t* dst = static_cast<uint8_t*>(dest); + auto* dst = static_cast<uint8_t*>(dest); uint8x16_t top_v[4]; top_v[0] = vld1q_u8(top); @@ -229,14 +246,15 @@ inline void Smooth16PlusxN_NEON(void* const dest, ptrdiff_t stride, } template <int width, int height> -inline void SmoothVertical4Or8xN_NEON(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { - const uint8_t* const top = static_cast<const uint8_t*>(top_row); - const uint8_t* const left = static_cast<const uint8_t*>(left_column); +inline void SmoothVertical4Or8xN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t bottom_left = left[height - 1]; const uint8_t* const weights_y = kSmoothWeights + height - 4; - uint8_t* dst = static_cast<uint8_t*>(dest); + auto* dst = static_cast<uint8_t*>(dest); uint8x8_t top_v; if (width == 4) { @@ -279,14 +297,15 @@ inline uint8x16_t CalculateVerticalWeightsAndPred( } template <int width, int height> -inline void SmoothVertical16PlusxN_NEON(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { - const uint8_t* const top = static_cast<const uint8_t*>(top_row); - const uint8_t* const left = static_cast<const uint8_t*>(left_column); +inline void SmoothVertical16PlusxN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t bottom_left = left[height - 1]; const uint8_t* const weights_y = kSmoothWeights + height - 4; - uint8_t* dst = static_cast<uint8_t*>(dest); + auto* dst = static_cast<uint8_t*>(dest); uint8x16_t top_v[4]; top_v[0] = vld1q_u8(top); @@ -330,13 
+349,14 @@ inline void SmoothVertical16PlusxN_NEON(void* const dest, ptrdiff_t stride, } template <int width, int height> -inline void SmoothHorizontal4Or8xN_NEON(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { - const uint8_t* const top = static_cast<const uint8_t*>(top_row); - const uint8_t* const left = static_cast<const uint8_t*>(left_column); +inline void SmoothHorizontal4Or8xN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t top_right = top[width - 1]; - uint8_t* dst = static_cast<uint8_t*>(dest); + auto* dst = static_cast<uint8_t*>(dest); const uint8x8_t top_right_v = vdup_n_u8(top_right); // Over-reads for 4xN but still within the array. @@ -382,13 +402,14 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred( } template <int width, int height> -inline void SmoothHorizontal16PlusxN_NEON(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { - const uint8_t* const top = static_cast<const uint8_t*>(top_row); - const uint8_t* const left = static_cast<const uint8_t*>(left_column); +inline void SmoothHorizontal16PlusxN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t top_right = top[width - 1]; - uint8_t* dst = static_cast<uint8_t*>(dest); + auto* dst = static_cast<uint8_t*>(dest); const uint8x8_t top_right_v = vdup_n_u8(top_right); @@ -447,7 +468,7 @@ void Init8bpp() { assert(dsp != nullptr); // 4x4 dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = - Smooth4Or8xN_NEON<4, 4>; + Smooth4xN_NEON<4>; dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = SmoothVertical4Or8xN_NEON<4, 4>; dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = @@ -455,7 +476,7 @@ void Init8bpp() { // 4x8 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = - Smooth4Or8xN_NEON<4, 8>; + Smooth4xN_NEON<8>; dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = SmoothVertical4Or8xN_NEON<4, 8>; dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = @@ -463,7 +484,7 @@ void Init8bpp() { // 4x16 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = - Smooth4Or8xN_NEON<4, 16>; + Smooth4xN_NEON<16>; dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = SmoothVertical4Or8xN_NEON<4, 16>; dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = @@ -471,7 +492,7 @@ void Init8bpp() { // 8x4 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = - Smooth4Or8xN_NEON<8, 4>; + Smooth8xN_NEON<4>; dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = SmoothVertical4Or8xN_NEON<8, 4>; dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = @@ -479,7 +500,7 @@ void Init8bpp() { // 8x8 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = - Smooth4Or8xN_NEON<8, 8>; + Smooth8xN_NEON<8>; dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = SmoothVertical4Or8xN_NEON<8, 8>; 
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = @@ -487,7 +508,7 @@ void Init8bpp() { // 8x16 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = - Smooth4Or8xN_NEON<8, 16>; + Smooth8xN_NEON<16>; dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = SmoothVertical4Or8xN_NEON<8, 16>; dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = @@ -495,7 +516,7 @@ void Init8bpp() { // 8x32 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = - Smooth4Or8xN_NEON<8, 32>; + Smooth8xN_NEON<32>; dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = SmoothVertical4Or8xN_NEON<8, 32>; dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = @@ -601,7 +622,535 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// Note these constants are duplicated from intrapred.cc to allow the compiler +// to have visibility of the values. This helps reduce loads and in the +// creation of the inverse weights. +constexpr uint16_t kSmoothWeights[] = { +#include "src/dsp/smooth_weights.inc" +}; + +template <int height> +inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t top_right = top[3]; + const uint16_t bottom_left = left[height - 1]; + const uint16_t* const weights_y = kSmoothWeights + height - 4; + auto* dst = static_cast<uint8_t*>(dest); + + const uint16x4_t top_v = vld1_u16(top); + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights); + const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v); + + // Weighted top right doesn't change with each row. + const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); + + for (int y = 0; y < height; ++y) { + // Each variable in the running summation is named for the last item to be + // accumulated. + const uint32x4_t weighted_top = + vmlal_n_u16(weighted_tr, top_v, weights_y[y]); + const uint32x4_t weighted_left = + vmlal_n_u16(weighted_top, weights_x_v, left[y]); + const uint32x4_t weighted_bl = + vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]); + + const uint16x4_t pred = vrshrn_n_u32(weighted_bl, kSmoothWeightScale + 1); + vst1_u16(reinterpret_cast<uint16_t*>(dst), pred); + dst += stride; + } +} + +// Common code between 8xH and [16|32|64]xH. +inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, + const uint32x4_t& weighted_corners_low, + const uint32x4_t& weighted_corners_high, + const uint16x4x2_t& top_vals, + const uint16x4x2_t& weights_x, const uint16_t left_y, + const uint16_t weight_y) { + // Each variable in the running summation is named for the last item to be + // accumulated. 
+ const uint32x4_t weighted_top_low = + vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y); + const uint32x4_t weighted_edges_low = + vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y); + + const uint16x4_t pred_low = + vrshrn_n_u32(weighted_edges_low, kSmoothWeightScale + 1); + vst1_u16(dst, pred_low); + + const uint32x4_t weighted_top_high = + vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y); + const uint32x4_t weighted_edges_high = + vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y); + + const uint16x4_t pred_high = + vrshrn_n_u32(weighted_edges_high, kSmoothWeightScale + 1); + vst1_u16(dst + 4, pred_high); +} + +template <int height> +inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t top_right = top[7]; + const uint16_t bottom_left = left[height - 1]; + const uint16_t* const weights_y = kSmoothWeights + height - 4; + + auto* dst = static_cast<uint8_t*>(dest); + + const uint16x4x2_t top_vals = {vld1_u16(top), vld1_u16(top + 4)}; + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4), + vld1_u16(kSmoothWeights + 8)}; + // Weighted top right doesn't change with each row. + const uint32x4_t weighted_tr_low = + vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + const uint32x4_t weighted_tr_high = + vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + + for (int y = 0; y < height; ++y) { + // |weighted_bl| is invariant across the row. + const uint32x4_t weighted_bl = + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); + const uint32x4_t weighted_corners_low = + vaddq_u32(weighted_bl, weighted_tr_low); + const uint32x4_t weighted_corners_high = + vaddq_u32(weighted_bl, weighted_tr_high); + CalculatePred8(reinterpret_cast<uint16_t*>(dst), weighted_corners_low, + weighted_corners_high, top_vals, weights_x, left[y], + weights_y[y]); + dst += stride; + } +} + +// For width 16 and above. +template <int width, int height> +inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t top_right = top[width - 1]; + const uint16_t bottom_left = left[height - 1]; + const uint16_t* const weights_y = kSmoothWeights + height - 4; + + auto* dst = static_cast<uint8_t*>(dest); + + const uint16x4_t weight_scaling = vdup_n_u16(256); + // Precompute weighted values that don't vary with |y|. + uint32x4_t weighted_tr_low[width >> 3]; + uint32x4_t weighted_tr_high[width >> 3]; + for (int i = 0; i < width >> 3; ++i) { + const int x = i << 3; + const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x); + weighted_tr_low[i] = + vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right); + const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x); + weighted_tr_high[i] = + vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right); + } + + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + for (int y = 0; y < height; ++y) { + // |weighted_bl| is invariant across the row. 
+ const uint32x4_t weighted_bl = + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); + auto* dst_x = reinterpret_cast<uint16_t*>(dst); + for (int i = 0; i < width >> 3; ++i) { + const int x = i << 3; + const uint16x4x2_t top_vals = {vld1_u16(top + x), vld1_u16(top + x + 4)}; + const uint32x4_t weighted_corners_low = + vaddq_u32(weighted_bl, weighted_tr_low[i]); + const uint32x4_t weighted_corners_high = + vaddq_u32(weighted_bl, weighted_tr_high[i]); + // Accumulate weighted edge values and store. + const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + width - 4 + x), + vld1_u16(kSmoothWeights + width + x)}; + CalculatePred8(dst_x, weighted_corners_low, weighted_corners_high, + top_vals, weights_x, left[y], weights_y[y]); + dst_x += 8; + } + dst += stride; + } +} + +template <int height> +inline void SmoothVertical4xH_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t bottom_left = left[height - 1]; + const uint16_t* const weights_y = kSmoothWeights + height - 4; + + auto* dst = static_cast<uint8_t*>(dest); + + const uint16x4_t top_v = vld1_u16(top); + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + + for (int y = 0; y < height; ++y) { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint32x4_t weighted_bl = + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); + const uint32x4_t weighted_top = + vmlal_n_u16(weighted_bl, top_v, weights_y[y]); + vst1_u16(dst16, vrshrn_n_u32(weighted_top, kSmoothWeightScale)); + + dst += stride; + } +} + +template <int height> +inline void SmoothVertical8xH_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t bottom_left = left[height - 1]; + const uint16_t* const weights_y = kSmoothWeights + height - 4; + + auto* dst = static_cast<uint8_t*>(dest); + + const uint16x4_t top_low = vld1_u16(top); + const uint16x4_t top_high = vld1_u16(top + 4); + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + + for (int y = 0; y < height; ++y) { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + // |weighted_bl| is invariant across the row. + const uint32x4_t weighted_bl = + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); + + const uint32x4_t weighted_top_low = + vmlal_n_u16(weighted_bl, top_low, weights_y[y]); + vst1_u16(dst16, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale)); + + const uint32x4_t weighted_top_high = + vmlal_n_u16(weighted_bl, top_high, weights_y[y]); + vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale)); + dst += stride; + } +} + +// For width 16 and above. 
+template <int width, int height> +inline void SmoothVerticalWxH_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t bottom_left = left[height - 1]; + const uint16_t* const weights_y = kSmoothWeights + height - 4; + + auto* dst = static_cast<uint8_t*>(dest); + + uint16x4x2_t top_vals[width >> 3]; + for (int i = 0; i < width >> 3; ++i) { + const int x = i << 3; + top_vals[i] = {vld1_u16(top + x), vld1_u16(top + x + 4)}; + } + + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + for (int y = 0; y < height; ++y) { + // |weighted_bl| is invariant across the row. + const uint32x4_t weighted_bl = + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); + + auto* dst_x = reinterpret_cast<uint16_t*>(dst); + for (int i = 0; i < width >> 3; ++i) { + const uint32x4_t weighted_top_low = + vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); + vst1_u16(dst_x, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale)); + + const uint32x4_t weighted_top_high = + vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]); + vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale)); + dst_x += 8; + } + dst += stride; + } +} + +template <int height> +inline void SmoothHorizontal4xH_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t top_right = top[3]; + + auto* dst = static_cast<uint8_t*>(dest); + + const uint16x4_t weights_x = vld1_u16(kSmoothWeights); + const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x); + + const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); + for (int y = 0; y < height; ++y) { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint32x4_t weighted_left = + vmlal_n_u16(weighted_tr, weights_x, left[y]); + vst1_u16(dst16, vrshrn_n_u32(weighted_left, kSmoothWeightScale)); + dst += stride; + } +} + +template <int height> +inline void SmoothHorizontal8xH_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t top_right = top[7]; + + auto* dst = static_cast<uint8_t*>(dest); + + const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4), + vld1_u16(kSmoothWeights + 8)}; + + const uint32x4_t weighted_tr_low = + vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + const uint32x4_t weighted_tr_high = + vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + + for (int y = 0; y < height; ++y) { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16_t left_y = left[y]; + const uint32x4_t weighted_left_low = + vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y); + vst1_u16(dst16, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale)); + + const uint32x4_t weighted_left_high = + vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y); + vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale)); + dst += stride; 
+ } +} + +// For width 16 and above. +template <int width, int height> +inline void SmoothHorizontalWxH_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint16_t*>(top_row); + const auto* const left = static_cast<const uint16_t*>(left_column); + const uint16_t top_right = top[width - 1]; + + auto* dst = static_cast<uint8_t*>(dest); + + const uint16x4_t weight_scaling = vdup_n_u16(256); + + uint16x4_t weights_x_low[width >> 3]; + uint16x4_t weights_x_high[width >> 3]; + uint32x4_t weighted_tr_low[width >> 3]; + uint32x4_t weighted_tr_high[width >> 3]; + for (int i = 0; i < width >> 3; ++i) { + const int x = i << 3; + weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x); + weighted_tr_low[i] = + vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right); + weights_x_high[i] = vld1_u16(kSmoothWeights + width + x); + weighted_tr_high[i] = + vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right); + } + + for (int y = 0; y < height; ++y) { + auto* dst_x = reinterpret_cast<uint16_t*>(dst); + const uint16_t left_y = left[y]; + for (int i = 0; i < width >> 3; ++i) { + const uint32x4_t weighted_left_low = + vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); + vst1_u16(dst_x, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale)); + + const uint32x4_t weighted_left_high = + vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y); + vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale)); + dst_x += 8; + } + dst += stride; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + // 4x4 + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + Smooth4xH_NEON<4>; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + SmoothVertical4xH_NEON<4>; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4xH_NEON<4>; + + // 4x8 + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + Smooth4xH_NEON<8>; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + SmoothVertical4xH_NEON<8>; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4xH_NEON<8>; + + // 4x16 + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + Smooth4xH_NEON<16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + SmoothVertical4xH_NEON<16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4xH_NEON<16>; + + // 8x4 + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + Smooth8xH_NEON<4>; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + SmoothVertical8xH_NEON<4>; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8xH_NEON<4>; + + // 8x8 + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + Smooth8xH_NEON<8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + SmoothVertical8xH_NEON<8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8xH_NEON<8>; + + // 8x16 + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + Smooth8xH_NEON<16>; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + SmoothVertical8xH_NEON<16>; + 
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8xH_NEON<16>; + + // 8x32 + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + Smooth8xH_NEON<32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + SmoothVertical8xH_NEON<32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8xH_NEON<32>; + + // 16x4 + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + SmoothWxH_NEON<16, 4>; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<16, 4>; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<16, 4>; + + // 16x8 + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + SmoothWxH_NEON<16, 8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<16, 8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<16, 8>; + + // 16x16 + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + SmoothWxH_NEON<16, 16>; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<16, 16>; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<16, 16>; + + // 16x32 + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + SmoothWxH_NEON<16, 32>; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<16, 32>; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<16, 32>; + + // 16x64 + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + SmoothWxH_NEON<16, 64>; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<16, 64>; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<16, 64>; + + // 32x8 + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + SmoothWxH_NEON<32, 8>; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<32, 8>; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<32, 8>; + + // 32x16 + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + SmoothWxH_NEON<32, 16>; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<32, 16>; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<32, 16>; + + // 32x32 + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + SmoothWxH_NEON<32, 32>; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<32, 32>; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<32, 32>; + + // 32x64 + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + SmoothWxH_NEON<32, 64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<32, 64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<32, 64>; + + // 64x16 + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + SmoothWxH_NEON<64, 16>; + 
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<64, 16>; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<64, 16>; + + // 64x32 + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + SmoothWxH_NEON<64, 32>; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<64, 32>; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<64, 32>; + + // 64x64 + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + SmoothWxH_NEON<64, 64>; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + SmoothVerticalWxH_NEON<64, 64>; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontalWxH_NEON<64, 64>; +} +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredSmoothInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 |
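
Reviewer note (not part of the change): every 8bpp and 10bpp kernel in this diff vectorizes the same per-pixel SMOOTH blend of the top row, left column, top-right and bottom-left samples. The scalar sketch below is a reference for that blend, assuming kSmoothWeightScale == 8 (so the weights sum to 256); the 4-tap weights are the "block dimension = 4" entries from the table this change moves into smooth_weights.inc, and the helper name SmoothBlockRef is illustrative rather than anything that exists in libgav1.

#include <cstddef>
#include <cstdint>

namespace {

constexpr int kScale = 256;  // 1 << kSmoothWeightScale, assuming the scale is 8.
constexpr uint8_t kWeights4[4] = {255, 149, 85, 64};  // block dimension = 4

// Plain C++ model of one width x height SMOOTH block. weights_x/weights_y are
// the smooth-weight entries for the block's width and height respectively.
void SmoothBlockRef(uint8_t* dst, ptrdiff_t stride, const uint8_t* top,
                    const uint8_t* left, const uint8_t* weights_x,
                    const uint8_t* weights_y, int width, int height) {
  const int top_right = top[width - 1];
  const int bottom_left = left[height - 1];
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // weighted_top + weighted_bl + weighted_left + weighted_tr, as in the
      // NEON kernels, then one rounding shift by kSmoothWeightScale + 1.
      const uint32_t sum = weights_y[y] * top[x] +
                           (kScale - weights_y[y]) * bottom_left +
                           weights_x[x] * left[y] +
                           (kScale - weights_x[x]) * top_right;
      dst[x] = static_cast<uint8_t>((sum + kScale) >> 9);
    }
    dst += stride;
  }
}

}  // namespace

Called as SmoothBlockRef(dst, stride, top, left, kWeights4, kWeights4, 4, 4), this should track Smooth4xN_NEON<4> byte for byte; the 10bpp kernels compute the same sum with uint16_t samples and uint16_t stores.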
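
A second note on the two shortcuts the 8bpp path relies on. First, the inverse weight 256 - w is produced by negating the uint8 weight vector and reinterpreting the bits (the "256 - weights = vneg_s8(weights)" comment); this is exact because every weight in the table lies between 4 and 255, so the negated byte never wraps to 0. Second, the new uint16 CalculatePred replaces the widened adds plus vrshrn_n_u32(..., kSmoothWeightScale + 1) with vhaddq_u16 plus vrshrn_n_u16(..., kSmoothWeightScale); this is also exact because each pairwise sum is at most 0xFF00 (per the comments in the diff), so the halving add cannot overflow, and a truncating halving add followed by a rounding shift by 8 equals a rounding shift by 9 of the full sum. The scalar self-check below illustrates both identities without needing NEON; it is an illustrative test, not part of the commit.

#include <cassert>
#include <cstdint>

int main() {
  // Identity 1: for w in [4, 255] (the range covered by the smooth weights
  // table), byte-wise negation of w equals 256 - w.
  for (int w = 4; w <= 255; ++w) {
    const uint8_t scaled = static_cast<uint8_t>(-w);  // vneg_s8 + reinterpret
    assert(scaled == 256 - w);
  }

  // Identity 2: with a = weighted_top + weighted_bl and
  // b = weighted_left + weighted_tr, each at most 0xFF00,
  // (((a + b) >> 1) + 128) >> 8 (vhaddq_u16 then vrshrn_n_u16 by 8) matches
  // (a + b + 256) >> 9 (the previous 32-bit rounding shift by 9).
  for (uint32_t a = 0; a <= 0xFF00; ++a) {
    for (uint32_t b = a; b <= 0xFF00; b += 0xFD) {  // sampled, not exhaustive
      const uint32_t wide = (a + b + 256) >> 9;
      const uint32_t halved = (((a + b) >> 1) + 128) >> 8;
      assert(wide == halved);
    }
  }
  return 0;
}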