From 2381d803c76105f44717d75f089ec37f51e5cfe4 Mon Sep 17 00:00:00 2001 From: qinxialei Date: Thu, 22 Apr 2021 11:20:15 +0800 Subject: New upstream version 0.16.3 --- src/dsp/arm/average_blend_neon.cc | 135 +- src/dsp/arm/cdef_neon.cc | 11 +- src/dsp/arm/common_neon.h | 70 +- src/dsp/arm/convolve_neon.cc | 943 +++++----- src/dsp/arm/distance_weighted_blend_neon.cc | 162 +- src/dsp/arm/distance_weighted_blend_neon.h | 2 + src/dsp/arm/film_grain_neon.cc | 2 +- src/dsp/arm/intra_edge_neon.cc | 243 ++- src/dsp/arm/intra_edge_neon.h | 3 + src/dsp/arm/intrapred_cfl_neon.cc | 1012 ++++++++++- src/dsp/arm/intrapred_cfl_neon.h | 179 ++ src/dsp/arm/intrapred_directional_neon.cc | 594 ++++++- src/dsp/arm/intrapred_directional_neon.h | 56 + src/dsp/arm/intrapred_filter_intra_neon.cc | 176 -- src/dsp/arm/intrapred_filter_neon.cc | 176 ++ src/dsp/arm/intrapred_filter_neon.h | 37 + src/dsp/arm/intrapred_neon.cc | 247 ++- src/dsp/arm/intrapred_neon.h | 218 +-- src/dsp/arm/intrapred_smooth_neon.cc | 5 +- src/dsp/arm/intrapred_smooth_neon.h | 149 ++ src/dsp/arm/inverse_transform_10bit_neon.cc | 2543 +++++++++++++++++++++++++++ src/dsp/arm/inverse_transform_neon.cc | 2 +- src/dsp/arm/inverse_transform_neon.h | 16 + src/dsp/arm/loop_filter_neon.cc | 18 +- src/dsp/arm/loop_restoration_neon.cc | 1470 +++++++++++----- src/dsp/arm/mask_blend_neon.cc | 2 +- src/dsp/arm/motion_field_projection_neon.cc | 2 +- src/dsp/arm/motion_vector_search_neon.cc | 2 +- src/dsp/arm/obmc_neon.cc | 2 +- src/dsp/arm/super_res_neon.cc | 151 +- src/dsp/arm/super_res_neon.h | 5 +- src/dsp/arm/warp_neon.cc | 4 +- src/dsp/arm/weight_mask_neon.cc | 2 +- 33 files changed, 7217 insertions(+), 1422 deletions(-) create mode 100644 src/dsp/arm/intrapred_cfl_neon.h create mode 100644 src/dsp/arm/intrapred_directional_neon.h delete mode 100644 src/dsp/arm/intrapred_filter_intra_neon.cc create mode 100644 src/dsp/arm/intrapred_filter_neon.cc create mode 100644 src/dsp/arm/intrapred_filter_neon.h create mode 100644 src/dsp/arm/intrapred_smooth_neon.h create mode 100644 src/dsp/arm/inverse_transform_10bit_neon.cc (limited to 'src/dsp/arm') diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc index 834e8b4..5b4c094 100644 --- a/src/dsp/arm/average_blend_neon.cc +++ b/src/dsp/arm/average_blend_neon.cc @@ -35,6 +35,11 @@ namespace { constexpr int kInterPostRoundBit = kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; +} // namespace + +namespace low_bitdepth { +namespace { + inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0, const int16_t* prediction_1) { const int16x8_t pred0 = vld1q_s16(prediction_0); @@ -128,13 +133,139 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0, + const uint16_t* prediction_1, + const int32x4_t compound_offset, + const uint16x8_t v_bitdepth) { + const uint16x8_t pred0 = vld1q_u16(prediction_0); + const uint16x8_t pred1 = vld1q_u16(prediction_1); + const uint32x4_t pred_lo = + vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1)); + const uint32x4_t pred_hi = + vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1)); + const int32x4_t offset_lo = + vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset); + const int32x4_t offset_hi = + vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset); + const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1); + const uint16x4_t res_hi = 
vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1); + return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth); +} + +inline void AverageBlendLargeRow(const uint16_t* prediction_0, + const uint16_t* prediction_1, const int width, + uint16_t* dest, + const int32x4_t compound_offset, + const uint16x8_t v_bitdepth) { + int x = width; + do { + vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1, + compound_offset, v_bitdepth)); + prediction_0 += 8; + prediction_1 += 8; + dest += 8; + + vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1, + compound_offset, v_bitdepth)); + prediction_0 += 8; + prediction_1 += 8; + dest += 8; + + x -= 16; + } while (x != 0); +} + +void AverageBlend_NEON(const void* prediction_0, const void* prediction_1, + const int width, const int height, void* const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y = height; + + const ptrdiff_t dst_stride = dest_stride >> 1; + const int32x4_t compound_offset = + vdupq_n_s32(static_cast(kCompoundOffset + kCompoundOffset)); + const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + if (width == 4) { + do { + const uint16x8_t result = + AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth); + pred_0 += 8; + pred_1 += 8; + + vst1_u16(dst, vget_low_u16(result)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(result)); + dst += dst_stride; + y -= 2; + } while (y != 0); + return; + } + + if (width == 8) { + do { + vst1q_u16(dst, + AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth)); + dst += dst_stride; + pred_0 += 8; + pred_1 += 8; + + vst1q_u16(dst, + AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth)); + dst += dst_stride; + pred_0 += 8; + pred_1 += 8; + + y -= 2; + } while (y != 0); + return; + } + + do { + AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset, + v_bitdepth); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + + AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset, + v_bitdepth); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->average_blend = AverageBlend_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 -void AverageBlendInit_NEON() { Init8bpp(); } +void AverageBlendInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc index 4d0e76f..60c72d6 100644 --- a/src/dsp/arm/cdef_neon.cc +++ b/src/dsp/arm/cdef_neon.cc @@ -265,7 +265,7 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source, // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00 // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00 // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00 - partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 
3); @@ -285,9 +285,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source, // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00 // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00 // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00 - const uint8x8_t v_zero = vdup_n_u8(0); - partial_lo[6] = vaddl_u8(v_zero, v_src[0]); - for (int i = 1; i < 8; ++i) { + partial_lo[6] = vaddl_u8(v_src[0], v_src[1]); + for (int i = 2; i < 8; ++i) { partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]); } @@ -451,7 +450,7 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference, const uint16x8_t threshold, const int16x8_t damping) { - // If reference > pixel, the difference will be negative, so covert to 0 or + // If reference > pixel, the difference will be negative, so convert to 0 or // -1. const uint16x8_t sign = vcgtq_u16(reference, pixel); const uint16x8_t abs_diff = vabdq_u16(pixel, reference); @@ -686,7 +685,7 @@ void CdefInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h index dcb7567..05e0d05 100644 --- a/src/dsp/arm/common_neon.h +++ b/src/dsp/arm/common_neon.h @@ -28,8 +28,7 @@ #if 0 #include - -#include "absl/strings/str_cat.h" +#include constexpr bool kEnablePrintRegs = true; @@ -86,11 +85,11 @@ inline void PrintVectQ(const DebugRegisterQ r, const char* const name, inline void PrintReg(const int32x4x2_t val, const std::string& name) { DebugRegisterQ r; - vst1q_u32(r.u32, val.val[0]); - const std::string name0 = absl::StrCat(name, ".val[0]").c_str(); + vst1q_s32(r.i32, val.val[0]); + const std::string name0 = name + std::string(".val[0]"); PrintVectQ(r, name0.c_str(), 32); - vst1q_u32(r.u32, val.val[1]); - const std::string name1 = absl::StrCat(name, ".val[1]").c_str(); + vst1q_s32(r.i32, val.val[1]); + const std::string name1 = name + std::string(".val[1]"); PrintVectQ(r, name1.c_str(), 32); } @@ -169,14 +168,14 @@ inline void PrintReg(const int8x8_t val, const char* name) { // Print an individual (non-vector) value in decimal format. inline void PrintReg(const int x, const char* name) { if (kEnablePrintRegs) { - printf("%s: %d\n", name, x); + fprintf(stderr, "%s: %d\n", name, x); } } // Print an individual (non-vector) value in hexadecimal format. inline void PrintHex(const int x, const char* name) { if (kEnablePrintRegs) { - printf("%s: %x\n", name, x); + fprintf(stderr, "%s: %x\n", name, x); } } @@ -277,22 +276,32 @@ inline void Store2(uint16_t* const buf, const uint16x4_t val) { ValueToMem(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane)); } +// Simplify code when caller has |buf| cast as uint8_t*. +inline void Store4(void* const buf, const uint16x4_t val) { + vst1_u16(static_cast(buf), val); +} + +// Simplify code when caller has |buf| cast as uint8_t*. +inline void Store8(void* const buf, const uint16x8_t val) { + vst1q_u16(static_cast(buf), val); +} + //------------------------------------------------------------------------------ // Bit manipulation. // vshXX_n_XX() requires an immediate. 
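(Aside on the 10bpp average blend added at the top of this patch.) The new high_bitdepth::AverageBlend8Row sums the two compound predictions, removes the doubled compound offset, applies a single rounding shift and clamps to the 10-bit range. The scalar sketch below restates that per-pixel arithmetic; it is illustrative only, and the offset and shift are taken as parameters rather than hard-coding libgav1's kCompoundOffset and kInterPostRoundBit values.

#include <algorithm>
#include <cstdint>

// Scalar equivalent of the 10bpp AverageBlend8Row arithmetic: sum the two
// compound predictions, subtract the doubled compound offset, round-shift by
// (post_round_bit + 1) and clamp to [0, 1023]. vqrshrun_n_s32 also saturates
// negative intermediates to zero, which the std::max below mirrors.
inline uint16_t AverageBlendPixel(uint16_t pred0, uint16_t pred1,
                                  int32_t compound_offset,  // kCompoundOffset
                                  int post_round_bit) {     // kInterPostRoundBit
  const int32_t diff =
      static_cast<int32_t>(pred0) + pred1 - 2 * compound_offset;
  const int shift = post_round_bit + 1;
  const int32_t rounded = (diff + (1 << (shift - 1))) >> shift;
  const int32_t max_value = (1 << 10) - 1;  // 10-bit maximum
  return static_cast<uint16_t>(
      std::min(std::max<int32_t>(rounded, 0), max_value));
}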
template -inline uint8x8_t LeftShift(const uint8x8_t vector) { +inline uint8x8_t LeftShiftVector(const uint8x8_t vector) { return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift)); } template -inline uint8x8_t RightShift(const uint8x8_t vector) { +inline uint8x8_t RightShiftVector(const uint8x8_t vector) { return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift)); } template -inline int8x8_t RightShift(const int8x8_t vector) { +inline int8x8_t RightShiftVector(const int8x8_t vector) { return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift)); } @@ -387,6 +396,15 @@ inline uint16_t SumVector(const uint8x8_t a) { #endif // defined(__aarch64__) } +inline uint32_t SumVector(const uint32x2_t a) { +#if defined(__aarch64__) + return vaddv_u32(a); +#else + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); +#endif // defined(__aarch64__) +} + inline uint32_t SumVector(const uint32x4_t a) { #if defined(__aarch64__) return vaddvq_u32(a); @@ -446,6 +464,36 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { return b0; } +// Input: +// 00 01 02 03 +// 10 11 12 13 +// 20 21 22 23 +// 30 31 32 33 +inline void Transpose4x4(uint16x4_t a[4]) { + // b: + // 00 10 02 12 + // 01 11 03 13 + const uint16x4x2_t b = vtrn_u16(a[0], a[1]); + // c: + // 20 30 22 32 + // 21 31 23 33 + const uint16x4x2_t c = vtrn_u16(a[2], a[3]); + // d: + // 00 10 20 30 + // 02 12 22 32 + const uint32x2x2_t d = + vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0])); + // e: + // 01 11 21 31 + // 03 13 23 33 + const uint32x2x2_t e = + vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1])); + a[0] = vreinterpret_u16_u32(d.val[0]); + a[1] = vreinterpret_u16_u32(e.val[0]); + a[2] = vreinterpret_u16_u32(d.val[1]); + a[3] = vreinterpret_u16_u32(e.val[1]); +} + // Input: // a: 00 01 02 03 10 11 12 13 // b: 20 21 22 23 30 31 32 33 diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc index fd9b912..331bfe2 100644 --- a/src/dsp/arm/convolve_neon.cc +++ b/src/dsp/arm/convolve_neon.cc @@ -101,245 +101,278 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src, return vreinterpretq_s16_u16(sum); } -template -int16x8_t SumHorizontalTaps(const uint8_t* const src, - const uint8x8_t* const v_tap) { - uint8x8_t v_src[8]; - const uint8x16_t src_long = vld1q_u8(src); - int16x8_t sum; - - if (filter_index < 2) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5)); - v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6)); - sum = SumOnePassTaps(v_src, v_tap + 1); - } else if (filter_index == 2) { - v_src[0] = vget_low_u8(src_long); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); - v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); - v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); - sum = SumOnePassTaps(v_src, v_tap); - } else if (filter_index == 3) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - sum = SumOnePassTaps(v_src, v_tap + 
3); - } else if (filter_index > 3) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5)); - sum = SumOnePassTaps(v_src, v_tap + 2); - } - return sum; -} - -template -uint8x8_t SimpleHorizontalTaps(const uint8_t* const src, - const uint8x8_t* const v_tap) { - int16x8_t sum = - SumHorizontalTaps(src, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. - constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); - return vqrshrun_n_s16(sum, kFilterBits - 1); -} - -template -uint16x8_t HorizontalTaps8To16(const uint8_t* const src, - const uint8x8_t* const v_tap) { - const int16x8_t sum = - SumHorizontalTaps(src, v_tap); - - return vreinterpretq_u16_s16( - vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); -} - -template -int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - uint16x8_t sum; - const uint8x8_t input0 = vld1_u8(src); - src += src_stride; - const uint8x8_t input1 = vld1_u8(src); - uint8x8x2_t input = vzip_u8(input0, input1); - - if (filter_index == 3) { - // tap signs : + + - sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - } else if (filter_index == 4) { - // tap signs : - + + - - sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]); - } else { - // tap signs : + + + + - sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]); - sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]); - } - - return vreinterpretq_s16_u16(sum); -} - -template -uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src, - const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - int16x8_t sum = SumHorizontalTaps2x2(src, src_stride, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. 
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); - return vqrshrun_n_s16(sum, kFilterBits - 1); -} - -template -uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src, - const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - const int16x8_t sum = - SumHorizontalTaps2x2(src, src_stride, v_tap); - - return vreinterpretq_u16_s16( - vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); -} - -template -void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, - void* const dest, const ptrdiff_t pred_stride, - const int width, const int height, - const uint8x8_t* const v_tap) { +template +void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int width, const int height, + const uint8x8_t* const v_tap) { auto* dest8 = static_cast(dest); auto* dest16 = static_cast(dest); - - // 4 tap filters are never used when width > 4. - if (num_taps != 4 && width > 4) { - int y = 0; + if (!is_2d) { + int y = height; do { int x = 0; - do { - if (is_2d || is_compound) { - const uint16x8_t v_sum = - HorizontalTaps8To16(&src[x], - v_tap); + do { // Increasing loop counter x is better. + const uint8x16_t src_long = vld1q_u8(src + x); + uint8x8_t v_src[8]; + int16x8_t sum; + if (filter_index < 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + sum = SumOnePassTaps(v_src, + v_tap + 1); + } else if (filter_index == 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); + v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); + sum = SumOnePassTaps(v_src, v_tap); + } else if (filter_index == 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + sum = SumOnePassTaps(v_src, v_tap + 3); + } else if (filter_index > 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + sum = SumOnePassTaps(v_src, v_tap + 2); + } + if (is_compound) { + const uint16x8_t v_sum = vreinterpretq_u16_s16( + vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); vst1q_u16(&dest16[x], v_sum); } else { - const uint8x8_t result = - SimpleHorizontalTaps(&src[x], - v_tap); + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. + // Combining them requires adding the rounding offset from the skipped + // shift. 
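(Aside.) The comment above, which also appears in the removed SimpleHorizontalTaps helpers, is easier to verify with plain integer arithmetic. The standalone sketch below is not part of the patch; r1 and r2 stand for the two rounding shifts (kInterRoundBitsHorizontal - 1 and kFilterBits - kInterRoundBitsHorizontal), and the values in main() are just example shift amounts.

#include <cassert>
#include <cstdint>

// Rounding arithmetic shift right.
inline int32_t RoundShift(int32_t x, int bits) {
  return (x + (1 << (bits - 1))) >> bits;
}

// Two-pass form: round-shift by r1, then by r2.
inline int32_t TwoPassShift(int32_t sum, int r1, int r2) {
  return RoundShift(RoundShift(sum, r1), r2);
}

// Single-pass form: one round-shift by (r1 + r2), plus the rounding term of
// the skipped first shift, 1 << (r1 - 1). With r1 = kInterRoundBitsHorizontal
// - 1 that term is 1 << (kInterRoundBitsHorizontal - 2), i.e.
// first_shift_rounding_bit in the code above.
inline int32_t CombinedShift(int32_t sum, int r1, int r2) {
  return RoundShift(sum + (1 << (r1 - 1)), r1 + r2);
}

int main() {
  // The two forms agree exactly for arithmetic (sign-preserving) shifts.
  for (int32_t sum = -4096; sum <= 4096; ++sum) {
    assert(TwoPassShift(sum, 2, 4) == CombinedShift(sum, 2, 4));
  }
  return 0;
}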
+ constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1); vst1_u8(&dest8[x], result); } - x += step; + x += 8; } while (x < width); src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); + } else { + int x = 0; + do { + const uint8_t* s = src + x; + int y = height; + do { // Increasing loop counter x is better. + const uint8x16_t src_long = vld1q_u8(s); + uint8x8_t v_src[8]; + int16x8_t sum; + if (filter_index < 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + sum = SumOnePassTaps(v_src, + v_tap + 1); + } else if (filter_index == 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); + v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); + sum = SumOnePassTaps(v_src, v_tap); + } else if (filter_index == 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + sum = SumOnePassTaps(v_src, v_tap + 3); + } else if (filter_index > 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + sum = SumOnePassTaps(v_src, v_tap + 2); + } + const uint16x8_t v_sum = vreinterpretq_u16_s16( + vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); + vst1q_u16(dest16, v_sum); + s += src_stride; + dest16 += 8; + } while (--y != 0); + x += 8; + } while (x < width); + } +} + +template +void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int height, const uint8x8_t* const v_tap) { + auto* dest8 = static_cast(dest); + auto* dest16 = static_cast(dest); + int y = height; + do { + uint8x8_t v_src[4]; + int16x8_t sum; + v_src[0] = vld1_u8(src); + if (filter_index == 3) { + v_src[1] = RightShiftVector<1 * 8>(v_src[0]); + sum = SumOnePassTaps(v_src, v_tap + 3); + } else { + v_src[1] = RightShiftVector<1 * 8>(v_src[0]); + v_src[2] = RightShiftVector<2 * 8>(v_src[0]); + v_src[3] = RightShiftVector<3 * 8>(v_src[0]); + sum = SumOnePassTaps(v_src, v_tap + 2); + } + if (is_2d || is_compound) { + const uint16x4_t v_sum = vreinterpret_u16_s16( + vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1)); + vst1_u16(dest16, v_sum); + } else { + constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1); + StoreLo4(&dest8[0], result); + } + src += src_stride; + dest8 += pred_stride; + dest16 += pred_stride; + } while (--y != 0); +} + +template +void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + 
const int height, const uint8x8_t* const v_tap) { + auto* dest8 = static_cast(dest); + auto* dest16 = static_cast(dest); + int y = height >> 1; + do { + const uint8x8_t input0 = vld1_u8(src); + const uint8x8_t input1 = vld1_u8(src + src_stride); + const uint8x8x2_t input = vzip_u8(input0, input1); + uint16x8_t sum; + if (filter_index == 3) { + // tap signs : + + + sum = vmull_u8(input.val[0], v_tap[3]); + sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]); + } else if (filter_index == 4) { + // tap signs : - + + - + sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]); + sum = vmlsl_u8(sum, input.val[0], v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]); + sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]); + } else { + // tap signs : + + + + + sum = vmull_u8(input.val[0], v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]); + sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]); + } + int16x8_t s = vreinterpretq_s16_u16(sum); + if (is_2d) { + const uint16x8_t v_sum = + vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1)); + dest16[0] = vgetq_lane_u16(v_sum, 0); + dest16[1] = vgetq_lane_u16(v_sum, 2); + dest16 += pred_stride; + dest16[0] = vgetq_lane_u16(v_sum, 1); + dest16[1] = vgetq_lane_u16(v_sum, 3); + dest16 += pred_stride; + } else { + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. + // Combining them requires adding the rounding offset from the skipped + // shift. + constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1); + dest8[0] = vget_lane_u8(result, 0); + dest8[1] = vget_lane_u8(result, 2); + dest8 += pred_stride; + dest8[0] = vget_lane_u8(result, 1); + dest8[1] = vget_lane_u8(result, 3); + dest8 += pred_stride; + } + src += src_stride << 1; + } while (--y != 0); + + // The 2d filters have an odd |height| because the horizontal pass + // generates context for the vertical pass. + if (is_2d) { + assert(height % 2 == 1); + const uint8x8_t input = vld1_u8(src); + uint16x8_t sum; + if (filter_index == 3) { + sum = vmull_u8(input, v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]); + } else if (filter_index == 4) { + sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]); + sum = vmlsl_u8(sum, input, v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]); + sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]); + } else { + assert(filter_index == 5); + sum = vmull_u8(input, v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]); + sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]); + } + // |sum| contains an int16_t value. 
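(Aside on the width-2 path above.) FilterHorizontalWidth2 interleaves two source rows with vzip_u8 so that even lanes hold row 0 and odd lanes hold row 1; advancing the interleaved data by two lanes then advances each row by one pixel, and a single multiply-accumulate chain filters both rows at once before the results are de-interleaved on store (lanes 0/2 to one row, 1/3 to the next). The scalar sketch below illustrates the same idea; the function name and the fixed buffer size are assumptions made for the sketch, not libgav1 code.

#include <cassert>
#include <cstdint>

// Filter two narrow rows at once by interleaving them: moving forward by two
// interleaved samples corresponds to moving forward by one pixel in each row.
void FilterTwoRowsInterleaved(const uint8_t* row0, const uint8_t* row1,
                              const int16_t* taps, int num_taps, int width,
                              int16_t* out_row0, int16_t* out_row1) {
  int16_t interleaved[64];
  assert(2 * (width + num_taps) <= 64);
  for (int i = 0; i < width + num_taps; ++i) {
    interleaved[2 * i] = row0[i];      // even lanes: row 0
    interleaved[2 * i + 1] = row1[i];  // odd lanes: row 1
  }
  for (int x = 0; x < width; ++x) {
    int32_t sum0 = 0;
    int32_t sum1 = 0;
    for (int k = 0; k < num_taps; ++k) {
      sum0 += taps[k] * interleaved[2 * (x + k)];
      sum1 += taps[k] * interleaved[2 * (x + k) + 1];
    }
    out_row0[x] = static_cast<int16_t>(sum0);
    out_row1[x] = static_cast<int16_t>(sum1);
  }
}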
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum), + kInterRoundBitsHorizontal - 1)); + Store2<0>(dest16, sum); + } +} + +template +void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int width, const int height, + const uint8x8_t* const v_tap) { + assert(width < 8 || filter_index <= 3); + // Don't simplify the redundant if conditions with the template parameters, + // which helps the compiler generate compact code. + if (width >= 8 && filter_index <= 3) { + FilterHorizontalWidth8AndUp(src, src_stride, dest, pred_stride, + width, height, v_tap); return; } - // Horizontal passes only needs to account for |num_taps| 2 and 4 when + // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. assert(width <= 4); - assert(num_taps <= 4); - if (num_taps <= 4) { + assert(filter_index >= 3 && filter_index <= 5); + if (filter_index >= 3 && filter_index <= 5) { if (width == 4) { - int y = 0; - do { - if (is_2d || is_compound) { - const uint16x8_t v_sum = - HorizontalTaps8To16(src, - v_tap); - vst1_u16(dest16, vget_low_u16(v_sum)); - } else { - const uint8x8_t result = - SimpleHorizontalTaps(src, - v_tap); - StoreLo4(&dest8[0], result); - } - src += src_stride; - dest8 += pred_stride; - dest16 += pred_stride; - } while (++y < height); + FilterHorizontalWidth4( + src, src_stride, dest, pred_stride, height, v_tap); return; } - + assert(width == 2); if (!is_compound) { - int y = 0; - do { - if (is_2d) { - const uint16x8_t sum = - HorizontalTaps8To16_2x2(src, src_stride, v_tap); - dest16[0] = vgetq_lane_u16(sum, 0); - dest16[1] = vgetq_lane_u16(sum, 2); - dest16 += pred_stride; - dest16[0] = vgetq_lane_u16(sum, 1); - dest16[1] = vgetq_lane_u16(sum, 3); - dest16 += pred_stride; - } else { - const uint8x8_t sum = - SimpleHorizontalTaps2x2(src, src_stride, v_tap); - - dest8[0] = vget_lane_u8(sum, 0); - dest8[1] = vget_lane_u8(sum, 2); - dest8 += pred_stride; - - dest8[0] = vget_lane_u8(sum, 1); - dest8[1] = vget_lane_u8(sum, 3); - dest8 += pred_stride; - } - - src += src_stride << 1; - y += 2; - } while (y < height - 1); - - // The 2d filters have an odd |height| because the horizontal pass - // generates context for the vertical pass. - if (is_2d) { - assert(height % 2 == 1); - uint16x8_t sum; - const uint8x8_t input = vld1_u8(src); - if (filter_index == 3) { // |num_taps| == 2 - sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - } else if (filter_index == 4) { - sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]); - sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]); - } else { - assert(filter_index == 5); - sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]); - sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]); - } - // |sum| contains an int16_t value. 
- sum = vreinterpretq_u16_s16(vrshrq_n_s16( - vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1)); - Store2<0>(dest16, sum); - } + FilterHorizontalWidth2(src, src_stride, dest, + pred_stride, height, v_tap); } } } @@ -451,78 +484,85 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src, } template -void Filter2DVertical(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int width, - const int height, const int16x8_t taps) { +void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const int16x8_t taps) { assert(width >= 8); constexpr int next_row = num_taps - 1; - // The Horizontal pass uses |width| as |stride| for the intermediate buffer. - const ptrdiff_t src_stride = width; - - auto* dst8 = static_cast(dst); - auto* dst16 = static_cast(dst); + auto* const dst8 = static_cast(dst); + auto* const dst16 = static_cast(dst); int x = 0; do { - int16x8_t srcs[8]; - const uint16_t* src_x = src + x; - srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + int16x8_t srcs[9]; + srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps >= 4) { - srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps >= 6) { - srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps == 8) { - srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; } } } - int y = 0; + uint8_t* d8 = dst8 + x; + uint16_t* d16 = dst16 + x; + int y = height; do { - srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - - const int16x8_t sum = - SimpleSum2DVerticalTaps(srcs, taps); + srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + const int16x8_t sum0 = + SimpleSum2DVerticalTaps(srcs + 0, taps); + const int16x8_t sum1 = + SimpleSum2DVerticalTaps(srcs + 1, taps); if (is_compound) { - vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum)); + vst1q_u16(d16, vreinterpretq_u16_s16(sum0)); + d16 += dst_stride; + vst1q_u16(d16, vreinterpretq_u16_s16(sum1)); + d16 += dst_stride; } else { - vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum)); + vst1_u8(d8, vqmovun_s16(sum0)); + d8 += dst_stride; + vst1_u8(d8, vqmovun_s16(sum1)); + d8 += dst_stride; } - - srcs[0] = srcs[1]; + srcs[0] = srcs[2]; if (num_taps >= 4) { - srcs[1] = srcs[2]; - srcs[2] = srcs[3]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; if (num_taps >= 6) { - srcs[3] = srcs[4]; - srcs[4] = srcs[5]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; if (num_taps == 8) { - srcs[5] = srcs[6]; - srcs[6] = srcs[7]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; } } } - } while (++y < height); + y -= 2; + } while (y != 0); x += 8; } while (x < width); } // Take advantage of |src_stride| == |width| to process two rows at a time. 
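(Aside.) The restructured Filter2DVerticalWidth8AndUp above keeps a window of num_taps + 1 rows live in registers (srcs[]), emits two output rows per iteration, and then slides the window down by two so only two new rows are loaded per step. The scalar sketch below shows the underlying two-rows-per-iteration math; the buffer layout is an assumption for the sketch, and the rounding/saturation done by SimpleSum2DVerticalTaps is omitted.

#include <cstdint>

// Two output rows per iteration of a vertical FIR over an intermediate
// buffer laid out as (height + num_taps - 1) rows of |width| samples.
// Assumes an even |height|, matching the two-row loop in the NEON code.
template <int num_taps>
void VerticalFilterTwoRowsAtATime(const int16_t* src, int width, int height,
                                  const int16_t* taps, int32_t* dst) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < height; y += 2) {
      int32_t sum0 = 0;
      int32_t sum1 = 0;
      for (int k = 0; k < num_taps; ++k) {
        // The two sums share num_taps - 1 of their source rows, which is why
        // the NEON code keeps them resident in registers between iterations.
        sum0 += taps[k] * src[(y + k) * width + x];
        sum1 += taps[k] * src[(y + k + 1) * width + x];
      }
      dst[y * width + x] = sum0;
      dst[(y + 1) * width + x] = sum1;
    }
  }
}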
template -void Filter2DVertical4xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const int16x8_t taps) { +void Filter2DVerticalWidth4(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const int16x8_t taps) { auto* dst8 = static_cast(dst); auto* dst16 = static_cast(dst); @@ -545,7 +585,7 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst, } } - int y = 0; + int y = height; do { srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; @@ -580,15 +620,15 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst, } } } - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } // Take advantage of |src_stride| == |width| to process four rows at a time. template -void Filter2DVertical2xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const int16x8_t taps) { +void Filter2DVerticalWidth2(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const int16x8_t taps) { constexpr int next_row = (num_taps < 6) ? 4 : 8; auto* dst8 = static_cast(dst); @@ -672,29 +712,47 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } if (filter_index == 2) { // 8 tap. - FilterHorizontal<8, 8, 2, true, is_2d, is_compound>( + FilterHorizontal<2, true, is_2d, is_compound>( src, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 1) { // 6 tap. // Check if outside taps are positive. if ((filter_id == 1) | (filter_id == 15)) { - FilterHorizontal<6, 8, 1, false, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<1, false, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } else { - FilterHorizontal<6, 8, 1, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<1, true, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } } else if (filter_index == 0) { // 6 tap. - FilterHorizontal<6, 8, 0, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<0, true, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 4) { // 4 tap. - FilterHorizontal<4, 8, 4, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, true, is_2d, is_compound>( + src + 2, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 5) { // 4 tap. - FilterHorizontal<4, 8, 5, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<5, true, is_2d, is_compound>( + src + 2, src_stride, dst, dst_stride, width, height, v_tap); } else { // 2 tap. 
- FilterHorizontal<2, 8, 3, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<3, true, is_2d, is_compound>( + src + 3, src_stride, dst, dst_stride, width, height, v_tap); + } +} + +template +void Filter2DVertical(const uint16_t* const intermediate_result, + const int width, const int height, const int16x8_t taps, + void* const prediction, const ptrdiff_t pred_stride) { + auto* const dest = static_cast(prediction); + if (width >= 8) { + Filter2DVerticalWidth8AndUp( + intermediate_result, dest, pred_stride, width, height, taps); + } else if (width == 4) { + Filter2DVerticalWidth4(intermediate_result, dest, + pred_stride, height, taps); + } else { + assert(width == 2); + Filter2DVerticalWidth2(intermediate_result, dest, + pred_stride, height, taps); } } @@ -704,7 +762,7 @@ void Convolve2D_NEON(const void* const reference, const int vertical_filter_index, const int horizontal_filter_id, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); @@ -715,67 +773,31 @@ void Convolve2D_NEON(const void* const reference, intermediate_result[kMaxSuperBlockSizeInPixels * (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; const int intermediate_height = height + vertical_taps - 1; - const ptrdiff_t src_stride = reference_stride; - const auto* src = static_cast(reference) - - (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + const auto* const src = static_cast(reference) - + (vertical_taps / 2 - 1) * src_stride - + kHorizontalOffset; DoHorizontalPass(src, src_stride, intermediate_result, width, width, intermediate_height, horizontal_filter_id, horiz_filter_index); // Vertical filter. 
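(Aside.) Convolve2D_NEON above is the standard separable arrangement: the horizontal pass writes height + vertical_taps - 1 filtered rows into intermediate_result, starting vertical_taps / 2 - 1 rows above the block, and the vertical pass below then filters each column of that buffer. A compact scalar outline of the flow follows; libgav1's rounding, offsets and clipping are omitted, and the horizontal centering term here is a simplification of kHorizontalOffset.

#include <cstddef>
#include <cstdint>
#include <vector>

// Separable 2D convolution: horizontal pass into an intermediate buffer that
// is taller than the block by (v_tap_count - 1) rows, then a vertical pass.
// Assumes the caller provides the usual border context around |src|.
void SeparableConvolve2D(const uint8_t* src, ptrdiff_t src_stride, int width,
                         int height, const int16_t* h_taps, int h_tap_count,
                         const int16_t* v_taps, int v_tap_count, int32_t* dst,
                         ptrdiff_t dst_stride) {
  const int intermediate_height = height + v_tap_count - 1;
  std::vector<int32_t> intermediate(intermediate_height * width);
  const uint8_t* top = src - (v_tap_count / 2 - 1) * src_stride;
  for (int y = 0; y < intermediate_height; ++y) {
    for (int x = 0; x < width; ++x) {
      int32_t sum = 0;
      for (int k = 0; k < h_tap_count; ++k) {
        sum += h_taps[k] * top[y * src_stride + x + k - (h_tap_count / 2 - 1)];
      }
      intermediate[y * width + x] = sum;
    }
  }
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int32_t sum = 0;
      for (int k = 0; k < v_tap_count; ++k) {
        sum += v_taps[k] * intermediate[(y + k) * width + x];
      }
      dst[y * dst_stride + x] = sum;
    }
  }
}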
- auto* dest = static_cast(prediction); - const ptrdiff_t dest_stride = pred_stride; assert(vertical_filter_id != 0); - const int16x8_t taps = vmovl_s8( vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); - if (vertical_taps == 8) { - if (width == 2) { - Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<8>(intermediate_result, width, height, taps, prediction, + pred_stride); } else if (vertical_taps == 6) { - if (width == 2) { - Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<6>(intermediate_result, width, height, taps, prediction, + pred_stride); } else if (vertical_taps == 4) { - if (width == 2) { - Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<4>(intermediate_result, width, height, taps, prediction, + pred_stride); } else { // |vertical_taps| == 2 - if (width == 2) { - Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<2>(intermediate_result, width, height, taps, prediction, + pred_stride); } } @@ -788,7 +810,7 @@ void Convolve2D_NEON(const void* const reference, // increments. The first load covers the initial elements of src_x, while the // final load covers the taps. template -inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) { +inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) { uint8x8x3_t ret; const uint8x16_t src_val = vld1q_u8(src_x); ret.val[0] = vget_low_u8(src_val); @@ -811,7 +833,7 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) { } template -inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, +inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, @@ -843,7 +865,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, // on x. const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices)}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -860,7 +882,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); return; } @@ -883,7 +905,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, // on x. 
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices)}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals(src_x); @@ -900,7 +922,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -921,7 +943,7 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) { // This filter is only possible when width <= 4. void ConvolveKernelHorizontalPositive4Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x, + const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -950,7 +972,7 @@ void ConvolveKernelHorizontalPositive4Tap( const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped index vectors. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -970,7 +992,7 @@ void ConvolveKernelHorizontalPositive4Tap( src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); } // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4]. @@ -988,7 +1010,7 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) { // This filter is only possible when width <= 4. inline void ConvolveKernelHorizontalSigned4Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x, + const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -1025,7 +1047,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( vadd_u8(src_indices_base, vdup_n_u8(2)), vadd_u8(src_indices_base, vdup_n_u8(3))}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -1042,7 +1064,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); } // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0]. @@ -1063,9 +1085,9 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template inline void ConvolveKernelHorizontalSigned6Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1107,7 +1129,7 @@ inline void ConvolveKernelHorizontalSigned6Tap( for (int i = 0; i < 6; ++i) { taps[i] = VQTbl1U8(filter_taps[i], filter_indices); } - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. 
const uint8x8x3_t src_vals = LoadSrcVals(src_x); @@ -1122,7 +1144,7 @@ inline void ConvolveKernelHorizontalSigned6Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1156,9 +1178,9 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template inline void ConvolveKernelHorizontalMixed6Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1205,7 +1227,7 @@ inline void ConvolveKernelHorizontalMixed6Tap( mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices)); mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices)); - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals(src_x); @@ -1224,7 +1246,7 @@ inline void ConvolveKernelHorizontalMixed6Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1250,9 +1272,9 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template inline void ConvolveKernelHorizontalSigned8Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -1290,7 +1312,7 @@ inline void ConvolveKernelHorizontalSigned8Tap( taps[i] = VQTbl1U8(filter_taps[i], filter_indices); } - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals(src_x); @@ -1306,7 +1328,7 @@ inline void ConvolveKernelHorizontalSigned8Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1314,9 +1336,9 @@ inline void ConvolveKernelHorizontalSigned8Tap( // This function handles blocks of width 2 or 4. 
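(Aside on the scaled horizontal kernels above.) All of the ConvolveKernelHorizontal* helpers derive their per-column source positions and filter phases from the same fixed-point walk: the position p advances by step_x in 1/1024-pel units (cf. the `& 1023` masks in the vertical scale code), p >> kScaleSubPixelBits selects the starting source sample, and (p >> 6) masked with kSubPixelMask selects the sub-pixel filter phase gathered through VQTbl1U8. The scalar sketch below restates that indexing; the constant values (10 fractional bits, 16 filter phases) and the function name are assumptions for illustration, and the per-filter kernel_offset / reference-column adjustments used above are left out.

#include <cstdint>

struct ScaledTap {
  int src_index;     // starting source sample for this output column
  int filter_phase;  // which of the sub-pixel filter kernels to apply
};

// Per-column indexing used by the scaled horizontal kernels.
inline ScaledTap GetScaledTap(int subpixel_x, int step_x, int column) {
  constexpr int kScaleSubPixelBits = 10;  // assumed: 1/1024-pel positions
  constexpr int kSubPixelMask = 15;       // assumed: 16 filter phases
  const int p = subpixel_x + column * step_x;
  ScaledTap tap;
  tap.src_index = p >> kScaleSubPixelBits;
  tap.filter_phase = (p >> 6) & kSubPixelMask;
  return tap;
}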
template -void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, +void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y, const int filter_index, const int step_y, - const int height, void* dest, + const int height, void* const dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; const int16_t* src_y = src; @@ -1327,8 +1349,8 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, int p = subpixel_y & 1023; int prev_p = p; - int y = 0; - do { // y < height + int y = height; + do { for (int i = 0; i < num_taps; ++i) { s[i] = vld1_s16(src_y + i * src_stride); } @@ -1381,16 +1403,16 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; - - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } template -inline void ConvolveVerticalScale(const int16_t* src, const int width, +inline void ConvolveVerticalScale(const int16_t* const src, const int width, const int subpixel_y, const int filter_index, const int step_y, const int height, - void* dest, const ptrdiff_t dest_stride) { + void* const dest, + const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; // A possible improvement is to use arithmetic to decide how many times to // apply filters to same source before checking whether to load new srcs. @@ -1401,15 +1423,15 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width, uint8_t* dest_y; int x = 0; - do { // x < width - const int16_t* src_x = src + x; + do { + const int16_t* const src_x = src + x; const int16_t* src_y = src_x; dest16_y = static_cast(dest) + x; dest_y = static_cast(dest) + x; int p = subpixel_y & 1023; int prev_p = p; - int y = 0; - do { // y < height + int y = height; + do { for (int i = 0; i < num_taps; ++i) { s[i] = vld1q_s16(src_y + i * src_stride); } @@ -1448,9 +1470,8 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width, prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; - - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); x += 8; } while (x < width); } @@ -1462,7 +1483,7 @@ void ConvolveScale2D_NEON(const void* const reference, const int vertical_filter_index, const int subpixel_x, const int subpixel_y, const int step_x, const int step_y, const int width, const int height, - void* prediction, const ptrdiff_t pred_stride) { + void* const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); assert(step_x <= 2048); @@ -1699,12 +1720,13 @@ void ConvolveHorizontal_NEON(const void* const reference, const int /*vertical_filter_index*/, const int horizontal_filter_id, const int /*vertical_filter_id*/, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(horizontal_filter_index, width); // Set |src| to the outermost tap. 
- const auto* src = static_cast(reference) - kHorizontalOffset; - auto* dest = static_cast(prediction); + const auto* const src = + static_cast(reference) - kHorizontalOffset; + auto* const dest = static_cast(prediction); DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height, horizontal_filter_id, filter_index); @@ -1719,14 +1741,14 @@ uint16x8_t Compound1DShift(const int16x8_t sum) { template -void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, +void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int width, const int height, const uint8x8_t* const taps) { const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; - auto* dst8 = static_cast(dst); - auto* dst16 = static_cast(dst); + auto* const dst8 = static_cast(dst); + auto* const dst16 = static_cast(dst); assert(width >= 8); int x = 0; @@ -1754,6 +1776,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } + // Decreasing the y loop counter produces worse code with clang. + // Don't unroll this loop since it generates too much code and the decoder + // is even slower. int y = 0; do { srcs[next_row] = vld1_u8(src_x); @@ -1804,7 +1829,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[0] = Load4(src); src += src_stride; - int y = 0; + int y = height; do { srcs[0] = Load4<1>(src, srcs[0]); src += src_stride; @@ -1829,8 +1854,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, } srcs[0] = srcs[2]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 4) { srcs[4] = vdup_n_u8(0); @@ -1842,7 +1867,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[2], 4); - int y = 0; + int y = height; do { srcs[2] = Load4<1>(src, srcs[2]); src += src_stride; @@ -1869,8 +1894,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[0] = srcs[2]; srcs[1] = srcs[3]; srcs[2] = srcs[4]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 6) { srcs[6] = vdup_n_u8(0); @@ -1887,7 +1912,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[3] = vext_u8(srcs[2], srcs[4], 4); - int y = 0; + int y = height; do { srcs[4] = Load4<1>(src, srcs[4]); src += src_stride; @@ -1916,8 +1941,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[2] = srcs[4]; srcs[3] = srcs[5]; srcs[4] = srcs[6]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 8) { srcs[8] = vdup_n_u8(0); @@ -1939,7 +1964,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[5] = vext_u8(srcs[4], srcs[6], 4); - int y = 0; + int y = height; do { srcs[6] = Load4<1>(src, srcs[6]); src += src_stride; @@ -1970,8 +1995,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[4] = srcs[6]; srcs[5] = srcs[7]; srcs[6] = srcs[8]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2186,14 +2211,14 @@ void ConvolveVertical_NEON(const void* const reference, const int vertical_filter_index, const int /*horizontal_filter_id*/, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); 
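(Aside.) The next statement maps filter_index to a tap count via GetNumTapsInFilter, just as the horizontal paths above do. Restating the mapping implied by the "// 8 tap." / "// 6 tap." / "// 4 tap." / "// 2 tap." comments in DoHorizontalPass, purely for readability:

// Tap counts per filter_index as used throughout this file: indices 0 and 1
// are the 6-tap filters, 2 is the 8-tap filter, 3 is the 2-tap filter, and
// 4 and 5 are the 4-tap filters.
inline int NumTapsForFilterIndex(int filter_index) {
  switch (filter_index) {
    case 0:
    case 1: return 6;
    case 2: return 8;
    case 3: return 2;
    case 4:
    case 5: return 4;
    default: return 0;  // not a valid filter index
  }
}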
const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast(reference) - (vertical_taps / 2 - 1) * src_stride; - auto* dest = static_cast(prediction); + auto* const dest = static_cast(prediction); const ptrdiff_t dest_stride = pred_stride; assert(vertical_filter_id != 0); @@ -2303,7 +2328,7 @@ void ConvolveCompoundCopy_NEON( const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const auto* src = static_cast(reference); const ptrdiff_t src_stride = reference_stride; @@ -2312,7 +2337,7 @@ void ConvolveCompoundCopy_NEON( kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; if (width >= 16) { - int y = 0; + int y = height; do { int x = 0; do { @@ -2328,20 +2353,20 @@ void ConvolveCompoundCopy_NEON( } while (x < width); src += src_stride; dest += width; - } while (++y < height); + } while (--y != 0); } else if (width == 8) { - int y = 0; + int y = height; do { const uint8x8_t v_src = vld1_u8(&src[0]); const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift); vst1q_u16(&dest[0], v_dest); src += src_stride; dest += width; - } while (++y < height); - } else { /* width == 4 */ + } while (--y != 0); + } else { // width == 4 uint8x8_t v_src = vdup_n_u8(0); - int y = 0; + int y = height; do { v_src = Load4<0>(&src[0], v_src); src += src_stride; @@ -2350,8 +2375,8 @@ void ConvolveCompoundCopy_NEON( const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift); vst1q_u16(&dest[0], v_dest); dest += 4 << 1; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2359,14 +2384,14 @@ void ConvolveCompoundVertical_NEON( const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int vertical_filter_index, const int /*horizontal_filter_id*/, const int vertical_filter_id, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast(reference) - (vertical_taps / 2 - 1) * src_stride; - auto* dest = static_cast(prediction); + auto* const dest = static_cast(prediction); assert(vertical_filter_id != 0); uint8x8_t taps[8]; @@ -2454,24 +2479,39 @@ void ConvolveCompoundHorizontal_NEON( const void* const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int /*vertical_filter_index*/, const int horizontal_filter_id, const int /*vertical_filter_id*/, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(horizontal_filter_index, width); - const auto* src = static_cast(reference) - kHorizontalOffset; - auto* dest = static_cast(prediction); + const auto* const src = + static_cast(reference) - kHorizontalOffset; + auto* const dest = static_cast(prediction); DoHorizontalPass( src, reference_stride, dest, width, width, height, horizontal_filter_id, filter_index); } +template +void Compound2DVertical(const uint16_t* const 
intermediate_result, + const int width, const int height, const int16x8_t taps, + void* const prediction) { + auto* const dest = static_cast(prediction); + if (width == 4) { + Filter2DVerticalWidth4( + intermediate_result, dest, width, height, taps); + } else { + Filter2DVerticalWidth8AndUp( + intermediate_result, dest, width, width, height, taps); + } +} + void ConvolveCompound2D_NEON(const void* const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int vertical_filter_index, const int horizontal_filter_id, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { // The output of the horizontal filter, i.e. the intermediate_result, is // guaranteed to fit in int16_t. @@ -2492,55 +2532,26 @@ void ConvolveCompound2D_NEON(const void* const reference, const auto* const src = static_cast(reference) - (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; - DoHorizontalPass( src, src_stride, intermediate_result, width, width, intermediate_height, horizontal_filter_id, horiz_filter_index); // Vertical filter. - auto* dest = static_cast(prediction); assert(vertical_filter_id != 0); - - const ptrdiff_t dest_stride = width; const int16x8_t taps = vmovl_s8( vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); - if (vertical_taps == 8) { - if (width == 4) { - Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<8, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<8>(intermediate_result, width, height, taps, prediction); } else if (vertical_taps == 6) { - if (width == 4) { - Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<6, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<6>(intermediate_result, width, height, taps, prediction); } else if (vertical_taps == 4) { - if (width == 4) { - Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<4, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<4>(intermediate_result, width, height, taps, prediction); } else { // |vertical_taps| == 2 - if (width == 4) { - Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<2, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<2>(intermediate_result, width, height, taps, prediction); } } -inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) { +inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) { const uint8x16_t left = vld1q_u8(src); const uint8x16_t right = vld1q_u8(src + 1); vst1q_u8(dst, vrhaddq_u8(left, right)); @@ -2554,7 +2565,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src, const ptrdiff_t src_remainder_stride = src_stride - (width - 16); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); - int y = 0; + int y = height; do { HalfAddHorizontal(src, dst); if (width >= 32) { @@ -2586,7 +2597,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src, } src += src_remainder_stride; dst += dst_remainder_stride; - } while (++y < height); + 
} while (--y != 0); } void ConvolveIntraBlockCopyHorizontal_NEON( @@ -2610,7 +2621,7 @@ void ConvolveIntraBlockCopyHorizontal_NEON( IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest, pred_stride); } else if (width == 8) { - int y = 0; + int y = height; do { const uint8x8_t left = vld1_u8(src); const uint8x8_t right = vld1_u8(src + 1); @@ -2618,11 +2629,11 @@ void ConvolveIntraBlockCopyHorizontal_NEON( src += reference_stride; dest += pred_stride; - } while (++y < height); + } while (--y != 0); } else if (width == 4) { uint8x8_t left = vdup_n_u8(0); uint8x8_t right = vdup_n_u8(0); - int y = 0; + int y = height; do { left = Load4<0>(src, left); right = Load4<0>(src + 1, right); @@ -2637,13 +2648,13 @@ void ConvolveIntraBlockCopyHorizontal_NEON( dest += pred_stride; StoreHi4(dest, result); dest += pred_stride; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else { assert(width == 2); uint8x8_t left = vdup_n_u8(0); uint8x8_t right = vdup_n_u8(0); - int y = 0; + int y = height; do { left = Load2<0>(src, left); right = Load2<0>(src + 1, right); @@ -2658,8 +2669,8 @@ void ConvolveIntraBlockCopyHorizontal_NEON( dest += pred_stride; Store2<1>(dest, result); dest += pred_stride; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2694,7 +2705,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src, } src += src_remainder_stride; - int y = 0; + int y = height; do { below[0] = vld1q_u8(src); if (width >= 32) { @@ -2749,7 +2760,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src, } } dst += dst_remainder_stride; - } while (++y < height); + } while (--y != 0); } void ConvolveIntraBlockCopyVertical_NEON( @@ -2778,7 +2789,7 @@ void ConvolveIntraBlockCopyVertical_NEON( row = vld1_u8(src); src += reference_stride; - int y = 0; + int y = height; do { below = vld1_u8(src); src += reference_stride; @@ -2787,13 +2798,13 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } else if (width == 4) { uint8x8_t row = Load4(src); uint8x8_t below = vdup_n_u8(0); src += reference_stride; - int y = 0; + int y = height; do { below = Load4<0>(src, below); src += reference_stride; @@ -2802,14 +2813,14 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } else { assert(width == 2); uint8x8_t row = Load2(src); uint8x8_t below = vdup_n_u8(0); src += reference_stride; - int y = 0; + int y = height; do { below = Load2<0>(src, below); src += reference_stride; @@ -2818,7 +2829,7 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } } @@ -2870,7 +2881,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, } src += src_remainder_stride; - int y = 0; + int y = height; do { const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2)); @@ -2981,7 +2992,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, } src += src_remainder_stride; dst += dst_remainder_stride; - } while (++y < height); + } while (--y != 0); } void ConvolveIntraBlockCopy2D_NEON( @@ -3013,7 +3024,7 @@ void ConvolveIntraBlockCopy2D_NEON( uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); - int y = 0; + int y = height; do { left = Load4<0>(src, left); right = Load4<0>(src + 1, right); @@ -3032,8 +3043,8 @@ void ConvolveIntraBlockCopy2D_NEON( dest += pred_stride; 
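// The high half of |below| holds the left + right sums of the last row
// loaded in this iteration; carry it forward as the top row of the next
// iteration.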
row = vget_high_u16(below); - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else { uint8x8_t left = Load2(src); uint8x8_t right = Load2(src + 1); @@ -3041,7 +3052,7 @@ void ConvolveIntraBlockCopy2D_NEON( uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); - int y = 0; + int y = height; do { left = Load2<0>(src, left); right = Load2<0>(src + 1, right); @@ -3060,8 +3071,8 @@ void ConvolveIntraBlockCopy2D_NEON( dest += pred_stride; row = vget_high_u16(below); - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -3093,7 +3104,7 @@ void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc index 04952ab..a0cd0ac 100644 --- a/src/dsp/arm/distance_weighted_blend_neon.cc +++ b/src/dsp/arm/distance_weighted_blend_neon.cc @@ -30,10 +30,12 @@ namespace libgav1 { namespace dsp { -namespace { constexpr int kInterPostRoundBit = 4; +namespace low_bitdepth { +namespace { + inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0, const int16x8_t pred1, const int16x4_t weights[2]) { @@ -185,13 +187,167 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0, + const uint16x4x2_t pred1, + const uint16x4_t weights[2]) { + const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]); + const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]); + const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]); + const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]); + const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16); + const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset); + const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset); + const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1); + // Clip the result at (1 << bd) - 1. 
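// Per pixel this is
//   Clip3(RightShiftWithRounding(w0 * p0 + w1 * p1 - 16 * kCompoundOffset,
//                                kInterPostRoundBit + 4),
//         0, (1 << kBitdepth10) - 1),
// with the two distance weights summing to 16.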
+ uint16x4x2_t result; + result.val[0] = + vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max); + result.val[1] = + vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max); + return result; +} + +inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0, + const uint16x4x4_t pred1, + const uint16x4_t weights[2]) { + const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16); + const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]); + const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]); + const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]); + const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]); + const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset); + const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset); + const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]); + const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]); + const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]); + const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]); + const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset); + const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset); + const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1); + // Clip the result at (1 << bd) - 1. + uint16x4x4_t result; + result.val[0] = + vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max); + result.val[1] = + vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max); + result.val[2] = + vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max); + result.val[3] = + vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max); + + return result; +} + +// We could use vld1_u16_x2, but for compatibility reasons, use this function +// instead. The compiler optimizes to the correct instruction. +inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) { + uint16x4x2_t x; + // gcc/clang (64 bit) optimizes the following to ldp. + x.val[0] = vld1_u16(ptr); + x.val[1] = vld1_u16(ptr + 4); + return x; +} + +// We could use vld1_u16_x4, but for compatibility reasons, use this function +// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better +// performance in the speed tests. 
+inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) { + uint16x4x4_t x; + x.val[0] = vld1_u16(ptr); + x.val[1] = vld1_u16(ptr + 4); + x.val[2] = vld1_u16(ptr + 8); + x.val[3] = vld1_u16(ptr + 12); + return x; +} + +void DistanceWeightedBlend_NEON(const void* prediction_0, + const void* prediction_1, + const uint8_t weight_0, const uint8_t weight_1, + const int width, const int height, + void* const dest, const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)}; -void DistanceWeightedBlendInit_NEON() { Init8bpp(); } + if (width == 4) { + int y = height; + do { + const uint16x4x2_t src0 = LoadU16x4_x2(pred_0); + const uint16x4x2_t src1 = LoadU16x4_x2(pred_1); + const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst, res.val[0]); + vst1_u16(dst + dst_stride, res.val[1]); + dst += dst_stride << 1; + pred_0 += 8; + pred_1 += 8; + y -= 2; + } while (y != 0); + } else if (width == 8) { + int y = height; + do { + const uint16x4x4_t src0 = LoadU16x4_x4(pred_0); + const uint16x4x4_t src1 = LoadU16x4_x4(pred_1); + const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst, res.val[0]); + vst1_u16(dst + 4, res.val[1]); + vst1_u16(dst + dst_stride, res.val[2]); + vst1_u16(dst + dst_stride + 4, res.val[3]); + dst += dst_stride << 1; + pred_0 += 16; + pred_1 += 16; + y -= 2; + } while (y != 0); + } else { + int y = height; + do { + int x = 0; + do { + const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x); + const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x); + const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst + x, res.val[0]); + vst1_u16(dst + x + 4, res.val[1]); + vst1_u16(dst + x + 8, res.val[2]); + vst1_u16(dst + x + 12, res.val[3]); + x += 16; + } while (x < width); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->distance_weighted_blend = DistanceWeightedBlend_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void DistanceWeightedBlendInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h index 4d8824c..94a799c 100644 --- a/src/dsp/arm/distance_weighted_blend_neon.h +++ b/src/dsp/arm/distance_weighted_blend_neon.h @@ -34,6 +34,8 @@ void DistanceWeightedBlendInit_NEON(); #if LIBGAV1_ENABLE_NEON #define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc index 2612466..8ee3745 100644 --- a/src/dsp/arm/film_grain_neon.cc +++ b/src/dsp/arm/film_grain_neon.cc @@ -1176,7 +1176,7 @@ void FilmGrainInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp 
{ diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc index 00b186a..074283f 100644 --- a/src/dsp/arm/intra_edge_neon.cc +++ b/src/dsp/arm/intra_edge_neon.cc @@ -25,7 +25,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" -#include "src/utils/common.h" // RightShiftWithRounding() +#include "src/utils/common.h" namespace libgav1 { namespace dsp { @@ -35,6 +35,11 @@ namespace { // required. constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}}; +} // namespace + +namespace low_bitdepth { +namespace { + void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { assert(strength == 1 || strength == 2 || strength == 3); const int kernel_index = strength - 1; @@ -44,6 +49,8 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // elements written is |size| - 1. if (size == 1) return; + const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100), + vcreate_u8(0x0f0e0d0c0b0a0908)); // |strength| 1 and 2 use a 3 tap filter. if (strength < 3) { // The last value requires extending the buffer (duplicating @@ -89,7 +96,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // |remainder| == 1 then we don't have to do anything. const int remainder = (size - 1) & 0xf; if (remainder > 1) { - uint8_t temp[16]; const uint8x16_t src_1 = vld1q_u8(dst_buffer + i); const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1); @@ -102,9 +108,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { const uint8x16_t result = vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); - - vst1q_u8(temp, result); - memcpy(dst_buffer + i, temp, remainder); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. + const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result); + vst1q_u8(dst_buffer + i, dst_remainder); } dst_buffer[size - 1] = last_val; @@ -173,7 +181,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // Like the 3 tap but if there are two remaining values we have already // calculated them. if (remainder > 2) { - uint8_t temp[16]; const uint8x16_t src_2 = vld1q_u8(dst_buffer + i); const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1); const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2); @@ -193,9 +200,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { const uint8x16_t result = vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); - - vst1q_u8(temp, result); - memcpy(dst_buffer + i, temp, remainder); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. 
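// Lanes of |v_index| that are >= |remainder| select the original samples
// from |src_2|, so the full 16-byte store below only changes the first
// |remainder| bytes (replacing the old memcpy of the remainder).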
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result); + vst1q_u8(dst_buffer + i, dst_remainder); } dst_buffer[1] = special_vals[0]; @@ -284,13 +293,225 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +const uint16_t kRemainderMask[8][8] = { + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000}, +}; + +void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { + assert(strength == 1 || strength == 2 || strength == 3); + const int kernel_index = strength - 1; + auto* const dst_buffer = static_cast(buffer); + + // The first element is not written out (but it is input) so the number of + // elements written is |size| - 1. + if (size == 1) return; + + // |strength| 1 and 2 use a 3 tap filter. + if (strength < 3) { + // The last value requires extending the buffer (duplicating + // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in + // neon. + const uint16_t last_val = RightShiftWithRounding( + kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] + + kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] + + kKernelsNEON[kernel_index][0] * dst_buffer[size - 1], + 4); + + const uint16_t krn0 = kKernelsNEON[kernel_index][0]; + const uint16_t krn1 = kKernelsNEON[kernel_index][1]; + + // The first value we need gets overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer); + int i = 1; + + // Process blocks until there are less than 16 values remaining. + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_0| will read past the + // end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + // Load the next row before overwriting. This loads an extra 7 values + // past |size| on the trailing iteration. + src_0 = vld1q_u16(dst_buffer + i + 7); + vst1q_u16(dst_buffer + i, result); + } + + // The last output value |last_val| was already calculated so if + // |remainder| == 1 then we don't have to do anything. 
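// kRemainderMask[r] has 0xffff in its first r lanes, so the masked store
// below keeps the filtered values in those lanes and simply rewrites the
// original samples everywhere else.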
+ const int remainder = (size - 1) & 0x7; + if (remainder > 1) { + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[size - 1] = last_val; + return; + } + + assert(strength == 3); + // 5 tap filter. The first element requires duplicating |buffer[0]| and the + // last two elements require duplicating |buffer[size - 1]|. + uint16_t special_vals[3]; + special_vals[0] = RightShiftWithRounding( + (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) + + (dst_buffer[2] << 2) + (dst_buffer[3] << 1), + 4); + // Clamp index for very small |size| values. + const int first_index_min = std::max(size - 4, 0); + const int second_index_min = std::max(size - 3, 0); + const int third_index_min = std::max(size - 2, 0); + special_vals[1] = RightShiftWithRounding( + (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) + + (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) + + (dst_buffer[size - 1] << 1), + 4); + special_vals[2] = RightShiftWithRounding( + (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) + + // x << 2 + x << 2 == x << 3 + (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1), + 4); + + // The first two values we need get overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer - 1); + uint16x8_t src_1 = vld1q_u16(dst_buffer); + int i = 1; + + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_[01]| will read past + // the end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + + // Load the next before overwriting. + src_0 = vld1q_u16(dst_buffer + i + 6); + src_1 = vld1q_u16(dst_buffer + i + 7); + + vst1q_u16(dst_buffer + i, result); + } + + const int remainder = (size - 1) & 0x7; + // Like the 3 tap but if there are two remaining values we have already + // calculated them. + if (remainder > 2) { + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[1] = special_vals[0]; + // Avoid overwriting |dst_buffer[0]|. 
+ if (size > 2) dst_buffer[size - 2] = special_vals[1]; + dst_buffer[size - 1] = special_vals[2]; +} + +void IntraEdgeUpsampler_NEON(void* buffer, const int size) { + assert(size % 4 == 0 && size <= 16); + auto* const pixel_buffer = static_cast(buffer); -void IntraEdgeInit_NEON() { Init8bpp(); } + // Extend first/last samples + pixel_buffer[-2] = pixel_buffer[-1]; + pixel_buffer[size] = pixel_buffer[size - 1]; + + const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2)); + const int16x8_t src_hi = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8)); + const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3)); + const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3)); + + int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo); + sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2)); + sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3)); + sum_lo = vrshrq_n_s16(sum_lo, 4); + + uint16x8x2_t result_lo; + result_lo.val[0] = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2)); + + if (size > 8) { + const int16x8_t src_hi_extra = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2)); + const int16x8_t src9_hi_extra = + vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3)); + + int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi); + sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2)); + sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3)); + sum_hi = vrshrq_n_s16(sum_hi, 4); + + uint16x8x2_t result_hi; + result_hi.val[0] = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_hi.val[1] = + vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2)); + vst2q_u16(pixel_buffer - 1, result_lo); + vst2q_u16(pixel_buffer + 15, result_hi); + } else { + vst2q_u16(pixel_buffer - 1, result_lo); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->intra_edge_filter = IntraEdgeFilter_NEON; + dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraEdgeInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h index d3bb243..28e3494 100644 --- a/src/dsp/arm/intra_edge_neon.h +++ b/src/dsp/arm/intra_edge_neon.h @@ -34,6 +34,9 @@ void IntraEdgeInit_NEON(); #define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_ diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc index 45fe33b..8d8748f 100644 --- a/src/dsp/arm/intrapred_cfl_neon.cc +++ b/src/dsp/arm/intrapred_cfl_neon.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_cfl.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON @@ -27,45 +27,20 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { -namespace low_bitdepth { -namespace { - -uint8x16_t Set2ValuesQ(const uint8_t* a) { - uint16_t combined_values = a[0] | a[1] << 8; - return vreinterpretq_u8_u16(vdupq_n_u16(combined_values)); -} - -uint32_t SumVector(uint32x2_t a) { -#if defined(__aarch64__) - return vaddv_u32(a); -#else - const uint64x1_t b = vpaddl_u32(a); - return vget_lane_u32(vreinterpret_u32_u64(b), 0); -#endif // defined(__aarch64__) -} - -uint32_t SumVector(uint32x4_t a) { -#if defined(__aarch64__) - return vaddvq_u32(a); -#else - const uint64x2_t b = vpaddlq_u32(a); - const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b)); - return vget_lane_u32(vreinterpret_u32_u64(c), 0); -#endif // defined(__aarch64__) -} // Divide by the number of elements. -uint32_t Average(const uint32_t sum, const int width, const int height) { +inline uint32_t Average(const uint32_t sum, const int width, const int height) { return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height)); } // Subtract |val| from every element in |a|. -void BlockSubtract(const uint32_t val, - int16_t a[kCflLumaBufferStride][kCflLumaBufferStride], - const int width, const int height) { +inline void BlockSubtract(const uint32_t val, + int16_t a[kCflLumaBufferStride][kCflLumaBufferStride], + const int width, const int height) { assert(val <= INT16_MAX); const int16x8_t val_v = vdupq_n_s16(static_cast(val)); @@ -94,6 +69,9 @@ void BlockSubtract(const uint32_t val, } } +namespace low_bitdepth { +namespace { + template void CflSubsampler420_NEON( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -122,26 +100,27 @@ void CflSubsampler420_NEON( sum = SumVector(running_sum); } else if (block_width == 8) { - const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6, - 8, 8, 10, 10, 12, 12, 14, 14}; - const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2); - const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index); + const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14}; + const uint16x8_t x_max_index = + vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16); + const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index); uint32x4_t running_sum = vdupq_n_u32(0); for (int y = 0; y < block_height; ++y) { - const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2); - const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride); + const uint8x16_t row0 = vld1q_u8(src); + const uint8x16_t row1 = vld1q_u8(src + stride); + const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); + const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1); - uint8x16_t row0 = vld1q_u8(src); - row0 = vbslq_u8(x_mask, row0, x_max0); - uint8x16_t row1 = vld1q_u8(src + stride); - row1 = vbslq_u8(x_mask, row1, x_max1); + // Dup the 2x2 sum at the max luma offset. 
+ const uint16x8_t max_luma_sum = + vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3); + const uint16x8_t final_sum_row = + vbslq_u16(x_mask, sum_row_shifted, max_luma_sum); + vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row)); - uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); - sum_row = vshlq_n_u16(sum_row, 1); - running_sum = vpadalq_u16(running_sum, sum_row); - vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row)); + running_sum = vpadalq_u16(running_sum, final_sum_row); if (y << 1 < max_luma_height - 2) { src += stride << 1; @@ -150,45 +129,35 @@ void CflSubsampler420_NEON( sum = SumVector(running_sum); } else /* block_width >= 16 */ { - const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2); + const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2); uint32x4_t running_sum = vdupq_n_u32(0); for (int y = 0; y < block_height; ++y) { - uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30}; - const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]); - const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]); - const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]); - const uint8x16_t x_max11 = - vdupq_n_u8(src[stride + max_luma_width - 2 + 1]); - for (int x = 0; x < block_width; x += 16) { - const ptrdiff_t src_x_offset = x << 1; - const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index); - const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset); - const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride); - const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00); - const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01); - const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10); - const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11); - - uint16x8_t sum_row_lo = - vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01)); - sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10)); - sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11)); - sum_row_lo = vshlq_n_u16(sum_row_lo, 1); - running_sum = vpadalq_u16(running_sum, sum_row_lo); - vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo)); - - uint16x8_t sum_row_hi = - vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01)); - sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10)); - sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11)); - sum_row_hi = vshlq_n_u16(sum_row_hi, 1); - running_sum = vpadalq_u16(running_sum, sum_row_hi); - vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi)); - - x_index = vaddq_u8(x_index, vdupq_n_u8(32)); + // Calculate the 2x2 sum at the max_luma offset + const uint8_t a00 = src[max_luma_width - 2]; + const uint8_t a01 = src[max_luma_width - 1]; + const uint8_t a10 = src[max_luma_width - 2 + stride]; + const uint8_t a11 = src[max_luma_width - 1 + stride]; + // Dup the 2x2 sum at the max luma offset. 
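// (a00 + a01 + a10 + a11) << 1 is the same doubled 2x2 sum the vector path
// below produces; it is computed with scalars at the last visible pair so it
// can be broadcast into the lanes beyond |max_luma_width|.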
+ const uint16x8_t max_luma_sum = + vdupq_n_u16((uint16_t)((a00 + a01 + a10 + a11) << 1)); + uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14}; + + ptrdiff_t src_x_offset = 0; + for (int x = 0; x < block_width; x += 8, src_x_offset += 16) { + const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index); + const uint8x16_t row0 = vld1q_u8(src + src_x_offset); + const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride); + const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); + const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1); + const uint16x8_t final_sum_row = + vbslq_u16(x_mask, sum_row_shifted, max_luma_sum); + vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row)); + + running_sum = vpadalq_u16(running_sum, final_sum_row); + x_index = vaddq_u16(x_index, vdupq_n_u16(16)); } + if (y << 1 < max_luma_height - 2) { src += stride << 1; } @@ -209,17 +178,30 @@ void CflSubsampler444_NEON( uint32_t sum; if (block_width == 4) { assert(max_luma_width >= 4); + assert(max_luma_height <= block_height); + assert((max_luma_height % 2) == 0); uint32x4_t running_sum = vdupq_n_u32(0); uint8x8_t row = vdup_n_u8(0); - for (int y = 0; y < block_height; y += 2) { + uint16x8_t row_shifted; + int y = 0; + do { row = Load4<0>(src, row); row = Load4<1>(src + stride, row); if (y < (max_luma_height - 1)) { src += stride << 1; } - const uint16x8_t row_shifted = vshll_n_u8(row, 3); + row_shifted = vshll_n_u8(row, 3); + running_sum = vpadalq_u16(running_sum, row_shifted); + vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted))); + vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted))); + y += 2; + } while (y < max_luma_height); + + row_shifted = + vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted)); + for (; y < block_height; y += 2) { running_sum = vpadalq_u16(running_sum, row_shifted); vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted))); vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted))); @@ -463,12 +445,874 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); } +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflSubsampler +#ifndef __aarch64__ +uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +// This duplicates the last two 16-bit values in |row|. +inline uint16x8_t LastRowSamples(const uint16x8_t row) { + const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row)); + const uint32x4_t b = vdupq_lane_u32(a, 1); + return vreinterpretq_u16_u32(b); +} + +// This duplicates the last unsigned 16-bit value in |row|. +inline uint16x8_t LastRowResult(const uint16x8_t row) { + const uint16x4_t a = vget_high_u16(row); + const uint16x8_t b = vdupq_lane_u16(a, 0x3); + return b; +} + +// This duplicates the last signed 16-bit value in |row|. +inline int16x8_t LastRowResult(const int16x8_t row) { + const int16x4_t a = vget_high_s16(row); + const int16x8_t b = vdupq_lane_s16(a, 0x3); + return b; +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. 
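// Each stored value is a 2x2 luma sum scaled by 2, i.e. 8x the average of
// the four source samples, matching the x8 precision used by the 4:4:4
// subsamplers.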
+inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0, + const uint16x8_t vertical_sum1, + int16_t* luma_ptr) { + const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1); + const uint16x8_t result_shifted = vshlq_n_u16(result, 1); + vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted))); + vst1_s16(luma_ptr + kCflLumaBufferStride, + vreinterpret_s16_u16(vget_high_u16(result_shifted))); + return result_shifted; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. +inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0, + const uint16x8_t vertical_sum1, + int16_t* luma_ptr) { + const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1); + const uint16x8_t result_shifted = vshlq_n_u16(result, 1); + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted)); + return result_shifted; +} + +template +void CflSubsampler444_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint16x4_t sum = vdup_n_u16(0); + uint16x4_t samples[2]; + int y = visible_height; + + do { + samples[0] = vld1_u16(src); + samples[1] = vld1_u16(src + src_stride); + src += src_stride << 1; + sum = vadd_u16(sum, samples[0]); + sum = vadd_u16(sum, samples[1]); + y -= 2; + } while (y != 0); + + if (!is_inside) { + y = visible_height; + samples[1] = vshl_n_u16(samples[1], 1); + do { + sum = vadd_u16(sum, samples[1]); + y += 2; + } while (y < block_height); + } + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift ((log2 of width 4) + 1). 
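// For a 4 x (1 << h) block, mean(x << 3) = (sum(x) << 3) >> (h + 2), which
// simplifies to sum(x) >> (h - 1); hence the shift by block_height_log2 - 1
// below.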
+ const uint32_t average_sum = + RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1); + const int16x4_t averages = vdup_n_s16(static_cast(average_sum)); + + const auto* ssrc = static_cast(source); + int16x4_t ssample; + luma_ptr = luma[0]; + y = visible_height; + do { + ssample = vld1_s16(ssrc); + ssample = vshl_n_s16(ssample, 3); + vst1_s16(luma_ptr, vsub_s16(ssample, averages)); + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + vst1_s16(luma_ptr, vsub_s16(ssample, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template +void CflSubsampler444_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast(max_luma_width); + static_cast(max_luma_height); + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_NEON(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_NEON(luma, max_luma_height, + source, stride); + } +} + +template +void CflSubsampler444_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t samples; + int y = visible_height; + + do { + samples = vld1q_u16(src); + src += src_stride; + sum = vpadalq_u16(sum, samples); + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + do { + sum = vpadalq_u16(sum, samples); + } while (++y < block_height); + } + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift (log2 of width 8). 
+ const uint32_t average_sum = + RightShiftWithRounding(SumVector(sum), block_height_log2); + const int16x8_t averages = vdupq_n_s16(static_cast(average_sum)); + + const auto* ssrc = static_cast(source); + int16x8_t ssample; + luma_ptr = luma[0]; + y = visible_height; + do { + ssample = vld1q_s16(ssrc); + ssample = vshlq_n_s16(ssample, 3); + vst1q_s16(luma_ptr, vsubq_s16(ssample, averages)); + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + vst1q_s16(luma_ptr, vsubq_s16(ssample, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template +void CflSubsampler444_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast(max_luma_width); + static_cast(max_luma_height); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_NEON(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_8xH_NEON(luma, max_luma_height, + source, stride); + } +} + +template +void CflSubsampler444_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const int block_width = 1 << block_width_log2; + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t samples[4]; + int y = visible_height; + + do { + samples[0] = vld1q_u16(src); + samples[1] = + (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]); + uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]); + if (block_width == 32) { + samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16) + : LastRowResult(samples[1]); + samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24) + : LastRowResult(samples[2]); + inner_sum = vaddq_u16(samples[2], inner_sum); + inner_sum = vaddq_u16(samples[3], inner_sum); + } + sum = vpadalq_u16(sum, inner_sum); + src += src_stride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]); + if (block_width == 32) { + inner_sum = vaddq_u16(samples[2], inner_sum); + inner_sum = vaddq_u16(samples[3], inner_sum); + } + do { + sum = vpadalq_u16(sum, inner_sum); + } while (++y < block_height); + } + + // Here the left shift by 3 (to increase precision) is subtracted in right + // shift factor (block_width_log2 + block_height_log2 - 3). 
+ const uint32_t average_sum = RightShiftWithRounding( + SumVector(sum), block_width_log2 + block_height_log2 - 3); + const int16x8_t averages = vdupq_n_s16(static_cast(average_sum)); + + const auto* ssrc = static_cast(source); + int16x8_t ssamples_ext = vdupq_n_s16(0); + int16x8_t ssamples[4]; + luma_ptr = luma[0]; + y = visible_height; + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + if (max_luma_width > x) { + ssamples[idx] = vld1q_s16(&ssrc[x]); + ssamples[idx] = vshlq_n_s16(ssamples[idx], 3); + ssamples_ext = ssamples[idx]; + } else { + ssamples[idx] = LastRowResult(ssamples_ext); + } + vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages)); + } + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages)); + } + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template +void CflSubsampler444_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, + "This function will only work for block_width 16 and 32."); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int vert_inside = block_height <= max_luma_height; + if (vert_inside) { + CflSubsampler444_WxH_NEON( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_WxH_NEON( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template +void CflSubsampler420_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16x8_t samples_row0 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row1 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1); + + const uint16x8_t samples_row2 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row3 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3); + uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const uint16x8_t samples_row4 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row5 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5); + + const uint16x8_t samples_row6 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row7 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7); + sum = + vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = vpadalq_u16(final_sum, sum); + y -= 4; + } while (y != 0); + + const uint16x4_t final_fill 
= + vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride)); + const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill); + for (y = luma_height; y < block_height; ++y) { + vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill)); + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/); + const int16x4_t averages = vdup_n_s16(static_cast(average_sum)); + luma_ptr = luma[0]; + y = block_height; + do { + const int16x4_t samples = vld1_s16(luma_ptr); + vst1_s16(luma_ptr, vsub_s16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template +inline void CflSubsampler420Impl_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16x8_t samples_row00 = vld1q_u16(src); + const uint16x8_t samples_row01 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row00); + src += src_stride; + const uint16x8_t samples_row10 = vld1q_u16(src); + const uint16x8_t samples_row11 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row10); + src += src_stride; + const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10); + const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11); + uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row20 = vld1q_u16(src); + const uint16x8_t samples_row21 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row20); + src += src_stride; + const uint16x8_t samples_row30 = vld1q_u16(src); + const uint16x8_t samples_row31 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row30); + src += src_stride; + const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30); + const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row40 = vld1q_u16(src); + const uint16x8_t samples_row41 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row40); + src += src_stride; + const uint16x8_t samples_row50 = vld1q_u16(src); + const uint16x8_t samples_row51 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row50); + src += src_stride; + const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50); + const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row60 = vld1q_u16(src); + const uint16x8_t samples_row61 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row60); + src += src_stride; + const uint16x8_t samples_row70 = vld1q_u16(src); + const uint16x8_t samples_row71 = (max_luma_width == 16) + ? 
vld1q_u16(src + 8) + : LastRowSamples(samples_row70); + src += src_stride; + const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70); + const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = vpadalq_u16(final_sum, sum); + y -= 4; + } while (y != 0); + + // Duplicate the final row downward to the end after max_luma_height. + const uint16x8_t final_fill = + vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride)); + const uint32x4_t final_fill_to_sum = + vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill)); + + for (y = luma_height; y < block_height; ++y) { + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill)); + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } + + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/); + const int16x8_t averages = vdupq_n_s16(static_cast(average_sum)); + luma_ptr = luma[0]; + y = block_height; + do { + const int16x8_t samples = vld1q_s16(luma_ptr); + vst1q_s16(luma_ptr, vsubq_s16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template +void CflSubsampler420_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + CflSubsampler420Impl_8xH_NEON(luma, max_luma_height, + source, stride); + } else { + CflSubsampler420Impl_8xH_NEON(luma, max_luma_height, + source, stride); + } +} + +template +inline void CflSubsampler420Impl_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int16_t* luma_ptr = luma[0]; + // Begin first y section, covering width up to 32. + int y = luma_height; + + uint16x8_t final_fill0, final_fill1; + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16_t* src_next = src + src_stride; + const uint16x8_t samples_row00 = vld1q_u16(src); + const uint16x8_t samples_row01 = (max_luma_width >= 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row00); + const uint16x8_t samples_row02 = (max_luma_width >= 24) + ? vld1q_u16(src + 16) + : LastRowSamples(samples_row01); + const uint16x8_t samples_row03 = (max_luma_width == 32) + ? vld1q_u16(src + 24) + : LastRowSamples(samples_row02); + const uint16x8_t samples_row10 = vld1q_u16(src_next); + const uint16x8_t samples_row11 = (max_luma_width >= 16) + ? vld1q_u16(src_next + 8) + : LastRowSamples(samples_row10); + const uint16x8_t samples_row12 = (max_luma_width >= 24) + ? vld1q_u16(src_next + 16) + : LastRowSamples(samples_row11); + const uint16x8_t samples_row13 = (max_luma_width == 32) + ? 
vld1q_u16(src_next + 24) + : LastRowSamples(samples_row12); + const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10); + const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11); + const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12); + const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13); + final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1); + + final_sum = vpadalq_u16(final_sum, sum); + + // Because max_luma_width is at most 32, any values beyond x=16 will + // necessarily be duplicated. + if (block_width_log2 == 5) { + const uint16x8_t wide_fill = LastRowResult(final_fill1); + final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1)); + } + src += src_stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + // Begin second y section. + y = luma_height; + if (y < block_height) { + uint32x4_t wide_fill; + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. (a << 2) = (a + a) << 1. + wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2); + } + const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1); + const uint32x4_t final_fill_to_sum = vaddl_u16( + vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum)); + + do { + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0)); + vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1)); + if (block_width_log2 == 5) { + final_sum = vaddq_u32(final_sum, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. + + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_width_log2 + block_height_log2); + const int16x8_t averages = vdupq_n_s16(static_cast(average_sum)); + + luma_ptr = luma[0]; + y = block_height; + do { + const int16x8_t samples0 = vld1q_s16(luma_ptr); + vst1q_s16(luma_ptr, vsubq_s16(samples0, averages)); + const int16x8_t samples1 = vld1q_s16(luma_ptr + 8); + const int16x8_t final_row_result = vsubq_s16(samples1, averages); + vst1q_s16(luma_ptr + 8, final_row_result); + + if (block_width_log2 == 5) { + const int16x8_t wide_fill = LastRowResult(final_row_result); + vst1q_s16(luma_ptr + 16, wide_fill); + vst1q_s16(luma_ptr + 24, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +//------------------------------------------------------------------------------ +// Choose subsampler based on max_luma_width +template +void CflSubsampler420_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + switch (max_luma_width) { + case 8: + CflSubsampler420Impl_WxH_NEON( + luma, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_NEON( + luma, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_NEON( + luma, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_NEON( + luma, max_luma_height, source, stride); + return; + } +} + +//------------------------------------------------------------------------------ +// CflIntraPredictor + +// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive. 
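// (|luma| is the 8x-scaled, mean-removed output of the subsamplers above, so
// it fits in int16_t.)
// The (alpha * luma) >> 6 product is formed with vqrdmulhq_s16 on |alpha|
// pre-scaled by 1 << 9, since vqrdmulh returns RightShiftWithRounding(a * b, 15).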
+// |alpha| can be -16 to 16 (inclusive). +// Clip |dc + ((alpha * luma) >> 6))| to 0, (1 << bitdepth) - 1. +inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs, + const int16x8_t alpha_signed, const int16x8_t dc, + const uint16x8_t max_value) { + const int16x8_t luma_abs = vabsq_s16(luma); + const int16x8_t luma_alpha_sign = + vshrq_n_s16(veorq_s16(luma, alpha_signed), 15); + // (alpha * luma) >> 6 + const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs); + // Convert back to signed values. + const int16x8_t la = + vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign); + const int16x8_t result = vaddq_s16(la, dc); + const int16x8_t zero = vdupq_n_s16(0); + // Clip. + return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value); +} + +template +inline void CflIntraPredictor4xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; y += 2) { + const int16x4_t luma_row0 = vld1_s16(luma[y]); + const int16x4_t luma_row1 = vld1_s16(luma[y + 1]); + const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1); + const uint16x8_t sum = + Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value); + vst1_u16(dst, vget_low_u16(sum)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(sum)); + dst += dst_stride; + } +} + +template +inline void CflIntraPredictor8xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row = vld1q_s16(luma[y]); + const uint16x8_t sum = + Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum); + dst += dst_stride; + } +} + +template +inline void CflIntraPredictor16xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const uint16x8_t sum_0 = + Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_1 = + Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum_0); + vst1q_u16(dst + 8, sum_1); + dst += dst_stride; + } +} + +template +inline void CflIntraPredictor32xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast(dest); + const ptrdiff_t 
dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16); + const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24); + const uint16x8_t sum_0 = + Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_1 = + Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_2 = + Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_3 = + Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum_0); + vst1q_u16(dst + 8, sum_1); + vst1q_u16(dst + 16, sum_2); + vst1q_u16(dst + 24, sum_3); + dst += dst_stride; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<4>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<4>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<5>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 2>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 3>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 5>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 3>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 4>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 5>; + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<4>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<4>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<5>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 2>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 3>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 4>; + 
dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 5>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 3>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 4>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 5>; + + dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>; + + dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>; + + dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor16xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor16xN_NEON<32>; + dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor32xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor32xN_NEON<32>; + // Max Cfl predictor size is 32x32. +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredCflInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_cfl_neon.h b/src/dsp/arm/intrapred_cfl_neon.h new file mode 100644 index 0000000..b4f983a --- /dev/null +++ b/src/dsp/arm/intrapred_cfl_neon.h @@ -0,0 +1,179 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the +// defines below for specifics. These functions are not thread-safe. 
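// Usage note (illustrative, not part of the upstream comment): this init is
// expected to be called once during dsp initialization, alongside the other
// *_NEON init functions, before any frame is decoded; the LIBGAV1_Dsp8bpp_* /
// LIBGAV1_Dsp10bpp_* defines below only advertise which table entries it
// provides.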
+void IntraPredCflInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +// 4x4 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 
----------------------------------------------------------------------------- +// 10bpp + +// 4x4 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +#endif // 
LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc index 805ba81..3f5edbd 100644 --- a/src/dsp/arm/intrapred_directional_neon.cc +++ b/src/dsp/arm/intrapred_directional_neon.cc @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_directional.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON #include -#include // std::min +#include #include #include #include -#include // memset +#include #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" @@ -35,14 +35,14 @@ namespace dsp { namespace low_bitdepth { namespace { -// Blend two values based on a 32 bit weight. +// Blend two values based on weights that sum to 32. inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b, const uint8x8_t a_weight, const uint8x8_t b_weight) { const uint16x8_t a_product = vmull_u8(a, a_weight); const uint16x8_t b_product = vmull_u8(b, b_weight); - return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5); + return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/); } // For vertical operations the weights are one constant value. @@ -112,7 +112,7 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride, // 4 wide subsamples the output. 8 wide subsamples the input. if (width == 4) { const uint8x8_t left_values = vld1_u8(top + top_base_x); - const uint8x8_t right_values = RightShift<8>(left_values); + const uint8x8_t right_values = RightShiftVector<8>(left_values); const uint8x8_t value = WeightedBlend(left_values, right_values, shift); // If |upsampled| is true then extract every other value for output. @@ -910,12 +910,590 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// Blend two values based on weights that sum to 32. +inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, + const int a_weight, const int b_weight) { + const uint16x4_t a_product = vmul_n_u16(a, a_weight); + const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight); + + return vrshr_n_u16(sum, 5 /*log2(32)*/); +} + +// Blend two values based on weights that sum to 32. +inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, + const uint16_t a_weight, + const uint16_t b_weight) { + const uint16x8_t a_product = vmulq_n_u16(a, a_weight); + const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight); + + return vrshrq_n_u16(sum, 5 /*log2(32)*/); +} + +// Each element of |dest| contains values associated with one weight value. +inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source, + const bool upsampled) { + if (upsampled) { + *dest = vld2_u16(source); + } else { + dest->val[0] = vld1_u16(source); + dest->val[1] = vld1_u16(source + 1); + } +} + +// Each element of |dest| contains values associated with one weight value. 
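// As a scalar sketch (illustrative only), each directional output pixel fed by
// these helpers is
//   out = ((32 - shift) * edge[i] + shift * edge[i + 1] + 16) >> 5,
// where |shift| is the 5-bit fractional position along the edge. The overload
// below gathers the |edge[i]| / |edge[i + 1]| pairs for eight lanes at once:
// vld2q_u16 de-interleaves an upsampled edge, otherwise val[1] is simply
// val[0] advanced by one sample.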
+inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source, + const bool upsampled) { + if (upsampled) { + *dest = vld2q_u16(source); + } else { + dest->val[0] = vld1q_u16(source); + dest->val[1] = vld1q_u16(source + 1); + } +} + +template +inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride, + const int height, const uint16_t* const top, + const int xstep) { + const int upsample_shift = static_cast(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_x = (4 + height - 1) << upsample_shift; + const int16x4_t max_base = vdup_n_s16(max_base_x); + const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]); + const int16x4_t index_offset = {0, 1, 2, 3}; + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + int top_x = xstep; + int y = 0; + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + const int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. + const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset); + const uint16x4_t max_base_mask = vclt_s16(base_x, max_base); + + uint16x4x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x4_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + // If |upsampled| is true then extract every other value for output. + const uint16x4_t masked_result = + vbsl_u16(max_base_mask, combined, final_top_val); + + vst1_u16(dst, masked_result); + } + for (; y < height; ++y) { + Memset(dst, top[max_base_x], 4 /* width */); + dst += stride; + } +} + +// Process a multiple of 8 |width| by any |height|. Processes horizontally +// before vertically in the hopes of being a little more cache friendly. +template +inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride, + const int width, const int height, + const uint16_t* const top, const int xstep) { + assert(width % 8 == 0); + const int upsample_shift = static_cast(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_index = (width + height - 1) << upsample_shift; + const int16x8_t max_base_x = vdupq_n_s16(max_base_index); + const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]); + const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7}; + + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + const int16x8_t block_step = vdupq_n_s16(base_step8); + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. 
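  // Illustrative numbers: with no upsampling (index_scale_bits == 6) and
  // xstep == 128, xstep_units == 2, so every row at or beyond
  // max_base_index / 2 is filled entirely with the last edge sample.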
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_index / xstep_units, height); + + int top_x = xstep; + int y = 0; + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. + int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset); + + int x = 0; + do { + const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x); + + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + const uint16x8_t masked_result = + vbslq_u16(max_base_mask, combined, final_top_val); + vst1q_u16(dst + x, masked_result); + + base_x = vaddq_s16(base_x, block_step); + top_base_x += base_step8; + x += 8; + } while (x < width); + } + for (int i = y; i < height; ++i) { + Memset(dst, top[max_base_index], width); + dst += stride; + } +} + +// Process a multiple of 8 |width| by any |height|. Processes horizontally +// before vertically in the hopes of being a little more cache friendly. +inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride, + const int width, const int height, + const uint16_t* const top, const int xstep, + const bool upsampled) { + assert(width % 8 == 0); + const int upsample_shift = static_cast(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_index = (width + height - 1) << upsample_shift; + const int16x8_t max_base_x = vdupq_n_s16(max_base_index); + const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]); + const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7}; + + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + const int16x8_t block_step = vdupq_n_s16(base_step8); + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_index / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. + const int max_no_corner_y = std::min( + ((max_base_index - (base_step * width)) << index_scale_bits) / xstep, + height); + // No need to check for exceeding |max_base_x| in the first loop. + int y = 0; + int top_x = xstep; + for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. 
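    // |shift_0| is the 5-bit fractional pixel position along the top edge and
    // |shift_1| is its complement; together they form the pair of blend
    // weights (summing to 32) consumed by WeightedBlend() below.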
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + int x = 0; + do { + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + vst1q_u16(dst + x, combined); + + top_base_x += base_step8; + x += 8; + } while (x < width); + } + + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. + int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset); + + int x = 0; + const int min_corner_only_x = + std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) & + ~7; + for (; x < min_corner_only_x; x += 8, top_base_x += base_step8, + base_x = vaddq_s16(base_x, block_step)) { + const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x); + + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + const uint16x8_t masked_result = + vbslq_u16(max_base_mask, combined, final_top_val); + vst1q_u16(dst + x, masked_result); + } + // Corner-only section of the row. + Memset(dst + x, top[max_base_index], width - x); + } + for (; y < height; ++y) { + Memset(dst, top[max_base_index], width); + dst += stride; + } +} + +void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const int width, const int height, + const int xstep, + const bool upsampled_top) { + const uint16_t* const top = static_cast(top_row); + uint16_t* dst = static_cast(dest); + stride /= sizeof(top[0]); + + assert(xstep > 0); + + if (xstep == 64) { + assert(!upsampled_top); + const uint16_t* top_ptr = top + 1; + const int width_bytes = width * sizeof(top[0]); + int y = height; + do { + memcpy(dst, top_ptr, width_bytes); + memcpy(dst + stride, top_ptr + 1, width_bytes); + memcpy(dst + 2 * stride, top_ptr + 2, width_bytes); + memcpy(dst + 3 * stride, top_ptr + 3, width_bytes); + dst += 4 * stride; + top_ptr += 4; + y -= 4; + } while (y != 0); + } else { + if (width == 4) { + if (upsampled_top) { + DirectionalZone1_4xH(dst, stride, height, top, xstep); + } else { + DirectionalZone1_4xH(dst, stride, height, top, xstep); + } + } else if (width >= 32) { + if (upsampled_top) { + DirectionalZone1_Large(dst, stride, width, height, top, xstep, true); + } else { + DirectionalZone1_Large(dst, stride, width, height, top, xstep, false); + } + } else if (upsampled_top) { + DirectionalZone1_WxH(dst, stride, width, height, top, xstep); + } else { + DirectionalZone1_WxH(dst, stride, width, height, top, xstep); + } + } +} + +// ----------------------------------------------------------------------------- +// Zone 3 +// This can be considered "the transpose of Zone 1." In Zone 1, the fractional +// step applies when moving vertically in the destination block, connected to +// the change in |y|, whereas in this mode, the step applies when moving +// horizontally, connected to the change in |x|. 
This makes vectorization very +// complicated in row-order, because a given vector may need source pixels that +// span 16 or 32 pixels in steep angles, requiring multiple expensive table +// lookups and checked loads. Rather than work in row order, it is simpler to +// compute |dest| in column order, and then store the transposed results. + +// Compute 4x4 sub-blocks. +// Example of computed sub-blocks of a 4x8 block before and after transpose: +// 00 10 20 30 00 01 02 03 +// 01 11 21 31 10 11 12 13 +// 02 12 22 32 20 21 22 23 +// 03 13 23 33 30 31 32 33 +// ----------- --> ----------- +// 40 50 60 70 40 41 42 43 +// 41 51 61 71 50 51 52 53 +// 42 52 62 72 60 61 62 63 +// 43 53 63 73 70 71 72 73 +template +inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride, + const uint16_t* const left, const int ystep, + const int base_left_y = 0) { + const int upsample_shift = static_cast(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. + uint16x4_t result[4]; + + int left_y = base_left_y + ystep; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + uint16x4x2_t sampled_left_col; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose4x4(result); + Store4(dst, result[0]); + dst += stride; + Store4(dst, result[1]); + dst += stride; + Store4(dst, result[2]); + dst += stride; + Store4(dst, result[3]); +} + +template +inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride, + const int height, const uint16_t* const left, + const int ystep) { + const int upsample_shift = static_cast(upsampled); + int y = 0; + do { + DirectionalZone3_4x4(dest, stride, left + (y << upsample_shift), + ystep); + dest += 4 * stride; + y += 4; + } while (y < height); +} + +template +inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride, + const int width, const uint16_t* const left, + const int ystep) { + int x = 0; + int base_left_y = 0; + do { + // TODO(petersonab): Establish 8x4 transpose to reserve this function for + // 8x4 and 16x4. 
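    // Note: |dest| is addressed in bytes, so each 4-pixel step to the right
    // advances by 2 * x bytes for 10bpp pixels, and |base_left_y| advances by
    // 4 * ystep because each destination column moves one step of ystep along
    // the left edge.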
+ DirectionalZone3_4x4(dest + 2 * x, stride, left, ystep, + base_left_y); + base_left_y += 4 * ystep; + x += 4; + } while (x < width); +} + +template +inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride, + const uint16_t* const left, const int ystep, + const int base_left_y = 0) { + const int upsample_shift = static_cast(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. + uint16x8_t result[8]; + + int left_y = base_left_y + ystep; + uint16x8x2_t sampled_left_col; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose8x8(result); + Store8(dest, result[0]); + dest += stride; + Store8(dest, result[1]); + dest += stride; + Store8(dest, result[2]); + dest += stride; + Store8(dest, result[3]); + dest += stride; + Store8(dest, result[4]); + dest += stride; + Store8(dest, result[5]); + dest += stride; + Store8(dest, result[6]); + dest += stride; + Store8(dest, result[7]); +} + +template +inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride, + const 
int width, const int height, + const uint16_t* const left, const int ystep) { + const int upsample_shift = static_cast(upsampled); + // Zone3 never runs out of left_column values. + assert((width + height - 1) << upsample_shift > // max_base_y + ((ystep * width) >> (6 - upsample_shift)) + + (/* base_step */ 1 << upsample_shift) * + (height - 1)); // left_base_y + int y = 0; + do { + int x = 0; + uint8_t* dst_x = dest + y * stride; + do { + const int base_left_y = ystep * x; + DirectionalZone3_8x8( + dst_x, stride, left + (y << upsample_shift), ystep, base_left_y); + dst_x += 8 * sizeof(uint16_t); + x += 8; + } while (x < width); + y += 8; + } while (y < height); +} + +void DirectionalIntraPredictorZone3_NEON(void* const dest, + const ptrdiff_t stride, + const void* const left_column, + const int width, const int height, + const int ystep, + const bool upsampled_left) { + const uint16_t* const left = static_cast(left_column); + uint8_t* dst = static_cast(dest); + + if (ystep == 64) { + assert(!upsampled_left); + const int width_bytes = width * sizeof(left[0]); + int y = height; + do { + const uint16_t* left_ptr = left + 1; + memcpy(dst, left_ptr, width_bytes); + memcpy(dst + stride, left_ptr + 1, width_bytes); + memcpy(dst + 2 * stride, left_ptr + 2, width_bytes); + memcpy(dst + 3 * stride, left_ptr + 3, width_bytes); + dst += 4 * stride; + left_ptr += 4; + y -= 4; + } while (y != 0); + return; + } + if (width == 4) { + if (upsampled_left) { + DirectionalZone3_4xH(dst, stride, height, left, ystep); + } else { + DirectionalZone3_4xH(dst, stride, height, left, ystep); + } + } else if (height == 4) { + if (upsampled_left) { + DirectionalZone3_Wx4(dst, stride, width, left, ystep); + } else { + DirectionalZone3_Wx4(dst, stride, width, left, ystep); + } + } else { + if (upsampled_left) { + // |upsampled_left| can only be true if |width| + |height| <= 16, + // therefore this is 8x8. + DirectionalZone3_8x8(dst, stride, left, ystep); + } else { + DirectionalZone3_WxH(dst, stride, width, height, left, ystep); + } + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON; + dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredDirectionalInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h new file mode 100644 index 0000000..f7d6235 --- /dev/null +++ b/src/dsp/arm/intrapred_directional_neon.h @@ -0,0 +1,56 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for +// specifics. These functions are not thread-safe. +void IntraPredDirectionalInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON +#endif + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_intra_neon.cc deleted file mode 100644 index 411708e..0000000 --- a/src/dsp/arm/intrapred_filter_intra_neon.cc +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright 2019 The libgav1 Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "src/dsp/intrapred.h" -#include "src/utils/cpu.h" - -#if LIBGAV1_ENABLE_NEON - -#include - -#include -#include -#include - -#include "src/dsp/arm/common_neon.h" -#include "src/dsp/constants.h" -#include "src/dsp/dsp.h" -#include "src/utils/common.h" - -namespace libgav1 { -namespace dsp { - -namespace low_bitdepth { -namespace { - -// Transpose kFilterIntraTaps and convert the first row to unsigned values. -// -// With the previous orientation we were able to multiply all the input values -// by a single tap. This required that all the input values be in one vector -// which requires expensive set up operations (shifts, vext, vtbl). All the -// elements of the result needed to be summed (easy on A64 - vaddvq_s16) but -// then the shifting, rounding, and clamping was done in GP registers. -// -// Switching to unsigned values allows multiplying the 8 bit inputs directly. -// When one value was negative we needed to vmovl_u8 first so that the results -// maintained the proper sign. -// -// We take this into account when summing the values by subtracting the product -// of the first row. -alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] = - {{{6, 5, 3, 3, 4, 3, 3, 3}, // Original values are negative. 
- {10, 2, 1, 1, 6, 2, 2, 1}, - {0, 10, 1, 1, 0, 6, 2, 2}, - {0, 0, 10, 2, 0, 0, 6, 2}, - {0, 0, 0, 10, 0, 0, 0, 6}, - {12, 9, 7, 5, 2, 2, 2, 3}, - {0, 0, 0, 0, 12, 9, 7, 5}}, - {{10, 6, 4, 2, 10, 6, 4, 2}, // Original values are negative. - {16, 0, 0, 0, 16, 0, 0, 0}, - {0, 16, 0, 0, 0, 16, 0, 0}, - {0, 0, 16, 0, 0, 0, 16, 0}, - {0, 0, 0, 16, 0, 0, 0, 16}, - {10, 6, 4, 2, 0, 0, 0, 0}, - {0, 0, 0, 0, 10, 6, 4, 2}}, - {{8, 8, 8, 8, 4, 4, 4, 4}, // Original values are negative. - {8, 0, 0, 0, 4, 0, 0, 0}, - {0, 8, 0, 0, 0, 4, 0, 0}, - {0, 0, 8, 0, 0, 0, 4, 0}, - {0, 0, 0, 8, 0, 0, 0, 4}, - {16, 16, 16, 16, 0, 0, 0, 0}, - {0, 0, 0, 0, 16, 16, 16, 16}}, - {{2, 1, 1, 0, 1, 1, 1, 1}, // Original values are negative. - {8, 3, 2, 1, 4, 3, 2, 2}, - {0, 8, 3, 2, 0, 4, 3, 2}, - {0, 0, 8, 3, 0, 0, 4, 3}, - {0, 0, 0, 8, 0, 0, 0, 4}, - {10, 6, 4, 2, 3, 4, 4, 3}, - {0, 0, 0, 0, 10, 6, 4, 3}}, - {{12, 10, 9, 8, 10, 9, 8, 7}, // Original values are negative. - {14, 0, 0, 0, 12, 1, 0, 0}, - {0, 14, 0, 0, 0, 12, 0, 0}, - {0, 0, 14, 0, 0, 0, 12, 1}, - {0, 0, 0, 14, 0, 0, 0, 12}, - {14, 12, 11, 10, 0, 0, 1, 1}, - {0, 0, 0, 0, 14, 12, 11, 9}}}; - -void FilterIntraPredictor_NEON(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - FilterIntraPredictor pred, int width, - int height) { - const uint8_t* const top = static_cast(top_row); - const uint8_t* const left = static_cast(left_column); - - assert(width <= 32 && height <= 32); - - uint8_t* dst = static_cast(dest); - - uint8x8_t transposed_taps[7]; - for (int i = 0; i < 7; ++i) { - transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]); - } - - uint8_t relative_top_left = top[-1]; - const uint8_t* relative_top = top; - uint8_t relative_left[2] = {left[0], left[1]}; - - int y = 0; - do { - uint8_t* row_dst = dst; - int x = 0; - do { - uint16x8_t sum = vdupq_n_u16(0); - const uint16x8_t subtrahend = - vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left)); - for (int i = 1; i < 5; ++i) { - sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1])); - } - for (int i = 5; i < 7; ++i) { - sum = - vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5])); - } - - const int16x8_t sum_signed = - vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend)); - const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4); - - uint8x8_t sum_saturated = vqmovun_s16(sum_shifted); - - StoreLo4(row_dst, sum_saturated); - StoreHi4(row_dst + stride, sum_saturated); - - // Progress across - relative_top_left = relative_top[3]; - relative_top += 4; - relative_left[0] = row_dst[3]; - relative_left[1] = row_dst[3 + stride]; - row_dst += 4; - x += 4; - } while (x < width); - - // Progress down. 
- relative_top_left = left[y + 1]; - relative_top = dst + stride; - relative_left[0] = left[y + 2]; - relative_left[1] = left[y + 3]; - - dst += 2 * stride; - y += 2; - } while (y < height); -} - -void Init8bpp() { - Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); - assert(dsp != nullptr); - dsp->filter_intra_predictor = FilterIntraPredictor_NEON; -} - -} // namespace -} // namespace low_bitdepth - -void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); } - -} // namespace dsp -} // namespace libgav1 - -#else // !LIBGAV1_ENABLE_NEON -namespace libgav1 { -namespace dsp { - -void IntraPredFilterIntraInit_NEON() {} - -} // namespace dsp -} // namespace libgav1 -#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/intrapred_filter_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc new file mode 100644 index 0000000..bd9f61d --- /dev/null +++ b/src/dsp/arm/intrapred_filter_neon.cc @@ -0,0 +1,176 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { + +namespace low_bitdepth { +namespace { + +// Transpose kFilterIntraTaps and convert the first row to unsigned values. +// +// With the previous orientation we were able to multiply all the input values +// by a single tap. This required that all the input values be in one vector +// which requires expensive set up operations (shifts, vext, vtbl). All the +// elements of the result needed to be summed (easy on A64 - vaddvq_s16) but +// then the shifting, rounding, and clamping was done in GP registers. +// +// Switching to unsigned values allows multiplying the 8 bit inputs directly. +// When one value was negative we needed to vmovl_u8 first so that the results +// maintained the proper sign. +// +// We take this into account when summing the values by subtracting the product +// of the first row. +alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] = + {{{6, 5, 3, 3, 4, 3, 3, 3}, // Original values are negative. + {10, 2, 1, 1, 6, 2, 2, 1}, + {0, 10, 1, 1, 0, 6, 2, 2}, + {0, 0, 10, 2, 0, 0, 6, 2}, + {0, 0, 0, 10, 0, 0, 0, 6}, + {12, 9, 7, 5, 2, 2, 2, 3}, + {0, 0, 0, 0, 12, 9, 7, 5}}, + {{10, 6, 4, 2, 10, 6, 4, 2}, // Original values are negative. + {16, 0, 0, 0, 16, 0, 0, 0}, + {0, 16, 0, 0, 0, 16, 0, 0}, + {0, 0, 16, 0, 0, 0, 16, 0}, + {0, 0, 0, 16, 0, 0, 0, 16}, + {10, 6, 4, 2, 0, 0, 0, 0}, + {0, 0, 0, 0, 10, 6, 4, 2}}, + {{8, 8, 8, 8, 4, 4, 4, 4}, // Original values are negative. + {8, 0, 0, 0, 4, 0, 0, 0}, + {0, 8, 0, 0, 0, 4, 0, 0}, + {0, 0, 8, 0, 0, 0, 4, 0}, + {0, 0, 0, 8, 0, 0, 0, 4}, + {16, 16, 16, 16, 0, 0, 0, 0}, + {0, 0, 0, 0, 16, 16, 16, 16}}, + {{2, 1, 1, 0, 1, 1, 1, 1}, // Original values are negative. 
+ {8, 3, 2, 1, 4, 3, 2, 2}, + {0, 8, 3, 2, 0, 4, 3, 2}, + {0, 0, 8, 3, 0, 0, 4, 3}, + {0, 0, 0, 8, 0, 0, 0, 4}, + {10, 6, 4, 2, 3, 4, 4, 3}, + {0, 0, 0, 0, 10, 6, 4, 3}}, + {{12, 10, 9, 8, 10, 9, 8, 7}, // Original values are negative. + {14, 0, 0, 0, 12, 1, 0, 0}, + {0, 14, 0, 0, 0, 12, 0, 0}, + {0, 0, 14, 0, 0, 0, 12, 1}, + {0, 0, 0, 14, 0, 0, 0, 12}, + {14, 12, 11, 10, 0, 0, 1, 1}, + {0, 0, 0, 0, 14, 12, 11, 9}}}; + +void FilterIntraPredictor_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + FilterIntraPredictor pred, int width, + int height) { + const uint8_t* const top = static_cast(top_row); + const uint8_t* const left = static_cast(left_column); + + assert(width <= 32 && height <= 32); + + uint8_t* dst = static_cast(dest); + + uint8x8_t transposed_taps[7]; + for (int i = 0; i < 7; ++i) { + transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]); + } + + uint8_t relative_top_left = top[-1]; + const uint8_t* relative_top = top; + uint8_t relative_left[2] = {left[0], left[1]}; + + int y = 0; + do { + uint8_t* row_dst = dst; + int x = 0; + do { + uint16x8_t sum = vdupq_n_u16(0); + const uint16x8_t subtrahend = + vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left)); + for (int i = 1; i < 5; ++i) { + sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1])); + } + for (int i = 5; i < 7; ++i) { + sum = + vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5])); + } + + const int16x8_t sum_signed = + vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend)); + const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4); + + uint8x8_t sum_saturated = vqmovun_s16(sum_shifted); + + StoreLo4(row_dst, sum_saturated); + StoreHi4(row_dst + stride, sum_saturated); + + // Progress across + relative_top_left = relative_top[3]; + relative_top += 4; + relative_left[0] = row_dst[3]; + relative_left[1] = row_dst[3 + stride]; + row_dst += 4; + x += 4; + } while (x < width); + + // Progress down. + relative_top_left = left[y + 1]; + relative_top = dst + stride; + relative_left[0] = left[y + 2]; + relative_left[1] = left[y + 3]; + + dst += 2 * stride; + y += 2; + } while (y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->filter_intra_predictor = FilterIntraPredictor_NEON; +} + +} // namespace +} // namespace low_bitdepth + +void IntraPredFilterInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void IntraPredFilterInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/intrapred_filter_neon.h b/src/dsp/arm/intrapred_filter_neon.h new file mode 100644 index 0000000..283c1b1 --- /dev/null +++ b/src/dsp/arm/intrapred_filter_neon.h @@ -0,0 +1,37 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::filter_intra_predictor, see the defines below for specifics. +// These functions are not thread-safe. +void IntraPredFilterInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc index c967d82..c143648 100644 --- a/src/dsp/arm/intrapred_neon.cc +++ b/src/dsp/arm/intrapred_neon.cc @@ -26,6 +26,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -964,6 +965,200 @@ struct DcDefs { using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>; }; +// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows + +template +void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast(left_column); + auto* dst = static_cast(dest); + int y = 0; + do { + auto* dst16 = reinterpret_cast(dst); + const uint16x4_t row = vld1_dup_u16(left + y); + vst1_u16(dst16, row); + dst += stride; + } while (++y < block_height); +} + +template +void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast(left_column); + auto* dst = static_cast(dest); + int y = 0; + do { + auto* dst16 = reinterpret_cast(dst); + const uint16x8_t row = vld1q_dup_u16(left + y); + vst1q_u16(dst16, row); + dst += stride; + } while (++y < block_height); +} + +template +void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast(left_column); + auto* dst = static_cast(dest); + int y = 0; + do { + const uint16x8_t row0 = vld1q_dup_u16(left + y); + const uint16x8_t row1 = vld1q_dup_u16(left + y + 1); + auto* dst16 = reinterpret_cast(dst); + vst1q_u16(dst16, row0); + vst1q_u16(dst16 + 8, row0); + dst += stride; + dst16 = reinterpret_cast(dst); + vst1q_u16(dst16, row1); + vst1q_u16(dst16 + 8, row1); + dst += stride; + y += 2; + } while (y < block_height); +} + +template +void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast(left_column); + auto* dst = static_cast(dest); + int y = 0; + do { + const uint16x8_t row0 = vld1q_dup_u16(left + y); + const uint16x8_t row1 = vld1q_dup_u16(left + y + 1); + auto* dst16 = reinterpret_cast(dst); + vst1q_u16(dst16, row0); + vst1q_u16(dst16 + 8, row0); + vst1q_u16(dst16 + 16, row0); + vst1q_u16(dst16 + 24, row0); + dst += stride; + dst16 = reinterpret_cast(dst); + vst1q_u16(dst16, row1); + vst1q_u16(dst16 + 8, row1); + vst1q_u16(dst16 + 16, row1); + vst1q_u16(dst16 + 24, row1); + dst += stride; + y += 2; + } while (y < block_height); +} + +// IntraPredFuncs_NEON::Vertical -- copy top row to all rows + +template +void Vertical4xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast(top_row); + 
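  // These Vertical predictors copy raw bytes rather than pixels: a 4-pixel
  // 10bpp row is 8 bytes, so one uint8x8_t load/store per row suffices and
  // |stride| is used directly without converting to a pixel stride.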
auto* dst = static_cast(dest); + const uint8x8_t row = vld1_u8(top); + int y = block_height; + do { + vst1_u8(dst, row); + dst += stride; + } while (--y != 0); +} + +template +void Vertical8xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast(top_row); + auto* dst = static_cast(dest); + const uint8x16_t row = vld1q_u8(top); + int y = block_height; + do { + vst1q_u8(dst, row); + dst += stride; + } while (--y != 0); +} + +template +void Vertical16xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast(top_row); + auto* dst = static_cast(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + dst += stride; + y -= 2; + } while (y != 0); +} + +template +void Vertical32xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast(top_row); + auto* dst = static_cast(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + const uint8x16_t row2 = vld1q_u8(top + 32); + const uint8x16_t row3 = vld1q_u8(top + 48); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + dst += stride; + y -= 2; + } while (y != 0); +} + +template +void Vertical64xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast(top_row); + auto* dst = static_cast(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + const uint8x16_t row2 = vld1q_u8(top + 32); + const uint8x16_t row3 = vld1q_u8(top + 48); + const uint8x16_t row4 = vld1q_u8(top + 64); + const uint8x16_t row5 = vld1q_u8(top + 80); + const uint8x16_t row6 = vld1q_u8(top + 96); + const uint8x16_t row7 = vld1q_u8(top + 112); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + vst1q_u8(dst + 64, row4); + vst1q_u8(dst + 80, row5); + vst1q_u8(dst + 96, row6); + vst1q_u8(dst + 112, row7); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + vst1q_u8(dst + 64, row4); + vst1q_u8(dst + 80, row5); + vst1q_u8(dst + 96, row6); + vst1q_u8(dst + 112, row7); + dst += stride; + y -= 2; + } while (y != 0); +} + void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); @@ -973,6 +1168,8 @@ void Init10bpp() { DcDefs::_4x4::DcLeft; dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = DcDefs::_4x4::Dc; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] = + Vertical4xH_NEON<4>; // 4x8 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = @@ -981,6 +1178,10 @@ void Init10bpp() { DcDefs::_4x8::DcLeft; dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = DcDefs::_4x8::Dc; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + Horizontal4xH_NEON<8>; + 
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] = + Vertical4xH_NEON<8>; // 4x16 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = @@ -989,6 +1190,10 @@ void Init10bpp() { DcDefs::_4x16::DcLeft; dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = DcDefs::_4x16::Dc; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + Horizontal4xH_NEON<16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + Vertical4xH_NEON<16>; // 8x4 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = @@ -997,6 +1202,8 @@ void Init10bpp() { DcDefs::_8x4::DcLeft; dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = DcDefs::_8x4::Dc; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + Vertical8xH_NEON<4>; // 8x8 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = @@ -1005,6 +1212,10 @@ void Init10bpp() { DcDefs::_8x8::DcLeft; dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = DcDefs::_8x8::Dc; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + Horizontal8xH_NEON<8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + Vertical8xH_NEON<8>; // 8x16 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = @@ -1013,6 +1224,8 @@ void Init10bpp() { DcDefs::_8x16::DcLeft; dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = DcDefs::_8x16::Dc; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + Vertical8xH_NEON<16>; // 8x32 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = @@ -1021,6 +1234,10 @@ void Init10bpp() { DcDefs::_8x32::DcLeft; dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = DcDefs::_8x32::Dc; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + Horizontal8xH_NEON<32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + Vertical8xH_NEON<32>; // 16x4 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = @@ -1029,6 +1246,8 @@ void Init10bpp() { DcDefs::_16x4::DcLeft; dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = DcDefs::_16x4::Dc; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + Vertical16xH_NEON<4>; // 16x8 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = @@ -1037,6 +1256,10 @@ void Init10bpp() { DcDefs::_16x8::DcLeft; dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = DcDefs::_16x8::Dc; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + Horizontal16xH_NEON<8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + Vertical16xH_NEON<8>; // 16x16 dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = @@ -1045,6 +1268,8 @@ void Init10bpp() { DcDefs::_16x16::DcLeft; dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = DcDefs::_16x16::Dc; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + Vertical16xH_NEON<16>; // 16x32 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = @@ -1053,6 +1278,8 @@ void Init10bpp() { DcDefs::_16x32::DcLeft; dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = DcDefs::_16x32::Dc; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + Vertical16xH_NEON<32>; // 16x64 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = @@ -1061,6 +1288,8 @@ void Init10bpp() { DcDefs::_16x64::DcLeft; dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = DcDefs::_16x64::Dc; + 
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + Vertical16xH_NEON<64>; // 32x8 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = @@ -1069,6 +1298,8 @@ void Init10bpp() { DcDefs::_32x8::DcLeft; dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = DcDefs::_32x8::Dc; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + Vertical32xH_NEON<8>; // 32x16 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = @@ -1077,6 +1308,8 @@ void Init10bpp() { DcDefs::_32x16::DcLeft; dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = DcDefs::_32x16::Dc; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + Vertical32xH_NEON<16>; // 32x32 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = @@ -1085,6 +1318,8 @@ void Init10bpp() { DcDefs::_32x32::DcLeft; dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = DcDefs::_32x32::Dc; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + Vertical32xH_NEON<32>; // 32x64 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = @@ -1093,6 +1328,10 @@ void Init10bpp() { DcDefs::_32x64::DcLeft; dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = DcDefs::_32x64::Dc; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + Horizontal32xH_NEON<64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + Vertical32xH_NEON<64>; // 64x16 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = @@ -1101,6 +1340,8 @@ void Init10bpp() { DcDefs::_64x16::DcLeft; dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = DcDefs::_64x16::Dc; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + Vertical64xH_NEON<16>; // 64x32 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = @@ -1109,6 +1350,8 @@ void Init10bpp() { DcDefs::_64x32::DcLeft; dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = DcDefs::_64x32::Dc; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + Vertical64xH_NEON<32>; // 64x64 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = @@ -1117,6 +1360,8 @@ void Init10bpp() { DcDefs::_64x64::DcLeft; dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = DcDefs::_64x64::Dc; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + Vertical64xH_NEON<64>; } } // namespace @@ -1133,7 +1378,7 @@ void IntraPredInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h index 16f858c..b27f29f 100644 --- a/src/dsp/arm/intrapred_neon.h +++ b/src/dsp/arm/intrapred_neon.h @@ -23,396 +23,282 @@ namespace libgav1 { namespace dsp { -// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, -// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and -// Dsp::filter_intra_predictor, see the defines below for specifics. These -// functions are not thread-safe. -void IntraPredCflInit_NEON(); -void IntraPredDirectionalInit_NEON(); -void IntraPredFilterIntraInit_NEON(); +// Initializes Dsp::intra_predictors. +// See the defines below for specifics. These functions are not thread-safe. 
void IntraPredInit_NEON(); -void IntraPredSmoothInit_NEON(); } // namespace dsp } // namespace libgav1 #if LIBGAV1_ENABLE_NEON -// 8 bit -#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON - // 4x4 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON // 4x8 #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON // 4x16 #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON // 8x4 #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON // 8x8 #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON // 8x16 #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON // 8x32 #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON // 16x4 #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON // 16x8 #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define 
LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON // 16x16 #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON // 16x32 #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON // 16x64 #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 32x8 #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON // 32x16 #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON // 32x32 #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON // 32x64 #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x16 #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x32 #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x64 #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 10 bit // 4x4 #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 4x8 #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 4x16 #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x4 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x8 #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x16 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x32 #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 
16x4 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x8 #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x16 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x32 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x64 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x8 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x16 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x32 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x64 #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 64x16 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \ + 
LIBGAV1_CPU_NEON // 64x32 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 64x64 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc index abc93e8..c33f333 100644 --- a/src/dsp/arm/intrapred_smooth_neon.cc +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_smooth.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON @@ -26,6 +26,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -605,7 +606,7 @@ void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h new file mode 100644 index 0000000..edd01be --- /dev/null +++ b/src/dsp/arm/intrapred_smooth_neon.h @@ -0,0 +1,149 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. +// This function is not thread-safe. 
+void IntraPredSmoothInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc new file mode 100644 index 0000000..ff184a1 --- /dev/null +++ b/src/dsp/arm/inverse_transform_10bit_neon.cc @@ -0,0 +1,2543 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/inverse_transform.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Include the constants and utility functions inside the anonymous namespace. 
+#include "src/dsp/inverse_transform.inc" + +//------------------------------------------------------------------------------ + +LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4], + int32x4_t out[4]) { + // in: + // 00 01 02 03 + // 10 11 12 13 + // 20 21 22 23 + // 30 31 32 33 + + // 00 10 02 12 a.val[0] + // 01 11 03 13 a.val[1] + // 20 30 22 32 b.val[0] + // 21 31 23 33 b.val[1] + const int32x4x2_t a = vtrnq_s32(in[0], in[1]); + const int32x4x2_t b = vtrnq_s32(in[2], in[3]); + out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2); + out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2); + out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2); + out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2); + // out: + // 00 10 20 30 + // 01 11 21 31 + // 02 12 22 32 + // 03 13 23 33 +} + +//------------------------------------------------------------------------------ +template +LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx, + const int32x4_t* const s) { + assert(store_count % 4 == 0); + for (int i = 0; i < store_count; i += 4) { + vst1q_s32(&dst[i * stride + idx], s[i]); + vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]); + vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]); + vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]); + } +} + +template +LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride, + int32_t idx, int32x4_t* x) { + assert(load_count % 4 == 0); + for (int i = 0; i < load_count; i += 4) { + x[i] = vld1q_s32(&src[i * stride + idx]); + x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]); + x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]); + x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]); + } +} + +// Butterfly rotate 4 values. +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b, + const int angle, + const bool flip) { + const int32_t cos128 = Cos128(angle); + const int32_t sin128 = Sin128(angle); + const int32x4_t acc_x = vmulq_n_s32(*a, cos128); + const int32x4_t acc_y = vmulq_n_s32(*a, sin128); + // The max range for the input is 18 bits. The cos128/sin128 is 13 bits, + // which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in a 32 + // bit lane. 
+  const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+  const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+                                                         int32x4_t* b,
+                                                         const int angle,
+                                                         const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  assert(sin128 <= 0xfff);
+  const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+  const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+                                                          int32x4_t* b,
+                                                          const int angle,
+                                                          const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+  const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+                                            bool flip) {
+  int32x4_t x, y;
+  if (flip) {
+    y = vqaddq_s32(*b, *a);
+    x = vqsubq_s32(*b, *a);
+  } else {
+    x = vqaddq_s32(*a, *b);
+    y = vqsubq_s32(*a, *b);
+  }
+  *a = x;
+  *b = y;
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+                                            bool flip, const int32x4_t* min,
+                                            const int32x4_t* max) {
+  int32x4_t x, y;
+  if (flip) {
+    y = vqaddq_s32(*b, *a);
+    x = vqsubq_s32(*b, *a);
+  } else {
+    x = vqaddq_s32(*a, *b);
+    y = vqsubq_s32(*a, *b);
+  }
+  *a = vmaxq_s32(vminq_s32(x, *max), *min);
+  *b = vmaxq_s32(vminq_s32(y, *max), *min);
+}
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+                                       bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+  const int32_t cos128 = Cos128(32);
+  const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+  // vqrshlq_s32 will shift right if shift value is negative.
+  const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+  // Clamp result to signed 16 bits.
+  const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+  if (width == 4) {
+    vst1q_s32(dst, result);
+  } else {
+    for (int i = 0; i < width; i += 4) {
+      vst1q_s32(dst, result);
+      dst += 4;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32_t cos128 = Cos128(32);
+
+  // Calculate dc values for first row.
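+  // Cos128(32) is cos(pi/4) in 12-bit fixed point; the vqrdmulhq_n_s32 below
+  // applies that scale (promoted to Q31 by the shift) with rounding.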
+ if (width == 4) { + const int32x4_t v_src = vld1q_s32(dst); + const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12)); + vst1q_s32(dst, xy); + } else { + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&dst[i]); + const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12)); + vst1q_s32(&dst[i], xy); + i += 4; + } while (i < width); + } + + // Copy first row to the rest of the block. + for (int y = 1; y < height; ++y) { + memcpy(&dst[y * width], dst, width * sizeof(dst[0])); + } + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min, + const int32x4_t* max, + const bool is_last_stage) { + // stage 12. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true); + ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false); + } else { + butterfly_rotation(&s[0], &s[1], 32, true); + butterfly_rotation(&s[2], &s[3], 48, false); + } + + // stage 17. + if (is_last_stage) { + HadamardRotation(&s[0], &s[3], false); + HadamardRotation(&s[1], &s[2], false); + } else { + HadamardRotation(&s[0], &s[3], false, min, max); + HadamardRotation(&s[1], &s[2], false, min, max); + } +} + +template +LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + // When |is_row| is true, set range to the row range, otherwise, set to the + // column range. + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[4], x[4]; + + LoadSrc<4>(dst, step, 0, x); + if (is_row) { + Transpose4x4(x, x); + } + + // stage 1. + // kBitReverseLookup 0, 2, 1, 3 + s[0] = x[0]; + s[1] = x[2]; + s[2] = x[1]; + s[3] = x[3]; + + Dct4Stages(s, &min, &max, /*is_last_stage=*/true); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 4; ++i) { + s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift))); + } + Transpose4x4(s, s); + } + StoreDst<4>(dst, step, 0, s); +} + +template +LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min, + const int32x4_t* max, + const bool is_last_stage) { + // stage 8. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false); + ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false); + } else { + butterfly_rotation(&s[4], &s[7], 56, false); + butterfly_rotation(&s[5], &s[6], 24, false); + } + + // stage 13. + HadamardRotation(&s[4], &s[5], false, min, max); + HadamardRotation(&s[6], &s[7], true, min, max); + + // stage 18. + butterfly_rotation(&s[6], &s[5], 32, true); + + // stage 22. + if (is_last_stage) { + HadamardRotation(&s[0], &s[7], false); + HadamardRotation(&s[1], &s[6], false); + HadamardRotation(&s[2], &s[5], false); + HadamardRotation(&s[3], &s[4], false); + } else { + HadamardRotation(&s[0], &s[7], false, min, max); + HadamardRotation(&s[1], &s[6], false, min, max); + HadamardRotation(&s[2], &s[5], false, min, max); + HadamardRotation(&s[3], &s[4], false, min, max); + } +} + +// Process dct8 rows or columns, depending on the |is_row| flag. +template +LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? 
kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[8], x[8]; + + if (is_row) { + LoadSrc<4>(dst, step, 0, &x[0]); + LoadSrc<4>(dst, step, 4, &x[4]); + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + } else { + LoadSrc<8>(dst, step, 0, &x[0]); + } + + // stage 1. + // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7, + s[0] = x[0]; + s[1] = x[4]; + s[2] = x[2]; + s[3] = x[6]; + s[4] = x[1]; + s[5] = x[5]; + s[6] = x[3]; + s[7] = x[7]; + + Dct4Stages(s, &min, &max, /*is_last_stage=*/false); + Dct8Stages(s, &min, &max, /*is_last_stage=*/true); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 8; ++i) { + s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift))); + } + Transpose4x4(&s[0], &s[0]); + Transpose4x4(&s[4], &s[4]); + StoreDst<4>(dst, step, 0, &s[0]); + StoreDst<4>(dst, step, 4, &s[4]); + } else { + StoreDst<8>(dst, step, 0, &s[0]); + } +} + +template +LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min, + const int32x4_t* max, + const bool is_last_stage) { + // stage 5. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false); + ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false); + ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false); + ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false); + } else { + butterfly_rotation(&s[8], &s[15], 60, false); + butterfly_rotation(&s[9], &s[14], 28, false); + butterfly_rotation(&s[10], &s[13], 44, false); + butterfly_rotation(&s[11], &s[12], 12, false); + } + + // stage 9. + HadamardRotation(&s[8], &s[9], false, min, max); + HadamardRotation(&s[10], &s[11], true, min, max); + HadamardRotation(&s[12], &s[13], false, min, max); + HadamardRotation(&s[14], &s[15], true, min, max); + + // stage 14. + butterfly_rotation(&s[14], &s[9], 48, true); + butterfly_rotation(&s[13], &s[10], 112, true); + + // stage 19. + HadamardRotation(&s[8], &s[11], false, min, max); + HadamardRotation(&s[9], &s[10], false, min, max); + HadamardRotation(&s[12], &s[15], true, min, max); + HadamardRotation(&s[13], &s[14], true, min, max); + + // stage 23. + butterfly_rotation(&s[13], &s[10], 32, true); + butterfly_rotation(&s[12], &s[11], 32, true); + + // stage 26. + if (is_last_stage) { + HadamardRotation(&s[0], &s[15], false); + HadamardRotation(&s[1], &s[14], false); + HadamardRotation(&s[2], &s[13], false); + HadamardRotation(&s[3], &s[12], false); + HadamardRotation(&s[4], &s[11], false); + HadamardRotation(&s[5], &s[10], false); + HadamardRotation(&s[6], &s[9], false); + HadamardRotation(&s[7], &s[8], false); + } else { + HadamardRotation(&s[0], &s[15], false, min, max); + HadamardRotation(&s[1], &s[14], false, min, max); + HadamardRotation(&s[2], &s[13], false, min, max); + HadamardRotation(&s[3], &s[12], false, min, max); + HadamardRotation(&s[4], &s[11], false, min, max); + HadamardRotation(&s[5], &s[10], false, min, max); + HadamardRotation(&s[6], &s[9], false, min, max); + HadamardRotation(&s[7], &s[8], false, min, max); + } +} + +// Process dct16 rows or columns, depending on the |is_row| flag. +template +LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? 
kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[16], x[16]; + + if (is_row) { + for (int idx = 0; idx < 16; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + LoadSrc<16>(dst, step, 0, &x[0]); + } + + // stage 1 + // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + s[0] = x[0]; + s[1] = x[8]; + s[2] = x[4]; + s[3] = x[12]; + s[4] = x[2]; + s[5] = x[10]; + s[6] = x[6]; + s[7] = x[14]; + s[8] = x[1]; + s[9] = x[9]; + s[10] = x[5]; + s[11] = x[13]; + s[12] = x[3]; + s[13] = x[11]; + s[14] = x[7]; + s[15] = x[15]; + + Dct4Stages(s, &min, &max, /*is_last_stage=*/false); + Dct8Stages(s, &min, &max, /*is_last_stage=*/false); + Dct16Stages(s, &min, &max, /*is_last_stage=*/true); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 16; ++i) { + s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift))); + } + for (int idx = 0; idx < 16; idx += 8) { + Transpose4x4(&s[idx], &s[idx]); + Transpose4x4(&s[idx + 4], &s[idx + 4]); + StoreDst<4>(dst, step, idx, &s[idx]); + StoreDst<4>(dst, step, idx + 4, &s[idx + 4]); + } + } else { + StoreDst<16>(dst, step, 0, &s[0]); + } +} + +template +LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min, + const int32x4_t* max, + const bool is_last_stage) { + // stage 3 + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false); + ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false); + ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false); + ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false); + ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false); + ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false); + ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false); + ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false); + } else { + butterfly_rotation(&s[16], &s[31], 62, false); + butterfly_rotation(&s[17], &s[30], 30, false); + butterfly_rotation(&s[18], &s[29], 46, false); + butterfly_rotation(&s[19], &s[28], 14, false); + butterfly_rotation(&s[20], &s[27], 54, false); + butterfly_rotation(&s[21], &s[26], 22, false); + butterfly_rotation(&s[22], &s[25], 38, false); + butterfly_rotation(&s[23], &s[24], 6, false); + } + + // stage 6. + HadamardRotation(&s[16], &s[17], false, min, max); + HadamardRotation(&s[18], &s[19], true, min, max); + HadamardRotation(&s[20], &s[21], false, min, max); + HadamardRotation(&s[22], &s[23], true, min, max); + HadamardRotation(&s[24], &s[25], false, min, max); + HadamardRotation(&s[26], &s[27], true, min, max); + HadamardRotation(&s[28], &s[29], false, min, max); + HadamardRotation(&s[30], &s[31], true, min, max); + + // stage 10. + butterfly_rotation(&s[30], &s[17], 24 + 32, true); + butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true); + butterfly_rotation(&s[26], &s[21], 24, true); + butterfly_rotation(&s[25], &s[22], 24 + 64, true); + + // stage 15. 
+ HadamardRotation(&s[16], &s[19], false, min, max); + HadamardRotation(&s[17], &s[18], false, min, max); + HadamardRotation(&s[20], &s[23], true, min, max); + HadamardRotation(&s[21], &s[22], true, min, max); + HadamardRotation(&s[24], &s[27], false, min, max); + HadamardRotation(&s[25], &s[26], false, min, max); + HadamardRotation(&s[28], &s[31], true, min, max); + HadamardRotation(&s[29], &s[30], true, min, max); + + // stage 20. + butterfly_rotation(&s[29], &s[18], 48, true); + butterfly_rotation(&s[28], &s[19], 48, true); + butterfly_rotation(&s[27], &s[20], 48 + 64, true); + butterfly_rotation(&s[26], &s[21], 48 + 64, true); + + // stage 24. + HadamardRotation(&s[16], &s[23], false, min, max); + HadamardRotation(&s[17], &s[22], false, min, max); + HadamardRotation(&s[18], &s[21], false, min, max); + HadamardRotation(&s[19], &s[20], false, min, max); + HadamardRotation(&s[24], &s[31], true, min, max); + HadamardRotation(&s[25], &s[30], true, min, max); + HadamardRotation(&s[26], &s[29], true, min, max); + HadamardRotation(&s[27], &s[28], true, min, max); + + // stage 27. + butterfly_rotation(&s[27], &s[20], 32, true); + butterfly_rotation(&s[26], &s[21], 32, true); + butterfly_rotation(&s[25], &s[22], 32, true); + butterfly_rotation(&s[24], &s[23], 32, true); + + // stage 29. + if (is_last_stage) { + HadamardRotation(&s[0], &s[31], false); + HadamardRotation(&s[1], &s[30], false); + HadamardRotation(&s[2], &s[29], false); + HadamardRotation(&s[3], &s[28], false); + HadamardRotation(&s[4], &s[27], false); + HadamardRotation(&s[5], &s[26], false); + HadamardRotation(&s[6], &s[25], false); + HadamardRotation(&s[7], &s[24], false); + HadamardRotation(&s[8], &s[23], false); + HadamardRotation(&s[9], &s[22], false); + HadamardRotation(&s[10], &s[21], false); + HadamardRotation(&s[11], &s[20], false); + HadamardRotation(&s[12], &s[19], false); + HadamardRotation(&s[13], &s[18], false); + HadamardRotation(&s[14], &s[17], false); + HadamardRotation(&s[15], &s[16], false); + } else { + HadamardRotation(&s[0], &s[31], false, min, max); + HadamardRotation(&s[1], &s[30], false, min, max); + HadamardRotation(&s[2], &s[29], false, min, max); + HadamardRotation(&s[3], &s[28], false, min, max); + HadamardRotation(&s[4], &s[27], false, min, max); + HadamardRotation(&s[5], &s[26], false, min, max); + HadamardRotation(&s[6], &s[25], false, min, max); + HadamardRotation(&s[7], &s[24], false, min, max); + HadamardRotation(&s[8], &s[23], false, min, max); + HadamardRotation(&s[9], &s[22], false, min, max); + HadamardRotation(&s[10], &s[21], false, min, max); + HadamardRotation(&s[11], &s[20], false, min, max); + HadamardRotation(&s[12], &s[19], false, min, max); + HadamardRotation(&s[13], &s[18], false, min, max); + HadamardRotation(&s[14], &s[17], false, min, max); + HadamardRotation(&s[15], &s[16], false, min, max); + } +} + +// Process dct32 rows or columns, depending on the |is_row| flag. +LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step, + const bool is_row, int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? 
kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[32], x[32]; + + if (is_row) { + for (int idx = 0; idx < 32; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + LoadSrc<32>(dst, step, 0, &x[0]); + } + + // stage 1 + // kBitReverseLookup + // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, + s[0] = x[0]; + s[1] = x[16]; + s[2] = x[8]; + s[3] = x[24]; + s[4] = x[4]; + s[5] = x[20]; + s[6] = x[12]; + s[7] = x[28]; + s[8] = x[2]; + s[9] = x[18]; + s[10] = x[10]; + s[11] = x[26]; + s[12] = x[6]; + s[13] = x[22]; + s[14] = x[14]; + s[15] = x[30]; + + // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31, + s[16] = x[1]; + s[17] = x[17]; + s[18] = x[9]; + s[19] = x[25]; + s[20] = x[5]; + s[21] = x[21]; + s[22] = x[13]; + s[23] = x[29]; + s[24] = x[3]; + s[25] = x[19]; + s[26] = x[11]; + s[27] = x[27]; + s[28] = x[7]; + s[29] = x[23]; + s[30] = x[15]; + s[31] = x[31]; + + Dct4Stages(s, &min, &max, /*is_last_stage=*/false); + Dct8Stages(s, &min, &max, /*is_last_stage=*/false); + Dct16Stages(s, &min, &max, /*is_last_stage=*/false); + Dct32Stages(s, &min, &max, /*is_last_stage=*/true); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int idx = 0; idx < 32; idx += 8) { + int32x4_t output[8]; + Transpose4x4(&s[idx], &output[0]); + Transpose4x4(&s[idx + 4], &output[4]); + for (int i = 0; i < 8; ++i) { + output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift))); + } + StoreDst<4>(dst, step, idx, &output[0]); + StoreDst<4>(dst, step, idx + 4, &output[4]); + } + } else { + StoreDst<32>(dst, step, 0, &s[0]); + } +} + +void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[64], x[32]; + + if (is_row) { + // The last 32 values of every row are always zero if the |tx_width| is + // 64. + for (int idx = 0; idx < 32; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + // The last 32 values of every column are always zero if the |tx_height| is + // 64. 
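+ // Consequently only x[0..31] are populated below; the dct64-only stage 2
+ // uses the ButterflyRotation_FirstIsZero / _SecondIsZero fast paths for the
+ // partners that are known to be zero, skipping half of the multiplies.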
+ LoadSrc<32>(dst, step, 0, &x[0]); + } + + // stage 1 + // kBitReverseLookup + // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60, + s[0] = x[0]; + s[2] = x[16]; + s[4] = x[8]; + s[6] = x[24]; + s[8] = x[4]; + s[10] = x[20]; + s[12] = x[12]; + s[14] = x[28]; + + // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62, + s[16] = x[2]; + s[18] = x[18]; + s[20] = x[10]; + s[22] = x[26]; + s[24] = x[6]; + s[26] = x[22]; + s[28] = x[14]; + s[30] = x[30]; + + // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61, + s[32] = x[1]; + s[34] = x[17]; + s[36] = x[9]; + s[38] = x[25]; + s[40] = x[5]; + s[42] = x[21]; + s[44] = x[13]; + s[46] = x[29]; + + // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63 + s[48] = x[3]; + s[50] = x[19]; + s[52] = x[11]; + s[54] = x[27]; + s[56] = x[7]; + s[58] = x[23]; + s[60] = x[15]; + s[62] = x[31]; + + Dct4Stages( + s, &min, &max, /*is_last_stage=*/false); + Dct8Stages( + s, &min, &max, /*is_last_stage=*/false); + Dct16Stages( + s, &min, &max, /*is_last_stage=*/false); + Dct32Stages( + s, &min, &max, /*is_last_stage=*/false); + + //-- start dct 64 stages + // stage 2. + ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false); + ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false); + ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false); + ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false); + ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false); + ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false); + ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false); + ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false); + ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false); + ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false); + ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false); + ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false); + ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false); + ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false); + ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false); + ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false); + + // stage 4. + HadamardRotation(&s[32], &s[33], false, &min, &max); + HadamardRotation(&s[34], &s[35], true, &min, &max); + HadamardRotation(&s[36], &s[37], false, &min, &max); + HadamardRotation(&s[38], &s[39], true, &min, &max); + HadamardRotation(&s[40], &s[41], false, &min, &max); + HadamardRotation(&s[42], &s[43], true, &min, &max); + HadamardRotation(&s[44], &s[45], false, &min, &max); + HadamardRotation(&s[46], &s[47], true, &min, &max); + HadamardRotation(&s[48], &s[49], false, &min, &max); + HadamardRotation(&s[50], &s[51], true, &min, &max); + HadamardRotation(&s[52], &s[53], false, &min, &max); + HadamardRotation(&s[54], &s[55], true, &min, &max); + HadamardRotation(&s[56], &s[57], false, &min, &max); + HadamardRotation(&s[58], &s[59], true, &min, &max); + HadamardRotation(&s[60], &s[61], false, &min, &max); + HadamardRotation(&s[62], &s[63], true, &min, &max); + + // stage 7. 
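+ // This function always works on four columns (one int32x4_t) at a time, so
+ // the 4-lane ButterflyRotation_4 is called directly from here on instead of
+ // being passed in as a template parameter like in the smaller DCT helpers.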
+ ButterflyRotation_4(&s[62], &s[33], 60 - 0, true); + ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true); + ButterflyRotation_4(&s[58], &s[37], 60 - 32, true); + ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true); + ButterflyRotation_4(&s[54], &s[41], 60 - 16, true); + ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true); + ButterflyRotation_4(&s[50], &s[45], 60 - 48, true); + ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true); + + // stage 11. + HadamardRotation(&s[32], &s[35], false, &min, &max); + HadamardRotation(&s[33], &s[34], false, &min, &max); + HadamardRotation(&s[36], &s[39], true, &min, &max); + HadamardRotation(&s[37], &s[38], true, &min, &max); + HadamardRotation(&s[40], &s[43], false, &min, &max); + HadamardRotation(&s[41], &s[42], false, &min, &max); + HadamardRotation(&s[44], &s[47], true, &min, &max); + HadamardRotation(&s[45], &s[46], true, &min, &max); + HadamardRotation(&s[48], &s[51], false, &min, &max); + HadamardRotation(&s[49], &s[50], false, &min, &max); + HadamardRotation(&s[52], &s[55], true, &min, &max); + HadamardRotation(&s[53], &s[54], true, &min, &max); + HadamardRotation(&s[56], &s[59], false, &min, &max); + HadamardRotation(&s[57], &s[58], false, &min, &max); + HadamardRotation(&s[60], &s[63], true, &min, &max); + HadamardRotation(&s[61], &s[62], true, &min, &max); + + // stage 16. + ButterflyRotation_4(&s[61], &s[34], 56, true); + ButterflyRotation_4(&s[60], &s[35], 56, true); + ButterflyRotation_4(&s[59], &s[36], 56 + 64, true); + ButterflyRotation_4(&s[58], &s[37], 56 + 64, true); + ButterflyRotation_4(&s[53], &s[42], 56 - 32, true); + ButterflyRotation_4(&s[52], &s[43], 56 - 32, true); + ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true); + ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true); + + // stage 21. + HadamardRotation(&s[32], &s[39], false, &min, &max); + HadamardRotation(&s[33], &s[38], false, &min, &max); + HadamardRotation(&s[34], &s[37], false, &min, &max); + HadamardRotation(&s[35], &s[36], false, &min, &max); + HadamardRotation(&s[40], &s[47], true, &min, &max); + HadamardRotation(&s[41], &s[46], true, &min, &max); + HadamardRotation(&s[42], &s[45], true, &min, &max); + HadamardRotation(&s[43], &s[44], true, &min, &max); + HadamardRotation(&s[48], &s[55], false, &min, &max); + HadamardRotation(&s[49], &s[54], false, &min, &max); + HadamardRotation(&s[50], &s[53], false, &min, &max); + HadamardRotation(&s[51], &s[52], false, &min, &max); + HadamardRotation(&s[56], &s[63], true, &min, &max); + HadamardRotation(&s[57], &s[62], true, &min, &max); + HadamardRotation(&s[58], &s[61], true, &min, &max); + HadamardRotation(&s[59], &s[60], true, &min, &max); + + // stage 25. + ButterflyRotation_4(&s[59], &s[36], 48, true); + ButterflyRotation_4(&s[58], &s[37], 48, true); + ButterflyRotation_4(&s[57], &s[38], 48, true); + ButterflyRotation_4(&s[56], &s[39], 48, true); + ButterflyRotation_4(&s[55], &s[40], 112, true); + ButterflyRotation_4(&s[54], &s[41], 112, true); + ButterflyRotation_4(&s[53], &s[42], 112, true); + ButterflyRotation_4(&s[52], &s[43], 112, true); + + // stage 28. 
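+ // Stage 28 folds the two 16-element halves of the odd part together:
+ // s[32..39] pair with s[47..40] and s[48..55] pair with s[63..56], again
+ // clamped to [min, max].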
+ HadamardRotation(&s[32], &s[47], false, &min, &max); + HadamardRotation(&s[33], &s[46], false, &min, &max); + HadamardRotation(&s[34], &s[45], false, &min, &max); + HadamardRotation(&s[35], &s[44], false, &min, &max); + HadamardRotation(&s[36], &s[43], false, &min, &max); + HadamardRotation(&s[37], &s[42], false, &min, &max); + HadamardRotation(&s[38], &s[41], false, &min, &max); + HadamardRotation(&s[39], &s[40], false, &min, &max); + HadamardRotation(&s[48], &s[63], true, &min, &max); + HadamardRotation(&s[49], &s[62], true, &min, &max); + HadamardRotation(&s[50], &s[61], true, &min, &max); + HadamardRotation(&s[51], &s[60], true, &min, &max); + HadamardRotation(&s[52], &s[59], true, &min, &max); + HadamardRotation(&s[53], &s[58], true, &min, &max); + HadamardRotation(&s[54], &s[57], true, &min, &max); + HadamardRotation(&s[55], &s[56], true, &min, &max); + + // stage 30. + ButterflyRotation_4(&s[55], &s[40], 32, true); + ButterflyRotation_4(&s[54], &s[41], 32, true); + ButterflyRotation_4(&s[53], &s[42], 32, true); + ButterflyRotation_4(&s[52], &s[43], 32, true); + ButterflyRotation_4(&s[51], &s[44], 32, true); + ButterflyRotation_4(&s[50], &s[45], 32, true); + ButterflyRotation_4(&s[49], &s[46], 32, true); + ButterflyRotation_4(&s[48], &s[47], 32, true); + + // stage 31. + for (int i = 0; i < 32; i += 4) { + HadamardRotation(&s[i], &s[63 - i], false, &min, &max); + HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max); + HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max); + HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max); + } + //-- end dct 64 stages + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int idx = 0; idx < 64; idx += 8) { + int32x4_t output[8]; + Transpose4x4(&s[idx], &output[0]); + Transpose4x4(&s[idx + 4], &output[4]); + for (int i = 0; i < 8; ++i) { + output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift))); + } + StoreDst<4>(dst, step, idx, &output[0]); + StoreDst<4>(dst, step, idx + 4, &output[4]); + } + } else { + StoreDst<64>(dst, step, 0, &s[0]); + } +} + +//------------------------------------------------------------------------------ +// Asymmetric Discrete Sine Transforms (ADST). +LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + int32x4_t s[8]; + int32x4_t x[4]; + + LoadSrc<4>(dst, step, 0, x); + if (is_row) { + Transpose4x4(x, x); + } + + // stage 1. + s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]); + s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]); + + // stage 2. + const int32x4_t a7 = vsubq_s32(x[0], x[2]); + const int32x4_t b7 = vaddq_s32(a7, x[3]); + + // stage 3. + s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]); + s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]); + // s[0] = s[0] + s[3] + s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]); + // s[1] = s[1] - s[4] + s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]); + + s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]); + s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]); + + // stage 4. + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[6]); + + // stages 5 and 6. 
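+ // In scalar terms the four outputs below are
+ //   x0 = (s0 + s3 + 2048) >> 12,  x1 = (s1 + s3 + 2048) >> 12,
+ //   x2 = (s2 + 2048) >> 12,       x3 = (s0 + s1 - s3 + 2048) >> 12,
+ // i.e. a rounding shift by the 12-bit precision of kAdst4Multiplier.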
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]); + const int32x4_t x1 = vaddq_s32(s[1], s[3]); + const int32x4_t x3_a = vaddq_s32(s[0], s[1]); + const int32x4_t x3 = vsubq_s32(x3_a, s[3]); + x[0] = vrshrq_n_s32(x0, 12); + x[1] = vrshrq_n_s32(x1, 12); + x[2] = vrshrq_n_s32(s[2], 12); + x[3] = vrshrq_n_s32(x3, 12); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift))); + x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift))); + x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift))); + x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift))); + Transpose4x4(x, x); + } + StoreDst<4>(dst, step, 0, x); +} + +alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344, + 2482}; + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[2]; + + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src0_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + + const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0); + const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier); + s[1] = vdupq_n_s32(0); + + // s0*k0 s0*k1 s0*k2 s0*k1 + s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src); + // 0 0 0 s0*k0 + s[1] = vextq_s32(s[1], s[0], 1); + + const int32x4_t x3 = vaddq_s32(s[0], s[1]); + const int32x4_t dst_0 = vrshrq_n_s32(x3, 12); + + // vqrshlq_s32 will shift right if shift value is negative. + vst1q_s32(dst, + vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift))))); + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[4]; + + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&dst[i]); + + s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]); + s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]); + s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]); + + const int32x4_t x0 = s[0]; + const int32x4_t x1 = s[1]; + const int32x4_t x2 = s[2]; + const int32x4_t x3 = vaddq_s32(s[0], s[1]); + const int32x4_t dst_0 = vrshrq_n_s32(x0, 12); + const int32x4_t dst_1 = vrshrq_n_s32(x1, 12); + const int32x4_t dst_2 = vrshrq_n_s32(x2, 12); + const int32x4_t dst_3 = vrshrq_n_s32(x3, 12); + + vst1q_s32(&dst[i], dst_0); + vst1q_s32(&dst[i + width * 1], dst_1); + vst1q_s32(&dst[i + width * 2], dst_2); + vst1q_s32(&dst[i + width * 3], dst_3); + + i += 4; + } while (i < width); + + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[8], x[8]; + + if (is_row) { + LoadSrc<4>(dst, step, 0, &x[0]); + LoadSrc<4>(dst, step, 4, &x[4]); + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + } else { + LoadSrc<8>(dst, step, 0, &x[0]); + } + + // stage 1. + s[0] = x[7]; + s[1] = x[0]; + s[2] = x[5]; + s[3] = x[2]; + s[4] = x[3]; + s[5] = x[4]; + s[6] = x[1]; + s[7] = x[6]; + + // stage 2. 
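+ // (As in the DCT row passes, the 4x4 transposes on load mean each
+ // int32x4_t lane below carries one of the four rows processed in parallel;
+ // the column pass operates on the coefficients in place.)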
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true); + butterfly_rotation(&s[2], &s[3], 60 - 16, true); + butterfly_rotation(&s[4], &s[5], 60 - 32, true); + butterfly_rotation(&s[6], &s[7], 60 - 48, true); + + // stage 3. + HadamardRotation(&s[0], &s[4], false, &min, &max); + HadamardRotation(&s[1], &s[5], false, &min, &max); + HadamardRotation(&s[2], &s[6], false, &min, &max); + HadamardRotation(&s[3], &s[7], false, &min, &max); + + // stage 4. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[2], false, &min, &max); + HadamardRotation(&s[4], &s[6], false, &min, &max); + HadamardRotation(&s[1], &s[3], false, &min, &max); + HadamardRotation(&s[5], &s[7], false, &min, &max); + + // stage 6. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + + // stage 7. + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 8; ++i) { + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift))); + } + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + StoreDst<4>(dst, step, 0, &x[0]); + StoreDst<4>(dst, step, 4, &x[4]); + } else { + StoreDst<8>(dst, step, 0, &x[0]); + } +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[8]; + + const int32x4_t v_src = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12)); + // stage 1. + s[1] = vbslq_s32(v_mask, v_src_round, v_src); + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + int32x4_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + for (int i = 0; i < 8; ++i) { + // vqrshlq_s32 will shift right if shift value is negative. + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift)))); + vst1q_lane_s32(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[8]; + + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(dst); + // stage 1. + s[1] = v_src; + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. 
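+ // Stage 7 is the final ADST output permutation; vqnegq_s32 supplies the
+ // sign flips (saturating negate, so INT32_MIN cannot wrap).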
+ int32x4_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + for (int j = 0; j < 8; ++j) { + vst1q_s32(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[16], x[16]; + + if (is_row) { + for (int idx = 0; idx < 16; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + LoadSrc<16>(dst, step, 0, &x[0]); + } + + // stage 1. + s[0] = x[15]; + s[1] = x[0]; + s[2] = x[13]; + s[3] = x[2]; + s[4] = x[11]; + s[5] = x[4]; + s[6] = x[9]; + s[7] = x[6]; + s[8] = x[7]; + s[9] = x[8]; + s[10] = x[5]; + s[11] = x[10]; + s[12] = x[3]; + s[13] = x[12]; + s[14] = x[1]; + s[15] = x[14]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 62 - 0, true); + butterfly_rotation(&s[2], &s[3], 62 - 8, true); + butterfly_rotation(&s[4], &s[5], 62 - 16, true); + butterfly_rotation(&s[6], &s[7], 62 - 24, true); + butterfly_rotation(&s[8], &s[9], 62 - 32, true); + butterfly_rotation(&s[10], &s[11], 62 - 40, true); + butterfly_rotation(&s[12], &s[13], 62 - 48, true); + butterfly_rotation(&s[14], &s[15], 62 - 56, true); + + // stage 3. + HadamardRotation(&s[0], &s[8], false, &min, &max); + HadamardRotation(&s[1], &s[9], false, &min, &max); + HadamardRotation(&s[2], &s[10], false, &min, &max); + HadamardRotation(&s[3], &s[11], false, &min, &max); + HadamardRotation(&s[4], &s[12], false, &min, &max); + HadamardRotation(&s[5], &s[13], false, &min, &max); + HadamardRotation(&s[6], &s[14], false, &min, &max); + HadamardRotation(&s[7], &s[15], false, &min, &max); + + // stage 4. + butterfly_rotation(&s[8], &s[9], 56 - 0, true); + butterfly_rotation(&s[13], &s[12], 8 + 0, true); + butterfly_rotation(&s[10], &s[11], 56 - 32, true); + butterfly_rotation(&s[15], &s[14], 8 + 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[4], false, &min, &max); + HadamardRotation(&s[8], &s[12], false, &min, &max); + HadamardRotation(&s[1], &s[5], false, &min, &max); + HadamardRotation(&s[9], &s[13], false, &min, &max); + HadamardRotation(&s[2], &s[6], false, &min, &max); + HadamardRotation(&s[10], &s[14], false, &min, &max); + HadamardRotation(&s[3], &s[7], false, &min, &max); + HadamardRotation(&s[11], &s[15], false, &min, &max); + + // stage 6. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[12], &s[13], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + butterfly_rotation(&s[15], &s[14], 48 - 32, true); + + // stage 7. + HadamardRotation(&s[0], &s[2], false, &min, &max); + HadamardRotation(&s[4], &s[6], false, &min, &max); + HadamardRotation(&s[8], &s[10], false, &min, &max); + HadamardRotation(&s[12], &s[14], false, &min, &max); + HadamardRotation(&s[1], &s[3], false, &min, &max); + HadamardRotation(&s[5], &s[7], false, &min, &max); + HadamardRotation(&s[9], &s[11], false, &min, &max); + HadamardRotation(&s[13], &s[15], false, &min, &max); + + // stage 8. 
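+ // An angle of 32 selects the 45-degree rotation, where the sine and cosine
+ // factors coincide; it is the last mixing step before the output
+ // permutation in stage 9.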
+ butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + butterfly_rotation(&s[10], &s[11], 32, true); + butterfly_rotation(&s[14], &s[15], 32, true); + + // stage 9. + x[0] = s[0]; + x[1] = vqnegq_s32(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s32(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s32(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s32(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s32(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s32(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s32(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s32(s[1]); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 16; ++i) { + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift))); + } + for (int idx = 0; idx < 16; idx += 8) { + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + StoreDst<4>(dst, step, idx, &x[idx]); + StoreDst<4>(dst, step, idx + 4, &x[idx + 4]); + } + } else { + StoreDst<16>(dst, step, 0, &x[0]); + } +} + +LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) { + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true); + + // stage 3. + s[8] = s[0]; + s[9] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[8], &s[9], 56, true); + + // stage 5. + s[4] = s[0]; + s[12] = s[8]; + s[5] = s[1]; + s[13] = s[9]; + + // stage 6. + ButterflyRotation_4(&s[4], &s[5], 48, true); + ButterflyRotation_4(&s[12], &s[13], 48, true); + + // stage 7. + s[2] = s[0]; + s[6] = s[4]; + s[10] = s[8]; + s[14] = s[12]; + s[3] = s[1]; + s[7] = s[5]; + s[11] = s[9]; + s[15] = s[13]; + + // stage 8. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + ButterflyRotation_4(&s[10], &s[11], 32, true); + ButterflyRotation_4(&s[14], &s[15], 32, true); + + // stage 9. + x[0] = s[0]; + x[1] = vqnegq_s32(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s32(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s32(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s32(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s32(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s32(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s32(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s32(s[1]); +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[16]; + int32x4_t x[16]; + const int32x4_t v_src = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12)); + // stage 1. + s[1] = vbslq_s32(v_mask, v_src_round, v_src); + + Adst16DcOnlyInternal(s, x); + + for (int i = 0; i < 16; ++i) { + // vqrshlq_s32 will shift right if shift value is negative. + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift)))); + vst1q_lane_s32(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, + int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int i = 0; + do { + int32x4_t s[16]; + int32x4_t x[16]; + const int32x4_t v_src = vld1q_s32(dst); + // stage 1. + s[1] = v_src; + + Adst16DcOnlyInternal(s, x); + + for (int j = 0; j < 16; ++j) { + vst1q_s32(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +//------------------------------------------------------------------------------ +// Identity Transforms. 
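+ // The identity transforms do not mix coefficients, they only rescale them:
+ // identity4 by approximately sqrt(2) (kIdentity4Multiplier, 12-bit fixed
+ // point), identity8 by 2, and identity16 by approximately 2 * sqrt(2)
+ // (kIdentity16Multiplier). As a scalar sketch, the identity4 row step below
+ // computes, per coefficient,
+ //   dst[i] = (dst[i] * kIdentity4Multiplier + ((1 + 2 * shift) << 11))
+ //                >> (12 + shift),
+ // saturated to the int16 range, with one combined rounding term covering
+ // both the multiply and the row shift.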
+ +LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) { + auto* const dst = static_cast(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_mult_lo = + vmlaq_s32(v_dual_round, v_src, v_multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo))); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int shift = tx_height < 16 ? 0 : 1; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int32_t* source) { + static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16, + "Invalid identity_size."); + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11); + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + + if (tx_width == 4) { + int i = 0; + do { + int32x4x2_t v_src, v_dst_i, a, b; + v_src.val[0] = vld1q_s32(&source[i * 4]); + v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]); + if (identity_size == 4) { + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } else if (identity_size == 8) { + v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]); + v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]); + a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4); + a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4); + } else { // identity_size == 16 + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst); + frame_data.val[1] = vld1_u16(dst + stride); + b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + dst += stride << 1; + i += 2; + } while (i < 
tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + int32x4x2_t v_src, v_dst_i, a, b; + v_src.val[0] = vld1q_s32(&source[row + j]); + v_src.val[1] = vld1q_s32(&source[row + j + 4]); + if (identity_size == 4) { + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } else if (identity_size == 8) { + v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]); + v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]); + a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4); + a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4); + } else { // identity_size == 16 + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst + j); + frame_data.val[1] = vld1_u16(dst + j + 4); + b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int32_t* source) { + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11); + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + + if (tx_width == 4) { + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&source[i * 4]); + const int32x4_t v_dst_row = + vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12); + const int32x4_t v_dst_col = + vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier); + const uint16x4_t frame_data = vld1_u16(dst); + const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12); + const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data)); + vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b; + v_src.val[0] = vld1q_s32(&source[row + j]); + v_src.val[1] = vld1q_s32(&source[row + j + 4]); + v_src_round.val[0] = vshrq_n_s32( + vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12); + v_src_round.val[1] = vshrq_n_s32( + vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12); + v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]); + v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]); + v_dst_col.val[0] = + vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier); + v_dst_col.val[1] = + vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier); + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst + j); + frame_data.val[1] = vld1_u16(dst + j + 4); + a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12); + a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12); + b.val[0] = vaddw_s16(a.val[0], 
vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height equal to 32 can be simplified from + // ((A * 2) + 2) >> 2) to ((A + 1) >> 1). + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]); + const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1); + const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo))); + vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi))); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]); + const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo); + const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo))); + vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi))); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src); + const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift)); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step, + int shift) { + auto* const dst = static_cast(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + int32x4x2_t v_src; + v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]); + v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]); + const int32x4_t v_src_mult_lo = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + const int32x4_t v_src_mult_hi = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift); + vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo))); + vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi))); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 
0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_src_mult_lo = + vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift))); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +//------------------------------------------------------------------------------ +// row/column transform loops + +template +LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) { + if (tx_width >= 16) { + int i = 0; + do { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + const int32x4_t c = vld1q_s32(&source[i + 8]); + const int32x4_t d = vld1q_s32(&source[i + 12]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + const int32x4_t c_rev = vrev64q_s32(c); + const int32x4_t d_rev = vrev64q_s32(d); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2)); + vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2)); + vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2)); + i += 16; + } while (i < tx_width * tx_height); + } else if (tx_width == 8) { + for (int i = 0; i < 8 * tx_height; i += 8) { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2)); + } + } else { + // Process two rows per iteration. + for (int i = 0; i < 4 * tx_height; i += 8) { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2)); + } + } +} + +template +LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) { + // Process two rows per iteration. + int i = 0; + do { + const int32x4_t a_lo = vld1q_s32(&source[i]); + const int32x4_t a_hi = vld1q_s32(&source[i + 4]); + const int32x4_t b_lo = + vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12)); + const int32x4_t b_hi = + vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12)); + vst1q_s32(&source[i], b_lo); + vst1q_s32(&source[i + 4], b_hi); + i += 8; + } while (i < tx_width * num_rows); +} + +template +LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows, + int row_shift) { + // vqrshlq_s32 will shift right if shift value is negative. + row_shift = -row_shift; + + // Process two rows per iteration. 
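+ // Scalar equivalent: each coefficient becomes
+ // RightShiftWithRounding(coefficient, row_shift) for the original
+ // (positive) row_shift argument; eight values are handled per iteration.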
+ int i = 0; + do { + const int32x4_t residual0 = vld1q_s32(&source[i]); + const int32x4_t residual1 = vld1q_s32(&source[i + 4]); + vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift))); + vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift))); + i += 8; + } while (i < tx_width * num_rows); +} + +template +LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int32_t* source, TransformType tx_type) { + const bool flip_rows = + enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false; + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + + if (tx_width == 4) { + for (int i = 0; i < tx_height; ++i) { + const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4; + const int32x4_t residual = vld1q_s32(&source[row]); + const uint16x4_t frame_data = vld1_u16(dst); + const int32x4_t a = vrshrq_n_s32(residual, 4); + const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data); + const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b)); + vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1))); + dst += stride; + } + } else { + for (int i = 0; i < tx_height; ++i) { + const int y = start_y + i; + const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width; + int j = 0; + do { + const int x = start_x + j; + const int32x4_t residual = vld1q_s32(&source[row + j]); + const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]); + const uint16x8_t frame_data = vld1q_u16(frame[y] + x); + const int32x4_t a = vrshrq_n_s32(residual, 4); + const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4); + const uint32x4_t b = + vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data)); + const uint32x4_t b_hi = + vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data)); + const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b)); + const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi)); + vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi), + vdupq_n_u16((1 << kBitdepth10) - 1))); + j += 8; + } while (j < tx_width); + } + } +} + +void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + const int row_shift = (tx_height == 16); + + if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + // Process 4 1d dct4 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Dct4_NEON(data, /*step=*/4, /*is_row=*/true, + row_shift); + data += 16; + i -= 4; + } while (i != 0); +} + +void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct4 columns in parallel per iteration. 
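+ // The column pass uses |tx_width| as the step so that vertically adjacent
+ // coefficients are one row apart, and applies no transpose and no row
+ // shift; the final rounding shift by 4 happens in StoreToFrameWithRound
+ // when the result is added to the frame.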
+ int i = tx_width; + auto* data = src; + do { + Dct4_NEON(data, tx_width, /*transpose=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // Process 4 1d dct8 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Dct8_NEON(data, /*step=*/8, /*is_row=*/true, + row_shift); + data += 32; + i -= 4; + } while (i != 0); +} + +void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct8 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct8_NEON(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct16 rows in parallel per iteration. + Dct16_NEON(data, 16, /*is_row=*/true, row_shift); + data += 64; + i -= 4; + } while (i != 0); +} + +void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct16 columns in parallel per iteration. 
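+ // DctDcOnlyColumn already handled the case where only the top row of
+ // coefficients is non-zero; otherwise each group of four columns gets a
+ // full dct16 below.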
+ int i = tx_width; + auto* data = src; + do { + Dct16_NEON(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<32>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct32 rows in parallel per iteration. + Dct32_NEON(data, 32, /*is_row=*/true, row_shift); + data += 128; + i -= 4; + } while (i != 0); +} + +void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<32>(src, tx_width); + } + + if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct32 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<64>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct64 rows in parallel per iteration. + Dct64_NEON(data, 64, /*is_row=*/true, row_shift); + data += 128 * 2; + i -= 4; + } while (i != 0); +} + +void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<64>(src, tx_width); + } + + if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct64 columns in parallel per iteration. 
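+ // As in the row pass, only the first 32 coefficients of each 64-sample
+ // column can be non-zero, so Dct64_NEON loads 32 values and writes back
+ // all 64 outputs.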
+ int i = tx_width; + auto* data = src; + do { + Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const int row_shift = static_cast(tx_height == 16); + const bool should_round = (tx_height == 8); + + if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + // Process 4 1d adst4 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift); + data += 16; + i -= 4; + } while (i != 0); +} + +void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + // Process 4 1d adst4 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // Process 4 1d adst8 rows in parallel per iteration. + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + Adst8_NEON(data, /*step=*/8, + /*transpose=*/true, row_shift); + data += 32; + i -= 4; + } while (i != 0); +} + +void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + // Process 4 1d adst8 columns in parallel per iteration. 
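+ // The ADST columns are written back below with enable_flip_rows so that
+ // the transform types in kTransformFlipRowsMask are mirrored vertically
+ // while being added into the frame.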
+ int i = tx_width; + auto* data = src; + do { + Adst8_NEON(data, tx_width, /*transpose=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + do { + // Process 4 1d adst16 rows in parallel per iteration. + Adst16_NEON(src, 16, /*is_row=*/true, row_shift); + src += 64; + i -= 4; + } while (i != 0); +} + +void Adst16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + int i = tx_width; + auto* data = src; + do { + // Process 4 1d adst16 columns in parallel per iteration. + Adst16_NEON(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Identity4TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize4x4) { + return; + } + + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + + if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + const int shift = tx_height > 8 ? 1 : 0; + int i = adjusted_tx_height; + do { + Identity4_NEON(src, /*step=*/4, shift); + src += 16; + i -= 4; + } while (i != 0); +} + +void Identity4TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, + void* dst_frame) { + auto& frame = *static_cast*>(dst_frame); + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + // Special case: Process row calculations during column transform call. 
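+ // For 4x4 and 8x4 identity-identity blocks the row scaling, column scaling
+ // and add-to-frame are fused into Identity4RowColumnStoreToFrame below,
+ // which is why the corresponding row loops return early.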
+ if (tx_type == kTransformTypeIdentityIdentity && + (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) { + Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); + return; + } + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity8TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize8x4) { + return; + } + + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height == 8 and tx_height == 16 can be simplified + // from ((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed 16 + // bit value. + if ((tx_height & 0x18) != 0) { + for (int i = 0; i < tx_height; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]); + const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]); + vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo))); + vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi))); + } + return; + } + if (tx_height == 32) { + int i = adjusted_tx_height; + do { + Identity8Row32_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); + return; + } + + assert(tx_size == kTransformSize8x4); + int i = adjusted_tx_height; + do { + Identity8Row4_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); +} + +void Identity8TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, + void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + auto& frame = *static_cast*>(dst_frame); + IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + int i = adjusted_tx_height; + do { + Identity16Row_NEON(src, /*step=*/16, row_shift); + src += 64; + i -= 4; + } while (i != 0); +} + +void Identity16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* src_buffer, int start_x, + int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + 
FlipColumns<16>(src, tx_width); + } + auto& frame = *static_cast*>(dst_frame); + IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +//------------------------------------------------------------------------------ + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + // Maximum transform size for Dct is 64. + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = + Dct4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = + Dct4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = + Dct8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = + Dct8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = + Dct16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = + Dct16TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = + Dct32TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = + Dct32TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = + Dct64TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = + Dct64TransformLoopColumn_NEON; + + // Maximum transform size for Adst is 16. + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = + Adst4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = + Adst4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = + Adst8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = + Adst8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = + Adst16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = + Adst16TransformLoopColumn_NEON; + + // Maximum transform size for Identity transform is 32. 
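+  // Only the 4, 8 and 16 point Identity transforms have 10bpp NEON
+  // implementations here; the 32 point case keeps whatever entry was
+  // installed before this initializer runs.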
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = + Identity4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = + Identity4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = + Identity8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = + Identity8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = + Identity16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = + Identity16TransformLoopColumn_NEON; +} + +} // namespace + +void InverseTransformInit10bpp_NEON() { Init10bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10 +namespace libgav1 { +namespace dsp { + +void InverseTransformInit10bpp_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc index 072991a..315d5e9 100644 --- a/src/dsp/arm/inverse_transform_neon.cc +++ b/src/dsp/arm/inverse_transform_neon.cc @@ -3117,7 +3117,7 @@ void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h index af647e8..91e0e83 100644 --- a/src/dsp/arm/inverse_transform_neon.h +++ b/src/dsp/arm/inverse_transform_neon.h @@ -26,6 +26,7 @@ namespace dsp { // Initializes Dsp::inverse_transforms, see the defines below for specifics. // This function is not thread-safe. 
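// The 10bpp initializer declared below follows the same conventions for the
// 10 bit transform tables.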
void InverseTransformInit_NEON(); +void InverseTransformInit10bpp_NEON(); } // namespace dsp } // namespace libgav1 @@ -47,6 +48,21 @@ void InverseTransformInit_NEON(); #define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc index 146c983..8d72892 100644 --- a/src/dsp/arm/loop_filter_neon.cc +++ b/src/dsp/arm/loop_filter_neon.cc @@ -35,7 +35,7 @@ namespace { // (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) { const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh)); - return vorr_u8(a, RightShift<32>(a)); + return vorr_u8(a, RightShiftVector<32>(a)); } // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh @@ -44,7 +44,7 @@ inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1, const uint8x8x2_t a = Interleave32(p0q0, p1q1); const uint8x8_t b = vabd_u8(a.val[0], a.val[1]); const uint8x8_t p0q0_double = vqadd_u8(b, b); - const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1)); + const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1)); const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half); return vcle_u8(c, vdup_n_u8(outer_thresh)); } @@ -56,7 +56,7 @@ inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1, const uint8_t inner_thresh, const uint8_t outer_thresh) { const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a)); + const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -121,7 +121,7 @@ inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1, vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l)); // Need to shift the second term or we end up with a2_ma2. 
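   // The interleave below builds {a2, -a1}, so the single widening add that
   // follows computes p0 + a2 in the low lanes and q0 - a1 in the high lanes.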
const int8x8_t a2_ma1 = - InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1))); + InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1))); const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1); *p1q1_result = vqmovun_s16(p1q1_a3); @@ -251,7 +251,7 @@ inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1, const uint8x8_t abd_p0p2_q0q2) { const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2); const uint8x8_t b = vcle_u8(a, vdup_n_u8(1)); - return vand_u8(b, RightShift<32>(b)); + return vand_u8(b, RightShiftVector<32>(b)); } // abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && @@ -264,7 +264,7 @@ inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1, const uint8_t outer_thresh) { const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2); const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b)); + const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -482,7 +482,7 @@ inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0, const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1); const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2); const uint8x8_t c = vcle_u8(b, vdup_n_u8(1)); - return vand_u8(c, RightShift<32>(c)); + return vand_u8(c, RightShiftVector<32>(c)); } // abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && @@ -498,7 +498,7 @@ inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1, const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2); const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3); const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c)); + const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -1179,7 +1179,7 @@ void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc index 337c9b4..e6ceb66 100644 --- a/src/dsp/arm/loop_restoration_neon.cc +++ b/src/dsp/arm/loop_restoration_neon.cc @@ -40,11 +40,26 @@ inline uint8x8_t VshrU128(const uint8x8x2_t src) { return vext_u8(src.val[0], src.val[1], bytes); } +template +inline uint8x8_t VshrU128(const uint8x8_t src[2]) { + return vext_u8(src[0], src[1], bytes); +} + +template +inline uint8x16_t VshrU128(const uint8x16_t src[2]) { + return vextq_u8(src[0], src[1], bytes); +} + template inline uint16x8_t VshrU128(const uint16x8x2_t src) { return vextq_u16(src.val[0], src.val[1], bytes / 2); } +template +inline uint16x8_t VshrU128(const uint16x8_t src[2]) { + return vextq_u16(src[0], src[1], bytes / 2); +} + // Wiener // Must make a local copy of coefficients to help compiler know that they have @@ -177,18 +192,17 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, int16_t** const wiener_buffer) { for (int y = height; y != 0; --y) { const uint8_t* src_ptr = src; - uint8x16_t s[4]; - s[0] = vld1q_u8(src_ptr); + uint8x16_t s[3]; ptrdiff_t x = width; do { - src_ptr += 16; - s[3] = vld1q_u8(src_ptr); - s[1] = vextq_u8(s[0], s[3], 1); - s[2] = vextq_u8(s[0], s[3], 2); + // Slightly faster than using vextq_u8(). 
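+      // Three overlapping loads give the shifted 3-tap windows directly,
+      // rather than deriving s[1] and s[2] from two loads with vextq_u8().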
+ s[0] = vld1q_u8(src_ptr); + s[1] = vld1q_u8(src_ptr + 1); + s[2] = vld1q_u8(src_ptr + 2); int16x8x2_t sum; sum.val[0] = sum.val[1] = vdupq_n_s16(0); WienerHorizontalSum(s, filter, sum, *wiener_buffer); - s[0] = s[3]; + src_ptr += 16; *wiener_buffer += 16; x -= 16; } while (x != 0); @@ -476,12 +490,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, // For width 16 and up, store the horizontal results, and then do the vertical // filter row by row. This is faster than doing it column by column when // considering cache issues. -void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_NEON( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -509,39 +523,42 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, const auto* const top = static_cast(top_border); const auto* const bottom = static_cast(bottom_border); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, filter_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, filter_horizontal, + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. 
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, filter_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -574,13 +591,20 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, //------------------------------------------------------------------------------ // SGR -inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) { +inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) { dst[0] = VshrU128<0>(src); dst[1] = VshrU128<1>(src); dst[2] = VshrU128<2>(src); } -inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3], +template +inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) { + dst[0] = VshrU128(src); + dst[1] = VshrU128(src); + dst[2] = VshrU128(src); +} + +inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3], uint16x4_t high[3]) { uint16x8_t s[3]; s[0] = VshrU128<0>(src); @@ -594,7 +618,7 @@ inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3], high[2] = vget_high_u16(s[2]); } -inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) { +inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) { dst[0] = VshrU128<0>(src); dst[1] = VshrU128<1>(src); dst[2] = VshrU128<2>(src); @@ -602,7 +626,16 @@ inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) { dst[4] = VshrU128<4>(src); } -inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5], +template +inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) { + dst[0] = VshrU128(src); + dst[1] = VshrU128(src); + dst[2] = VshrU128(src); + dst[3] = VshrU128(src); + dst[4] = VshrU128(src); +} + +inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5], uint16x4_t high[5]) { Prepare3_16(src, low, high); const uint16x8_t s3 = VshrU128<6>(src); @@ -641,6 +674,30 @@ inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) { return vaddw_u8(sum, src[2]); } +inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) { + const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1])); + return vaddw_u8(sum, vget_low_u8(src[2])); +} + +inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) { + const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1])); + return vaddw_u8(sum, vget_high_u8(src[2])); +} + +inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) { + const uint16x8_t 
sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1])); + const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3])); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddw_u8(sum, vget_low_u8(src[4])); +} + +inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) { + const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1])); + const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3])); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddw_u8(sum, vget_high_u8(src[4])); +} + inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) { const uint32x4_t sum = vaddl_u16(src[0], src[1]); return vaddw_u16(sum, src[2]); @@ -678,13 +735,28 @@ inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) { return vaddw_u16(sum0123, src[4]); } -inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) { +inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) { uint8x8_t s[3]; Prepare3_8(src, s); return Sum3W_16(s); } -inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) { +inline uint16x8_t Sum3Horizontal(const uint8x16_t src) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return Sum3Horizontal(s); +} + +template +inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) { + uint8x16_t s[3]; + Prepare3_8(src, s); + dst[0] = Sum3WLo16(s); + dst[1] = Sum3WHi16(s); +} + +inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t sum; Prepare3_16(src, low, high); @@ -693,7 +765,7 @@ inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) { return sum; } -inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) { +inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) { uint8x8_t s[5]; Prepare5_8(src, s); const uint16x8_t sum01 = vaddl_u8(s[0], s[1]); @@ -702,7 +774,23 @@ inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) { return vaddw_u8(sum0123, s[4]); } -inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) { +inline uint16x8_t Sum5Horizontal(const uint8x16_t src) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return Sum5Horizontal(s); +} + +template +inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0, + uint16x8_t* const dst1) { + uint8x16_t s[5]; + Prepare5_8(src, s); + *dst0 = Sum5WLo16(s); + *dst1 = Sum5WHi16(s); +} + +inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) { uint16x4_t low[5], high[5]; Prepare5_16(src, low, high); uint32x4x2_t sum; @@ -711,35 +799,68 @@ inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) { return sum; } -void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3, - uint32x4_t* const row_sq5) { - const uint32x4_t sum04 = vaddl_u16(src[0], src[4]); - const uint32x4_t sum12 = vaddl_u16(src[1], src[2]); - *row_sq3 = vaddw_u16(sum12, src[3]); - *row_sq5 = vaddq_u32(sum04, *row_sq3); +template +void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0, + uint16x8_t* const row3_1, uint16x8_t* const row5_0, + uint16x8_t* const row5_1) { + uint8x16_t s[5]; + Prepare5_8(src, s); + const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4])); + const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4])); + *row3_0 = Sum3WLo16(s + 1); + *row3_1 = Sum3WHi16(s + 1); + *row5_0 = vaddq_u16(sum04_lo, *row3_0); + *row5_1 = vaddq_u16(sum04_hi, *row3_1); } -void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq, - uint16x8_t* const row3, uint16x8_t* const row5, - uint32x4x2_t* const 
row_sq3, uint32x4x2_t* const row_sq5) { +void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3, + uint16x8_t* const row5) { uint8x8_t s[5]; Prepare5_8(src, s); const uint16x8_t sum04 = vaddl_u8(s[0], s[4]); const uint16x8_t sum12 = vaddl_u8(s[1], s[2]); *row3 = vaddw_u8(sum12, s[3]); *row5 = vaddq_u16(sum04, *row3); +} + +void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3, + uint32x4_t* const row_sq5) { + const uint32x4_t sum04 = vaddl_u16(src[0], src[4]); + const uint32x4_t sum12 = vaddl_u16(src[1], src[2]); + *row_sq3 = vaddw_u16(sum12, src[3]); + *row_sq5 = vaddq_u32(sum04, *row_sq3); +} + +void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3, + uint32x4x2_t* const row_sq5) { uint16x4_t low[5], high[5]; Prepare5_16(sq, low, high); SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]); SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]); } -inline uint16x8_t Sum343(const uint8x8x2_t src) { - uint8x8_t s[3]; - Prepare3_8(src, s); - const uint16x8_t sum = Sum3W_16(s); +void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2], + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + SumHorizontal(src, row3, row5); + SumHorizontal(sq, row_sq3, row_sq5); +} + +void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2], + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5); +} + +template +inline uint16x8_t Sum343(const uint8x16_t ma3[2]) { + const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3); const uint16x8_t sum3 = Sum3_16(sum, sum, sum); - return vaddw_u8(sum3, s[1]); + return vaddw_u8(sum3, + (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1])); } inline uint32x4_t Sum343W(const uint16x4_t src[3]) { @@ -748,7 +869,7 @@ inline uint32x4_t Sum343W(const uint16x4_t src[3]) { return vaddw_u16(sum3, src[1]); } -inline uint32x4x2_t Sum343W(const uint16x8x2_t src) { +inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t d; Prepare3_16(src, low, high); @@ -757,13 +878,13 @@ inline uint32x4x2_t Sum343W(const uint16x8x2_t src) { return d; } -inline uint16x8_t Sum565(const uint8x8x2_t src) { - uint8x8_t s[3]; - Prepare3_8(src, s); - const uint16x8_t sum = Sum3W_16(s); +template +inline uint16x8_t Sum565(const uint8x16_t ma5[2]) { + const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5); const uint16x8_t sum4 = vshlq_n_u16(sum, 2); const uint16x8_t sum5 = vaddq_u16(sum4, sum); - return vaddw_u8(sum5, s[1]); + return vaddw_u8(sum5, + (offset == 0) ? 
vget_low_u8(ma5[1]) : vget_high_u8(ma5[1])); } inline uint32x4_t Sum565W(const uint16x4_t src[3]) { @@ -773,7 +894,7 @@ inline uint32x4_t Sum565W(const uint16x4_t src[3]) { return vaddw_u16(sum5, src[1]); } -inline uint32x4x2_t Sum565W(const uint16x8x2_t src) { +inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t d; Prepare3_16(src, low, high); @@ -783,21 +904,21 @@ inline uint32x4x2_t Sum565W(const uint16x8x2_t src) { } inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, - const int height, const ptrdiff_t sum_stride, uint16_t* sum3, - uint16_t* sum5, uint32_t* square_sum3, - uint32_t* square_sum5) { - int y = height; + const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + int y = 2; + // Don't change loop width to 16, which is even slower. do { - uint8x8x2_t s; - uint16x8x2_t sq; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); + uint8x8_t s[2]; + uint16x8_t sq[2]; + s[0] = vld1_u8(src); + sq[0] = vmull_u8(s[0], s[0]); ptrdiff_t x = 0; do { uint16x8_t row3, row5; uint32x4x2_t row_sq3, row_sq5; - s.val[1] = vld1_u8(src + x + 8); - sq.val[1] = vmull_u8(s.val[1], s.val[1]); + s[1] = vld1_u8(src + x + 8); + sq[1] = vmull_u8(s[1], s[1]); SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5); vst1q_u16(sum3, row3); vst1q_u16(sum5, row5); @@ -805,8 +926,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, vst1q_u32(square_sum3 + 4, row_sq3.val[1]); vst1q_u32(square_sum5 + 0, row_sq5.val[0]); vst1q_u32(square_sum5 + 4, row_sq5.val[1]); - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + s[0] = s[1]; + sq[0] = sq[1]; sum3 += 8; sum5 += 8; square_sum3 += 8; @@ -819,21 +940,22 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, template inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, - const int height, const ptrdiff_t sum_stride, uint16_t* sums, + const ptrdiff_t sum_stride, uint16_t* sums, uint32_t* square_sums) { static_assert(size == 3 || size == 5, ""); - int y = height; + int y = 2; + // Don't change loop width to 16, which is even slower. do { - uint8x8x2_t s; - uint16x8x2_t sq; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); + uint8x8_t s[2]; + uint16x8_t sq[2]; + s[0] = vld1_u8(src); + sq[0] = vmull_u8(s[0], s[0]); ptrdiff_t x = 0; do { uint16x8_t row; uint32x4x2_t row_sq; - s.val[1] = vld1_u8(src + x + 8); - sq.val[1] = vmull_u8(s.val[1], s.val[1]); + s[1] = vld1_u8(src + x + 8); + sq[1] = vmull_u8(s[1], s[1]); if (size == 3) { row = Sum3Horizontal(s); row_sq = Sum3WHorizontal(sq); @@ -844,8 +966,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, vst1q_u16(sums, row); vst1q_u32(square_sums + 0, row_sq.val[0]); vst1q_u32(square_sums + 4, row_sq.val[1]); - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + s[0] = s[1]; + sq[0] = sq[1]; sums += 8; square_sums += 8; x += 8; @@ -871,10 +993,18 @@ inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq, return vmovn_u32(shifted); } -template +inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index, + const int threshold) { + const uint8x8_t thresholds = vdup_n_u8(threshold); + const uint8x8_t offset = vcgt_u8(index, thresholds); + // Adding 255 is equivalent to subtracting 1 for 8-bit data. 
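+  // vcgt_u8() sets a lane to 0xff when index > threshold, so the modular add
+  // below decrements |value| by 1 in exactly those lanes, e.g. 5 + 0xff == 4.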
+ return vadd_u8(value, offset); +} + +template inline void CalculateIntermediate(const uint16x8_t sum, const uint32x4x2_t sum_sq, - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { constexpr uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; @@ -882,19 +1012,39 @@ inline void CalculateIntermediate(const uint16x8_t sum, const uint16x4_t z1 = CalculateMa(vget_high_u16(sum), sum_sq.val[1], scale); const uint16x8_t z01 = vcombine_u16(z0, z1); - // Using vqmovn_u16() needs an extra sign extension instruction. - const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255)); - // Using vgetq_lane_s16() can save the sign extension instruction. - const uint8_t lookup[8] = { - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]}; - *ma = vld1_u8(lookup); + const uint8x8_t idx = vqmovn_u16(z01); + // Use table lookup to read elements whose indices are less than 48. + // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than + // using two uint8x8x3_t vectors. + uint8x8x4_t table0; + uint8x8x2_t table1; + table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8); + table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8); + table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8); + table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8); + table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8); + table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8); + // All elements whose indices are out of range [0, 47] are set to 0. + uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31]. + // Subtract 8 to shuffle the next index range. + const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32)); + const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47]. + // Use OR instruction to combine shuffle results together. + val = vorr_u8(val, res); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Elements whose indices are larger than 47 (with value 0) are set to 5. + val = vmax_u8(val, vdup_n_u8(5)); + val = AdjustValue(val, idx, 55); // 55 is the last index which value is 5. + val = AdjustValue(val, idx, 72); // 72 is the last index which value is 4. + val = AdjustValue(val, idx, 101); // 101 is the last index which value is 3. + val = AdjustValue(val, idx, 169); // 169 is the last index which value is 2. + val = AdjustValue(val, idx, 254); // 254 is the last index which value is 1. + *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma)) + : vcombine_u8(vget_low_u8(*ma), val); + // b = ma * b * one_over_n // |ma| = [0, 255] // |sum| is a box sum with radius 1 or 2. @@ -906,7 +1056,8 @@ inline void CalculateIntermediate(const uint16x8_t sum, // |kSgrProjReciprocalBits| is 12. // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). - const uint16x8_t maq = vmovl_u8(*ma); + const uint16x8_t maq = + vmovl_u8((offset == 0) ? 
vget_low_u8(*ma) : vget_high_u8(*ma)); const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum)); const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum)); const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n); @@ -916,37 +1067,39 @@ inline void CalculateIntermediate(const uint16x8_t sum, *b = vcombine_u16(b_lo, b_hi); } +template inline void CalculateIntermediate5(const uint16x8_t s5[5], const uint32x4x2_t sq5[5], - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { const uint16x8_t sum = Sum5_16(s5); const uint32x4x2_t sum_sq = Sum5_32(sq5); - CalculateIntermediate<25>(sum, sum_sq, scale, ma, b); + CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b); } +template inline void CalculateIntermediate3(const uint16x8_t s3[3], const uint32x4x2_t sq3[3], - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { const uint16x8_t sum = Sum3_16(s3); const uint32x4x2_t sum_sq = Sum3_32(sq3); - CalculateIntermediate<9>(sum, sum_sq, scale, ma, b); + CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, +template +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16x8_t* const sum_ma343, uint16x8_t* const sum_ma444, uint32x4x2_t* const sum_b343, uint32x4x2_t* const sum_b444, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { - uint8x8_t s[3]; - Prepare3_8(ma3, s); - const uint16x8_t sum_ma111 = Sum3W_16(s); + const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3); *sum_ma444 = vshlq_n_u16(sum_ma111, 2); const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111); - *sum_ma343 = vaddw_u8(sum333, s[1]); + *sum_ma343 = vaddw_u8( + sum333, (offset == 0) ? 
vget_low_u8(ma3[1]) : vget_high_u8(ma3[1])); uint16x4_t low[3], high[3]; uint32x4x2_t sum_b111; Prepare3_16(b3, low, high); @@ -966,93 +1119,211 @@ inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, vst1q_u32(b444 + x + 4, sum_b444->val[1]); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, +template +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16x8_t* const sum_ma343, uint32x4x2_t* const sum_b343, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { uint16x8_t sum_ma444; uint32x4x2_t sum_b444; - Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343, - ma444, b343, b444); + Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, + ma343, ma444, b343, b444); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, +template +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { uint16x8_t sum_ma343; uint32x4x2_t sum_b343; - Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444); + Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, + b444); } -LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( - const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, - const uint32_t scale, uint16_t* const sum5[5], - uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2], - uint8x8_t* const ma, uint16x8_t* const b) { +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale, + uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5], + uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) { uint16x8_t s5[5]; uint32x4x2_t sq5[5]; - s[0].val[1] = vld1_u8(src0 + x + 8); - s[1].val[1] = vld1_u8(src1 + x + 8); - sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]); - sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]); - s5[3] = Sum5Horizontal(s[0]); - s5[4] = Sum5Horizontal(s[1]); + s[0][0] = vld1q_u8(src0); + s[1][0] = vld1q_u8(src1); + sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0])); + sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0])); + sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0])); + sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0])); + s5[3] = Sum5Horizontal(s[0][0]); + s5[4] = Sum5Horizontal(s[1][0]); sq5[3] = Sum5WHorizontal(sq[0]); sq5[4] = Sum5WHorizontal(sq[1]); - vst1q_u16(sum5[3] + x, s5[3]); - vst1q_u16(sum5[4] + x, s5[4]); + vst1q_u16(sum5[3], s5[3]); + vst1q_u16(sum5[4], s5[4]); + vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, + const uint32_t scale, uint8x16_t s[2][2], 
uint16_t* const sum5[5], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2], + uint16x8_t b[2]) { + uint16x8_t s5[2][5]; + uint32x4x2_t sq5[5]; + s[0][1] = vld1q_u8(src0 + x + 8); + s[1][1] = vld1q_u8(src1 + x + 8); + sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1])); + sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1])); + Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]); + Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]); + sq5[3] = Sum5WHorizontal(sq[0] + 1); + sq5[4] = Sum5WHorizontal(sq[1] + 1); + vst1q_u16(sum5[3] + x, s5[0][3]); + vst1q_u16(sum5[4] + x, s5[0][4]); vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]); vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]); vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]); vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - CalculateIntermediate5(s5, sq5, scale, ma, b); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]); + + sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1])); + sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1])); + sq5[3] = Sum5WHorizontal(sq[0] + 2); + sq5[4] = Sum5WHorizontal(sq[1] + 2); + vst1q_u16(sum5[3] + x + 8, s5[1][3]); + vst1q_u16(sum5[4] + x + 8, s5[1][4]); + vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const uint8_t* const src, const uint32_t scale, uint8x16_t* const s, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) { + uint16x8_t s5[5]; + uint32x4x2_t sq5[5]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + s5[3] = s5[4] = Sum5Horizontal(*s); + sq5[3] = sq5[4] = Sum5WHorizontal(sq); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( const uint8_t* const src, const ptrdiff_t x, const uint32_t scale, - 
const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma, - uint16x8_t* const b) { - uint16x8_t s5[5]; + uint8x16_t s[2], const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2], + uint16x8_t b[2]) { + uint16x8_t s5[2][5]; uint32x4x2_t sq5[5]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - s5[3] = s5[4] = Sum5Horizontal(*s); - sq5[3] = sq5[4] = Sum5WHorizontal(*sq); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s[1] = vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]); + sq5[3] = sq5[4] = Sum5WHorizontal(sq); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); + s5[0][4] = s5[0][3]; sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - CalculateIntermediate5(s5, sq5, scale, ma, b); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + s5[1][4] = s5[1][3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const uint8_t* const src, const uint32_t scale, uint8x16_t* const s, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2], + uint8x16_t* const ma, uint16x8_t* const b) { + uint16x8_t s3[3]; + uint32x4x2_t sq3[3]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + s3[2] = Sum3Horizontal(*s); + sq3[2] = Sum3WHorizontal(sq); + vst1q_u16(sum3[2], s3[2]); + vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + CalculateIntermediate3<0>(s3, sq3, scale, ma, b); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( const uint8_t* const src, const ptrdiff_t x, const uint32_t scale, - uint16_t* const sum3[3], uint32_t* const square_sum3[3], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma, - uint16x8_t* const b) { - uint16x8_t s3[3]; + uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2], + uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) { + uint16x8_t s3[4]; uint32x4x2_t sq3[3]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - s3[2] = Sum3Horizontal(*s); - sq3[2] = Sum3WHorizontal(*sq); + s[1] = 
vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + Sum3Horizontal<8>(s, s3 + 2); + sq3[2] = Sum3WHorizontal(sq); vst1q_u16(sum3[2] + x, s3[2]); vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]); vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]); @@ -1062,71 +1333,204 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - CalculateIntermediate3(s3, sq3, scale, ma, b); + CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + sq3[2] = Sum3WHorizontal(sq + 1); + vst1q_u16(sum3[2] + x + 8, s3[3]); + vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]); + s3[1] = vld1q_u16(sum3[0] + x + 8); + s3[2] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const uint8_t* const src0, const uint8_t* const src1, + const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2], + uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) { + uint16x8_t s3[4], s5[5]; + uint32x4x2_t sq3[4], sq5[5]; + s[0][0] = vld1q_u8(src0); + s[1][0] = vld1q_u8(src1); + sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0])); + sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0])); + sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0])); + sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0])); + SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]); + SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]); + vst1q_u16(sum3[2], s3[2]); + vst1q_u16(sum3[3], s3[3]); + vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]); + vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]); + vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]); + vst1q_u16(sum5[3], s5[3]); + vst1q_u16(sum5[4], s5[4]); + vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]); + CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, 
const ptrdiff_t x, - const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], - uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0, - uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1, - uint8x8_t* const ma5, uint16x8_t* const b5) { - uint16x8_t s3[4], s5[5]; + const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2], + uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) { + uint16x8_t s3[2][4], s5[2][5]; uint32x4x2_t sq3[4], sq5[5]; - s[0].val[1] = vld1_u8(src0 + x + 8); - s[1].val[1] = vld1_u8(src1 + x + 8); - sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]); - sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]); - SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]); - SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]); - vst1q_u16(sum3[2] + x, s3[2]); - vst1q_u16(sum3[3] + x, s3[3]); + s[0][1] = vld1q_u8(src0 + x + 8); + s[1][1] = vld1q_u8(src1 + x + 8); + sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1])); + sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1])); + SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]); + SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x, s3[0][2]); + vst1q_u16(sum3[3] + x, s3[0][3]); vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]); vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]); vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]); vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]); - vst1q_u16(sum5[3] + x, s5[3]); - vst1q_u16(sum5[4] + x, s5[4]); + vst1q_u16(sum5[3] + x, s5[0][3]); + vst1q_u16(sum5[4] + x, s5[0][4]); vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]); vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]); vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]); vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]); - s3[0] = vld1q_u16(sum3[0] + x); - s3[1] = vld1q_u16(sum3[1] + x); + s3[0][0] = vld1q_u16(sum3[0] + x); + s3[0][1] = vld1q_u16(sum3[1] + x); sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0); sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0); - CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1); - CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); + CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]); + CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0], + &b3[1][1]); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]); + + sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1])); + sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1])); + 
SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]); + SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x + 8, s3[1][2]); + vst1q_u16(sum3[3] + x + 8, s3[1][3]); + vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]); + vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]); + vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]); + vst1q_u16(sum5[3] + x + 8, s5[1][3]); + vst1q_u16(sum5[4] + x + 8, s5[1][4]); + vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]); + s3[1][0] = vld1q_u16(sum3[0] + x + 8); + s3[1][1] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]); + CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1], + &b3[1][2]); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const uint8_t* const src, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3, + uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) { + uint16x8_t s3[3], s5[5]; + uint32x4x2_t sq3[3], sq5[5]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + s5[4] = s5[3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + sq5[4] = sq5[3]; + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2], const uint16_t* const sum3[4], const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3, - uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) { - uint16x8_t s3[3], s5[5]; + uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t 
ma3[2], uint8x16_t ma5[2], + uint16x8_t b3[2], uint16x8_t b5[2]) { + uint16x8_t s3[2][3], s5[2][5]; uint32x4x2_t sq3[3], sq5[5]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); - s5[4] = s5[3]; + s[1] = vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal(sq, &sq3[2], &sq5[3]); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); + s5[0][4] = s5[0][3]; sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); @@ -1134,14 +1538,36 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); sq5[4] = sq5[3]; - CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); - s3[0] = vld1q_u16(sum3[0] + x); - s3[1] = vld1q_u16(sum3[1] + x); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]); + s3[0][0] = vld1q_u16(sum3[0] + x); + s3[0][1] = vld1q_u16(sum3[1] + x); sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0); sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); + CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + SumHorizontal(sq + 1, &sq3[2], &sq5[3]); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + s5[1][4] = s5[1][3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + sq5[4] = sq5[3]; + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]); + s3[1][0] = vld1q_u16(sum3[0] + x + 8); + s3[1][1] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]); } inline void BoxSumFilterPreProcess5(const uint8_t* const src0, @@ -1150,33 +1576,39 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0, uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, uint32_t* b565) { - uint8x8x2_t s[2], mas; - uint16x8x2_t sq[2], bs; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2][2], mas[2]; + uint16x8_t sq[2][4], bs[3]; + BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - 
BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq, - &mas.val[1], &bs.val[1]); - const uint16x8_t ma = Sum565(mas); - const uint32x4x2_t b = Sum565W(bs); - vst1q_u16(ma565, ma); - vst1q_u32(b565 + 0, b.val[0]); - vst1q_u32(b565 + 4, b.val[1]); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - ma565 += 8; - b565 += 8; - x += 8; + uint16x8_t ma[2]; + uint8x16_t masx[3]; + uint32x4x2_t b[2]; + BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq, + mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[0] = Sum565<0>(masx); + b[0] = Sum565W(bs); + vst1q_u16(ma565, ma[0]); + vst1q_u32(b565 + 0, b[0].val[0]); + vst1q_u32(b565 + 4, b[0].val[1]); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + vst1q_u16(ma565 + 8, ma[1]); + vst1q_u32(b565 + 8, b[1].val[0]); + vst1q_u32(b565 + 12, b[1].val[1]); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1185,35 +1617,44 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( const uint8_t* const src, const int width, const uint32_t scale, uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343, uint16_t* ma444, uint32_t* b343, uint32_t* b444) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0], - &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[3]; + BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq, - &mas.val[1], &bs.val[1]); + uint8x16_t ma3x[3]; + BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas, + bs + 1); + Prepare3_8<0>(mas, ma3x); if (calculate444) { - Store343_444(mas, bs, 0, ma343, ma444, b343, b444); - ma444 += 8; - b444 += 8; + Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8, + b444 + 8); + ma444 += 16; + b444 += 16; } else { - const uint16x8_t ma = Sum343(mas); - const uint32x4x2_t b = Sum343W(bs); - vst1q_u16(ma343, ma); - vst1q_u32(b343 + 0, b.val[0]); - vst1q_u32(b343 + 4, b.val[1]); + uint16x8_t ma[2]; + uint32x4x2_t b[2]; + ma[0] = Sum343<0>(ma3x); + b[0] = Sum343W(bs); + vst1q_u16(ma343, ma[0]); + vst1q_u32(b343 + 0, b[0].val[0]); + vst1q_u32(b343 + 4, b[0].val[1]); + ma[1] = Sum343<8>(ma3x); + b[1] = Sum343W(bs + 1); + vst1q_u16(ma343 + 8, ma[1]); + vst1q_u32(b343 + 8, b[1].val[0]); + vst1q_u32(b343 + 12, b[1].val[1]); } - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - ma343 += 8; - b343 += 8; - x += 8; + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + ma343 += 16; + b343 += 16; + x += 16; } while (x < width); } @@ -1221,48 +1662,58 @@ inline void BoxSumFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const int width, const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565, - uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) { - uint8x8x2_t s[2]; - uint8x8x2_t ma3[2], ma5; - uint16x8x2_t sq[2], b3[2], b5; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], 
s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0], - &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]); + uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565, + uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) { + uint8x16_t s[2][2], ma3[2][2], ma5[2]; + uint16x8_t sq[2][4], b3[2][3], b5[3]; + BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, &ma5[0], &b5[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1], - &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]); - uint16x8_t ma = Sum343(ma3[0]); - uint32x4x2_t b = Sum343W(b3[0]); - vst1q_u16(ma343[0] + x, ma); - vst1q_u32(b343[0] + x, b.val[0]); - vst1q_u32(b343[0] + x + 4, b.val[1]); - Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]); - ma = Sum565(ma5); - b = Sum565W(b5); - vst1q_u16(ma565, ma); - vst1q_u32(b565 + 0, b.val[0]); - vst1q_u32(b565 + 4, b.val[1]); - ma3[0].val[0] = ma3[0].val[1]; - ma3[1].val[0] = ma3[1].val[1]; - b3[0].val[0] = b3[0].val[1]; - b3[1].val[0] = b3[1].val[1]; - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - ma565 += 8; - b565 += 8; - x += 8; + uint16x8_t ma[2]; + uint8x16_t ma3x[3], ma5x[3]; + uint32x4x2_t b[2]; + BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, ma5, b5 + 1); + Prepare3_8<0>(ma3[0], ma3x); + ma[0] = Sum343<0>(ma3x); + ma[1] = Sum343<8>(ma3x); + b[0] = Sum343W(b3[0] + 0); + b[1] = Sum343W(b3[0] + 1); + vst1q_u16(ma343[0] + x, ma[0]); + vst1q_u16(ma343[0] + x + 8, ma[1]); + vst1q_u32(b343[0] + x, b[0].val[0]); + vst1q_u32(b343[0] + x + 4, b[0].val[1]); + vst1q_u32(b343[0] + x + 8, b[1].val[0]); + vst1q_u32(b343[0] + x + 12, b[1].val[1]); + Prepare3_8<0>(ma3[1], ma3x); + Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444); + Prepare3_8<0>(ma5, ma5x); + ma[0] = Sum565<0>(ma5x); + ma[1] = Sum565<8>(ma5x); + b[0] = Sum565W(b5); + b[1] = Sum565W(b5 + 1); + vst1q_u16(ma565, ma[0]); + vst1q_u16(ma565 + 8, ma[1]); + vst1q_u32(b565 + 0, b[0].val[0]); + vst1q_u32(b565 + 4, b[0].val[1]); + vst1q_u32(b565 + 8, b[1].val[0]); + vst1q_u32(b565 + 12, b[1].val[1]); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + ma5[0] = ma5[1]; + b5[0] = b5[2]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1310,37 +1761,36 @@ inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s, return CalculateFilteredOutput<5>(s, ma_sum, b_sum); } -inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2], - uint8_t* const dst) { +inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) { const int16x4_t v_lo = vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits); const int16x4_t v_hi = vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits); const int16x8_t vv = vcombine_s16(v_lo, v_hi); - const int16x8_t s = ZeroExtend(src); - const int16x8_t d = vaddq_s16(s, vv); - vst1_u8(dst, vqmovun_s16(d)); + const int16x8_t d = + 
vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src)); + return vqmovun_s16(d); } -inline void SelfGuidedDoubleMultiplier(const uint8x8_t src, - const int16x8_t filter[2], const int w0, - const int w2, uint8_t* const dst) { +inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src, + const int16x8_t filter[2], + const int w0, const int w2) { int32x4_t v[2]; v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0); v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0); v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2); v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2); - SelfGuidedFinal(src, v, dst); + return SelfGuidedFinal(src, v); } -inline void SelfGuidedSingleMultiplier(const uint8x8_t src, - const int16x8_t filter, const int w0, - uint8_t* const dst) { +inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src, + const int16x8_t filter, + const int w0) { // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) int32x4_t v[2]; v[0] = vmull_n_s16(vget_low_s16(filter), w0); v[1] = vmull_n_s16(vget_high_s16(filter), w0); - SelfGuidedFinal(src, v, dst); + return SelfGuidedFinal(src, v); } LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( @@ -1349,43 +1799,60 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( uint32_t* const square_sum5[5], const int width, const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s[2], mas; - uint16x8x2_t sq[2], bs; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2][2], mas[2]; + uint16x8_t sq[2][4], bs[3]; + BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[2]; + uint8x16_t masx[3]; uint32x4x2_t b[2]; - ma[1] = Sum565(mas); + int16x8_t p0, p1; + BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq, + mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[1] = Sum565<0>(masx); b[1] = Sum565W(bs); vst1q_u16(ma565[1] + x, ma[1]); vst1q_u32(b565[1] + x + 0, b[1].val[0]); vst1q_u32(b565[1] + x + 4, b[1].val[1]); - const uint8x8_t sr0 = vld1_u8(src + x); - const uint8x8_t sr1 = vld1_u8(src + stride + x); - int16x8_t p0, p1; + const uint8x16_t sr0 = vld1q_u8(src + x); + const uint8x16_t sr1 = vld1q_u8(src + stride + x); + const uint8x8_t sr00 = vget_low_u8(sr0); + const uint8x8_t sr10 = vget_low_u8(sr1); ma[0] = vld1q_u16(ma565[0] + x); b[0].val[0] = vld1q_u32(b565[0] + x + 0); b[0].val[1] = vld1q_u32(b565[0] + x + 4); - p0 = CalculateFilteredOutputPass1(sr0, ma, b); - p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]); - SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x); - SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - x += 8; + p0 = CalculateFilteredOutputPass1(sr00, ma, b); + p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]); + const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0); + const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + vst1q_u16(ma565[1] + x + 8, ma[1]); + vst1q_u32(b565[1] + x + 8, b[1].val[0]); + vst1q_u32(b565[1] + 
x + 12, b[1].val[1]); + const uint8x8_t sr01 = vget_high_u8(sr0); + const uint8x8_t sr11 = vget_high_u8(sr1); + ma[0] = vld1q_u16(ma565[0] + x + 8); + b[0].val[0] = vld1q_u32(b565[0] + x + 8); + b[0].val[1] = vld1q_u32(b565[0] + x + 12); + p0 = CalculateFilteredOutputPass1(sr01, ma, b); + p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]); + const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0); + const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d00, d01)); + vst1q_u8(dst + stride + x, vcombine_u8(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; } while (x < width); } @@ -1396,34 +1863,45 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint32_t* const square_sum5[5], uint16_t* ma565, uint32_t* b565, uint8_t* const dst) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[4]; + BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[2]; + uint8x16_t masx[3]; uint32x4x2_t b[2]; - ma[1] = Sum565(mas); + BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5, + sq + 1, mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[1] = Sum565<0>(masx); b[1] = Sum565W(bs); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; ma[0] = vld1q_u16(ma565); b[0].val[0] = vld1q_u32(b565 + 0); b[0].val[1] = vld1q_u32(b565 + 4); - const uint8x8_t sr = vld1_u8(src + x); - const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b); - SelfGuidedSingleMultiplier(sr, p, w0, dst + x); - ma565 += 8; - b565 += 8; - x += 8; + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); + const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + bs[0] = bs[2]; + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma565 + 8); + b[0].val[0] = vld1q_u32(b565 + 8); + b[0].val[1] = vld1q_u32(b565 + 12); + const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1433,35 +1911,49 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( uint32_t* const square_sum3[3], uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2], uint8_t* const dst) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0], - &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[3]; + BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[3]; + uint8x16_t ma3x[3]; uint32x4x2_t b[3]; - 
Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2], - b444[1]); - const uint8x8_t sr = vld1_u8(src + x); + BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas, + bs + 1); + Prepare3_8<0>(mas, ma3x); + Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); ma[0] = vld1q_u16(ma343[0] + x); ma[1] = vld1q_u16(ma444[0] + x); b[0].val[0] = vld1q_u32(b343[0] + x + 0); b[0].val[1] = vld1q_u32(b343[0] + x + 4); b[1].val[0] = vld1q_u32(b444[0] + x + 0); b[1].val[1] = vld1q_u32(b444[0] + x + 4); - const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b); - SelfGuidedSingleMultiplier(sr, p, w0, dst + x); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - x += 8; + const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0); + + Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma343[0] + x + 8); + ma[1] = vld1q_u16(ma444[0] + x + 8); + b[0].val[0] = vld1q_u32(b343[0] + x + 8); + b[0].val[1] = vld1q_u32(b343[0] + x + 12); + b[1].val[0] = vld1q_u32(b444[0] + x + 8); + b[1].val[1] = vld1q_u32(b444[0] + x + 12); + const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; } while (x < width); } @@ -1474,64 +1966,96 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( uint16_t* const ma343[4], uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s[2], ma3[2], ma5; - uint16x8x2_t sq[2], b3[2], b5; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0], - &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]); + uint8x16_t s[2][2], ma3[2][2], ma5[2]; + uint16x8_t sq[2][4], b3[2][3], b5[3]; + BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, &ma5[0], &b5[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1], - &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]); uint16x8_t ma[3][3]; + uint8x16_t ma3x[2][3], ma5x[3]; uint32x4x2_t b[3][3]; - Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1], - ma343[2], ma444[1], b343[2], b444[1]); - Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2], - b343[3], b444[2]); - ma[0][1] = Sum565(ma5); + int16x8_t p[2][2]; + BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, ma5, b5 + 1); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + Prepare3_8<0>(ma5, 
ma5x); + ma[0][1] = Sum565<0>(ma5x); b[0][1] = Sum565W(b5); vst1q_u16(ma565[1] + x, ma[0][1]); vst1q_u32(b565[1] + x, b[0][1].val[0]); vst1q_u32(b565[1] + x + 4, b[0][1].val[1]); - ma3[0].val[0] = ma3[0].val[1]; - ma3[1].val[0] = ma3[1].val[1]; - b3[0].val[0] = b3[0].val[1]; - b3[1].val[0] = b3[1].val[1]; - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - int16x8_t p[2][2]; - const uint8x8_t sr0 = vld1_u8(src + x); - const uint8x8_t sr1 = vld1_u8(src + stride + x); + const uint8x16_t sr0 = vld1q_u8(src + x); + const uint8x16_t sr1 = vld1q_u8(src + stride + x); + const uint8x8_t sr00 = vget_low_u8(sr0); + const uint8x8_t sr10 = vget_low_u8(sr1); ma[0][0] = vld1q_u16(ma565[0] + x); b[0][0].val[0] = vld1q_u32(b565[0] + x); b[0][0].val[1] = vld1q_u32(b565[0] + x + 4); - p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]); - p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]); + p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]); ma[1][0] = vld1q_u16(ma343[0] + x); ma[1][1] = vld1q_u16(ma444[0] + x); b[1][0].val[0] = vld1q_u32(b343[0] + x); b[1][0].val[1] = vld1q_u32(b343[0] + x + 4); b[1][1].val[0] = vld1q_u32(b444[0] + x); b[1][1].val[1] = vld1q_u32(b444[0] + x + 4); - p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]); + p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]); ma[2][0] = vld1q_u16(ma343[1] + x); b[2][0].val[0] = vld1q_u32(b343[1] + x); b[2][0].val[1] = vld1q_u32(b343[1] + x + 4); - p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]); - SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x); - SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x); - x += 8; + p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]); + const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2); + const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2); + + Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2], + &b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565<8>(ma5x); + b[0][1] = Sum565W(b5 + 1); + vst1q_u16(ma565[1] + x + 8, ma[0][1]); + vst1q_u32(b565[1] + x + 8, b[0][1].val[0]); + vst1q_u32(b565[1] + x + 12, b[0][1].val[1]); + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + b5[0] = b5[2]; + const uint8x8_t sr01 = vget_high_u8(sr0); + const uint8x8_t sr11 = vget_high_u8(sr1); + ma[0][0] = vld1q_u16(ma565[0] + x + 8); + b[0][0].val[0] = vld1q_u32(b565[0] + x + 8); + b[0][0].val[1] = vld1q_u32(b565[0] + x + 12); + p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]); + ma[1][0] = vld1q_u16(ma343[0] + x + 8); + ma[1][1] = vld1q_u16(ma444[0] + x + 8); + b[1][0].val[0] = vld1q_u32(b343[0] + x + 8); + b[1][0].val[1] = vld1q_u32(b343[0] + x + 12); + b[1][1].val[0] = vld1q_u32(b444[0] + x + 8); + b[1][1].val[1] = vld1q_u32(b444[0] + x + 12); + p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]); + ma[2][0] = vld1q_u16(ma343[1] + x + 8); + b[2][0].val[0] = vld1q_u32(b343[1] + x + 8); + b[2][0].val[1] = vld1q_u32(b343[1] + x + 12); + p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]); + const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2); + const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2); + vst1q_u8(dst + x, vcombine_u8(d00, d01)); + vst1q_u8(dst + stride + x, vcombine_u8(d10, d11)); + s[0][0] = s[0][1]; 
+ s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + x += 16; } while (x < width); } @@ -1540,58 +2064,79 @@ inline void BoxFilterLastRow( const uint16_t scales[2], const int16_t w0, const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[3], - uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], - uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s, ma3, ma5; - uint16x8x2_t sq, b3, b5; - uint16x8_t ma[3]; + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { + uint8x16_t s[2], ma3[2], ma5[2]; + uint16x8_t sq[4], ma[3], b3[3], b5[3]; uint32x4x2_t b[3]; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3, - square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0], - &b3.val[0], &b5.val[0]); + BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3, + square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0], + &b5[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + uint8x16_t ma3x[3], ma5x[3]; + int16x8_t p[2]; BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3, - square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1], - &b3.val[1], &b5.val[1]); - ma[1] = Sum565(ma5); + square_sum5, s, sq + 1, ma3, ma5, &b3[1], + &b5[1]); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565<0>(ma5x); b[1] = Sum565W(b5); - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - ma[2] = Sum343(ma3); + Prepare3_8<0>(ma3, ma3x); + ma[2] = Sum343<0>(ma3x); b[2] = Sum343W(b3); - ma3.val[0] = ma3.val[1]; - b3.val[0] = b3.val[1]; - const uint8x8_t sr = vld1_u8(src + x); - int16x8_t p[2]; - ma[0] = vld1q_u16(ma565[0] + x); - b[0].val[0] = vld1q_u32(b565[0] + x + 0); - b[0].val[1] = vld1q_u32(b565[0] + x + 4); - p[0] = CalculateFilteredOutputPass1(sr, ma, b); - ma[0] = vld1q_u16(ma343[0] + x); - ma[1] = vld1q_u16(ma444[0] + x); - b[0].val[0] = vld1q_u32(b343[0] + x + 0); - b[0].val[1] = vld1q_u32(b343[0] + x + 4); - b[1].val[0] = vld1q_u32(b444[0] + x + 0); - b[1].val[1] = vld1q_u32(b444[0] + x + 4); - p[1] = CalculateFilteredOutputPass2(sr, ma, b); - SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x); - x += 8; + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); + ma[0] = vld1q_u16(ma565 + x); + b[0].val[0] = vld1q_u32(b565 + x + 0); + b[0].val[1] = vld1q_u32(b565 + x + 4); + p[0] = CalculateFilteredOutputPass1(sr0, ma, b); + ma[0] = vld1q_u16(ma343 + x); + ma[1] = vld1q_u16(ma444 + x); + b[0].val[0] = vld1q_u32(b343 + x + 0); + b[0].val[1] = vld1q_u32(b343 + x + 4); + b[1].val[0] = vld1q_u32(b444 + x + 0); + b[1].val[1] = vld1q_u32(b444 + x + 4); + p[1] = CalculateFilteredOutputPass2(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2); + + ma[1] = Sum565<8>(ma5x); + b[1] = Sum565W(b5 + 1); + b5[0] = b5[2]; + ma[2] = Sum343<8>(ma3x); + b[2] = Sum343W(b3 + 1); + b3[0] = b3[2]; + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma565 + x + 8); + b[0].val[0] = vld1q_u32(b565 + x + 8); + b[0].val[1] = vld1q_u32(b565 + x + 12); + p[0] = CalculateFilteredOutputPass1(sr1, ma, b); + ma[0] = vld1q_u16(ma343 + x + 8); + ma[1] = vld1q_u16(ma444 + x + 8); + b[0].val[0] = vld1q_u32(b343 + x + 8); + b[0].val[1] = 
vld1q_u32(b343 + x + 12); + b[1].val[0] = vld1q_u32(b444 + x + 8); + b[1].val[1] = vld1q_u32(b444 + x + 12); + p[1] = CalculateFilteredOutputPass2(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + ma3[0] = ma3[1]; + ma5[0] = ma5[1]; + x += 16; } while (x < width); } LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const RestorationUnitInfo& restoration_info, const uint8_t* src, - const uint8_t* const top_border, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, uint8_t* dst) { - const auto temp_stride = Align(width, 8); + const auto temp_stride = Align(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int sgr_proj_index = restoration_info.sgr_proj_info.index; const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. @@ -1628,13 +2173,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0], - square_sum5[1]); + BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1], + square_sum3[0], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, - square_sum5, ma343, ma444, ma565[0], b343, b444, + square_sum5, ma343, ma444[0], ma565[0], b343, b444[0], b565[0]); sum5[0] = sgr_buffer->sum5; square_sum5[0] = sgr_buffer->square_sum5; @@ -1665,7 +2210,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -1689,20 +2234,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2, - sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565, - b343, b444, b565, dst); + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, + ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0], + dst); } } inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { - const auto temp_stride = Align(width, 8); + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { + const auto temp_stride = Align(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int sgr_proj_index = restoration_info.sgr_proj_info.index; const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. 
@@ -1720,7 +2267,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]); + BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; @@ -1746,7 +2293,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -1763,20 +2310,21 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2(sum5); Circulate5PointersBy2(square_sum5); } - BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0, - sum5, square_sum5, ma565[0], b565[0], dst); + BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width, + scale, w0, sum5, square_sum5, ma565[0], b565[0], dst); } } inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { assert(restoration_info.sgr_proj_info.multiplier[0] == 0); - const auto temp_stride = Align(width, 8); + const auto temp_stride = Align(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; @@ -1799,7 +2347,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = b444[0] + temp_stride; assert(scale != 0); - BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]); + BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]); BoxSumFilterPreProcess3(src, width, scale, sum3, square_sum3, ma343[0], nullptr, b343[0], nullptr); Circulate3PointersBy1(sum3); @@ -1809,7 +2357,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSumFilterPreProcess3(s, width, scale, sum3, square_sum3, ma343[1], ma444[0], b343[1], b444[0]); @@ -1836,7 +2384,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, ma343, ma444, b343, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1(ma343); Circulate3PointersBy1(b343); std::swap(ma444[0], ma444[1]); @@ -1849,8 +2397,9 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, // part of the visible frame. 
void SelfGuidedFilter_NEON( const RestorationUnitInfo& restoration_info, const void* const source, - const void* const top_border, const void* const bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 @@ -1864,14 +2413,17 @@ void SelfGuidedFilter_NEON( // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); } else { - BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, - width, height, sgr_buffer, dst); + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); } } @@ -1890,7 +2442,7 @@ void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc index 084f42f..ee50923 100644 --- a/src/dsp/arm/mask_blend_neon.cc +++ b/src/dsp/arm/mask_blend_neon.cc @@ -432,7 +432,7 @@ void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc index 8caba7d..3e731b2 100644 --- a/src/dsp/arm/motion_field_projection_neon.cc +++ b/src/dsp/arm/motion_field_projection_neon.cc @@ -382,7 +382,7 @@ void MotionFieldProjectionInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc index 8a403a6..da3ba17 100644 --- a/src/dsp/arm/motion_vector_search_neon.cc +++ b/src/dsp/arm/motion_vector_search_neon.cc @@ -256,7 +256,7 @@ void MotionVectorSearchInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc index 66ad663..1111a90 100644 --- a/src/dsp/arm/obmc_neon.cc +++ b/src/dsp/arm/obmc_neon.cc @@ -380,7 +380,7 @@ void ObmcInit_NEON() { Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc index 
1680450..91537c4 100644 --- a/src/dsp/arm/super_res_neon.cc +++ b/src/dsp/arm/super_res_neon.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/arm/common_neon.h" #include "src/dsp/super_res.h" #include "src/utils/cpu.h" @@ -20,6 +19,7 @@ #include +#include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" @@ -82,10 +82,10 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps], } void SuperRes_NEON(const void* const coefficients, void* const source, - const ptrdiff_t stride, const int height, + const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, const int initial_subpixel_x, const int step, - void* const dest) { + void* const dest, const ptrdiff_t dest_stride) { auto* src = static_cast(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast(dest); int y = height; @@ -100,7 +100,7 @@ void SuperRes_NEON(const void* const coefficients, void* const source, int x = RightShiftWithCeiling(upscaled_width, 4); // The below code calculates up to 15 extra upscaled // pixels which will over-read up to 15 downscaled pixels in the end of each - // row. kSuperResHorizontalBorder accounts for this. + // row. kSuperResHorizontalPadding accounts for this. do { for (int i = 0; i < 8; ++i, subpixel_x += step) { sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]); @@ -135,8 +135,8 @@ void SuperRes_NEON(const void* const coefficients, void* const source, vst1q_u8(dst_ptr, vcombine_u8(d0, d1)); dst_ptr += 16; } while (--x != 0); - src += stride; - dst += stride; + src += source_stride; + dst += dest_stride; } while (--y != 0); } @@ -149,12 +149,147 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void SuperResInit_NEON() { low_bitdepth::Init8bpp(); } +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +void SuperResCoefficients_NEON(const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const coefficients) { + auto* dst = static_cast(coefficients); + int subpixel_x = initial_subpixel_x; + int x = RightShiftWithCeiling(upscaled_width, 3); + do { + uint16x8_t filter[8]; + for (int i = 0; i < 8; ++i, subpixel_x += step) { + const uint8x8_t filter_8 = + vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >> + kSuperResExtraBits]); + // uint8_t -> uint16_t + filter[i] = vmovl_u8(filter_8); + } + + Transpose8x8(filter); + + vst1q_u16(dst, filter[0]); + dst += 8; + vst1q_u16(dst, filter[1]); + dst += 8; + vst1q_u16(dst, filter[2]); + dst += 8; + vst1q_u16(dst, filter[3]); + dst += 8; + vst1q_u16(dst, filter[4]); + dst += 8; + vst1q_u16(dst, filter[5]); + dst += 8; + vst1q_u16(dst, filter[6]); + dst += 8; + vst1q_u16(dst, filter[7]); + dst += 8; + } while (--x != 0); +} + +// The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then +// subtracting all negative with saturation will clip to zero. 
+// 0 1 2 3 4 5 6 7 +// tap sign: - + - + + - + - +inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps], + const uint16_t** coefficients, int bitdepth) { + uint16x8_t f[kSuperResFilterTaps]; + for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) { + f[i] = vld1q_u16(*coefficients); + } + + uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6])); + + uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7])); + + res_lo = vqsubq_u32(res_lo, temp_lo); + + uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6])); + uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7])); + + res_hi = vqsubq_u32(res_hi, temp_hi); + + const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits), + vqrshrn_n_u32(res_hi, kFilterBits)); + + // Clip the result at (1 << bd) - 1. + return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1)); +} + +template +void SuperRes_NEON(const void* const coefficients, void* const source, + const ptrdiff_t source_stride, const int height, + const int downscaled_width, const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const dest, const ptrdiff_t dest_stride) { + auto* src = static_cast(source) - DivideBy2(kSuperResFilterTaps); + auto* dst = static_cast(dest); + int y = height; + do { + const auto* filter = static_cast(coefficients); + uint16_t* dst_ptr = dst; + ExtendLine(src + DivideBy2(kSuperResFilterTaps), downscaled_width, + kSuperResHorizontalBorder, kSuperResHorizontalBorder); + int subpixel_x = initial_subpixel_x; + uint16x8_t sr[8]; + int x = RightShiftWithCeiling(upscaled_width, 3); + // The below code calculates up to 7 extra upscaled + // pixels which will over-read up to 7 downscaled pixels in the end of each + // row. kSuperResHorizontalBorder accounts for this. 
+ do { + for (int i = 0; i < 8; ++i, subpixel_x += step) { + sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]); + } + + Transpose8x8(sr); + + const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth); + vst1q_u16(dst_ptr, d0); + dst_ptr += 8; + } while (--x != 0); + src += source_stride; + dst += dest_stride; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->super_res_coefficients = SuperResCoefficients_NEON; + dsp->super_res = SuperRes_NEON<10>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void SuperResInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h index f51785d..65e48c5 100644 --- a/src/dsp/arm/super_res_neon.h +++ b/src/dsp/arm/super_res_neon.h @@ -31,7 +31,10 @@ void SuperResInit_NEON(); #if LIBGAV1_ENABLE_NEON #define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_ diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc index 7a41998..c7fb739 100644 --- a/src/dsp/arm/warp_neon.cc +++ b/src/dsp/arm/warp_neon.cc @@ -289,7 +289,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride, const int16x8_t sum = vld1q_s16(tmp); vst1_u8(reinterpret_cast(dst_row), vqmovun_s16(sum)); } -#else // !defined(__aarch64__) +#else // !defined(__aarch64__) int16x8_t filter[8]; for (int x = 0; x < 8; ++x) { const int offset = @@ -442,7 +442,7 @@ void WarpInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc index 49d3be0..7e5bff0 100644 --- a/src/dsp/arm/weight_mask_neon.cc +++ b/src/dsp/arm/weight_mask_neon.cc @@ -451,7 +451,7 @@ void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { -- cgit v1.2.3
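
Note on the loop restoration changes above: the reworked SelfGuidedFinal / SelfGuidedSingleMultiplier / SelfGuidedDoubleMultiplier helpers now return a uint8x8_t so that two half-vectors can be combined into a single 16-byte store per row. The scalar sketch below shows the arithmetic those helpers vectorize; the constant values are restated from the AV1 self-guided restoration definitions as an assumption for self-containment, and are not part of the patch itself.

    // Scalar sketch of the final self-guided combination (8 bpp path).
    #include <algorithm>
    #include <cstdint>

    constexpr int kSgrProjRestoreBits = 4;    // assumption: AV1 SGRPROJ_RST_BITS
    constexpr int kSgrProjPrecisionBits = 7;  // assumption: AV1 SGRPROJ_PRJ_BITS

    // |filter0| and |filter1| are the per-pixel outputs of the two box-filter
    // passes (CalculateFilteredOutputPass1/Pass2 in the patch); |w0| and |w2|
    // are the per-unit projection multipliers.
    inline uint8_t SelfGuidedDoubleMultiplierScalar(uint8_t src, int filter0,
                                                    int filter1, int w0, int w2) {
      const int v = w0 * filter0 + w2 * filter1;
      const int shift = kSgrProjRestoreBits + kSgrProjPrecisionBits;
      const int rounded = (v + (1 << (shift - 1))) >> shift;  // vrshrn_n_s32
      // vqmovun_s16: saturate the sum back to the 8-bit pixel range.
      return static_cast<uint8_t>(std::clamp(src + rounded, 0, 255));
    }
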
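The 10 bpp SuperRes path added in super_res_neon.cc keeps the filter taps as unsigned magnitudes (kUpscaleFilterUnsigned) and applies the sign pattern noted in its comment (- + - + + - + -) by accumulating the positive and negative taps separately, then using a saturating subtract so the sum clips at zero before the rounding shift and bitdepth clamp. A minimal scalar sketch of one output pixel follows; kFilterBits = 7 is an assumption matching AV1's FILTER_BITS, not something stated in the patch.

    // Scalar sketch of the 10 bpp SuperRes tap evaluation.
    #include <algorithm>
    #include <cstdint>

    constexpr int kSuperResFilterTaps = 8;
    constexpr int kFilterBits = 7;  // assumption

    inline uint16_t SuperResPixelScalar(const uint16_t src[kSuperResFilterTaps],
                                        const uint16_t taps[kSuperResFilterTaps],
                                        int bitdepth) {
      // Tap signs: - + - + + - + -
      const uint32_t positive = src[1] * taps[1] + src[3] * taps[3] +
                                src[4] * taps[4] + src[6] * taps[6];
      const uint32_t negative = src[0] * taps[0] + src[2] * taps[2] +
                                src[5] * taps[5] + src[7] * taps[7];
      // vqsubq_u32: unsigned saturating subtract clips negative sums to zero.
      const uint32_t diff = (positive > negative) ? positive - negative : 0;
      // vqrshrn_n_u32: rounding right shift by kFilterBits.
      const uint32_t rounded = (diff + (1u << (kFilterBits - 1))) >> kFilterBits;
      // Clamp to the maximum pixel value for this bitdepth, as vminq_u16 does.
      return static_cast<uint16_t>(
          std::min<uint32_t>(rounded, (1u << bitdepth) - 1));
    }
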