| | |
|---|---|
| author | Boyuan Yang <byang@debian.org> (2022-07-14 15:56:57 -0400) |
| committer | Boyuan Yang <byang@debian.org> (2022-07-14 15:56:57 -0400) |
| commit | d4dbf19f6b0181ee78034bfe4caf189d1c016998 (patch) |
| tree | 47d5d28d2ab770a10e6c48788725c51dffeb84a9 /src/dsp/arm |
| parent | 320ef65362608ee1148c299d8d5d7618af34e470 (diff) |
| download | libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.tar.gz, libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.tar.bz2, libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.zip |
New upstream version 0.18.0
Diffstat (limited to 'src/dsp/arm')

| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | src/dsp/arm/common_neon.h | 52 |
| -rw-r--r-- | src/dsp/arm/convolve_10bit_neon.cc | 224 |
| -rw-r--r-- | src/dsp/arm/distance_weighted_blend_neon.cc | 105 |
| -rw-r--r-- | src/dsp/arm/film_grain_neon.cc | 218 |
| -rw-r--r-- | src/dsp/arm/film_grain_neon.h | 4 |
| -rw-r--r-- | src/dsp/arm/intrapred_directional_neon.cc | 688 |
| -rw-r--r-- | src/dsp/arm/intrapred_neon.cc | 10 |
| -rw-r--r-- | src/dsp/arm/intrapred_smooth_neon.cc | 339 |
| -rw-r--r-- | src/dsp/arm/inverse_transform_10bit_neon.cc | 28 |
| -rw-r--r-- | src/dsp/arm/inverse_transform_neon.cc | 146 |
| -rw-r--r-- | src/dsp/arm/loop_filter_10bit_neon.cc | 1218 |
| -rw-r--r-- | src/dsp/arm/loop_filter_neon.cc | 1298 |
| -rw-r--r-- | src/dsp/arm/loop_filter_neon.h | 1 |
| -rw-r--r-- | src/dsp/arm/loop_restoration_neon.cc | 8 |
| -rw-r--r-- | src/dsp/arm/mask_blend_neon.cc | 375 |
| -rw-r--r-- | src/dsp/arm/obmc_neon.cc | 523 |
| -rw-r--r-- | src/dsp/arm/warp_neon.cc | 97 |
17 files changed, 2702 insertions, 2632 deletions
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h index 9c46525..c0af2c1 100644 --- a/src/dsp/arm/common_neon.h +++ b/src/dsp/arm/common_neon.h @@ -309,6 +309,12 @@ inline uint8x16_t MaskOverreadsQ(const uint8x16_t source, return dst; } +inline uint16x8_t MaskOverreadsQ(const uint16x8_t source, + const ptrdiff_t over_read_in_bytes) { + return vreinterpretq_u16_u8( + MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes)); +} + inline uint8x8_t Load1MsanU8(const uint8_t* const source, const ptrdiff_t over_read_in_bytes) { return MaskOverreads(vld1_u8(source), over_read_in_bytes); @@ -325,20 +331,6 @@ inline uint16x8_t Load1QMsanU16(const uint16_t* const source, vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes)); } -inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source, - const ptrdiff_t over_read_in_bytes) { - // Relative source index of elements (2 bytes each): - // dst.val[0]: 00 02 04 06 08 10 12 14 - // dst.val[1]: 01 03 05 07 09 11 13 15 - uint16x8x2_t dst = vld2q_u16(source); - dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ( - vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1)); - dst.val[1] = vreinterpretq_u16_u8( - MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]), - (over_read_in_bytes >> 1) + (over_read_in_bytes % 4))); - return dst; -} - inline uint32x4_t Load1QMsanU32(const uint32_t* const source, const ptrdiff_t over_read_in_bytes) { return vreinterpretq_u32_u8(MaskOverreadsQ( @@ -402,6 +394,24 @@ inline void Store8(void* const buf, const uint16x8_t val) { vst1q_u16(static_cast<uint16_t*>(buf), val); } +inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) { +#if LIBGAV1_MSAN + // The memory shadow is incorrect for vst4q_u16, only marking the first 16 + // bytes of the destination as initialized. To avoid missing truly + // uninitialized memory, check the input vectors first, before marking the + // whole 64 bytes initialized. If any input vector contains unused values, it + // should pass through MaskOverreadsQ first. + __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0])); + __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1])); + __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2])); + __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3])); + vst4q_s16(static_cast<int16_t*>(buf), src); + __msan_unpoison(buf, sizeof(int16x8x4_t)); +#else + vst4q_s16(static_cast<int16_t*>(buf), src); +#endif // LIBGAV1_MSAN +} + //------------------------------------------------------------------------------ // Pointer helpers. @@ -587,7 +597,8 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) { //------------------------------------------------------------------------------ // Saturation helpers. 
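The new 16-bit `MaskOverreadsQ` overload added above is just a pair of reinterpret casts around the existing byte-level masking helper, so `uint16x8_t` loads can reuse the same over-read handling. A minimal sketch of the pattern follows; the byte-level body here is an illustrative stand-in (it zeroes the trailing lanes), while the 16-bit wrapper matches the diff.

```c++
#include <arm_neon.h>
#include <cstddef>

// Illustrative stand-in for the byte-level helper: clears the last
// |over_read_in_bytes| lanes so values read past the logical end of the
// buffer can never influence the result.
inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
                                 const ptrdiff_t over_read_in_bytes) {
  const uint8x16_t indices = {0, 1, 2,  3,  4,  5,  6,  7,
                              8, 9, 10, 11, 12, 13, 14, 15};
  const uint8x16_t limit =
      vdupq_n_u8(static_cast<uint8_t>(16 - over_read_in_bytes));
  const uint8x16_t valid = vcltq_u8(indices, limit);  // 0xFF for valid lanes.
  return vandq_u8(source, valid);
}

// The overload from the diff: reinterpret to bytes, mask, reinterpret back.
inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
                                 const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u16_u8(
      MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
}
```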
-inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) { +inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low, + const int16x4_t high) { return vmin_s16(vmax_s16(val, low), high); } @@ -596,7 +607,7 @@ inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low, return vminq_s16(vmaxq_s16(val, low), high); } -inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) { +inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) { const int16x8_t low = vdupq_n_s16(0); const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1); @@ -727,7 +738,7 @@ inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); } // Output: // b0.val[0]: 00 01 02 03 16 17 18 19 // b0.val[1]: 04 05 06 07 20 21 22 23 -inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) { +inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) { int16x8x2_t b0; b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); @@ -736,7 +747,7 @@ inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) { return b0; } -inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { +inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) { uint16x8x2_t b0; b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); @@ -750,6 +761,11 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { // 10 11 12 13 // 20 21 22 23 // 30 31 32 33 +// Output: +// 00 10 20 30 +// 01 11 21 31 +// 02 12 22 32 +// 03 13 23 33 inline void Transpose4x4(uint16x4_t a[4]) { // b: // 00 10 02 12 diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc index b7205df..389f029 100644 --- a/src/dsp/arm/convolve_10bit_neon.cc +++ b/src/dsp/arm/convolve_10bit_neon.cc @@ -45,12 +45,12 @@ namespace { // Pixel output range: [ 0, 1023] // Compound output range: [ 3988, 61532] -template <int filter_index> +template <int num_taps> int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, const int16x4_t* const taps) { const auto* ssrc = reinterpret_cast<const int16x8_t*>(src); int32x4x2_t sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -65,7 +65,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -84,7 +84,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. 
sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -106,12 +106,12 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, return sum; } -template <int filter_index> +template <int num_taps> int32x4_t SumOnePassTaps(const uint16x4_t* const src, const int16x4_t* const taps) { const auto* ssrc = reinterpret_cast<const int16x4_t*>(src); int32x4_t sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -119,7 +119,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, sum = vmlal_s16(sum, ssrc[3], taps[3]); sum = vmlal_s16(sum, ssrc[4], taps[4]); sum = vmlal_s16(sum, ssrc[5], taps[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -129,7 +129,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, sum = vmlal_s16(sum, ssrc[5], taps[5]); sum = vmlal_s16(sum, ssrc[6], taps[6]); sum = vmlal_s16(sum, ssrc[7], taps[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -143,7 +143,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, return sum; } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -162,15 +162,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const uint16x8_t src_long_hi = vld1q_u16(s + 8); uint16x8_t v_src[8]; int32x4x2_t v_sum; - if (filter_index < 2) { + if (num_taps == 6) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); v_src[4] = vextq_u16(src_long, src_long_hi, 4); v_src[5] = vextq_u16(src_long, src_long_hi, 5); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); - } else if (filter_index == 2) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1); + } else if (num_taps == 8) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); @@ -179,17 +179,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, v_src[5] = vextq_u16(src_long, src_long_hi, 5); v_src[6] = vextq_u16(src_long, src_long_hi, 6); v_src[7] = vextq_u16(src_long, src_long_hi, 7); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); - } else if (filter_index == 3) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap); + } else if (num_taps == 2) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); - } else { // filter_index > 3 + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); + } else { // 4 taps v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } const int16x4_t d0 = @@ -213,15 +213,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const uint16x8_t src_long_hi = vld1q_u16(src + x + 8); uint16x8_t v_src[8]; int32x4x2_t v_sum; - if (filter_index < 2) { + if (num_taps == 6) { 
v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); v_src[4] = vextq_u16(src_long, src_long_hi, 4); v_src[5] = vextq_u16(src_long, src_long_hi, 5); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); - } else if (filter_index == 2) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1); + } else if (num_taps == 8) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); @@ -230,17 +230,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, v_src[5] = vextq_u16(src_long, src_long_hi, 5); v_src[6] = vextq_u16(src_long, src_long_hi, 6); v_src[7] = vextq_u16(src_long, src_long_hi, 7); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); - } else if (filter_index == 3) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap); + } else if (num_taps == 2) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); - } else { // filter_index > 3 + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); + } else { // 4 taps v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } if (is_compound) { const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); @@ -276,7 +276,7 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, } while (--y != 0); } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -291,14 +291,14 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, int32x4_t v_sum; const uint16x8_t src_long = vld1q_u16(src); v_src[0] = vget_low_u16(src_long); - if (filter_index == 3) { + if (num_taps == 2) { v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); } else { v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2)); v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3)); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } if (is_compound || is_2d) { const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); @@ -321,7 +321,7 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, } while (--y != 0); } -template <int filter_index, bool is_2d> +template <int num_taps, bool is_2d> void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -336,7 +336,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride)); const int16x8x2_t input = vzipq_s16(input0, input1); int32x4_t v_sum; - if (filter_index == 3) { + if (num_taps == 2) { v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]); v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)), @@ -387,7 +387,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, assert(height % 2 == 
1); const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src)); int32x4_t v_sum; - if (filter_index == 3) { + if (num_taps == 2) { v_sum = vmull_s16(vget_low_s16(input), v_tap[3]); v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]); @@ -406,17 +406,17 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, } } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int width, const int height, const int16x4_t* const v_tap) { - assert(width < 8 || filter_index <= 3); + assert(width < 8 || num_taps != 4); // Don't simplify the redundant if conditions with the template parameters, // which helps the compiler generate compact code. - if (width >= 8 && filter_index <= 3) { - FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>( + if (width >= 8 && num_taps != 4) { + FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>( src, src_stride, dest, pred_stride, width, height, v_tap); return; } @@ -424,17 +424,17 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. assert(width <= 4); - assert(filter_index >= 3 && filter_index <= 5); - if (filter_index >= 3 && filter_index <= 5) { + assert(num_taps == 2 || num_taps == 4); + if (num_taps == 2 || num_taps == 4) { if (width == 4) { - FilterHorizontalWidth4<filter_index, is_compound, is_2d>( + FilterHorizontalWidth4<num_taps, is_compound, is_2d>( src, src_stride, dest, pred_stride, height, v_tap); return; } assert(width == 2); if (!is_compound) { - FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest, - pred_stride, height, v_tap); + FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest, + pred_stride, height, v_tap); } } } @@ -455,22 +455,17 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } if (filter_index == 2) { // 8 tap. - FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride, + FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 1) { // 6 tap. - FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst, + } else if (filter_index < 2) { // 6 tap. + FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 0) { // 6 tap. - FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst, - dst_stride, width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. + } else if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. - FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst, - dst_stride, width, height, v_tap); } else { // 2 tap. 
- FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst, + FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst, dst_stride, width, height, v_tap); } } @@ -510,13 +505,12 @@ void ConvolveCompoundHorizontal_NEON( filter_index); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* const dst16 = static_cast<uint16_t*>(dst); @@ -555,7 +549,7 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, srcs[next_row] = vld1q_u16(src_x); src_x += src_stride; - const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); + const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); if (is_compound) { const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); const int16x4_t d0 = @@ -593,13 +587,12 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, } while (x < width); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* dst16 = static_cast<uint16_t*>(dst); @@ -633,8 +626,8 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, srcs[num_taps] = vld1_u16(src); src += src_stride; - const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); - const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps); + const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); + const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps); if (is_compound) { const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); const int16x4_t d1 = @@ -673,13 +666,12 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, } while (y != 0); } -template <int filter_index> +template <int num_taps> void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* dst16 = static_cast<uint16_t*>(dst); @@ -718,7 +710,7 @@ void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, src += src_stride; srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2); - const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); + const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); const uint16x4_t d0 = vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); Store2<0>(dst16, d0); @@ -1180,13 +1172,13 @@ void ConvolveVertical_NEON( if (filter_index == 0) { // 6 tap. 
if (width == 2) { - FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { - FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else { - FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if ((static_cast<int>(filter_index == 1) & @@ -1196,33 +1188,33 @@ void ConvolveVertical_NEON( static_cast<int>(vertical_filter_id == 9) | static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. if (width == 2) { - FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { - FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else { - FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 2) { - FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<8>(src, src_stride, dest, dest_stride, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 2) { - FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps + 3); } else if (width == 4) { - FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps + 3); } else { - FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps + 3); } } else { @@ -1240,13 +1232,13 @@ void ConvolveVertical_NEON( // treating it as though it has 4. if (filter_index == 1) src += src_stride; if (width == 2) { - FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else if (width == 4) { - FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else { - FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps + 2); } } @@ -1274,10 +1266,10 @@ void ConvolveCompoundVertical_NEON( if (filter_index == 0) { // 6 tap. 
if (width == 4) { - FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { - FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if ((static_cast<int>(filter_index == 1) & @@ -1287,26 +1279,26 @@ void ConvolveCompoundVertical_NEON( static_cast<int>(vertical_filter_id == 9) | static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. if (width == 4) { - FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { - FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 4) { - FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps); } else { - FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 4) { - FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 3); } else { - FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 3); } } else { @@ -1323,10 +1315,10 @@ void ConvolveCompoundVertical_NEON( // treating it as though it has 4. if (filter_index == 1) src += src_stride; if (width == 4) { - FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 2); } else { - FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 2); } } @@ -1980,7 +1972,7 @@ inline void ConvolveKernelHorizontal2Tap( PermuteSrcVals(src_bytes, src_lookup[1])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2034,13 +2026,12 @@ inline void ConvolveKernelHorizontal2Tap( const uint16x4_t src_high[2] = {vget_high_u16(src[0]), vget_high_u16(src[1])}; - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. 
src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2123,7 +2114,7 @@ inline void ConvolveKernelHorizontalPositive4Tap( PermuteSrcVals(src_bytes, src_lookup[3])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/5>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2202,7 +2193,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( PermuteSrcVals(src_bytes, src_lookup[3])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2297,13 +2288,12 @@ inline void ConvolveKernelHorizontalSigned6Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2401,13 +2391,12 @@ inline void ConvolveKernelHorizontalMixed6Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2505,13 +2494,12 @@ inline void ConvolveKernelHorizontalSigned8Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. 
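In this file the template parameter is renamed from `filter_index` to `num_taps`, so each branch now states directly how many multiply-accumulates it issues (filter_index 0/1 → 6 taps, 2 → 8 taps, 3 → 2 taps, 4/5 → 4 taps). A compact sketch of the shape of `SumOnePassTaps` after the rename, assuming already-signed inputs and a compiler that fully unrolls the constant-bound loop (the real code is hand-unrolled per tap count):

```c++
#include <arm_neon.h>

template <int num_taps>
int32x4_t SumOnePassTaps(const int16x4_t* const src,
                         const int16x4_t* const taps) {
  // Widening multiply for the first tap, widening multiply-accumulate for
  // the remaining taps.
  int32x4_t sum = vmull_s16(src[0], taps[0]);
  for (int k = 1; k < num_taps; ++k) {
    sum = vmlal_s16(sum, src[k], taps[k]);
  }
  return sum;
}
```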
src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc index 7d287c8..6087276 100644 --- a/src/dsp/arm/distance_weighted_blend_neon.cc +++ b/src/dsp/arm/distance_weighted_blend_neon.cc @@ -36,44 +36,48 @@ constexpr int kInterPostRoundBit = 4; namespace low_bitdepth { namespace { -inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0, +inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0, const int16x8_t pred1, - const int16x4_t weights[2]) { - // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. - const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0)); - const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0)); - const int32x4_t blended_lo = - vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1)); - const int32x4_t blended_hi = - vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1)); - - return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4), - vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4)); + const int16x8_t weight) { + // Given: p0,p1 in range [-5132,9212] and w0 = 16 - w1, w1 = 16 - w0 + // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >> + // 8(=kInterPostRoundBit + 4) + // The formula is manipulated to avoid lengthening to 32 bits. + // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1 + // = (p0 - p1) * w0 + 16 * p1 + // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808. + const int16x8_t diff = vsubq_s16(pred0, pred1); + // (((p0 - p1) * (w0 << 11) << 1) >> 16) + ((16 * p1) >> 4) + const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight); + // ((p0 - p1) * w0 >> 4) + p1 + const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1); + // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4 + return vqrshrun_n_s16(upscaled_average, kInterPostRoundBit); } -template <int width, int height> +template <int width> inline void DistanceWeightedBlendSmall_NEON( const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2], - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT prediction_1, const int height, + const int16x8_t weight, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); constexpr int step = 16 / width; - for (int y = 0; y < height; y += step) { + int y = height; + do { const int16x8_t src_00 = vld1q_s16(prediction_0); const int16x8_t src_10 = vld1q_s16(prediction_1); prediction_0 += 8; prediction_1 += 8; - const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights); + const uint8x8_t result0 = ComputeWeightedAverage8(src_00, src_10, weight); const int16x8_t src_01 = vld1q_s16(prediction_0); const int16x8_t src_11 = vld1q_s16(prediction_1); prediction_0 += 8; prediction_1 += 8; - const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights); + const uint8x8_t result1 = ComputeWeightedAverage8(src_01, src_11, weight); - const uint8x8_t result0 = vqmovun_s16(res0); - const uint8x8_t result1 = vqmovun_s16(res1); if (width == 4) { StoreLo4(dst, result0); dst += dest_stride; @@ -90,12 +94,13 @@ inline void DistanceWeightedBlendSmall_NEON( vst1_u8(dst, result1); dst += dest_stride; } - } + y -= step; + } while (y != 0); } inline void DistanceWeightedBlendLarge_NEON( const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t 
weights[2], + const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x8_t weight, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); @@ -106,16 +111,15 @@ inline void DistanceWeightedBlendLarge_NEON( do { const int16x8_t src0_lo = vld1q_s16(prediction_0 + x); const int16x8_t src1_lo = vld1q_s16(prediction_1 + x); - const int16x8_t res_lo = - ComputeWeightedAverage8(src0_lo, src1_lo, weights); + const uint8x8_t res_lo = + ComputeWeightedAverage8(src0_lo, src1_lo, weight); const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8); const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8); - const int16x8_t res_hi = - ComputeWeightedAverage8(src0_hi, src1_hi, weights); + const uint8x8_t res_hi = + ComputeWeightedAverage8(src0_hi, src1_hi, weight); - const uint8x16_t result = - vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi)); + const uint8x16_t result = vcombine_u8(res_lo, res_hi); vst1q_u8(dst + x, result); x += 16; } while (x < width); @@ -128,52 +132,25 @@ inline void DistanceWeightedBlendLarge_NEON( inline void DistanceWeightedBlend_NEON( const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, const int height, + const uint8_t /*weight_1*/, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)}; - // TODO(johannkoenig): Investigate the branching. May be fine to call with a - // variable height. + // Upscale the weight for vqdmulh. 
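The rewritten `ComputeWeightedAverage8` stays in 16 bits by using w0 + w1 = 16 to rewrite p0*w0 + p1*w1 as (p0 - p1)*w0 + 16*p1, and by pre-scaling the weight to w0 << 11 so that `vqdmulhq_s16` (which computes (a*b*2) >> 16) yields (p0 - p1)*w0 >> 4. A self-contained restatement of the kernel from the diff:

```c++
#include <arm_neon.h>

// |weight| holds w0 << 11; w1 is implicitly 16 - w0.
inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0,
                                         const int16x8_t pred1,
                                         const int16x8_t weight) {
  const int16x8_t diff = vsubq_s16(pred0, pred1);
  // (diff * (w0 << 11) * 2) >> 16 == (diff * w0) >> 4.
  const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight);
  // ((p0 - p1) * w0 >> 4) + p1.
  const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1);
  // Add the rounding bit, shift by kInterPostRoundBit (4) and narrow with
  // unsigned saturation.
  return vqrshrun_n_s16(upscaled_average, 4);
}
```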
+ const int16x8_t weight = vdupq_n_s16(weight_0 << 11); if (width == 4) { - if (height == 4) { - DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest, - dest_stride); - } else if (height == 8) { - DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest, - dest_stride); - } else { - assert(height == 16); - DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest, - dest_stride); - } + DistanceWeightedBlendSmall_NEON<4>(pred_0, pred_1, height, weight, dest, + dest_stride); return; } if (width == 8) { - switch (height) { - case 4: - DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest, - dest_stride); - return; - case 8: - DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest, - dest_stride); - return; - case 16: - DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest, - dest_stride); - return; - default: - assert(height == 32); - DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest, - dest_stride); - - return; - } + DistanceWeightedBlendSmall_NEON<8>(pred_0, pred_1, height, weight, dest, + dest_stride); + return; } - DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest, + DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weight, width, height, dest, dest_stride); } diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc index 0b1b481..76e1151 100644 --- a/src/dsp/arm/film_grain_neon.cc +++ b/src/dsp/arm/film_grain_neon.cc @@ -18,23 +18,21 @@ #if LIBGAV1_ENABLE_NEON #include <arm_neon.h> -#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> #include <cstring> -#include <new> #include "src/dsp/arm/common_neon.h" -#include "src/dsp/arm/film_grain_neon.h" -#include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/film_grain_common.h" +#include "src/utils/array_2d.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" -#include "src/utils/logging.h" +#include "src/utils/constants.h" #include "src/utils/memory.h" +#include "src/utils/types.h" namespace libgav1 { namespace dsp { @@ -52,10 +50,8 @@ inline int16x8_t GetSignedSource8(const uint8_t* src) { return ZeroExtend(vld1_u8(src)); } -inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int /*valid_range*/) { - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. - return ZeroExtend(Load1MsanU8(src, 0)); +inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) { + return ZeroExtend(Load1MsanU8(src, 8 - valid_range)); } inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) { @@ -69,11 +65,8 @@ inline int16x8_t GetSignedSource8(const uint16_t* src) { return vreinterpretq_s16_u16(vld1q_u16(src)); } -inline int16x8_t GetSignedSource8Msan(const uint16_t* src, - int /*valid_range*/) { - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. 
- return vreinterpretq_s16_u16(Load1QMsanU16(src, 0)); +inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) { + return vreinterpretq_s16_u16(Load1QMsanU16(src, 16 - valid_range)); } inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) { @@ -198,17 +191,13 @@ inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) { } inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma, - int subsampling_x, int /*valid_range*/) { + int subsampling_x, int valid_range) { if (subsampling_x != 0) { - // TODO(b/194217060): restore |valid_range| usage after correcting call - // sites causing test vector failures. - const uint8x16_t src = Load1QMsanU8(luma, 0); - + const uint8x16_t src = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range); + // MemorySanitizer registers vpaddlq_u8 as a use of the memory. return vrshrq_n_u16(vpaddlq_u8(src), 1); } - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. - return vmovl_u8(Load1MsanU8(luma, 0)); + return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range); } #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -252,16 +241,13 @@ inline uint16x8_t GetAverageLuma(const uint16_t* const luma, } inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma, - int subsampling_x, int /*valid_range*/) { + int subsampling_x, int valid_range) { if (subsampling_x != 0) { - // TODO(b/194217060): restore |valid_range| usage after correcting call - // sites causing test vector failures. - const uint16x8x2_t src = Load2QMsanU16(luma, 0); - return vrhaddq_u16(src.val[0], src.val[1]); + const uint16x8x2_t src = vld2q_u16(luma); + const uint16x8_t result = vrhaddq_u16(src.val[0], src.val[1]); + return MaskOverreadsQ(result, 16 - valid_range); } - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. 
- return Load1QMsanU16(luma, 0); + return Load1QMsanU16(luma, 16 - valid_range); } #endif // LIBGAV1_MAX_BITDEPTH >= 10 @@ -614,8 +600,7 @@ void InitializeScalingLookupTable_NEON(int num_points, } static_assert(sizeof(scaling_lut[0]) == 2, ""); Memset(scaling_lut, point_scaling[0], - std::max(static_cast<int>(point_value[0]), 1) - << (bitdepth - kBitdepth8)); + (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8)); const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000)); const int32x4_t rounding = vdupq_n_s32(32768); for (int i = 0; i < num_points - 1; ++i) { @@ -666,7 +651,7 @@ void InitializeScalingLookupTable_NEON(int num_points, const int16x8x4_t result = { start, vaddq_s16(start, vrshrq_n_s16(delta, 2)), vaddq_s16(start, delta2), vaddq_s16(start, delta3)}; - vst4q_s16(&scaling_lut[x_base], result); + Store4QMsanS16(&scaling_lut[x_base], result); } else { vst1q_s16(&scaling_lut[x_base], full_interp); } @@ -696,13 +681,29 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low, } template <int bitdepth, typename Pixel> -inline int16x8_t GetScalingFactors( - const int16_t scaling_lut[kScalingLookupTableSize], const Pixel* source) { +inline int16x8_t GetScalingFactors(const int16_t scaling_lut[], + const Pixel* source) { int16_t start_vals[8]; static_assert(bitdepth <= kBitdepth10, "NEON Film Grain is not yet implemented for 12bpp."); +#if LIBGAV1_MSAN + memset(start_vals, 0, sizeof(start_vals)); +#endif for (int i = 0; i < 8; ++i) { - assert(source[i] < kScalingLookupTableSize << (bitdepth - 2)); + assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8))); + start_vals[i] = scaling_lut[source[i]]; + } + return vld1q_s16(start_vals); +} + +template <int bitdepth, typename Pixel> +inline int16x8_t GetScalingFactors(const int16_t scaling_lut[], + const Pixel* source, const int valid_range) { + int16_t start_vals[8]; + static_assert(bitdepth <= kBitdepth10, + "NEON Film Grain is not yet implemented for 12bpp."); + for (int i = 0; i < valid_range; ++i) { + assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8))); start_vals[i] = scaling_lut[source[i]]; } return vld1q_s16(start_vals); @@ -743,10 +744,11 @@ void BlendNoiseWithImageLuma_NEON( const int16x8_t scaling_shift_vect = vdupq_n_s16( (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift); + const int safe_width = width & ~15; int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_width; x += 8) { // This operation on the unsigned input is safe in 8bpp because the vector // is widened before it is reinterpreted. const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]); @@ -767,8 +769,8 @@ void BlendNoiseWithImageLuma_NEON( // This operation on the unsigned input is safe in 8bpp because the vector // is widened before it is reinterpreted. const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]); - const int16x8_t scaling1 = GetScalingFactors<bitdepth, Pixel>( - scaling_lut_y, &in_y_row[std::min(x, width)]); + const int16x8_t scaling1 = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect); @@ -778,8 +780,41 @@ void BlendNoiseWithImageLuma_NEON( // function for just that case, though the gain would be very small. 
StoreUnsigned8(&out_y_row[x], vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling))); - x += 8; - } while (x < width); + } + + if (x < width) { + assert(width - x < 16); + if (x < width - 8) { + const int16x8_t orig = GetSignedSource8(&in_y_row[x]); + const int16x8_t scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); + int16x8_t noise = + GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); + const int16x8_t combined = vaddq_s16(orig, noise); + // In 8bpp, when params_.clip_to_restricted_range == false, we can + // replace clipping with vqmovun_s16, but it's not likely to be worth + // copying the function for just that case, though the gain would be + // very small. + StoreUnsigned8(&out_y_row[x], + vreinterpretq_u16_s16(Clip3(combined, floor, ceiling))); + x += 8; + } + const int valid_range_pixels = width - x; + const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]); + const int16x8_t orig = + GetSignedSource8Msan(&in_y_row[x], valid_range_bytes); + const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>( + scaling_lut_y, &in_y_row[x], valid_range_pixels); + int16x8_t noise = + GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); + + const int16x8_t combined = vaddq_s16(orig, noise); + StoreUnsigned8(&out_y_row[x], + vreinterpretq_u16_s16(Clip3(combined, floor, ceiling))); + } in_y_row += source_stride_y; out_y_row += dest_stride_y; } while (++y < height); @@ -787,13 +822,9 @@ void BlendNoiseWithImageLuma_NEON( template <int bitdepth, typename GrainType, typename Pixel> inline int16x8_t BlendChromaValsWithCfl( - const Pixel* LIBGAV1_RESTRICT average_luma_buffer, - const int16_t* LIBGAV1_RESTRICT scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor, const GrainType* LIBGAV1_RESTRICT noise_image_cursor, - const int16x8_t scaling_shift_vect) { - const int16x8_t scaling = - GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); + const int16x8_t scaling, const int16x8_t scaling_shift_vect) { const int16x8_t orig = GetSignedSource8(chroma_cursor); int16x8_t noise = GetSignedSource8(noise_image_cursor); noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); @@ -812,7 +843,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON( const int16x8_t floor = vdupq_n_s16(min_value); const int16x8_t ceiling = vdupq_n_s16(max_chroma); Pixel luma_buffer[16]; - memset(luma_buffer, 0, sizeof(luma_buffer)); // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe // for 16 bit signed integers. In higher bitdepths, however, we have to // expand to 32 to protect the sign bit. 
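The luma blend loop above is restructured from a single `do { ... } while (x < width)` into an explicit split: full blocks up to `safe_width`, then a tail that goes through the valid-range-aware helpers so no lanes beyond `width` feed the scaling lookup. A simplified sketch of that split-loop shape, using a hypothetical kernel that just adds saturated noise and a copy-based tail instead of the masked loads from the diff:

```c++
#include <arm_neon.h>
#include <cstdint>

// Hypothetical row kernel: dst[i] = saturate(src[i] + noise[i]).
void AddNoiseRow(const uint8_t* src, const int8_t* noise, uint8_t* dst,
                 int width) {
  int x = 0;
  const int safe_width = width & ~7;  // Largest multiple of 8 <= width.
  for (; x < safe_width; x += 8) {
    const int16x8_t s = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)));
    const int16x8_t n = vmovl_s8(vld1_s8(noise + x));
    vst1_u8(dst + x, vqmovun_s16(vaddq_s16(s, n)));
  }
  if (x < width) {  // Tail of 1..7 pixels: never read or write past |width|.
    uint8_t src_tail[8] = {0};
    int8_t noise_tail[8] = {0};
    uint8_t out[8];
    const int remaining = width - x;
    for (int i = 0; i < remaining; ++i) {
      src_tail[i] = src[x + i];
      noise_tail[i] = noise[x + i];
    }
    const int16x8_t s = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src_tail)));
    const int16x8_t n = vmovl_s8(vld1_s8(noise_tail));
    vst1_u8(out, vqmovun_s16(vaddq_s16(s, n)));
    for (int i = 0; i < remaining; ++i) dst[x + i] = out[i];
  }
}
```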
@@ -831,40 +861,45 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON( int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const uint16x8_t average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); StoreUnsigned8(average_luma_buffer, average_luma); + const int16x8_t scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); const int16x8_t blended = BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( - average_luma_buffer, scaling_lut, &in_chroma_row[x], - &(noise_image[y + start_height][x]), scaling_shift_vect); + &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling, + scaling_shift_vect); // In 8bpp, when params_.clip_to_restricted_range == false, we can replace // clipping with vqmovun_s16, but it's not likely to be worth copying the // function for just that case. StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; + const int valid_range_chroma_pixels = chroma_width - x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; const uint16x8_t average_luma = GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1); StoreUnsigned8(average_luma_buffer, average_luma); + const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>( + scaling_lut, average_luma_buffer, valid_range_chroma_pixels); const int16x8_t blended = BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( - average_luma_buffer, scaling_lut, &in_chroma_row[x], - &(noise_image[y + start_height][x]), scaling_shift_vect); + &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling, + scaling_shift_vect); // In 8bpp, when params_.clip_to_restricted_range == false, we can replace // clipping with vqmovun_s16, but it's not likely to be worth copying the // function for just that case. @@ -915,7 +950,8 @@ inline int16x8_t BlendChromaValsNoCfl( const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig, const int8_t* LIBGAV1_RESTRICT noise_image_cursor, const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, - const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) { + const int16x8_t& offset, int luma_multiplier, int chroma_multiplier, + bool restrict_scaling_lookup, int valid_range_pixels = 0) { uint8_t merged_buffer[8]; const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier); const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier); @@ -925,8 +961,12 @@ inline int16x8_t BlendChromaValsNoCfl( // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required. const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4); vst1_u8(merged_buffer, merged); + const int16x8_t scaling = - GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer); + restrict_scaling_lookup + ? 
GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer, + valid_range_pixels) + : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer); int16x8_t noise = GetSignedSource8(noise_image_cursor); noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect); return vaddq_s16(orig, noise); @@ -952,34 +992,28 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON( const int chroma_width = (width + subsampling_x) >> subsampling_x; const int safe_chroma_width = chroma_width & ~7; uint8_t luma_buffer[16]; -#if LIBGAV1_MSAN - // Quiet msan warnings. - memset(luma_buffer, 0, sizeof(luma_buffer)); -#endif const int16x8_t offset = vdupq_n_s16(chroma_offset << 5); start_height >>= subsampling_y; int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; - const int valid_range = width - luma_x; + const int valid_range_chroma_pixels = chroma_width - x; const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]); - const int16x8_t average_luma = vreinterpretq_s16_u16( - GetAverageLumaMsan(&in_y_row[luma_x], subsampling_x, valid_range)); + const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( + &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/false); // In 8bpp, when params_.clip_to_restricted_range == false, we can // replace clipping with vqmovun_s16, but the gain would be small. StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { // Begin right edge iteration. Same as the normal iterations, but the @@ -988,19 +1022,20 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON( const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; - const int valid_range_chroma_bytes = - (chroma_width - x) * sizeof(in_chroma_row[0]); + const int valid_range_chroma_pixels = chroma_width - x; const int16x8_t orig_chroma = - GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes); + GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels); const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]))); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/true, + valid_range_chroma_pixels); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); // End of right edge iteration. 
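Rather than zero-filling scratch buffers to quiet MemorySanitizer, the right-edge path now passes the number of valid lanes into the scaling lookup and only gathers those; the caller likewise limits its loads and stores to that range. The overload added by the diff is essentially the following (the local array is zero-initialized here only so the sketch stands alone; the library leaves the unused lanes untouched and guarantees they are never consumed):

```c++
#include <arm_neon.h>
#include <cstdint>

template <int bitdepth, typename Pixel>
inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
                                   const Pixel* source,
                                   const int valid_range) {
  int16_t start_vals[8] = {0};
  // Gather scaling factors only for the |valid_range| pixels that exist.
  for (int i = 0; i < valid_range; ++i) {
    start_vals[i] = scaling_lut[source[i]];
  }
  return vld1q_s16(start_vals);
}
```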
@@ -1267,7 +1302,8 @@ inline int16x8_t BlendChromaValsNoCfl( const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig, const int16_t* LIBGAV1_RESTRICT noise_image_cursor, const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, - const int32x4_t& offset, int luma_multiplier, int chroma_multiplier) { + const int32x4_t& offset, int luma_multiplier, int chroma_multiplier, + bool restrict_scaling_lookup, int valid_range_pixels = 0) { uint16_t merged_buffer[8]; const int32x4_t weighted_luma_low = vmull_n_s16(vget_low_s16(average_luma), luma_multiplier); @@ -1287,7 +1323,11 @@ inline int16x8_t BlendChromaValsNoCfl( vst1q_u16(merged_buffer, vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel)); const int16x8_t scaling = - GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer); + restrict_scaling_lookup + ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer, + valid_range_pixels) + : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, + merged_buffer); const int16x8_t noise = GetSignedSource8(noise_image_cursor); const int16x8_t scaled_noise = ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect); @@ -1311,11 +1351,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int chroma_width = (width + subsampling_x) >> subsampling_x; const int safe_chroma_width = chroma_width & ~7; uint16_t luma_buffer[16]; -#if LIBGAV1_MSAN - // TODO(b/194217060): This can be removed if the range calculations below are - // fixed. - memset(luma_buffer, 0, sizeof(luma_buffer)); -#endif // Offset is added before downshifting in order to take advantage of // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp. const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2)); @@ -1324,7 +1359,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const int16x8_t average_luma = vreinterpretq_s16_u16( GetAverageLuma(&in_y_row[luma_x], subsampling_x)); @@ -1332,12 +1367,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/false); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { // Begin right edge iteration. 
Same as the normal iterations, but the @@ -1346,19 +1379,22 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; + const int valid_range_chroma_pixels = chroma_width - x; const int valid_range_chroma_bytes = (chroma_width - x) * sizeof(in_chroma_row[0]); const int16x8_t orig_chroma = GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes); const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]))); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/true, + valid_range_chroma_pixels); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); // End of right edge iteration. @@ -1442,10 +1478,8 @@ void Init10bpp() { dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON<kBitdepth10>; - // TODO(b/194442742): reenable this function after segfault under armv7 ASan - // is fixed. - // dsp->film_grain.blend_noise_luma = - // BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>; + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>; dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON; dsp->film_grain.blend_noise_chroma[1] = BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>; diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h index 3ba2eef..09596e2 100644 --- a/src/dsp/arm/film_grain_neon.h +++ b/src/dsp/arm/film_grain_neon.h @@ -39,9 +39,7 @@ void FilmGrainInit_NEON(); #define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON #define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON -// TODO(b/194442742): reenable this function after segfault under armv7 ASan is -// fixed. -// #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc index 3cad4a6..e9bdcf0 100644 --- a/src/dsp/arm/intrapred_directional_neon.cc +++ b/src/dsp/arm/intrapred_directional_neon.cc @@ -505,20 +505,12 @@ inline void DirectionalZone1Blend_WxH( } while (++y < height); } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. 
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - -// 7.11.2.4 (8) 90 < angle > 180 -// The strategy for these functions (4xH and 8+xH) is to know how many blocks -// can be processed with just pixels from |top_ptr|, then handle mixed blocks, -// then handle only blocks that take from |left_ptr|. Additionally, a fast -// index-shuffle approach is used for pred values from |left_column| in sections -// that permit it. +// 7.11.2.4 (8) 90 < angle > 180 +// The strategy for these functions (4xH and 8+xH) is to know how many blocks +// can be processed with just pixels from |top_ptr|, then handle mixed blocks, +// then handle only blocks that take from |left_ptr|. Additionally, a fast +// index-shuffle approach is used for pred values from |left_column| in +// sections that permit it. inline void DirectionalZone2_4xH( uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, @@ -544,13 +536,6 @@ inline void DirectionalZone2_4xH( assert(xstep >= 3); const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4); - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - // TODO(johannkoenig): Revisit this for |width| == 4. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -569,9 +554,9 @@ inline void DirectionalZone2_4xH( // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. if (min_top_only_x > 0) { - // Round down to the nearest multiple of 8. - // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should. - const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep, upsampled_top); @@ -584,18 +569,11 @@ inline void DirectionalZone2_4xH( // All rows from |min_left_only_y| down for this set of columns only need // |left_column| to compute. const int min_left_only_y = std::min((4 << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); int xstep_bounds = xstep_bounds_base + xstep_y; int top_x = -xstep - xstep_y; // +8 increment is OK because if height is 4 this only goes once. - for (; y < left_shuffle_stop_y; + for (; y < min_left_only_y; y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { DirectionalZone2FromLeftCol_WxH<4>( dst, stride, min_height, @@ -607,21 +585,8 @@ inline void DirectionalZone2_4xH( upsample_top_shift); } - // Pick up from the last y-value, using the slower but secure method for - // left prediction. 
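The three-stage strategy described above reduces to two row boundaries per group of columns: rows above max_top_only_y are predicted from top_row alone, rows at or below min_left_only_y from left_column alone, and the band in between blends both. A scalar sketch of that boundary arithmetic, in 1/64-pel steps with x = 0 for the leftmost group; the struct and function names are illustrative:

#include <algorithm>

struct Zone2Bounds {
  int max_top_only_y;   // rows [0, max_top_only_y) use only top_row
  int min_left_only_y;  // rows [min_left_only_y, height) use only left_column
};

// |x| is the left edge of the current column group in pixels, |group_width|
// is 4 or 8, |row_step| is the number of rows handled per pass (4 or 8), and
// |xstep| is the projection step in 1/64-pel units.
inline Zone2Bounds ComputeZone2Bounds(int x, int group_width, int row_step,
                                      int xstep, int height) {
  Zone2Bounds bounds;
  bounds.max_top_only_y =
      std::min(((x + 1) << 6) / xstep, height) & ~(row_step - 1);
  bounds.min_left_only_y = std::min(((x + group_width) << 6) / xstep, height);
  return bounds;
}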
- const int16_t base_left_y = vgetq_lane_s16(left_y, 0); - for (; y < min_left_only_y; - y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_WxH<4>( - dst, stride, min_height, - left_column + ((y - left_base_increment) << upsample_left_shift), - base_left_y, -ystep, upsample_left_shift); - - DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row, - xstep_bounds, top_x, xstep, - upsample_top_shift); - } // Loop over y for left_only rows. + const int16_t base_left_y = vgetq_lane_s16(left_y, 0); for (; y < height; y += 8, dst += stride8) { DirectionalZone3_WxH<4>( dst, stride, min_height, @@ -634,34 +599,88 @@ inline void DirectionalZone2_4xH( } } -// Process a multiple of 8 |width|. -inline void DirectionalZone2_8( +template <bool shuffle_left_column> +inline void DirectionalZone2_8xH( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, - const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, - const int height, const int xstep, const int ystep, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const int xstep_bounds_base, const int16x8_t left_y, const bool upsampled_top, const bool upsampled_left) { const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); - // Helper vector. - const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; - // Loop incrementers for moving by block (8x8). This function handles blocks // with height 4 as well. They are calculated in one pass so these variables // do not get used. const ptrdiff_t stride8 = stride << 3; const int xstep8 = xstep << 3; - const int ystep8 = ystep << 3; - // Process Wx4 blocks. + // Cover 8x4 case. const int min_height = (height == 4) ? 4 : 8; - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute and can therefore call the Zone1 functions. This assumes |xstep| is - // at least 3. - assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); + DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep, + upsampled_top); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. Round up to the nearest 8. 
+ const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + const int16_t base_left_y = vgetq_lane_s16(left_y, 0); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsample_left_shift); + } else { + DirectionalZone3_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep, upsample_left_shift); + } + + DirectionalZone1Blend_WxH<8>( + dst_x, stride, min_height, top_row + (x << upsample_top_shift), + xstep_bounds, top_x, xstep, upsample_top_shift); + } + + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep, upsample_left_shift); + } +} + +// Process a multiple of 8 |width|. +inline void DirectionalZone2_WxH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int xstep, const int ystep, + const bool upsampled_top, const bool upsampled_left) { + const int ystep8 = ystep << 3; // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -677,90 +696,43 @@ inline void DirectionalZone2_8( // left_y vector omits the portion which is covered under the left_column // offset. Following values need the full ystep as a relative offset. const int16x8_t remainder = vdupq_n_s16(-ystep_remainder); + const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep); + // For ystep > 90, at least two sets of 8 columns can be fully computed from + // top_row only. + const int min_top_only_x = std::min((height * xstep) >> 6, width); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); + // This loop treats each set of 4 columns in 3 stages with y-value boundaries. // The first stage, before the first y-loop, covers blocks that are only // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. int x = 0; - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. |d| represents the number of pixels that can - // fit in one contiguous vector when stepping by |ystep|. For a given x - // position, the left column values can be obtained by VTBL as long as the - // values at row[x + d] and beyond come from the top row. However, this does - // not guarantee that the vector will also contain all of the values needed - // from top row. 
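Splitting the column loop at min_shuffle_x lets one templated helper be instantiated twice: once with the always-safe stepwise left-column prediction and once with the single-vector shuffle, so the choice costs nothing at run time. A reduced sketch of that dispatch pattern; ProcessColumnGroup and the two Predict* stand-ins are illustrative, and only the thresholds (ystep < 132, x >= 32) come from the analysis quoted above:

#include <algorithm>

inline void PredictLeftStepwise(int /*x*/) { /* safe path: one tap per step */ }
inline void PredictLeftByShuffle(int /*x*/) { /* fast path: taps in one vector */ }

template <bool use_shuffle>
void ProcessColumnGroup(int x) {
  if (use_shuffle) {
    PredictLeftByShuffle(x);
  } else {
    PredictLeftStepwise(x);
  }
}

void ProcessMixedColumns(int width, int height, int xstep, int ystep) {
  const int min_top_only_x = std::min((height * xstep) >> 6, width);
  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
  int x = 0;
  for (; x < min_shuffle_x; x += 8) ProcessColumnGroup<false>(x);
  for (; x < min_top_only_x; x += 8) ProcessColumnGroup<true>(x);
  // Columns at or beyond min_top_only_x are pure Zone 1 (top_row only).
}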
- const int d = 16 / ((ystep >> 6) + 1); + for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height, + xstep, ystep, x, left_offset, xstep_bounds_base, + left_y, upsampled_top, upsampled_left); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - const int max_shuffle_height = - std::min(((x + d) << 6) / xstep, height) & ~7; - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, - top_row + (x << upsample_top_shift), -xstep, - upsampled_top); - - if (max_top_only_y == height) continue; - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - - // All rows from |min_left_only_y| down for this set of columns only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - int xstep_bounds = xstep_bounds_base + xstep_y; - int top_x = -xstep - xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone2FromLeftCol_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), left_y, - upsample_left_shift); - - DirectionalZone1Blend_WxH<8>( - dst_x, stride, min_height, top_row + (x << upsample_top_shift), - xstep_bounds, top_x, xstep, upsample_top_shift); - } - - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - const int16_t base_left_y = vgetq_lane_s16(left_y, 0); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep, upsample_left_shift); - - DirectionalZone1Blend_WxH<8>( - dst_x, stride, min_height, top_row + (x << upsample_top_shift), - xstep_bounds, top_x, xstep, upsample_top_shift); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep, upsample_left_shift); - } + DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep, + ystep, x, left_offset, xstep_bounds_base, left_y, + upsampled_top, upsampled_left); } - // TODO(johannkoenig): May be able to remove this branch. 
if (x < width) { + const int upsample_top_shift = static_cast<int>(upsampled_top); DirectionalZone1_WxH(dst + x, stride, width - x, height, top_row + (x << upsample_top_shift), -xstep, upsampled_top); @@ -792,8 +764,8 @@ void DirectionalIntraPredictorZone2_NEON( DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep, upsampled_top, upsampled_left); } else { - DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep, - ystep, upsampled_top, upsampled_left); + DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep, + ystep, upsampled_top, upsampled_left); } } @@ -935,6 +907,16 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, return vrshrq_n_u16(sum, 5 /*log2(32)*/); } +// Blend two values based on weights that sum to 32. +inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, + const uint16x8_t a_weight, + const uint16x8_t b_weight) { + const uint16x8_t a_product = vmulq_u16(a, a_weight); + const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); + + return vrshrq_n_u16(sum, 5 /*log2(32)*/); +} + // Each element of |dest| contains values associated with one weight value. inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* LIBGAV1_RESTRICT const source, @@ -959,6 +941,24 @@ inline void LoadEdgeVals(uint16x8x2_t* dest, } } +// For Wx4 blocks, load the source for 2 columns. The source for the second +// column is held in the high half of each vector. +inline void LoadEdgeVals2x4(uint16x8x2_t* dest, + const uint16_t* LIBGAV1_RESTRICT const source_low, + const uint16_t* LIBGAV1_RESTRICT const source_high, + const bool upsampled) { + if (upsampled) { + const uint16x4x2_t low = vld2_u16(source_low); + const uint16x4x2_t high = vld2_u16(source_high); + dest->val[0] = vcombine_u16(low.val[0], high.val[0]); + dest->val[1] = vcombine_u16(low.val[1], high.val[1]); + } else { + dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high)); + dest->val[1] = + vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1)); + } +} + template <bool upsampled> inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, @@ -1286,18 +1286,162 @@ inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst, } template <bool upsampled> +inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + const uint16x8_t inverter = vdupq_n_u16(32); + + uint16x8x2_t sampled_left_col; + // Compute two columns at a time, then transpose for storage. + uint16x8_t result[4]; + + // The low half of pre-transpose vectors contains columns 0 through 3. + int left_y_low = base_left_y + ystep; + int left_offset_low = left_y_low >> index_scale_bits; + int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + // The high half of pre-transpose vectors contains columns 4 through 7. 
+ int left_y_high = left_y_low + (ystep << 2); + int left_offset_high = left_y_high >> index_scale_bits; + int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + uint16x8_t weights_0 = + vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + uint16x8_t weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_y_high += ystep; + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_high += ystep; + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_y_high += ystep; + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + Transpose4x8(result); + Store8(dst, result[0]); + dst += stride; + Store8(dst, result[1]); + dst += stride; + Store8(dst, result[2]); + dst += stride; + Store8(dst, result[3]); +} + +template <bool upsampled> +inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. 
+ uint16x8_t result[4]; + + int left_y = base_left_y + ystep; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + uint16x8x2_t sampled_left_col; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose4x8(result); + Store4(dst, vget_low_u16(result[0])); + dst += stride; + Store4(dst, vget_low_u16(result[1])); + dst += stride; + Store4(dst, vget_low_u16(result[2])); + dst += stride; + Store4(dst, vget_low_u16(result[3])); + dst += stride; + Store4(dst, vget_high_u16(result[0])); + dst += stride; + Store4(dst, vget_high_u16(result[1])); + dst += stride; + Store4(dst, vget_high_u16(result[2])); + dst += stride; + Store4(dst, vget_high_u16(result[3])); +} + +template <bool upsampled> inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { + assert(height == 8 || height == 16); const int upsample_shift = static_cast<int>(upsampled); - int y = 0; - do { - DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift), + DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep); + if (height == 16) { + dest += stride << 3; + DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift), ystep); - dest += 4 * stride; - y += 4; - } while (y < height); + } } template <bool upsampled> @@ -1305,16 +1449,17 @@ inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int width, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { - int x = 0; - int base_left_y = 0; - do { - // TODO(petersonab): Establish 8x4 transpose to reserve this function for - // 8x4 and 16x4. 
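Each output pixel produced by the Zone 3 kernels above is a two-tap blend of neighboring left-column samples using 5-bit weights that sum to 32, derived from the low six bits of the accumulated ystep position. A scalar reference for a single pixel, consistent with the LoadEdgeVals/WeightedBlend code above; PredictZone3Pixel is an illustrative name:

#include <cstdint>

// |accumulated_y| is the running left_y value in 1/64-pel units.
inline uint16_t PredictZone3Pixel(const uint16_t* left, int accumulated_y,
                                  int upsample_shift) {
  const int offset = accumulated_y >> (6 - upsample_shift);
  const int w0 = ((accumulated_y << upsample_shift) & 0x3F) >> 1;  // 0..31
  const int w1 = 32 - w0;                                          // weights sum to 32
  // Same rounding as vrshrq_n_u16(sum, 5): add half of 32 before shifting.
  return static_cast<uint16_t>(
      (left[offset] * w1 + left[offset + 1] * w0 + 16) >> 5);
}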
- DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep, - base_left_y); - base_left_y += 4 * ystep; - x += 4; - } while (x < width); + assert(width <= 16); + if (width == 4) { + DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep); + return; + } + DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep); + if (width == 16) { + const int base_left_y = ystep << 3; + DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left, + ystep, base_left_y); + } } template <bool upsampled> @@ -1460,17 +1605,17 @@ void DirectionalIntraPredictorZone3_NEON( } while (y != 0); return; } - if (width == 4) { + if (height == 4) { if (upsampled_left) { - DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); + DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); } else { - DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); + DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); } - } else if (height == 4) { + } else if (width == 4) { if (upsampled_left) { - DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); + DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); } else { - DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); + DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); } } else { if (upsampled_left) { @@ -1532,16 +1677,6 @@ inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, return vrshr_n_u16(sum, 5 /*log2(32)*/); } -// Blend two values based on weight pairs that each sum to 32. -inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, - const uint16x8_t a_weight, - const uint16x8_t b_weight) { - const uint16x8_t a_product = vmulq_u16(a, a_weight); - const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); - - return vrshrq_n_u16(sum, 5 /*log2(32)*/); -} - // Because the source values "move backwards" as the row index increases, the // indices derived from ystep are generally negative in localized functions. 
// This is accommodated by making sure the relative indices are within [-15, 0] @@ -1608,8 +1743,8 @@ inline void DirectionalZone2FromLeftCol_4xH( } while (++y < height); } -inline void DirectionalZone2FromLeftCol_8xH( - uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, +inline void DirectionalZone2FromLeftCol_8x8( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y, const bool upsampled) { const int upsample_shift = static_cast<int>(upsampled); @@ -1653,8 +1788,7 @@ inline void DirectionalZone2FromLeftCol_8xH( vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1)); const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0); - int y = 0; - do { + for (int y = 0; y < 8; ++y) { uint16x8_t src_left, src_right; LoadStepwise( left_column - kPositiveIndexOffsetPixels + (y << upsample_shift), @@ -1664,7 +1798,7 @@ inline void DirectionalZone2FromLeftCol_8xH( Store8(dst, val); dst += stride; - } while (++y < height); + } } template <bool upsampled> @@ -1704,8 +1838,8 @@ inline void DirectionalZone1Blend_4xH( } template <bool upsampled> -inline void DirectionalZone1Blend_8xH( - uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, +inline void DirectionalZone1Blend_8x8( + uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x, const int xstep) { const int upsample_shift = static_cast<int>(upsampled); @@ -1716,8 +1850,7 @@ inline void DirectionalZone1Blend_8xH( const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7}; uint16x8x2_t top_vals; - int y = height; - do { + for (int y = 0; y < 8; ++y) { const uint16_t* const src = top_row + (top_x >> scale_bits_x); LoadEdgeVals(&top_vals, src, upsampled); @@ -1736,20 +1869,9 @@ inline void DirectionalZone1Blend_8xH( dest += stride; zone_bounds += xstep; top_x -= xstep; - } while (--y != 0); + } } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices -// that do not correspond to angle derivatives are left at zero. -// Notably, in cases with upsampling, the shuffle-invalid height is always -// greater than the prediction height (which is 8 at maximum). -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - // 7.11.2.4 (8) 90 < angle > 180 // The strategy for these functions (4xH and 8+xH) is to know how many blocks // can be processed with just pixels from |top_ptr|, then handle mixed blocks, @@ -1796,9 +1918,9 @@ inline void DirectionalZone2_4xH( // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. - // Round down to the nearest multiple of 8. - // TODO(petersonab): Check if rounding to the nearest 4 is okay. - const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + // Round down to the nearest multiple of 8 (or 4, if height is 4). 
+ const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst), stride >> 1, max_top_only_y, top_row, -xstep); @@ -1827,12 +1949,15 @@ inline void DirectionalZone2_4xH( xstep_bounds, top_x, xstep); } - // Loop over y for left-only rows. - for (; y < height; y += 8, dst += stride8) { - // Angle expected by Zone3 is flipped about the 180 degree vector, which - // is the x-axis. + // Left-only section. |height| - |y| is assumed equivalent to: + // (y == 0) && (height == 4) + if (height - y == 4) { + DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep); + return; + } + if (y < height) { DirectionalZone3_4xH<upsampled_left>( - dst, stride, min_height, left_column + (y << upsample_left_shift), + dst, stride, height - y, left_column + (y << upsample_left_shift), -ystep); } } @@ -1882,9 +2007,75 @@ inline void DirectionalZone2_Wx4( } } +template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left> +inline void DirectionalZone2_8xH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const top_row, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const int xstep_bounds_base, const int16x8_t left_y) { + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + + // Loop incrementers for moving by block (8x8). This function handles blocks + // with height 4 as well. They are calculated in one pass so these variables + // do not get used. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x * sizeof(uint16_t); + // Round down to the nearest multiple of 8. + const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; + DirectionalZone1_WxH<upsampled_top>( + reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. Round up to the nearest 8. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_8x8( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsampled_left); + } else { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } + + DirectionalZone1Blend_8x8<upsampled_top>( + dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x, + xstep); + } + + // Loop over y for left_only rows. 
+ for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } +} + // Process a multiple of 8 |width|. template <bool upsampled_top, bool upsampled_left> -inline void DirectionalZone2_8( +inline void DirectionalZone2_NEON( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const top_row, const uint16_t* LIBGAV1_RESTRICT const left_column, const int width, @@ -1894,30 +2085,24 @@ inline void DirectionalZone2_8( dst, stride, top_row, left_column, width, xstep, ystep); return; } - const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); // Helper vector. const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; - // Loop increments for moving by block (8x8). This function handles blocks - // with height 4 as well. They are calculated in one pass so these variables - // do not get used. - const ptrdiff_t stride8 = stride << 3; - const int xstep8 = xstep << 3; const int ystep8 = ystep << 3; // All columns from |min_top_only_x| to the right will only need |top_row| to // compute and can therefore call the Zone1 functions. This assumes |xstep| is // at least 3. assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); - - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); + const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -1935,73 +2120,22 @@ inline void DirectionalZone2_8( int16x8_t left_y = vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep); - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. int x = 0; + for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false, upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_bounds_base, left_y); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x * sizeof(uint16_t); - - // Round down to the nearest multiple of 8. 
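Two rounding idioms recur in these hunks: rounding a row count down to a multiple of the 4- or 8-row pass (h & ~(m - 1)) and rounding a column bound up to the next multiple of 8, which is what the Align call above does. Both rely on the multiple being a power of two; a tiny self-checking sketch with illustrative names:

// Round |v| down or up to a multiple of |m|, where |m| is a power of two.
constexpr int RoundDownPow2(int v, int m) { return v & ~(m - 1); }
constexpr int RoundUpPow2(int v, int m) { return (v + m - 1) & ~(m - 1); }

static_assert(RoundDownPow2(13, 8) == 8, "round 13 down to a multiple of 8");
static_assert(RoundDownPow2(13, 4) == 12, "round 13 down to a multiple of 4");
static_assert(RoundUpPow2(13, 8) == 16, "round 13 up to a multiple of 8");
static_assert(RoundUpPow2(16, 8) == 16, "aligned values are unchanged");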
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_WxH<upsampled_top>( - reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, - top_row + (x << upsample_top_shift), -xstep); - - if (max_top_only_y == height) continue; - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - - // All rows from |min_left_only_y| down for this set of columns only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - int xstep_bounds = xstep_bounds_base + xstep_y; - int top_x = -xstep - xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone2FromLeftCol_8xH( - dst_x, stride, 8, - left_column + ((left_offset + y) << upsample_left_shift), left_y, - upsample_left_shift); - - DirectionalZone1Blend_8xH<upsampled_top>( - dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, - top_x, xstep); - } - - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - for (; y < min_left_only_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_8x8<upsampled_left>( - dst_x, stride, left_column + (y << upsample_left_shift), -ystep, - -ystep * x); - - DirectionalZone1Blend_8xH<upsampled_top>( - dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, - top_x, xstep); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8x8<upsampled_left>( - dst_x, stride, left_column + (y << upsample_left_shift), -ystep, - -ystep * x); - } + DirectionalZone2_8xH<true, upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_bounds_base, left_y); } // Reached |min_top_only_x|. 
if (x < width) { @@ -2129,18 +2263,18 @@ void DirectionalIntraPredictorZone2_NEON( } if (upsampled_top) { if (upsampled_left) { - DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } else { - DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } } else if (upsampled_left) { - DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } else { - DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } } diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc index cd47a22..d1adbdf 100644 --- a/src/dsp/arm/intrapred_neon.cc +++ b/src/dsp/arm/intrapred_neon.cc @@ -407,13 +407,9 @@ inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest, inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist, const uint16x8_t top_left_dist_low, const uint16x8_t top_left_dist_high) { - // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of - // using movl(x_dist). - const uint8x8_t x_le_top_left_low = - vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low)); - const uint8x8_t x_le_top_left_high = - vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high)); - return vcombine_u8(x_le_top_left_low, x_le_top_left_high); + const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low), + vqmovn_u16(top_left_dist_high)); + return vcleq_u8(x_dist, top_left_dist); } // Select the closest values and collect them. 
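The Paeth change above drops the 8-to-16-bit widening of x_dist and instead narrows the top-left distances with a saturating vqmovn_u16 before an 8-bit vcleq_u8 compare. The result is unchanged because x_dist lanes already fit in 8 bits, so x <= min(d, 255) and x <= d always agree. A scalar model that checks the identity exhaustively; XLeTopLeftScalar is an illustrative name:

#include <cassert>
#include <cstdint>

// x is an 8-bit distance, d a 16-bit distance that may exceed 255.
inline bool XLeTopLeftScalar(uint8_t x, uint16_t d) {
  const uint8_t d8 = static_cast<uint8_t>(d < 255 ? d : 255);  // vqmovn_u16
  return x <= d8;                                              // vcleq_u8
}

int main() {
  for (int x = 0; x < 256; ++x) {
    for (int d = 0; d < 1024; ++d) {
      assert(XLeTopLeftScalar(static_cast<uint8_t>(x),
                              static_cast<uint16_t>(d)) == (x <= d));
    }
  }
  return 0;
}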
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc index bcda131..d6c1450 100644 --- a/src/dsp/arm/intrapred_smooth_neon.cc +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -31,7 +31,6 @@ namespace libgav1 { namespace dsp { - namespace low_bitdepth { namespace { @@ -42,20 +41,15 @@ constexpr uint8_t kSmoothWeights[] = { #include "src/dsp/smooth_weights.inc" }; -inline uint16x4_t CalculatePred(const uint16x4_t weighted_top, - const uint16x4_t weighted_left, - const uint16x4_t weighted_bl, - const uint16x4_t weighted_tr) { - const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left); - const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr); - const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1); - return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1); +// 256 - v = vneg_s8(v) +inline uint8x8_t NegateS8(const uint8x8_t v) { + return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v))); } template <int height> -inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { constexpr int width = 4; const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); @@ -68,47 +62,49 @@ inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v))); - const uint16x4_t weighted_bl = - vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v)); - - const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v)); - const uint16x4_t weighted_left = - vget_low_u16(vmull_u8(weights_x_v, left_v)); - const uint16x4_t weighted_tr = - vget_low_u16(vmull_u8(scaled_weights_x, top_right_v)); - const uint16x4_t result = - CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); - - StoreLo4(dst, vmovn_u16(vcombine_u16(result, result))); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_bl, weights_y_v, top_v); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x_v, left_v); + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale); + + StoreLo4(dst, result); dst += stride; } } -inline uint8x8_t CalculatePred(const uint16x8_t weighted_top, - const uint16x8_t weighted_left, - const uint16x8_t weighted_bl, - const uint16x8_t weighted_tr) { - // Maximum value: 0xFF00 - const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl); - // Maximum 
value: 0xFF00 - const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr); - const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1); - return vrshrn_n_u16(pred_2, kSmoothWeightScale); +inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl, + const uint16x8_t weighted_left_tr) { + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + return vrshrn_n_u16(avg, kSmoothWeightScale); +} + +inline uint8x8_t CalculateWeightsAndPred( + const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr, + const uint8x8_t bottom_left, const uint8x8_t weights_x, + const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) { + const uint16x8_t weighted_top = vmull_u8(weights_y, top); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left); + const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left); + return CalculatePred(weighted_top_bl, weighted_left_tr); } template <int height> -inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { constexpr int width = 8; const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); @@ -121,21 +117,16 @@ inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); - const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); - - const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); - const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v); - const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint8x8_t result = - CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); + CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v, + weights_x_v, scaled_weights_y, weights_y_v); vst1_u8(dst, result); dst += stride; @@ -146,28 +137,34 @@ inline uint8x16_t CalculateWeightsAndPred( const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right, const uint8x8_t weights_y, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) { - const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); + const uint16x8_t weighted_top_bl_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); - const uint16x8_t weighted_tr_low = - vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint8x8_t result_low = CalculatePred( - weighted_top_low, weighted_left_low, 
weighted_bl, weighted_tr_low); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t result_low = + CalculatePred(weighted_top_bl_low, weighted_left_tr_low); - const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); + const uint16x8_t weighted_top_bl_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); - const uint16x8_t weighted_tr_high = - vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint8x8_t result_high = CalculatePred( - weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); + const uint8x8_t result_high = + CalculatePred(weighted_top_bl_high, weighted_left_tr_high); return vcombine_u8(result_low, result_high); } +// 256 - v = vneg_s8(v) +inline uint8x16_t NegateS8(const uint8x16_t v) { + return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v))); +} + template <int width, int height> -inline void Smooth16PlusxN_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t top_right = top[width - 1]; @@ -188,9 +185,6 @@ inline void Smooth16PlusxN_NEON( const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); - // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop. - // This currently has a performance slope similar to Paeth so it does not - // appear to be register bound for arm64. 
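The 8-bit smooth rewrite above leans on three facts: 256 - w equals the two's-complement byte negation of w for w in [1, 255] (NegateS8), each pair of products folds into one multiply-accumulate (vmull_u8 plus vmlal_u8), and since each partial sum is at most 0xFF00 a halving add followed by a rounding narrowing shift by kSmoothWeightScale (8 for the 256-scale weights) produces the final pixel without overflow. A scalar model of one predicted pixel that mirrors that accumulation order; SmoothPixel is an illustrative name:

#include <cstdint>

// 256 - w and the two's-complement byte negation agree for w in [1, 255],
// which is why NegateS8 can replace vdup_n_u8(256 - w).
static_assert(static_cast<uint8_t>(-37) == 256 - 37, "byte negation identity");

inline uint8_t SmoothPixel(uint8_t top, uint8_t left, uint8_t top_right,
                           uint8_t bottom_left, uint8_t w_x, uint8_t w_y) {
  const uint32_t top_bl = w_y * top + (256 - w_y) * bottom_left;  // <= 0xFF00
  const uint32_t left_tr = w_x * left + (256 - w_x) * top_right;  // <= 0xFF00
  const uint32_t avg = (top_bl + left_tr) >> 1;                   // vhaddq_u16
  return static_cast<uint8_t>((avg + 128) >> 8);                  // vrshrn_n_u16(.., 8)
}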
uint8x16_t weights_x_v[4]; weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4); if (width > 16) { @@ -202,23 +196,19 @@ inline void Smooth16PlusxN_NEON( } uint8x16_t scaled_weights_x[4]; - scaled_weights_x[0] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0]))); + scaled_weights_x[0] = NegateS8(weights_x_v[0]); if (width > 16) { - scaled_weights_x[1] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1]))); + scaled_weights_x[1] = NegateS8(weights_x_v[1]); if (width == 64) { - scaled_weights_x[2] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2]))); - scaled_weights_x[3] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3]))); + scaled_weights_x[2] = NegateS8(weights_x_v[2]); + scaled_weights_x[3] = NegateS8(weights_x_v[3]); } } for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v, @@ -246,10 +236,10 @@ inline void Smooth16PlusxN_NEON( } template <int width, int height> -inline void SmoothVertical4Or8xN_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t bottom_left = left[height - 1]; @@ -267,17 +257,17 @@ inline void SmoothVertical4Or8xN_NEON( for (int y = 0; y < height; ++y) { const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); - const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); - const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl); - const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); + const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale); if (width == 4) { - StoreLo4(dst, pred_scaled); + StoreLo4(dst, pred); } else { // width == 8 - vst1_u8(dst, pred_scaled); + vst1_u8(dst, pred); } dst += stride; } @@ -286,10 +276,10 @@ inline void SmoothVertical4Or8xN_NEON( inline uint8x16_t CalculateVerticalWeightsAndPred( const uint8x16_t top, const uint8x8_t weights_y, const uint16x8_t weighted_bl) { - const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); - const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); - const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl); - const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl); + const uint16x8_t pred_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); + const uint16x8_t pred_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); const uint8x8_t pred_scaled_high = vrshrn_n_u16(pred_high, 
kSmoothWeightScale); @@ -297,7 +287,7 @@ inline uint8x16_t CalculateVerticalWeightsAndPred( } template <int width, int height> -inline void SmoothVertical16PlusxN_NEON( +void SmoothVertical16PlusxN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -321,7 +311,7 @@ inline void SmoothVertical16PlusxN_NEON( for (int y = 0; y < height; ++y) { const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); const uint8x16_t pred_0 = @@ -349,7 +339,7 @@ inline void SmoothVertical16PlusxN_NEON( } template <int width, int height> -inline void SmoothHorizontal4Or8xN_NEON( +void SmoothHorizontal4Or8xN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -361,22 +351,19 @@ inline void SmoothHorizontal4Or8xN_NEON( const uint8x8_t top_right_v = vdup_n_u8(top_right); // Over-reads for 4xN but still within the array. const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); - - const uint16x8_t weighted_left = vmull_u8(weights_x, left_v); - const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); - const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr); - const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x, left_v); + const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale); if (width == 4) { - StoreLo4(dst, pred_scaled); + StoreLo4(dst, pred); } else { // width == 8 - vst1_u8(dst, pred_scaled); + vst1_u8(dst, pred); } dst += stride; } @@ -386,23 +373,22 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred( const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x) { const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); - const uint16x8_t weighted_tr_low = - vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low); - const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t pred_scaled_low = + vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); - const uint16x8_t weighted_tr_high = - vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); const uint8x8_t pred_scaled_high = - vrshrn_n_u16(pred_high, kSmoothWeightScale); + vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale); return vcombine_u8(pred_scaled_low, pred_scaled_high); } template <int width, int height> 
-inline void SmoothHorizontal16PlusxN_NEON( +void SmoothHorizontal16PlusxN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -424,16 +410,12 @@ inline void SmoothHorizontal16PlusxN_NEON( } uint8x16_t scaled_weights_x[4]; - scaled_weights_x[0] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0]))); + scaled_weights_x[0] = NegateS8(weights_x[0]); if (width > 16) { - scaled_weights_x[1] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1]))); + scaled_weights_x[1] = NegateS8(weights_x[1]); if (width == 64) { - scaled_weights_x[2] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2]))); - scaled_weights_x[3] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3]))); + scaled_weights_x[2] = NegateS8(weights_x[2]); + scaled_weights_x[3] = NegateS8(weights_x[3]); } } @@ -633,10 +615,15 @@ constexpr uint16_t kSmoothWeights[] = { #include "src/dsp/smooth_weights.inc" }; +// 256 - v = vneg_s8(v) +inline uint16x4_t NegateS8(const uint16x4_t v) { + return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v))); +} + template <int height> -inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[3]; @@ -647,9 +634,7 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint16x4_t top_v = vld1_u16(top); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights); - const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v); - - // Weighted top right doesn't change with each row. + const uint16x4_t scaled_weights_x = NegateS8(weights_x_v); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { @@ -670,10 +655,10 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, // Common code between 8xH and [16|32|64]xH. inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, - const uint32x4_t& weighted_corners_low, - const uint32x4_t& weighted_corners_high, - const uint16x4x2_t& top_vals, - const uint16x4x2_t& weights_x, const uint16_t left_y, + const uint32x4_t weighted_corners_low, + const uint32x4_t weighted_corners_high, + const uint16x4x2_t top_vals, + const uint16x4x2_t weights_x, const uint16_t left_y, const uint16_t weight_y) { // Each variable in the running summation is named for the last item to be // accumulated. 
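The 10-bit NegateS8 above applies the same byte-negation trick to 16-bit lanes: for a weight w in [1, 255], the low byte of 256 - w is the byte negation of w and the high byte stays zero, so reinterpreting the lanes as int8 and negating reproduces 256 - w. A scalar stand-in for the vreinterpret/vneg_s8 pair with an exhaustive check over that range; NegateBytes is an illustrative name:

#include <cassert>
#include <cstdint>

// Byte-wise negation of a 16-bit lane, as done by
// vreinterpret_u16_s8 + vneg_s8 in NegateS8(uint16x4_t) above.
inline uint16_t NegateBytes(uint16_t w) {
  const uint8_t lo = static_cast<uint8_t>(-static_cast<uint8_t>(w & 0xFF));
  const uint8_t hi = static_cast<uint8_t>(-static_cast<uint8_t>(w >> 8));
  return static_cast<uint16_t>((hi << 8) | lo);
}

int main() {
  // 256 - w == NegateBytes(w) for w in [1, 255], which covers every entry of
  // the smooth weight table.
  for (int w = 1; w <= 255; ++w) {
    assert(NegateBytes(static_cast<uint16_t>(w)) == 256 - w);
  }
  return 0;
}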
@@ -697,9 +682,9 @@ inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, } template <int height> -inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[7]; @@ -712,14 +697,12 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4), vld1_u16(kSmoothWeights + 8)}; - // Weighted top right doesn't change with each row. const uint32x4_t weighted_tr_low = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + vmull_n_u16(NegateS8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + vmull_n_u16(NegateS8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_corners_low = @@ -735,9 +718,9 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, // For width 16 and above. template <int width, int height> -inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[width - 1]; @@ -746,23 +729,19 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, auto* dst = static_cast<uint8_t*>(dest); - const uint16x4_t weight_scaling = vdup_n_u16(256); // Precompute weighted values that don't vary with |y|. uint32x4_t weighted_tr_low[width >> 3]; uint32x4_t weighted_tr_high[width >> 3]; for (int i = 0; i < width >> 3; ++i) { const int x = i << 3; const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x); - weighted_tr_low[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right); + weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right); const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x); - weighted_tr_high[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right); + weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right); } const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. 
const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); auto* dst_x = reinterpret_cast<uint16_t*>(dst); @@ -785,10 +764,9 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, } template <int height> -inline void SmoothVertical4xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -812,10 +790,10 @@ inline void SmoothVertical4xH_NEON( } template <int height> -inline void SmoothVertical8xH_NEON( - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -829,7 +807,6 @@ inline void SmoothVertical8xH_NEON( for (int y = 0; y < height; ++y) { auto* dst16 = reinterpret_cast<uint16_t*>(dst); - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); @@ -846,10 +823,10 @@ inline void SmoothVertical8xH_NEON( // For width 16 and above. template <int width, int height> -inline void SmoothVerticalWxH_NEON( - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -865,7 +842,6 @@ inline void SmoothVerticalWxH_NEON( const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. 
const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); @@ -885,10 +861,10 @@ inline void SmoothVerticalWxH_NEON( } template <int height> -inline void SmoothHorizontal4xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[3]; @@ -896,7 +872,7 @@ inline void SmoothHorizontal4xH_NEON( auto* dst = static_cast<uint8_t*>(dest); const uint16x4_t weights_x = vld1_u16(kSmoothWeights); - const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x); + const uint16x4_t scaled_weights_x = NegateS8(weights_x); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { @@ -909,10 +885,10 @@ inline void SmoothHorizontal4xH_NEON( } template <int height> -inline void SmoothHorizontal8xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[7]; @@ -923,9 +899,9 @@ inline void SmoothHorizontal8xH_NEON( vld1_u16(kSmoothWeights + 8)}; const uint32x4_t weighted_tr_low = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + vmull_n_u16(NegateS8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + vmull_n_u16(NegateS8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { auto* dst16 = reinterpret_cast<uint16_t*>(dst); @@ -943,18 +919,16 @@ inline void SmoothHorizontal8xH_NEON( // For width 16 and above. 
template <int width, int height> -inline void SmoothHorizontalWxH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[width - 1]; auto* dst = static_cast<uint8_t*>(dest); - const uint16x4_t weight_scaling = vdup_n_u16(256); - uint16x4_t weights_x_low[width >> 3]; uint16x4_t weights_x_high[width >> 3]; uint32x4_t weighted_tr_low[width >> 3]; @@ -962,11 +936,9 @@ inline void SmoothHorizontalWxH_NEON( for (int i = 0; i < width >> 3; ++i) { const int x = i << 3; weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x); - weighted_tr_low[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right); + weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right); weights_x_high[i] = vld1_u16(kSmoothWeights + width + x); - weighted_tr_high[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right); + weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right); } for (int y = 0; y < height; ++y) { @@ -1141,6 +1113,7 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = SmoothHorizontalWxH_NEON<64, 64>; } + } // namespace } // namespace high_bitdepth #endif // LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc index 617accc..e6f0d9d 100644 --- a/src/dsp/arm/inverse_transform_10bit_neon.cc +++ b/src/dsp/arm/inverse_transform_10bit_neon.cc @@ -282,9 +282,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row, const int32x4_t max = vdupq_n_s32((1 << range) - 1); int32x4_t s[4], x[4]; - LoadSrc<4>(dst, step, 0, x); if (is_row) { - Transpose4x4(x, x); + assert(step == 4); + int32x4x4_t y = vld4q_s32(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; + } else { + LoadSrc<4>(dst, step, 0, x); } // stage 1. @@ -301,9 +304,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row, for (auto& i : s) { i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift))); } - Transpose4x4(s, s); + int32x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = s[i]; + vst4q_s32(dst, y); + } else { + StoreDst<4>(dst, step, 0, s); } - StoreDst<4>(dst, step, 0, s); } template <ButterflyRotationFunc butterfly_rotation, @@ -937,9 +943,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row, int32x4_t s[8]; int32x4_t x[4]; - LoadSrc<4>(dst, step, 0, x); if (is_row) { - Transpose4x4(x, x); + assert(step == 4); + int32x4x4_t y = vld4q_s32(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; + } else { + LoadSrc<4>(dst, step, 0, x); } // stage 1. 
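Annotation: in the Dct4/Adst4 row paths above, the separate LoadSrc plus Transpose4x4 step is replaced by a single de-interleaving vld4q_s32, which is valid because the row transform asserts step == 4, so the 4x4 block is contiguous in memory. The sketch below only demonstrates why a 4-way de-interleaving load of a contiguous 4x4 block is its transpose; EmulateVld4 is a stand-in written for this note, not a libgav1 helper.

// Scalar demonstration that a 4-way de-interleaving load of a contiguous
// 4x4 block is the transpose of that block, which is why the row paths
// above can drop Transpose4x4 in favor of vld4q_s32 when step == 4.
#include <cassert>
#include <cstdint>

// Like vld4q_s32: lane j of output vector i comes from src[4 * j + i].
void EmulateVld4(const int32_t src[16], int32_t out[4][4]) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) out[i][j] = src[4 * j + i];
  }
}

int main() {
  // A 4x4 block stored row-major with stride 4: block[4 * row + col].
  int32_t block[16];
  for (int i = 0; i < 16; ++i) block[i] = i * 11 - 30;

  int32_t deinterleaved[4][4];
  EmulateVld4(block, deinterleaved);

  // Compare against the reference transpose of the same block.
  for (int row = 0; row < 4; ++row) {
    for (int col = 0; col < 4; ++col) {
      const int32_t transposed = block[4 * col + row];
      assert(deinterleaved[row][col] == transposed);
    }
  }
  return 0;
}

The store side is symmetric: vst4q_s32 re-interleaves the four vectors on the way out, so the explicit output transpose disappears as well.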
@@ -981,9 +990,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row, x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift))); x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift))); x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift))); - Transpose4x4(x, x); + int32x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = x[i]; + vst4q_s32(dst, y); + } else { + StoreDst<4>(dst, step, 0, x); } - StoreDst<4>(dst, step, 0, x); } alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344, diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc index 1c2e111..452f14a 100644 --- a/src/dsp/arm/inverse_transform_neon.cc +++ b/src/dsp/arm/inverse_transform_neon.cc @@ -41,50 +41,6 @@ namespace { //------------------------------------------------------------------------------ -// TODO(slavarnway): Move transpose functions to transpose_neon.h or -// common_neon.h. - -LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4], - int16x8_t out[4]) { - // Swap 16 bit elements. Goes from: - // a0: 00 01 02 03 - // a1: 10 11 12 13 - // a2: 20 21 22 23 - // a3: 30 31 32 33 - // to: - // b0.val[0]: 00 10 02 12 - // b0.val[1]: 01 11 03 13 - // b1.val[0]: 20 30 22 32 - // b1.val[1]: 21 31 23 33 - const int16x4_t a0 = vget_low_s16(in[0]); - const int16x4_t a1 = vget_low_s16(in[1]); - const int16x4_t a2 = vget_low_s16(in[2]); - const int16x4_t a3 = vget_low_s16(in[3]); - - const int16x4x2_t b0 = vtrn_s16(a0, a1); - const int16x4x2_t b1 = vtrn_s16(a2, a3); - - // Swap 32 bit elements resulting in: - // c0.val[0]: 00 10 20 30 04 14 24 34 - // c0.val[1]: 02 12 22 32 06 16 26 36 - // c1.val[0]: 01 11 21 31 05 15 25 35 - // c1.val[1]: 03 13 23 33 07 17 27 37 - const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), - vreinterpret_s32_s16(b1.val[0])); - const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), - vreinterpret_s32_s16(b1.val[1])); - - const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]); - const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]); - const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]); - const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]); - - out[0] = vcombine_s16(d0, d0); - out[1] = vcombine_s16(d1, d1); - out[2] = vcombine_s16(d2, d2); - out[3] = vcombine_s16(d3, d3); -} - // Note this is only used in the final stage of Dct32/64 and Adst16 as the in // place version causes additional stack usage with clang. 
LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8], @@ -580,16 +536,19 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) { if (stage_is_rectangular) { if (transpose) { - int16x8_t input[8]; - LoadSrc<8, 8>(dst, step, 0, input); - Transpose4x8To8x4(input, x); + assert(step == 4); + int16x8x4_t y = vld4q_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; } else { LoadSrc<16, 4>(dst, step, 0, x); } } else { - LoadSrc<8, 4>(dst, step, 0, x); if (transpose) { - Transpose4x4(x, x); + assert(step == 4); + int16x4x4_t y = vld4_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = vcombine_s16(y.val[i], y.val[i]); + } else { + LoadSrc<8, 4>(dst, step, 0, x); } } @@ -604,17 +563,20 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) { if (stage_is_rectangular) { if (transpose) { - int16x8_t output[8]; - Transpose8x4To4x8(s, output); - StoreDst<8, 8>(dst, step, 0, output); + int16x8x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = s[i]; + vst4q_s16(dst, y); } else { StoreDst<16, 4>(dst, step, 0, s); } } else { if (transpose) { - Transpose4x4(s, s); + int16x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = vget_low_s16(s[i]); + vst4_s16(dst, y); + } else { + StoreDst<8, 4>(dst, step, 0, s); } - StoreDst<8, 4>(dst, step, 0, s); } } @@ -1204,45 +1166,41 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) { //------------------------------------------------------------------------------ // Asymmetric Discrete Sine Transforms (ADST). -template <bool stage_is_rectangular> + LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool transpose) { auto* const dst = static_cast<int16_t*>(dest); - int32x4_t s[8]; - int16x8_t x[4]; + int32x4_t s[7]; + int16x4_t x[4]; - if (stage_is_rectangular) { - if (transpose) { - int16x8_t input[8]; - LoadSrc<8, 8>(dst, step, 0, input); - Transpose4x8To8x4(input, x); - } else { - LoadSrc<16, 4>(dst, step, 0, x); - } + if (transpose) { + assert(step == 4); + int16x4x4_t y = vld4_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; } else { - LoadSrc<8, 4>(dst, step, 0, x); - if (transpose) { - Transpose4x4(x, x); - } + x[0] = vld1_s16(dst); + x[1] = vld1_s16(dst + 1 * step); + x[2] = vld1_s16(dst + 2 * step); + x[3] = vld1_s16(dst + 3 * step); } // stage 1. - s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]); - s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]); + s[5] = vmull_n_s16(x[3], kAdst4Multiplier[1]); + s[6] = vmull_n_s16(x[3], kAdst4Multiplier[3]); // stage 2. - const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2])); - const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3])); + const int32x4_t a7 = vsubl_s16(x[0], x[2]); + const int32x4_t b7 = vaddw_s16(a7, x[3]); // stage 3. - s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]); - s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]); + s[0] = vmull_n_s16(x[0], kAdst4Multiplier[0]); + s[1] = vmull_n_s16(x[0], kAdst4Multiplier[1]); // s[0] = s[0] + s[3] - s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]); + s[0] = vmlal_n_s16(s[0], x[2], kAdst4Multiplier[3]); // s[1] = s[1] - s[4] - s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]); + s[1] = vmlsl_n_s16(s[1], x[2], kAdst4Multiplier[0]); - s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]); + s[3] = vmull_n_s16(x[1], kAdst4Multiplier[2]); s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]); // stage 4. 
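Annotation: the rewritten 8bpp Adst4_NEON above drops the stage_is_rectangular template and works on int16x4_t halves, following the ADST4 stages directly. For orientation, a scalar rendering of that butterfly is sketched below. Stages 1 to 3 mirror the hunk; the multiplier values and the stage-4 combination are filled in from the AV1 specification (SINPI_1_9 .. SINPI_4_9 = 1321, 2482, 3344, 3803 and a rounding shift by 12) because that part of the function is not visible in these hunks, so treat this as a hedged reading aid rather than a copy of the libgav1 implementation. The NEON version additionally saturates the narrowed results to 16 bits with vqrshrn_n_s32; names such as kSinPi and Adst4Scalar are invented for this note.

// Scalar rendering of the inverse ADST4 butterfly that Adst4_NEON above
// vectorizes. Stage 4 is reconstructed from the AV1 specification.
#include <cstdint>
#include <cstdio>

namespace {

constexpr int64_t kSinPi[4] = {1321, 2482, 3344, 3803};

int32_t RoundShift(int64_t x, int bits) {
  return static_cast<int32_t>((x + (int64_t{1} << (bits - 1))) >> bits);
}

void Adst4Scalar(const int32_t x[4], int32_t out[4]) {
  // stage 1: products of x[3].
  const int64_t s5 = kSinPi[1] * x[3];
  const int64_t s6 = kSinPi[3] * x[3];
  // stage 2: b7 = x0 - x2 + x3 feeds the out[2] path.
  const int32_t b7 = x[0] - x[2] + x[3];
  // stage 3: rotated sums of x[0], x[2] and the x[1] term.
  int64_t s0 = kSinPi[0] * x[0] + kSinPi[3] * x[2];
  int64_t s1 = kSinPi[1] * x[0] - kSinPi[0] * x[2];
  const int64_t s3 = kSinPi[2] * x[1];
  const int64_t s2 = kSinPi[2] * b7;
  // stage 4 (from the spec): fold in the x[3] terms, combine, round by 12.
  s0 += s5;
  s1 -= s6;
  out[0] = RoundShift(s0 + s3, 12);
  out[1] = RoundShift(s1 + s3, 12);
  out[2] = RoundShift(s2, 12);
  out[3] = RoundShift(s0 + s1 - s3, 12);
}

}  // namespace

int main() {
  const int32_t in[4] = {100, -50, 25, 7};
  int32_t out[4];
  Adst4Scalar(in, out);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}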
@@ -1259,24 +1217,20 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12); const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12); - x[0] = vcombine_s16(dst_0, dst_0); - x[1] = vcombine_s16(dst_1, dst_1); - x[2] = vcombine_s16(dst_2, dst_2); - x[3] = vcombine_s16(dst_3, dst_3); + x[0] = dst_0; + x[1] = dst_1; + x[2] = dst_2; + x[3] = dst_3; - if (stage_is_rectangular) { - if (transpose) { - int16x8_t output[8]; - Transpose8x4To4x8(x, output); - StoreDst<8, 8>(dst, step, 0, output); - } else { - StoreDst<16, 4>(dst, step, 0, x); - } + if (transpose) { + int16x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = x[i]; + vst4_s16(dst, y); } else { - if (transpose) { - Transpose4x4(x, x); - } - StoreDst<8, 4>(dst, step, 0, x); + vst1_s16(dst, x[0]); + vst1_s16(dst + 1 * step, x[1]); + vst1_s16(dst + 2 * step, x[2]); + vst1_s16(dst + 3 * step, x[3]); } } @@ -2705,7 +2659,7 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/, int i = adjusted_tx_height; auto* data = src; do { - Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true); + Adst4_NEON(data, /*step=*/4, /*transpose=*/true); data += 16; i -= 4; } while (i != 0); @@ -2732,7 +2686,7 @@ void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, int i = tx_width; auto* data = src; do { - Adst4_NEON<false>(data, tx_width, /*transpose=*/false); + Adst4_NEON(data, tx_width, /*transpose=*/false); data += 4; i -= 4; } while (i != 0); diff --git a/src/dsp/arm/loop_filter_10bit_neon.cc b/src/dsp/arm/loop_filter_10bit_neon.cc new file mode 100644 index 0000000..a9dd98f --- /dev/null +++ b/src/dsp/arm/loop_filter_10bit_neon.cc @@ -0,0 +1,1218 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
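Annotation: the new 10-bit loop filter below evaluates all of its decisions as vector masks over four columns at a time. As orientation for the helpers that follow (Hev, OuterThreshold, NeedsFilter4 and their combination in Filter4Masks), here is a scalar sketch of the same tests for a single column. The struct and function names are written for this note, not libgav1 API, and the thresholds are assumed to be pre-scaled to 10 bits, matching the "<<= 2" adjustment the filter entry points below apply.

// Scalar sketch of the per-column decision tests the NEON helpers below
// evaluate four columns at a time. Illustrative only.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct Filter4Decision {
  bool needs_filter;  // Filter4()/Filter2() may modify this column.
  bool hev;           // High edge variance: p1/q1 are left unmodified.
};

Filter4Decision MakeFilter4Decision(uint16_t p1, uint16_t p0, uint16_t q0,
                                    uint16_t q1, int outer_thresh,
                                    int inner_thresh, int hev_thresh) {
  const int abd_p1p0 = std::abs(p1 - p0);
  const int abd_q1q0 = std::abs(q1 - q0);
  // OuterThreshold(): abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh.
  const bool outer_ok =
      std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
  // NeedsFilter4(): both first-neighbor differences within inner_thresh,
  // and OuterThreshold() holds.
  const bool inner_ok = abd_p1p0 <= inner_thresh && abd_q1q0 <= inner_thresh;
  Filter4Decision d;
  d.needs_filter = inner_ok && outer_ok;
  // Hev(), and'd with the filter mask as in Filter4Masks().
  d.hev = d.needs_filter && (abd_p1p0 > hev_thresh || abd_q1q0 > hev_thresh);
  return d;
}

int main() {
  // One 10-bit column; thresholds shown already scaled by 4.
  const Filter4Decision d =
      MakeFilter4Decision(404, 400, 398, 394, 16 << 2, 4 << 2, 0 << 2);
  printf("needs_filter=%d hev=%d\n", d.needs_filter, d.hev);
  return 0;
}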
+ +#include "src/dsp/loop_filter.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 + +#include <arm_neon.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) +inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) { + const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh)); + return vorr_u16(vget_low_u16(a), vget_high_u16(a)); +} + +// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh +inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0, + const uint16x4_t q0, const uint16x4_t q1, + const uint16_t outer_thresh) { + const uint16x4_t abd_p0q0 = vabd_u16(p0, q0); + const uint16x4_t abd_p1q1 = vabd_u16(p1, q1); + const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1); + const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1); + const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half); + return vcle_u16(sum, vdup_n_u16(outer_thresh)); +} + +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// OuterThreshold() +inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a)); + return vand_u16(inner_mask, outer_mask); +} + +// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && +// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh && +// OuterThreshold() +inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p1p2_q1q2, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); + const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b)); + return vand_u16(inner_mask, outer_mask); +} + +// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh +// OuterThreshold() +inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p1p2_q1q2, + const uint16x8_t abd_p2p3_q2q3, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); + const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3); + const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c)); + return vand_u16(inner_mask, outer_mask); +} + +// ----------------------------------------------------------------------------- +// FilterNMasks functions. + +inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1, + const uint16_t hev_thresh, const uint16x4_t outer_mask, + const uint16_t inner_thresh, + uint16x4_t* const hev_mask, + uint16x4_t* const needs_filter4_mask) { + const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + // This includes cases where NeedsFilter4() is not true and so Filter2() will + // not be applied. 
+ const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh); + + *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask); + + // Filter2() will only be applied if both NeedsFilter4() and Hev() are true. + *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask); +} + +// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && +// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh +// |flat_thresh| == 4 for 10 bit decode. +inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p0p2_q0q2) { + constexpr int flat_thresh = 1 << 2; + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2); + const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh)); + return vand_u16(vget_low_u16(b), vget_high_u16(b)); +} + +inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, const uint16_t hev_thresh, + const uint16x4_t outer_mask, + const uint16_t inner_thresh, + uint16x4_t* const needs_filter6_mask, + uint16x4_t* const is_flat3_mask, + uint16x4_t* const hev_mask) { + const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); + *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2)); + *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), + inner_thresh, outer_mask); +} + +// IsFlat4 uses N=1, IsFlatOuter4 uses N=4. +// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh && +// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh && +// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh +// |flat_thresh| == 4 for 10 bit decode. +inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0, + const uint16x8_t abd_pn1p0_qn1q0, + const uint16x8_t abd_pn2p0_qn2q0) { + constexpr int flat_thresh = 1 << 2; + const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0); + const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0); + const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh)); + return vand_u16(vget_low_u16(c), vget_high_u16(c)); +} + +inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2, + const uint16x8_t p1q1, const uint16x8_t p0q0, + const uint16_t hev_thresh, const uint16x4_t outer_mask, + const uint16_t inner_thresh, + uint16x4_t* const needs_filter8_mask, + uint16x4_t* const is_flat4_mask, + uint16x4_t* const hev_mask) { + const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); + const uint16x4_t is_flat4 = + IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3)); + *needs_filter8_mask = + NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3), + inner_thresh, outer_mask); + // |is_flat4_mask| is used to decide where to use the result of Filter8. + // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false, + // overriding the question of whether to use Filter8. Because Filter4 doesn't + // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the + // source value. To be correct, the mask must account for this override. + *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask); +} + +// ----------------------------------------------------------------------------- +// FilterN functions. + +// Calculate Filter4() or Filter2() based on |hev_mask|. 
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, + const uint16x8_t p1q1, const uint16x4_t hev_mask, + uint16x8_t* const p1q1_result, + uint16x8_t* const p0q0_result) { + const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4); + // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val); + // q0mp0 means "q0 minus p0". + const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1)); + const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3); + + // If this is for Filter2() then include |p1mq1|. Otherwise zero it. + const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/))); + const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1); + const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1); + const int16x4_t p1mq1_saturated = + Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel); + const int16x4_t hev_option = + vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated); + + const int16x4_t a = vadd_s16(q0mp0_3, hev_option); + + // Need to figure out what's going on here because there are some unnecessary + // tricks to accommodate 8x8 as smallest 8bpp vector + + // We can not shift with rounding because the clamp comes *before* the + // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 = + // Clip3(a + 3, min_signed_val, max_signed_val) >> 3; + const int16x4_t plus_four = + Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel); + const int16x4_t plus_three = + Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel); + const int16x4_t a1 = vshr_n_s16(plus_four, 3); + const int16x4_t a2 = vshr_n_s16(plus_three, 3); + + // a3 = (a1 + 1) >> 1; + const int16x4_t a3 = vrshr_n_s16(a1, 1); + + const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3)); + const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3); + + // Need to shift the second term or we end up with a2_ma2. + const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1)); + const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1); + *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10); + *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10); +} + +void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + + const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), + vld1_u16(dst_q0), vld1_u16(dst_q1)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); + const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); + Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, + &needs_filter4_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. 
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter4_mask_8 = + vcombine_u16(needs_filter4_mask, needs_filter4_mask); + + uint16x8_t f_p1q1; + uint16x8_t f_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); + const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); + + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); +} + +void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + // Offset by 2 uint16_t values to load from first p1 position. + auto* dst = static_cast<uint8_t*>(dest) - 4; + auto* dst_p1 = reinterpret_cast<uint16_t*>(dst); + auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride); + auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2); + auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3); + + uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1)}; + Transpose4x4(src); + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); + const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); + Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, + &needs_filter4_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter4_mask_8 = + vcombine_u16(needs_filter4_mask, needs_filter4_mask); + + uint16x8_t f_p1q1; + uint16x8_t f_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); + const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + Transpose4x4(output); + + vst1_u16(dst_p1, output[0]); + vst1_u16(dst_p0, output[1]); + vst1_u16(dst_q0, output[2]); + vst1_u16(dst_q1, output[3]); +} + +inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p1 and q1 output from opposite directions. 
+ // The formula is regrouped to allow 3 doubling operations to be combined. + // + // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0 + // ^^^^^^^^ + // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2) + // ^^^^^^^^ + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^^^^^^ + uint16x8_t sum = vaddq_u16(p2q2, p1q1); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^ + sum = vaddq_u16(sum, p0q0); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^ + sum = vshlq_n_u16(sum, 1); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^ ^^^^^^ + // Should dual issue with the left shift. + const uint16x8_t q0p0 = Transpose64(p0q0); + const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); + sum = vaddq_u16(sum, outer_sum); + + *p1q1_output = vrshrq_n_u16(sum, 3); + + // Convert to p0 and q0 output: + // p0 = p1 - (2 * p2) + q0 + q1 + // q0 = q1 - (2 * q2) + p0 + p1 + // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 + // ^^^^^^^^ + const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); + // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 + // ^^^^^^^^ + sum = vsubq_u16(sum, p2q2_double); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); + + *p0q0_output = vrshrq_n_u16(sum, 3); +} + +void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + + const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1), + vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1), vld1_u16(dst_q2)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or + // Filter6. 
Therefore if it is false when |needs_filter_mask| is true, Filter6 + // output is not used. + uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // Filter6() does not apply, but Filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); +} + +void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + // Left side of the filter window. + auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // Overread by 2 values. These overreads become the high halves of src_raw[2] + // and src_raw[3] after transpose. + uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + Transpose4x8(src_raw); + // p2, p1, p0, q0, q1, q2 + const uint16x4_t src[6] = { + vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]), + vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]), + vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]), + }; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or + // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 + // output is not used. 
+ uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // Filter6() does not apply, but Filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + Transpose4x4(output); + + // dst_n starts at p2, so adjust to p1. + vst1_u16(dst_0 + 1, output[0]); + vst1_u16(dst_1 + 1, output[1]); + vst1_u16(dst_2 + 1, output[2]); + vst1_u16(dst_3 + 1, output[3]); +} + +inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, + const uint16x8_t p1q1, const uint16x8_t p0q0, + uint16x8_t* const p2q2_output, + uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p2 and q2 output from opposite directions. + // The formula is regrouped to allow 2 doubling operations to be combined. + // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0 + // ^^^^^^^^ + // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3) + // ^^^^^^^^ + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^ + const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^ + uint16x8_t sum = vshlq_n_u16(p23q23, 1); + + // Add two other terms to make dual issue with shift more likely. 
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^ + const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^^^ + sum = vaddq_u16(sum, p01q01); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^ + sum = vaddq_u16(sum, p3q3); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^ + const uint16x8_t q0p0 = Transpose64(p0q0); + sum = vaddq_u16(sum, q0p0); + + *p2q2_output = vrshrq_n_u16(sum, 3); + + // Convert to p1 and q1 output: + // p1 = p2 - p3 - p2 + p1 + q1 + // q1 = q2 - q3 - q2 + q0 + p1 + sum = vsubq_u16(sum, p23q23); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1)); + + *p1q1_output = vrshrq_n_u16(sum, 3); + + // Convert to p0 and q0 output: + // p0 = p1 - p3 - p1 + p0 + q2 + // q0 = q1 - q3 - q1 + q0 + p2 + sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1)); + const uint16x8_t q2p2 = Transpose64(p2q2); + sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2)); + + *p0q0_output = vrshrq_n_u16(sum, 3); +} + +void Horizontal8_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + const uint16x4_t src[8] = { + vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), + vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]); + const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]); + const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]); + const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]); + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. 
Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() does not apply, but Filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); +} + +inline uint16x8_t ReverseLowHalf(const uint16x8_t a) { + return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); +} + +void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. + // To get desired pairs after transpose, one half should be reversed. + uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + + // src[0] = p0q0 + // src[1] = p1q1 + // src[2] = p2q2 + // src[3] = p3q3 + LoopFilterTranspose4x8(src); + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = OuterThreshold( + vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]), + vget_high_u16(src[1]), outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = src[0]; + const uint16x8_t p1q1 = src[1]; + const uint16x8_t p2q2 = src[2]; + const uint16x8_t p3q3 = src[3]; + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. 
|is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() does not apply, but Filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3}; + // After transpose, |output| will contain rows of the form: + // p0 p1 p2 p3 q0 q1 q2 q3 + Transpose4x8(output); + + // Reverse p values to produce original order: + // p3 p2 p1 p0 q0 q1 q2 q3 + vst1q_u16(dst_0, ReverseLowHalf(output[0])); + vst1q_u16(dst_1, ReverseLowHalf(output[1])); + vst1q_u16(dst_2, ReverseLowHalf(output[2])); + vst1q_u16(dst_3, ReverseLowHalf(output[3])); +} + +inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5, + const uint16x8_t p4q4, const uint16x8_t p3q3, + const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t* const p5q5_output, + uint16x8_t* const p4q4_output, + uint16x8_t* const p3q3_output, + uint16x8_t* const p2q2_output, + uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p5 and q5 output from opposite directions. 
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^^^^^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^^^^^^^^^^^^ + uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); + sum = vaddq_u16(sum, p6q6_x7); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^ + const uint16x8_t q0p0 = Transpose64(p0q0); + sum = vaddq_u16(sum, q0p0); + + *p5q5_output = vrshrq_n_u16(sum, 4); + + // Convert to p4 and q4 output: + // p4 = p5 - (2 * p6) + p3 + q1 + // q4 = q5 - (2 * q6) + q3 + p1 + sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1)); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum); + + *p4q4_output = vrshrq_n_u16(sum, 4); + + // Convert to p3 and q3 output: + // p3 = p4 - p6 - p5 + p2 + q2 + // q3 = q4 - q6 - q5 + q2 + p2 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5)); + const uint16x8_t q2p2 = Transpose64(p2q2); + sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum); + + *p3q3_output = vrshrq_n_u16(sum, 4); + + // Convert to p2 and q2 output: + // p2 = p3 - p6 - p4 + p1 + q3 + // q2 = q3 - q6 - q4 + q1 + p3 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4)); + const uint16x8_t q3p3 = Transpose64(p3q3); + sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum); + + *p2q2_output = vrshrq_n_u16(sum, 4); + + // Convert to p1 and q1 output: + // p1 = p2 - p6 - p3 + p0 + q4 + // q1 = q2 - q6 - q3 + q0 + p4 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3)); + const uint16x8_t q4p4 = Transpose64(p4q4); + sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum); + + *p1q1_output = vrshrq_n_u16(sum, 4); + + // Convert to p0 and q0 output: + // p0 = p1 - p6 - p2 + q0 + q5 + // q0 = q1 - q6 - q2 + p0 + p5 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2)); + const uint16x8_t q5p5 = Transpose64(p5q5); + sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum); + + *p0q0_output = vrshrq_n_u16(sum, 4); +} + +void Horizontal14_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride); + auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride); + auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride); + auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + auto* const dst_q4 = 
reinterpret_cast<uint16_t*>(dst + 4 * stride); + auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride); + auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride); + + const uint16x4_t src[14] = { + vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), + vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), + vld1_u16(dst_q5), vld1_u16(dst_q6)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); + const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); + const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); + const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); + const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); + const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); + // Mask to choose between the outputs of Filter8 and Filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // Filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6))); + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() and Filter14() do not apply, but Filter4() applies to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // Filter14() does not apply, but Filter8() and Filter4() apply to one or + // more values. 
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. + const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + } + + vst1_u16(dst_p5, vget_low_u16(p5q5_output)); + vst1_u16(dst_p4, vget_low_u16(p4q4_output)); + vst1_u16(dst_p3, vget_low_u16(p3q3_output)); + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); + vst1_u16(dst_q3, vget_high_u16(p3q3_output)); + vst1_u16(dst_q4, vget_high_u16(p4q4_output)); + vst1_u16(dst_q5, vget_high_u16(p5q5_output)); +} + +inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) { + uint16x8x2_t acdb; +#if defined(__aarch64__) + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); +#else + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), + vreinterpretq_u64_u16(ab), 1)); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), + vreinterpretq_u64_u16(ab), 0)); +#endif // defined(__aarch64__) + return acdb; +} + +void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // Low halves: p7 p6 p5 p4 + // High halves: p3 p2 p1 p0 + uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + // p7 will be the low half of src_p[0]. Not used until the end. 
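For reference (sketch, not part of the diff): PermuteACDB64() above only shuffles 64-bit halves. Treating each uint16x8_t as a {low, high} pair, it returns {a, c} and {d, b}; the hypothetical model below spells that out. For example, PermuteACDB64(p7q7, p3q3) yields p7|p3 and q3|q7, the row ordering the later Transpose4x8() calls expect.

#include <array>
#include <cstdint>
#include <utility>

using Half = std::array<uint16_t, 4>;  // one 64-bit half (4 pixels)
using Vec = std::pair<Half, Half>;     // {low half, high half}

// Scalar model of PermuteACDB64(ab, cd):
//   first  = { low(ab),  low(cd)  } = { a, c }
//   second = { high(cd), high(ab) } = { d, b }
inline std::pair<Vec, Vec> PermuteACDB64Model(const Vec& ab, const Vec& cd) {
  return {Vec{ab.first, cd.first}, Vec{cd.second, ab.second}};
}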
+ Transpose4x8(src_p); + + // Low halves: q0 q1 q2 q3 + // High halves: q4 q5 q6 q7 + uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), + vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)}; + // q7 will be the high half of src_q[3]. Not used until the end. + Transpose4x8(src_q); + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = OuterThreshold( + vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]), + vget_low_u16(src_q[1]), outer_thresh); + const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4); + const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4); + const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4); + const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + const uint16x8_t p4q4 = + vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); + const uint16x8_t p5q5 = + vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1])); + const uint16x8_t p6q6 = + vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2])); + const uint16x8_t p7q7 = + vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3])); + // Mask to choose between the outputs of Filter8 and Filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // Filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6))); + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() and Filter14() do not apply, but Filter4() applies to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // Filter14() does not apply, but Filter8() and Filter4() apply to one or + // more values. 
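Side note (illustrative, not from the patch): the vextq_u16(..., 4) calls above splice the high half of one transposed register onto the low half of the next, which is how the p/q pairs are formed. A scalar model with hypothetical names:

#include <array>
#include <cstdint>

using Vec8 = std::array<uint16_t, 8>;

// Scalar model of vextq_u16(a, b, 4): lanes 4..7 of |a| followed by lanes
// 0..3 of |b|. With a = src_p[3] = { p4 x4 | p0 x4 } and
// b = src_q[0] = { q0 x4 | q4 x4 } (one lane per row), the result is
// p0q0 = { p0 x4 | q0 x4 }.
inline Vec8 ExtractUpperLower(const Vec8& a, const Vec8& b) {
  return {a[4], a[5], a[6], a[7], b[0], b[1], b[2], b[3]};
}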
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. + const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + } + // To get the correctly ordered rows from the transpose, we need: + // p7p3 p6p2 p5p1 p4p0 + // q0q4 q1q5 q2q6 q3q7 + const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output); + const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output); + const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output); + const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output); + uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0], + p5p1_q1q5.val[0], p4p0_q0q4.val[0]}; + Transpose4x8(output_p); + uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1], + p6p2_q2q6.val[1], p7p3_q3q7.val[1]}; + Transpose4x8(output_q); + + // Reverse p values to produce original order: + // p3 p2 p1 p0 q0 q1 q2 q3 + vst1q_u16(dst_0, output_p[0]); + vst1q_u16(dst_0 + 8, output_q[0]); + vst1q_u16(dst_1, output_p[1]); + vst1q_u16(dst_1 + 8, output_q[1]); + vst1q_u16(dst_2, output_p[2]); + vst1q_u16(dst_2 + 8, output_q[2]); + vst1q_u16(dst_3, output_p[3]); + vst1q_u16(dst_3 + 8, output_q[3]); +} + +} // namespace + +void LoopFilterInit10bpp_NEON() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Horizontal4_NEON; + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Horizontal6_NEON; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Horizontal8_NEON; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Horizontal14_NEON; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Vertical14_NEON; +} + +} // namespace dsp +} // namespace libgav1 + +#else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10) +namespace libgav1 { +namespace dsp { + +void 
LoopFilterInit10bpp_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc index 8c03928..a8b236d 100644 --- a/src/dsp/arm/loop_filter_neon.cc +++ b/src/dsp/arm/loop_filter_neon.cc @@ -29,7 +29,6 @@ namespace libgav1 { namespace dsp { -namespace low_bitdepth { namespace { // (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) @@ -149,10 +148,6 @@ void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter4_mask) == 0) { // None of the values will be filtered. return; @@ -209,10 +204,6 @@ void Vertical4_NEON(void* const dest, const ptrdiff_t stride, needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter4_mask) == 0) { // None of the values will be filtered. return; @@ -346,10 +337,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter6_mask) == 0) { // None of the values will be filtered. return; @@ -420,10 +407,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter6_mask) == 0) { // None of the values will be filtered. return; @@ -600,10 +583,6 @@ void Horizontal8_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -679,10 +658,6 @@ void Vertical8_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -863,10 +838,6 @@ void Horizontal14_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. 
- // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -1031,10 +1002,6 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -1158,7 +1125,9 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride, vst1q_u8(dst, output_3); } -void Init8bpp() { +} // namespace + +void LoopFilterInit_NEON() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = @@ -1178,1267 +1147,6 @@ void Init8bpp() { dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14_NEON; } -} // namespace -} // namespace low_bitdepth - -#if LIBGAV1_MAX_BITDEPTH >= 10 -namespace high_bitdepth { -namespace { - -// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) -inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) { - const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh)); - return vorr_u16(vget_low_u16(a), vget_high_u16(a)); -} - -// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh -inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0, - const uint16x4_t q0, const uint16x4_t q1, - const uint16_t outer_thresh) { - const uint16x4_t abd_p0q0 = vabd_u16(p0, q0); - const uint16x4_t abd_p1q1 = vabd_u16(p1, q1); - const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1); - const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1); - const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half); - return vcle_u16(sum, vdup_n_u16(outer_thresh)); -} - -// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && -// OuterThreshold() -inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a)); - return vand_u16(inner_mask, outer_mask); -} - -// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && -// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh && -// OuterThreshold() -inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p1p2_q1q2, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); - const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b)); - return vand_u16(inner_mask, outer_mask); -} - -// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && -// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && -// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh -// OuterThreshold() -inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p1p2_q1q2, - const uint16x8_t abd_p2p3_q2q3, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); - const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3); - const uint16x8_t 
c = vcleq_u16(b, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c)); - return vand_u16(inner_mask, outer_mask); -} - -// ----------------------------------------------------------------------------- -// FilterNMasks functions. - -inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1, - const uint16_t hev_thresh, const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const hev_mask, - uint16x4_t* const needs_filter4_mask) { - const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - // This includes cases where NeedsFilter4() is not true and so Filter2() will - // not be applied. - const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh); - - *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask); - - // Filter2() will only be applied if both NeedsFilter4() and Hev() are true. - *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask); -} - -// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && -// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh -// |flat_thresh| == 4 for 10 bit decode. -inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p0p2_q0q2) { - constexpr int flat_thresh = 1 << 2; - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2); - const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh)); - return vand_u16(vget_low_u16(b), vget_high_u16(b)); -} - -inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, const uint16_t hev_thresh, - const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const needs_filter6_mask, - uint16x4_t* const is_flat3_mask, - uint16x4_t* const hev_mask) { - const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); - *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2)); - *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), - inner_thresh, outer_mask); -} - -// IsFlat4 uses N=1, IsFlatOuter4 uses N=4. -// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh && -// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh && -// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh -// |flat_thresh| == 4 for 10 bit decode. -inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0, - const uint16x8_t abd_pn1p0_qn1q0, - const uint16x8_t abd_pn2p0_qn2q0) { - constexpr int flat_thresh = 1 << 2; - const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0); - const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0); - const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh)); - return vand_u16(vget_low_u16(c), vget_high_u16(c)); -} - -inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2, - const uint16x8_t p1q1, const uint16x8_t p0q0, - const uint16_t hev_thresh, const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const needs_filter8_mask, - uint16x4_t* const is_flat4_mask, - uint16x4_t* const hev_mask) { - const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); - const uint16x4_t is_flat4 = - IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3)); - *needs_filter8_mask = - NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3), - inner_thresh, outer_mask); - // |is_flat4_mask| is used to decide where to use the result of Filter8. 
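Aside (sketch, not from the patch): for a single column the mask helpers above reduce to ordinary comparisons. The model below mirrors Filter8Masks() for 10-bit input; names are illustrative, the thresholds are assumed to have already been scaled by 4 as the callers do, and the flatness threshold is 1 << 2.

#include <cstdlib>

// Per-column model of Hev(), OuterThreshold(), NeedsFilter8() and IsFlat4().
struct ColumnMasks {
  bool hev;
  bool needs_filter8;
  bool is_flat4;
};

inline ColumnMasks Filter8MasksModel(int p3, int p2, int p1, int p0, int q0,
                                     int q1, int q2, int q3, int hev_thresh,
                                     int outer_thresh, int inner_thresh) {
  constexpr int kFlatThresh = 1 << 2;  // 4 in 10-bit, 1 in 8-bit.
  ColumnMasks m;
  m.hev = std::abs(p1 - p0) > hev_thresh || std::abs(q1 - q0) > hev_thresh;
  const bool outer =
      std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
  const bool inner = std::abs(p1 - p0) <= inner_thresh &&
                     std::abs(q1 - q0) <= inner_thresh &&
                     std::abs(p2 - p1) <= inner_thresh &&
                     std::abs(q2 - q1) <= inner_thresh &&
                     std::abs(p3 - p2) <= inner_thresh &&
                     std::abs(q3 - q2) <= inner_thresh;
  m.needs_filter8 = inner && outer;
  // |is_flat4| chooses between Filter8 and the fallback, and is only honored
  // where |needs_filter8| is true.
  m.is_flat4 = m.needs_filter8 && std::abs(p1 - p0) <= kFlatThresh &&
               std::abs(q1 - q0) <= kFlatThresh &&
               std::abs(p2 - p0) <= kFlatThresh &&
               std::abs(q2 - q0) <= kFlatThresh &&
               std::abs(p3 - p0) <= kFlatThresh &&
               std::abs(q3 - q0) <= kFlatThresh;
  return m;
}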
- // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false, - // overriding the question of whether to use Filter8. Because Filter4 doesn't - // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the - // source value. To be correct, the mask must account for this override. - *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask); -} - -// ----------------------------------------------------------------------------- -// FilterN functions. - -// Calculate Filter4() or Filter2() based on |hev_mask|. -inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, - const uint16x8_t p1q1, const uint16x4_t hev_mask, - uint16x8_t* const p1q1_result, - uint16x8_t* const p0q0_result) { - const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4); - // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val); - // q0mp0 means "q0 minus p0". - const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1)); - const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3); - - // If this is for Filter2() then include |p1mq1|. Otherwise zero it. - const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/))); - const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1); - const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1); - const int16x4_t p1mq1_saturated = - Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel); - const int16x4_t hev_option = - vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated); - - const int16x4_t a = vadd_s16(q0mp0_3, hev_option); - - // Need to figure out what's going on here because there are some unnecessary - // tricks to accommodate 8x8 as smallest 8bpp vector - - // We can not shift with rounding because the clamp comes *before* the - // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 = - // Clip3(a + 3, min_signed_val, max_signed_val) >> 3; - const int16x4_t plus_four = - Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel); - const int16x4_t plus_three = - Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel); - const int16x4_t a1 = vshr_n_s16(plus_four, 3); - const int16x4_t a2 = vshr_n_s16(plus_three, 3); - - // a3 = (a1 + 1) >> 1; - const int16x4_t a3 = vrshr_n_s16(a1, 1); - - const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3)); - const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3); - - // Need to shift the second term or we end up with a2_ma2. - const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1)); - const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1); - *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10); - *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10); -} - -void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - - const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1)}; - - // Adjust thresholds to bitdepth. 
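Aside (not from the patch): the left-shifts that follow scale thresholds specified for 8-bit samples up to the 10-bit range (1 << (bitdepth - 8) == 4). The Filter4()/Filter2() arithmetic defined above reduces, for one 10-bit column, to the scalar sketch below; Filter4Model and Clip3 are illustrative, and the final clamp to [0, 1023] stands in for ConvertToUnsignedPixelU16().

#include <algorithm>

inline int Clip3(int v, int low, int high) {
  return std::min(std::max(v, low), high);
}

// Per-column model of Filter4()/Filter2() for 10-bit input. When |hev| is
// true only p0/q0 change (the Filter2 path); otherwise p1/q1 are also
// adjusted, matching vbslq_u16(hev_mask_8, p1q1, f4_p1q1) in the callers.
inline void Filter4Model(bool hev, int p1, int p0, int q0, int q1,
                         int* out_p1, int* out_p0, int* out_q0, int* out_q1) {
  constexpr int kMinSigned = -(1 << 9);     // -(1 << (bitdepth - 1))
  constexpr int kMaxSigned = (1 << 9) - 1;  // (1 << (bitdepth - 1)) - 1
  const int a =
      3 * (q0 - p0) + (hev ? Clip3(p1 - q1, kMinSigned, kMaxSigned) : 0);
  const int a1 = Clip3(a + 4, kMinSigned, kMaxSigned) >> 3;  // arithmetic shift
  const int a2 = Clip3(a + 3, kMinSigned, kMaxSigned) >> 3;
  const int a3 = (a1 + 1) >> 1;
  *out_p0 = Clip3(p0 + a2, 0, 1023);
  *out_q0 = Clip3(q0 - a1, 0, 1023);
  *out_p1 = hev ? p1 : Clip3(p1 + a3, 0, 1023);
  *out_q1 = hev ? q1 : Clip3(q1 - a3, 0, 1023);
}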
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); - const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); - Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, - &needs_filter4_mask); - -#if defined(__aarch64__) - // This provides a good speedup for the unit test, but may not come up often - // enough to warrant it. - if (vaddv_u16(needs_filter4_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - const uint64x1_t needs_filter4_mask64 = - vreinterpret_u64_u16(needs_filter4_mask); - if (vget_lane_u64(needs_filter4_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter4_mask_8 = - vcombine_u16(needs_filter4_mask, needs_filter4_mask); - - uint16x8_t f_p1q1; - uint16x8_t f_p0q0; - const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); - - // Already integrated the Hev mask when calculating the filtered values. - const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); - - // p1/q1 are unmodified if only Hev() is true. This works because it was and'd - // with |needs_filter4_mask| previously. - const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); - const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); - - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); -} - -void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - // Offset by 2 uint16_t values to load from first p1 position. - auto* dst = static_cast<uint8_t*>(dest) - 4; - auto* dst_p1 = reinterpret_cast<uint16_t*>(dst); - auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride); - auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2); - auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3); - - uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1)}; - Transpose4x4(src); - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); - const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); - Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, - &needs_filter4_mask); - -#if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. - if (vaddv_u16(needs_filter4_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - const uint64x1_t needs_filter4_mask64 = - vreinterpret_u64_u16(needs_filter4_mask); - if (vget_lane_u64(needs_filter4_mask64, 0) == 0) { - // None of the values will be filtered. 
- return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter4_mask_8 = - vcombine_u16(needs_filter4_mask, needs_filter4_mask); - - uint16x8_t f_p1q1; - uint16x8_t f_p0q0; - const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); - - // Already integrated the Hev mask when calculating the filtered values. - const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); - - // p1/q1 are unmodified if only Hev() is true. This works because it was and'd - // with |needs_filter4_mask| previously. - const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); - const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); - - uint16x4_t output[4] = { - vget_low_u16(p1q1_output), - vget_low_u16(p0q0_output), - vget_high_u16(p0q0_output), - vget_high_u16(p1q1_output), - }; - Transpose4x4(output); - - vst1_u16(dst_p1, output[0]); - vst1_u16(dst_p0, output[1]); - vst1_u16(dst_q0, output[2]); - vst1_u16(dst_q1, output[3]); -} - -inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p1 and q1 output from opposite directions. - // The formula is regrouped to allow 3 doubling operations to be combined. - // - // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0 - // ^^^^^^^^ - // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2) - // ^^^^^^^^ - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^^^^^^ - uint16x8_t sum = vaddq_u16(p2q2, p1q1); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^ - sum = vaddq_u16(sum, p0q0); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^ - sum = vshlq_n_u16(sum, 1); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^ ^^^^^^ - // Should dual issue with the left shift. - const uint16x8_t q0p0 = Transpose64(p0q0); - const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); - sum = vaddq_u16(sum, outer_sum); - - *p1q1_output = vrshrq_n_u16(sum, 3); - - // Convert to p0 and q0 output: - // p0 = p1 - (2 * p2) + q0 + q1 - // q0 = q1 - (2 * q2) + p0 + p1 - // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); - // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - sum = vsubq_u16(sum, p2q2_double); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); - - *p0q0_output = vrshrq_n_u16(sum, 3); -} - -void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - - const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1), - vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2)}; - - // Adjust thresholds to bitdepth. 
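For reference (sketch, not from the patch): per column, the 6-tap Filter6() above computes rounded weighted averages whose weights sum to 8; Filter8() follows the same pattern with more taps. Names below are illustrative.

// Per-column model of Filter6(); the (+ 4) >> 3 is vrshrq_n_u16(sum, 3).
inline void Filter6Model(int p2, int p1, int p0, int q0, int q1, int q2,
                         int* out_p1, int* out_p0, int* out_q0, int* out_q1) {
  *out_p1 = (3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
  *out_p0 = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
  *out_q0 = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
  *out_q1 = (p0 + 2 * q0 + 2 * q1 + 3 * q2 + 4) >> 3;
}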
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat3_mask; - const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); - const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); - const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); - Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat3_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or - // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 - // output is not used. - uint16x8_t f6_p1q1, f6_p0q0; - const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // Filter6() does not apply, but Filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); -} - -void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - // Left side of the filter window. - auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // Overread by 2 values. These overreads become the high halves of src_raw[2] - // and src_raw[3] after transpose. 
- uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - Transpose4x8(src_raw); - // p2, p1, p0, q0, q1, q2 - const uint16x4_t src[6] = { - vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]), - vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]), - vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]), - }; - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat3_mask; - const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); - const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); - const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); - Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat3_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or - // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 - // output is not used. - uint16x8_t f6_p1q1, f6_p0q0; - const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // Filter6() does not apply, but Filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - uint16x4_t output[4] = { - vget_low_u16(p1q1_output), - vget_low_u16(p0q0_output), - vget_high_u16(p0q0_output), - vget_high_u16(p1q1_output), - }; - Transpose4x4(output); - - // dst_n starts at p2, so adjust to p1. 
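Side note (sketch, not part of the diff): the early-exit tests used throughout these functions check whether any lane of an all-ones/all-zeros mask is set. vaddv_u16 is AArch64-only, so the Armv7 path reinterprets the mask as a 64-bit lane instead; both forms are captured by the hypothetical helper below.

#include <arm_neon.h>

// True if any lane of |mask| is nonzero. The masks are all-ones or all-zeros
// per lane, so the horizontal add (or the whole 64 bits) is zero only when
// every lane is zero.
inline bool AnyLaneSet(const uint16x4_t mask) {
#if defined(__aarch64__)
  return vaddv_u16(mask) != 0;
#else
  return vget_lane_u64(vreinterpret_u64_u16(mask), 0) != 0;
#endif
}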
- vst1_u16(dst_0 + 1, output[0]); - vst1_u16(dst_1 + 1, output[1]); - vst1_u16(dst_2 + 1, output[2]); - vst1_u16(dst_3 + 1, output[3]); -} - -inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, - const uint16x8_t p1q1, const uint16x8_t p0q0, - uint16x8_t* const p2q2_output, - uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p2 and q2 output from opposite directions. - // The formula is regrouped to allow 2 doubling operations to be combined. - // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0 - // ^^^^^^^^ - // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3) - // ^^^^^^^^ - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^ - const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^ - uint16x8_t sum = vshlq_n_u16(p23q23, 1); - - // Add two other terms to make dual issue with shift more likely. - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^ - const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^^^ - sum = vaddq_u16(sum, p01q01); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^ - sum = vaddq_u16(sum, p3q3); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^ - const uint16x8_t q0p0 = Transpose64(p0q0); - sum = vaddq_u16(sum, q0p0); - - *p2q2_output = vrshrq_n_u16(sum, 3); - - // Convert to p1 and q1 output: - // p1 = p2 - p3 - p2 + p1 + q1 - // q1 = q2 - q3 - q2 + q0 + p1 - sum = vsubq_u16(sum, p23q23); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1)); - - *p1q1_output = vrshrq_n_u16(sum, 3); - - // Convert to p0 and q0 output: - // p0 = p1 - p3 - p1 + p0 + q2 - // q0 = q1 - q3 - q1 + q0 + p2 - sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1)); - const uint16x8_t q2p2 = Transpose64(p2q2); - sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2)); - - *p0q0_output = vrshrq_n_u16(sum, 3); -} - -void Horizontal8_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - const uint16x4_t src[8] = { - vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)}; - - // Adjust thresholds to bitdepth. 
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]); - const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]); - const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]); - const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]); - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() does not apply, but Filter4() applies to one or more values. 
- p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); -} - -inline uint16x8_t ReverseLowHalf(const uint16x8_t a) { - return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); -} - -void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. - // To get desired pairs after transpose, one half should be reversed. - uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - - // src[0] = p0q0 - // src[1] = p1q1 - // src[2] = p2q2 - // src[3] = p3q3 - LoopFilterTranspose4x8(src); - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = OuterThreshold( - vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]), - vget_high_u16(src[1]), outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = src[0]; - const uint16x8_t p1q1 = src[1]; - const uint16x8_t p2q2 = src[2]; - const uint16x8_t p3q3 = src[3]; - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. 
|is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() does not apply, but Filter4() applies to one or more values. - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3}; - // After transpose, |output| will contain rows of the form: - // p0 p1 p2 p3 q0 q1 q2 q3 - Transpose4x8(output); - - // Reverse p values to produce original order: - // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, ReverseLowHalf(output[0])); - vst1q_u16(dst_1, ReverseLowHalf(output[1])); - vst1q_u16(dst_2, ReverseLowHalf(output[2])); - vst1q_u16(dst_3, ReverseLowHalf(output[3])); -} -inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5, - const uint16x8_t p4q4, const uint16x8_t p3q3, - const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, uint16x8_t* const p5q5_output, - uint16x8_t* const p4q4_output, - uint16x8_t* const p3q3_output, - uint16x8_t* const p2q2_output, - uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p5 and q5 output from opposite directions. 
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^^ - const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^^^^^^^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^^^^^^^^^^^^^ - uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); - sum = vaddq_u16(sum, p6q6_x7); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^ - sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^ - sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^ - const uint16x8_t q0p0 = Transpose64(p0q0); - sum = vaddq_u16(sum, q0p0); - - *p5q5_output = vrshrq_n_u16(sum, 4); - - // Convert to p4 and q4 output: - // p4 = p5 - (2 * p6) + p3 + q1 - // q4 = q5 - (2 * q6) + q3 + p1 - sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1)); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum); - - *p4q4_output = vrshrq_n_u16(sum, 4); - - // Convert to p3 and q3 output: - // p3 = p4 - p6 - p5 + p2 + q2 - // q3 = q4 - q6 - q5 + q2 + p2 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5)); - const uint16x8_t q2p2 = Transpose64(p2q2); - sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum); - - *p3q3_output = vrshrq_n_u16(sum, 4); - - // Convert to p2 and q2 output: - // p2 = p3 - p6 - p4 + p1 + q3 - // q2 = q3 - q6 - q4 + q1 + p3 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4)); - const uint16x8_t q3p3 = Transpose64(p3q3); - sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum); - - *p2q2_output = vrshrq_n_u16(sum, 4); - - // Convert to p1 and q1 output: - // p1 = p2 - p6 - p3 + p0 + q4 - // q1 = q2 - q6 - q3 + q0 + p4 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3)); - const uint16x8_t q4p4 = Transpose64(p4q4); - sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum); - - *p1q1_output = vrshrq_n_u16(sum, 4); - - // Convert to p0 and q0 output: - // p0 = p1 - p6 - p2 + q0 + q5 - // q0 = q1 - q6 - q2 + p0 + p5 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2)); - const uint16x8_t q5p5 = Transpose64(p5q5); - sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum); - - *p0q0_output = vrshrq_n_u16(sum, 4); -} - -void Horizontal14_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride); - auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride); - auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride); - auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - auto* const dst_q4 = 
reinterpret_cast<uint16_t*>(dst + 4 * stride); - auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride); - auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride); - - const uint16x4_t src[14] = { - vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), - vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), - vld1_u16(dst_q5), vld1_u16(dst_q6)}; - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); - const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); - const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); - const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); - const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); - const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); - // Mask to choose between the outputs of Filter8 and Filter14. - // As with the derivation of |is_flat4_mask|, the question of whether to use - // Filter14 is only raised where |is_flat4_mask| is true. - const uint16x4_t is_flat4_outer_mask = vand_u16( - is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), - vabdq_u16(p0q0, p6q6))); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, - p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() and Filter14() do not apply, but Filter4() applies to one or - // more values. 
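Aside (illustrative, not from the patch): per lane, the branches and vbslq_u16 chains in this function reduce to a priority decision between the unfiltered source and the Filter4/Filter8/Filter14 results. A scalar model for the p0/q0 lane, with hypothetical names; for p2 the Filter4 slot falls back to the source, and p3..p5 take either the Filter14 result or the source.

#include <cstdint>

// |is_flat4| here is the mask already ANDed with |needs_filter|, and
// |is_flat4_outer| is further ANDed with |is_flat4|, as in the code above.
inline uint16_t SelectP0Lane(uint16_t src, uint16_t f4, uint16_t f8,
                             uint16_t f14, bool needs_filter, bool is_flat4,
                             bool is_flat4_outer) {
  if (!needs_filter) return src;   // no filter touches this column
  if (!is_flat4) return f4;        // only the narrow filter applies
  if (!is_flat4_outer) return f8;  // the 8-tap neighborhood is flat
  return f14;                      // the 14-tap neighborhood is flat too
}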
- p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // Filter14() does not apply, but Filter8() and Filter4() apply to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - } - - vst1_u16(dst_p5, vget_low_u16(p5q5_output)); - vst1_u16(dst_p4, vget_low_u16(p4q4_output)); - vst1_u16(dst_p3, vget_low_u16(p3q3_output)); - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); - vst1_u16(dst_q3, vget_high_u16(p3q3_output)); - vst1_u16(dst_q4, vget_high_u16(p4q4_output)); - vst1_u16(dst_q5, vget_high_u16(p5q5_output)); -} - -inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) { - uint16x8x2_t acdb; -#if defined(__aarch64__) - // a[b] <- [c]d - acdb.val[0] = vreinterpretq_u16_u64( - vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); - // [a]b <- c[d] - acdb.val[1] = vreinterpretq_u16_u64( - vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); -#else - // a[b] <- [c]d - acdb.val[0] = vreinterpretq_u16_u64( - vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), - vreinterpretq_u64_u16(ab), 1)); - // [a]b <- c[d] - acdb.val[1] = vreinterpretq_u16_u64( - vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), - vreinterpretq_u64_u16(ab), 0)); -#endif // defined(__aarch64__) - return acdb; -} - -void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int 
hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // Low halves: p7 p6 p5 p4 - // High halves: p3 p2 p1 p0 - uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - // p7 will be the low half of src_p[0]. Not used until the end. - Transpose4x8(src_p); - - // Low halves: q0 q1 q2 q3 - // High halves: q4 q5 q6 q7 - uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), - vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)}; - // q7 will be the high half of src_q[3]. Not used until the end. - Transpose4x8(src_q); - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = OuterThreshold( - vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]), - vget_low_u16(src_q[1]), outer_thresh); - const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4); - const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4); - const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4); - const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - const uint16x8_t p4q4 = - vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); - const uint16x8_t p5q5 = - vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1])); - const uint16x8_t p6q6 = - vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2])); - const uint16x8_t p7q7 = - vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3])); - // Mask to choose between the outputs of Filter8 and Filter14. - // As with the derivation of |is_flat4_mask|, the question of whether to use - // Filter14 is only raised where |is_flat4_mask| is true. - const uint16x4_t is_flat4_outer_mask = vand_u16( - is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), - vabdq_u16(p0q0, p6q6))); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, - p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. 
Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() and Filter14() do not apply, but Filter4() applies to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // Filter14() does not apply, but Filter8() and Filter4() apply to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - } - // To get the correctly ordered rows from the transpose, we need: - // p7p3 p6p2 p5p1 p4p0 - // q0q4 q1q5 q2q6 q3q7 - const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output); - const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output); - const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output); - const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output); - uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0], - p5p1_q1q5.val[0], p4p0_q0q4.val[0]}; - Transpose4x8(output_p); - uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1], - p6p2_q2q6.val[1], p7p3_q3q7.val[1]}; - Transpose4x8(output_q); - - // Reverse p values to produce original order: - // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, output_p[0]); - vst1q_u16(dst_0 + 8, output_q[0]); - vst1q_u16(dst_1, output_p[1]); - vst1q_u16(dst_1 + 8, output_q[1]); - vst1q_u16(dst_2, output_p[2]); - vst1q_u16(dst_2 + 8, output_q[2]); - vst1q_u16(dst_3, output_p[3]); - vst1q_u16(dst_3 + 8, output_q[3]); -} - -void Init10bpp() { - Dsp* const dsp = 
dsp_internal::GetWritableDspTable(kBitdepth10); - assert(dsp != nullptr); - dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = - Horizontal4_NEON; - dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON; - dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = - Horizontal6_NEON; - dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON; - dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = - Horizontal8_NEON; - dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON; - dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = - Horizontal14_NEON; - dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = - Vertical14_NEON; -} - -} // namespace -} // namespace high_bitdepth -#endif // LIBGAV1_MAX_BITDEPTH >= 10 - -void LoopFilterInit_NEON() { - low_bitdepth::Init8bpp(); -#if LIBGAV1_MAX_BITDEPTH >= 10 - high_bitdepth::Init10bpp(); -#endif -} } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h index 540defc..531cd0d 100644 --- a/src/dsp/arm/loop_filter_neon.h +++ b/src/dsp/arm/loop_filter_neon.h @@ -26,6 +26,7 @@ namespace dsp { // Initializes Dsp::loop_filters, see the defines below for specifics. This // function is not thread-safe. void LoopFilterInit_NEON(); +void LoopFilterInit10bpp_NEON(); } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc index 2db137f..cd8552e 100644 --- a/src/dsp/arm/loop_restoration_neon.cc +++ b/src/dsp/arm/loop_restoration_neon.cc @@ -1504,7 +1504,6 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0, const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], mas[2]; uint16x8_t sq[2][4], bs[3]; - // TODO(b/194217060): Future msan load. s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -1599,7 +1598,6 @@ inline void BoxSumFilterPreProcess( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], ma3[2][2], ma5[2]; uint16x8_t sq[2][4], b3[2][3], b5[3]; - // TODO(b/194217060): Future msan load. s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -1801,7 +1799,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint8_t* const dst) { uint8x16_t s[2], mas[2]; uint16x8_t sq[4], bs[4]; - // TODO(b/194217060): Future msan load. s[0] = vld1q_u8(src0); BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], @@ -1812,7 +1809,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint16x8_t ma[2]; uint8x16_t masx[3]; uint32x4x2_t b[2]; - // TODO(b/194217060): Future msan load. s[1] = vld1q_u8(src0 + x + 16); BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas, @@ -1856,7 +1852,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width; uint8x16_t s[2], mas[2]; uint16x8_t sq[4], bs[3]; - // TODO(b/194217060): Future msan load. s[0] = vld1q_u8(src0); BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]); @@ -1915,7 +1910,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], ma3[2][2], ma5[2]; uint16x8_t sq[2][4], b3[2][3], b5[3]; - // TODO(b/194217060): Future msan load. 
s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -2023,7 +2017,6 @@ inline void BoxFilterLastRow( uint8x16_t s[2], ma3[2], ma5[2]; uint16x8_t sq[4], ma[3], b3[3], b5[3]; uint32x4x2_t b[3]; - // TODO(b/194217060): Future msan load. s[0] = vld1q_u8(src0); BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, @@ -2033,7 +2026,6 @@ inline void BoxFilterLastRow( do { uint8x16_t ma3x[3], ma5x[3]; int16x8_t p[2]; - // TODO(b/194217060): Future msan load. s[1] = vld1q_u8(src0 + x + 16); BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3, diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc index 853f949..ecc67f8 100644 --- a/src/dsp/arm/mask_blend_neon.cc +++ b/src/dsp/arm/mask_blend_neon.cc @@ -33,50 +33,40 @@ namespace dsp { namespace low_bitdepth { namespace { -// TODO(b/150461164): Consider combining with GetInterIntraMask4x2(). -// Compound predictors use int16_t values and need to multiply long because the -// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by -// int8_t and accumulate into int32_t instruction. -template <int subsampling_x, int subsampling_y> -inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { - if (subsampling_x == 1) { - const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask))); - const int16x4_t mask_val1 = vreinterpret_s16_u16( - vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y)))); - int16x8_t final_val; - if (subsampling_y == 1) { - const int16x4_t next_mask_val0 = - vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride))); - const int16x4_t next_mask_val1 = - vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3))); - final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1), - vcombine_s16(next_mask_val0, next_mask_val1)); - } else { - final_val = vreinterpretq_s16_u16( - vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1)))); - } - return vrshrq_n_s16(final_val, subsampling_y + 1); +template <int subsampling_y> +inline uint8x8_t GetMask4x2(const uint8_t* mask) { + if (subsampling_y == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]); + const uint32x2_t row_01 = vreinterpret_u32_u8(vget_low_u8(combined_horz)); + const uint32x2_t row_23 = vreinterpret_u32_u8(vget_high_u8(combined_horz)); + + const uint32x2x2_t row_02_13 = vtrn_u32(row_01, row_23); + // Use a halving add to work around the case where all |mask| values are 64. + return vrshr_n_u8(vhadd_u8(vreinterpret_u8_u32(row_02_13.val[0]), + vreinterpret_u8_u32(row_02_13.val[1])), + 1); } - assert(subsampling_y == 0 && subsampling_x == 0); - const uint8x8_t mask_val0 = Load4(mask); - const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0); - return vreinterpretq_s16_u16(vmovl_u8(mask_val)); + // subsampling_x == 1 + const uint8x8x2_t mask_val = vld2_u8(mask); + return vrhadd_u8(mask_val.val[0], mask_val.val[1]); } template <int subsampling_x, int subsampling_y> -inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) { +inline uint8x8_t GetMask8(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]); + // Use a halving add to work around the case where all |mask| values are 64. 
+ return vrshr_n_u8( + vhadd_u8(vget_low_u8(combined_horz), vget_high_u8(combined_horz)), 1); + } if (subsampling_x == 1) { - int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask))); - if (subsampling_y == 1) { - const int16x8_t next_mask_val = - vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride))); - mask_val = vaddq_s16(mask_val, next_mask_val); - } - return vrshrq_n_s16(mask_val, 1 + subsampling_y); + const uint8x8x2_t mask_val = vld2_u8(mask); + return vrhadd_u8(mask_val.val[0], mask_val.val[1]); } assert(subsampling_y == 0 && subsampling_x == 0); - const uint8x8_t mask_val = vld1_u8(mask); - return vreinterpretq_s16_u16(vmovl_u8(mask_val)); + return vld1_u8(mask); } inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, @@ -109,89 +99,162 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, StoreHi4(dst + dst_stride, result); } -template <int subsampling_x, int subsampling_y> +template <int subsampling_y> inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t* LIBGAV1_RESTRICT mask, - const ptrdiff_t mask_stride, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { + constexpr int subsampling_x = 1; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const int16x8_t mask_inverter = vdupq_n_s16(64); - int16x8_t pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + // Compound predictors use int16_t values and need to multiply long because + // the Convolve range * 64 is 20 bits. Unfortunately there is no multiply + // int16_t by int8_t and accumulate into int32_t instruction. + int16x8_t pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - // TODO(b/150461164): Arm tends to do better with load(val); val += stride - // It may be possible to turn this into a loop with a templated height. 
- pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); } -template <int subsampling_x, int subsampling_y> +template <int subsampling_y> inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int height, + const int height, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { - MaskBlending4x4_NEON<subsampling_x, subsampling_y>( - pred_0, pred_1, mask, mask_stride, dst, dst_stride); + MaskBlending4x4_NEON<subsampling_y>(pred_0, pred_1, mask, dst, dst_stride); return; } + constexpr int subsampling_x = 1; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const int16x8_t mask_inverter = vdupq_n_s16(64); int y = 0; do { int16x8_t pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + vreinterpretq_s16_u16(vmovl_u8(GetMask4x2<subsampling_y>(mask))); int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; y += 8; } while (y < 
height); } +inline uint8x8_t CombinePred8(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const int16x8_t pred_mask_0, + const int16x8_t pred_mask_1) { + // First 8 values. + const int16x8_t pred_val_0 = vld1q_s16(pred_0); + const int16x8_t pred_val_1 = vld1q_s16(pred_1); + // int res = (mask_value * prediction_0[x] + + // (64 - mask_value) * prediction_1[x]) >> 6; + const int32x4_t weighted_pred_lo = + vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0)); + const int32x4_t weighted_pred_hi = + vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0)); + const int32x4_t weighted_combo_lo = vmlal_s16( + weighted_pred_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1)); + const int32x4_t weighted_combo_hi = vmlal_s16( + weighted_pred_hi, vget_high_s16(pred_mask_1), vget_high_s16(pred_val_1)); + + // dst[x] = static_cast<Pixel>( + // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + // (1 << kBitdepth8) - 1)); + return vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6), + vshrn_n_s32(weighted_combo_hi, 6)), + 4); +} + +template <int subsampling_x, int subsampling_y> +inline void MaskBlending8xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, + const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + const int16x8_t mask_inverter = vdupq_n_s16(64); + int y = height; + do { + const int16x8_t pred_mask_0 = + ZeroExtend(GetMask8<subsampling_x, subsampling_y>(mask)); + // 64 - mask + const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); + const uint8x8_t result = + CombinePred8(pred_0, pred_1, pred_mask_0, pred_mask_1); + vst1_u8(dst, result); + dst += dst_stride; + mask += 8 << (subsampling_x + subsampling_y); + pred_0 += 8; + pred_1 += 8; + } while (--y != 0); +} + +template <int subsampling_x, int subsampling_y> +inline uint8x16_t GetMask16(const uint8_t* mask, const ptrdiff_t mask_stride) { + if (subsampling_x == 1 && subsampling_y == 1) { + const uint8x16x2_t mask_val0 = vld2q_u8(mask); + const uint8x16x2_t mask_val1 = vld2q_u8(mask + mask_stride); + const uint8x16_t combined_horz0 = + vaddq_u8(mask_val0.val[0], mask_val0.val[1]); + const uint8x16_t combined_horz1 = + vaddq_u8(mask_val1.val[0], mask_val1.val[1]); + // Use a halving add to work around the case where all |mask| values are 64. 
+ return vrshrq_n_u8(vhaddq_u8(combined_horz0, combined_horz1), 1); + } + if (subsampling_x == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + return vrhaddq_u8(mask_val.val[0], mask_val.val[1]); + } + assert(subsampling_y == 0 && subsampling_x == 0); + return vld1q_u8(mask); +} + template <int subsampling_x, int subsampling_y> inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, @@ -204,8 +267,13 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); if (width == 4) { - MaskBlending4xH_NEON<subsampling_x, subsampling_y>( - pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride); + MaskBlending4xH_NEON<subsampling_y>(pred_0, pred_1, mask_ptr, height, dst, + dst_stride); + return; + } + if (width == 8) { + MaskBlending8xH_NEON<subsampling_x, subsampling_y>(pred_0, pred_1, mask_ptr, + height, dst, dst_stride); return; } const uint8_t* mask = mask_ptr; @@ -214,35 +282,24 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, do { int x = 0; do { - const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( + const uint8x16_t pred_mask_0 = GetMask16<subsampling_x, subsampling_y>( mask + (x << subsampling_x), mask_stride); + const int16x8_t pred_mask_0_lo = ZeroExtend(vget_low_u8(pred_mask_0)); + const int16x8_t pred_mask_0_hi = ZeroExtend(vget_high_u8(pred_mask_0)); // 64 - mask - const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); - const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x); - const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x); + const int16x8_t pred_mask_1_lo = vsubq_s16(mask_inverter, pred_mask_0_lo); + const int16x8_t pred_mask_1_hi = vsubq_s16(mask_inverter, pred_mask_0_hi); + uint8x8_t result; - // int res = (mask_value * prediction_0[x] + - // (64 - mask_value) * prediction_1[x]) >> 6; - const int32x4_t weighted_pred_0_lo = - vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0)); - const int32x4_t weighted_pred_0_hi = - vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0)); - const int32x4_t weighted_combo_lo = - vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1), - vget_low_s16(pred_val_1)); - const int32x4_t weighted_combo_hi = - vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1), - vget_high_s16(pred_val_1)); - - // dst[x] = static_cast<Pixel>( - // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, - // (1 << kBitdepth8) - 1)); - result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6), - vshrn_n_s32(weighted_combo_hi, 6)), - 4); + result = + CombinePred8(pred_0 + x, pred_1 + x, pred_mask_0_lo, pred_mask_1_lo); vst1_u8(dst + x, result); - x += 8; + result = CombinePred8(pred_0 + x + 8, pred_1 + x + 8, pred_mask_0_hi, + pred_mask_1_hi); + vst1_u8(dst + x + 8, result); + + x += 16; } while (x < width); dst += dst_stride; pred_0 += width; @@ -251,63 +308,19 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, } while (++y < height); } -// TODO(b/150461164): This is much faster for inter_intra (input is Pixel -// values) but regresses compound versions (input is int16_t). Try to -// consolidate these. 
template <int subsampling_x, int subsampling_y> inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { if (subsampling_x == 1) { - const uint8x8_t mask_val = - vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y))); - if (subsampling_y == 1) { - const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride), - vld1_u8(mask + mask_stride * 3)); - - // Use a saturating add to work around the case where all |mask| values - // are 64. Together with the rounding shift this ensures the correct - // result. - const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val); - return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y); - } - - return vrshr_n_u8(mask_val, /*subsampling_x=*/1); + return GetMask4x2<subsampling_y>(mask); } - + // When using intra or difference weighted masks, the function doesn't use + // subsampling, so |mask_stride| may be 4 or 8. assert(subsampling_y == 0 && subsampling_x == 0); const uint8x8_t mask_val0 = Load4(mask); - // TODO(b/150461164): Investigate the source of |mask| and see if the stride - // can be removed. - // TODO(b/150461164): The unit tests start at 8x8. Does this get run? return Load4<1>(mask + mask_stride, mask_val0); } -template <int subsampling_x, int subsampling_y> -inline uint8x8_t GetInterIntraMask8(const uint8_t* mask, - ptrdiff_t mask_stride) { - if (subsampling_x == 1) { - const uint8x16_t mask_val = vld1q_u8(mask); - const uint8x8_t mask_paired = - vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val)); - if (subsampling_y == 1) { - const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride); - const uint8x8_t next_mask_paired = - vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val)); - - // Use a saturating add to work around the case where all |mask| values - // are 64. Together with the rounding shift this ensures the correct - // result. 
- const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired); - return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y); - } - - return vrshr_n_u8(mask_paired, /*subsampling_x=*/1); - } - - assert(subsampling_y == 0 && subsampling_x == 0); - return vld1_u8(mask); -} - inline void InterIntraWriteMaskBlendLine8bpp4x2( const uint8_t* LIBGAV1_RESTRICT const pred_0, uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1, @@ -374,6 +387,32 @@ inline void InterIntraMaskBlending8bpp4xH_NEON( } template <int subsampling_x, int subsampling_y> +inline void InterIntraMaskBlending8bpp8xH_NEON( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, const int height) { + const uint8x8_t mask_inverter = vdup_n_u8(64); + int y = height; + do { + const uint8x8_t pred_mask_1 = GetMask8<subsampling_x, subsampling_y>(mask); + // 64 - mask + const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); + const uint8x8_t pred_val_0 = vld1_u8(pred_0); + const uint8x8_t pred_val_1 = vld1_u8(pred_1); + const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0); + // weighted_pred0 + weighted_pred1 + const uint16x8_t weighted_combo = + vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1); + const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6); + vst1_u8(pred_1, result); + + pred_0 += 8; + pred_1 += pred_stride_1; + mask += mask_stride << subsampling_y; + } while (--y != 0); +} + +template <int subsampling_x, int subsampling_y> inline void InterIntraMaskBlend8bpp_NEON( const uint8_t* LIBGAV1_RESTRICT prediction_0, uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, @@ -385,30 +424,46 @@ inline void InterIntraMaskBlend8bpp_NEON( height); return; } + if (width == 8) { + InterIntraMaskBlending8bpp8xH_NEON<subsampling_x, subsampling_y>( + prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, + height); + return; + } const uint8_t* mask = mask_ptr; - const uint8x8_t mask_inverter = vdup_n_u8(64); + const uint8x16_t mask_inverter = vdupq_n_u8(64); int y = 0; do { int x = 0; do { - // TODO(b/150461164): Consider a 16 wide specialization (at least for the - // unsampled version) to take advantage of vld1q_u8(). - const uint8x8_t pred_mask_1 = - GetInterIntraMask8<subsampling_x, subsampling_y>( - mask + (x << subsampling_x), mask_stride); + const uint8x16_t pred_mask_1 = GetMask16<subsampling_x, subsampling_y>( + mask + (x << subsampling_x), mask_stride); // 64 - mask - const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); - const uint8x8_t pred_val_0 = vld1_u8(prediction_0); + const uint8x16_t pred_mask_0 = vsubq_u8(mask_inverter, pred_mask_1); + const uint8x8_t pred_val_0_lo = vld1_u8(prediction_0); + prediction_0 += 8; + const uint8x8_t pred_val_0_hi = vld1_u8(prediction_0); prediction_0 += 8; - const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x); - const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0); + // Ensure armv7 build combines the load. 
+ const uint8x16_t pred_val_1 = vld1q_u8(prediction_1 + x); + const uint8x8_t pred_val_1_lo = vget_low_u8(pred_val_1); + const uint8x8_t pred_val_1_hi = vget_high_u8(pred_val_1); + const uint16x8_t weighted_pred_0_lo = + vmull_u8(vget_low_u8(pred_mask_0), pred_val_0_lo); // weighted_pred0 + weighted_pred1 - const uint16x8_t weighted_combo = - vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1); - const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6); - vst1_u8(prediction_1 + x, result); + const uint16x8_t weighted_combo_lo = + vmlal_u8(weighted_pred_0_lo, vget_low_u8(pred_mask_1), pred_val_1_lo); + const uint8x8_t result_lo = vrshrn_n_u16(weighted_combo_lo, 6); + vst1_u8(prediction_1 + x, result_lo); + const uint16x8_t weighted_pred_0_hi = + vmull_u8(vget_high_u8(pred_mask_0), pred_val_0_hi); + // weighted_pred0 + weighted_pred1 + const uint16x8_t weighted_combo_hi = vmlal_u8( + weighted_pred_0_hi, vget_high_u8(pred_mask_1), pred_val_1_hi); + const uint8x8_t result_hi = vrshrn_n_u16(weighted_combo_hi, 6); + vst1_u8(prediction_1 + x + 8, result_hi); - x += 8; + x += 16; } while (x < width); prediction_1 += prediction_stride_1; mask += mask_stride << subsampling_y; diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc index 659ed8e..271bbaa 100644 --- a/src/dsp/arm/obmc_neon.cc +++ b/src/dsp/arm/obmc_neon.cc @@ -52,6 +52,17 @@ inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred, StoreLo4(pred, result); } +inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred, + const uint8x8_t obmc_pred_val, + const uint8x8_t pred_mask, + const uint8x8_t obmc_pred_mask) { + const uint8x8_t pred_val = vld1_u8(pred); + const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); + const uint8x8_t result = + vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + vst1_u8(pred, result); +} + inline void OverlapBlendFromLeft2xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, @@ -99,24 +110,25 @@ inline void OverlapBlendFromLeft4xH_NEON( inline void OverlapBlendFromLeft8xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) { const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6); + constexpr int obmc_prediction_stride = 8; // 64 - mask const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); int y = 0; do { - const uint8x8_t pred_val = vld1_u8(pred); - const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); - const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); - const uint8x8_t result = - vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); + WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask); + pred += prediction_stride; - vst1_u8(pred, result); + WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask, + obmc_pred_mask); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (++y != height); + + obmc_pred += obmc_prediction_stride << 1; + y += 2; + } while (y != height); } void OverlapBlendFromLeft_NEON( @@ -140,8 +152,7 @@ void OverlapBlendFromLeft_NEON( return; } if (width == 8) { - OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + 
OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred); return; } const uint8x16_t mask_inverter = vdupq_n_u8(64); @@ -262,26 +273,31 @@ inline void OverlapBlendFromTop4xH_NEON( inline void OverlapBlendFromTop8xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 8; const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8_t* mask = kObmcMask + height - 2; const int compute_height = height - (height >> 2); int y = 0; do { - const uint8x8_t pred_mask = vdup_n_u8(mask[y]); + const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]); // 64 - mask - const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); - const uint8x8_t pred_val = vld1_u8(pred); - const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); - const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); - const uint8x8_t result = - vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); - vst1_u8(pred, result); + WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0, + obmc_pred_mask0); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (++y != compute_height); + ++y; + + const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]); + // 64 - mask + const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1); + WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1, + obmc_pred_mask1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + } while (++y < compute_height); } void OverlapBlendFromTop_NEON( @@ -301,8 +317,7 @@ void OverlapBlendFromTop_NEON( } if (width == 8) { - OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred); return; } @@ -371,26 +386,23 @@ constexpr uint16_t kObmcMask[62] = { 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64}; -inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, +inline uint16x4_t BlendObmc2Or4(uint16_t* const pred, + const uint16x4_t obmc_pred_val, const uint16x4_t pred_mask, const uint16x4_t obmc_pred_mask) { - const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x4_t obmc_pred_val = - vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x4_t pred_val = vld1_u16(pred); const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val); const uint16x4_t result = vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); return result; } -inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, +inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred, + const uint16_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) { - const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x8_t obmc_pred_val = - vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x8_t pred_val = vld1q_u16(pred); + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); const uint16x8_t 
weighted_pred = vmulq_u16(pred_mask, pred_val); const uint16x8_t result = vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); @@ -398,27 +410,29 @@ inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, } inline void OverlapBlendFromLeft2xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 2; const uint16x4_t mask_inverter = vdup_n_u16(64); // Second two lanes unused. const uint16x4_t pred_mask = vld1_u16(kObmcMask); const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); int y = 0; do { + const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred); const uint16x4_t result_0 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0); + BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask); + Store2<0>(pred, result_0); - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; + const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred); const uint16x4_t result_1 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1); + BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask); + Store2<0>(pred, result_1); - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; y += 2; @@ -426,26 +440,26 @@ inline void OverlapBlendFromLeft2xH_NEON( } inline void OverlapBlendFromLeft4xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 4; const uint16x4_t mask_inverter = vdup_n_u16(64); const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2); // 64 - mask const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); int y = 0; do { - const uint16x4_t result_0 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - const uint16x4_t result_1 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); + const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result_0); + pred = AddByteStride(pred, prediction_stride); + + const uint16x4_t result_1 = BlendObmc2Or4( + pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask); + vst1_u16(pred, result_1); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; y += 2; } while (y != height); @@ -456,52 +470,47 @@ void OverlapBlendFromLeft_NEON( const int width, const int height, const void* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { - auto* pred = static_cast<uint8_t*>(prediction); - const auto* obmc_pred = 
static_cast<const uint8_t*>(obmc_prediction); + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); assert(width >= 2); assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred); return; } const uint16x8_t mask_inverter = vdupq_n_u16(64); const uint16_t* mask = kObmcMask + width - 2; int x = 0; do { - pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x); - obmc_pred = reinterpret_cast<const uint8_t*>( - static_cast<const uint16_t*>(obmc_prediction) + x); + uint16_t* pred_x = pred + x; + const uint16_t* obmc_pred_x = obmc_pred + x; const uint16x8_t pred_mask = vld1q_u16(mask + x); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); int y = 0; do { const uint16x8_t result = - BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask); + vst1q_u16(pred_x, result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred_x = AddByteStride(pred_x, prediction_stride); + obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride); } while (++y < height); x += 8; } while (x < width); } template <int lane> -inline uint16x4_t BlendObmcFromTop4( - uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, - const uint16x8_t obmc_pred_mask) { - const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x4_t obmc_pred_val = - vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); +inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred, + const uint16x4_t obmc_pred_val, + const uint16x8_t pred_mask, + const uint16x8_t obmc_pred_mask) { + const uint16x4_t pred_val = vld1_u16(pred); const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask); const uint16x4_t result = vrshr_n_u16( VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); @@ -510,12 +519,11 @@ inline uint16x4_t BlendObmcFromTop4( template <int lane> inline uint16x8_t BlendObmcFromTop8( - uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, - const uint16x8_t obmc_pred_mask) { - const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x8_t obmc_pred_val = - vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + uint16_t* LIBGAV1_RESTRICT const pred, + const uint16_t* LIBGAV1_RESTRICT const obmc_pred, + const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) { + const uint16x8_t pred_val = vld1q_u16(pred); + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask); const uint16x8_t result = vrshrq_n_u16( VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); @@ -523,41 +531,43 @@ inline uint16x8_t BlendObmcFromTop8( } inline void OverlapBlendFromTop4x2Or4_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride, const int 
height) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) { + constexpr int obmc_prediction_stride = 4; const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]); const uint16x8_t mask_inverter = vdupq_n_u16(64); const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); - uint16x4_t result = - BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred); + uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); if (height == 2) { // Mask value is 64, meaning |pred| is unchanged. return; } - result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; - result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred); + result = + BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask); + vst1_u16(pred, result); } inline void OverlapBlendFromTop4xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { if (height < 8) { - OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, - obmc_prediction_stride, height); + OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height); return; } + constexpr int obmc_prediction_stride = 4; const uint16_t* mask = kObmcMask + height - 2; const uint16x8_t mask_inverter = vdupq_n_u16(64); int y = 0; @@ -566,36 +576,44 @@ inline void OverlapBlendFromTop4xH_NEON( do { const uint16x8_t pred_mask = vld1q_u16(&mask[y]); const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); - uint16x4_t result = - BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += 
obmc_prediction_stride; - - result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + // Load obmc row 0, 1. + uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); + uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; + + // Load obmc row 2, 3. + obmc_pred_val = vld1q_u16(obmc_pred); + result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; + + // Load obmc row 4, 5. + obmc_pred_val = vld1q_u16(obmc_pred); + result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; // Increment for the right mask index. y += 6; @@ -603,147 +621,147 @@ inline void OverlapBlendFromTop4xH_NEON( } inline void OverlapBlendFromTop8xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride, const int height) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) { const uint16_t* mask = kObmcMask + height - 2; const uint16x8_t mask_inverter = vdupq_n_u16(64); uint16x8_t pred_mask = vld1q_u16(mask); uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); uint16x8_t result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 2) return; - pred += prediction_stride; + constexpr int obmc_prediction_stride = 8; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 4) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, 
obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 8) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; pred_mask = vld1q_u16(&mask[8]); obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 16) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; pred_mask = vld1q_u16(&mask[16]); obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - 
vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); } void OverlapBlendFromTop_NEON( @@ -751,20 +769,18 @@ void OverlapBlendFromTop_NEON( const int width, const int height, const void* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { - auto* pred = static_cast<uint8_t*>(prediction); - const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); assert(width >= 4); assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, - obmc_prediction_stride, height); + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height); return; } @@ -773,19 +789,16 @@ void OverlapBlendFromTop_NEON( const uint16x8_t pred_mask = vld1q_u16(mask); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); -#define OBMC_ROW_FROM_TOP(n) \ - do { \ - int x = 0; \ - do { \ - const uint16x8_t result = BlendObmcFromTop8<n>( \ - reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \ - reinterpret_cast<const uint8_t*>( \ - reinterpret_cast<const uint16_t*>(obmc_pred) + x), \ - pred_mask, obmc_pred_mask); \ - vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result); \ - \ - x += 8; \ - } while (x < width); \ +#define OBMC_ROW_FROM_TOP(n) \ + do { \ + int x = 0; 
\ + do { \ + const uint16x8_t result = BlendObmcFromTop8<n>( \ + pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \ + vst1q_u16(pred + x, result); \ + \ + x += 8; \ + } while (x < width); \ } while (false) // Compute 1 row. @@ -797,11 +810,11 @@ void OverlapBlendFromTop_NEON( // Compute 3 rows. if (height == 4) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); return; } @@ -809,20 +822,20 @@ void OverlapBlendFromTop_NEON( // Compute 6 rows. if (height == 8) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); return; } @@ -830,42 +843,42 @@ void OverlapBlendFromTop_NEON( // Compute 12 rows. 
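// A scalar sketch of the per-pixel arithmetic behind each OBMC_ROW_FROM_TOP(n)
// expansion above. This is an illustrative restatement, not the library's
// BlendObmcFromTop8 implementation; it relies only on what is visible here:
// the obmc weight is 64 - mask, so the two weights sum to 64 and the blend is
// undone with a 6-bit rounding shift. Lane n of the mask vector presumably
// carries the weight for row n.
inline uint16_t BlendObmcPixelFromTopSketch(const uint16_t pred,
                                            const uint16_t obmc_pred,
                                            const uint16_t m) {
  // Assumes m is in [0, 64]; the obmc weight is 64 - m ("64 - mask" above).
  return static_cast<uint16_t>((pred * m + obmc_pred * (64 - m) + 32) >> 6);
}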
if (height == 16) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(6); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(7); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); const uint16x8_t pred_mask = vld1q_u16(&mask[8]); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); return; } @@ -879,29 +892,29 @@ void OverlapBlendFromTop_NEON( // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, 
obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(6); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(7); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); y += 8; } while (y < compute_height); diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc index 71e0a43..da380b1 100644 --- a/src/dsp/arm/warp_neon.cc +++ b/src/dsp/arm/warp_neon.cc @@ -147,14 +147,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, do { const int src_x = (start_x + 4) << subsampling_x; const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's // boundary, the filtering can be simplified. We can divide the plane @@ -207,22 +201,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if (filter_params.ix4 - 7 >= source_width - 1 || + filter_params.ix4 + 7 <= 0) { // Regions 1 and 2. // Points to the left or right border of the first row of |src|. const uint8_t* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; + (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1; // In general, for y in [-7, 8), the row number iy4 + y is clipped: // const int row = Clip3(iy4 + y, 0, source_height - 1); // In two special cases, iy4 + y is clipped to either 0 or // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 1. // Every sample used to calculate the prediction block has the same // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint8_t row_border_pixel = first_row_border[row * source_stride]; @@ -256,15 +252,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. 
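// GetWarpFilterParams() and the WarpFilterParams struct are shared helpers
// that replace the per-block coordinate math removed above. A sketch of what
// they presumably compute, reconstructed from those removed lines (the real
// definitions live elsewhere in the dsp code; the "Sketch" names are
// illustrative, while the x4/y4/ix4/iy4 fields match their usage in this
// file and kWarpedModelPrecisionBits is the same constant used above):
struct WarpFilterParamsSketch {
  int x4;   // dst_x >> subsampling_x, still in warped-model precision.
  int y4;   // dst_y >> subsampling_y.
  int ix4;  // x4 >> kWarpedModelPrecisionBits: integer source column.
  int iy4;  // y4 >> kWarpedModelPrecisionBits: integer source row.
};

inline WarpFilterParamsSketch GetWarpFilterParamsSketch(
    const int src_x, const int src_y, const int subsampling_x,
    const int subsampling_y, const int* const warp_params) {
  WarpFilterParamsSketch params;
  const int dst_x =
      src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
  const int dst_y =
      src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
  params.x4 = dst_x >> subsampling_x;
  params.y4 = dst_y >> subsampling_y;
  params.ix4 = params.x4 >> kWarpedModelPrecisionBits;
  params.iy4 = params.y4 >> kWarpedModelPrecisionBits;
  return params;
}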
- const int row = iy4 + y; + const int row = filter_params.iy4 + y; int sum = first_row_border[row * source_stride]; sum <<= (kFilterBits - kInterRoundBitsHorizontal); intermediate_result_column[y + 7] = sum; } // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); #if defined(__aarch64__) @@ -341,10 +337,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 3. // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint8_t* const src_row = src + row * source_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -354,11 +351,12 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 bytes that extend the // frame boundary pixels. We also assume there is at least one extra // padding byte after the right border of the last source row. - const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]); + const uint8x16_t src_row_v = vld1q_u8(&src_row[filter_params.ix4 - 7]); // Convert src_row_v to int8 (subtract 128). const int8x16_t src_row_centered = vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128))); - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { HorizontalFilter(sx4, alpha, src_row_centered, intermediate_result[y + 7]); @@ -367,12 +365,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, } else { // Region 4. // Horizontal filter. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; const uint8_t* const src_row = src + row * source_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -382,7 +381,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 bytes that extend the // frame boundary pixels. We also assume there is at least one extra // padding byte after the right border of the last source row. - const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]); + const uint8x16_t src_row_v = + vld1q_u8(&src_row[filter_params.ix4 - 7]); // Convert src_row_v to int8 (subtract 128). const int8x16_t src_row_centered = vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128))); @@ -395,8 +395,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // Regions 3 and 4. // Vertical filter. 
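// The four-region split used by both bitdepth paths, restated as a sketch.
// The conditions are copied from the checks above; the boolean names are
// illustrative and do not appear in this file.
//   Region 1: columns and rows both fall outside the frame. Every filter tap
//             reads the same border pixel, so the whole block is one value.
//   Region 2: only the columns fall outside. The horizontal filter reduces to
//             the border pixel times the sum of the taps, which is why the
//             code above is just
//             "sum <<= (kFilterBits - kInterRoundBitsHorizontal)".
//   Region 3: only the rows fall outside. Every intermediate row is filtered
//             from the same clamped source row.
//   Region 4: the general case, relying on the reference frame's border
//             extension above and below the visible rows.
const bool columns_outside = filter_params.ix4 - 7 >= source_width - 1 ||
                             filter_params.ix4 + 7 <= 0;
const bool rows_outside = filter_params.iy4 - 7 >= source_height - 1 ||
                          filter_params.iy4 + 7 <= 0;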
DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); int16x8_t filter[8]; @@ -574,14 +574,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, do { const int src_x = (start_x + 4) << subsampling_x; const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's // boundary, the filtering can be simplified. We can divide the plane @@ -634,22 +628,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if (filter_params.ix4 - 7 >= source_width - 1 || + filter_params.ix4 + 7 <= 0) { // Regions 1 and 2. // Points to the left or right border of the first row of |src|. const uint16_t* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; + (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1; // In general, for y in [-7, 8), the row number iy4 + y is clipped: // const int row = Clip3(iy4 + y, 0, source_height - 1); // In two special cases, iy4 + y is clipped to either 0 or // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 1. // Every sample used to calculate the prediction block has the same // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint16_t row_border_pixel = first_row_border[row * src_stride]; DestType* dst_row = dst + start_x - block_start_x; @@ -684,15 +680,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; int sum = first_row_border[row * src_stride]; sum <<= (kFilterBits - kInterRoundBitsHorizontal); intermediate_result_column[y + 7] = sum; } // Vertical filter. 
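// The sub-pixel phase bookkeeping used by both passes, factored into sketch
// helpers (illustrative names; the loops above keep these values directly in
// sx4 and sy4). The low kWarpedModelPrecisionBits of x4/y4 hold the
// fractional position of the block centre; the horizontal pass backs that up
// by 7 rows of beta because the 15 intermediate rows run from y = -7, and the
// vertical pass presumably backs it up by 4 rows of delta because the 8
// output rows are centred on row 4 of the block.
inline int InitialHorizontalPhaseSketch(const int x4, const int beta) {
  return (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
}

inline int InitialVerticalPhaseSketch(const int y4, const int delta) {
  return (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
}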
DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); #if defined(__aarch64__) @@ -782,10 +778,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 3. // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint16_t* const src_row = src + row * src_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -795,8 +792,10 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 pixels that extend the // frame boundary pixels. We also assume there is at least one extra // padding pixel after the right border of the last source row. - const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]); - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + const uint16x8x2_t src_row_v = + LoadSrcRow(&src_row[filter_params.ix4 - 7]); + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); sx4 += beta; @@ -804,12 +803,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, } else { // Region 4. // Horizontal filter. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; const uint16_t* const src_row = src + row * src_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -819,7 +819,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // frame has left and right borders of at least 13 pixels that extend // the frame boundary pixels. We also assume there is at least one // extra padding pixel after the right border of the last source row. - const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]); + const uint16x8x2_t src_row_v = + LoadSrcRow(&src_row[filter_params.ix4 - 7]); HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); sx4 += beta; } @@ -828,8 +829,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // Regions 3 and 4. // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); int16x8_t filter[8]; |
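// In the vertical pass above, |sy| starts at sy4 - MultiplyBy4(gamma), i.e.
// four columns' worth of gamma before the block-centre phase, so that
// stepping by gamma per column (as the filter construction presumably does)
// gives each column the phase below. A sketch helper with an illustrative
// name, not part of the library:
inline int VerticalPhaseForColumnSketch(const int sy4, const int gamma,
                                        const int x) {
  // x is the column index within the 8-wide block, 0..7; column 4 is the
  // block centre.
  return sy4 + (x - 4) * gamma;
}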