// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/convolve.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_NEON

#include <arm_neon.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Include the constants and utility functions inside the anonymous namespace.
#include "src/dsp/convolve.inc"

// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
template <int filter_index, bool negative_outside_taps>
int16x8_t SumOnePassTaps(const uint8x8_t* const src,
                         const uint8x8_t* const taps) {
  uint16x8_t sum;
  if (filter_index == 0) {
    // 6 taps. + - + + - +
    sum = vmull_u8(src[0], taps[0]);
    // Unsigned overflow will result in a valid int16_t value.
    sum = vmlsl_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlsl_u8(sum, src[4], taps[4]);
    sum = vmlal_u8(sum, src[5], taps[5]);
  } else if (filter_index == 1 && negative_outside_taps) {
    // 6 taps. - + + + + -
    // Set a base we can subtract from.
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlsl_u8(sum, src[5], taps[5]);
  } else if (filter_index == 1) {
    // 6 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlal_u8(sum, src[5], taps[5]);
  } else if (filter_index == 2) {
    // 8 taps. - + - + + - + -
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlsl_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlsl_u8(sum, src[5], taps[5]);
    sum = vmlal_u8(sum, src[6], taps[6]);
    sum = vmlsl_u8(sum, src[7], taps[7]);
  } else if (filter_index == 3) {
    // 2 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
  } else if (filter_index == 4) {
    // 4 taps. - + + -
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlsl_u8(sum, src[3], taps[3]);
  } else if (filter_index == 5) {
    // 4 taps. All are positive.
sum = vmull_u8(src[0], taps[0]); sum = vmlal_u8(sum, src[1], taps[1]); sum = vmlal_u8(sum, src[2], taps[2]); sum = vmlal_u8(sum, src[3], taps[3]); } return vreinterpretq_s16_u16(sum); } template void FilterHorizontalWidth8AndUp(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int width, const int height, const uint8x8_t* const v_tap) { auto* dest8 = static_cast(dest); auto* dest16 = static_cast(dest); if (!is_2d) { int y = height; do { int x = 0; do { // Increasing loop counter x is better. const uint8x16_t src_long = vld1q_u8(src + x); uint8x8_t v_src[8]; int16x8_t sum; if (filter_index < 2) { v_src[0] = vget_low_u8(src_long); v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); sum = SumOnePassTaps(v_src, v_tap + 1); } else if (filter_index == 2) { v_src[0] = vget_low_u8(src_long); v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); sum = SumOnePassTaps(v_src, v_tap); } else if (filter_index == 3) { v_src[0] = vget_low_u8(src_long); v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); sum = SumOnePassTaps(v_src, v_tap + 3); } else if (filter_index > 3) { v_src[0] = vget_low_u8(src_long); v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); sum = SumOnePassTaps(v_src, v_tap + 2); } if (is_compound) { const uint16x8_t v_sum = vreinterpretq_u16_s16( vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); vst1q_u16(&dest16[x], v_sum); } else { // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - // kInterRoundBitsHorizontal). Each one uses a rounding shift. // Combining them requires adding the rounding offset from the skipped // shift. constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1); vst1_u8(&dest8[x], result); } x += 8; } while (x < width); src += src_stride; dest8 += pred_stride; dest16 += pred_stride; } while (--y != 0); } else { int x = 0; do { const uint8_t* s = src + x; int y = height; do { // Increasing loop counter x is better. 
const uint8x16_t src_long = vld1q_u8(s); uint8x8_t v_src[8]; int16x8_t sum; if (filter_index < 2) { v_src[0] = vget_low_u8(src_long); v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); sum = SumOnePassTaps(v_src, v_tap + 1); } else if (filter_index == 2) { v_src[0] = vget_low_u8(src_long); v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); sum = SumOnePassTaps(v_src, v_tap); } else if (filter_index == 3) { v_src[0] = vget_low_u8(src_long); v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); sum = SumOnePassTaps(v_src, v_tap + 3); } else if (filter_index > 3) { v_src[0] = vget_low_u8(src_long); v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); sum = SumOnePassTaps(v_src, v_tap + 2); } const uint16x8_t v_sum = vreinterpretq_u16_s16( vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); vst1q_u16(dest16, v_sum); s += src_stride; dest16 += 8; } while (--y != 0); x += 8; } while (x < width); } } template void FilterHorizontalWidth4(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int height, const uint8x8_t* const v_tap) { auto* dest8 = static_cast(dest); auto* dest16 = static_cast(dest); int y = height; do { uint8x8_t v_src[4]; int16x8_t sum; v_src[0] = vld1_u8(src); if (filter_index == 3) { v_src[1] = RightShiftVector<1 * 8>(v_src[0]); sum = SumOnePassTaps(v_src, v_tap + 3); } else { v_src[1] = RightShiftVector<1 * 8>(v_src[0]); v_src[2] = RightShiftVector<2 * 8>(v_src[0]); v_src[3] = RightShiftVector<3 * 8>(v_src[0]); sum = SumOnePassTaps(v_src, v_tap + 2); } if (is_2d || is_compound) { const uint16x4_t v_sum = vreinterpret_u16_s16( vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1)); vst1_u16(dest16, v_sum); } else { constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1); StoreLo4(&dest8[0], result); } src += src_stride; dest8 += pred_stride; dest16 += pred_stride; } while (--y != 0); } template void FilterHorizontalWidth2(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int height, const uint8x8_t* const v_tap) { auto* dest8 = static_cast(dest); auto* dest16 = static_cast(dest); int y = height >> 1; do { const uint8x8_t input0 = vld1_u8(src); const uint8x8_t input1 = vld1_u8(src + src_stride); const uint8x8x2_t input = vzip_u8(input0, input1); uint16x8_t sum; if (filter_index == 3) { // tap signs : + + sum = vmull_u8(input.val[0], v_tap[3]); sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]); } else if (filter_index == 4) { // tap signs : - + + - sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]); sum = vmlsl_u8(sum, input.val[0], v_tap[2]); 
sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]); sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]); } else { // tap signs : + + + + sum = vmull_u8(input.val[0], v_tap[2]); sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]); sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]); sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]); } int16x8_t s = vreinterpretq_s16_u16(sum); if (is_2d) { const uint16x8_t v_sum = vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1)); dest16[0] = vgetq_lane_u16(v_sum, 0); dest16[1] = vgetq_lane_u16(v_sum, 2); dest16 += pred_stride; dest16[0] = vgetq_lane_u16(v_sum, 1); dest16[1] = vgetq_lane_u16(v_sum, 3); dest16 += pred_stride; } else { // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - // kInterRoundBitsHorizontal). Each one uses a rounding shift. // Combining them requires adding the rounding offset from the skipped // shift. constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit)); const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1); dest8[0] = vget_lane_u8(result, 0); dest8[1] = vget_lane_u8(result, 2); dest8 += pred_stride; dest8[0] = vget_lane_u8(result, 1); dest8[1] = vget_lane_u8(result, 3); dest8 += pred_stride; } src += src_stride << 1; } while (--y != 0); // The 2d filters have an odd |height| because the horizontal pass // generates context for the vertical pass. if (is_2d) { assert(height % 2 == 1); const uint8x8_t input = vld1_u8(src); uint16x8_t sum; if (filter_index == 3) { sum = vmull_u8(input, v_tap[3]); sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]); } else if (filter_index == 4) { sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]); sum = vmlsl_u8(sum, input, v_tap[2]); sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]); sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]); } else { assert(filter_index == 5); sum = vmull_u8(input, v_tap[2]); sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]); sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]); sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]); } // |sum| contains an int16_t value. sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1)); Store2<0>(dest16, sum); } } template void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int width, const int height, const uint8x8_t* const v_tap) { assert(width < 8 || filter_index <= 3); // Don't simplify the redundant if conditions with the template parameters, // which helps the compiler generate compact code. if (width >= 8 && filter_index <= 3) { FilterHorizontalWidth8AndUp(src, src_stride, dest, pred_stride, width, height, v_tap); return; } // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. assert(width <= 4); assert(filter_index >= 3 && filter_index <= 5); if (filter_index >= 3 && filter_index <= 5) { if (width == 4) { FilterHorizontalWidth4( src, src_stride, dest, pred_stride, height, v_tap); return; } assert(width == 2); if (!is_compound) { FilterHorizontalWidth2(src, src_stride, dest, pred_stride, height, v_tap); } } } // Process 16 bit inputs and output 32 bits. 
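// The vertical half of the 2D filter multiplies 16-bit intermediate values by
// 16-bit taps, so the products are accumulated in 32-bit lanes and only then
// narrowed back to 16 bits with a saturating rounding shift. Because the taps
// come from the half-precision filter tables (pre-shifted by 1, as noted for
// SumOnePassTaps above), the shift amount is one less than the nominal
// rounding-bit count. A scalar sketch of one non-compound output sample,
// assuming |src| holds |num_taps| rows of horizontal-pass output for this
// column (Saturate16/RoundRightShift are stand-ins for the behavior of
// vqrshrn_n_s32, not functions defined in this file):
//
//   int32_t acc = 0;
//   for (int k = 0; k < num_taps; ++k) acc += src[k] * taps[k];
//   int16_t out = Saturate16(RoundRightShift(acc, kInterRoundBitsVertical - 1));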
template inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src, const int16x8_t taps) { const int16x4_t taps_lo = vget_low_s16(taps); const int16x4_t taps_hi = vget_high_s16(taps); int32x4_t sum; if (num_taps == 8) { sum = vmull_lane_s16(src[0], taps_lo, 0); sum = vmlal_lane_s16(sum, src[1], taps_lo, 1); sum = vmlal_lane_s16(sum, src[2], taps_lo, 2); sum = vmlal_lane_s16(sum, src[3], taps_lo, 3); sum = vmlal_lane_s16(sum, src[4], taps_hi, 0); sum = vmlal_lane_s16(sum, src[5], taps_hi, 1); sum = vmlal_lane_s16(sum, src[6], taps_hi, 2); sum = vmlal_lane_s16(sum, src[7], taps_hi, 3); } else if (num_taps == 6) { sum = vmull_lane_s16(src[0], taps_lo, 1); sum = vmlal_lane_s16(sum, src[1], taps_lo, 2); sum = vmlal_lane_s16(sum, src[2], taps_lo, 3); sum = vmlal_lane_s16(sum, src[3], taps_hi, 0); sum = vmlal_lane_s16(sum, src[4], taps_hi, 1); sum = vmlal_lane_s16(sum, src[5], taps_hi, 2); } else if (num_taps == 4) { sum = vmull_lane_s16(src[0], taps_lo, 2); sum = vmlal_lane_s16(sum, src[1], taps_lo, 3); sum = vmlal_lane_s16(sum, src[2], taps_hi, 0); sum = vmlal_lane_s16(sum, src[3], taps_hi, 1); } else if (num_taps == 2) { sum = vmull_lane_s16(src[0], taps_lo, 3); sum = vmlal_lane_s16(sum, src[1], taps_hi, 0); } if (is_compound) { return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1); } return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1); } template int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src, const int16x8_t taps) { const int16x4_t taps_lo = vget_low_s16(taps); const int16x4_t taps_hi = vget_high_s16(taps); int32x4_t sum_lo, sum_hi; if (num_taps == 8) { sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0); sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3); } else if (num_taps == 6) { sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1); sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2); } else if (num_taps == 4) { sum_lo = 
vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2); sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1); } else if (num_taps == 2) { sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3); sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0); } if (is_compound) { return vcombine_s16( vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1), vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1)); } return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1), vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1)); } template void Filter2DVerticalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const int16x8_t taps) { assert(width >= 8); constexpr int next_row = num_taps - 1; auto* const dst8 = static_cast(dst); auto* const dst16 = static_cast(dst); int x = 0; do { int16x8_t srcs[9]; srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; if (num_taps >= 4) { srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; if (num_taps >= 6) { srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; if (num_taps == 8) { srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; } } } uint8_t* d8 = dst8 + x; uint16_t* d16 = dst16 + x; int y = height; do { srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; const int16x8_t sum0 = SimpleSum2DVerticalTaps(srcs + 0, taps); const int16x8_t sum1 = SimpleSum2DVerticalTaps(srcs + 1, taps); if (is_compound) { vst1q_u16(d16, vreinterpretq_u16_s16(sum0)); d16 += dst_stride; vst1q_u16(d16, vreinterpretq_u16_s16(sum1)); d16 += dst_stride; } else { vst1_u8(d8, vqmovun_s16(sum0)); d8 += dst_stride; vst1_u8(d8, vqmovun_s16(sum1)); d8 += dst_stride; } srcs[0] = srcs[2]; if (num_taps >= 4) { srcs[1] = srcs[3]; srcs[2] = srcs[4]; if (num_taps >= 6) { srcs[3] = srcs[5]; srcs[4] = srcs[6]; if (num_taps == 8) { srcs[5] = srcs[7]; srcs[6] = srcs[8]; } } } y -= 2; } while (y != 0); x += 8; } while (x < width); } // Take advantage of |src_stride| == |width| to process two rows at a time. 
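// A minimal sketch of the layout this relies on: with a 4-wide block the
// intermediate rows are stored contiguously, so each 8-lane vld1q_u16 load
// below covers two consecutive rows, and the odd-numbered source vectors can
// be synthesized from halves of neighboring loads instead of extra loads.
//
//   load 0 -> srcs[0] = | row0[0..3] | row1[0..3] |
//   load 1 -> srcs[2] = | row2[0..3] | row3[0..3] |
//   srcs[1] = vcombine(high(srcs[0]), low(srcs[2])) = | row1[0..3] | row2[0..3] |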
template void Filter2DVerticalWidth4(const uint16_t* LIBGAV1_RESTRICT src, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x8_t taps) { auto* dst8 = static_cast(dst); auto* dst16 = static_cast(dst); int16x8_t srcs[9]; srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; if (num_taps >= 4) { srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2])); if (num_taps >= 6) { srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4])); if (num_taps == 8) { srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6])); } } } int y = height; do { srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]), vget_low_s16(srcs[num_taps])); const int16x8_t sum = SimpleSum2DVerticalTaps(srcs, taps); if (is_compound) { const uint16x8_t results = vreinterpretq_u16_s16(sum); vst1q_u16(dst16, results); dst16 += 4 << 1; } else { const uint8x8_t results = vqmovun_s16(sum); StoreLo4(dst8, results); dst8 += dst_stride; StoreHi4(dst8, results); dst8 += dst_stride; } srcs[0] = srcs[2]; if (num_taps >= 4) { srcs[1] = srcs[3]; srcs[2] = srcs[4]; if (num_taps >= 6) { srcs[3] = srcs[5]; srcs[4] = srcs[6]; if (num_taps == 8) { srcs[5] = srcs[7]; srcs[6] = srcs[8]; } } } y -= 2; } while (y != 0); } // Take advantage of |src_stride| == |width| to process four rows at a time. template void Filter2DVerticalWidth2(const uint16_t* LIBGAV1_RESTRICT src, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x8_t taps) { constexpr int next_row = (num_taps < 6) ? 4 : 8; auto* dst8 = static_cast(dst); int16x8_t srcs[9]; srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; if (num_taps >= 6) { srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; srcs[1] = vextq_s16(srcs[0], srcs[4], 2); if (num_taps == 8) { srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); srcs[3] = vextq_s16(srcs[0], srcs[4], 6); } } int y = 0; do { srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; if (num_taps == 2) { srcs[1] = vextq_s16(srcs[0], srcs[4], 2); } else if (num_taps == 4) { srcs[1] = vextq_s16(srcs[0], srcs[4], 2); srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); srcs[3] = vextq_s16(srcs[0], srcs[4], 6); } else if (num_taps == 6) { srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); srcs[3] = vextq_s16(srcs[0], srcs[4], 6); srcs[5] = vextq_s16(srcs[4], srcs[8], 2); } else if (num_taps == 8) { srcs[5] = vextq_s16(srcs[4], srcs[8], 2); srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8])); srcs[7] = vextq_s16(srcs[4], srcs[8], 6); } const int16x8_t sum = SimpleSum2DVerticalTaps(srcs, taps); const uint8x8_t results = vqmovun_s16(sum); Store2<0>(dst8, results); dst8 += dst_stride; Store2<1>(dst8, results); // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. // Therefore we don't need to check this condition when |height| > 4. 
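    // Four 2-wide rows fit in one 8-lane result vector, so each loop
    // iteration produces up to four output rows; when only two rows were
    // requested the function can return here after storing rows 0 and 1.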
if (num_taps <= 4 && height == 2) return; dst8 += dst_stride; Store2<2>(dst8, results); dst8 += dst_stride; Store2<3>(dst8, results); dst8 += dst_stride; srcs[0] = srcs[4]; if (num_taps == 6) { srcs[1] = srcs[5]; srcs[4] = srcs[8]; } else if (num_taps == 8) { srcs[1] = srcs[5]; srcs[2] = srcs[6]; srcs[3] = srcs[7]; srcs[4] = srcs[8]; } y += 4; } while (y < height); } template LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const int filter_id, const int filter_index) { // Duplicate the absolute value for each tap. Negative taps are corrected // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8. uint8x8_t v_tap[kSubPixelTaps]; assert(filter_id != 0); for (int k = 0; k < kSubPixelTaps; ++k) { v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]); } if (filter_index == 2) { // 8 tap. FilterHorizontal<2, true, is_2d, is_compound>( src, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 1) { // 6 tap. // Check if outside taps are positive. if ((filter_id == 1) | (filter_id == 15)) { FilterHorizontal<1, false, is_2d, is_compound>( src + 1, src_stride, dst, dst_stride, width, height, v_tap); } else { FilterHorizontal<1, true, is_2d, is_compound>( src + 1, src_stride, dst, dst_stride, width, height, v_tap); } } else if (filter_index == 0) { // 6 tap. FilterHorizontal<0, true, is_2d, is_compound>( src + 1, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 4) { // 4 tap. FilterHorizontal<4, true, is_2d, is_compound>( src + 2, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 5) { // 4 tap. FilterHorizontal<5, true, is_2d, is_compound>( src + 2, src_stride, dst, dst_stride, width, height, v_tap); } else { // 2 tap. FilterHorizontal<3, true, is_2d, is_compound>( src + 3, src_stride, dst, dst_stride, width, height, v_tap); } } template void Filter2DVertical( const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width, const int height, const int16x8_t taps, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { auto* const dest = static_cast(prediction); if (width >= 8) { Filter2DVerticalWidth8AndUp( intermediate_result, dest, pred_stride, width, height, taps); } else if (width == 4) { Filter2DVerticalWidth4(intermediate_result, dest, pred_stride, height, taps); } else { assert(width == 2); Filter2DVerticalWidth2(intermediate_result, dest, pred_stride, height, taps); } } void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int vertical_filter_index, const int horizontal_filter_id, const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(vert_filter_index); // The output of the horizontal filter is guaranteed to fit in 16 bits. uint16_t intermediate_result[kMaxSuperBlockSizeInPixels * (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; #if LIBGAV1_MSAN // Quiet msan warnings. Set with random non-zero value to aid in debugging. 
memset(intermediate_result, 0x33, sizeof(intermediate_result)); #endif const int intermediate_height = height + vertical_taps - 1; const ptrdiff_t src_stride = reference_stride; const auto* const src = static_cast(reference) - (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; DoHorizontalPass(src, src_stride, intermediate_result, width, width, intermediate_height, horizontal_filter_id, horiz_filter_index); // Vertical filter. assert(vertical_filter_id != 0); const int16x8_t taps = vmovl_s8( vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); if (vertical_taps == 8) { Filter2DVertical<8>(intermediate_result, width, height, taps, prediction, pred_stride); } else if (vertical_taps == 6) { Filter2DVertical<6>(intermediate_result, width, height, taps, prediction, pred_stride); } else if (vertical_taps == 4) { Filter2DVertical<4>(intermediate_result, width, height, taps, prediction, pred_stride); } else { // |vertical_taps| == 2 Filter2DVertical<2>(intermediate_result, width, height, taps, prediction, pred_stride); } } // There are many opportunities for overreading in scaled convolve, because the // range of starting points for filter windows is anywhere from 0 to 16 for 8 // destination pixels, and the window sizes range from 2 to 8. To accommodate // this range concisely, we use |grade_x| to mean the most steps in src that can // be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2, // we are guaranteed to exceed 8 whole steps in src for every 8 |step_x| // increments. The first load covers the initial elements of src_x, while the // final load covers the taps. template inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) { uint8x8x3_t ret; const uint8x16_t src_val = vld1q_u8(src_x); ret.val[0] = vget_low_u8(src_val); ret.val[1] = vget_high_u8(src_val); #if LIBGAV1_MSAN // Initialize to quiet msan warnings when grade_x <= 1. ret.val[2] = vdup_n_u8(0); #endif if (grade_x > 1) { ret.val[2] = vld1_u8(src_x + 16); } return ret; } // Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3] inline uint8x16_t GetPositive2TapFilter(const int tap_index) { assert(tap_index < 2); alignas( 16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = { {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4}, {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}}; return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]); } template inline void ConvolveKernelHorizontal2Tap( const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) { // Account for the 0-taps that precede the 2 nonzero taps. const int kernel_offset = 3; const int ref_x = subpixel_x >> kScaleSubPixelBits; const int step_x8 = step_x << 3; const uint8x16_t filter_taps0 = GetPositive2TapFilter(0); const uint8x16_t filter_taps1 = GetPositive2TapFilter(1); const uint16x8_t index_steps = vmulq_n_u16( vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast(step_x)); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); int p = subpixel_x; if (width <= 4) { const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; // Only add steps to the 10-bit truncated p to avoid overflow. 
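    // Worked bound for the comment above, assuming the scale limits asserted
    // in ConvolveScale2D_NEON (step_x <= 2048): |p & 1023| keeps only the
    // 10-bit sub-pixel fraction, and the largest per-lane step added below is
    // 7 * step_x <= 14336, so every lane sum stays below 15360 and cannot
    // overflow the uint16_t lanes of |subpel_index_offsets|.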
const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); const uint8x8_t filter_indices = vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask); // This is a special case. The 2-tap filter has no negative taps, so we // can use unsigned values. // For each x, a lane of tapsK has // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends // on x. const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices)}; int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x16_t src_vals = vld1q_u8(src_x); const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); // For each x, a lane of srcK contains src_x[k]. const uint8x8_t src[2] = { VQTbl1U8(src_vals, src_indices), VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))}; vst1q_s16(intermediate, vrshrq_n_s16(SumOnePassTaps(src, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; } while (--y != 0); return; } // |width| >= 8 int x = 0; do { const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); const uint8x8_t filter_indices = vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask); // This is a special case. The 2-tap filter has no negative taps, so we // can use unsigned values. // For each x, a lane of tapsK has // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends // on x. const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices)}; int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals(src_x); const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); // For each x, a lane of srcK contains src_x[k]. const uint8x8_t src[2] = { vtbl3_u8(src_vals, src_indices), vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))}; vst1q_s16(intermediate, vrshrq_n_s16(SumOnePassTaps(src, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; } while (--y != 0); x += 8; p += step_x8; } while (x < width); } // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5]. inline uint8x16_t GetPositive4TapFilter(const int tap_index) { assert(tap_index < 4); alignas( 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = { {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1}, {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}}; return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]); } // This filter is only possible when width <= 4. 
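// The scaled kernels keep their filters pre-transposed: GetPositive4TapFilter(k)
// holds tap k of all 16 sub-pixel phases, so one VQTbl1U8 lookup per tap
// gathers, for every lane, the coefficient matching that lane's phase.
// Roughly, for output lane i (an illustrative relationship, not extra code):
//
//   taps[k][i] == kAbsHalfSubPixelFilters[5][filter_id(i)][k]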
void ConvolveKernelHorizontalPositive4Tap( const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const uint8x16_t filter_taps0 = GetPositive4TapFilter(0); const uint8x16_t filter_taps1 = GetPositive4TapFilter(1); const uint8x16_t filter_taps2 = GetPositive4TapFilter(2); const uint8x16_t filter_taps3 = GetPositive4TapFilter(3); const uint16x8_t index_steps = vmulq_n_u16( vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast(step_x)); const int p = subpixel_x; // First filter is special, just a 128 tap on the center. const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); const uint8x8_t filter_indices = vand_u8( vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask); // Note that filter_id depends on x. // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k]. const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices), VQTbl1U8(filter_taps2, filter_indices), VQTbl1U8(filter_taps3, filter_indices)}; const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); int y = intermediate_height; do { // Load a pool of samples to select from using stepped index vectors. const uint8x16_t src_vals = vld1q_u8(src_x); // For each x, srcK contains src_x[k] where k=1. // Whereas taps come from different arrays, src pixels are drawn from the // same contiguous line. const uint8x8_t src[4] = { VQTbl1U8(src_vals, src_indices), VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))), VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))), VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))}; vst1q_s16(intermediate, vrshrq_n_s16(SumOnePassTaps(src, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; } while (--y != 0); } // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4]. inline uint8x16_t GetSigned4TapFilter(const int tap_index) { assert(tap_index < 4); alignas(16) static constexpr uint8_t kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = { {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1}, {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}}; return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]); } // This filter is only possible when width <= 4. 
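// As with the positive 4-tap kernel above, |width| <= 4 means only one group
// of four output positions per row, so the per-lane filter phases and source
// offsets do not change from row to row and are computed once before the y
// loop; vcreate_u16(0x0003000200010000) simply builds the lane offsets
// {0, 1, 2, 3} that are then scaled by |step_x|.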
inline void ConvolveKernelHorizontalSigned4Tap( const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const uint8x16_t filter_taps0 = GetSigned4TapFilter(0); const uint8x16_t filter_taps1 = GetSigned4TapFilter(1); const uint8x16_t filter_taps2 = GetSigned4TapFilter(2); const uint8x16_t filter_taps3 = GetSigned4TapFilter(3); const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000), static_cast(step_x)); const int p = subpixel_x; const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x4_t p_fraction = vdup_n_u16(p & 1023); const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction); const uint8x8_t filter_index_offsets = vshrn_n_u16( vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift); const uint8x8_t filter_indices = vand_u8(filter_index_offsets, filter_index_mask); // Note that filter_id depends on x. // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k]. const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices), VQTbl1U8(filter_taps2, filter_indices), VQTbl1U8(filter_taps3, filter_indices)}; const uint8x8_t src_indices_base = vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift); const uint8x8_t src_indices[4] = {src_indices_base, vadd_u8(src_indices_base, vdup_n_u8(1)), vadd_u8(src_indices_base, vdup_n_u8(2)), vadd_u8(src_indices_base, vdup_n_u8(3))}; int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x16_t src_vals = vld1q_u8(src_x); // For each x, srcK contains src_x[k] where k=1. // Whereas taps come from different arrays, src pixels are drawn from the // same contiguous line. const uint8x8_t src[4] = { VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]), VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])}; vst1q_s16(intermediate, vrshrq_n_s16(SumOnePassTaps(src, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; } while (--y != 0); } // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0]. inline uint8x16_t GetSigned6TapFilter(const int tap_index) { assert(tap_index < 6); alignas(16) static constexpr uint8_t kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = { {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}, {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1}, {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3}, {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]); } // This filter is only possible when width >= 8. 
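// For the width >= 8 kernels the eight output pixels can span more than 16
// source bytes when |step_x| is large, so the |grade_x| template parameter
// selects how much source LoadSrcVals gathers: grade_x == 1 uses a single
// 16-byte load, while grade_x == 2 appends a third 8-byte vector so that
// vtbl3_u8 can index across the full 24 bytes.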
template inline void ConvolveKernelHorizontalSigned6Tap( const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* LIBGAV1_RESTRICT const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const int ref_x = subpixel_x >> kScaleSubPixelBits; const int step_x8 = step_x << 3; uint8x16_t filter_taps[6]; for (int i = 0; i < 6; ++i) { filter_taps[i] = GetSigned6TapFilter(i); } const uint16x8_t index_steps = vmulq_n_u16( vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast(step_x)); int16_t* intermediate_x = intermediate; int x = 0; int p = subpixel_x; do { // Avoid overloading outside the reference boundaries. This means // |trailing_width| can be up to 24. const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); uint8x8_t src_lookup[6]; src_lookup[0] = src_indices; for (int i = 1; i < 6; ++i) { src_lookup[i] = vadd_u8(src_lookup[i - 1], one); } const uint8x8_t filter_indices = vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask); // For each x, a lane of taps[k] has // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends // on x. uint8x8_t taps[6]; for (int i = 0; i < 6; ++i) { taps[i] = VQTbl1U8(filter_taps[i], filter_indices); } int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals(src_x); const uint8x8_t src[6] = { vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]), vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]), vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])}; vst1q_s16(intermediate_x, vrshrq_n_s16(SumOnePassTaps(src, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; } while (--y != 0); x += 8; p += step_x8; } while (x < width); } // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter // has mixed positive and negative outer taps which are handled in // GetMixed6TapFilter(). inline uint8x16_t GetPositive6TapFilter(const int tap_index) { assert(tap_index < 6); alignas(16) static constexpr uint8_t kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = { {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1}, {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}}; return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]); } inline int8x16_t GetMixed6TapFilter(const int tap_index) { assert(tap_index < 2); alignas( 16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = { {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}}; return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]); } // This filter is only possible when width >= 8. 
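// Filter 1's outer taps can be -1, 0 or +1 depending on the phase while its
// four inner taps are non-negative, so this kernel splits the work: the inner
// taps use the unsigned widening multiply-accumulate (vmlal_u8) and the two
// outer taps are applied with a signed 16-bit multiply-accumulate on
// zero-extended pixels, then folded into the same accumulator via
// reinterpretation. See GetPositive6TapFilter() and GetMixed6TapFilter().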
template inline void ConvolveKernelHorizontalMixed6Tap( const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* LIBGAV1_RESTRICT const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const int ref_x = subpixel_x >> kScaleSubPixelBits; const int step_x8 = step_x << 3; uint8x8_t taps[4]; int16x8_t mixed_taps[2]; uint8x16_t positive_filter_taps[4]; for (int i = 0; i < 4; ++i) { positive_filter_taps[i] = GetPositive6TapFilter(i); } int8x16_t mixed_filter_taps[2]; mixed_filter_taps[0] = GetMixed6TapFilter(0); mixed_filter_taps[1] = GetMixed6TapFilter(1); const uint16x8_t index_steps = vmulq_n_u16( vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast(step_x)); int16_t* intermediate_x = intermediate; int x = 0; int p = subpixel_x; do { const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset]; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); uint8x8_t src_lookup[6]; src_lookup[0] = src_indices; for (int i = 1; i < 6; ++i) { src_lookup[i] = vadd_u8(src_lookup[i - 1], one); } const uint8x8_t filter_indices = vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask); // For each x, a lane of taps[k] has // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends // on x. for (int i = 0; i < 4; ++i) { taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices); } mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices)); mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices)); int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals(src_x); int16x8_t sum_mixed = vmulq_s16( mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0]))); sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1], ZeroExtend(vtbl3_u8(src_vals, src_lookup[5]))); uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed); sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1])); sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2])); sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3])); sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4])); vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; } while (--y != 0); x += 8; p += step_x8; } while (x < width); } // Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2]. 
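// As elsewhere, the lookup columns store absolute half-tap values; the fixed
// sign pattern of the 8-tap filter (- + - + + - + -) is applied inside
// SumOnePassTaps() with vmlal_u8 / vmlsl_u8, so no sign information needs to
// be carried in the table.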
inline uint8x16_t GetSigned8TapFilter(const int tap_index) { assert(tap_index < 8); alignas(16) static constexpr uint8_t kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = { {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0}, {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1}, {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1}, {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4}, {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63}, {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3}, {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1}, {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}}; return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]); } // This filter is only possible when width >= 8. template inline void ConvolveKernelHorizontalSigned8Tap( const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* LIBGAV1_RESTRICT const intermediate) { const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const int ref_x = subpixel_x >> kScaleSubPixelBits; const int step_x8 = step_x << 3; uint8x8_t taps[8]; uint8x16_t filter_taps[8]; for (int i = 0; i < 8; ++i) { filter_taps[i] = GetSigned8TapFilter(i); } const uint16x8_t index_steps = vmulq_n_u16( vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast(step_x)); int16_t* intermediate_x = intermediate; int x = 0; int p = subpixel_x; do { const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x]; // Only add steps to the 10-bit truncated p to avoid overflow. const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); uint8x8_t src_lookup[8]; src_lookup[0] = src_indices; for (int i = 1; i < 8; ++i) { src_lookup[i] = vadd_u8(src_lookup[i - 1], one); } const uint8x8_t filter_indices = vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask); // For each x, a lane of taps[k] has // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends // on x. for (int i = 0; i < 8; ++i) { taps[i] = VQTbl1U8(filter_taps[i], filter_indices); } int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals(src_x); const uint8x8_t src[8] = { vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]), vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]), vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]), vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])}; vst1q_s16(intermediate_x, vrshrq_n_s16(SumOnePassTaps(src, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; } while (--y != 0); x += 8; p += step_x8; } while (x < width); } // This function handles blocks of width 2 or 4. template void ConvolveVerticalScale4xH(const int16_t* LIBGAV1_RESTRICT const src, const int subpixel_y, const int filter_index, const int step_y, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; const int16_t* src_y = src; // |dest| is 16-bit in compound mode, Pixel otherwise. 
auto* dest16_y = static_cast(dest); auto* dest_y = static_cast(dest); int16x4_t s[num_taps + grade_y]; int p = subpixel_y & 1023; int prev_p = p; int y = height; do { for (int i = 0; i < num_taps; ++i) { s[i] = vld1_s16(src_y + i * src_stride); } int filter_id = (p >> 6) & kSubPixelMask; int16x8_t filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id])); int16x4_t sums = Sum2DVerticalTaps4(s, filter); if (is_compound) { assert(width != 2); const uint16x4_t result = vreinterpret_u16_s16(sums); vst1_u16(dest16_y, result); } else { const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums)); if (width == 2) { Store2<0>(dest_y, result); } else { StoreLo4(dest_y, result); } } p += step_y; const int p_diff = (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits); prev_p = p; // Here we load extra source in case it is needed. If |p_diff| == 0, these // values will be unused, but it's faster to load than to branch. s[num_taps] = vld1_s16(src_y + num_taps * src_stride); if (grade_y > 1) { s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride); } dest16_y += dest_stride; dest_y += dest_stride; filter_id = (p >> 6) & kSubPixelMask; filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id])); sums = Sum2DVerticalTaps4(&s[p_diff], filter); if (is_compound) { assert(width != 2); const uint16x4_t result = vreinterpret_u16_s16(sums); vst1_u16(dest16_y, result); } else { const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums)); if (width == 2) { Store2<0>(dest_y, result); } else { StoreLo4(dest_y, result); } } p += step_y; src_y = src + (p >> kScaleSubPixelBits) * src_stride; prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; y -= 2; } while (y != 0); } template inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source, const int intermediate_height, const int width, const int subpixel_y, const int filter_index, const int step_y, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; // A possible improvement is to use arithmetic to decide how many times to // apply filters to same source before checking whether to load new srcs. // However, this will only improve performance with very small step sizes. int16x8_t s[num_taps + grade_y]; // |dest| is 16-bit in compound mode, Pixel otherwise. uint16_t* dest16_y; uint8_t* dest_y; const int16_t* src = source; int x = 0; do { const int16_t* src_y = src; dest16_y = static_cast(dest) + x; dest_y = static_cast(dest) + x; int p = subpixel_y & 1023; int prev_p = p; int y = height; do { for (int i = 0; i < num_taps; ++i) { s[i] = vld1q_s16(src_y + i * src_stride); } int filter_id = (p >> 6) & kSubPixelMask; int16x8_t filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id])); int16x8_t sum = SimpleSum2DVerticalTaps(s, filter); if (is_compound) { vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum)); } else { vst1_u8(dest_y, vqmovun_s16(sum)); } p += step_y; const int p_diff = (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits); // |grade_y| > 1 always means p_diff > 0, so load vectors that may be // needed. Otherwise, we only need to load one vector because |p_diff| // can't exceed 1. 
s[num_taps] = vld1q_s16(src_y + num_taps * src_stride); if (grade_y > 1) { s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride); } dest16_y += dest_stride; dest_y += dest_stride; filter_id = (p >> 6) & kSubPixelMask; filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id])); sum = SimpleSum2DVerticalTaps(&s[p_diff], filter); if (is_compound) { vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum)); } else { vst1_u8(dest_y, vqmovun_s16(sum)); } p += step_y; src_y = src + (p >> kScaleSubPixelBits) * src_stride; prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; y -= 2; } while (y != 0); src += kIntermediateStride * intermediate_height; x += 8; } while (x < width); } template void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int vertical_filter_index, const int subpixel_x, const int subpixel_y, const int step_x, const int step_y, const int width, const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); assert(step_x <= 2048); assert(step_y <= 2048); const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); const int intermediate_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> kScaleSubPixelBits) + num_vert_taps; // The output of the horizontal filter, i.e. the intermediate_result, is // guaranteed to fit in int16_t. int16_t intermediate_result[kIntermediateAllocWidth * (2 * kIntermediateAllocWidth + 8)]; #if LIBGAV1_MSAN // Quiet msan warnings. Set with random non-zero value to aid in debugging. memset(intermediate_result, 0x44, sizeof(intermediate_result)); #endif // Horizontal filter. // Filter types used for width <= 4 are different from those for width > 4. // When width > 4, the valid filter index range is always [0, 3]. // When width <= 4, the valid filter index range is always [3, 5]. // Similarly for height. int filter_index = GetFilterIndex(horizontal_filter_index, width); int16_t* intermediate = intermediate_result; const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast(reference); const int vert_kernel_offset = (8 - num_vert_taps) / 2; src += vert_kernel_offset * src_stride; // Derive the maximum value of |step_x| at which all source values fit in one // 16-byte load. Final index is src_x + |num_taps| - 1 < 16 // step_x*7 is the final base subpel index for the shuffle mask for filter // inputs in each iteration on large blocks. When step_x is large, we need a // larger structure and use a larger table lookup in order to gather all // filter inputs. // |num_taps| - 1 is the shuffle index of the final filter input. 
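  // Worked example of the bound described above, assuming the constants used
  // elsewhere in this file (kScaleSubPixelBits == 10): for the 6-tap filters
  // num_taps == 6, so kernel_start_ceiling == 10 and grade_x_threshold ==
  // (10 << 10) / 7 == 1462; any |step_x| above that requires the grade_x == 2
  // kernels, which gather 24 source bytes per iteration instead of 16.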
const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); const int kernel_start_ceiling = 16 - num_horiz_taps; // This truncated quotient |grade_x_threshold| selects |step_x| such that: // (step_x * 7) >> kScaleSubPixelBits < single load limit const int grade_x_threshold = (kernel_start_ceiling << kScaleSubPixelBits) / 7; switch (filter_index) { case 0: if (step_x > grade_x_threshold) { ConvolveKernelHorizontalSigned6Tap<2>( src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); } else { ConvolveKernelHorizontalSigned6Tap<1>( src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); } break; case 1: if (step_x > grade_x_threshold) { ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); } else { ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); } break; case 2: if (step_x > grade_x_threshold) { ConvolveKernelHorizontalSigned8Tap<2>( src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); } else { ConvolveKernelHorizontalSigned8Tap<1>( src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); } break; case 3: if (step_x > grade_x_threshold) { ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); } else { ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x, step_x, intermediate_height, intermediate); } break; case 4: assert(width <= 4); ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x, intermediate_height, intermediate); break; default: assert(filter_index == 5); ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x, intermediate_height, intermediate); } // Vertical filter. 
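  // As with the horizontal stage, the filter index is re-derived, this time
  // from |height|, so short blocks map onto the 2- and 4-tap variants. The
  // step_y <= 1024 test selects grade_y == 1, since consecutive output rows
  // then advance by at most one intermediate row; larger steps (up to the
  // asserted 2048) use the grade_y == 2 kernels.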
filter_index = GetFilterIndex(vertical_filter_index, height); intermediate = intermediate_result; switch (filter_index) { case 0: case 1: if (step_y <= 1024) { if (!is_compound && width == 2) { ConvolveVerticalScale4xH<6, 1, 2, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else if (width == 4) { ConvolveVerticalScale4xH<6, 1, 4, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else { ConvolveVerticalScale<6, 1, is_compound>( intermediate, intermediate_height, width, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } } else { if (!is_compound && width == 2) { ConvolveVerticalScale4xH<6, 2, 2, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else if (width == 4) { ConvolveVerticalScale4xH<6, 2, 4, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else { ConvolveVerticalScale<6, 2, is_compound>( intermediate, intermediate_height, width, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } } break; case 2: if (step_y <= 1024) { if (!is_compound && width == 2) { ConvolveVerticalScale4xH<8, 1, 2, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else if (width == 4) { ConvolveVerticalScale4xH<8, 1, 4, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else { ConvolveVerticalScale<8, 1, is_compound>( intermediate, intermediate_height, width, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } } else { if (!is_compound && width == 2) { ConvolveVerticalScale4xH<8, 2, 2, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else if (width == 4) { ConvolveVerticalScale4xH<8, 2, 4, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else { ConvolveVerticalScale<8, 2, is_compound>( intermediate, intermediate_height, width, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } } break; case 3: if (step_y <= 1024) { if (!is_compound && width == 2) { ConvolveVerticalScale4xH<2, 1, 2, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else if (width == 4) { ConvolveVerticalScale4xH<2, 1, 4, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else { ConvolveVerticalScale<2, 1, is_compound>( intermediate, intermediate_height, width, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } } else { if (!is_compound && width == 2) { ConvolveVerticalScale4xH<2, 2, 2, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else if (width == 4) { ConvolveVerticalScale4xH<2, 2, 4, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else { ConvolveVerticalScale<2, 2, is_compound>( intermediate, intermediate_height, width, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } } break; case 4: default: assert(filter_index == 4 || filter_index == 5); assert(height <= 4); if (step_y <= 1024) { if (!is_compound && width == 2) { ConvolveVerticalScale4xH<4, 1, 2, is_compound>( intermediate, subpixel_y, filter_index, step_y, height, prediction, pred_stride); } else if (width == 4) { ConvolveVerticalScale4xH<4, 1, 4, is_compound>( intermediate, subpixel_y, filter_index, 
step_y, height, prediction, pred_stride);
      } else {
        ConvolveVerticalScale<4, 1, is_compound>(
            intermediate, intermediate_height, width, subpixel_y, filter_index,
            step_y, height, prediction, pred_stride);
      }
    } else {
      if (!is_compound && width == 2) {
        ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
            intermediate, subpixel_y, filter_index, step_y, height, prediction,
            pred_stride);
      } else if (width == 4) {
        ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
            intermediate, subpixel_y, filter_index, step_y, height, prediction,
            pred_stride);
      } else {
        ConvolveVerticalScale<4, 2, is_compound>(
            intermediate, intermediate_height, width, subpixel_y, filter_index,
            step_y, height, prediction, pred_stride);
      }
    }
  }
}

void ConvolveHorizontal_NEON(
    const void* LIBGAV1_RESTRICT const reference,
    const ptrdiff_t reference_stride, const int horizontal_filter_index,
    const int /*vertical_filter_index*/, const int horizontal_filter_id,
    const int /*vertical_filter_id*/, const int width, const int height,
    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
  // Set |src| to the outermost tap.
  const auto* const src =
      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
  auto* const dest = static_cast<uint8_t*>(prediction);

  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
                   horizontal_filter_id, filter_index);
}

// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
// Vertical calculations.
uint16x8_t Compound1DShift(const int16x8_t sum) {
  return vreinterpretq_u16_s16(
      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
}

template <int filter_index, bool is_compound = false,
          bool negative_outside_taps = false>
void FilterVertical(const uint8_t* LIBGAV1_RESTRICT const src,
                    const ptrdiff_t src_stride,
                    void* LIBGAV1_RESTRICT const dst,
                    const ptrdiff_t dst_stride, const int width,
                    const int height, const uint8x8_t* const taps) {
  const int num_taps = GetNumTapsInFilter(filter_index);
  const int next_row = num_taps - 1;
  auto* const dst8 = static_cast<uint8_t*>(dst);
  auto* const dst16 = static_cast<uint16_t*>(dst);
  assert(width >= 8);

  int x = 0;
  do {
    const uint8_t* src_x = src + x;
    uint8x8_t srcs[8];
    srcs[0] = vld1_u8(src_x);
    src_x += src_stride;
    if (num_taps >= 4) {
      srcs[1] = vld1_u8(src_x);
      src_x += src_stride;
      srcs[2] = vld1_u8(src_x);
      src_x += src_stride;
      if (num_taps >= 6) {
        srcs[3] = vld1_u8(src_x);
        src_x += src_stride;
        srcs[4] = vld1_u8(src_x);
        src_x += src_stride;
        if (num_taps == 8) {
          srcs[5] = vld1_u8(src_x);
          src_x += src_stride;
          srcs[6] = vld1_u8(src_x);
          src_x += src_stride;
        }
      }
    }

    // Decreasing the y loop counter produces worse code with clang.
    // Don't unroll this loop since it generates too much code and the decoder
    // is even slower.
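    // Roughly: each iteration of the loop below loads one new source row into
    // srcs[next_row], filters the current window of |num_taps| rows, stores
    // eight pixels (or eight uint16_t values when |is_compound| is true), and
    // then slides the window down one row by shifting srcs[] at the bottom of
    // the loop.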
int y = 0; do { srcs[next_row] = vld1_u8(src_x); src_x += src_stride; const int16x8_t sums = SumOnePassTaps(srcs, taps); if (is_compound) { const uint16x8_t results = Compound1DShift(sums); vst1q_u16(dst16 + x + y * dst_stride, results); } else { const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); vst1_u8(dst8 + x + y * dst_stride, results); } srcs[0] = srcs[1]; if (num_taps >= 4) { srcs[1] = srcs[2]; srcs[2] = srcs[3]; if (num_taps >= 6) { srcs[3] = srcs[4]; srcs[4] = srcs[5]; if (num_taps == 8) { srcs[5] = srcs[6]; srcs[6] = srcs[7]; } } } } while (++y < height); x += 8; } while (x < width); } template void FilterVertical4xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const uint8x8_t* const taps) { const int num_taps = GetNumTapsInFilter(filter_index); auto* dst8 = static_cast(dst); auto* dst16 = static_cast(dst); uint8x8_t srcs[9]; if (num_taps == 2) { srcs[2] = vdup_n_u8(0); srcs[0] = Load4(src); src += src_stride; int y = height; do { srcs[0] = Load4<1>(src, srcs[0]); src += src_stride; srcs[2] = Load4<0>(src, srcs[2]); src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[2], 4); const int16x8_t sums = SumOnePassTaps(srcs, taps); if (is_compound) { const uint16x8_t results = Compound1DShift(sums); vst1q_u16(dst16, results); dst16 += 4 << 1; } else { const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); StoreLo4(dst8, results); dst8 += dst_stride; StoreHi4(dst8, results); dst8 += dst_stride; } srcs[0] = srcs[2]; y -= 2; } while (y != 0); } else if (num_taps == 4) { srcs[4] = vdup_n_u8(0); srcs[0] = Load4(src); src += src_stride; srcs[0] = Load4<1>(src, srcs[0]); src += src_stride; srcs[2] = Load4(src); src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[2], 4); int y = height; do { srcs[2] = Load4<1>(src, srcs[2]); src += src_stride; srcs[4] = Load4<0>(src, srcs[4]); src += src_stride; srcs[3] = vext_u8(srcs[2], srcs[4], 4); const int16x8_t sums = SumOnePassTaps(srcs, taps); if (is_compound) { const uint16x8_t results = Compound1DShift(sums); vst1q_u16(dst16, results); dst16 += 4 << 1; } else { const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); StoreLo4(dst8, results); dst8 += dst_stride; StoreHi4(dst8, results); dst8 += dst_stride; } srcs[0] = srcs[2]; srcs[1] = srcs[3]; srcs[2] = srcs[4]; y -= 2; } while (y != 0); } else if (num_taps == 6) { srcs[6] = vdup_n_u8(0); srcs[0] = Load4(src); src += src_stride; srcs[0] = Load4<1>(src, srcs[0]); src += src_stride; srcs[2] = Load4(src); src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[2], 4); srcs[2] = Load4<1>(src, srcs[2]); src += src_stride; srcs[4] = Load4(src); src += src_stride; srcs[3] = vext_u8(srcs[2], srcs[4], 4); int y = height; do { srcs[4] = Load4<1>(src, srcs[4]); src += src_stride; srcs[6] = Load4<0>(src, srcs[6]); src += src_stride; srcs[5] = vext_u8(srcs[4], srcs[6], 4); const int16x8_t sums = SumOnePassTaps(srcs, taps); if (is_compound) { const uint16x8_t results = Compound1DShift(sums); vst1q_u16(dst16, results); dst16 += 4 << 1; } else { const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); StoreLo4(dst8, results); dst8 += dst_stride; StoreHi4(dst8, results); dst8 += dst_stride; } srcs[0] = srcs[2]; srcs[1] = srcs[3]; srcs[2] = srcs[4]; srcs[3] = srcs[5]; srcs[4] = srcs[6]; y -= 2; } while (y != 0); } else if (num_taps == 8) { srcs[8] = vdup_n_u8(0); srcs[0] = Load4(src); src += src_stride; srcs[0] = Load4<1>(src, srcs[0]); src += src_stride; srcs[2] = Load4(src); src += 
src_stride; srcs[1] = vext_u8(srcs[0], srcs[2], 4); srcs[2] = Load4<1>(src, srcs[2]); src += src_stride; srcs[4] = Load4(src); src += src_stride; srcs[3] = vext_u8(srcs[2], srcs[4], 4); srcs[4] = Load4<1>(src, srcs[4]); src += src_stride; srcs[6] = Load4(src); src += src_stride; srcs[5] = vext_u8(srcs[4], srcs[6], 4); int y = height; do { srcs[6] = Load4<1>(src, srcs[6]); src += src_stride; srcs[8] = Load4<0>(src, srcs[8]); src += src_stride; srcs[7] = vext_u8(srcs[6], srcs[8], 4); const int16x8_t sums = SumOnePassTaps(srcs, taps); if (is_compound) { const uint16x8_t results = Compound1DShift(sums); vst1q_u16(dst16, results); dst16 += 4 << 1; } else { const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); StoreLo4(dst8, results); dst8 += dst_stride; StoreHi4(dst8, results); dst8 += dst_stride; } srcs[0] = srcs[2]; srcs[1] = srcs[3]; srcs[2] = srcs[4]; srcs[3] = srcs[5]; srcs[4] = srcs[6]; srcs[5] = srcs[7]; srcs[6] = srcs[8]; y -= 2; } while (y != 0); } } template void FilterVertical2xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const uint8x8_t* const taps) { const int num_taps = GetNumTapsInFilter(filter_index); auto* dst8 = static_cast(dst); uint8x8_t srcs[9]; if (num_taps == 2) { srcs[2] = vdup_n_u8(0); srcs[0] = Load2(src); src += src_stride; int y = 0; do { srcs[0] = Load2<1>(src, srcs[0]); src += src_stride; srcs[0] = Load2<2>(src, srcs[0]); src += src_stride; srcs[0] = Load2<3>(src, srcs[0]); src += src_stride; srcs[2] = Load2<0>(src, srcs[2]); src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[2], 2); // This uses srcs[0]..srcs[1]. const int16x8_t sums = SumOnePassTaps(srcs, taps); const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); Store2<0>(dst8, results); dst8 += dst_stride; Store2<1>(dst8, results); if (height == 2) return; dst8 += dst_stride; Store2<2>(dst8, results); dst8 += dst_stride; Store2<3>(dst8, results); dst8 += dst_stride; srcs[0] = srcs[2]; y += 4; } while (y < height); } else if (num_taps == 4) { srcs[4] = vdup_n_u8(0); srcs[0] = Load2(src); src += src_stride; srcs[0] = Load2<1>(src, srcs[0]); src += src_stride; srcs[0] = Load2<2>(src, srcs[0]); src += src_stride; int y = 0; do { srcs[0] = Load2<3>(src, srcs[0]); src += src_stride; srcs[4] = Load2<0>(src, srcs[4]); src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[4], 2); srcs[4] = Load2<1>(src, srcs[4]); src += src_stride; srcs[2] = vext_u8(srcs[0], srcs[4], 4); srcs[4] = Load2<2>(src, srcs[4]); src += src_stride; srcs[3] = vext_u8(srcs[0], srcs[4], 6); // This uses srcs[0]..srcs[3]. const int16x8_t sums = SumOnePassTaps(srcs, taps); const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); Store2<0>(dst8, results); dst8 += dst_stride; Store2<1>(dst8, results); if (height == 2) return; dst8 += dst_stride; Store2<2>(dst8, results); dst8 += dst_stride; Store2<3>(dst8, results); dst8 += dst_stride; srcs[0] = srcs[4]; y += 4; } while (y < height); } else if (num_taps == 6) { // During the vertical pass the number of taps is restricted when // |height| <= 4. 
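    // (These longer-tap paths also write four rows per loop iteration and
    // have no early exit for |height| == 2, hence the assertion below.)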
assert(height > 4); srcs[8] = vdup_n_u8(0); srcs[0] = Load2(src); src += src_stride; srcs[0] = Load2<1>(src, srcs[0]); src += src_stride; srcs[0] = Load2<2>(src, srcs[0]); src += src_stride; srcs[0] = Load2<3>(src, srcs[0]); src += src_stride; srcs[4] = Load2(src); src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[4], 2); int y = 0; do { srcs[4] = Load2<1>(src, srcs[4]); src += src_stride; srcs[2] = vext_u8(srcs[0], srcs[4], 4); srcs[4] = Load2<2>(src, srcs[4]); src += src_stride; srcs[3] = vext_u8(srcs[0], srcs[4], 6); srcs[4] = Load2<3>(src, srcs[4]); src += src_stride; srcs[8] = Load2<0>(src, srcs[8]); src += src_stride; srcs[5] = vext_u8(srcs[4], srcs[8], 2); // This uses srcs[0]..srcs[5]. const int16x8_t sums = SumOnePassTaps(srcs, taps); const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); Store2<0>(dst8, results); dst8 += dst_stride; Store2<1>(dst8, results); dst8 += dst_stride; Store2<2>(dst8, results); dst8 += dst_stride; Store2<3>(dst8, results); dst8 += dst_stride; srcs[0] = srcs[4]; srcs[1] = srcs[5]; srcs[4] = srcs[8]; y += 4; } while (y < height); } else if (num_taps == 8) { // During the vertical pass the number of taps is restricted when // |height| <= 4. assert(height > 4); srcs[8] = vdup_n_u8(0); srcs[0] = Load2(src); src += src_stride; srcs[0] = Load2<1>(src, srcs[0]); src += src_stride; srcs[0] = Load2<2>(src, srcs[0]); src += src_stride; srcs[0] = Load2<3>(src, srcs[0]); src += src_stride; srcs[4] = Load2(src); src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[4], 2); srcs[4] = Load2<1>(src, srcs[4]); src += src_stride; srcs[2] = vext_u8(srcs[0], srcs[4], 4); srcs[4] = Load2<2>(src, srcs[4]); src += src_stride; srcs[3] = vext_u8(srcs[0], srcs[4], 6); int y = 0; do { srcs[4] = Load2<3>(src, srcs[4]); src += src_stride; srcs[8] = Load2<0>(src, srcs[8]); src += src_stride; srcs[5] = vext_u8(srcs[4], srcs[8], 2); srcs[8] = Load2<1>(src, srcs[8]); src += src_stride; srcs[6] = vext_u8(srcs[4], srcs[8], 4); srcs[8] = Load2<2>(src, srcs[8]); src += src_stride; srcs[7] = vext_u8(srcs[4], srcs[8], 6); // This uses srcs[0]..srcs[7]. const int16x8_t sums = SumOnePassTaps(srcs, taps); const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1); Store2<0>(dst8, results); dst8 += dst_stride; Store2<1>(dst8, results); dst8 += dst_stride; Store2<2>(dst8, results); dst8 += dst_stride; Store2<3>(dst8, results); dst8 += dst_stride; srcs[0] = srcs[4]; srcs[1] = srcs[5]; srcs[2] = srcs[6]; srcs[3] = srcs[7]; srcs[4] = srcs[8]; y += 4; } while (y < height); } } // This function is a simplified version of Convolve2D_C. // It is called when it is single prediction mode, where only vertical // filtering is required. // The output is the single prediction of the block, clipped to valid pixel // range. 
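// For reference (assuming kFilterBits == 7 and the halved tap tables,
// kAbsHalfSubPixelFilters, used below): each filter phase's taps sum to 64,
// so a flat 8-bit region of value v produces sums == 64 * v, and
// vqrshrun_n_s16(sums, kFilterBits - 1) == (64 * v + 32) >> 6 saturated to
// uint8_t, i.e. v again.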
void ConvolveVertical_NEON( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int vertical_filter_index, const int /*horizontal_filter_id*/, const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast(reference) - (vertical_taps / 2 - 1) * src_stride; auto* const dest = static_cast(prediction); const ptrdiff_t dest_stride = pred_stride; assert(vertical_filter_id != 0); uint8x8_t taps[8]; for (int k = 0; k < kSubPixelTaps; ++k) { taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]); } if (filter_index == 0) { // 6 tap. if (width == 2) { FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps + 1); } else { FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if ((static_cast(filter_index == 1) & (static_cast(vertical_filter_id == 1) | static_cast(vertical_filter_id == 15))) != 0) { // 5 tap. if (width == 2) { FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height, taps + 1); } else { FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if ((static_cast(filter_index == 1) & (static_cast(vertical_filter_id == 7) | static_cast(vertical_filter_id == 8) | static_cast(vertical_filter_id == 9))) != 0) { // 6 tap with weird negative taps. if (width == 2) { FilterVertical2xH<1, /*negative_outside_taps=*/true>( src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { FilterVertical4xH<1, /*is_compound=*/false, /*negative_outside_taps=*/true>( src, src_stride, dest, dest_stride, height, taps + 1); } else { FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>( src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 2) { FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 2) { FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps + 3); } else if (width == 4) { FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps + 3); } else { FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, taps + 3); } } else if (filter_index == 4) { // 4 tap. // Outside taps are negative. if (width == 2) { FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else if (width == 4) { FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else { FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps + 2); } } else { // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed // below map to 4 tap filters. 
assert(filter_index == 5 || (filter_index == 1 && (vertical_filter_id == 2 || vertical_filter_id == 3 || vertical_filter_id == 4 || vertical_filter_id == 5 || vertical_filter_id == 6 || vertical_filter_id == 10 || vertical_filter_id == 11 || vertical_filter_id == 12 || vertical_filter_id == 13 || vertical_filter_id == 14))); // According to GetNumTapsInFilter() this has 6 taps but here we are // treating it as though it has 4. if (filter_index == 1) src += src_stride; if (width == 2) { FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps + 2); } else if (width == 4) { FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps + 2); } else { FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, taps + 2); } } } void ConvolveCompoundCopy_NEON( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, const int width, const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { const auto* src = static_cast(reference); const ptrdiff_t src_stride = reference_stride; auto* dest = static_cast(prediction); constexpr int final_shift = kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; if (width >= 16) { int y = height; do { int x = 0; do { const uint8x16_t v_src = vld1q_u8(&src[x]); const uint16x8_t v_dest_lo = vshll_n_u8(vget_low_u8(v_src), final_shift); const uint16x8_t v_dest_hi = vshll_n_u8(vget_high_u8(v_src), final_shift); vst1q_u16(&dest[x], v_dest_lo); x += 8; vst1q_u16(&dest[x], v_dest_hi); x += 8; } while (x < width); src += src_stride; dest += width; } while (--y != 0); } else if (width == 8) { int y = height; do { const uint8x8_t v_src = vld1_u8(&src[0]); const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift); vst1q_u16(&dest[0], v_dest); src += src_stride; dest += width; } while (--y != 0); } else { // width == 4 uint8x8_t v_src = vdup_n_u8(0); int y = height; do { v_src = Load4<0>(&src[0], v_src); src += src_stride; v_src = Load4<1>(&src[0], v_src); src += src_stride; const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift); vst1q_u16(&dest[0], v_dest); dest += 4 << 1; y -= 2; } while (y != 0); } } void ConvolveCompoundVertical_NEON( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int vertical_filter_index, const int /*horizontal_filter_id*/, const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast(reference) - (vertical_taps / 2 - 1) * src_stride; auto* const dest = static_cast(prediction); assert(vertical_filter_id != 0); uint8x8_t taps[8]; for (int k = 0; k < kSubPixelTaps; ++k) { taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]); } if (filter_index == 0) { // 6 tap. if (width == 4) { FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if ((static_cast(filter_index == 1) & (static_cast(vertical_filter_id == 1) | static_cast(vertical_filter_id == 15))) != 0) { // 5 tap. 
if (width == 4) { FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if ((static_cast(filter_index == 1) & (static_cast(vertical_filter_id == 7) | static_cast(vertical_filter_id == 8) | static_cast(vertical_filter_id == 9))) != 0) { // 6 tap with weird negative taps. if (width == 4) { FilterVertical4xH<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>( src, src_stride, dest, width, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 4) { FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps); } else { FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 4) { FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 3); } else { FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 3); } } else if (filter_index == 4) { // 4 tap. if (width == 4) { FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 2); } else { FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 2); } } else { // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map // to 4 tap filters. assert(filter_index == 5 || (filter_index == 1 && (vertical_filter_id == 2 || vertical_filter_id == 3 || vertical_filter_id == 4 || vertical_filter_id == 5 || vertical_filter_id == 6 || vertical_filter_id == 10 || vertical_filter_id == 11 || vertical_filter_id == 12 || vertical_filter_id == 13 || vertical_filter_id == 14))); // According to GetNumTapsInFilter() this has 6 taps but here we are // treating it as though it has 4. 
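    // For these |vertical_filter_id| values the two outermost coefficients of
    // the filter_index == 1 filters are presumably zero, so only the middle
    // four taps contribute (hence |taps + 2|). Since |src| was backed up by
    // (6 / 2 - 1) == 2 rows above, advancing it by one stride below recenters
    // the effective 4-tap window.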
if (filter_index == 1) src += src_stride;
    if (width == 4) {
      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 2);
    } else {
      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 2);
    }
  }
}

void ConvolveCompoundHorizontal_NEON(
    const void* LIBGAV1_RESTRICT const reference,
    const ptrdiff_t reference_stride, const int horizontal_filter_index,
    const int /*vertical_filter_index*/, const int horizontal_filter_id,
    const int /*vertical_filter_id*/, const int width, const int height,
    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
  const auto* const src =
      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
  auto* const dest = static_cast<uint16_t*>(prediction);

  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
      src, reference_stride, dest, width, width, height, horizontal_filter_id,
      filter_index);
}

template <int vertical_taps>
void Compound2DVertical(
    const uint16_t* LIBGAV1_RESTRICT const intermediate_result,
    const int width, const int height, const int16x8_t taps,
    void* LIBGAV1_RESTRICT const prediction) {
  auto* const dest = static_cast<uint16_t*>(prediction);
  if (width == 4) {
    Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
        intermediate_result, dest, width, height, taps);
  } else {
    Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
        intermediate_result, dest, width, width, height, taps);
  }
}

void ConvolveCompound2D_NEON(
    const void* LIBGAV1_RESTRICT const reference,
    const ptrdiff_t reference_stride, const int horizontal_filter_index,
    const int vertical_filter_index, const int horizontal_filter_id,
    const int vertical_filter_id, const int width, const int height,
    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
  // The output of the horizontal filter, i.e. the intermediate_result, is
  // guaranteed to fit in int16_t.
  uint16_t
      intermediate_result[kMaxSuperBlockSizeInPixels *
                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];

  // Horizontal filter.
  // Filter types used for width <= 4 are different from those for width > 4.
  // When width > 4, the valid filter index range is always [0, 3].
  // When width <= 4, the valid filter index range is always [4, 5].
  // Similarly for height.
  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
  const int intermediate_height = height + vertical_taps - 1;
  const ptrdiff_t src_stride = reference_stride;
  const auto* const src = static_cast<const uint8_t*>(reference) -
                          (vertical_taps / 2 - 1) * src_stride -
                          kHorizontalOffset;

  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
      src, src_stride, intermediate_result, width, width, intermediate_height,
      horizontal_filter_id, horiz_filter_index);

  // Vertical filter.
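  // At this point |intermediate_result| holds |intermediate_height| ==
  // height + vertical_taps - 1 rows of |width| compound-precision values; the
  // worst case (width == height == 128 with an 8-tap filter) is 128 * 135
  // entries, which is what the buffer above was sized for (assuming
  // kMaxSuperBlockSizeInPixels == 128 and kSubPixelTaps == 8).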
assert(vertical_filter_id != 0); const int16x8_t taps = vmovl_s8( vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); if (vertical_taps == 8) { Compound2DVertical<8>(intermediate_result, width, height, taps, prediction); } else if (vertical_taps == 6) { Compound2DVertical<6>(intermediate_result, width, height, taps, prediction); } else if (vertical_taps == 4) { Compound2DVertical<4>(intermediate_result, width, height, taps, prediction); } else { // |vertical_taps| == 2 Compound2DVertical<2>(intermediate_result, width, height, taps, prediction); } } inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT const src, uint8_t* LIBGAV1_RESTRICT const dst) { const uint8x16_t left = vld1q_u8(src); const uint8x16_t right = vld1q_u8(src + 1); vst1q_u8(dst, vrhaddq_u8(left, right)); } template inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, const int height, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const ptrdiff_t src_remainder_stride = src_stride - (width - 16); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); int y = height; do { HalfAddHorizontal(src, dst); if (width >= 32) { src += 16; dst += 16; HalfAddHorizontal(src, dst); if (width >= 64) { src += 16; dst += 16; HalfAddHorizontal(src, dst); src += 16; dst += 16; HalfAddHorizontal(src, dst); if (width == 128) { src += 16; dst += 16; HalfAddHorizontal(src, dst); src += 16; dst += 16; HalfAddHorizontal(src, dst); src += 16; dst += 16; HalfAddHorizontal(src, dst); src += 16; dst += 16; HalfAddHorizontal(src, dst); } } } src += src_remainder_stride; dst += dst_remainder_stride; } while (--y != 0); } void ConvolveIntraBlockCopyHorizontal_NEON( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*subpixel_x*/, const int /*subpixel_y*/, const int width, const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast(reference); auto* dest = static_cast(prediction); if (width == 128) { IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest, pred_stride); } else if (width == 64) { IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest, pred_stride); } else if (width == 32) { IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest, pred_stride); } else if (width == 16) { IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest, pred_stride); } else if (width == 8) { int y = height; do { const uint8x8_t left = vld1_u8(src); const uint8x8_t right = vld1_u8(src + 1); vst1_u8(dest, vrhadd_u8(left, right)); src += reference_stride; dest += pred_stride; } while (--y != 0); } else { // width == 4 uint8x8_t left = vdup_n_u8(0); uint8x8_t right = vdup_n_u8(0); int y = height; do { left = Load4<0>(src, left); right = Load4<0>(src + 1, right); src += reference_stride; left = Load4<1>(src, left); right = Load4<1>(src + 1, right); src += reference_stride; const uint8x8_t result = vrhadd_u8(left, right); StoreLo4(dest, result); dest += pred_stride; StoreHi4(dest, result); dest += pred_stride; y -= 2; } while (y != 0); } } template inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, const int height, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const ptrdiff_t 
src_remainder_stride = src_stride - (width - 16); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); uint8x16_t row[8], below[8]; row[0] = vld1q_u8(src); if (width >= 32) { src += 16; row[1] = vld1q_u8(src); if (width >= 64) { src += 16; row[2] = vld1q_u8(src); src += 16; row[3] = vld1q_u8(src); if (width == 128) { src += 16; row[4] = vld1q_u8(src); src += 16; row[5] = vld1q_u8(src); src += 16; row[6] = vld1q_u8(src); src += 16; row[7] = vld1q_u8(src); } } } src += src_remainder_stride; int y = height; do { below[0] = vld1q_u8(src); if (width >= 32) { src += 16; below[1] = vld1q_u8(src); if (width >= 64) { src += 16; below[2] = vld1q_u8(src); src += 16; below[3] = vld1q_u8(src); if (width == 128) { src += 16; below[4] = vld1q_u8(src); src += 16; below[5] = vld1q_u8(src); src += 16; below[6] = vld1q_u8(src); src += 16; below[7] = vld1q_u8(src); } } } src += src_remainder_stride; vst1q_u8(dst, vrhaddq_u8(row[0], below[0])); row[0] = below[0]; if (width >= 32) { dst += 16; vst1q_u8(dst, vrhaddq_u8(row[1], below[1])); row[1] = below[1]; if (width >= 64) { dst += 16; vst1q_u8(dst, vrhaddq_u8(row[2], below[2])); row[2] = below[2]; dst += 16; vst1q_u8(dst, vrhaddq_u8(row[3], below[3])); row[3] = below[3]; if (width >= 128) { dst += 16; vst1q_u8(dst, vrhaddq_u8(row[4], below[4])); row[4] = below[4]; dst += 16; vst1q_u8(dst, vrhaddq_u8(row[5], below[5])); row[5] = below[5]; dst += 16; vst1q_u8(dst, vrhaddq_u8(row[6], below[6])); row[6] = below[6]; dst += 16; vst1q_u8(dst, vrhaddq_u8(row[7], below[7])); row[7] = below[7]; } } } dst += dst_remainder_stride; } while (--y != 0); } void ConvolveIntraBlockCopyVertical_NEON( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, const int width, const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast(reference); auto* dest = static_cast(prediction); if (width == 128) { IntraBlockCopyVertical<128>(src, reference_stride, height, dest, pred_stride); } else if (width == 64) { IntraBlockCopyVertical<64>(src, reference_stride, height, dest, pred_stride); } else if (width == 32) { IntraBlockCopyVertical<32>(src, reference_stride, height, dest, pred_stride); } else if (width == 16) { IntraBlockCopyVertical<16>(src, reference_stride, height, dest, pred_stride); } else if (width == 8) { uint8x8_t row, below; row = vld1_u8(src); src += reference_stride; int y = height; do { below = vld1_u8(src); src += reference_stride; vst1_u8(dest, vrhadd_u8(row, below)); dest += pred_stride; row = below; } while (--y != 0); } else { // width == 4 uint8x8_t row = Load4(src); uint8x8_t below = vdup_n_u8(0); src += reference_stride; int y = height; do { below = Load4<0>(src, below); src += reference_stride; StoreLo4(dest, vrhadd_u8(row, below)); dest += pred_stride; row = below; } while (--y != 0); } } template inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, const int height, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const ptrdiff_t src_remainder_stride = src_stride - (width - 8); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); uint16x8_t row[16]; row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); if (width >= 16) { src += 8; row[1] = 
vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); if (width >= 32) { src += 8; row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); if (width >= 64) { src += 8; row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); if (width == 128) { src += 8; row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); src += 8; row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); } } } } src += src_remainder_stride; int y = height; do { const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2)); row[0] = below_0; if (width >= 16) { src += 8; dst += 8; const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2)); row[1] = below_1; if (width >= 32) { src += 8; dst += 8; const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2)); row[2] = below_2; src += 8; dst += 8; const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2)); row[3] = below_3; if (width >= 64) { src += 8; dst += 8; const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2)); row[4] = below_4; src += 8; dst += 8; const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2)); row[5] = below_5; src += 8; dst += 8; const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2)); row[6] = below_6; src += 8; dst += 8; const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2)); row[7] = below_7; if (width == 128) { src += 8; dst += 8; const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2)); row[8] = below_8; src += 8; dst += 8; const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2)); row[9] = below_9; src += 8; dst += 8; const uint16x8_t below_10 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2)); row[10] = below_10; src += 8; dst += 8; const uint16x8_t below_11 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2)); row[11] = below_11; src += 8; dst += 8; const uint16x8_t below_12 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2)); row[12] = below_12; src += 8; dst += 8; const uint16x8_t below_13 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2)); row[13] = below_13; src += 8; dst += 8; const uint16x8_t below_14 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], 
below_14), 2)); row[14] = below_14; src += 8; dst += 8; const uint16x8_t below_15 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2)); row[15] = below_15; } } } } src += src_remainder_stride; dst += dst_remainder_stride; } while (--y != 0); } void ConvolveIntraBlockCopy2D_NEON( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, const int width, const int height, void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast(reference); auto* dest = static_cast(prediction); // Note: allow vertical access to height + 1. Because this function is only // for u/v plane of intra block copy, such access is guaranteed to be within // the prediction block. if (width == 128) { IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride); } else if (width == 64) { IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride); } else if (width == 32) { IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride); } else if (width == 16) { IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride); } else if (width == 8) { IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride); } else { // width == 4 uint8x8_t left = Load4(src); uint8x8_t right = Load4(src + 1); src += reference_stride; uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); int y = height; do { left = Load4<0>(src, left); right = Load4<0>(src + 1, right); src += reference_stride; left = Load4<1>(src, left); right = Load4<1>(src + 1, right); src += reference_stride; const uint16x8_t below = vaddl_u8(left, right); const uint8x8_t result = vrshrn_n_u16( vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2); StoreLo4(dest, result); dest += pred_stride; StoreHi4(dest, result); dest += pred_stride; row = vget_high_u16(below); y -= 2; } while (y != 0); } } void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON; dsp->convolve[0][0][1][0] = ConvolveVertical_NEON; dsp->convolve[0][0][1][1] = Convolve2D_NEON; dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON; dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON; dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON; dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON; dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON; dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON; dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON; dsp->convolve_scale[0] = ConvolveScale2D_NEON; dsp->convolve_scale[1] = ConvolveScale2D_NEON; } } // namespace } // namespace low_bitdepth void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 #else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { void ConvolveInit_NEON() {} } // namespace dsp } // namespace libgav1 #endif // LIBGAV1_ENABLE_NEON