diff options
Diffstat (limited to 'src/dsp/arm/intra_edge_neon.cc')
-rw-r--r-- | src/dsp/arm/intra_edge_neon.cc | 243 |
1 files changed, 232 insertions, 11 deletions
diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc index 00b186a..074283f 100644 --- a/src/dsp/arm/intra_edge_neon.cc +++ b/src/dsp/arm/intra_edge_neon.cc @@ -25,7 +25,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" -#include "src/utils/common.h" // RightShiftWithRounding() +#include "src/utils/common.h" namespace libgav1 { namespace dsp { @@ -35,6 +35,11 @@ namespace { // required. constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}}; +} // namespace + +namespace low_bitdepth { +namespace { + void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { assert(strength == 1 || strength == 2 || strength == 3); const int kernel_index = strength - 1; @@ -44,6 +49,8 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // elements written is |size| - 1. if (size == 1) return; + const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100), + vcreate_u8(0x0f0e0d0c0b0a0908)); // |strength| 1 and 2 use a 3 tap filter. if (strength < 3) { // The last value requires extending the buffer (duplicating @@ -89,7 +96,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // |remainder| == 1 then we don't have to do anything. const int remainder = (size - 1) & 0xf; if (remainder > 1) { - uint8_t temp[16]; const uint8x16_t src_1 = vld1q_u8(dst_buffer + i); const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1); @@ -102,9 +108,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { const uint8x16_t result = vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); - - vst1q_u8(temp, result); - memcpy(dst_buffer + i, temp, remainder); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. + const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result); + vst1q_u8(dst_buffer + i, dst_remainder); } dst_buffer[size - 1] = last_val; @@ -173,7 +181,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // Like the 3 tap but if there are two remaining values we have already // calculated them. if (remainder > 2) { - uint8_t temp[16]; const uint8x16_t src_2 = vld1q_u8(dst_buffer + i); const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1); const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2); @@ -193,9 +200,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { const uint8x16_t result = vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); - - vst1q_u8(temp, result); - memcpy(dst_buffer + i, temp, remainder); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. + const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result); + vst1q_u8(dst_buffer + i, dst_remainder); } dst_buffer[1] = special_vals[0]; @@ -284,13 +293,225 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +const uint16_t kRemainderMask[8][8] = { + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000}, +}; + +void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { + assert(strength == 1 || strength == 2 || strength == 3); + const int kernel_index = strength - 1; + auto* const dst_buffer = static_cast<uint16_t*>(buffer); + + // The first element is not written out (but it is input) so the number of + // elements written is |size| - 1. + if (size == 1) return; + + // |strength| 1 and 2 use a 3 tap filter. + if (strength < 3) { + // The last value requires extending the buffer (duplicating + // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in + // neon. + const uint16_t last_val = RightShiftWithRounding( + kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] + + kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] + + kKernelsNEON[kernel_index][0] * dst_buffer[size - 1], + 4); + + const uint16_t krn0 = kKernelsNEON[kernel_index][0]; + const uint16_t krn1 = kKernelsNEON[kernel_index][1]; + + // The first value we need gets overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer); + int i = 1; + + // Process blocks until there are less than 16 values remaining. + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_0| will read past the + // end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + // Load the next row before overwriting. This loads an extra 7 values + // past |size| on the trailing iteration. + src_0 = vld1q_u16(dst_buffer + i + 7); + vst1q_u16(dst_buffer + i, result); + } + + // The last output value |last_val| was already calculated so if + // |remainder| == 1 then we don't have to do anything. + const int remainder = (size - 1) & 0x7; + if (remainder > 1) { + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[size - 1] = last_val; + return; + } + + assert(strength == 3); + // 5 tap filter. The first element requires duplicating |buffer[0]| and the + // last two elements require duplicating |buffer[size - 1]|. + uint16_t special_vals[3]; + special_vals[0] = RightShiftWithRounding( + (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) + + (dst_buffer[2] << 2) + (dst_buffer[3] << 1), + 4); + // Clamp index for very small |size| values. + const int first_index_min = std::max(size - 4, 0); + const int second_index_min = std::max(size - 3, 0); + const int third_index_min = std::max(size - 2, 0); + special_vals[1] = RightShiftWithRounding( + (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) + + (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) + + (dst_buffer[size - 1] << 1), + 4); + special_vals[2] = RightShiftWithRounding( + (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) + + // x << 2 + x << 2 == x << 3 + (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1), + 4); + + // The first two values we need get overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer - 1); + uint16x8_t src_1 = vld1q_u16(dst_buffer); + int i = 1; + + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_[01]| will read past + // the end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + + // Load the next before overwriting. + src_0 = vld1q_u16(dst_buffer + i + 6); + src_1 = vld1q_u16(dst_buffer + i + 7); + + vst1q_u16(dst_buffer + i, result); + } + + const int remainder = (size - 1) & 0x7; + // Like the 3 tap but if there are two remaining values we have already + // calculated them. + if (remainder > 2) { + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[1] = special_vals[0]; + // Avoid overwriting |dst_buffer[0]|. + if (size > 2) dst_buffer[size - 2] = special_vals[1]; + dst_buffer[size - 1] = special_vals[2]; +} + +void IntraEdgeUpsampler_NEON(void* buffer, const int size) { + assert(size % 4 == 0 && size <= 16); + auto* const pixel_buffer = static_cast<uint16_t*>(buffer); -void IntraEdgeInit_NEON() { Init8bpp(); } + // Extend first/last samples + pixel_buffer[-2] = pixel_buffer[-1]; + pixel_buffer[size] = pixel_buffer[size - 1]; + + const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2)); + const int16x8_t src_hi = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8)); + const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3)); + const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3)); + + int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo); + sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2)); + sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3)); + sum_lo = vrshrq_n_s16(sum_lo, 4); + + uint16x8x2_t result_lo; + result_lo.val[0] = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2)); + + if (size > 8) { + const int16x8_t src_hi_extra = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2)); + const int16x8_t src9_hi_extra = + vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3)); + + int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi); + sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2)); + sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3)); + sum_hi = vrshrq_n_s16(sum_hi, 4); + + uint16x8x2_t result_hi; + result_hi.val[0] = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_hi.val[1] = + vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2)); + vst2q_u16(pixel_buffer - 1, result_lo); + vst2q_u16(pixel_buffer + 15, result_hi); + } else { + vst2q_u16(pixel_buffer - 1, result_lo); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->intra_edge_filter = IntraEdgeFilter_NEON; + dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraEdgeInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { |