diff options
Diffstat (limited to 'src/dsp/arm/average_blend_neon.cc')
-rw-r--r-- | src/dsp/arm/average_blend_neon.cc | 135 |
1 files changed, 133 insertions, 2 deletions
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
index 834e8b4..5b4c094 100644
--- a/src/dsp/arm/average_blend_neon.cc
+++ b/src/dsp/arm/average_blend_neon.cc
@@ -35,6 +35,11 @@ namespace {
 constexpr int kInterPostRoundBit =
     kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
 
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
 inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
                                   const int16_t* prediction_1) {
   const int16x8_t pred0 = vld1q_s16(prediction_0);
@@ -128,13 +133,139 @@ void Init8bpp() {
 }
 
 }  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
+                                   const uint16_t* prediction_1,
+                                   const int32x4_t compound_offset,
+                                   const uint16x8_t v_bitdepth) {
+  const uint16x8_t pred0 = vld1q_u16(prediction_0);
+  const uint16x8_t pred1 = vld1q_u16(prediction_1);
+  const uint32x4_t pred_lo =
+      vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1));
+  const uint32x4_t pred_hi =
+      vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+  const int32x4_t offset_lo =
+      vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+  const int32x4_t offset_hi =
+      vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+  const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+  const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+  return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
+
+inline void AverageBlendLargeRow(const uint16_t* prediction_0,
+                                 const uint16_t* prediction_1, const int width,
+                                 uint16_t* dest,
+                                 const int32x4_t compound_offset,
+                                 const uint16x8_t v_bitdepth) {
+  int x = width;
+  do {
+    vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+                                     compound_offset, v_bitdepth));
+    prediction_0 += 8;
+    prediction_1 += 8;
+    dest += 8;
+
+    vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+                                     compound_offset, v_bitdepth));
+    prediction_0 += 8;
+    prediction_1 += 8;
+    dest += 8;
+
+    x -= 16;
+  } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+                       const int width, const int height, void* const dest,
+                       const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y = height;
+
+  const ptrdiff_t dst_stride = dest_stride >> 1;
+  const int32x4_t compound_offset =
+      vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
+  const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  if (width == 4) {
+    do {
+      const uint16x8_t result =
+          AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1_u16(dst, vget_low_u16(result));
+      dst += dst_stride;
+      vst1_u16(dst, vget_high_u16(result));
+      dst += dst_stride;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  if (width == 8) {
+    do {
+      vst1q_u16(dst,
+                AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+      dst += dst_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1q_u16(dst,
+                AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+      dst += dst_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  do {
+    AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+                         v_bitdepth);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+                         v_bitdepth);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    y -= 2;
+  } while (y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->average_blend = AverageBlend_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
-void AverageBlendInit_NEON() { Init8bpp(); }
+void AverageBlendInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {