diff options
Diffstat (limited to 'src/dsp/arm/obmc_neon.cc')
-rw-r--r-- | src/dsp/arm/obmc_neon.cc | 688 |
1 files changed, 618 insertions, 70 deletions
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc index 1111a90..659ed8e 100644 --- a/src/dsp/arm/obmc_neon.cc +++ b/src/dsp/arm/obmc_neon.cc @@ -33,10 +33,15 @@ namespace libgav1 { namespace dsp { namespace { - #include "src/dsp/obmc.inc" -inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred, +} // namespace + +namespace low_bitdepth { +namespace { + +inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred, + const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint8x8_t pred_mask, const uint8x8_t obmc_pred_mask) { const uint8x8_t pred_val = Load4(pred); @@ -47,35 +52,17 @@ inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred, StoreLo4(pred, result); } -template <bool from_left> -inline void OverlapBlend2xH_NEON(uint8_t* const prediction, - const ptrdiff_t prediction_stride, - const int height, - const uint8_t* const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { - uint8_t* pred = prediction; +inline void OverlapBlendFromLeft2xH_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride) { const uint8x8_t mask_inverter = vdup_n_u8(64); - const uint8_t* obmc_pred = obmc_prediction; - uint8x8_t pred_mask; - uint8x8_t obmc_pred_mask; - int compute_height; - const int mask_offset = height - 2; - if (from_left) { - pred_mask = Load2(kObmcMask); - obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); - compute_height = height; - } else { - // Weights for the last line are all 64, which is a no-op. - compute_height = height - 1; - } + const uint8x8_t pred_mask = Load2(kObmcMask); + const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); uint8x8_t pred_val = vdup_n_u8(0); uint8x8_t obmc_pred_val = vdup_n_u8(0); int y = 0; do { - if (!from_left) { - pred_mask = vdup_n_u8(kObmcMask[mask_offset + y]); - obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); - } pred_val = Load2<0>(pred, pred_val); const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val); @@ -85,16 +72,13 @@ inline void OverlapBlend2xH_NEON(uint8_t* const prediction, pred += prediction_stride; obmc_pred += obmc_prediction_stride; - } while (++y != compute_height); + } while (++y != height); } inline void OverlapBlendFromLeft4xH_NEON( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, const ptrdiff_t obmc_prediction_stride) { - uint8_t* pred = prediction; - const uint8_t* obmc_pred = obmc_prediction; - const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8x8_t pred_mask = Load4(kObmcMask + 2); // 64 - mask @@ -114,11 +98,9 @@ inline void OverlapBlendFromLeft4xH_NEON( } inline void OverlapBlendFromLeft8xH_NEON( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, const ptrdiff_t obmc_prediction_stride) { - uint8_t* pred = prediction; - const uint8_t* obmc_pred = obmc_prediction; const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6); // 64 - mask @@ -137,17 +119,19 @@ inline void OverlapBlendFromLeft8xH_NEON( } while (++y != height); } -void OverlapBlendFromLeft_NEON(void* const prediction, - const ptrdiff_t prediction_stride, - const int width, const int height, - const void* const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { +void OverlapBlendFromLeft_NEON( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { auto* pred = static_cast<uint8_t*>(prediction); const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + assert(width >= 2); + assert(height >= 4); if (width == 2) { - OverlapBlend2xH_NEON<true>(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); return; } if (width == 4) { @@ -194,13 +178,10 @@ void OverlapBlendFromLeft_NEON(void* const prediction, } while (x < width); } -inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction, - const ptrdiff_t prediction_stride, - const uint8_t* const obmc_prediction, - const ptrdiff_t obmc_prediction_stride, - const int height) { - uint8_t* pred = prediction; - const uint8_t* obmc_pred = obmc_prediction; +inline void OverlapBlendFromTop4x4_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride, const int height) { uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]); const uint8x8_t mask_inverter = vdup_n_u8(64); uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); @@ -224,16 +205,14 @@ inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction, } inline void OverlapBlendFromTop4xH_NEON( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, const ptrdiff_t obmc_prediction_stride) { if (height < 8) { - OverlapBlendFromTop4x4_NEON(prediction, prediction_stride, obmc_prediction, + OverlapBlendFromTop4x4_NEON(pred, prediction_stride, obmc_pred, obmc_prediction_stride, height); return; } - uint8_t* pred = prediction; - const uint8_t* obmc_pred = obmc_prediction; const uint8_t* mask = kObmcMask + height - 2; const uint8x8_t mask_inverter = vdup_n_u8(64); int y = 0; @@ -282,11 +261,9 @@ inline void OverlapBlendFromTop4xH_NEON( } inline void OverlapBlendFromTop8xH_NEON( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, const ptrdiff_t obmc_prediction_stride) { - uint8_t* pred = prediction; - const uint8_t* obmc_pred = obmc_prediction; const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8_t* mask = kObmcMask + height - 2; const int compute_height = height - (height >> 2); @@ -307,19 +284,16 @@ inline void OverlapBlendFromTop8xH_NEON( } while (++y != compute_height); } -void OverlapBlendFromTop_NEON(void* const prediction, - const ptrdiff_t prediction_stride, - const int width, const int height, - const void* const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { +void OverlapBlendFromTop_NEON( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { auto* pred = static_cast<uint8_t*>(prediction); const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + assert(width >= 4); + assert(height >= 2); - if (width == 2) { - OverlapBlend2xH_NEON<false>(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); - return; - } if (width == 4) { OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred, obmc_prediction_stride); @@ -374,8 +348,582 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth -void ObmcInit_NEON() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// This is a flat array of masks for each block dimension from 2 to 32. The +// starting index for each length is length-2. The value 64 leaves the result +// equal to |pred| and may be ignored if convenient. Vector loads may overrread +// values meant for larger sizes, but these values will be unused. +constexpr uint16_t kObmcMask[62] = { + // Obmc Mask 2 + 45, 64, + // Obmc Mask 4 + 39, 50, 59, 64, + // Obmc Mask 8 + 36, 42, 48, 53, 57, 61, 64, 64, + // Obmc Mask 16 + 34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64, + // Obmc Mask 32 + 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, + 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64}; + +inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred, + const uint8_t* LIBGAV1_RESTRICT const obmc_pred, + const uint16x4_t pred_mask, + const uint16x4_t obmc_pred_mask) { + const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); + const uint16x4_t obmc_pred_val = + vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val); + const uint16x4_t result = + vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + return result; +} + +inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, + const uint8_t* LIBGAV1_RESTRICT const obmc_pred, + const uint16x8_t pred_mask, + const uint16x8_t obmc_pred_mask) { + const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); + const uint16x8_t obmc_pred_val = + vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val); + const uint16x8_t result = + vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + return result; +} + +inline void OverlapBlendFromLeft2xH_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride) { + const uint16x4_t mask_inverter = vdup_n_u16(64); + // Second two lanes unused. + const uint16x4_t pred_mask = vld1_u16(kObmcMask); + const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); + int y = 0; + do { + const uint16x4_t result_0 = + BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); + Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0); + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + const uint16x4_t result_1 = + BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); + Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1); + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + y += 2; + } while (y != height); +} + +inline void OverlapBlendFromLeft4xH_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride) { + const uint16x4_t mask_inverter = vdup_n_u16(64); + const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2); + // 64 - mask + const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); + int y = 0; + do { + const uint16x4_t result_0 = + BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + const uint16x4_t result_1 = + BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + y += 2; + } while (y != height); +} + +void OverlapBlendFromLeft_NEON( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint8_t*>(prediction); + const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + assert(width >= 2); + assert(height >= 4); + + if (width == 2) { + OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + if (width == 4) { + OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + const uint16x8_t mask_inverter = vdupq_n_u16(64); + const uint16_t* mask = kObmcMask + width - 2; + int x = 0; + do { + pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x); + obmc_pred = reinterpret_cast<const uint8_t*>( + static_cast<const uint16_t*>(obmc_prediction) + x); + const uint16x8_t pred_mask = vld1q_u16(mask + x); + // 64 - mask + const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); + int y = 0; + do { + const uint16x8_t result = + BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y < height); + x += 8; + } while (x < width); +} + +template <int lane> +inline uint16x4_t BlendObmcFromTop4( + uint8_t* LIBGAV1_RESTRICT const pred, + const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, + const uint16x8_t obmc_pred_mask) { + const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); + const uint16x4_t obmc_pred_val = + vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask); + const uint16x4_t result = vrshr_n_u16( + VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); + return result; +} + +template <int lane> +inline uint16x8_t BlendObmcFromTop8( + uint8_t* LIBGAV1_RESTRICT const pred, + const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, + const uint16x8_t obmc_pred_mask) { + const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); + const uint16x8_t obmc_pred_val = + vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask); + const uint16x8_t result = vrshrq_n_u16( + VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); + return result; +} + +inline void OverlapBlendFromTop4x2Or4_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride, const int height) { + const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]); + const uint16x8_t mask_inverter = vdupq_n_u16(64); + const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); + uint16x4_t result = + BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + if (height == 2) { + // Mask value is 64, meaning |pred| is unchanged. + return; + } + + result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); +} + +inline void OverlapBlendFromTop4xH_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride) { + if (height < 8) { + OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, + obmc_prediction_stride, height); + return; + } + const uint16_t* mask = kObmcMask + height - 2; + const uint16x8_t mask_inverter = vdupq_n_u16(64); + int y = 0; + // Compute 6 lines for height 8, or 12 lines for height 16. The remaining + // lines are unchanged as the corresponding mask value is 64. + do { + const uint16x8_t pred_mask = vld1q_u16(&mask[y]); + const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); + uint16x4_t result = + BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + // Increment for the right mask index. + y += 6; + } while (y < height - 4); +} + +inline void OverlapBlendFromTop8xH_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride, const int height) { + const uint16_t* mask = kObmcMask + height - 2; + const uint16x8_t mask_inverter = vdupq_n_u16(64); + uint16x8_t pred_mask = vld1q_u16(mask); + uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); + uint16x8_t result = + BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + if (height == 2) return; + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + if (height == 4) return; + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + + if (height == 8) return; + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + pred_mask = vld1q_u16(&mask[8]); + obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); + + result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + + if (height == 16) return; + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + pred_mask = vld1q_u16(&mask[16]); + obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); + + result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); + vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); +} + +void OverlapBlendFromTop_NEON( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint8_t*>(prediction); + const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + assert(width >= 4); + assert(height >= 2); + + if (width == 4) { + OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + + if (width == 8) { + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, + obmc_prediction_stride, height); + return; + } + + const uint16_t* mask = kObmcMask + height - 2; + const uint16x8_t mask_inverter = vdupq_n_u16(64); + const uint16x8_t pred_mask = vld1q_u16(mask); + // 64 - mask + const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); +#define OBMC_ROW_FROM_TOP(n) \ + do { \ + int x = 0; \ + do { \ + const uint16x8_t result = BlendObmcFromTop8<n>( \ + reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \ + reinterpret_cast<const uint8_t*>( \ + reinterpret_cast<const uint16_t*>(obmc_pred) + x), \ + pred_mask, obmc_pred_mask); \ + vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result); \ + \ + x += 8; \ + } while (x < width); \ + } while (false) + + // Compute 1 row. + if (height == 2) { + OBMC_ROW_FROM_TOP(0); + return; + } + + // Compute 3 rows. + if (height == 4) { + OBMC_ROW_FROM_TOP(0); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(2); + return; + } + + // Compute 6 rows. + if (height == 8) { + OBMC_ROW_FROM_TOP(0); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(2); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(3); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(4); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(5); + return; + } + + // Compute 12 rows. + if (height == 16) { + OBMC_ROW_FROM_TOP(0); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(2); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(3); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(4); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(5); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(6); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(7); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + const uint16x8_t pred_mask = vld1q_u16(&mask[8]); + // 64 - mask + const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); + OBMC_ROW_FROM_TOP(0); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(2); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(3); + return; + } + + // Stop when mask value becomes 64. This is a multiple of 8 for height 32 + // and 64. + const int compute_height = height - (height >> 2); + int y = 0; + do { + const uint16x8_t pred_mask = vld1q_u16(&mask[y]); + // 64 - mask + const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); + OBMC_ROW_FROM_TOP(0); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(2); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(3); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(4); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(5); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(6); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + OBMC_ROW_FROM_TOP(7); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + y += 8; + } while (y < compute_height); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON; + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void ObmcInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 |