diff options
Diffstat (limited to 'src/dsp/arm/obmc_neon.cc')
-rw-r--r-- | src/dsp/arm/obmc_neon.cc | 523 |
1 files changed, 268 insertions, 255 deletions
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc index 659ed8e..271bbaa 100644 --- a/src/dsp/arm/obmc_neon.cc +++ b/src/dsp/arm/obmc_neon.cc @@ -52,6 +52,17 @@ inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred, StoreLo4(pred, result); } +inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred, + const uint8x8_t obmc_pred_val, + const uint8x8_t pred_mask, + const uint8x8_t obmc_pred_mask) { + const uint8x8_t pred_val = vld1_u8(pred); + const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); + const uint8x8_t result = + vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + vst1_u8(pred, result); +} + inline void OverlapBlendFromLeft2xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, @@ -99,24 +110,25 @@ inline void OverlapBlendFromLeft4xH_NEON( inline void OverlapBlendFromLeft8xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) { const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6); + constexpr int obmc_prediction_stride = 8; // 64 - mask const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); int y = 0; do { - const uint8x8_t pred_val = vld1_u8(pred); - const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); - const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); - const uint8x8_t result = - vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); + WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask); + pred += prediction_stride; - vst1_u8(pred, result); + WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask, + obmc_pred_mask); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (++y != height); + + obmc_pred += obmc_prediction_stride << 1; + y += 2; + } while (y != height); } void OverlapBlendFromLeft_NEON( @@ -140,8 +152,7 @@ void OverlapBlendFromLeft_NEON( return; } if (width == 8) { - OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred); return; } const uint8x16_t mask_inverter = vdupq_n_u8(64); @@ -262,26 +273,31 @@ inline void OverlapBlendFromTop4xH_NEON( inline void OverlapBlendFromTop8xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 8; const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8_t* mask = kObmcMask + height - 2; const int compute_height = height - (height >> 2); int y = 0; do { - const uint8x8_t pred_mask = vdup_n_u8(mask[y]); + const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]); // 64 - mask - const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); - const uint8x8_t pred_val = vld1_u8(pred); - const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); - const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); - const uint8x8_t result = - vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); - vst1_u8(pred, result); + WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0, + obmc_pred_mask0); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (++y != compute_height); + ++y; + + const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]); + // 64 - mask + const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1); + WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1, + obmc_pred_mask1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + } while (++y < compute_height); } void OverlapBlendFromTop_NEON( @@ -301,8 +317,7 @@ void OverlapBlendFromTop_NEON( } if (width == 8) { - OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred); return; } @@ -371,26 +386,23 @@ constexpr uint16_t kObmcMask[62] = { 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64}; -inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, +inline uint16x4_t BlendObmc2Or4(uint16_t* const pred, + const uint16x4_t obmc_pred_val, const uint16x4_t pred_mask, const uint16x4_t obmc_pred_mask) { - const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x4_t obmc_pred_val = - vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x4_t pred_val = vld1_u16(pred); const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val); const uint16x4_t result = vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); return result; } -inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, +inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred, + const uint16_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) { - const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x8_t obmc_pred_val = - vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x8_t pred_val = vld1q_u16(pred); + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val); const uint16x8_t result = vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); @@ -398,27 +410,29 @@ inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, } inline void OverlapBlendFromLeft2xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 2; const uint16x4_t mask_inverter = vdup_n_u16(64); // Second two lanes unused. const uint16x4_t pred_mask = vld1_u16(kObmcMask); const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); int y = 0; do { + const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred); const uint16x4_t result_0 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0); + BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask); + Store2<0>(pred, result_0); - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; + const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred); const uint16x4_t result_1 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1); + BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask); + Store2<0>(pred, result_1); - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; y += 2; @@ -426,26 +440,26 @@ inline void OverlapBlendFromLeft2xH_NEON( } inline void OverlapBlendFromLeft4xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 4; const uint16x4_t mask_inverter = vdup_n_u16(64); const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2); // 64 - mask const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); int y = 0; do { - const uint16x4_t result_0 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - const uint16x4_t result_1 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); + const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result_0); + pred = AddByteStride(pred, prediction_stride); + + const uint16x4_t result_1 = BlendObmc2Or4( + pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask); + vst1_u16(pred, result_1); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; y += 2; } while (y != height); @@ -456,52 +470,47 @@ void OverlapBlendFromLeft_NEON( const int width, const int height, const void* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { - auto* pred = static_cast<uint8_t*>(prediction); - const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); assert(width >= 2); assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred); return; } const uint16x8_t mask_inverter = vdupq_n_u16(64); const uint16_t* mask = kObmcMask + width - 2; int x = 0; do { - pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x); - obmc_pred = reinterpret_cast<const uint8_t*>( - static_cast<const uint16_t*>(obmc_prediction) + x); + uint16_t* pred_x = pred + x; + const uint16_t* obmc_pred_x = obmc_pred + x; const uint16x8_t pred_mask = vld1q_u16(mask + x); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); int y = 0; do { const uint16x8_t result = - BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask); + vst1q_u16(pred_x, result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred_x = AddByteStride(pred_x, prediction_stride); + obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride); } while (++y < height); x += 8; } while (x < width); } template <int lane> -inline uint16x4_t BlendObmcFromTop4( - uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, - const uint16x8_t obmc_pred_mask) { - const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x4_t obmc_pred_val = - vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); +inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred, + const uint16x4_t obmc_pred_val, + const uint16x8_t pred_mask, + const uint16x8_t obmc_pred_mask) { + const uint16x4_t pred_val = vld1_u16(pred); const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask); const uint16x4_t result = vrshr_n_u16( VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); @@ -510,12 +519,11 @@ inline uint16x4_t BlendObmcFromTop4( template <int lane> inline uint16x8_t BlendObmcFromTop8( - uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, - const uint16x8_t obmc_pred_mask) { - const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x8_t obmc_pred_val = - vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + uint16_t* LIBGAV1_RESTRICT const pred, + const uint16_t* LIBGAV1_RESTRICT const obmc_pred, + const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) { + const uint16x8_t pred_val = vld1q_u16(pred); + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask); const uint16x8_t result = vrshrq_n_u16( VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); @@ -523,41 +531,43 @@ inline uint16x8_t BlendObmcFromTop8( } inline void OverlapBlendFromTop4x2Or4_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride, const int height) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) { + constexpr int obmc_prediction_stride = 4; const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]); const uint16x8_t mask_inverter = vdupq_n_u16(64); const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); - uint16x4_t result = - BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred); + uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); if (height == 2) { // Mask value is 64, meaning |pred| is unchanged. return; } - result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; - result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred); + result = + BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask); + vst1_u16(pred, result); } inline void OverlapBlendFromTop4xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { if (height < 8) { - OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, - obmc_prediction_stride, height); + OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height); return; } + constexpr int obmc_prediction_stride = 4; const uint16_t* mask = kObmcMask + height - 2; const uint16x8_t mask_inverter = vdupq_n_u16(64); int y = 0; @@ -566,36 +576,44 @@ inline void OverlapBlendFromTop4xH_NEON( do { const uint16x8_t pred_mask = vld1q_u16(&mask[y]); const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); - uint16x4_t result = - BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + // Load obmc row 0, 1. + uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); + uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; + + // Load obmc row 2, 3. + obmc_pred_val = vld1q_u16(obmc_pred); + result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; + + // Load obmc row 4, 5. + obmc_pred_val = vld1q_u16(obmc_pred); + result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; // Increment for the right mask index. y += 6; @@ -603,147 +621,147 @@ inline void OverlapBlendFromTop4xH_NEON( } inline void OverlapBlendFromTop8xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride, const int height) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) { const uint16_t* mask = kObmcMask + height - 2; const uint16x8_t mask_inverter = vdupq_n_u16(64); uint16x8_t pred_mask = vld1q_u16(mask); uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); uint16x8_t result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 2) return; - pred += prediction_stride; + constexpr int obmc_prediction_stride = 8; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 4) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 8) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; pred_mask = vld1q_u16(&mask[8]); obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 16) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; pred_mask = vld1q_u16(&mask[16]); obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); } void OverlapBlendFromTop_NEON( @@ -751,20 +769,18 @@ void OverlapBlendFromTop_NEON( const int width, const int height, const void* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { - auto* pred = static_cast<uint8_t*>(prediction); - const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); assert(width >= 4); assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, - obmc_prediction_stride, height); + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height); return; } @@ -773,19 +789,16 @@ void OverlapBlendFromTop_NEON( const uint16x8_t pred_mask = vld1q_u16(mask); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); -#define OBMC_ROW_FROM_TOP(n) \ - do { \ - int x = 0; \ - do { \ - const uint16x8_t result = BlendObmcFromTop8<n>( \ - reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \ - reinterpret_cast<const uint8_t*>( \ - reinterpret_cast<const uint16_t*>(obmc_pred) + x), \ - pred_mask, obmc_pred_mask); \ - vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result); \ - \ - x += 8; \ - } while (x < width); \ +#define OBMC_ROW_FROM_TOP(n) \ + do { \ + int x = 0; \ + do { \ + const uint16x8_t result = BlendObmcFromTop8<n>( \ + pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \ + vst1q_u16(pred + x, result); \ + \ + x += 8; \ + } while (x < width); \ } while (false) // Compute 1 row. @@ -797,11 +810,11 @@ void OverlapBlendFromTop_NEON( // Compute 3 rows. if (height == 4) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); return; } @@ -809,20 +822,20 @@ void OverlapBlendFromTop_NEON( // Compute 6 rows. if (height == 8) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); return; } @@ -830,42 +843,42 @@ void OverlapBlendFromTop_NEON( // Compute 12 rows. if (height == 16) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(6); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(7); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); const uint16x8_t pred_mask = vld1q_u16(&mask[8]); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); return; } @@ -879,29 +892,29 @@ void OverlapBlendFromTop_NEON( // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(6); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(7); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); y += 8; } while (y < compute_height); |