// Copyright 2019 The libgav1 Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "src/dsp/distance_weighted_blend.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON #include #include #include #include #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" namespace libgav1 { namespace dsp { namespace { constexpr int kInterPostRoundBit = 4; inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0, const int16x8_t pred1, const int16x4_t weights[2]) { // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0)); const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0)); const int32x4_t blended_lo = vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1)); const int32x4_t blended_hi = vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1)); return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4), vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4)); } template inline void DistanceWeightedBlendSmall_NEON(const int16_t* prediction_0, const int16_t* prediction_1, const int16x4_t weights[2], void* const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast(dest); constexpr int step = 16 / width; for (int y = 0; y < height; y += step) { const int16x8_t src_00 = vld1q_s16(prediction_0); const int16x8_t src_10 = vld1q_s16(prediction_1); prediction_0 += 8; prediction_1 += 8; const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights); const int16x8_t src_01 = vld1q_s16(prediction_0); const int16x8_t src_11 = vld1q_s16(prediction_1); prediction_0 += 8; prediction_1 += 8; const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights); const uint8x8_t result0 = vqmovun_s16(res0); const uint8x8_t result1 = vqmovun_s16(res1); if (width == 4) { StoreLo4(dst, result0); dst += dest_stride; StoreHi4(dst, result0); dst += dest_stride; StoreLo4(dst, result1); dst += dest_stride; StoreHi4(dst, result1); dst += dest_stride; } else { assert(width == 8); vst1_u8(dst, result0); dst += dest_stride; vst1_u8(dst, result1); dst += dest_stride; } } } inline void DistanceWeightedBlendLarge_NEON(const int16_t* prediction_0, const int16_t* prediction_1, const int16x4_t weights[2], const int width, const int height, void* const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast(dest); int y = height; do { int x = 0; do { const int16x8_t src0_lo = vld1q_s16(prediction_0 + x); const int16x8_t src1_lo = vld1q_s16(prediction_1 + x); const int16x8_t res_lo = ComputeWeightedAverage8(src0_lo, src1_lo, weights); const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8); const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8); const int16x8_t res_hi = ComputeWeightedAverage8(src0_hi, src1_hi, weights); const uint8x16_t result = vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi)); vst1q_u8(dst + x, result); x += 16; } while (x < width); dst += dest_stride; prediction_0 += width; prediction_1 += width; } while (--y != 0); } inline void DistanceWeightedBlend_NEON(const void* prediction_0, const void* prediction_1, const uint8_t weight_0, const uint8_t weight_1, const int width, const int height, void* const dest, const ptrdiff_t dest_stride) { const auto* pred_0 = static_cast(prediction_0); const auto* pred_1 = static_cast(prediction_1); int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)}; // TODO(johannkoenig): Investigate the branching. May be fine to call with a // variable height. if (width == 4) { if (height == 4) { DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest, dest_stride); } else if (height == 8) { DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest, dest_stride); } else { assert(height == 16); DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest, dest_stride); } return; } if (width == 8) { switch (height) { case 4: DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest, dest_stride); return; case 8: DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest, dest_stride); return; case 16: DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest, dest_stride); return; default: assert(height == 32); DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest, dest_stride); return; } } DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest, dest_stride); } void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->distance_weighted_blend = DistanceWeightedBlend_NEON; } } // namespace void DistanceWeightedBlendInit_NEON() { Init8bpp(); } } // namespace dsp } // namespace libgav1 #else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { void DistanceWeightedBlendInit_NEON() {} } // namespace dsp } // namespace libgav1 #endif // LIBGAV1_ENABLE_NEON