diff options
Diffstat (limited to 'src/dsp/arm/motion_vector_search_neon.cc')
-rw-r--r-- | src/dsp/arm/motion_vector_search_neon.cc | 267 |
1 files changed, 267 insertions, 0 deletions
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc new file mode 100644 index 0000000..8a403a6 --- /dev/null +++ b/src/dsp/arm/motion_vector_search_neon.cc @@ -0,0 +1,267 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_vector_search.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include <arm_neon.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace dsp { +namespace { + +inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator, + const int32x4_t numerator) { + const int32x4_t m0 = vmull_s16(mv, denominator); + const int32x4_t m = vmulq_s32(m0, numerator); + // Add the sign (0 or -1) to round towards zero. + const int32x4_t add_sign = vsraq_n_s32(m, m, 31); + return vqrshrn_n_s32(add_sign, 14); +} + +inline int16x4_t MvProjectionCompound(const int16x4_t mv, + const int temporal_reference_offsets, + const int reference_offsets[2]) { + const int16x4_t denominator = + vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]); + const int32x2_t offset = vld1_s32(reference_offsets); + const int32x2x2_t offsets = vzip_s32(offset, offset); + const int32x4_t numerator = vcombine_s32(offsets.val[0], offsets.val[1]); + return MvProjection(mv, denominator, numerator); +} + +inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) { + const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp); + const int16x8_t mv = vcombine_s16(mv0, mv1); + const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp); + return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp)); +} + +inline int16x8_t MvProjectionCompoundClip( + const MotionVector* const temporal_mvs, + const int8_t* const temporal_reference_offsets, + const int reference_offsets[2]) { + const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs); + const int32x2_t temporal_mv = vld1_s32(tmvs); + const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0)); + const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1)); + const int16x4_t mv0 = MvProjectionCompound( + tmv0, temporal_reference_offsets[0], reference_offsets); + const int16x4_t mv1 = MvProjectionCompound( + tmv1, temporal_reference_offsets[1], reference_offsets); + return ProjectionClip(mv0, mv1); +} + +inline int16x8_t MvProjectionSingleClip( + const MotionVector* const temporal_mvs, + const int8_t* const temporal_reference_offsets, const int reference_offset, + int16x4_t* const lookup) { + const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs); + const int16x8_t temporal_mv = vld1q_s16(tmvs); + *lookup = vld1_lane_s16( + &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0); + *lookup = vld1_lane_s16( + &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1); + *lookup = vld1_lane_s16( + &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2); + *lookup = vld1_lane_s16( + &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3); + const int16x4x2_t denominator = vzip_s16(*lookup, *lookup); + const int16x4_t tmv0 = vget_low_s16(temporal_mv); + const int16x4_t tmv1 = vget_high_s16(temporal_mv); + const int32x4_t numerator = vdupq_n_s32(reference_offset); + const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator); + const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator); + return ProjectionClip(mv0, mv1); +} + +inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) { + const int16x8_t kRoundDownMask = vdupq_n_s16(1); + const uint16x8_t mvu = vreinterpretq_u16_s16(mv); + const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15)); + const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask); + vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1); +} + +inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) { + const int16x8_t kRoundDownMask = vdupq_n_s16(7); + const uint16x8_t mvu = vreinterpretq_u16_s16(mv); + const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15)); + const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3)); + const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask); + vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2); +} + +void MvProjectionCompoundLowPrecision_NEON( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int loop_count = (count + 1) >> 1; + do { + const int16x8_t mv = MvProjectionCompoundClip( + temporal_mvs, temporal_reference_offsets, offsets); + LowPrecision(mv, candidate_mvs); + temporal_mvs += 2; + temporal_reference_offsets += 2; + candidate_mvs += 2; + } while (--loop_count); +} + +void MvProjectionCompoundForceInteger_NEON( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int loop_count = (count + 1) >> 1; + do { + const int16x8_t mv = MvProjectionCompoundClip( + temporal_mvs, temporal_reference_offsets, offsets); + ForceInteger(mv, candidate_mvs); + temporal_mvs += 2; + temporal_reference_offsets += 2; + candidate_mvs += 2; + } while (--loop_count); +} + +void MvProjectionCompoundHighPrecision_NEON( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int loop_count = (count + 1) >> 1; + do { + const int16x8_t mv = MvProjectionCompoundClip( + temporal_mvs, temporal_reference_offsets, offsets); + vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv); + temporal_mvs += 2; + temporal_reference_offsets += 2; + candidate_mvs += 2; + } while (--loop_count); +} + +void MvProjectionSingleLowPrecision_NEON( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. + int loop_count = (count + 3) >> 2; + int16x4_t lookup = vdup_n_s16(0); + do { + const int16x8_t mv = MvProjectionSingleClip( + temporal_mvs, temporal_reference_offsets, reference_offset, &lookup); + LowPrecision(mv, candidate_mvs); + temporal_mvs += 4; + temporal_reference_offsets += 4; + candidate_mvs += 4; + } while (--loop_count); +} + +void MvProjectionSingleForceInteger_NEON( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. + int loop_count = (count + 3) >> 2; + int16x4_t lookup = vdup_n_s16(0); + do { + const int16x8_t mv = MvProjectionSingleClip( + temporal_mvs, temporal_reference_offsets, reference_offset, &lookup); + ForceInteger(mv, candidate_mvs); + temporal_mvs += 4; + temporal_reference_offsets += 4; + candidate_mvs += 4; + } while (--loop_count); +} + +void MvProjectionSingleHighPrecision_NEON( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offset, const int count, MotionVector* candidate_mvs) { + // Up to three more elements could be calculated. + int loop_count = (count + 3) >> 2; + int16x4_t lookup = vdup_n_s16(0); + do { + const int16x8_t mv = MvProjectionSingleClip( + temporal_mvs, temporal_reference_offsets, reference_offset, &lookup); + vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv); + temporal_mvs += 4; + temporal_reference_offsets += 4; + candidate_mvs += 4; + } while (--loop_count); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON; +} +#endif + +} // namespace + +void MotionVectorSearchInit_NEON() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void MotionVectorSearchInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON |