aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/motion_vector_search_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/motion_vector_search_neon.cc')
-rw-r--r--src/dsp/arm/motion_vector_search_neon.cc267
1 files changed, 267 insertions, 0 deletions
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
new file mode 100644
index 0000000..8a403a6
--- /dev/null
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -0,0 +1,267 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+ const int32x4_t numerator) {
+ const int32x4_t m0 = vmull_s16(mv, denominator);
+ const int32x4_t m = vmulq_s32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+ return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x4_t MvProjectionCompound(const int16x4_t mv,
+ const int temporal_reference_offsets,
+ const int reference_offsets[2]) {
+ const int16x4_t denominator =
+ vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
+ const int32x2_t offset = vld1_s32(reference_offsets);
+ const int32x2x2_t offsets = vzip_s32(offset, offset);
+ const int32x4_t numerator = vcombine_s32(offsets.val[0], offsets.val[1]);
+ return MvProjection(mv, denominator, numerator);
+}
+
+inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
+ const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+ const int16x8_t mv = vcombine_s16(mv0, mv1);
+ const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp);
+ return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int16x8_t MvProjectionCompoundClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const int32x2_t temporal_mv = vld1_s32(tmvs);
+ const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
+ const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
+ const int16x4_t mv0 = MvProjectionCompound(
+ tmv0, temporal_reference_offsets[0], reference_offsets);
+ const int16x4_t mv1 = MvProjectionCompound(
+ tmv1, temporal_reference_offsets[1], reference_offsets);
+ return ProjectionClip(mv0, mv1);
+}
+
+inline int16x8_t MvProjectionSingleClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets, const int reference_offset,
+ int16x4_t* const lookup) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const int16x8_t temporal_mv = vld1q_s16(tmvs);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2);
+ *lookup = vld1_lane_s16(
+ &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3);
+ const int16x4x2_t denominator = vzip_s16(*lookup, *lookup);
+ const int16x4_t tmv0 = vget_low_s16(temporal_mv);
+ const int16x4_t tmv1 = vget_high_s16(temporal_mv);
+ const int32x4_t numerator = vdupq_n_s32(reference_offset);
+ const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator);
+ const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator);
+ return ProjectionClip(mv0, mv1);
+}
+
+inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(1);
+ const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+ const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+ const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
+ vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
+}
+
+inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
+ const int16x8_t kRoundDownMask = vdupq_n_s16(7);
+ const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+ const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+ const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
+ const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
+ vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
+}
+
+void MvProjectionCompoundLowPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+ // |reference_offsets| non-zero check usually equals true and is ignored.
+ // To facilitate the compilers, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ LowPrecision(mv, candidate_mvs);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionCompoundForceInteger_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+ // |reference_offsets| non-zero check usually equals true and is ignored.
+ // To facilitate the compilers, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ ForceInteger(mv, candidate_mvs);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionCompoundHighPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+ // |reference_offsets| non-zero check usually equals true and is ignored.
+ // To facilitate the compilers, make a local copy of |reference_offsets|.
+ const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int loop_count = (count + 1) >> 1;
+ do {
+ const int16x8_t mv = MvProjectionCompoundClip(
+ temporal_mvs, temporal_reference_offsets, offsets);
+ vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+ temporal_mvs += 2;
+ temporal_reference_offsets += 2;
+ candidate_mvs += 2;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleLowPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ LowPrecision(mv, candidate_mvs);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleForceInteger_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ ForceInteger(mv, candidate_mvs);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void MvProjectionSingleHighPrecision_NEON(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int loop_count = (count + 3) >> 2;
+ int16x4_t lookup = vdup_n_s16(0);
+ do {
+ const int16x8_t mv = MvProjectionSingleClip(
+ temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+ vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+ temporal_mvs += 4;
+ temporal_reference_offsets += 4;
+ candidate_mvs += 4;
+ } while (--loop_count);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_NEON() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON