aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/x86/motion_vector_search_sse4.cc
diff options
context:
space:
mode:
author qinxialei <xialeiqin@gmail.com> 2020-10-29 11:26:59 +0800
committer qinxialei <xialeiqin@gmail.com> 2020-10-29 11:26:59 +0800
commit e8d277081293b6fb2a5d469616baaa7a06f52496 (patch)
tree 1179bb07d3927d1837d4a90bd81b2034c4c696a9 /src/dsp/x86/motion_vector_search_sse4.cc
download libgav1-e8d277081293b6fb2a5d469616baaa7a06f52496.tar.gz
libgav1-e8d277081293b6fb2a5d469616baaa7a06f52496.tar.bz2
libgav1-e8d277081293b6fb2a5d469616baaa7a06f52496.zip
Import Upstream version 0.16.0
Diffstat (limited to 'src/dsp/x86/motion_vector_search_sse4.cc')
-rw-r--r-- src/dsp/x86/motion_vector_search_sse4.cc 262
1 files changed, 262 insertions, 0 deletions
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
new file mode 100644
index 0000000..e9cdd4c
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.cc
@@ -0,0 +1,262 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Reciprocal table used to replace a division by a frame distance with a
+// multiplication followed by a 14-bit shift. Entry 0 is 0; every other entry
+// |i| holds floor(2^14 / i) (e.g. 16384 / 3 -> 5461, 16384 / 31 -> 528).
+// This copy is stored as 32-bit ints and is indexed by the temporal reference
+// offsets in MvProjectionSingleClip().
+// NOTE(review): the 32 entries imply kMaxFrameDistance == 31 - confirm against
+// its definition.
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
+
+// Scales four motion vector components held in the 32-bit lanes of |mv|.
+// Each lane is multiplied by the matching lane of |denominator| (a reciprocal
+// from the division lookup table) and of |numerator|, then divided by 2^14
+// with rounding.
+//
+// _mm_madd_epi16 multiplies 16-bit pairs and sums adjacent products. The
+// callers in this file keep the upper 16 bits of every |mv| lane at zero, so
+// the sum reduces to one signed 16x16 -> 32 bit product per lane.
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+                            const __m128i numerator) {
+  const __m128i rounding_bias = _mm_set1_epi32(1 << 13);
+  const __m128i scaled =
+      _mm_mullo_epi32(_mm_madd_epi16(mv, denominator), numerator);
+  // The arithmetic shift yields 0 for non-negative lanes and -1 for negative
+  // ones. Adding it before the bias makes negative products round
+  // symmetrically with positive ones.
+  const __m128i sign_mask = _mm_srai_epi32(scaled, 31);
+  const __m128i sign_adjusted = _mm_add_epi32(scaled, sign_mask);
+  const __m128i biased = _mm_add_epi32(sign_adjusted, rounding_bias);
+  return _mm_srai_epi32(biased, 14);
+}
+
+// Projects the two vectors in |mvs| with their matching |denominators| and
+// the shared |numerator|, packs the eight 32-bit results into signed 16-bit
+// lanes (with saturation) and clamps every lane to
+// [-kProjectionMvClamp, kProjectionMvClamp].
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+                                const __m128i denominators[2],
+                                const __m128i numerator) {
+  const __m128i upper_bound = _mm_set1_epi16(kProjectionMvClamp);
+  const __m128i lower_bound = _mm_set1_epi16(-kProjectionMvClamp);
+  const __m128i low_half = MvProjection(mvs[0], denominators[0], numerator);
+  const __m128i high_half = MvProjection(mvs[1], denominators[1], numerator);
+  // |low_half| fills lanes 0-3 of the packed result, |high_half| lanes 4-7.
+  const __m128i packed = _mm_packs_epi32(low_half, high_half);
+  return _mm_max_epi16(_mm_min_epi16(packed, upper_bound), lower_bound);
+}
+
+// Computes compound projections for two consecutive temporal motion vectors.
+// Each input vector is projected against both entries of |reference_offsets|,
+// so the returned register holds, for each of the two inputs, the projection
+// for reference_offsets[0] followed by the one for reference_offsets[1].
+// NOTE(review): assumes MotionVector is a pair of 16-bit components and that
+// LoadLo8() reads 8 bytes - confirm against their definitions.
+inline __m128i MvProjectionCompoundClip(
+    const MotionVector* const temporal_mvs,
+    const int8_t temporal_reference_offsets[2],
+    const int reference_offsets[2]) {
+  const __m128i packed_mvs =
+      LoadLo8(reinterpret_cast<const int32_t*>(temporal_mvs));
+  // Zero-extend on purpose: MvProjection() relies on the upper 16 bits of
+  // each 32-bit lane being zero.
+  const __m128i widened_mvs = _mm_cvtepu16_epi32(packed_mvs);
+  // Duplicate each motion vector so it can be paired with both offsets.
+  const __m128i mvs[2] = {_mm_unpacklo_epi64(widened_mvs, widened_mvs),
+                          _mm_unpackhi_epi64(widened_mvs, widened_mvs)};
+  const __m128i denominators[2] = {
+      _mm_set1_epi32(
+          kProjectionMvDivisionLookup[temporal_reference_offsets[0]]),
+      _mm_set1_epi32(
+          kProjectionMvDivisionLookup[temporal_reference_offsets[1]])};
+  // Lane layout of the numerator: {offset0, offset0, offset1, offset1}.
+  const __m128i offset_pair = LoadLo8(reference_offsets);
+  return MvProjectionClip(mvs, denominators,
+                          _mm_unpacklo_epi32(offset_pair, offset_pair));
+}
+
+// Projects four consecutive temporal motion vectors against the single
+// |reference_offset|. Reads four entries of |temporal_reference_offsets| and
+// 16 bytes from |temporal_mvs| (via an aligned load), and returns the four
+// clamped results as eight 16-bit lanes.
+inline __m128i MvProjectionSingleClip(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets,
+    const int reference_offset) {
+  const __m128i packed_mvs =
+      LoadAligned16(reinterpret_cast<const int16_t*>(temporal_mvs));
+  // One reciprocal per motion vector, lowest lane first.
+  const __m128i lookup = _mm_setr_epi32(
+      kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]],
+      kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+      kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+      kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]]);
+  // Interleave with zero so every 16-bit component sits in the low half of a
+  // 32-bit lane with a zero upper half, as MvProjection() requires.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i mvs[2] = {_mm_unpacklo_epi16(packed_mvs, zero),
+                          _mm_unpackhi_epi16(packed_mvs, zero)};
+  // Each reciprocal is duplicated to cover both components of its vector.
+  const __m128i denominators[2] = {_mm_unpacklo_epi32(lookup, lookup),
+                                   _mm_unpackhi_epi32(lookup, lookup)};
+  return MvProjectionClip(mvs, denominators,
+                          _mm_set1_epi32(reference_offset));
+}
+
+// Reduces every 16-bit lane of |mv| to an even value, rounding toward zero,
+// and writes the 16-byte result to |candidate_mvs| with an aligned store.
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
+  const __m128i clear_bit0 = _mm_set1_epi16(~1);
+  // |negative_mask| is -1 for negative lanes and 0 otherwise; subtracting it
+  // adds 1 to negative lanes before the low bit is cleared.
+  const __m128i negative_mask = _mm_srai_epi16(mv, 15);
+  const __m128i adjusted = _mm_sub_epi16(mv, negative_mask);
+  StoreAligned16(candidate_mvs, _mm_and_si128(adjusted, clear_bit0));
+}
+
+// Rounds every 16-bit lane of |mv| to a multiple of 8 and writes the 16-byte
+// result to |candidate_mvs| with an aligned store. Each lane becomes
+// (mv + 3 + (mv < 0 ? 1 : 0)) & ~7.
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
+  const __m128i clear_low3 = _mm_set1_epi16(~7);
+  const __m128i bias = _mm_set1_epi16(3);
+  // -1 for negative lanes, 0 otherwise.
+  const __m128i negative_mask = _mm_srai_epi16(mv, 15);
+  const __m128i biased = _mm_add_epi16(mv, bias);
+  const __m128i adjusted = _mm_sub_epi16(biased, negative_mask);
+  StoreAligned16(candidate_mvs, _mm_and_si128(adjusted, clear_low3));
+}
+
+// Compound projection with low-precision output: projects the temporal motion
+// vectors against both |reference_offsets| and stores the results, reduced by
+// LowPrecision(), to |candidate_mvs|.
+void MvProjectionCompoundLowPrecision_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // The test for non-zero |reference_offsets| is almost always true, so it is
+  // skipped here. A local copy of the offsets is made to help the compiler.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // Two elements are handled per pass, so one element beyond |count| may be
+  // computed and stored. The body always runs at least once.
+  int index = 0;
+  do {
+    LowPrecision(
+        MvProjectionCompoundClip(&temporal_mvs[index],
+                                 &temporal_reference_offsets[index], offsets),
+        &candidate_mvs[index]);
+    index += 2;
+  } while (index < count);
+}
+
+// Compound projection with integer output: projects the temporal motion
+// vectors against both |reference_offsets| and stores the results, rounded by
+// ForceInteger(), to |candidate_mvs|.
+void MvProjectionCompoundForceInteger_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // The test for non-zero |reference_offsets| is almost always true, so it is
+  // skipped here. A local copy of the offsets is made to help the compiler.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // Two elements are handled per pass, so one element beyond |count| may be
+  // computed and stored. The body always runs at least once.
+  int index = 0;
+  do {
+    ForceInteger(
+        MvProjectionCompoundClip(&temporal_mvs[index],
+                                 &temporal_reference_offsets[index], offsets),
+        &candidate_mvs[index]);
+    index += 2;
+  } while (index < count);
+}
+
+// Compound projection with full-precision output: projects the temporal
+// motion vectors against both |reference_offsets| and stores the clamped
+// results to |candidate_mvs| without further rounding.
+void MvProjectionCompoundHighPrecision_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // The test for non-zero |reference_offsets| is almost always true, so it is
+  // skipped here. A local copy of the offsets is made to help the compiler.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // Two elements are handled per pass, so one element beyond |count| may be
+  // computed and stored. The body always runs at least once.
+  int index = 0;
+  do {
+    StoreAligned16(
+        &candidate_mvs[index],
+        MvProjectionCompoundClip(&temporal_mvs[index],
+                                 &temporal_reference_offsets[index], offsets));
+    index += 2;
+  } while (index < count);
+}
+
+// Single-reference projection with low-precision output: projects the
+// temporal motion vectors against |reference_offset| and stores the results,
+// reduced by LowPrecision(), to |candidate_mvs|.
+void MvProjectionSingleLowPrecision_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Four elements are handled per pass, so up to three elements beyond
+  // |count| may be computed and stored. The body always runs at least once.
+  int index = 0;
+  do {
+    LowPrecision(MvProjectionSingleClip(&temporal_mvs[index],
+                                        &temporal_reference_offsets[index],
+                                        reference_offset),
+                 &candidate_mvs[index]);
+    index += 4;
+  } while (index < count);
+}
+
+// Single-reference projection with integer output: projects the temporal
+// motion vectors against |reference_offset| and stores the results, rounded
+// by ForceInteger(), to |candidate_mvs|.
+void MvProjectionSingleForceInteger_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Four elements are handled per pass, so up to three elements beyond
+  // |count| may be computed and stored. The body always runs at least once.
+  int index = 0;
+  do {
+    ForceInteger(MvProjectionSingleClip(&temporal_mvs[index],
+                                        &temporal_reference_offsets[index],
+                                        reference_offset),
+                 &candidate_mvs[index]);
+    index += 4;
+  } while (index < count);
+}
+
+// Single-reference projection with full-precision output: projects the
+// temporal motion vectors against |reference_offset| and stores the clamped
+// results to |candidate_mvs| without further rounding.
+void MvProjectionSingleHighPrecision_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Four elements are handled per pass, so up to three elements beyond
+  // |count| may be computed and stored. The body always runs at least once.
+  int index = 0;
+  do {
+    StoreAligned16(&candidate_mvs[index],
+                   MvProjectionSingleClip(&temporal_mvs[index],
+                                          &temporal_reference_offsets[index],
+                                          reference_offset));
+    index += 4;
+  } while (index < count);
+}
+
+// Installs the SSE4.1 motion vector projection functions into the writable
+// DSP table for 8-bit content.
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  // Slot 0: low precision, slot 1: force integer, slot 2: high precision.
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Installs the SSE4.1 motion vector projection functions into the writable
+// DSP table for 10-bit content. The same functions are used as for 8-bit.
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  // Slot 0: low precision, slot 1: force integer, slot 2: high precision.
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+}
+#endif
+
+} // namespace
+
+// Public entry point: registers the SSE4.1 motion vector projection functions
+// in the 8-bit DSP table and, when the build supports bit depths of 10 or
+// more (LIBGAV1_MAX_BITDEPTH >= 10), in the 10-bit DSP table as well.
+void MotionVectorSearchInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+// No-op fallback compiled when LIBGAV1_TARGETING_SSE4_1 is not set, so the
+// function is still defined for callers; no DSP table entries are changed.
+void MotionVectorSearchInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1